--- /dev/null
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+\f
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+\f
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+\f
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+\f
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+\f
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+\f
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+\f
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+\f
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+\f
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library. It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the library's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
--- /dev/null
+
+# mpicxx must be on your path; on googoo, this means that
+# /usr/local/mpich2-1.0.2/bin must be on your path.
+
+# For now, use g++ most of the time.
+# When compiling MPI stuff, specify myfile.cc instead of myfile.o so that ${MPICC} is
+# invoked instead of the generic .o rule (or it'll use g++).
+# This makes it less annoying to build on non-mpi hosts for dev work, and seems to
+# behave just fine... change ${CC} back to mpicxx if you get paranoid.
+
+#CC = g++
+#CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
+#LIBS = -lpthread
+
+# Hook for extra -I options, etc.  Appended to CFLAGS on every platform;
+# override on the command line, e.g. `make EXTRA_CFLAGS=-I/opt/include`.
+EXTRA_CFLAGS =
+
+# Platform selection: `make target=darwin` picks the Darwin defines; any
+# other value (including unset) falls through to the Linux flags.
+ifeq ($(target),darwin)
+# For Darwin
+CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS}
+else
+# For linux
+CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
+endif
+
+# Compiler and link libraries used by the non-MPI rules.
+CC = g++
+LIBS = -lpthread
+
+#for normal mpich2 machines
+# Rules that build MPI programs invoke ${MPICC}; by default they reuse the
+# same flags and libraries as the non-MPI rules.
+MPICC = mpicxx
+MPICFLAGS = ${CFLAGS}
+MPILIBS = ${LIBS}
+
+#for LLNL boxes without mpicxx
+# (alternative settings: plain g++ plus explicit MPI include/library paths)
+#MPICC = g++
+#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib
+#MPILIBS = ${LIBS} -lelan -lmpi
+
+# Per-component object lists.  Each list is bundled by an `ar -rc` rule
+# further down into a single <component>.o archive that the link rules use.
+
+# Objects bundled into ebofs.o.
+EBOFS_OBJS= \
+ ebofs/BlockDevice.o\
+ ebofs/BufferCache.o\
+ ebofs/Ebofs.o\
+ ebofs/Allocator.o
+
+# Objects bundled into mds.o.
+MDS_OBJS= \
+ mds/MDS.o\
+ mds/journal.o\
+ mds/Server.o\
+ mds/MDCache.o\
+ mds/Locker.o\
+ mds/Migrator.o\
+ mds/Renamer.o\
+ mds/MDBalancer.o\
+ mds/CDentry.o\
+ mds/CDir.o\
+ mds/CInode.o\
+ mds/AnchorTable.o\
+ mds/AnchorClient.o\
+ mds/MDStore.o\
+ mds/LogEvent.o\
+ mds/IdAllocator.o\
+ mds/MDLog.o
+
+# Objects bundled into osd.o.
+OSD_OBJS= \
+ osd/PG.o\
+ osd/Ager.o\
+ osd/FakeStore.o\
+ osd/OSD.o
+
+# Objects bundled into osdc.o (also pulled into libceph.o).
+OSDC_OBJS= \
+ osdc/Objecter.o\
+ osdc/ObjectCacher.o\
+ osdc/Filer.o\
+ osdc/Journaler.o
+
+# Objects bundled into mon.o.
+MON_OBJS= \
+ mon/Monitor.o\
+ mon/OSDMonitor.o\
+ mon/MDSMonitor.o\
+ mon/ClientMonitor.o\
+ mon/Elector.o
+
+# Objects bundled into common.o (also pulled into libceph.o).
+COMMON_OBJS= \
+ msg/Messenger.o\
+ msg/Message.o\
+ common/Logger.o\
+ common/Clock.o\
+ common/Timer.o\
+ config.o
+
+
+# Objects bundled into client.o.
+CLIENT_OBJS= \
+ client/FileCache.o\
+ client/Client.o\
+ client/SyntheticClient.o\
+ client/Trace.o
+
+# Programs built by the default `all` target (and removed by `clean`).
+TARGETS = cmon cosd cmds cfuse newsyn fakesyn
+
+# Shell globs naming every source/header; consumed by `count` and `depend`.
+SRCS=*.cc */*.cc *.h */*.h */*/*.h
+
+# These are command names, not files.  Declaring them phony keeps make from
+# confusing them with same-named paths (a test/ directory exists in this tree).
+.PHONY: all test obfs
+
+# Default target: refresh dependencies, then build every program in TARGETS.
+all: depend ${TARGETS}
+
+# NOTE(review): TEST_TARGETS is not assigned in this Makefile, so this builds
+# nothing beyond `depend` unless the variable is supplied externally.
+test: depend ${TEST_TARGETS}
+
+# NOTE(review): no rule for `obfstest` is present in this Makefile.
+obfs: depend obfstest
+
+
+# real bits
+# Each program is compiled from its .cc driver and linked against the
+# component archives (mon.o, osd.o, mds.o, ...) built by the ar rules below.
+# Libraries are listed after $^: the linker resolves -l options in
+# command-line order, so placing them before the objects that need them
+# breaks static and --as-needed links.
+mkmonmap: mkmonmap.cc common.o
+ ${CC} ${CFLAGS} $^ -o $@ ${LIBS}
+
+cmon: cmon.cc mon.o ebofs.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} $^ -o $@ ${LIBS}
+
+cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} $^ -o $@ ${LIBS}
+
+cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} $^ -o $@ ${LIBS}
+
+csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} $^ -o $@ ${LIBS}
+
+# -lfuse precedes ${LIBS} so the libraries it depends on are searched after it.
+cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o
+ ${CC} ${CFLAGS} $^ -o $@ -lfuse ${LIBS}
+
+
+# misc
+# Shared object built from test/gprof-helper.c with plain gcc (not ${CC}),
+# linked against libpthread and libdl.
+gprof-helper.so: test/gprof-helper.c
+ gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl
+
+
+
+# fuse
+# FUSE front ends.  Libraries follow $^ so the linker sees them after the
+# objects that reference them; -lfuse comes before the base libraries.
+fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o
+ ${CC} -pg ${CFLAGS} $^ -o $@ -lfuse ${LIBS}
+
+# The two rules below are built with ${MPICC}, so they take the MPI flag and
+# library variables like every other ${MPICC} rule in this file (identical to
+# CFLAGS/LIBS under the default settings).
+# NOTE(review): TCP_OBJS is not assigned in this Makefile; it expands to
+# nothing unless supplied externally.
+tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ -lfuse ${MPILIBS}
+
+mpifuse: mpifuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ -lfuse ${MPILIBS}
+
+
+# synthetic workload
+# Synthetic-client drivers linked against every component archive.
+# Libraries are listed after $^ so the linker resolves them against the
+# objects that precede them on the command line.
+fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
+ ${CC} -pg ${CFLAGS} $^ -o $@ ${LIBS}
+
+# NOTE(review): TCP_OBJS is not assigned in this Makefile.
+tcpsyn: tcpsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o ${TCP_OBJS} common.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+# Built with -pg; newsyn.nopg below is the variant without it.
+newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
+ ${MPICC} -pg ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+# NOTE(review): links msg/NewerMessenger.o whereas newsyn links
+# msg/SimpleMessenger.o — confirm this is intentional.
+newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+# + obfs
+# Variants of the synthetic drivers compiled with -DUSE_OBFS and linked
+# against osd_obfs.o instead of osd.o.  Libraries follow $^ so they are
+# searched after the objects that reference them.
+fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o
+ ${CC} -DUSE_OBFS ${CFLAGS} $^ -o $@ ${LIBS}
+
+# NOTE(review): TCP_OBJS is not assigned in this Makefile.
+tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o
+ ${MPICC} -DUSE_OBFS ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+
+# ebofs
+# Standalone ebofs utilities: compiled with -pg from their driver plus
+# config.cc, linked with common/Clock.o and the ebofs.o archive.
+# Libraries follow $^ so they are searched after the objects that use them.
+
+mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o
+ ${CC} -pg ${CFLAGS} $^ -o $@ ${LIBS}
+
+test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o
+ ${CC} -pg ${CFLAGS} $^ -o $@ ${LIBS}
+
+
+
+
+# libceph
+# ar archive bundling client/ldceph.o, client/Client.o and the common and
+# osdc objects.
+# NOTE(review): SYN_OBJS is not assigned in this Makefile; it expands to
+# nothing unless supplied externally.
+libceph.o: client/ldceph.o client/Client.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
+ ar -rc $@ $^
+
+# The mdtest benchmark is C, compiled with mpicc directly (not ${MPICC}).
+bench/mdtest/mdtest.o: bench/mdtest/mdtest.c
+ mpicc -c $^ -o $@
+
+# Libraries follow $^ so the linker searches them after the objects.
+mdtest: bench/mdtest/mdtest.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+# Same benchmark linked together with the libceph.o archive.
+mdtest.ceph: bench/mdtest/mdtest.o libceph.o
+ ${MPICC} ${MPICFLAGS} $^ -o $@ ${MPILIBS}
+
+#
+
+# Generic rule: build a shared object straight from a single .cc file,
+# compiled as position-independent code with the common CFLAGS.
+%.so: %.cc
+ ${CC} -shared -fPIC ${CFLAGS} $< -o $@
+
+# Remove objects in this directory and one level down, plus the built
+# programs.  Phony so that a stray file named `clean` cannot disable it.
+.PHONY: clean
+clean:
+ rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS}
+
+# Component bundles.  Despite the .o suffix, each of these is an ar(1)
+# archive of the corresponding *_OBJS list; the link rules above name them
+# as ordinary prerequisites.
+common.o: ${COMMON_OBJS}
+ ar -rc $@ $^
+
+ebofs.o: ${EBOFS_OBJS}
+ ar -rc $@ $^
+
+client.o: ${CLIENT_OBJS}
+ ar -rc $@ $^
+
+osd.o: ${OSD_OBJS}
+ ar -rc $@ $^
+
+osdc.o: ${OSDC_OBJS}
+ ar -rc $@ $^
+
+# NOTE(review): unlike its siblings this target is produced by a
+# compiler-driver invocation (no -c, no ar) and also names ../uofs/uofs.o
+# from the parent directory — confirm this rule still builds.
+osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o
+ ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.o
+
+mds.o: ${MDS_OBJS}
+ ar -rc $@ $^
+
+mon.o: ${MON_OBJS}
+ ar -rc $@ $^
+
+%.o: %.cc
+ ${CC} ${CFLAGS} -c $< -o $@
+
+%.po: %.cc
+ ${CC} -fPIC ${CFLAGS} -c $< -o $@
+
+# print the total line count of ${SRCS}, then the number of lines containing ';'
+count:
+ cat ${SRCS} | wc -l
+ cat ${SRCS} | grep -c \;
+
+# make sure an (empty) .depend exists so the 'include' below does not fail
+.depend:
+ touch .depend
+
+# regenerate .depend from scratch with makedepend; its stderr is discarded
+depend:
+ $(RM) .depend
+ makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null
+
+# now add a line to include the dependency list.
+include .depend
--- /dev/null
+Ceph - a scalable distributed file system
+-----------------------------------------
+
+Please see http://ceph.sourceforge.net/ for current info.
--- /dev/null
+
+- paxos for monitor
+- lnet?
+- crush
+ - xml import/export?
+ - crush tools
+
+== todo
+
+1- pipelining writes?
+2- intervening reads?
+
+inode ops
+ utime -- no concurrency issues
+ chown/chmod -- should lock
+ truncate -- should lock
+ 1-> no. multiple process concurrency on a single inode is not important.
+ 2-> maybe... intervening stats? probably not important.
+
+directory ops. parent inode mtime, + dirent xlocks?
+ mknod
+ open+create
+ symlink
+ unlink
+ rmdir
+ rename
+ 1-> yes. but mtime updates are independent (mtime monotonically increasing), so it's easy.
+ 2-> yes.
+
+--> so, let's make file/hard wrlock exclusive.
+
+locks
+ namespace
+ path pins -- read lock
+ dentry xlock -- write lock
+ inode
+ hard/file rd start/stop -- read lock
+ hard/file wr start/stop -- write lock
+
+
+
+
+- integrate revisions into ObjectCacher
+- clean up oid.rev vs op.rev in osd+osdc
+
+rados paper todo
+- better experiments
+- flush log only in response to subsequent read or write?
+- better behaving recovery
+- justify use of splay.
+ - dynamic replication
+- snapshots
+
+rados snapshots
+- attr.crev is rev we were created in.
+- oid.rev=0 is "live". defined for attr.crev <= rev.
+- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.)
+
+- write|delete is tagged with op.rev
+ - if attr.crev < op.rev
+ - we clone to oid.rev=rev (clone keeps old crev)
+ - change live attr.crev=rev.
+ - apply update
+- read is tagged with op.rev
+ - if 0, we read from 0 (if it exists).
+ - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev.
+
+- how to get usage feedback to monitor?
+
+- change messenger entity_inst_t
+ - no more rank! make it a uniquish nonce?
+
+- clean up mds caps release in exporter
+- figure out client failure modes
+- clean up messenger failure modes.
+- add connection retry.
+
+mds recovery
+- multiple passes?
+ 1- establish import/export map
+ ?-
+ 2- replay inode, dir, dentry updates
+- single pass
+ - each event needs to embed inode for trace up to the import
+ - second stage will reconcile cached items with other active mds nodes
+ - cached items will be shared with the primary to repopulate its non-dirty cache
+ - query clients for their state too?
+ - mds must journal list of clients with whom we share state?
+
+
+journaler
+- should we pad with zeros to avoid splitting individual entries?
+ - make it a g_conf flag?
+ - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes)
+- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes
+
+
+monitor
+?- monitor user lib that handles resending, redirection of mon requests.
+- elector
+/- organize monitor store
+
+osdmon
+- distribute
+- recovery: store elector epochs with maps..
+- monitor needs to monitor some osds...
+- monitor pgs, notify on out
+- watch osd utilization; adjust overload in cluster map
+
+mdsmon
+
+osd/rados
+- efficiently replicate clone() objects
+- pg_num instead of pg_bits
+- flag missing log entries on crash recovery --> WRNOOP? or WRLOST?
+- consider implications of nvram writeahead logs
+- fix heartbeat wrt new replication
+- mark residual pgs obsolete ???
+- rdlocks
+- optimize remove wrt recovery pushes
+- pg_bit/pg_num changes
+- report crashed pgs?
+
+messenger
+/- share same tcp socket for sender and receiver
+/- graceful connection teardown
+- close idle connections
+- generalize out a transport layer?
+ - eg reliable tcp for most things, connectionless unreliable datagrams for monitors?
+ - or, aggressive connection closing on monitors? or just max_connections and an lru?
+- osds: forget idle client addrs
+
+objecter
+
+objectcacher
+- ocacher caps transitions vs locks
+- test read locks
+
+reliability
+- heartbeat vs ping
+- osdmonitor, filter
+
+ebofs
+- verify proper behavior of conflicting/overlapping reads of clones
+- test(fix) sync()
+- combine inodes and/or cnodes into same blocks
+- allow btree sets instead of maps
+- eliminate nodepools
+- nonblocking write on missing onodes?
+- fix bug in node rotation on insert (and reenable)
+- fix NEAR_LAST_FWD (?)
+- journaling? in NVRAM?
+- metadata in nvram? flash?
+
+
+
+bugs/stability
+- figure out weird 40ms latency with double log entries
+
+
+general
+- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown
+- well, just figure out general timer cancellation strategy that avoids races
+ - use updated Timer as a model?
+
+
+remaining hard problems
+- how to cope with file size changes and read/write sharing
+- mds failure recovery (of course)
+
+
+crush
+- more efficient failure when all/too many osds are down
+- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set<int> >)
+
+
+mds
+- distributed client management
+- anchormgr
+ - 2pc
+ - independent journal
+ - distributed?
+- link count management
+ - also 2pc
+- chdir (directory opens!)
+- rewrite logstream
+ - clean up
+ - be smart about rados ack vs reread
+ - log locking? root log object
+ - trimming, rotation
+
+- efficient stat for single writers
+- lstat vs stat
+- add FILE_CAP_EXTEND capability bit
+- only share osdmap updates with clients holding capabilities
+- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?)
+- finish hard links!
+ - reclaim danglers from inode file on discover...
+ - fix rename wrt hard links
+- interactive hash/unhash interface
+- test hashed readdir
+- make logstream.flush align itself to stripes
+
+- carefully define/document frozen wrt dir_auth vs hashing
+
+
+
+client
+- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache
+
+- test client caps with meta exports
+- some heuristic behavior to consolidate caps to inode auth
+- client will re-tx anything it needed to say upon rx of new mds notification (?)
+
+
+
+
+
+
+MDS TODO
+- fix hashed readdir: should (optionally) do a lock on dir namespace?
+- fix hard links
+ - they mostly work, but they're fragile
+- sync clients on stat
+ - will need to ditch 10s client metadata caching before this is useful
+ - implement truncate
+- implement hashed directories
+- statfs?
+- rewrite journal + recovery
+- figure out online failure recovery
+- more distributed fh management?
+- btree directories (for efficient large directories)
+- consistency points/snapshots
+
+- fix MExportAck and others to use dir+dentry, not inode
+ (otherwise this all breaks with hard links.. altho it probably needs reworking already?)
+
+
+
+
+
+why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times.
+- mds -> 1,2 : qsync
+- client1 writes at byte 100
+- client1 -> mds : qsync reply (size=100)
+- client1 writes at byte 300
+- client1 -> client2 (outside channel)
+- client2 writes at byte 200
+- client2 -> mds : qsync reply (size=200)
+-> stat results in size 200, even though at no single point in time was the max size 200.
+-> for correct result, need to _stop_ client writers while gathering metadata.
+
+
+SAGE:
+
+- string table?
+
+- hard links
+ - fix MExportAck and others to use dir+dentry, not inode
+ (otherwise this all breaks with hard links.. altho it probably needs reworking already!)
+
+- do real permission checks?
+
+
+
+CLIENT TODO
+
+- statfs
+
+
+
+
+
+ISSUES
+
+
+- discover
+ - soft: authority selectively replicates, or sets a 'forward' flag in reply
+ - hard: authority always replicates (eg. discover for export)
+ - forward flag (see soft)
+ - error flag (if file not found, etc.)
+ - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply
+
+
+
+DOCUMENT
+- cache, distributed cache structure and invariants
+- export process
+- hash/unhash process
+
+
+TEST
+- hashing
+ - test hash/unhash operation
+ - hash+export: encode list of replicated dir inodes so they can be discovered before import is processed.
+ - test nauthitems (wrt hashing?)
+
+
+IMPLEMENT
+
+- smarter balancing
+ - popularity calculation and management is inconsistent/wrong.
+ - does it work?
+
+- dump active config in run output somewhere
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "msg/SimpleMessenger.h"
+
+#include "common/Timer.h"
+
+#ifndef DARWIN
+#include <envz.h>
+#endif // DARWIN
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+/**
+ * cfuse entry point.
+ *
+ * Parses the ceph config options out of argv (what is left is handed to
+ * fuse), reads the monitor map from ".ceph_monmap", starts the messenger,
+ * creates and mounts a Client, runs the fuse main loop, and then unmounts
+ * and shuts everything down again.
+ *
+ * @return 0 on a clean run, 1 if the monitor map could not be read.
+ */
+int main(int argc, char **argv, char *envp[]) {
+
+  //cerr << "cfuse starting " << myrank << "/" << world << endl;
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // args for fuse
+  vec_to_argv(args, argc, argv);
+
+  // load monmap
+  // A missing/unreadable monmap is a runtime condition, not a programming
+  // error: report it explicitly instead of relying on assert(), which is
+  // compiled out under NDEBUG.
+  MonMap monmap;
+  const int r = monmap.read(".ceph_monmap");
+  if (r < 0) {
+    cerr << "cfuse: unable to read monmap from .ceph_monmap (error " << r << ")" << endl;
+    return 1;
+  }
+
+  // start up network
+  rank.start_rank();
+
+  // start client
+  Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap);
+  client->init();
+
+  // start up fuse
+  // use my argc, argv (make sure you pass a mount point!)
+  cout << "mounting" << endl;
+  client->mount();
+
+  cerr << "starting fuse on pid " << getpid() << endl;
+  ceph_fuse_main(client, argc, argv);
+  cerr << "fuse finished on pid " << getpid() << endl;
+
+  client->unmount();
+  cout << "unmounted" << endl;
+  client->shutdown();
+
+  delete client;
+
+  // wait for messenger to finish
+  rank.wait();
+
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+// unix-ey fs stuff
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <utime.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef DARWIN
+#include <sys/statvfs.h>
+#endif // DARWIN
+
+
+#include <iostream>
+using namespace std;
+
+
+// ceph stuff
+#include "Client.h"
+
+
+#include "messages/MClientBoot.h"
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MMDSGetMap.h"
+#include "messages/MMDSMap.h"
+
+#include "osdc/Filer.h"
+#include "osdc/Objecter.h"
+#include "osdc/ObjectCacher.h"
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Logger.h"
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." << pthread_self() << " "
+
+#define tout if (g_conf.client_trace) cout << "trace: "
+
+
+// static logger
+LogType client_logtype;
+Logger *client_logger = 0;
+
+
+
+// Completion context: when finished, calls Client::close_release() on the
+// inode that was captured at construction time. The result code is ignored.
+class C_Client_CloseRelease : public Context {
+public:
+  C_Client_CloseRelease(Client *owner, Inode *target)
+    : client(owner), inode(target) {}
+
+  void finish(int /*result*/) { client->close_release(inode); }
+
+private:
+  Client *client;
+  Inode *inode;
+};
+
+// Completion context: when finished, calls Client::close_safe() on the
+// inode that was captured at construction time. The result code is ignored.
+class C_Client_CloseSafe : public Context {
+public:
+  C_Client_CloseSafe(Client *owner, Inode *target)
+    : client(owner), inode(target) {}
+
+  void finish(int /*result*/) { client->close_safe(inode); }
+
+private:
+  Client *client;
+  Inode *inode;
+};
+
+
+
+
+
+
+// cons/des
+
+/*
+ * Construct a client on top of messenger m, using monitor map mm.
+ *
+ * The client starts out unmounted with no mds map and no root inode; it
+ * registers itself as the messenger's dispatcher and builds its osd-side
+ * helpers (blank OSDMap, Objecter, ObjectCacher, Filer).
+ */
+Client::Client(Messenger *m, MonMap *mm)
+{
+  // identity: our client number is taken from the messenger's address
+  messenger = m;
+  whoami = m->get_myaddr().num();
+  monmap = mm;
+
+  // mount state
+  mounted = false;
+  unmounting = false;
+
+  // nothing known about the mds cluster or the namespace yet
+  mdsmap = 0;
+  root = 0;
+
+  // request / write bookkeeping
+  last_tid = 0;
+  unsafe_sync_write = 0;
+
+  set_cache_size(g_conf.client_cache_size);
+
+  // file handles
+  free_fh_set.insert(10, 1<<30);
+
+  // incoming messages are delivered to us
+  messenger->set_dispatcher(this);
+
+  // osd interfaces
+  osdmap = new OSDMap();     // initially blank.. see mount()
+  objecter = new Objecter(messenger, monmap, osdmap);
+  objectcacher = new ObjectCacher(objecter, client_lock);
+  filer = new Filer(objecter);
+}
+
+
+/*
+ * Destroy the client.
+ *
+ * The metadata cache is torn down first: inodes are constructed with a
+ * pointer to objectcacher (see insert_inode/insert_trace), so they must not
+ * outlive it. The helper objects are then released dependents-first
+ * (filer -> objectcacher -> objecter -> osdmap), and the messenger, which the
+ * objecter was constructed with, goes last.
+ *
+ * NOTE(review): mdsmap is allocated in handle_mds_map() and is not released
+ * here; confirm who owns it before adding a delete.
+ */
+Client::~Client()
+{
+  tear_down_cache();
+
+  if (filer) { delete filer; filer = 0; }
+  if (objectcacher) { delete objectcacher; objectcacher = 0; }
+  if (objecter) { delete objecter; objecter = 0; }
+  if (osdmap) { delete osdmap; osdmap = 0; }
+  if (messenger) { delete messenger; messenger = 0; }
+}
+
+
+/*
+ * Forcibly empty the client cache: close every open file handle, trim the
+ * dentry lru down to nothing, and drop the root inode if it is the only
+ * inode left. Asserts that the lru and the inode map end up empty.
+ */
+void Client::tear_down_cache()
+{
+  // fh's
+  hash_map<fh_t, Fh*>::iterator p = fh_map.begin();
+  while (p != fh_map.end()) {
+    Fh *handle = p->second;
+    dout(1) << "tear_down_cache forcing close of fh " << p->first << " ino " << handle->inode->inode.ino << endl;
+    put_inode(handle->inode);
+    delete handle;
+    ++p;
+  }
+  fh_map.clear();
+
+  // caps!
+  // *** FIXME ***
+
+  // empty lru: with a maximum of 0 the trim pass expires everything it can
+  lru.lru_set_max(0);
+  trim_cache();
+  assert(lru.lru_get_size() == 0);
+
+  // close root ino
+  assert(inode_map.size() <= 1);
+  const bool only_root_left = root && inode_map.size() == 1;
+  if (only_root_left) {
+    delete root;
+    root = 0;
+    inode_map.clear();
+  }
+
+  assert(inode_map.empty());
+}
+
+
+
+// debug crapola
+
+/*
+ * Debug helper: print one inode and, recursively, everything linked below it.
+ *
+ * @param in   inode to dump
+ * @param did  set of inodes already dumped; 'in' is added to it. dump_cache()
+ *             relies on this set to skip inodes that were reached from the
+ *             root, so it must actually be populated here (it previously was
+ *             not, which made every reachable inode show up twice).
+ */
+void Client::dump_inode(Inode *in, set<Inode*>& did)
+{
+  // record the visit; skip inodes we have already printed
+  if (!did.insert(in).second) return;
+
+  dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl;
+
+  if (in->dir) {
+    dout(1) << " dir size " << in->dir->dentries.size() << endl;
+    //for (hash_map<const char*, Dentry*, hash<const char*>, eqstr>::iterator it = in->dir->dentries.begin();
+    for (hash_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
+         it != in->dir->dentries.end();
+         ++it) {
+      dout(1) << " dn " << it->first << " ref " << it->second->ref << endl;
+      dump_inode(it->second->inode, did);
+    }
+  }
+}
+
+/*
+ * Debug helper: dump the tree hanging off the root inode, then list every
+ * inode in inode_map that is not in the set filled in by that first pass.
+ */
+void Client::dump_cache()
+{
+  set<Inode*> seen;
+
+  if (root)
+    dump_inode(root, seen);
+
+  hash_map<inodeno_t, Inode*>::iterator p;
+  for (p = inode_map.begin(); p != inode_map.end(); p++) {
+    Inode *in = p->second;
+    if (seen.count(in) == 0) {
+      dout(1) << "dump_cache: inode " << p->first
+              << " ref " << in->ref
+              << " dir " << in->dir << endl;
+      if (in->dir) {
+        dout(1) << " dir size " << in->dir->dentries.size() << endl;
+      }
+    }
+  }
+
+}
+
+
+// Post-construction hook; currently does nothing.
+void Client::init() {
+
+}
+
+// Log the shutdown and shut down the messenger. Nothing else is released
+// here; the destructor deletes the owned objects.
+void Client::shutdown() {
+ dout(1) << "shutdown" << endl;
+ messenger->shutdown();
+}
+
+
+
+
+// ===================
+// metadata cache stuff
+
+/*
+ * Expire dentries from the lru until it is within its configured maximum.
+ * The loop also stops when the lru has nothing more to expire or when a pass
+ * did not change the lru size. If the lru ends up empty and the root is the
+ * only inode left, the root is dropped as well.
+ */
+void Client::trim_cache()
+{
+  for (unsigned prev = 0; lru.lru_get_size() != prev; ) {
+    prev = lru.lru_get_size();
+
+    // within limits?
+    if (lru.lru_get_size() <= lru.lru_get_max())
+      break;
+
+    // trim!
+    Dentry *victim = (Dentry*)lru.lru_expire();
+    if (victim == 0)
+      break;  // done
+
+    //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl;
+    unlink(victim);
+  }
+
+  // hose root?
+  const bool lru_empty = (lru.lru_get_size() == 0);
+  if (lru_empty && root && inode_map.size() == 1) {
+    delete root;
+    root = 0;
+    inode_map.clear();
+  }
+}
+
+/** insert_inode
+ *
+ * insert + link a single dentry + inode into the metadata cache.
+ *
+ * dir is the directory the dentry lives in, st is the inode stat received
+ * for it, and dname is the dentry name. Returns the cached Inode now linked
+ * under dir/dname (never null: the function asserts this).
+ *
+ * An existing dentry pointing at a different ino is unlinked; an inode that
+ * is already cached elsewhere is relinked (or linked, if it had no dentry);
+ * otherwise a new Inode is created. For an already-cached inode the stat is
+ * copied in, but locally tracked write size/mtime are kept when they are
+ * newer.
+ */
+Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname)
+{
+ // do we already have a dentry by this name?
+ Dentry *dn = NULL;
+ if (dir->dentries.count(dname))
+ dn = dir->dentries[dname];
+
+ dout(12) << "insert_inode " << dname << " ino " << st->inode.ino
+ << " size " << st->inode.size
+ << " mtime " << st->inode.mtime
+ << " hashed " << st->hashed
+ << endl;
+
+ if (dn) {
+ if (dn->inode->inode.ino == st->inode.ino) {
+ touch_dn(dn);
+ dout(12) << " had dentry " << dname
+ << " with correct ino " << dn->inode->inode.ino
+ << endl;
+ } else {
+ // name now refers to a different inode: drop the old dentry
+ dout(12) << " had dentry " << dname
+ << " with WRONG ino " << dn->inode->inode.ino
+ << endl;
+ unlink(dn);
+ dn = NULL;
+ }
+ }
+
+ if (!dn) {
+ // have inode linked elsewhere? -> unlink and relink!
+ if (inode_map.count(st->inode.ino)) {
+ Inode *in = inode_map[st->inode.ino];
+ assert(in);
+
+ if (in->dn) {
+ dout(12) << " had ino " << in->inode.ino
+ << " linked at wrong position, unlinking"
+ << endl;
+ dn = relink(in->dn, dir, dname);
+ } else {
+ // link
+ dout(12) << " had ino " << in->inode.ino
+ << " unlinked, linking" << endl;
+ dn = link(dir, dname, in);
+ }
+ }
+ }
+
+ if (!dn) {
+ // inode not cached at all: create it and link it in
+ Inode *in = new Inode(st->inode, objectcacher);
+ inode_map[st->inode.ino] = in;
+ dn = link(dir, dname, in);
+ dout(12) << " new dentry+node with ino " << st->inode.ino << endl;
+ } else {
+ // actually update info
+ dout(12) << " stat inode mask is " << st->inode.mask << endl;
+ dn->inode->inode = st->inode;
+
+ // ...but don't clobber our mtime, size!
+ // (the mask tested here is the one just copied in from st)
+ if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 &&
+ dn->inode->file_wr_size > dn->inode->inode.size)
+ dn->inode->inode.size = dn->inode->file_wr_size;
+ if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 &&
+ dn->inode->file_wr_mtime > dn->inode->inode.mtime)
+ dn->inode->inode.mtime = dn->inode->file_wr_mtime;
+ }
+
+ // OK, we found it!
+ assert(dn && dn->inode);
+
+ // or do we have newer size/mtime from writing?
+ if (dn->inode->file_caps() & CAP_FILE_WR) {
+ if (dn->inode->file_wr_size > dn->inode->inode.size)
+ dn->inode->inode.size = dn->inode->file_wr_size;
+ if (dn->inode->file_wr_mtime > dn->inode->inode.mtime)
+ dn->inode->inode.mtime = dn->inode->file_wr_mtime;
+ }
+
+ // symlink?
+ // keep a copy of the link target, allocating the string on first use
+ if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) {
+ if (!dn->inode->symlink)
+ dn->inode->symlink = new string;
+ *(dn->inode->symlink) = st->symlink;
+ }
+
+ return dn->inode;
+}
+
+/** update_inode_dist
+ *
+ * refresh the cached MDS placement info of one inode from a stat:
+ * dir_auth / hashed / replicated are always copied; the contact set is
+ * replaced only when the stat says its spec is defined.
+ */
+void Client::update_inode_dist(Inode *in, InodeStat *st)
+{
+  // dir info
+  in->dir_auth = st->dir_auth;
+  in->dir_hashed = st->hashed;
+  in->dir_replicated = st->replicated;
+
+  // dir replication
+  if (!st->spec_defined)
+    return;
+
+  const bool incoming_empty = st->dist.empty();
+  const bool current_empty = in->dir_contacts.empty();
+
+  if (incoming_empty && !current_empty) {
+    dout(9) << "lost dist spec for " << in->inode.ino
+            << " " << st->dist << endl;
+  }
+  if (!incoming_empty && current_empty) {
+    dout(9) << "got dist spec for " << in->inode.ino
+            << " " << st->dist << endl;
+  }
+
+  in->dir_contacts = st->dist;
+}
+
+
+/** insert_trace
+ *
+ * insert a trace from a MDS reply into the cache.
+ *
+ * The reply carries a list of inode stats and a list of dentry names. The
+ * first inode is treated as the root; each following inode is linked under
+ * the previous one using the next dentry name. Returns the inode for the
+ * last element of the trace (or the current root, possibly null, when the
+ * trace is empty).
+ */
+Inode* Client::insert_trace(MClientReply *reply)
+{
+ Inode *cur = root;
+ time_t now = time(NULL);
+
+ dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl;
+
+ // dentry names are consumed in step with the non-root inodes below
+ list<string>::const_iterator pdn = reply->get_trace_dn().begin();
+
+ for (list<InodeStat*>::const_iterator pin = reply->get_trace_in().begin();
+ pin != reply->get_trace_in().end();
+ ++pin) {
+
+ if (pin == reply->get_trace_in().begin()) {
+ // root
+ dout(10) << "insert_trace root" << endl;
+ if (!root) {
+ // create
+ cur = root = new Inode((*pin)->inode, objectcacher);
+ inode_map[root->inode.ino] = root;
+ }
+ } else {
+ // not root.
+ dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl;
+ Dir *dir = cur->open_dir();
+ cur = this->insert_inode(dir, *pin, *pdn);
+ ++pdn;
+
+ // move to top of lru!
+ if (cur->dn)
+ lru.lru_touch(cur->dn);
+ }
+
+ // update dist info
+ update_inode_dist(cur, *pin);
+
+ // set cache ttl
+ // (only when a stat ttl is configured; otherwise valid_until is left alone)
+ if (g_conf.client_cache_stat_ttl)
+ cur->valid_until = now + g_conf.client_cache_stat_ttl;
+ }
+
+ return cur;
+}
+
+
+
+
+/*
+ * Resolve path against the local metadata cache only, starting at the root.
+ *
+ * Returns the dentry of the final path segment, or NULL when there is no
+ * root, when a segment is missing from the cache, or when an intermediate
+ * inode is not a directory with an open Dir. A zero-depth path also yields
+ * NULL because no dentry is ever visited.
+ */
+Dentry *Client::lookup(filepath& path)
+{
+  dout(14) << "lookup " << path << endl;
+
+  if (!root) return NULL;
+
+  Inode *cur = root;
+  Dentry *dn = 0;
+
+  for (unsigned seg = 0; seg < path.depth(); seg++) {
+    dout(14) << " seg " << seg << " = " << path[seg] << endl;
+
+    // we can only descend through a directory whose Dir we have
+    const bool can_descend = (cur->inode.mode & INODE_MODE_DIR) && cur->dir;
+    if (!can_descend) {
+      return NULL;  // not a dir
+    }
+
+    Dir *dir = cur->dir;
+    if (dir->dentries.count(path[seg]) == 0) {
+      dout(14) << " dentry " << path[seg] << " dne" << endl;
+      return NULL;
+    }
+
+    dn = dir->dentries[path[seg]];
+    dout(14) << " hit dentry " << path[seg] << " inode is " << dn->inode << " valid_until " << dn->inode->valid_until << endl;
+
+    cur = dn->inode;
+    assert(cur);
+  }
+
+  if (dn) {
+    dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl;
+  }
+
+  return dn;
+}
+
+// -------
+
+/*
+ * Send a metadata request to an MDS and wait for the reply.
+ *
+ * Assigns the request a fresh tid, walks the cached namespace along the
+ * request's path to find the deepest directory we know about, picks an MDS
+ * from that, sends the request via sendrecv() and records latency in
+ * client_logger when one is set.
+ *
+ * req        request to send
+ * auth_best  if true, aim at the authority of the item/directory; if false,
+ *            pick a replica (hashed directories always use dentry_authority)
+ * use_mds    if >= 0, overrides the chosen MDS
+ *
+ * Returns the reply obtained from sendrecv().
+ */
+MClientReply *Client::make_request(MClientRequest *req,
+ bool auth_best,
+ int use_mds) // this param is icky, debug weirdness!
+{
+ // assign a unique tid
+ req->set_tid(++last_tid);
+
+ // find deepest known prefix
+ Inode *diri = root; // the deepest known containing dir
+ Inode *item = 0; // the actual item... if we know it
+ int missing_dn = -1; // which dn we miss on (if we miss)
+
+ unsigned depth = req->get_filepath().depth();
+ for (unsigned i=0; i<depth; i++) {
+ // dir?
+ if (diri && diri->inode.mode & INODE_MODE_DIR && diri->dir) {
+ Dir *dir = diri->dir;
+
+ // do we have the next dentry?
+ if (dir->dentries.count( req->get_filepath()[i] ) == 0) {
+ missing_dn = i; // no.
+ break;
+ }
+
+ dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl;
+
+ if (i == depth-1) { // last one!
+ item = dir->dentries[ req->get_filepath()[i] ]->inode;
+ break;
+ }
+
+ // continue..
+ diri = dir->dentries[ req->get_filepath()[i] ]->inode;
+ assert(diri);
+ } else {
+ // no root, not a directory, or directory contents not cached
+ missing_dn = i;
+ break;
+ }
+ }
+
+ // choose an mds
+ int mds = 0;
+ if (diri) {
+ if (auth_best) {
+ // pick the actual auth (as best we can)
+ if (item) {
+ mds = item->authority(mdsmap);
+ } else if (diri->dir_hashed && missing_dn >= 0) {
+ mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+ mdsmap);
+ } else {
+ mds = diri->authority(mdsmap);
+ }
+ } else {
+ // balance our traffic!
+ if (diri->dir_hashed && missing_dn >= 0)
+ mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
+ mdsmap);
+ else
+ mds = diri->pick_replica(mdsmap);
+ }
+ } else {
+ // no root info, pick a random MDS
+ // NOTE(review): dereferences mdsmap unconditionally -- assumes an mds map has already arrived
+ mds = rand() % mdsmap->get_num_mds();
+ }
+ dout(20) << "mds is " << mds << endl;
+
+ // force use of a particular mds?
+ if (use_mds >= 0) mds = use_mds;
+
+
+ // time the call
+ utime_t start = g_clock.now();
+
+ // classify the op for the latency counters below:
+ // these ops are counted under lrsum/lrnum, everything else under lwsum/lwnum
+ bool nojournal = false;
+ int op = req->get_op();
+ if (op == MDS_OP_STAT ||
+ op == MDS_OP_LSTAT ||
+ op == MDS_OP_READDIR ||
+ op == MDS_OP_OPEN ||
+ op == MDS_OP_RELEASE)
+ nojournal = true;
+
+ MClientReply *reply = sendrecv(req, mds);
+
+ if (client_logger) {
+ utime_t lat = g_clock.now();
+ lat -= start;
+ dout(20) << "lat " << lat << endl;
+ client_logger->finc("lsum",(double)lat);
+ client_logger->inc("lnum");
+
+ if (nojournal) {
+ client_logger->finc("lrsum",(double)lat);
+ client_logger->inc("lrnum");
+ } else {
+ client_logger->finc("lwsum",(double)lat);
+ client_logger->inc("lwnum");
+ }
+
+ // separate counters for stat and readdir
+ if (op == MDS_OP_STAT) {
+ client_logger->finc("lstatsum",(double)lat);
+ client_logger->inc("lstatnum");
+ }
+ else if (op == MDS_OP_READDIR) {
+ client_logger->finc("ldirsum",(double)lat);
+ client_logger->inc("ldirnum");
+ }
+
+ }
+
+ return reply;
+}
+
+
+/*
+ * Send req to the given mds and block until the matching reply arrives.
+ *
+ * Half of a two-way handshake with handle_client_reply(), keyed by tid:
+ * we publish a stack-allocated Cond in mds_rpc_cond and wait on it (with
+ * client_lock) until the dispatcher stores the reply in mds_rpc_reply; we
+ * then signal the dispatcher's Cond in mds_rpc_dispatch_cond and remove our
+ * map entries before returning, so no pointer to the local Cond survives
+ * this call.
+ */
+MClientReply* Client::sendrecv(MClientRequest *req, int mds)
+{
+ // NEW way.
+ Cond cond;
+ tid_t tid = req->get_tid();
+ mds_rpc_cond[tid] = &cond;
+
+ messenger->send_message(req, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_SERVER);
+
+ // wait
+ // (loop re-checks the reply map after every wakeup)
+ while (mds_rpc_reply.count(tid) == 0) {
+ dout(20) << "sendrecv awaiting reply kick on " << &cond << endl;
+ cond.Wait(client_lock);
+ }
+
+ // got it!
+ MClientReply *reply = mds_rpc_reply[tid];
+
+ // kick dispatcher (we've got it!)
+ assert(mds_rpc_dispatch_cond.count(tid));
+ mds_rpc_dispatch_cond[tid]->Signal();
+ dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl;
+
+ // clean up.
+ // erasing our mds_rpc_cond entry is what lets handle_client_reply() leave its wait loop
+ mds_rpc_cond.erase(tid);
+ mds_rpc_reply.erase(tid);
+
+ return reply;
+}
+
+/*
+ * Dispatcher side of the sendrecv() handshake.
+ *
+ * Stores the reply under its tid, signals the Cond that sendrecv() is
+ * waiting on, and then waits (on a local Cond published in
+ * mds_rpc_dispatch_cond) until sendrecv() has removed its mds_rpc_cond
+ * entry, i.e. until the caller has picked the reply up. The reply itself is
+ * not deleted here; it is returned to the caller of sendrecv().
+ */
+void Client::handle_client_reply(MClientReply *reply)
+{
+ tid_t tid = reply->get_tid();
+
+ // store reply
+ mds_rpc_reply[tid] = reply;
+
+ // wake up waiter
+ // (a reply for a tid nobody is waiting on trips this assert)
+ assert(mds_rpc_cond.count(tid));
+ dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl;
+ mds_rpc_cond[tid]->Signal();
+
+ // wake for kick back
+ assert(mds_rpc_dispatch_cond.count(tid) == 0);
+ Cond cond;
+ mds_rpc_dispatch_cond[tid] = &cond;
+ while (mds_rpc_cond.count(tid)) {
+ dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl;
+ cond.Wait(client_lock);
+ }
+
+ // ok, clean up!
+ mds_rpc_dispatch_cond.erase(tid);
+}
+
+
+// ------------------------
+// incoming messages
+
+/*
+ * Entry point for every incoming message.
+ *
+ * Takes client_lock for the duration of the call and routes the message by
+ * type to the objecter or to one of the client handlers. An unknown message
+ * type is reported and then asserts. While an unmount is in progress, each
+ * dispatched message is followed by a cache trim pass, and mount_cond is
+ * signalled once both the lru and the inode map are empty.
+ */
+void Client::dispatch(Message *m)
+{
+ client_lock.Lock();
+
+ switch (m->get_type()) {
+ // osd
+ case MSG_OSD_OPREPLY:
+ objecter->handle_osd_op_reply((MOSDOpReply*)m);
+ break;
+
+ case MSG_OSD_MAP:
+ objecter->handle_osd_map((class MOSDMap*)m);
+ break;
+
+ // client
+ case MSG_MDS_MAP:
+ handle_mds_map((MMDSMap*)m);
+ break;
+
+ // note: this handler waits on a condition until the requester has taken the reply
+ case MSG_CLIENT_REPLY:
+ handle_client_reply((MClientReply*)m);
+ break;
+
+ case MSG_CLIENT_FILECAPS:
+ handle_file_caps((MClientFileCaps*)m);
+ break;
+
+ case MSG_CLIENT_MOUNTACK:
+ handle_mount_ack((MClientMountAck*)m);
+ break;
+ case MSG_CLIENT_UNMOUNT:
+ handle_unmount_ack(m);
+ break;
+
+
+ default:
+ cout << "dispatch doesn't recognize message type " << m->get_type() << endl;
+ assert(0); // fail loudly
+ break;
+ }
+
+ // unmounting?
+ if (unmounting) {
+ dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size()
+ << "+" << inode_map.size() << endl;
+ trim_cache();
+ if (lru.lru_get_size() == 0 && inode_map.empty()) {
+ dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl;
+ mount_cond.Signal();
+ } else {
+ // not empty yet: show what is still pinning the cache
+ dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size()
+ << "+" << inode_map.size() << endl;
+ dump_cache();
+ }
+ }
+
+ client_lock.Unlock();
+}
+
+
+/*
+ * Handle an MDS map message.
+ *
+ * If we do not have a client number yet (whoami < 0), adopt the destination
+ * address the message was sent to. Then decode the newest (highest epoch)
+ * map carried by the message into mdsmap, allocating it on first use, and
+ * wake anyone waiting on mount_cond. The message is always deleted.
+ *
+ * A message that carries no maps is logged and dropped: dereferencing
+ * rbegin() of an empty container is undefined behaviour, and leaving a
+ * blank mdsmap behind would look like a valid map to later code.
+ */
+void Client::handle_mds_map(MMDSMap* m)
+{
+  if (whoami < 0) {
+    whoami = m->get_dest().num();
+    dout(1) << "handle_mds_map i am now " << m->get_dest() << endl;
+    messenger->reset_myaddr(m->get_dest());
+  }
+
+  if (m->maps.empty()) {
+    dout(1) << "handle_mds_map got a message with no maps, ignoring" << endl;
+    delete m;
+    return;
+  }
+
+  if (mdsmap == 0)
+    mdsmap = new MDSMap;
+
+  // maps are keyed by epoch, so the last entry is the newest
+  map<epoch_t, bufferlist>::reverse_iterator p = m->maps.rbegin();
+
+  dout(1) << "handle_mds_map epoch " << p->first << endl;
+  mdsmap->decode(p->second);
+
+  delete m;
+
+  mount_cond.Signal();  // mount might be waiting for this.
+}
+
+
+/****
+ * caps
+ */
+
+
+// Completion context: when finished, calls Client::implemented_caps() with
+// the caps message and inode captured at construction time. The result code
+// is ignored.
+class C_Client_ImplementedCaps : public Context {
+public:
+  C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i)
+    : client(c), msg(m), in(i) {}
+
+  void finish(int /*r*/) { client->implemented_caps(msg, in); }
+
+private:
+  Client *client;
+  MClientFileCaps *msg;
+  Inode *in;
+};
+
+/** handle_file_caps
+ * handle caps update from mds. including mds to mds caps transitions.
+ * do not block.
+ *
+ * Takes ownership of m. On every path the message is either queued
+ * (premature reap), sent back to the mds (release of unwanted caps, or ack
+ * of a revocation via implemented_caps), handed to a completion context, or
+ * deleted.
+ *
+ * Special messages:
+ *  - REAP:    caps moved to the sending mds from mds 'other'. If we have not
+ *             seen the matching STALE yet, the message is queued in
+ *             cap_reap_queue and replayed when the STALE arrives.
+ *  - STALE:   caps held from the sending mds move to the stale list.
+ *  - RELEASE: caps from the sending mds are dropped.
+ * Anything else is a regular grant/revoke update.
+ */
+void Client::handle_file_caps(MClientFileCaps *m)
+{
+  int mds = m->get_source().num();
+  Inode *in = 0;
+  if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ];
+
+  m->clear_payload();  // for if/when we send back to MDS
+
+  // reap?
+  if (m->get_special() == MClientFileCaps::FILECAP_REAP) {
+    int other = m->get_mds();
+
+    if (in && in->stale_caps.count(other)) {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl;
+
+      // fresh from new mds?
+      if (!in->caps.count(mds)) {
+        if (in->caps.empty()) in->get();
+        in->caps[mds].seq = m->get_seq();
+        in->caps[mds].caps = m->get_caps();
+      }
+
+      assert(in->stale_caps.count(other));
+      in->stale_caps.erase(other);
+      if (in->stale_caps.empty()) put_inode(in);  // note: this will never delete *in
+
+      // fall-thru!
+    } else {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl;
+      // delay!
+      // Key by the ino carried in the message: we also get here when the
+      // inode is not in inode_map at all, in which case 'in' is null.
+      cap_reap_queue[m->get_ino()][other] = m;
+      return;
+    }
+  }
+
+  assert(in);
+
+  // stale?
+  if (m->get_special() == MClientFileCaps::FILECAP_STALE) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl;
+
+    // move to stale list
+    assert(in->caps.count(mds));
+    if (in->stale_caps.empty()) in->get();
+    in->stale_caps[mds] = in->caps[mds];
+
+    in->caps.erase(mds);
+    if (in->caps.empty()) in->put();
+
+    // delayed reap?
+    if (cap_reap_queue.count(in->ino()) &&
+        cap_reap_queue[in->ino()].count(mds)) {
+      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl;
+
+      // process delayed reap
+      handle_file_caps( cap_reap_queue[in->ino()][mds] );
+
+      cap_reap_queue[in->ino()].erase(mds);
+      if (cap_reap_queue[in->ino()].empty())
+        cap_reap_queue.erase(in->ino());
+    }
+
+    // the stale notice itself is not forwarded anywhere; release it
+    delete m;
+    return;
+  }
+
+  // release?
+  if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl;
+    assert(in->caps.count(mds));
+    in->caps.erase(mds);
+    for (map<int,InodeCap>::iterator p = in->caps.begin();
+         p != in->caps.end();
+         ++p) {
+      dout(20) << " left cap " << p->first << " "
+               << cap_string(p->second.caps) << " "
+               << p->second.seq << endl;
+    }
+    for (map<int,InodeCap>::iterator p = in->stale_caps.begin();
+         p != in->stale_caps.end();
+         ++p) {
+      dout(20) << " left stale cap " << p->first << " "
+               << cap_string(p->second.caps) << " "
+               << p->second.seq << endl;
+    }
+
+    if (in->caps.empty()) {
+      //dout(0) << "did put_inode" << endl;
+      put_inode(in);
+    } else {
+      //dout(0) << "didn't put_inode" << endl;
+    }
+
+    // nothing is sent back for a release; release the message
+    delete m;
+    return;
+  }
+
+
+  // don't want?
+  if (in->file_caps_wanted() == 0) {
+    dout(5) << "handle_file_caps on ino " << m->get_ino()
+            << " seq " << m->get_seq()
+            << " " << cap_string(m->get_caps())
+            << ", which we don't want caps for, releasing." << endl;
+    m->set_caps(0);
+    m->set_wanted(0);
+    // the message is reused as the reply (ownership passes to the messenger)
+    messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port());
+    return;
+  }
+
+  assert(in->caps.count(mds));
+
+  // update per-mds caps
+  const int old_caps = in->caps[mds].caps;
+  const int new_caps = m->get_caps();
+  in->caps[mds].caps = new_caps;
+  in->caps[mds].seq = m->get_seq();
+  dout(5) << "handle_file_caps on in " << m->get_ino()
+          << " mds" << mds << " seq " << m->get_seq()
+          << " caps now " << cap_string(new_caps)
+          << " was " << cap_string(old_caps) << endl;
+
+  // did file size decrease?
+  if ((old_caps & new_caps & CAP_FILE_RDCACHE) &&
+      in->inode.size > m->get_inode().size) {
+    dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl;
+    // must have been a truncate() by someone.
+    // trim the buffer cache
+    // ***** fixme write me ****
+
+    in->file_wr_size = m->get_inode().size; //??
+  }
+
+  // update inode
+  in->inode = m->get_inode();  // might have updated size... FIXME this is overkill!
+
+  // preserve our (possibly newer) file size, mtime
+  // (also written into the message, which may be sent back as the ack)
+  if (in->file_wr_size > in->inode.size)
+    m->get_inode().size = in->inode.size = in->file_wr_size;
+  if (in->file_wr_mtime > in->inode.mtime)
+    m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime;
+
+  const bool revoking = (old_caps & ~new_caps) != 0;  // this mds is taking caps away
+
+  if (g_conf.client_oc) {
+    // caching on, use FileCache.
+    Context *onimplement = 0;
+    if (revoking) {
+      if (in->fc.get_caps() & ~(in->file_caps()))  // net revocation
+        onimplement = new C_Client_ImplementedCaps(this, m, in);
+      else {
+        implemented_caps(m, in);  // ack now.
+      }
+    }
+    in->fc.set_caps(new_caps, onimplement);
+
+    // pure grant: nobody acks, so nobody else will free the message
+    if (!revoking)
+      delete m;
+
+  } else {
+    // caching off.
+
+    // wake up waiters?
+    if (new_caps & CAP_FILE_RD) {
+      for (list<Cond*>::iterator it = in->waitfor_read.begin();
+           it != in->waitfor_read.end();
+           ++it) {
+        dout(5) << "signaling read waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_read.clear();
+    }
+    if (new_caps & CAP_FILE_WR) {
+      for (list<Cond*>::iterator it = in->waitfor_write.begin();
+           it != in->waitfor_write.end();
+           ++it) {
+        dout(5) << "signaling write waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_write.clear();
+    }
+    if (new_caps & CAP_FILE_LAZYIO) {
+      for (list<Cond*>::iterator it = in->waitfor_lazy.begin();
+           it != in->waitfor_lazy.end();
+           ++it) {
+        dout(5) << "signaling lazy waiter " << *it << endl;
+        (*it)->Signal();
+      }
+      in->waitfor_lazy.clear();
+    }
+
+    // ack?
+    if (revoking) {
+      if (in->sync_writes) {
+        // wait for sync writes to finish
+        dout(5) << "sync writes in progress, will ack on finish" << endl;
+        in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in));
+      } else {
+        // ok now
+        implemented_caps(m, in);
+      }
+    } else {
+      // discard
+      delete m;
+    }
+  }
+}
+
+ void Client::implemented_caps(MClientFileCaps *m, Inode *in)
+ {
+   // Ack a caps message by sending it straight back to its source.
+   dout(5) << "implemented_caps " << cap_string(m->get_caps()) << ", acking to " << m->get_source() << endl;
+
+   // no caps left on this inode: reset the locally tracked size/mtime
+   if (!in->file_caps()) {
+     in->file_wr_size = 0;
+     in->file_wr_mtime = 0;
+   }
+   messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port());
+ }
+
+
+ void Client::release_caps(Inode *in, int retain)
+ {
+   // Give back every cap bit that is not in 'retain', on every MDS we hold
+   // caps from for this inode, and send each of them the new state.
+   dout(5) << "releasing caps on ino " << in->inode.ino << dec
+           << " had " << cap_string(in->file_caps())
+           << " retaining " << cap_string(retain) << endl;
+
+   map<int,InodeCap>::iterator p = in->caps.begin();
+   while (p != in->caps.end()) {
+     const int mds = p->first;
+     InodeCap& cap = p->second;
+
+     // mask down what we hold.  this is unconditional: a message goes out
+     // even when no bit was actually dropped.
+     cap.caps = retain & cap.caps;
+
+     // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do
+     MClientFileCaps *m = new MClientFileCaps(in->inode, cap.seq, cap.caps, in->file_caps_wanted());
+     messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_LOCKER);
+     p++;
+   }
+
+   // nothing held anymore: reset the locally tracked write size/mtime
+   if (in->file_caps() == 0) {
+     in->file_wr_mtime = 0;
+     in->file_wr_size = 0;
+   }
+ }
+
+ void Client::update_caps_wanted(Inode *in)
+ {
+   // Re-announce our wanted caps to every MDS we hold caps from.
+   dout(5) << "updating caps wanted on ino " << in->inode.ino
+           << " to " << cap_string(in->file_caps_wanted()) << endl;
+
+   // FIXME: pick a single mds and let the others off the hook..
+   typedef map<int,InodeCap>::iterator cap_iter;
+   for (cap_iter p = in->caps.begin(); p != in->caps.end(); p++) {
+     const int mds = p->first;
+     InodeCap& held = p->second;
+
+     // held caps and seq are sent exactly as currently recorded
+     MClientFileCaps *m =
+       new MClientFileCaps(in->inode, held.seq, held.caps, in->file_caps_wanted());
+     messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_LOCKER);
+   }
+ }
+
+
+
+// -------------------
+// fs ops
+
+ int Client::mount()
+ {
+   // Boot against a monitor, wait for an MDS map, then send a mount request
+   // to mds0 and block until handle_mount_ack() sets 'mounted'.
+   // Always returns 0.
+   client_lock.Lock();
+
+   assert(!mounted);  // caller is confused?
+
+   // FIXME mds map update race with mount.
+
+   dout(2) << "sending boot msg to monitor" << endl;
+
+   // Drop any stale map AND clear the pointer.  A freed-but-non-null
+   // pointer would let the wait loop below fall straight through and
+   // mdsmap->get_inst() would then read freed memory.
+   delete mdsmap;
+   mdsmap = 0;
+
+   int mon = monmap->pick_mon();
+   messenger->send_message(new MClientBoot(),
+                           MSG_ADDR_MON(mon), monmap->get_inst(mon));
+
+   // NOTE(review): presumably the mds map handler (not visible here)
+   // installs mdsmap and signals mount_cond -- confirm.
+   while (!mdsmap)
+     mount_cond.Wait(client_lock);
+
+   dout(2) << "mounting" << endl;
+   MClientMount *m = new MClientMount();
+
+   int who = 0; // mdsmap->get_root(); // mount at root, for now
+   messenger->send_message(m,
+                           MSG_ADDR_MDS(who), mdsmap->get_inst(who),
+                           MDS_PORT_SERVER);
+
+   // handle_mount_ack() sets 'mounted' and signals mount_cond
+   while (!mounted)
+     mount_cond.Wait(client_lock);
+
+   client_lock.Unlock();
+   return 0;
+ }
+
+ void Client::handle_mount_ack(MClientMountAck *m)
+ {
+   // The ack carries encoded mds and osd map state: load both, mark us
+   // mounted, and wake whoever is waiting on mount_cond.  Frees m.
+   if (mdsmap == 0)
+     mdsmap = new MDSMap;   // allocate on first use
+   mdsmap->decode(m->get_mds_map_state());
+   osdmap->decode(m->get_osd_map_state());
+
+   dout(2) << "mounted" << endl;
+   mounted = true;
+   mount_cond.Signal();
+
+   delete m;
+ }
+
+
+ int Client::unmount()
+ { // Tear down the mount: empty the cache, give back caps, wait for everything to drain, then notify mds0. Always returns 0.
+ client_lock.Lock();
+ // Preconditions (both asserted below): we are mounted, and every file handle has been closed.
+ assert(mounted); // caller is confused?
+
+ dout(2) << "unmounting" << endl;
+ unmounting = true; // close_safe() signals mount_cond only while this flag is set
+
+ // NOTE: i'm assuming all caches are already flushing (because all files are closed).
+ assert(fh_map.empty());
+
+ // empty lru cache
+ lru.lru_set_max(0);
+ trim_cache();
+
+ if (g_conf.client_oc) {
+ // release any/all caps
+ for (hash_map<inodeno_t, Inode*>::iterator p = inode_map.begin();
+ p != inode_map.end();
+ p++) {
+ Inode *in = p->second;
+ if (!in->caps.empty()) {
+ in->fc.release_clean();
+ if (in->fc.is_dirty()) { // still dirty after release_clean(): empty the file cache, with a C_Client_CloseRelease as completion
+ dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl;
+ in->fc.empty(new C_Client_CloseRelease(this, in));
+ } else {
+ dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl;
+ release_caps(in); // retain argument left at its declared default (declaration not visible here)
+ }
+ }
+ }
+ }
+ // Block until both the lru and inode_map are empty; each wakeup arrives via mount_cond.
+ while (lru.lru_get_size() > 0 ||
+ !inode_map.empty()) {
+ dout(2) << "cache still has " << lru.lru_get_size()
+ << "+" << inode_map.size() << " items"
+ << ", waiting (presumably for safe or for caps to be released?)"
+ << endl;
+ dump_cache();
+ mount_cond.Wait(client_lock);
+ }
+ assert(lru.lru_get_size() == 0);
+ assert(inode_map.empty());
+
+ // unsafe writes
+ if (!g_conf.client_oc) { // only consulted when the object cache is off
+ while (unsafe_sync_write > 0) {
+ dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
+ << endl;
+ mount_cond.Wait(client_lock);
+ }
+ }
+
+ // send unmount!
+ Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT);
+ messenger->send_message(req, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER);
+ // handle_unmount_ack() clears 'mounted' and signals mount_cond
+ while (mounted)
+ mount_cond.Wait(client_lock);
+
+ dout(2) << "unmounted" << endl;
+
+ client_lock.Unlock();
+ return 0;
+ }
+
+ void Client::handle_unmount_ack(Message* m)
+ {
+   dout(1) << "got unmount ack" << endl;
+   mounted = false;       // unmount() loops on this flag...
+   mount_cond.Signal();   // ...while waiting on this condition
+   delete m;              // the ack message is freed here
+ }
+
+
+
+// namespace ops
+
+ int Client::link(const char *existing, const char *newname)
+ {
+   // Hard-link 'newname' to 'existing'.  Returns the result code of the MDS reply.
+   client_lock.Lock();
+   // Run both names through mkabspath() like the sibling namespace ops
+   // (symlink, rename, unlink...) do, instead of forwarding them verbatim.
+   string abs_existing, abs_newname;
+   mkabspath(existing, abs_existing);
+   mkabspath(newname, abs_newname);
+
+   dout(3) << "op: client->link(\"" << abs_existing.c_str() << "\", \"" << abs_newname.c_str() << "\");" << endl;
+   tout << "link" << endl;
+   tout << abs_existing.c_str() << endl;
+   tout << abs_newname.c_str() << endl;
+
+   // main path arg is new link name; sarg is target (existing file)
+   MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami);
+   req->set_path(abs_newname.c_str());
+   req->set_sarg(abs_existing.c_str());
+   // FIXME where does FUSE maintain user information
+   req->set_caller_uid(getuid());
+   req->set_caller_gid(getgid());
+
+   MClientReply *reply = make_request(req, true);
+   int res = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "link result is " << res << endl;
+   trim_cache();
+   client_lock.Unlock();
+   return res;
+ }
+
+
+ int Client::unlink(const char *relpath)
+ {
+   // Remove the name 'relpath'.  On success the dentry is also dropped from
+   // the local cache.  Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string abspath;
+   mkabspath(relpath, abspath);
+   const char *path = abspath.c_str();
+
+   dout(3) << "op: client->unlink(\"" << path << "\");" << endl;
+   tout << "unlink" << endl;
+   tout << path << endl;
+
+   MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami);
+   req->set_path(path);
+
+   // FIXME where does FUSE maintain user information
+   req->set_caller_uid(getuid());
+   req->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+   MClientReply *reply = make_request(req, true);
+   int res = reply->get_result();
+   if (res == 0) {
+     // remove from local cache (nothing to do if it was never cached)
+     filepath fp(path);
+     Dentry *dn = lookup(fp);
+     if (dn) {
+       assert(dn->inode);
+       unlink(dn);
+     }
+   }
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "unlink result is " << res << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return res;
+ }
+
+ int Client::rename(const char *relfrom, const char *relto)
+ {
+   // Ask the MDS to rename 'relfrom' to 'relto'.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   // run both names through mkabspath()
+   string src_abs, dst_abs;
+   mkabspath(relfrom, src_abs);
+   mkabspath(relto, dst_abs);
+   const char *src = src_abs.c_str();
+   const char *dst = dst_abs.c_str();
+
+   dout(3) << "op: client->rename(\"" << src << "\", \"" << dst << "\");" << endl;
+   tout << "rename" << endl << src << endl << dst << endl;
+
+   // path carries the old name, sarg the new one
+   MClientRequest *request = new MClientRequest(MDS_OP_RENAME, whoami);
+   request->set_path(src);
+   request->set_sarg(dst);
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "rename result is " << result << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+// dirs
+
+ int Client::mkdir(const char *relpath, mode_t mode)
+ {
+   // Create directory 'relpath' with the given mode.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl;
+   tout << "mkdir" << endl << path << endl << mode << endl;
+
+   // the mode travels in the request's iarg
+   MClientRequest *request = new MClientRequest(MDS_OP_MKDIR, whoami);
+   request->set_path(path);
+   request->set_iarg( (int)mode );
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+
+   dout(10) << "mkdir result is " << result << endl;
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+ int Client::rmdir(const char *relpath)
+ {
+   // Remove directory 'relpath'; on success also drop it from the local
+   // cache.  Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->rmdir(\"" << path << "\");" << endl;
+   tout << "rmdir" << endl << path << endl;
+
+   MClientRequest *request = new MClientRequest(MDS_OP_RMDIR, whoami);
+   request->set_path(path);
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   if (result == 0) {
+     // forget the directory locally, if we had it cached
+     filepath fp(path);
+     Dentry *dn = lookup(fp);
+     if (dn != 0) {
+       // FIXME: maybe i should proactively hose the whole subtree from cache?
+       if (dn->inode->dir && dn->inode->dir->is_empty())
+         close_dir(dn->inode->dir);
+       unlink(dn);
+     }
+   }
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "rmdir result is " << result << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+// symlinks
+
+ int Client::symlink(const char *reltarget, const char *rellink)
+ {
+   // Create symlink 'rellink' pointing at 'reltarget'.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   // run both names through mkabspath() (target first, as before)
+   string target_abs, link_abs;
+   mkabspath(reltarget, target_abs);
+   mkabspath(rellink, link_abs);
+   const char *target = target_abs.c_str();
+   const char *linkpath = link_abs.c_str();
+
+   dout(3) << "op: client->symlink(\"" << target << "\", \"" << linkpath << "\");" << endl;
+   tout << "symlink" << endl << target << endl << linkpath << endl;
+
+   // path carries the new link's name, sarg what it points to
+   MClientRequest *request = new MClientRequest(MDS_OP_SYMLINK, whoami);
+   request->set_path(linkpath);
+   request->set_sarg(target);
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);  //FIXME assuming trace of link, not of target
+   delete reply;
+   dout(10) << "symlink result is " << result << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+ int Client::readlink(const char *relpath, char *buf, off_t size)
+ {
+   // Copy the target of symlink 'relpath' into buf: at most 'size' bytes, no
+   // terminating NUL.  Returns the byte count, or lstat()'s nonzero result.
+   client_lock.Lock();
+   string abspath;
+   mkabspath(relpath, abspath);
+   const char *path = abspath.c_str();
+   dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl;
+   tout << "readlink" << endl;
+   tout << path << endl;
+   client_lock.Unlock();  // lstat() takes the lock itself
+   // stat first (FIXME, PERF access cache directly) ****
+   struct stat stbuf;
+   int r = this->lstat(path, &stbuf);
+   if (r != 0) return r;
+   client_lock.Lock();
+   // pull symlink content from cache.  find(), not operator[]: a miss must
+   // not plant a null entry in inode_map (unmount waits for it to empty).
+   hash_map<inodeno_t, Inode*>::iterator p = inode_map.find(stbuf.st_ino);
+   assert(p != inode_map.end() && p->second);  // i just did a stat
+   Inode *in = p->second;
+   assert(in->symlink);  // not a symlink.  NOTE(review): should become an EINVAL-style error return
+
+   // copy into buf (at most size bytes; a negative size copies nothing)
+   off_t res = in->symlink->length();
+   if (res > size) res = size;
+   if (res < 0) res = 0;
+   memcpy(buf, in->symlink->c_str(), res);
+   trim_cache();
+   client_lock.Unlock();
+   return res;  // return length in bytes (to mimic the system call)
+ }
+
+
+
+// inode stuff
+
+ int Client::_lstat(const char *path, int mask, Inode **in)
+ {
+   // Find the inode for 'path': from the local cache when its stat info is
+   // complete and not yet expired, otherwise by sending an LSTAT request
+   // carrying 'mask' to the MDS.
+   //
+   // Returns 0 with *in set on success; otherwise the reply's result with
+   // *in set to 0.  Does no locking of its own: lstat() and lstatlite()
+   // call it with client_lock held.
+   filepath fpath(path);
+
+   // check whether cache content is fresh enough (always against the full stat mask)
+   Dentry *dn = lookup(fpath);
+   time_t now = time(NULL);
+   if (dn &&
+       now <= dn->inode->valid_until &&
+       ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) {
+     dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl;
+
+     if (g_conf.client_cache_stat_ttl == 0)
+       dn->inode->valid_until = 0;  // only one stat allowed after each readdir
+
+     *in = dn->inode;
+     return 0;
+   }
+
+   // cache miss (or stale/incomplete entry): ask the MDS
+
+   // FIXME where does FUSE maintain user information
+   //struct fuse_context *fc = fuse_get_context();
+   //req->set_caller_uid(fc->uid);
+   //req->set_caller_gid(fc->gid);
+
+   MClientRequest *req = new MClientRequest(MDS_OP_LSTAT, whoami);
+   req->set_iarg(mask);
+   req->set_path(fpath);
+
+   MClientReply *reply = make_request(req);
+   int res = reply->get_result();
+   dout(10) << "lstat res is " << res << endl;
+
+   // update the metadata cache on success; hand back no inode otherwise
+   if (res == 0)
+     *in = insert_trace(reply);
+   else
+     *in = 0;  // not a success.
+
+   delete reply;
+   return res;
+ }
+
+
+ void Client::fill_stat(inode_t& inode, struct stat *st)
+ {
+   memset(st, 0, sizeof(*st));  // zero everything; fields not assigned below stay 0
+   st->st_ino = inode.ino;
+   st->st_mode = inode.mode;
+   st->st_nlink = inode.nlink;
+   st->st_uid = inode.uid;
+   st->st_gid = inode.gid;
+   st->st_ctime = inode.ctime;
+   st->st_atime = inode.atime;
+   st->st_mtime = inode.mtime;
+   st->st_size = inode.size;
+   st->st_blksize = 4096;
+   st->st_blocks = (inode.size + 511) / 512;  // st_blocks counts 512-byte units, not st_blksize units
+ }
+
+ void Client::fill_statlite(inode_t& inode, struct statlite *st)
+ {
+   // Zero *st, then fill it from 'inode'.  Size by *st: it is a statlite, not a struct stat.
+   memset(st, 0, sizeof(*st));
+   st->st_ino = inode.ino;
+   st->st_mode = inode.mode;
+   st->st_nlink = inode.nlink;
+   st->st_uid = inode.uid;
+   st->st_gid = inode.gid;
+ #ifndef DARWIN
+   // FIXME what's going on here with darwin?
+   st->st_ctime = inode.ctime;
+   st->st_atime = inode.atime;
+   st->st_mtime = inode.mtime;
+ #endif
+   st->st_size = inode.size;
+   st->st_blksize = 4096;
+   st->st_blocks = (inode.size + 511) / 512;  // 512-byte units, not st_blksize units
+   /*
+   S_REQUIREBLKSIZE(st->st_litemask);
+   if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask);
+   if (inode.mask & INODE_MASK_SIZE) {
+     S_REQUIRESIZE(st->st_litemask);
+     S_REQUIREBLOCKS(st->st_litemask);
+   }
+   if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask);
+   if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask);
+   */
+ }
+
+
+ int Client::lstat(const char *relpath, struct stat *stbuf)
+ {
+   // Stat 'relpath' through _lstat(), asking for the full stat mask, and
+   // fill *stbuf on success.  Returns whatever _lstat() returned.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl;
+   tout << "lstat" << endl << path << endl;
+
+   Inode *found = 0;
+   const int result = _lstat(path, INODE_MASK_ALL_STAT, &found);
+   if (result == 0) {
+     assert(found);
+     fill_stat(found->inode, stbuf);
+     dout(10) << "stat sez size = " << found->inode.size << " ino = " << stbuf->st_ino << endl;
+   }
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+
+ int Client::lstatlite(const char *relpath, struct statlite *stl)
+ {
+   // Like lstat(), but only requests the inode fields that the caller's
+   // stl->st_litemask asks for.  Returns whatever _lstat() returned.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl;
+   tout << "lstatlite" << endl << path << endl;
+
+   // make mask: base + perm always, the rest on request
+   const bool want_size = S_ISVALIDSIZE(stl->st_litemask) || S_ISVALIDBLOCKS(stl->st_litemask);
+   int mask = INODE_MASK_BASE | INODE_MASK_PERM;
+   if (want_size) mask |= INODE_MASK_SIZE;
+   if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_MTIME;
+   if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME;
+
+   Inode *found = 0;
+   const int result = _lstat(path, mask, &found);
+   if (result == 0) {
+     fill_statlite(found->inode, stl);
+     dout(10) << "stat sez size = " << found->inode.size << " ino = " << found->inode.ino << endl;
+   }
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+
+
+ int Client::chmod(const char *relpath, mode_t mode)
+ {
+   // Ask the MDS to set the mode of 'relpath'.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl;
+   tout << "chmod" << endl << path << endl << mode << endl;
+
+   // the new mode travels in the request's iarg
+   MClientRequest *request = new MClientRequest(MDS_OP_CHMOD, whoami);
+   request->set_path(path);
+   request->set_iarg( (int)mode );
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "chmod result is " << result << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+ int Client::chown(const char *relpath, uid_t uid, gid_t gid)
+ {
+   // Ask the MDS to set owner and group of 'relpath'.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl;
+   tout << "chown" << endl << path << endl;
+   tout << uid << endl << gid << endl;
+
+   // uid goes in iarg, gid in iarg2
+   MClientRequest *request = new MClientRequest(MDS_OP_CHOWN, whoami);
+   request->set_path(path);
+   request->set_iarg( (int)uid );
+   request->set_iarg2( (int)gid );
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "chown result is " << result << endl;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+ int Client::utime(const char *relpath, struct utimbuf *buf)
+ {
+   // Set access/modification times of 'relpath' from *buf.  A null buf means
+   // "now" for both, as with utime(2).  Returns the result code of the MDS reply.
+   client_lock.Lock();
+   string abspath;
+   mkabspath(relpath, abspath);
+   const char *path = abspath.c_str();
+
+   struct utimbuf now_buf;
+   if (!buf) {
+     now_buf.actime = now_buf.modtime = time(NULL);
+     buf = &now_buf;
+   }
+
+   dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl;
+   dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl;
+   tout << "utime" << endl << path << endl << buf->actime << endl << buf->modtime << endl;
+
+   MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami);
+   req->set_path(path);
+   req->set_targ( buf->modtime );
+   req->set_targ2( buf->actime );
+
+   // FIXME where does FUSE maintain user information (and enforce caller uid rights?)
+   req->set_caller_uid(getuid());
+   req->set_caller_gid(getgid());
+
+   MClientReply *reply = make_request(req, true);
+   int res = reply->get_result();
+   insert_trace(reply);
+   delete reply;
+   dout(10) << "utime result is " << res << endl;
+   trim_cache();
+   client_lock.Unlock();
+   return res;
+ }
+
+
+
+ int Client::mknod(const char *relpath, mode_t mode)
+ {
+   // Ask the MDS to create node 'relpath' with the given mode.
+   // Returns the result code of the MDS reply.
+   client_lock.Lock();
+
+   string full;
+   mkabspath(relpath, full);
+   const char *path = full.c_str();
+
+   dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl;
+   tout << "mknod" << endl << path << endl;
+   tout << mode << endl;
+
+   // the mode travels in the request's iarg
+   MClientRequest *request = new MClientRequest(MDS_OP_MKNOD, whoami);
+   request->set_path(path);
+   request->set_iarg( mode );
+
+   // FIXME where does FUSE maintain user information
+   request->set_caller_uid(getuid());
+   request->set_caller_gid(getgid());
+
+   //FIXME enforce caller uid rights?
+
+   MClientReply *reply = make_request(request, true);
+   const int result = reply->get_result();
+   insert_trace(reply);
+
+   dout(10) << "mknod result is " << result << endl;
+   delete reply;
+
+   trim_cache();
+   client_lock.Unlock();
+   return result;
+ }
+
+
+
+
+ // readdir usually includes inode info for each entry, except for locked entries
+
+//
+// getdir
+
+// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino);
+
+ int Client::getdir(const char *relpath, map<string,inode_t>& contents)
+ { // Read directory 'relpath' from the MDS, cache every returned entry, and copy each entry's inode into 'contents'.
+ client_lock.Lock();
+ // Returns the number of entries in the reply on success, otherwise the reply's nonzero result.
+ string abspath;
+ mkabspath(relpath, abspath);
+ const char *path = abspath.c_str();
+
+ dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl;
+ tout << "getdir" << endl;
+ tout << path << endl;
+
+
+ MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami);
+ req->set_path(path);
+
+ // FIXME where does FUSE maintain user information
+ req->set_caller_uid(getuid());
+ req->set_caller_gid(getgid());
+
+ //FIXME enforce caller uid rights?
+
+ MClientReply *reply = make_request(req, true);
+ int res = reply->get_result();
+ insert_trace(reply);
+
+ if (res == 0) {
+ // res is 0 here and doubles as the entry counter from now on
+ // dir contents to cache!
+ inodeno_t ino = reply->get_ino();
+ Inode *diri = inode_map[ ino ]; // NOTE(review): operator[] would insert a null entry on a miss; the assert below expects a hit
+ assert(diri);
+ assert(diri->inode.mode & INODE_MODE_DIR);
+
+ if (!reply->get_dir_in().empty()) {
+ // only open dir if we're actually adding stuff to it!
+ Dir *dir = diri->open_dir();
+ assert(dir);
+ time_t now = time(NULL);
+ // the reply's inode list and name list are walked in lockstep (pin / pdn)
+ list<string>::const_iterator pdn = reply->get_dir_dn().begin();
+ for (list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
+ pin != reply->get_dir_in().end();
+ ++pin, ++pdn) {
+ // count entries
+ res++;
+
+ // put in cache
+ Inode *in = this->insert_inode(dir, *pin, *pdn);
+ // expiry: the stat ttl wins when set, else the readdir ttl; with neither, valid_until is left untouched
+ if (g_conf.client_cache_stat_ttl)
+ in->valid_until = now + g_conf.client_cache_stat_ttl;
+ else if (g_conf.client_cache_readdir_ttl)
+ in->valid_until = now + g_conf.client_cache_readdir_ttl;
+
+ // contents to caller too!
+ contents[*pdn] = in->inode;
+ }
+ }
+
+ // add .. too?
+ if (diri != root && diri->dn && diri->dn->dir) { // ".." is added to contents but not counted in res
+ Inode *parent = diri->dn->dir->parent_inode;
+ contents[".."] = parent->inode;
+ }
+
+ // FIXME: remove items in cache that weren't in my readdir?
+ // ***
+ }
+
+ delete reply; //fix thing above first
+ // note: unlike the sibling ops, no trim_cache() call before unlocking
+ client_lock.Unlock();
+ return res;
+ }
+
+
+/** POSIX stubs **/
+
+ DIR *Client::opendir(const char *name)
+ { // Snapshot directory 'name' via getdir(); the returned handle feeds readdir()/readdirplus().
+   DirResult *d = new DirResult;
+   d->path = name;  // readdirplus() builds "<path>/<entry>" from this when it has to lstat an entry
+   d->size = getdir(name, d->contents);  // NOTE: a getdir() failure is not reported; size then holds its nonzero result
+   d->p = d->contents.begin(); d->off = 0;
+   return (DIR*)d;
+ }
+
+ int Client::closedir(DIR *dir)
+ {
+   // 'dir' is really the DirResult that opendir() allocated; free it.
+   delete (DirResult*)dir;
+   return 0;  // always reports success
+ }
+
+//struct dirent {
+// ino_t d_ino; /* inode number */
+// off_t d_off; /* offset to the next dirent */
+// unsigned short d_reclen; /* length of this record */
+// unsigned char d_type; /* type of file */
+// char d_name[256]; /* filename */
+//};
+
+ struct dirent *Client::readdir(DIR *dirp)
+ {
+   // Next entry of an opendir() snapshot, or 0 at the end.  The dirent lives in the DirResult and is reused by every call.
+   DirResult *d = (DirResult*)dirp;
+   // end of dir?
+   if (d->p == d->contents.end())
+     return 0;
+
+   // fill the dirent
+   struct dirent& de = d->dp.d_dirent;
+   de.d_ino = d->p->second.ino;
+ #ifndef __CYGWIN__
+ #ifndef DARWIN
+   if (d->p->second.is_symlink())
+     de.d_type = DT_LNK;
+   else if (d->p->second.is_dir())
+     de.d_type = DT_DIR;
+   else if (d->p->second.is_file())
+     de.d_type = DT_REG;
+   else
+     de.d_type = DT_UNKNOWN;
+   de.d_off = d->off;
+   de.d_reclen = 1;  // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+ #endif // DARWIN
+ #endif
+   // strncpy() leaves the buffer unterminated when the name fills it, so
+   // copy one byte less and terminate explicitly (over-long names truncate).
+   strncpy(de.d_name, d->p->first.c_str(), 255);
+   de.d_name[255] = 0;
+
+   ++d->off;  // move up
+   ++d->p;
+   return &de;
+ }
+
+ void Client::rewinddir(DIR *dirp)
+ {
+   DirResult *result = (DirResult*)dirp;
+   result->off = 0;                        // position counter back to zero
+   result->p = result->contents.begin();   // cursor back to the first cached entry
+ }
+
+ off_t Client::telldir(DIR *dirp)
+ {
+   // the position counter that readdir()/seekdir()/rewinddir() maintain
+   return ((DirResult*)dirp)->off;
+ }
+
+ void Client::seekdir(DIR *dirp, off_t offset)
+ {
+   // Reposition so the next readdir() returns entry number 'offset'.  Offsets
+   // at or past the end park the cursor at end-of-directory; negative ones rewind.
+   DirResult *d = (DirResult*)dirp;
+   d->p = d->contents.begin();
+   d->off = 0;
+   // Bound the walk by the container itself rather than by d->size, which
+   // need not equal contents.size() (getdir may add "..", or may have failed).
+   for (; offset > 0 && d->p != d->contents.end(); --offset) {
+     ++d->p;
+     ++d->off;
+   }
+ }
+
+ struct dirent_plus *Client::readdirplus(DIR *dirp)
+ {
+   // Like readdir(), but the returned record also carries a struct stat for
+   // the entry (d_stat) and the outcome of obtaining it (d_stat_err).
+   DirResult *d = (DirResult*)dirp;
+   // end of dir?
+   if (d->p == d->contents.end())
+     return 0;
+
+   // fill the dirent
+   struct dirent& de = d->dp.d_dirent;
+   inode_t& entry = d->p->second;
+   de.d_ino = entry.ino;
+ #ifndef __CYGWIN__
+ #ifndef DARWIN
+   if (entry.is_symlink())
+     de.d_type = DT_LNK;
+   else if (entry.is_dir())
+     de.d_type = DT_DIR;
+   else if (entry.is_file())
+     de.d_type = DT_REG;
+   else
+     de.d_type = DT_UNKNOWN;
+   de.d_off = d->off;
+   de.d_reclen = 1;  // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+ #endif // DARWIN
+ #endif
+   // strncpy() does not terminate when the name fills the buffer: copy 255, terminate by hand
+   strncpy(de.d_name, d->p->first.c_str(), 255);
+   de.d_name[255] = 0;
+
+   // plus
+   if ((entry.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
+     fill_stat(entry, &d->dp.d_stat);  // have it
+     d->dp.d_stat_err = 0;
+   } else {
+     // don't have it, stat it
+     string path = d->path;
+     path += "/";
+     path += d->p->first;
+     d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat);
+   }
+
+   ++d->off;  // move up
+   ++d->p;
+   return &d->dp;
+ }
+
+/*
+struct dirent_lite *Client::readdirlite(DIR *dirp)
+{
+ DirResult *d = (DirResult*)dirp;
+
+ // end of dir?
+ if (d->p == d->contents.end())
+ return 0;
+
+ // fill the dirent
+ d->dp.d_dirent.d_ino = d->p->second.ino;
+ if (d->p->second.is_symlink())
+ d->dp.d_dirent.d_type = DT_LNK;
+ else if (d->p->second.is_dir())
+ d->dp.d_dirent.d_type = DT_DIR;
+ else if (d->p->second.is_file())
+ d->dp.d_dirent.d_type = DT_REG;
+ else
+ d->dp.d_dirent.d_type = DT_UNKNOWN;
+ strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
+
+ d->dp.d_dirent.d_off = d->off;
+ d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+
+ // plus
+ if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
+ // have it
+ fill_statlite(d->p->second,d->dp.d_stat);
+ d->dp.d_stat_err = 0;
+ } else {
+ // don't have it, stat it
+ string path = p->path;
+ path += "/";
+ path += p->first;
+ d->dp.d_statlite
+ d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite);
+ }
+
+ // move up
+ ++d->off;
+ ++d->p;
+
+ return &d->dp;
+}
+*/
+
+
+
+
+
+
+/****** file i/o **********/
+
+ int Client::open(const char *relpath, int flags)
+ { // Open 'relpath' with open(2)-style flags. Returns a new file handle on success, otherwise the reply's (negative) result.
+ client_lock.Lock();
+
+ string abspath;
+ mkabspath(relpath, abspath);
+ const char *path = abspath.c_str();
+
+ dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl;
+ tout << "open" << endl;
+ tout << path << endl;
+ tout << flags << endl;
+ // Map the flags to a client file mode. Every write-capable mode also sets tryauth, which is passed to make_request().
+ int cmode = 0;
+ bool tryauth = false;
+ if (flags & O_LAZY)
+ cmode = FILE_MODE_LAZY;
+ else if (flags & O_WRONLY) {
+ cmode = FILE_MODE_W;
+ tryauth = true;
+ } else if (flags & O_RDWR) {
+ cmode = FILE_MODE_RW;
+ tryauth = true;
+ } else if (flags & O_APPEND) {
+ cmode = FILE_MODE_W;
+ tryauth = true;
+ } else
+ cmode = FILE_MODE_R;
+
+ // go
+ MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami);
+ req->set_path(path);
+ req->set_iarg(flags);
+ req->set_iarg2(cmode);
+
+ // FIXME where does FUSE maintain user information
+ req->set_caller_uid(getuid());
+ req->set_caller_gid(getgid());
+
+ MClientReply *reply = make_request(req, tryauth); // try auth if writer
+
+ assert(reply);
+ dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl;
+ tout << reply->get_result() << endl;
+
+ insert_trace(reply);
+ int result = reply->get_result();
+
+ // success?
+ fh_t fh = 0;
+ if (result >= 0) {
+ // yay
+ Fh *f = new Fh;
+ f->mode = cmode;
+
+ // inode
+ f->inode = inode_map[reply->get_ino()];
+ assert(f->inode);
+ f->inode->get(); // one reference held by the Fh; close() ends with put_inode()
+ // per-mode open counters; close() decrements rd and wr
+ if (cmode & FILE_MODE_R) f->inode->num_open_rd++;
+ if (cmode & FILE_MODE_W) f->inode->num_open_wr++;
+ if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++;
+
+ // caps included?
+ int mds = reply->get_source().num();
+
+ if (f->inode->caps.empty()) {// first caps?
+ dout(7) << " first caps on " << f->inode->inode.ino << endl;
+ f->inode->get(); // a second reference, taken only when the inode had no caps before
+ }
+
+ int new_caps = reply->get_file_caps();
+ // note: caps[mds] default-creates the per-mds entry when this mds had not issued caps yet
+ assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq);
+ if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) {
+ dout(7) << "open got caps " << cap_string(new_caps)
+ << " for " << f->inode->ino()
+ << " seq " << reply->get_file_caps_seq()
+ << " from mds" << mds << endl;
+
+ int old_caps = f->inode->caps[mds].caps;
+ f->inode->caps[mds].caps = new_caps;
+ f->inode->caps[mds].seq = reply->get_file_caps_seq();
+
+ // we shouldn't ever lose caps at this point.
+ // actually, we might...?
+ assert((old_caps & ~f->inode->caps[mds].caps) == 0);
+
+ if (g_conf.client_oc)
+ f->inode->fc.set_caps(new_caps);
+
+ } else {
+ dout(7) << "open got SAME caps " << cap_string(new_caps)
+ << " for " << f->inode->ino()
+ << " seq " << reply->get_file_caps_seq()
+ << " from mds" << mds << endl;
+ }
+
+ // put in map
+ result = fh = get_fh(); // from here on the return value is the new handle, not the reply's result
+ assert(fh_map.count(fh) == 0);
+ fh_map[fh] = f;
+
+ dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl;
+ } else {
+ dout(0) << "open failure result " << result << endl;
+ }
+
+ delete reply;
+
+ trim_cache();
+ client_lock.Unlock();
+
+ return result;
+ }
+
+
+
+
+
+ void Client::close_release(Inode *in)
+ {
+   // Release the caps this inode no longer needs: write caps are kept while
+   // writers or dirty data remain, read caps while readers or cached data do.
+   dout(10) << "close_release on " << in->ino() << endl;
+   if (!in->num_open_rd)
+     in->fc.release_clean();
+
+   int retain = 0;
+   if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER;
+   if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE;  // readers keep the read caps
+   release_caps(in, retain); // release caps now.
+ }
+
+ void Client::close_safe(Inode *in)
+ {
+   dout(10) << "close_safe on " << in->ino() << endl;
+   put_inode(in);         // presumably the ref close() took when pinning "until safe" -- confirm
+   if (!unmounting) return;
+   mount_cond.Signal();   // unmount() waits on mount_cond for the cache to drain
+ }
+
+ int Client::close(fh_t fh)
+ {
+   // Close handle 'fh': undo open()'s per-mode counters, tell the MDS if our wanted
+   // caps changed, then release/flush according to what is still open.  Always returns 0.
+   client_lock.Lock();
+   dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
+   dout(3) << "op: open_files.erase( " << fh << " );" << endl;
+   tout << "close" << endl;
+   tout << fh << endl;
+
+   // get Fh, Inode
+   assert(fh_map.count(fh));
+   Fh *f = fh_map[fh];
+   Inode *in = f->inode;
+
+   // update inode rd/wr/lazy counts: the mirror image of what open() did.
+   // (open() bumps num_open_lazy too; without the decrement it only ever grew.)
+   int before = in->file_caps_wanted();
+   if (f->mode & FILE_MODE_R)
+     in->num_open_rd--;
+   if (f->mode & FILE_MODE_W)
+     in->num_open_wr--;
+   if (f->mode & FILE_MODE_LAZY)
+     in->num_open_lazy--;
+   int after = in->file_caps_wanted();
+
+   // does this change what caps we want?
+   if (before != after && after)
+     update_caps_wanted(in);
+
+   // hose fh
+   fh_map.erase(fh);
+   delete f;
+
+   // release caps right away?
+   dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl;
+
+   if (g_conf.client_oc) {
+     // caching on.
+     if (in->num_open_rd == 0 && in->num_open_wr == 0)
+       in->fc.empty(new C_Client_CloseRelease(this, in));
+     else if (in->num_open_rd == 0) {
+       in->fc.release_clean();
+       close_release(in);
+     } else if (in->num_open_wr == 0)
+       in->fc.flush_dirty(new C_Client_CloseRelease(this,in));
+
+     // pin until safe?
+     if (in->num_open_wr == 0 && !in->fc.all_safe()) {
+       dout(10) << "pinning ino " << in->ino() << " until safe" << endl;
+       in->get();
+       in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in));
+     }
+   } else {
+     // caching off.
+     if (in->num_open_rd == 0 && in->num_open_wr == 0) {
+       dout(10) << " releasing caps on " << in->ino() << endl;
+       release_caps(in); // release caps now.
+     }
+   }
+
+   put_inode( in );
+   client_lock.Unlock();
+   return 0;
+ }
+
+
+
+// ------------
+// read, write
+
+// blocking osd interface
+
+/*
+ * Blocking read of up to `size` bytes from open handle `fh` into `buf`.
+ *
+ *  fh     - handle present in fh_map
+ *  buf    - destination buffer; must have room for `size` bytes
+ *  size   - maximum number of bytes to read
+ *  offset - file offset to read from; a negative value (the declared
+ *           default is -1) means "read from the handle's current position"
+ *
+ * Returns the result reported by the file cache / filer read, or 0 when
+ * the (buffered) request starts at or past end of file.  On return the
+ * handle position is advanced past the data that was read.
+ */
+int Client::read(fh_t fh, char *buf, off_t size, off_t offset)
+{
+  client_lock.Lock();
+
+  dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl;
+  tout << "read" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // a negative offset selects the handle's current position.  (there used
+  // to be an assert(offset >= 0) ahead of this, which made the fallback --
+  // and the offset=-1 default argument -- unusable.)
+  if (offset < 0)
+    offset = f->pos;
+
+  bool lazy = f->mode == FILE_MODE_LAZY;
+
+  // do we have read file cap?
+  while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
+    dout(7) << " don't have read cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_read.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+  // lazy cap?
+  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+    dout(7) << " don't have lazy cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_lazy.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+
+  // determine whether read range overlaps with file
+  // ...ONLY if we're doing async io
+  if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) {
+    // we're doing buffered i/o.  make sure we're inside the file.
+    // we can trust size info bc we get accurate info when buffering/caching caps are issued.
+    const off_t file_size = in->inode.size;
+    dout(10) << "file size: " << in->inode.size << endl;
+    if (offset > 0 && offset >= file_size) {
+      client_lock.Unlock();
+      return 0;
+    }
+
+    // clamp to EOF.  compare in off_t width: casting the file size to
+    // `unsigned` would truncate it for files larger than 4GB.
+    if (offset + size > file_size)
+      size = file_size - offset;
+
+    if (size == 0) {
+      dout(10) << "read is size=0, returning 0" << endl;
+      client_lock.Unlock();
+      return 0;
+    }
+  } else {
+    // unbuffered, synchronous file i/o.
+    // or lazy.
+    // defer to OSDs for file bounds.
+  }
+
+  bufferlist blist;   // data will go here
+  int rvalue = 0;
+  int r = 0;
+
+  if (g_conf.client_oc) {
+    // object cache ON
+    rvalue = r = in->fc.read(offset, size, blist, client_lock);  // may block.
+  } else {
+    // object cache OFF -- legacy inconsistent way.
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
+
+    r = filer->read(in->inode, offset, size, &blist, onfinish);
+
+    assert(r >= 0);
+
+    // wait for the completion callback to set `done`
+    while (!done)
+      cond.Wait(client_lock);
+  }
+
+  // adjust fd pos
+  f->pos = offset+blist.length();
+
+  // copy data into caller's char* buf
+  blist.copy(0, blist.length(), buf);
+
+  //dout(10) << "i read '" << blist.c_str() << "'" << endl;
+  dout(10) << "read rvalue " << rvalue << ", r " << r << endl;
+
+  // done!
+  client_lock.Unlock();
+  return rvalue;
+}
+
+
+
+/*
+ * hack --
+ * until we properly implement synchronous writes wrt buffer cache,
+ * make sure we delay shutdown until they're all safe on disk!
+ */
+// Completion context: when finished, tells the owning Client that one
+// more synchronous write has become safe.
+class C_Client_HackUnsafe : public Context {
+  Client *client;
+
+public:
+  C_Client_HackUnsafe(Client *c) : client(c) {}
+
+  void finish(int) { client->hack_sync_write_safe(); }
+};
+
+// One synchronous write has become safe: decrement the outstanding count
+// and, if an unmount is in progress and none remain, signal mount_cond.
+void Client::hack_sync_write_safe()
+{
+  client_lock.Lock();
+
+  assert(unsafe_sync_write > 0);
+  --unsafe_sync_write;
+
+  const bool none_left = (unsafe_sync_write == 0);
+  if (none_left && unmounting) {
+    dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl;
+    mount_cond.Signal();
+  }
+
+  client_lock.Unlock();
+}
+
+/*
+ * Blocking write of `size` bytes from `buf` to open handle `fh`.
+ *
+ *  fh     - handle present in fh_map
+ *  buf    - source data; it is copied into a fresh buffer before submission
+ *  size   - number of bytes to write
+ *  offset - file offset to write at; a negative value (the declared
+ *           default is -1) means "write at the handle's current position"
+ *
+ * Returns `size` (success is currently assumed -- see FIXME below).  The
+ * handle position, the cached inode size and the mtime are updated.
+ */
+int Client::write(fh_t fh, const char *buf, off_t size, off_t offset)
+{
+  client_lock.Lock();
+
+  //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
+  dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl;
+  tout << "write" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // a negative offset selects the handle's current position.  (there used
+  // to be an assert(offset >= 0) ahead of this, which made the fallback --
+  // and the offset=-1 default argument -- unusable.)
+  if (offset < 0)
+    offset = f->pos;
+
+  bool lazy = f->mode == FILE_MODE_LAZY;
+
+  dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl;
+
+  // do we have write file cap?
+  while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) {
+    dout(7) << " don't have write cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_write.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+    dout(7) << " don't have lazy cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_lazy.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+
+  // adjust fd pos
+  f->pos = offset+size;
+
+  // time it.
+  utime_t start = g_clock.now();
+
+  // copy into fresh buffer (since our write may be resub, async)
+  bufferptr bp = buffer::copy(buf, size);
+  bufferlist blist;
+  blist.push_back( bp );
+
+  if (g_conf.client_oc) { // buffer cache ON?
+    assert(objectcacher);
+
+    // write (this may block!)
+    in->fc.write(offset, size, blist, client_lock);
+
+  } else {
+    // legacy, inconsistent synchronous write.
+    dout(7) << "synchronous write" << endl;
+
+    // prepare write: onfinish wakes us up, onsafe decrements
+    // unsafe_sync_write via hack_sync_write_safe().
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done);
+    C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this);
+    unsafe_sync_write++;
+    in->sync_writes++;
+
+    dout(20) << " sync write start " << onfinish << endl;
+
+    filer->write(in->inode, offset, size, blist, 0,
+                 onfinish, onsafe
+                 //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots
+                 );
+
+    while (!done) {
+      cond.Wait(client_lock);
+      dout(20) << " sync write bump " << onfinish << endl;
+    }
+
+    // last sync write on this inode finished: kick anyone waiting for that.
+    // NOTE(review): the contexts are finished but not deleted here, unlike
+    // FileCache which deletes after finish() -- confirm who owns them.
+    in->sync_writes--;
+    if (in->sync_writes == 0 &&
+        !in->waitfor_no_write.empty()) {
+      for (list<Context*>::iterator i = in->waitfor_no_write.begin();
+           i != in->waitfor_no_write.end();
+           i++)
+        (*i)->finish(0);
+      in->waitfor_no_write.clear();
+    }
+
+    dout(20) << " sync write done " << onfinish << endl;
+  }
+
+  // time
+  utime_t lat = g_clock.now();
+  lat -= start;
+  if (client_logger) {
+    client_logger->finc("wrlsum",(double)lat);
+    client_logger->inc("wrlnum");
+  }
+
+  // assume success for now.  FIXME.
+  off_t totalwritten = size;
+
+  // extend file?
+  if (totalwritten + offset > in->inode.size) {
+    in->inode.size = in->file_wr_size = totalwritten + offset;
+    dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl;
+  } else {
+    dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl;
+  }
+
+  // mtime
+  in->file_wr_mtime = in->inode.mtime = g_clock.gettime();
+
+  // ok!
+  client_lock.Unlock();
+  return totalwritten;
+}
+
+
+// Ask the MDS to truncate `file` to `size` bytes.  Blocks for the reply
+// and returns the result code carried in it.
+int Client::truncate(const char *file, off_t size)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl;
+  tout << "truncate" << endl;
+  tout << file << endl;
+  tout << size << endl;
+
+  // build the request
+  MClientRequest *request = new MClientRequest(MDS_OP_TRUNCATE, whoami);
+  request->set_path(file);
+  request->set_sizearg(size);
+
+  // FIXME where does FUSE maintain user information
+  request->set_caller_uid(getuid());
+  request->set_caller_gid(getgid());
+
+  // blocking round trip; fold the reply's trace into our metadata cache
+  MClientReply *reply = make_request(request, true);
+  const int result = reply->get_result();
+  insert_trace(reply);
+  delete reply;
+
+  dout(10) << " truncate result is " << result << endl;
+
+  client_lock.Unlock();
+  return result;
+}
+
+
+/*
+ * Wait until the data cached for open handle `fh` has been committed.
+ *
+ *  fh           - handle present in fh_map
+ *  syncdataonly - if false, metadata should be synced too (not implemented
+ *                 yet; a message is logged instead)
+ *
+ * Always returns 0.  May block on the object cacher's commit callback.
+ */
+int Client::fsync(fh_t fh, bool syncdataonly)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl;
+  tout << "fsync" << endl;
+  tout << fh << endl;
+  tout << syncdataonly << endl;
+
+  int r = 0;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl;
+
+  // metadata?
+  if (!syncdataonly) {
+    dout(0) << "fsync - not syncing metadata yet.. implement me" << endl;
+  }
+
+  // data?
+  Cond cond;
+  bool done = false;
+  C_Cond *oncommit = new C_Cond(&cond, &done);
+  if (objectcacher->commit_set(in->ino(), oncommit)) {
+    // already committed: the callback was not consumed (cf.
+    // FileCache::add_safe_waiter, which deletes its context in this
+    // case), so free it here instead of leaking it.
+    delete oncommit;
+  } else {
+    // wait for callback
+    while (!done)
+      cond.Wait(client_lock);
+  }
+
+  client_lock.Unlock();
+  return r;
+}
+
+
+// not written yet, but i want to link!
+
+// Change the working directory.  Faked for now: the path is only made
+// absolute and remembered in cwd.  Always returns 0.
+int Client::chdir(const char *path)
+{
+  string resolved;
+  mkabspath(path, resolved);
+
+  dout(3) << "chdir " << path << " -> cwd now " << resolved << endl;
+
+  cwd = resolved;
+  return 0;
+}
+
+#ifdef DARWIN
+// statfs, Darwin flavour: fills *stbuf with fixed placeholder values
+// (FIXME) and returns 0.  `path` is ignored.
+int Client::statfs(const char *path, struct statvfs *stbuf)
+{
+  bzero(stbuf, sizeof(struct statvfs));
+
+  // FIXME
+  // block / fragment / name sizes
+  stbuf->f_bsize   = 1024;
+  stbuf->f_frsize  = 1024;
+  stbuf->f_namemax = 1024;
+
+  // block counts
+  stbuf->f_blocks = 1024 * 1024;
+  stbuf->f_bfree  = 1024 * 1024;
+  stbuf->f_bavail = 1024 * 1024;
+
+  // inode counts
+  stbuf->f_files  = 1024 * 1024;
+  stbuf->f_ffree  = 1024 * 1024;
+  stbuf->f_favail = 1024 * 1024;
+
+  return 0;
+}
+#else
+// statfs, non-Darwin flavour: not implemented; asserts.
+int Client::statfs(const char *path, struct statfs *stbuf)
+{
+  assert(0);  // implement me
+  return 0;
+}
+#endif
+
+
+// For a handle opened in lazy mode: wait for the lazy-io cap, then (with
+// the object cache on) flush dirty data and wait for the flush callback.
+// `offset` and `count` are only logged.  Always returns 0.
+int Client::lazyio_propogate(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_propogate(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+
+  assert(fh_map.count(fd));
+  Fh *handle = fh_map[fd];
+  Inode *in = handle->inode;
+
+  const bool lazy_handle = (handle->mode & FILE_MODE_LAZY) != 0;
+  if (lazy_handle) {
+    // wait for lazy cap
+    while (!(in->file_caps() & CAP_FILE_LAZYIO)) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cap_cond;
+      in->waitfor_lazy.push_back(&cap_cond);
+      cap_cond.Wait(client_lock);
+    }
+
+    if (g_conf.client_oc) {
+      // flush and block until the callback flips `flushed`
+      Cond flush_cond;
+      bool flushed = false;
+      in->fc.flush_dirty(new C_SafeCond(&client_lock, &flush_cond, &flushed));
+
+      while (!flushed)
+        flush_cond.Wait(client_lock);
+    }
+    // object cache off: mmm, nothin to do.
+  }
+
+  client_lock.Unlock();
+  return 0;
+}
+
+// For a handle opened in lazy mode: wait for the lazy-io cap, then (with
+// the object cache on) start a flush and release clean cached data.
+// `offset` and `count` are only logged.  Always returns 0.
+int Client::lazyio_synchronize(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_synchronize(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+
+  assert(fh_map.count(fd));
+  Fh *handle = fh_map[fd];
+  Inode *in = handle->inode;
+
+  const bool lazy_handle = (handle->mode & FILE_MODE_LAZY) != 0;
+  if (lazy_handle) {
+    // wait for lazy cap
+    while (!(in->file_caps() & CAP_FILE_LAZYIO)) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cap_cond;
+      in->waitfor_lazy.push_back(&cap_cond);
+      cap_cond.Wait(client_lock);
+    }
+
+    if (g_conf.client_oc) {
+      in->fc.flush_dirty(0);    // flush to invalidate.
+      in->fc.release_clean();
+    }
+    // object cache off: mm, nothin to do.
+  }
+
+  client_lock.Unlock();
+  return 0;
+}
+
+
+// Messenger callback for a message `m` that could not be delivered to
+// `dest`.  What happens to the message depends on the kind of destination.
+void Client::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  // monitor: resend to a different monitor.
+  if (dest.is_mon()) {
+    int mon = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst
+            << ", resending to mon" << mon
+            << endl;
+    messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+    return;
+  }
+
+  // osd: the objecter deals with it.
+  if (dest.is_osd()) {
+    objecter->ms_handle_failure(m, dest, inst);
+    return;
+  }
+
+  // mds: help!  no recovery path yet.
+  if (dest.is_mds()) {
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl;
+    assert(0);
+    return;
+  }
+
+  // client?  just drop it.
+  dout(0) << "ms_handle_failure " << dest << " inst " << inst
+          << ", dropping" << endl;
+  delete m;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __CLIENT_H
+#define __CLIENT_H
+
+
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "msg/Message.h"
+#include "msg/Dispatcher.h"
+#include "msg/Messenger.h"
+#include "msg/SerialMessenger.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+
+//#include "msgthread.h"
+
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/filepath.h"
+#include "include/interval_set.h"
+
+#include "common/Mutex.h"
+
+#include "FileCache.h"
+
+// stl
+#include <set>
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#define O_LAZY 01000000
+
+
+class Filer;
+class Objecter;
+class ObjectCacher;
+
+extern class LogType client_logtype;
+extern class Logger *client_logger;
+
+
+
+// ============================================
+// types for my local metadata cache
+/* basic structure:
+
+ - Dentries live in an LRU loop. they get expired based on last access.
+ see include/lru.h. items can be bumped to "mid" or "top" of list, etc.
+ - Inode has ref count for each Fh, Dir, or Dentry that points to it.
+ - when Inode ref goes to 0, it's expired.
+ - when Dir is empty, it's removed (and it's Inode ref--)
+
+*/
+
+typedef int fh_t;
+
+class Dir;
+class Inode;
+
+// A cached directory entry: a name inside a Dir that points at an Inode.
+// Dentries are LRUObjects; get() pins the entry in the LRU and put()
+// unpins it.  At most one reference may be held at a time.
+class Dentry : public LRUObject {
+ public:
+  string name;    // sort of lame
+  //const char *name;
+  Dir *dir;       // directory this entry belongs to
+  Inode *inode;   // inode this entry points at
+  int ref;        // 1 if there's a dir beneath me.
+
+  Dentry() : dir(0), inode(0), ref(0) { }
+
+  // take the (single) reference and pin in the lru
+  void get() {
+    assert(ref == 0);
+    ++ref;
+    lru_pin();
+  }
+
+  // drop the (single) reference and unpin from the lru
+  void put() {
+    assert(ref == 1);
+    --ref;
+    lru_unpin();
+  }
+
+  /*Dentry() : name(0), dir(0), inode(0), ref(0) { }
+  Dentry(string& n) : name(0), dir(0), inode(0), ref(0) {
+    name = new char[n.length()+1];
+    strcpy((char*)name, n.c_str());
+  }
+  ~Dentry() {
+    delete[] name;
+  }*/
+};
+
+// The open contents of a directory inode: a name -> Dentry map.
+class Dir {
+ public:
+  Inode *parent_inode;                  // my inode
+  //hash_map<const char*, Dentry*, hash<const char*>, eqstr> dentries;
+  hash_map<string, Dentry*> dentries;   // entries by name
+
+  Dir(Inode* in) : parent_inode(in) { }
+
+  bool is_empty() {
+    return dentries.empty();
+  }
+};
+
+
+// Capability state held from one mds: a caps bitmask plus a sequence
+// number.  Both start at zero.
+class InodeCap {
+ public:
+  int  caps;   // capability bits
+  long seq;    // sequence number
+
+  InodeCap()
+    : caps(0),
+      seq(0)
+  { }
+};
+
+
+// Client-side cached inode: the inode_t itself plus directory
+// distribution hints, per-mds capabilities, open counts, the reference
+// count, links into the Dir/Dentry cache, and file i/o state.
+class Inode {
+ public:
+  inode_t   inode;    // the actual inode
+  time_t    valid_until;
+
+  // about the dir (if this is one!)
+  int       dir_auth;
+  set<int>  dir_contacts;
+  bool      dir_hashed, dir_replicated;
+
+  // per-mds caps
+  map<int,InodeCap> caps;            // mds -> InodeCap
+  map<int,InodeCap> stale_caps;      // mds -> cap .. stale
+
+  time_t    file_wr_mtime;   // [writers] time of last write
+  off_t     file_wr_size;    // [writers] largest offset we've written to
+  int       num_open_rd, num_open_wr, num_open_lazy;  // num readers, writers
+
+  int       ref;      // ref count. 1 for each dentry, fh that links to me.
+  Dir       *dir;     // if i'm a dir.
+  Dentry    *dn;      // if i'm linked to a dentry.
+  string    *symlink; // symlink content, if it's a symlink
+
+  // for caching i/o mode
+  FileCache fc;
+
+  // for sync i/o mode
+  int       sync_reads;   // sync reads in progress
+  int       sync_writes;  // sync writes in progress
+
+  list<Cond*>       waitfor_write;
+  list<Cond*>       waitfor_read;
+  list<Cond*>       waitfor_lazy;
+  list<Context*>    waitfor_no_read, waitfor_no_write;
+
+  // -- reference counting --
+  void get() {
+    ++ref;
+    //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl;
+  }
+  void put() {
+    --ref;
+    assert(ref >= 0);
+    //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl;
+  }
+
+  Inode(inode_t _inode, ObjectCacher *_oc) :
+    inode(_inode),
+    valid_until(0),
+    dir_auth(-1), dir_hashed(false), dir_replicated(false),
+    file_wr_mtime(0), file_wr_size(0),
+    num_open_rd(0), num_open_wr(0), num_open_lazy(0),
+    ref(0), dir(0), dn(0), symlink(0),
+    fc(_oc, _inode),
+    sync_reads(0), sync_writes(0)
+  { }
+
+  ~Inode() {
+    // deleting a null pointer is a no-op
+    delete symlink;
+    symlink = 0;
+  }
+
+  inodeno_t ino() { return inode.ino; }
+
+  bool is_dir() {
+    const int type = inode.mode & INODE_TYPE_MASK;
+    return type == INODE_MODE_DIR;
+  }
+
+  // union of the caps held from every mds, stale ones included
+  int file_caps() {
+    int have = 0;
+    map<int,InodeCap>::iterator p;
+    for (p = caps.begin(); p != caps.end(); ++p)
+      have |= p->second.caps;
+    for (p = stale_caps.begin(); p != stale_caps.end(); ++p)
+      have |= p->second.caps;
+    return have;
+  }
+
+  // caps we want, given how the file is currently open
+  int file_caps_wanted() {
+    int wanted = 0;
+    if (num_open_rd)
+      wanted |= CAP_FILE_RD|CAP_FILE_RDCACHE;
+    if (num_open_wr)
+      wanted |= CAP_FILE_WR|CAP_FILE_WRBUFFER;
+    if (num_open_lazy)
+      wanted |= CAP_FILE_LAZYIO;
+    return wanted;
+  }
+
+  // which mds is authoritative: our own dir_auth if set, otherwise
+  // whatever the parent directory's inode says.
+  int authority(MDSMap *mdsmap) {
+    //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl;
+    // parent?
+    Inode *parent = (dn && dn->dir) ? dn->dir->parent_inode : 0;
+    if (parent) {
+      // parent hashed?
+      if (parent->dir_hashed) {
+        // hashed
+        assert(0);
+        // fixme
+        //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(),
+        //dn->name );
+      }
+
+      if (dir_auth >= 0)
+        return dir_auth;
+      return parent->authority(mdsmap);
+    }
+
+    if (dir_auth >= 0)
+      return dir_auth;
+
+    assert(0);    // !!!
+    return 0;
+  }
+
+  // not implemented
+  int dentry_authority(const char *dn,
+                       MDSMap *mdsmap) {
+    assert(0);
+    return 0;
+    //return ->hash_dentry( ino(),
+    //dn );
+  }
+
+  // choose an mds to talk to about this inode
+  int pick_replica(MDSMap *mdsmap) {
+    // replicas?
+    if (ino() > 1ULL && dir_contacts.size()) {
+      //cout << "dir_contacts if " << dir_contacts << endl;
+      set<int>::iterator it = dir_contacts.begin();
+      if (dir_contacts.size() == 1)
+        return *it;
+
+      // several contacts: step to a randomly chosen one
+      for (int skip = rand() % dir_contacts.size(); skip > 0; --skip)
+        ++it;
+      return *it;
+    }
+
+    if (dir_replicated || ino() == 1) {
+      //cout << "num_mds is " << mdcluster->get_num_mds() << endl;
+      return rand() % mdsmap->get_num_mds();  // huh.. pick a random mds!
+    }
+    return authority(mdsmap);
+  }
+
+
+  // open Dir for an inode.  if it's not open, allocate it (and pin dentry in memory).
+  Dir *open_dir() {
+    if (dir)
+      return dir;
+
+    if (dn)
+      dn->get();   // pin dentry
+    get();
+    dir = new Dir(this);
+    return dir;
+  }
+
+};
+
+
+
+
+// file handle for any open file state
+
+// State kept for one open file handle.
+struct Fh {
+  Inode *inode;   // inode this handle has open
+  off_t pos;      // current file position
+  int   mds;      // have to talk to mds we opened with (for now)
+  int   mode;     // the mode i opened the file with
+
+  Fh() : inode(0), pos(0), mds(0), mode(0) {}
+
+  // true if the O_LAZY bit is set in mode
+  bool is_lazy() {
+    return mode & O_LAZY;
+  }
+};
+
+
+
+
+
+// ========================================================
+// client interface
+
+// The client: dispatches incoming messages, keeps a local metadata cache
+// (inodes, dirs, dentries), tracks open file handles, and exposes
+// blocking, syscall-like file system operations.
+class Client : public Dispatcher {
+ public:
+
+  /* getdir result */
+  struct DirResult {
+    string path;
+    map<string,inode_t> contents;
+    map<string,inode_t>::iterator p;
+    int off;
+    int size;
+    struct dirent_plus dp;
+    struct dirent_lite dl;
+    DirResult() : p(contents.end()), off(-1), size(0) {}
+  };
+
+
+ protected:
+  Messenger *messenger;
+  int whoami;
+  MonMap *monmap;
+
+  // mds fake RPC
+  tid_t last_tid;
+  map<tid_t, Cond*>                mds_rpc_cond;
+  map<tid_t, class MClientReply*>  mds_rpc_reply;
+  map<tid_t, Cond*>                mds_rpc_dispatch_cond;
+
+  // cluster descriptors
+  MDSMap *mdsmap;
+  OSDMap *osdmap;
+
+  bool   mounted;
+  bool   unmounting;
+  Cond   mount_cond;
+
+  int    unsafe_sync_write;   // sync writes not yet reported safe
+public:
+  msg_addr_t get_myaddr() { return messenger->get_myaddr(); }
+  void hack_sync_write_safe();
+
+protected:
+  Filer                 *filer;
+  ObjectCacher          *objectcacher;
+  Objecter              *objecter;     // (non-blocking) osd interface
+
+  // cache
+  hash_map<inodeno_t, Inode*> inode_map;
+  Inode*                 root;
+  LRU                    lru;    // lru list of Dentry's in our local metadata cache.
+
+  // cap weirdness
+  map<inodeno_t, map<int, class MClientFileCaps*> > cap_reap_queue;  // ino -> mds -> msg .. set of (would-be) stale caps to reap
+
+
+  // file handles, etc.
+  string                 cwd;
+  interval_set<fh_t>     free_fh_set;  // unused fh's
+  hash_map<fh_t, Fh*>    fh_map;
+
+  // take the first unused fh out of the free set
+  fh_t get_fh() {
+    const fh_t first_free = free_fh_set.start();
+    free_fh_set.erase(first_free, 1);
+    return first_free;
+  }
+  // hand an fh back to the free set
+  void put_fh(fh_t fh) {
+    free_fh_set.insert(fh, 1);
+  }
+
+  // abs = rel if rel starts with '/', else cwd + "/" + rel
+  void mkabspath(const char *rel, string& abs) {
+    if (rel[0] != '/') {
+      abs = cwd;
+      abs += "/";
+      abs += rel;
+    } else {
+      abs = rel;
+    }
+  }
+
+
+  // global client lock
+  //  - protects Client and buffer cache both!
+  Mutex                  client_lock;
+
+
+  // -- metadata cache stuff
+
+  // decrease inode ref.  delete if dangling.
+  void put_inode(Inode *in) {
+    in->put();
+    if (in->ref != 0)
+      return;
+
+    inode_map.erase(in->inode.ino);
+    if (in == root)
+      root = 0;
+    delete in;
+  }
+
+  // tear down an (empty) Dir and drop the ref it held on its inode
+  void close_dir(Dir *dir) {
+    assert(dir->is_empty());
+
+    Inode *owner = dir->parent_inode;
+    if (owner->dn)
+      owner->dn->put();   // unpin dentry
+
+    delete owner->dir;
+    owner->dir = 0;
+    put_inode(owner);
+  }
+
+  int get_cache_size() { return lru.lru_get_size(); }
+  void set_cache_size(int m) { lru.lru_set_max(m); }
+
+  // create a dentry `name` in `dir` pointing at `in`
+  Dentry* link(Dir *dir, const string& name, Inode *in) {
+    Dentry *fresh = new Dentry;
+    fresh->name = name;
+
+    // link to dir
+    fresh->dir = dir;
+    dir->dentries[fresh->name] = fresh;
+
+    // link to inode
+    fresh->inode = in;
+    in->dn = fresh;
+    in->get();
+
+    lru.lru_insert_mid(fresh);    // mid or top?
+    return fresh;
+  }
+
+  // detach a dentry from its inode and dir, then destroy it
+  void unlink(Dentry *dn) {
+    Inode *target = dn->inode;
+
+    // unlink from inode
+    dn->inode = 0;
+    target->dn = 0;
+    put_inode(target);
+
+    // unlink from dir
+    dn->dir->dentries.erase(dn->name);
+    if (dn->dir->is_empty())
+      close_dir(dn->dir);
+    dn->dir = 0;
+
+    // delete den
+    lru.lru_remove(dn);
+    delete dn;
+  }
+
+  // move an existing dentry to (dir, name)
+  Dentry *relink(Dentry *dn, Dir *dir, const string& name) {
+    // first link new dn to dir
+    /*
+    char *oldname = (char*)dn->name;
+    dn->name = new char[name.length()+1];
+    strcpy((char*)dn->name, name.c_str());
+    dir->dentries[dn->name] = dn;
+    */
+    dir->dentries[name] = dn;
+
+    // unlink from old dir
+    Dir *old_dir = dn->dir;
+    old_dir->dentries.erase(dn->name);
+    //delete[] oldname;
+    if (old_dir->is_empty())
+      close_dir(old_dir);
+
+    // fix up dn
+    dn->name = name;
+    dn->dir = dir;
+
+    return dn;
+  }
+
+  // move dentry to top of lru
+  void touch_dn(Dentry *dn) { lru.lru_touch(dn); }
+
+  // trim cache.
+  void trim_cache();
+  void dump_inode(Inode *in, set<Inode*>& did);
+  void dump_cache();  // debug
+
+  // find dentry based on filepath
+  Dentry *lookup(filepath& path);
+
+  // make blocking mds request
+  MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1);
+  MClientReply* sendrecv(MClientRequest *req, int mds);
+  void handle_client_reply(MClientReply *reply);
+
+  void fill_stat(inode_t& inode, struct stat *st);
+  void fill_statlite(inode_t& inode, struct statlite *st);
+
+
+  // friends
+  friend class SyntheticClient;
+
+ public:
+  Client(Messenger *m, MonMap *mm);
+  ~Client();
+  void tear_down_cache();
+
+  int get_nodeid() { return whoami; }
+
+  void init();
+  void shutdown();
+
+  // messaging
+  void dispatch(Message *m);
+
+  void handle_mount_ack(class MClientMountAck*);
+  void handle_unmount_ack(Message*);
+  void handle_mds_map(class MMDSMap *m);
+
+  // file caps
+  void handle_file_caps(class MClientFileCaps *m);
+  void implemented_caps(class MClientFileCaps *m, Inode *in);
+  void release_caps(Inode *in, int retain=0);
+  void update_caps_wanted(Inode *in);
+
+  void close_release(Inode *in);
+  void close_safe(Inode *in);
+
+  // metadata cache
+  Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn);
+  void update_inode_dist(Inode *in, InodeStat *st);
+  Inode* insert_trace(MClientReply *reply);
+
+  // ----------------------
+  // fs ops.
+  int mount();
+  int unmount();
+
+  // these should (more or less) mirror the actual system calls.
+#ifdef DARWIN
+  int statfs(const char *path, struct statvfs *stbuf);
+#else
+  int statfs(const char *path, struct statfs *stbuf);
+#endif
+
+  // crap
+  int chdir(const char *s);
+
+  // namespace ops
+  int getdir(const char *path, list<string>& contents);
+  int getdir(const char *path, map<string,inode_t>& contents);
+
+  DIR *opendir(const char *name);
+  int closedir(DIR *dir);
+  struct dirent *readdir(DIR *dir);
+  void rewinddir(DIR *dir);
+  off_t telldir(DIR *dir);
+  void seekdir(DIR *dir, off_t offset);
+
+  struct dirent_plus *readdirplus(DIR *dirp);
+  int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result);
+  struct dirent_lite *readdirlite(DIR *dirp);
+  int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result);
+
+
+  int link(const char *existing, const char *newname);
+  int unlink(const char *path);
+  int rename(const char *from, const char *to);
+
+  // dirs
+  int mkdir(const char *path, mode_t mode);
+  int rmdir(const char *path);
+
+  // symlinks
+  int readlink(const char *path, char *buf, off_t size);
+  int symlink(const char *existing, const char *newname);
+
+  // inode stuff
+  int _lstat(const char *path, int mask, Inode **in);
+  int lstat(const char *path, struct stat *stbuf);
+  int lstatlite(const char *path, struct statlite *buf);
+
+  int chmod(const char *path, mode_t mode);
+  int chown(const char *path, uid_t uid, gid_t gid);
+  int utime(const char *path, struct utimbuf *buf);
+
+  // file ops
+  int mknod(const char *path, mode_t mode);
+  int open(const char *path, int mode);
+  int close(fh_t fh);
+  int read(fh_t fh, char *buf, off_t size, off_t offset=-1);
+  int write(fh_t fh, const char *buf, off_t size, off_t offset=-1);
+  int truncate(const char *file, off_t size);
+    //int truncate(fh_t fh, long long size);
+  int fsync(fh_t fh, bool syncdataonly);
+
+  // hpc lazyio
+  int lazyio_propogate(int fd, off_t offset, size_t count);
+  int lazyio_synchronize(int fd, off_t offset, size_t count);
+
+  int describe_layout(char *fn, list<ObjectExtent>& result);
+
+  void ms_handle_failure(Message*, msg_addr_t dest, const entity_inst_t& inst);
+};
+
+#endif
--- /dev/null
+
+#include "config.h"
+#include "include/types.h"
+
+#include "FileCache.h"
+#include "osdc/ObjectCacher.h"
+
+#include "msg/Messenger.h"
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
+#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
+
+
+// flush/release/clean
+
+/*
+ * Start flushing this file's dirty data.
+ *
+ *  onflush - optional completion callback (may be 0, which is the
+ *            declared default).  If flush_set() reports there is nothing
+ *            left to flush, the callback is finished and deleted here,
+ *            immediately.
+ */
+void FileCache::flush_dirty(Context *onflush)
+{
+  const bool already_flushed = oc->flush_set(inode.ino, onflush);
+
+  // guard against the null default: callers such as
+  // Client::lazyio_synchronize() pass no callback at all.
+  if (already_flushed && onflush) {
+    onflush->finish(0);
+    delete onflush;
+  }
+}
+
+// Release this file's object set in the cacher; returns whatever
+// ObjectCacher::release_set() reports.
+off_t FileCache::release_clean()
+{
+  const off_t result = oc->release_set(inode.ino);
+  return result;
+}
+
+// Does the object cacher report this file's set as cached?
+bool FileCache::is_cached()
+{
+  const bool cached = oc->set_is_cached(inode.ino);
+  return cached;
+}
+
+// Does the object cacher report this file's set as dirty or committing?
+bool FileCache::is_dirty()
+{
+  const bool dirty = oc->set_is_dirty_or_committing(inode.ino);
+  return dirty;
+}
+
+/*
+ * Empty the cache for this file: release clean data and flush the rest.
+ *
+ *  onempty - optional completion callback (may be 0, which is the
+ *            declared default).  If the set turns out to be clean
+ *            already, the callback is finished and deleted here,
+ *            immediately.
+ */
+void FileCache::empty(Context *onempty)
+{
+  off_t unclean = release_clean();
+  bool clean = oc->flush_set(inode.ino, onempty);
+  assert(!unclean == clean);
+
+  // guard against the null default before touching the callback.
+  if (clean && onempty) {
+    onempty->finish(0);
+    delete onempty;
+  }
+}
+
+
+// caps
+
+// Record a new caps mask.  If `onimplement` is given (only valid when
+// caps are being taken away) it is queued under the new mask, to be
+// completed by check_caps().
+void FileCache::set_caps(int caps, Context *onimplement)
+{
+  if (onimplement) {
+    assert(latest_caps & ~caps);  // we should be losing caps.
+    list<Context*>& waiting = caps_callbacks[caps];
+    waiting.push_back(onimplement);
+  }
+
+  latest_caps = caps;
+  check_caps();
+}
+
+
+/*
+ * Work out which caps are currently in use and complete any queued
+ * set_caps() callbacks whose cap mask has now been implemented, i.e.
+ * nothing outside that mask is in use any more.
+ */
+void FileCache::check_caps()
+{
+  // caps in use, derived from in-flight io and the cacher's state
+  int used = 0;
+  if (num_reading) used |= CAP_FILE_RD;
+  if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE;
+  if (num_writing) used |= CAP_FILE_WR;
+  if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER;
+  dout(10) << "check_caps used " << cap_string(used) << endl;
+
+  // check callbacks
+  map<int, list<Context*> >::iterator p = caps_callbacks.begin();
+  while (p != caps_callbacks.end()) {
+    // implemented <=> every cap in use is allowed by the mask p->first.
+    // (the test used to lack the "== 0", which fired the callbacks exactly
+    // when a cap outside the mask was still in use.)
+    if (used == 0 || (~(p->first) & used) == 0) {
+      // implemented.
+      dout(10) << "used is " << cap_string(used)
+               << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl;
+      finish_contexts(p->second);
+      // advance before erasing so the iterator stays valid
+      caps_callbacks.erase(p++);
+    } else {
+      dout(10) << "used is " << cap_string(used)
+               << ", caps " << cap_string(p->first) << " not yet implemented" << endl;
+      ++p;
+    }
+  }
+}
+
+
+
+// read/write
+
+// Read `size` bytes at `offset` into `blist`.  Goes through the cache
+// when we hold the RDCACHE cap, otherwise does an atomic sync read.  May
+// block, waiting on `client_lock`.  Returns the read result.
+int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock)
+{
+  int result = 0;
+
+  // inc reading counter
+  num_reading++;
+
+  if (!(latest_caps & CAP_FILE_RDCACHE)) {
+    // no read-cache cap
+    result = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock);
+  } else {
+    // read (and block)
+    Cond cond;
+    bool done = false;
+    int rvalue = 0;
+    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
+
+    result = oc->file_read(inode, offset, size, &blist, onfinish);
+
+    if (result != 0) {
+      // it was cached.
+      delete onfinish;
+    } else {
+      // block
+      while (!done)
+        cond.Wait(client_lock);
+      result = rvalue;
+    }
+  }
+
+  // dec reading counter
+  num_reading--;
+
+  // last reader out: pending cap callbacks may be satisfiable now
+  if (num_reading == 0 && !caps_callbacks.empty())
+    check_caps();
+
+  return result;
+}
+
+// Write `blist` (`size` bytes) at `offset`.  Buffered through the cache
+// when we hold the WRBUFFER cap, otherwise an atomic sync write.  May
+// block, waiting on `client_lock`.
+void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock)
+{
+  // inc writing counter
+  num_writing++;
+
+  const bool buffered = (latest_caps & CAP_FILE_WRBUFFER) != 0;
+  if (!buffered) {
+    // atomic, synchronous, blocking.
+    oc->file_atomic_sync_write(inode, offset, size, blist, client_lock);
+  } else {
+    // wait? (this may block!)
+    oc->wait_for_write(size, client_lock);
+
+    // async, caching, non-blocking.
+    oc->file_write(inode, offset, size, blist);
+  }
+
+  // dec writing counter
+  num_writing--;
+
+  // last writer out: pending cap callbacks may be satisfiable now
+  if (num_writing == 0 && !caps_callbacks.empty())
+    check_caps();
+}
+
+// True when the cacher reports nothing dirty or committing for this file.
+bool FileCache::all_safe()
+{
+  const bool pending = oc->set_is_dirty_or_committing(inode.ino);
+  return !pending;
+}
+
+// Register `c` to be completed once this file's data is committed.  If
+// commit_set() says it already is, finish and delete `c` right away.
+void FileCache::add_safe_waiter(Context *c)
+{
+  if (oc->commit_set(inode.ino, c)) {
+    c->finish(0);
+    delete c;
+  }
+}
--- /dev/null
+#ifndef __FILECACHE_H
+#define __FILECACHE_H
+
+#include <iostream>
+using namespace std;
+
+#include "common/Cond.h"
+#include "mds/Capability.h"
+
+class ObjectCacher;
+
+// Per-file front end to the ObjectCacher: tracks the latest caps mask,
+// in-flight reader/writer counts, and callbacks waiting for a caps
+// change to be implemented.
+class FileCache {
+  ObjectCacher *oc;
+  inode_t inode;
+
+  // caps
+  int latest_caps;
+  map<int, list<Context*> > caps_callbacks;   // caps mask -> waiting callbacks
+
+  int num_reading;   // reads in progress
+  int num_writing;   // writes in progress
+  //int num_unsafe;
+
+  // waiters
+  list<Cond*> waitfor_read;
+  list<Cond*> waitfor_write;
+  //list<Context*> waitfor_safe;
+  bool waitfor_release;
+
+ public:
+  FileCache(ObjectCacher *_oc, inode_t _inode) :
+    oc(_oc),
+    inode(_inode),
+    latest_caps(0),
+    num_reading(0), num_writing(0),// num_unsafe(0),
+    waitfor_release(false) {}
+
+  // waiters/waiting
+  bool can_read() { return (latest_caps & CAP_FILE_RD) != 0; }
+  bool can_write() { return (latest_caps & CAP_FILE_WR) != 0; }
+  bool all_safe();// { return num_unsafe == 0; }
+
+  void add_read_waiter(Cond *c) { waitfor_read.push_back(c); }
+  void add_write_waiter(Cond *c) { waitfor_write.push_back(c); }
+  void add_safe_waiter(Context *c);// { waitfor_safe.push_back(c); }
+
+  // flush / release / empty
+  void flush_dirty(Context *onflush=0);
+  off_t release_clean();
+  void empty(Context *onempty=0);
+  bool is_empty() { return !is_cached() && !is_dirty(); }
+  bool is_cached();
+  bool is_dirty();
+
+  // caps
+  int get_caps() { return latest_caps; }
+  void set_caps(int caps, Context *onimplement=0);
+  void check_caps();
+
+  // io
+  int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock);   // may block.
+  void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock);  // may block.
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+using namespace std;
+
+
+
+#include "SyntheticClient.h"
+
+#include "include/filepath.h"
+#include "mds/MDS.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <utime.h>
+#include <math.h>
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " "
+
+// traces
+//void trace_include(SyntheticClient *syn, Client *cl, string& prefix);
+//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix);
+
+
+list<int> syn_modes;     // SYNCLIENT_MODE_* values, appended by parse_syn_options() for each --syn option
+list<int> syn_iargs;     // integer (atoi'd) arguments, appended by parse_syn_options()
+list<string> syn_sargs;  // string arguments (e.g. for "trace"), appended by parse_syn_options()
+
+// Return the argument that follows args[i], advancing i onto it.
+// Complains and asserts if the command line ends before the expected argument,
+// instead of reading past the end of the vector.
+static const char *syn_next_arg(vector<char*>& args, unsigned& i)
+{
+  if (i + 1 >= args.size()) {
+    cerr << "missing argument after " << args[i] << endl;
+    assert(0);
+    return "";   // only reached when asserts are compiled out
+  }
+  return args[++i];
+}
+
+// Consume the next n arguments as integers (atoi) and queue them on syn_iargs.
+static void syn_push_iargs(vector<char*>& args, unsigned& i, int n)
+{
+  for (int k=0; k<n; k++)
+    syn_iargs.push_back( atoi(syn_next_arg(args, i)) );
+}
+
+// Strip every "--syn <mode> [args...]" group out of args and record it in the
+// global syn_modes / syn_iargs / syn_sargs queues.  Arguments that are not part
+// of a --syn group are kept, in order, and written back to args.
+//
+// An unknown mode name, or a command line that ends in the middle of a group,
+// prints a message to cerr and asserts.
+void parse_syn_options(vector<char*>& args)
+{
+  vector<char*> nargs;
+
+  for (unsigned i=0; i<args.size(); i++) {
+    if (strcmp(args[i],"--syn") != 0) {
+      nargs.push_back(args[i]);   // not ours; hand it back to the caller
+      continue;
+    }
+
+    const char *mode = syn_next_arg(args, i);
+
+    if (strcmp(mode,"writefile") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE );
+      syn_push_iargs(args, i, 2);
+    } else if (strcmp(mode,"wrshared") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_WRSHARED );
+      syn_push_iargs(args, i, 2);
+    } else if (strcmp(mode,"writebatch") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_WRITEBATCH );
+      syn_push_iargs(args, i, 3);
+    } else if (strcmp(mode,"readfile") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_READFILE );
+      syn_push_iargs(args, i, 2);
+    } else if (strcmp(mode,"rw") == 0) {
+      // shorthand: a writefile followed by a readfile with the same two arguments
+      int a = atoi(syn_next_arg(args, i));
+      int b = atoi(syn_next_arg(args, i));
+      syn_modes.push_back( SYNCLIENT_MODE_WRITEFILE );
+      syn_iargs.push_back( a );
+      syn_iargs.push_back( b );
+      syn_modes.push_back( SYNCLIENT_MODE_READFILE );
+      syn_iargs.push_back( a );
+      syn_iargs.push_back( b );
+
+    } else if (strcmp(mode,"makedirs") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRS );
+      syn_push_iargs(args, i, 3);
+    } else if (strcmp(mode,"statdirs") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_STATDIRS );
+      syn_push_iargs(args, i, 3);
+    } else if (strcmp(mode,"readdirs") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_READDIRS );
+      syn_push_iargs(args, i, 3);
+
+    } else if (strcmp(mode,"makefiles") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_MAKEFILES );
+      syn_push_iargs(args, i, 3);
+    } else if (strcmp(mode,"createshared") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_CREATESHARED );
+      syn_push_iargs(args, i, 1);
+    } else if (strcmp(mode,"openshared") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_OPENSHARED );
+      syn_push_iargs(args, i, 2);
+
+    } else if (strcmp(mode,"fullwalk") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_FULLWALK );
+      //syn_sargs.push_back( atoi(args[++i]) );
+    } else if (strcmp(mode,"randomwalk") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_RANDOMWALK );
+      syn_push_iargs(args, i, 1);
+
+    } else if (strcmp(mode,"trace") == 0) {
+      // trace takes a file name first, then a repeat count
+      syn_modes.push_back( SYNCLIENT_MODE_TRACE );
+      syn_sargs.push_back( syn_next_arg(args, i) );
+      syn_push_iargs(args, i, 1);
+
+    } else if (strcmp(mode,"until") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_UNTIL );
+      syn_push_iargs(args, i, 1);
+    } else if (strcmp(mode,"sleepuntil") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_SLEEPUNTIL );
+      syn_push_iargs(args, i, 1);
+    } else if (strcmp(mode,"only") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_ONLY );
+      syn_push_iargs(args, i, 1);
+
+    } else if (strcmp(mode,"sleep") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_SLEEP );
+      syn_push_iargs(args, i, 1);
+    } else if (strcmp(mode,"randomsleep") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_RANDOMSLEEP );
+      syn_push_iargs(args, i, 1);
+
+    } else if (strcmp(mode,"opentest") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_OPENTEST );
+      syn_push_iargs(args, i, 1);
+    } else if (strcmp(mode,"optest") == 0) {
+      syn_modes.push_back( SYNCLIENT_MODE_OPTEST );
+      syn_push_iargs(args, i, 1);
+    } else {
+      cerr << "unknown syn arg " << mode << endl;
+      assert(0);
+    }
+  }
+
+  args = nargs;
+}
+
+
+// Build a synthetic workload driver on top of the given Client.
+// The globally parsed --syn description (modes plus their integer and string
+// arguments) is copied into this instance, no thread is running yet, no
+// "only" filter is active, and run_start is stamped with the current time.
+SyntheticClient::SyntheticClient(Client *client)
+  : client(client),
+    thread_id(0),
+    did_readdir(false),
+    modes(syn_modes),
+    sargs(syn_sargs),
+    iargs(syn_iargs),
+    run_start(g_clock.now()),
+    run_only(-1)
+{
+}
+
+
+
+
+// Debug level passed to dout() for the random-walk navigation messages
+// (used by up() and random_walk()).
+#define DBL 2
+
+// pthread start routine.  ptr is the SyntheticClient to drive; the result of
+// run() is dropped and a null pointer is always handed back to pthread_join.
+void *synthetic_client_thread_entry(void *ptr)
+{
+  static_cast<SyntheticClient*>(ptr)->run();
+  return NULL;
+}
+
+// Pop and return the next queued string argument.
+// If the queue is empty, or the popped value is empty or "~", a default name
+// of the form "syn.<client->whoami>.<seq>" is returned instead.
+string SyntheticClient::get_sarg(int seq)
+{
+  string a;
+  if (!sargs.empty()) {
+    a = sargs.front();
+    sargs.pop_front();
+  }
+  if (a.empty() || a == "~") {
+    // "syn." plus two ints of up to 11 characters each, a '.' and the NUL do
+    // not fit in 20 bytes, so size the buffer generously and bound the write.
+    char s[40];
+    snprintf(s, sizeof(s), "syn.%d.%d", client->whoami, seq);
+    a = s;
+  }
+  //cout << "a is " << a << endl;
+  return a;
+}
+
+// Execute the queued workload modes in order.
+//
+// Each mode pops its integer arguments from iargs (and, for some modes, a
+// string argument via get_sarg()) whether or not it actually runs, so the
+// queues stay aligned with the mode list.  Most modes do their work only when
+// run_me() allows it (see the "only" mode).  An unrecognized mode asserts.
+// Always returns 0.
+int SyntheticClient::run()
+{
+  //run_start = g_clock.now();
+  run_until = utime_t(0,0);
+  dout(5) << "run" << endl;
+
+  for (list<int>::iterator it = modes.begin();
+       it != modes.end();
+       it++) {
+    int mode = *it;
+    dout(3) << "mode " << mode << endl;
+
+    switch (mode) {
+    case SYNCLIENT_MODE_RANDOMSLEEP:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          srand(time(0) + getpid() + client->whoami);
+          // rand() % 0 is undefined, so a bound of 0 means "do not sleep"
+          sleep(iarg1 ? rand() % iarg1 : 0);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_SLEEP:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "sleep " << iarg1 << endl;
+          sleep(iarg1);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_ONLY:
+      {
+        // restrict the next mode to a single client id; run_me() clears it again
+        run_only = iargs.front();
+        iargs.pop_front();
+        if (run_only == client->get_nodeid())
+          dout(2) << "only " << run_only << endl;
+      }
+      break;
+
+    case SYNCLIENT_MODE_UNTIL:
+      {
+        // set (or, with 0, clear) the deadline checked by time_to_stop()
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (iarg1) {
+          dout(2) << "until " << iarg1 << endl;
+          utime_t dur(iarg1,0);
+          run_until = run_start + dur;
+        } else {
+          dout(2) << "until " << iarg1 << " (no limit)" << endl;
+          run_until = utime_t(0,0);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_SLEEPUNTIL:
+      {
+        // sleep until iarg1 seconds have passed since run_start
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (iarg1) {
+          dout(2) << "sleepuntil " << iarg1 << endl;
+          utime_t at = g_clock.now() - run_start;
+          if (at.sec() < iarg1)
+            sleep(iarg1 - at.sec());
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_RANDOMWALK:
+      {
+        int iarg1 = iargs.front();
+        iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "randomwalk " << iarg1 << endl;
+          random_walk(iarg1);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_MAKEDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        int iarg3 = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_STATDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        int iarg3 = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_READDIRS:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        int iarg3 = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl;
+          read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3);
+        }
+      }
+      break;
+
+
+    case SYNCLIENT_MODE_MAKEFILES:
+      {
+        int num = iargs.front(); iargs.pop_front();
+        int count = iargs.front(); iargs.pop_front();
+        int priv = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makefiles " << num << " " << count << " " << priv << endl;
+          make_files(num, count, priv, false);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_MAKEFILES2:
+      {
+        int num = iargs.front(); iargs.pop_front();
+        int count = iargs.front(); iargs.pop_front();
+        int priv = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl;
+          make_files(num, count, priv, true);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_CREATESHARED:
+      {
+        string sarg1 = get_sarg(0);
+        int num = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "createshared " << num << endl;
+          create_shared(num);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_OPENSHARED:
+      {
+        string sarg1 = get_sarg(0);
+        int num = iargs.front(); iargs.pop_front();
+        int count = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          dout(2) << "openshared " << num << endl;
+          open_shared(num, count);
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_FULLWALK:
+      {
+        string sarg1 = get_sarg(0);
+        if (run_me()) {
+          dout(2) << "fullwalk" << sarg1 << endl;
+          full_walk(sarg1);
+        }
+      }
+      break;
+    case SYNCLIENT_MODE_REPEATWALK:
+      {
+        string sarg1 = get_sarg(0);
+        if (run_me()) {
+          dout(2) << "repeatwalk " << sarg1 << endl;
+          // keep walking until full_walk reports an error or the deadline
+          while (full_walk(sarg1) == 0) ;
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_WRITEFILE:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        if (run_me())
+          write_file(sarg1, iarg1, iarg2);
+      }
+      break;
+    case SYNCLIENT_MODE_WRSHARED:
+      {
+        string sarg1 = "shared";
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        if (run_me())
+          write_file(sarg1, iarg1, iarg2);
+      }
+      break;
+    case SYNCLIENT_MODE_WRITEBATCH:
+      {
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        int iarg3 = iargs.front(); iargs.pop_front();
+
+        if (run_me())
+          write_batch(iarg1, iarg2, iarg3);
+      }
+      break;
+
+    case SYNCLIENT_MODE_READFILE:
+      {
+        string sarg1 = get_sarg(0);
+        int iarg1 = iargs.front(); iargs.pop_front();
+        int iarg2 = iargs.front(); iargs.pop_front();
+        if (run_me())
+          read_file(sarg1, iarg1, iarg2);
+      }
+      break;
+
+    case SYNCLIENT_MODE_TRACE:
+      {
+        string tfile = get_sarg(0);
+        // queue a "~" so the next get_sarg() yields the default name as prefix
+        sargs.push_front(string("~"));
+        int iarg1 = iargs.front(); iargs.pop_front();
+        string prefix = get_sarg(0);
+
+        if (run_me()) {
+          dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl;
+
+          Trace t(tfile.c_str());
+
+          client->mkdir(prefix.c_str(), 0755);
+
+          for (int i=0; i<iarg1; i++) {
+            utime_t start = g_clock.now();
+
+            if (time_to_stop()) break;
+            play_trace(t, prefix);
+            if (time_to_stop()) break;
+            clean_dir(prefix);
+
+            utime_t lat = g_clock.now();
+            lat -= start;
+
+            dout(1) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << endl;
+            // the first and last iterations are left out of the logged totals
+            if (client_logger
+                && i > 0
+                && i < iarg1-1
+                ) {
+              client_logger->finc("trsum", (double)lat);
+              client_logger->inc("trnum");
+            }
+          }
+        }
+      }
+      break;
+
+
+    case SYNCLIENT_MODE_OPENTEST:
+      {
+        int count = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          for (int i=0; i<count; i++) {
+            int fd = client->open("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY);
+            if (fd > 0) client->close(fd);
+          }
+        }
+      }
+      break;
+
+    case SYNCLIENT_MODE_OPTEST:
+      {
+        int count = iargs.front(); iargs.pop_front();
+        if (run_me()) {
+          client->mknod("test",0777);
+          struct stat st;
+          for (int i=0; i<count; i++) {
+            client->lstat("test", &st);
+            client->chmod("test", 0777);
+          }
+        }
+      }
+      break;
+
+    default:
+      assert(0);
+    }
+  }
+  return 0;
+}
+
+
+// Start run() on a new thread.  Must not be called while a thread is already
+// recorded in thread_id.
+// Returns 0 on success.  pthread_create reports failure through its return
+// value (thread_id is not a reliable indicator), so that is what gets asserted
+// and returned.
+int SyntheticClient::start_thread()
+{
+  assert(!thread_id);
+
+  const int r = pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this);
+  assert(r == 0);
+  return r;
+}
+
+// Block until the thread started by start_thread() has finished.
+// The thread's exit value is not collected.  Always returns 0.
+int SyntheticClient::join_thread()
+{
+  assert(thread_id);
+  pthread_join(thread_id, NULL);
+  return 0;
+}
+
+
+// Draw one value from rand(), scale it into [0,1) in steps of 1/100000, and
+// report whether it falls below p.
+bool roll_die(float p)
+{
+  const float draw = (float)(rand() % 100000) / 100000.0;
+  return draw < p;
+}
+
+// Rebuild op_dist, the distribution random_walk() samples metadata operations
+// from.  Each MDS op is added with its weight from the g_conf.fakeclient_op_*
+// settings, then the distribution is normalized.
+void SyntheticClient::init_op_dist()
+{
+ op_dist.clear();
+ op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat );
+ op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime );
+ op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod );
+ op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown );
+
+ op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir );
+ op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod );
+ op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link );
+ op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink );
+ op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename );
+
+ op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir );
+ op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir );
+ op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink );
+
+ // opens are weighted by the read-open setting only
+ op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd );
+ //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read );
+ //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write );
+ op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate );
+ op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync );
+ op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close()
+ op_dist.normalize();
+}
+
+// Move the random walker's cwd up one level (drop the last path component)
+// and forget the cached listing of the directory just left.
+// NOTE(review): random_walk() can call this when cwd.depth() is 0, which passes
+// -1 to prefixpath(); confirm filepath tolerates that.
+void SyntheticClient::up()
+{
+ cwd = cwd.prefixpath(cwd.depth()-1);
+ dout(DBL) << "cd .. -> " << cwd << endl;
+ clear_dir();
+}
+
+
+// Look up the client file handle recorded for trace id `id`.
+// Returns 0 when the id is not open, without adding an entry to the map.
+static __int64_t trace_lookup_fh(map<__int64_t, __int64_t>& files, __int64_t id)
+{
+  map<__int64_t, __int64_t>::iterator fi = files.find(id);
+  return fi == files.end() ? 0 : fi->second;
+}
+
+// Replay trace t once against the client.
+//
+// Each record is an op name followed by that op's arguments, read from the
+// trace in order; path arguments are fetched with t.get_string(p), where p is
+// the given prefix.  Files opened by the trace are tracked by the trace's own
+// id and any still open at the end are closed.  Replay stops early when
+// time_to_stop() fires.  An unknown op, or "fsync", asserts.
+// Results of the individual client calls are ignored.  Always returns 0.
+int SyntheticClient::play_trace(Trace& t, string& prefix)
+{
+  dout(4) << "play trace" << endl;
+  t.start();
+
+  const char *p = prefix.c_str();
+
+  // trace file id -> client file handle (shadows the member of the same name)
+  map<__int64_t, __int64_t> open_files;
+
+  while (!t.end()) {
+
+    if (time_to_stop()) break;
+
+    // op
+    const char *op = t.get_string();
+    dout(4) << "trace op " << op << endl;
+    if (strcmp(op, "link") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->link(a,b);
+    } else if (strcmp(op, "unlink") == 0) {
+      const char *a = t.get_string(p);
+      client->unlink(a);
+    } else if (strcmp(op, "rename") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->rename(a,b);
+    } else if (strcmp(op, "mkdir") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->mkdir(a, b);
+    } else if (strcmp(op, "rmdir") == 0) {
+      const char *a = t.get_string(p);
+      client->rmdir(a);
+    } else if (strcmp(op, "symlink") == 0) {
+      const char *a = t.get_string(p);
+      const char *b = t.get_string(p);
+      client->symlink(a,b);
+    } else if (strcmp(op, "readlink") == 0) {
+      const char *a = t.get_string(p);
+      char buf[100];
+      client->readlink(a, buf, 100);
+    } else if (strcmp(op, "lstat") == 0) {
+      struct stat st;
+      const char *a = t.get_string(p);
+      client->lstat(a, &st);
+    } else if (strcmp(op, "chmod") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->chmod(a, b);
+    } else if (strcmp(op, "chown") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      __int64_t c = t.get_int();
+      client->chown(a, b, c);
+    } else if (strcmp(op, "utime") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      __int64_t c = t.get_int();
+      struct utimbuf u;
+      u.actime = b;
+      u.modtime = c;
+      client->utime(a, &u);
+    } else if (strcmp(op, "mknod") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->mknod(a, b);
+    } else if (strcmp(op, "getdir") == 0) {
+      const char *a = t.get_string(p);
+      map<string,inode_t> contents;
+      client->getdir(a, contents);
+    } else if (strcmp(op, "open") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      __int64_t id = t.get_int();
+      __int64_t fh = client->open(a, b);
+      open_files[id] = fh;
+    } else if (strcmp(op, "close") == 0) {
+      __int64_t id = t.get_int();
+      map<__int64_t, __int64_t>::iterator fi = open_files.find(id);
+      if (fi != open_files.end()) {
+        if (fi->second > 0) client->close(fi->second);
+        open_files.erase(fi);
+      }
+    } else if (strcmp(op, "truncate") == 0) {
+      const char *a = t.get_string(p);
+      __int64_t b = t.get_int();
+      client->truncate(a,b);
+    } else if (strcmp(op, "read") == 0) {
+      __int64_t id = t.get_int();
+      // an id the trace never opened reads through handle 0, without
+      // leaving a phantom entry behind for the leftover-close pass
+      __int64_t fh = trace_lookup_fh(open_files, id);
+      int size = t.get_int();
+      int off = t.get_int();
+      char *buf = new char[size];
+      client->read(fh, buf, size, off);
+      delete[] buf;
+    } else if (strcmp(op, "write") == 0) {
+      __int64_t id = t.get_int();
+      __int64_t fh = trace_lookup_fh(open_files, id);
+      int size = t.get_int();
+      int off = t.get_int();
+      char *buf = new char[size];
+      memset(buf, 1, size);  // let's write 1's!
+      client->write(fh, buf, size, off);
+      delete[] buf;
+    } else if (strcmp(op, "fsync") == 0) {
+      assert(0);
+    } else
+      assert(0);
+  }
+
+  // close open files
+  for (map<__int64_t, __int64_t>::iterator fi = open_files.begin();
+       fi != open_files.end();
+       fi++) {
+    dout(1) << "leftover close " << fi->second << endl;
+    if (fi->second > 0) client->close(fi->second);
+  }
+
+  return 0;
+}
+
+
+// Empty out basedir: files are unlinked, subdirectories are cleaned
+// recursively and then removed.  basedir itself is left in place.
+// Returns the (negative) getdir result if the listing fails, otherwise 0;
+// per-entry failures are logged or ignored.  Stops early when time_to_stop()
+// fires.
+int SyntheticClient::clean_dir(string& basedir)
+{
+  // list the directory
+  map<string, inode_t> entries;
+  int rc = client->getdir(basedir.c_str(), entries);
+  if (rc < 0) {
+    dout(1) << "readdir on " << basedir << " returns " << rc << endl;
+    return rc;
+  }
+
+  for (map<string, inode_t>::iterator ent = entries.begin();
+       ent != entries.end();
+       ++ent) {
+    if (time_to_stop()) break;
+
+    string path = basedir + "/" + ent->first;
+
+    struct stat st;
+    int sr = client->lstat(path.c_str(), &st);
+    if (sr < 0) {
+      dout(1) << "stat error on " << path << " r=" << sr << endl;
+      continue;
+    }
+
+    const bool is_dir = (st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR;
+    if (!is_dir) {
+      client->unlink(path.c_str());
+      continue;
+    }
+
+    // directory: empty it first, then remove it
+    clean_dir(path);
+    client->rmdir(path.c_str());
+  }
+
+  return 0;
+}
+
+
+// Walk the tree under basedir depth-first: list each directory, lstat every
+// entry, and descend into the ones that are directories.
+// Returns -1 immediately if time_to_stop() already holds, the (negative)
+// getdir result if the listing of basedir fails, otherwise 0.  Results of the
+// recursive calls are not propagated.
+int SyntheticClient::full_walk(string& basedir)
+{
+  if (time_to_stop())
+    return -1;
+
+  // list the directory
+  map<string, inode_t> entries;
+  int rc = client->getdir(basedir.c_str(), entries);
+  if (rc < 0) {
+    dout(1) << "readdir on " << basedir << " returns " << rc << endl;
+    return rc;
+  }
+
+  for (map<string, inode_t>::iterator ent = entries.begin();
+       ent != entries.end();
+       ++ent) {
+    string path = basedir + "/" + ent->first;
+
+    struct stat st;
+    int sr = client->lstat(path.c_str(), &st);
+    if (sr < 0) {
+      dout(1) << "stat error on " << path << " r=" << sr << endl;
+      continue;
+    }
+
+    const bool is_dir = (st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR;
+    if (is_dir)
+      full_walk(path);
+  }
+
+  return 0;
+}
+
+// Create a directory tree rooted at basedir: `files` files named file.<i> in
+// every directory, and `dirs` subdirectories named dir.<i> at every level,
+// recursing until depth reaches 0.
+// Returns 0 on success or when time_to_stop() fires, -1 if basedir cannot be
+// created or a child path does not fit the path buffer.  Results of the
+// recursive calls and of mknod are ignored.
+int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  if (time_to_stop()) return 0;
+
+  // make sure base dir exists
+  int r = client->mkdir(basedir, 0755);
+  if (r != 0) {
+    dout(1) << "can't make base dir? " << basedir << endl;
+    return -1;
+  }
+
+  // children
+  char d[500];
+  dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    // paths grow with every level of recursion, so bound the write
+    int n = snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    if (n < 0 || n >= (int)sizeof(d)) {
+      dout(1) << "make_dirs path too long under " << basedir << endl;
+      return -1;
+    }
+    client->mknod(d, 0644);
+  }
+
+  if (depth == 0) return 0;
+
+  for (int i=0; i<dirs; i++) {
+    int n = snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+    if (n < 0 || n >= (int)sizeof(d)) {
+      dout(1) << "make_dirs path too long under " << basedir << endl;
+      return -1;
+    }
+    make_dirs(d, dirs, files, depth-1);
+  }
+
+  return 0;
+}
+
+// lstat every file and directory of a tree laid out the way make_dirs()
+// creates it (file.<i> and dir.<i> children, `depth` levels deep).
+// Returns 0 on success or when time_to_stop() fires, -1 if basedir cannot be
+// stat'ed or a child path does not fit the path buffer.  Results of the
+// per-file lstat calls and of the recursive calls are ignored.
+int SyntheticClient::stat_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  if (time_to_stop()) return 0;
+
+  // make sure base dir exists
+  struct stat st;
+  int r = client->lstat(basedir, &st);
+  if (r != 0) {
+    dout(1) << "can't stat base dir? " << basedir << endl;
+    return -1;
+  }
+
+  // children
+  char d[500];
+  dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    // paths grow with every level of recursion, so bound the write
+    int n = snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    if (n < 0 || n >= (int)sizeof(d)) {
+      dout(1) << "stat_dirs path too long under " << basedir << endl;
+      return -1;
+    }
+    client->lstat(d, &st);
+  }
+
+  if (depth == 0) return 0;
+
+  for (int i=0; i<dirs; i++) {
+    int n = snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+    if (n < 0 || n >= (int)sizeof(d)) {
+      dout(1) << "stat_dirs path too long under " << basedir << endl;
+      return -1;
+    }
+    stat_dirs(d, dirs, files, depth-1);
+  }
+
+  return 0;
+}
+// getdir every directory and lstat every file of a tree laid out the way
+// make_dirs() creates it, timing each call and adding the elapsed time to the
+// "readdir" / "stat" counters of client_logger when one is set.
+// Returns 0 on success or when time_to_stop() fires, -1 as soon as a getdir or
+// lstat fails, a child path does not fit the path buffer, or a recursive call
+// reports failure.
+int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  if (time_to_stop()) return 0;
+
+  struct stat st;
+
+  // children
+  char d[500];
+  dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+
+  map<string,inode_t> contents;
+  utime_t s = g_clock.now();
+  int r = client->getdir(basedir, contents);
+  utime_t e = g_clock.now();
+  e -= s;
+  if (client_logger) client_logger->finc("readdir", e);
+  if (r < 0) {
+    dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl;
+    return -1;
+  }
+
+  for (int i=0; i<files; i++) {
+    // paths grow with every level of recursion, so bound the write
+    int n = snprintf(d, sizeof(d), "%s/file.%d", basedir, i);
+    if (n < 0 || n >= (int)sizeof(d)) {
+      dout(2) << "read_dirs path too long under " << basedir << ", stopping" << endl;
+      return -1;
+    }
+    utime_t s = g_clock.now();
+    if (client->lstat(d, &st) < 0) {
+      dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl;
+      return -1;
+    }
+    utime_t e = g_clock.now();
+    e -= s;
+    if (client_logger) client_logger->finc("stat", e);
+  }
+
+  if (depth > 0)
+    for (int i=0; i<dirs; i++) {
+      int n = snprintf(d, sizeof(d), "%s/dir.%d", basedir, i);
+      if (n < 0 || n >= (int)sizeof(d)) {
+        dout(2) << "read_dirs path too long under " << basedir << ", stopping" << endl;
+        return -1;
+      }
+      if (read_dirs(d, dirs, files, depth-1) < 0) return -1;
+    }
+
+  return 0;
+}
+
+
+// Create `num` files in each of `count` run directories.
+//
+// With priv set every client uses its own directories (dir.<id>.run<c>) and
+// creates them itself.  Otherwise all clients share client 0's directories:
+// client 0 creates them while every other client sleeps 5 seconds instead.
+// When `more` is set each new file is also stat'ed, opened read-only, unlinked
+// and closed.  Returns 0, early if time_to_stop() fires.
+int SyntheticClient::make_files(int num, int count, int priv, bool more)
+{
+  const int whoami = client->get_nodeid();
+  char path[255];
+
+  // directories: whoever owns them makes them (in the shared case the owner
+  // is client 0, so its own id is the directory id there too)
+  if (priv || whoami == 0) {
+    for (int run=0; run<count; run++) {
+      sprintf(path,"dir.%d.run%d", whoami, run);
+      client->mkdir(path, 0755);
+    }
+  } else {
+    sleep(5);
+  }
+
+  // files
+  const int owner = priv ? whoami : 0;
+  struct stat st;
+  for (int run=0; run<count; run++) {
+    for (int n=0; n<num; n++) {
+      sprintf(path,"dir.%d.run%d/file.client%d.%d", owner, run, whoami, n);
+
+      client->mknod(path, 0644);
+
+      if (more) {
+        client->lstat(path, &st);
+        int fd = client->open(path, O_RDONLY);
+        client->unlink(path);
+        client->close(fd);
+      }
+
+      if (time_to_stop())
+        return 0;
+    }
+  }
+
+  return 0;
+}
+
+
+// mknod the files file.0 .. file.<num-1> (mode 0644) with relative names.
+// Results are ignored.  Always returns 0.
+int SyntheticClient::create_shared(int num)
+{
+  char name[255];
+  int n = 0;
+  while (n < num) {
+    sprintf(name,"file.%d", n);
+    client->mknod(name, 0644);
+    ++n;
+  }
+
+  return 0;
+}
+
+// `count` times over: open file.0 .. file.<num-1> read-only, then close them
+// all again in the order they were opened.  Open results are not checked.
+// Always returns 0.
+int SyntheticClient::open_shared(int num, int count)
+{
+  char name[255];
+  for (int round=0; round<count; round++) {
+    // open them all
+    list<int> fds;
+    for (int n=0; n<num; n++) {
+      sprintf(name,"file.%d", n);
+      fds.push_back( client->open(name,O_RDONLY) );
+    }
+
+    // and close them, oldest first
+    for (list<int>::iterator f = fds.begin(); f != fds.end(); ++f)
+      client->close(*f);
+  }
+
+  return 0;
+}
+
+
+// Write `size` MB to file fn in chunks of `wrsize` bytes.
+//
+// Every chunk is filled with 16-byte fingerprint records of four ints each:
+// byte offset within the chunk, chunk number, client node id, and 0.
+// read_file() checks the same layout.
+// Returns the (negative) open result if the file cannot be opened, -1 for a
+// non-positive wrsize, otherwise 0.  Stops early when time_to_stop() fires.
+int SyntheticClient::write_file(string& fn, int size, int wrsize)   // size is in MB, wrsize in bytes
+{
+  if (wrsize <= 0) {
+    // would divide by zero below
+    dout(0) << "write_file: bad write size " << wrsize << endl;
+    return -1;
+  }
+  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)wrsize;
+
+  int fd = client->open(fn.c_str(), O_RDWR|O_CREAT);
+  dout(5) << "writing to " << fn << " fd " << fd << endl;
+  if (fd < 0) return fd;
+
+  // allocate only once the open has succeeded, so the early return cannot leak.
+  // the slack covers the fingerprint loop, which writes whole 16-byte records
+  // and may run a few bytes past wrsize.
+  char *buf = new char[wrsize+100];
+  memset(buf, 7, wrsize);
+
+  for (__uint64_t i=0; i<chunks; i++) {
+    if (time_to_stop()) {
+      dout(0) << "stopping" << endl;
+      break;
+    }
+    dout(2) << "writing block " << i << "/" << chunks << endl;
+
+    // fill buf with a fingerprint
+    int *p = (int*)buf;
+    while ((char*)p < buf + wrsize) {
+      *p = (char*)p - buf;
+      p++;
+      *p = (int)i;
+      p++;
+      *p = client->get_nodeid();
+      p++;
+      *p = 0;
+      p++;
+    }
+
+    // i is 64-bit so the byte offset does not wrap for files past 4 GB
+    client->write(fd, buf, wrsize, i*wrsize);
+  }
+
+  client->close(fd);
+  delete[] buf;
+
+  return 0;
+}
+
+// Write `nfile` files of `size` MB each in chunks of `wrsize` bytes.
+// File names come from get_sarg(i) for i = 0 .. nfile-1.  The results of
+// write_file() are ignored.  Always returns 0.
+int SyntheticClient::write_batch(int nfile, int size, int wrsize)
+{
+  int seq = 0;
+  while (seq < nfile) {
+    string name = get_sarg(seq);
+    dout(0) << "Write file " << name << endl;
+    write_file(name, size, wrsize);
+    ++seq;
+  }
+  return 0;
+}
+
+// Read `size` MB from file fn in chunks of `rdsize` bytes and check each chunk
+// against the fingerprint layout written by write_file(): 16-byte records of
+// (offset within chunk, chunk number, client node id, 0).
+// The first mismatch in a chunk is logged in full, the rest are only counted.
+// Returns the (negative) open result if the file cannot be opened, -1 for a
+// non-positive rdsize, otherwise 0 (mismatches do not change the result).
+// Stops early when time_to_stop() fires.
+int SyntheticClient::read_file(string& fn, int size, int rdsize)   // size is in MB, rdsize in bytes
+{
+  if (rdsize <= 0) {
+    // would divide by zero below
+    dout(0) << "read_file: bad read size " << rdsize << endl;
+    return -1;
+  }
+  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize;
+
+  int fd = client->open(fn.c_str(), O_RDONLY);
+  dout(5) << "reading from " << fn << " fd " << fd << endl;
+  if (fd < 0) return fd;
+
+  // allocate only once the open has succeeded, so the early return cannot leak
+  char *buf = new char[rdsize];
+  memset(buf, 1, rdsize);
+
+  for (__uint64_t i=0; i<chunks; i++) {
+    if (time_to_stop()) break;
+    dout(2) << "reading block " << i << "/" << chunks << endl;
+    // i is 64-bit so the byte offset does not wrap for files past 4 GB
+    client->read(fd, buf, rdsize, i*rdsize);
+
+    // verify fingerprint
+    int *p = (int*)buf;
+    int bad = 0;
+    int boff, bgoff, bchunk, bclient, bzero;
+    while ((char*)p + 32 < buf + rdsize) {
+      boff = *p;
+      bgoff = (int)((char*)p - buf);
+      p++;
+      bchunk = *p;
+      p++;
+      bclient = *p;
+      p++;
+      bzero = *p;
+      p++;
+      if (boff != bgoff ||
+          bchunk != (int)i ||
+          bclient != client->get_nodeid() ||
+          bzero != 0) {
+        if (!bad)
+          dout(0) << "WARNING: wrong data from OSD, it should be "
+                  << "(block=" << i
+                  << " offset=" << bgoff
+                  << " client=" << client->get_nodeid() << ")"
+                  << " .. but i read back .. "
+                  << "(block=" << bchunk
+                  << " offset=" << boff
+                  << " client=" << bclient << " zero=" << bzero << ")" << endl;
+
+        bad++;
+      }
+    }
+    if (bad)
+      dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl;
+
+  }
+
+  client->close(fd);
+  delete[] buf;
+
+  return 0;
+}
+
+
+
+// Perform up to num_req steps of a random walk over the namespace.
+//
+// Each step either moves cwd up, moves it down into a known subdirectory, or
+// issues one metadata operation sampled from op_dist against cwd or a random
+// entry of the cached listing.  Several ops fall back to a READDIR when the
+// cached listing is empty.  A negative result from the chosen op moves cwd up
+// one level.  Files opened along the way are tracked in open_files and closed
+// before returning.  Stops early when time_to_stop() fires.  Always returns 0.
+int SyntheticClient::random_walk(int num_req)
+{
+  int left = num_req;
+
+  //dout(1) << "random_walk() will do " << left << " ops" << endl;
+
+  init_op_dist();  // set up metadata op distribution
+
+  while (left > 0) {
+    left--;
+
+    if (time_to_stop()) break;
+
+    // ascend?
+    if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) {
+      dout(DBL) << "die says up" << endl;
+      up();
+      continue;
+    }
+
+    // descend?
+    // (the original multiplied the boolean roll by .9, which does not change
+    // its truth value; the die is still rolled before subdirs is consulted)
+    if (roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) {
+      string s = get_random_subdir();
+      cwd.add_dentry( s );
+      dout(DBL) << "cd " << s << " -> " << cwd << endl;
+      clear_dir();
+      continue;
+    }
+
+    int op = 0;
+
+    if (contents.empty() && roll_die(.3)) {
+      if (did_readdir) {
+        dout(DBL) << "empty dir, up" << endl;
+        up();
+      } else
+        op = MDS_OP_READDIR;
+    } else {
+      op = op_dist.sample();
+    }
+    //dout(DBL) << "op is " << op << endl;
+
+    int r = 0;
+
+    // do op
+    // The checks below run in a fixed order; a branch that rewrites `op`
+    // only takes effect if the handler for the new op comes later.
+    if (op == MDS_OP_UNLINK) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else
+        r = client->unlink( get_random_sub() );  // will fail on dirs
+    }
+
+    if (op == MDS_OP_RENAME) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else {
+        r = client->rename( get_random_sub(), make_sub("ren") );
+      }
+    }
+
+    if (op == MDS_OP_MKDIR) {
+      r = client->mkdir( make_sub("mkdir"), 0755);
+    }
+
+    if (op == MDS_OP_RMDIR) {
+      if (!subdirs.empty())
+        r = client->rmdir( get_random_subdir() );
+      else
+        r = client->rmdir( cwd.c_str() );  // will pbly fail
+    }
+
+    if (op == MDS_OP_SYMLINK) {
+    }
+
+    if (op == MDS_OP_CHMOD) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else
+        r = client->chmod( get_random_sub(), rand() & 0755 );
+    }
+
+    if (op == MDS_OP_CHOWN) {
+      if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() );
+      else
+        r = client->chown( get_random_sub(), rand(), rand() );
+    }
+
+    if (op == MDS_OP_LINK) {
+    }
+
+    if (op == MDS_OP_UTIME) {
+      struct utimbuf b;
+      memset(&b, 1, sizeof(b));
+      if (contents.empty())
+        r = client->utime( cwd.c_str(), &b );
+      else
+        r = client->utime( get_random_sub(), &b );
+    }
+
+    if (op == MDS_OP_MKNOD) {
+      r = client->mknod( make_sub("mknod"), 0644);
+    }
+
+    if (op == MDS_OP_OPEN) {
+      if (contents.empty())
+        op = MDS_OP_READDIR;
+      else {
+        r = client->open( get_random_sub(), O_RDONLY );
+        if (r > 0) {
+          assert(open_files.count(r) == 0);
+          open_files.insert(r);
+        }
+      }
+    }
+
+    if (op == MDS_OP_RELEASE) {   // actually, close
+      if (open_files.empty())
+        op = MDS_OP_STAT;
+      else {
+        int fh = get_random_fh();
+        r = client->close( fh );
+        if (r == 0) open_files.erase(fh);
+      }
+    }
+
+    if (op == MDS_OP_STAT) {
+      struct stat st;
+      if (contents.empty()) {
+        if (did_readdir) {
+          if (roll_die(.1)) {
+            dout(DBL) << "stat in empty dir, up" << endl;
+            up();
+          } else {
+            op = MDS_OP_MKNOD;
+          }
+        } else
+          op = MDS_OP_READDIR;
+      } else
+        r = client->lstat(get_random_sub(), &st);
+    }
+
+    if (op == MDS_OP_READDIR) {
+      clear_dir();
+
+      map<string, inode_t> c;
+      r = client->getdir( cwd.c_str(), c );
+
+      for (map<string, inode_t>::iterator it = c.begin();
+           it != c.end();
+           it++) {
+        //dout(DBL) << " got " << it->first << endl;
+        contents[it->first] = it->second;
+        if (it->second.is_dir())
+          subdirs.insert(it->first);
+      }
+
+      did_readdir = true;
+    }
+
+    // errors?
+    if (r < 0) {
+      // reevaluate cwd.
+      //while (cwd.depth()) {
+      //if (client->lookup(cwd)) break;  // it's in the cache
+
+      //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl;
+      dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." << endl;
+      up();
+      //}
+    }
+  }
+
+  // close files
+  dout(DBL) << "closing files" << endl;
+  while (!open_files.empty()) {
+    int fh = get_random_fh();
+    client->close( fh );
+    // forget the handle even if close failed, or this loop would never end
+    open_files.erase(fh);
+  }
+
+  dout(DBL) << "done" << endl;
+  return 0;
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __SYNTHETICCLIENT_H
+#define __SYNTHETICCLIENT_H
+
+#include <pthread.h>
+
+#include "Client.h"
+#include "include/Distribution.h"
+
+#include "Trace.h"
+
+// Workload mode identifiers.  parse_syn_options() queues them and
+// SyntheticClient::run() switches on them; the trailing comments name the
+// integer arguments a mode takes.
+
+// namespace walks
+#define SYNCLIENT_MODE_RANDOMWALK 1
+#define SYNCLIENT_MODE_FULLWALK 2
+#define SYNCLIENT_MODE_REPEATWALK 7
+
+#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth
+#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth
+#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth
+
+#define SYNCLIENT_MODE_MAKEFILES 11 // num count private
+#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private
+#define SYNCLIENT_MODE_CREATESHARED 13 // num
+#define SYNCLIENT_MODE_OPENSHARED 14 // num count
+
+// file data I/O
+#define SYNCLIENT_MODE_WRITEFILE 20
+#define SYNCLIENT_MODE_READFILE 21
+#define SYNCLIENT_MODE_WRITEBATCH 22
+#define SYNCLIENT_MODE_WRSHARED 23
+
+// trace replay
+#define SYNCLIENT_MODE_TRACE 30
+
+#define SYNCLIENT_MODE_OPENTEST 40
+#define SYNCLIENT_MODE_OPTEST 41
+
+// control: client filter, deadline, timed wait
+#define SYNCLIENT_MODE_ONLY 50
+#define SYNCLIENT_MODE_UNTIL 51
+#define SYNCLIENT_MODE_SLEEPUNTIL 52
+
+#define SYNCLIENT_MODE_RANDOMSLEEP 61
+#define SYNCLIENT_MODE_SLEEP 62
+
+
+
+
+void parse_syn_options(vector<char*>& args);
+
+// Drives a Client with synthetic workloads: a list of modes, each with
+// queued integer/string arguments, executed in order by run(), optionally on
+// its own thread (start_thread / join_thread).
+class SyntheticClient {
+  Client *client;         // the client all operations are issued through
+
+  pthread_t thread_id;    // 0 until start_thread() has been called
+
+  Distribution op_dist;   // metadata op mix sampled by random_walk()
+
+  void init_op_dist();
+  int get_op();
+
+
+  // random-walk state: current directory and the cached result of its last listing
+  filepath cwd;
+  map<string, inode_t> contents;
+  set<string> subdirs;
+  bool did_readdir;
+  set<int> open_files;
+
+  void up();
+
+  // Forget the cached listing of the current directory.
+  void clear_dir() {
+    contents.clear();
+    subdirs.clear();
+    did_readdir = false;
+  }
+
+  // Pick one of the currently open file handles at random.
+  // open_files must not be empty (the modulo below would divide by zero).
+  int get_random_fh() {
+    assert(!open_files.empty());
+    int r = rand() % open_files.size();
+    set<int>::iterator it = open_files.begin();
+    while (r--) it++;
+    return *it;
+  }
+
+
+  // Path of a randomly chosen known subdirectory of cwd.  The path is built
+  // in the member n1, so each call replaces the previous result.
+  // NOTE(review): the returned pointer comes from n1.get_path(); this is only
+  // safe if get_path() returns a reference into n1 -- confirm in filepath.
+  filepath n1;
+  const char *get_random_subdir() {
+    assert(!subdirs.empty());
+    int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2;  // non-uniform distn
+    set<string>::iterator it = subdirs.begin();
+    while (r--) it++;
+
+    n1 = cwd;
+    n1.add_dentry( *it );
+    return n1.get_path().c_str();
+  }
+
+  // Path of a randomly chosen entry of the cached listing of cwd, built in
+  // the member n2 (same lifetime caveat as get_random_subdir).
+  filepath n2;
+  const char *get_random_sub() {
+    assert(!contents.empty());
+    int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2;  // non-uniform distn
+    if (cwd.depth() && cwd.last_bit().length())
+      r += cwd.last_bit().c_str()[0];  // slightly permuted
+    r %= contents.size();
+
+    map<string,inode_t>::iterator it = contents.begin();
+    while (r--) it++;
+
+    n2 = cwd;
+    n2.add_dentry( it->first );
+    return n2.get_path().c_str();
+  }
+
+  // Path "<cwd>/<base>.<0..99>" for a new entry, built in the member `sub`.
+  // base is const so string literals can be passed; the write into sub_s is
+  // bounded, so an over-long base is truncated rather than overflowing it.
+  filepath sub;
+  char sub_s[50];
+  const char *make_sub(const char *base) {
+    snprintf(sub_s, sizeof(sub_s), "%s.%d", base, rand() % 100);
+    string f = sub_s;
+    sub = cwd;
+    sub.add_dentry(f);
+    return sub.c_str();
+  }
+
+ public:
+  SyntheticClient(Client *client);
+
+  int start_thread();
+  int join_thread();
+
+  int run();
+
+  // Should this client execute the current mode?
+  // With no "only" filter pending: yes.  With one pending: yes only if it
+  // names this client.  Either way the filter is consumed, so it applies to
+  // a single run_me() call.
+  bool run_me() {
+    if (run_only >= 0) {
+      if (run_only == client->get_nodeid()) {
+        run_only = -1;
+        return true;
+      }
+      run_only = -1;
+      return false;
+    }
+    return true;
+  }
+
+  // run() will do one of these things:
+  list<int> modes;
+  list<string> sargs;
+  list<int> iargs;
+  utime_t run_start;
+  utime_t run_until;   // deadline; a zero seconds field means no limit
+
+  int run_only;        // client id the next mode is restricted to, or -1
+
+  string get_sarg(int seq);
+
+  // True once a deadline is set (run_until has non-zero seconds) and has passed.
+  bool time_to_stop() {
+    utime_t now = g_clock.now();
+    if (0) cout << "time_to_stop .. now " << now
+                << " until " << run_until
+                << " start " << run_start
+                << endl;
+    if (run_until.sec() && now > run_until)
+      return true;
+    else
+      return false;
+  }
+
+  // Plain concatenation of prefix and rest (no separator is inserted).
+  string compose_path(string& prefix, const char *rest) {
+    return prefix + rest;
+  }
+
+  int full_walk(string& fromdir);
+  int random_walk(int n);
+
+  int make_dirs(const char *basedir, int dirs, int files, int depth);
+  int stat_dirs(const char *basedir, int dirs, int files, int depth);
+  int read_dirs(const char *basedir, int dirs, int files, int depth);
+  int make_files(int num, int count, int priv, bool more);
+
+  int create_shared(int num);
+  int open_shared(int num, int count);
+
+  int write_file(string& fn, int mb, int chunk);
+  int write_batch(int nfile, int mb, int chunk);
+  int read_file(string& fn, int mb, int chunk);
+
+  int clean_dir(string& basedir);
+
+  int play_trace(Trace& t, string& prefix);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "Trace.h"
+
+#include <iostream>
+#include <cassert>
+#include <map>
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "common/Mutex.h"
+
+#include "config.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+Mutex trace_lock; // held by Trace's constructor and destructor while they touch `traces` and the TokenList ref counts
+
+// One loaded trace file: the raw bytes plus an index of the non-empty
+// lines inside them.  Instances are shared between Trace objects through
+// the `ref` count, which the Trace constructor/destructor maintain.
+class TokenList {
+public:
+  string filename;             // path the data was read from
+  char *data;                  // owned buffer; newlines are overwritten with NULs by the loader
+  int len;                     // size of data in bytes
+  list<const char *> tokens;   // pointers into data, one per non-empty line
+
+  int ref;                     // number of Trace objects using this list
+
+  TokenList() : data(0), len(0), ref(0) {}
+  ~TokenList() {
+    delete[] data;
+  }
+
+private:
+  // data is freed in the destructor, so a copy would lead to a double
+  // delete; declared but not defined to forbid copying.
+  TokenList(const TokenList&);
+  TokenList& operator=(const TokenList&);
+};
+
+map<string, TokenList*> traces; // loaded trace files keyed by filename; entries are added in Trace::Trace and erased in Trace::~Trace
+
+
+//
+// Attach to the token list for trace file `f`, loading and indexing the
+// file on first use.  Later Trace objects for the same filename share the
+// already-loaded TokenList.  I/O failures trip an assert.
+Trace::Trace(const char* f)
+{
+  string filename = f;
+
+  trace_lock.Lock();
+
+  map<string, TokenList*>::iterator cached = traces.find(filename);
+  if (cached != traces.end()) {
+    tl = cached->second;
+  } else {
+    tl = new TokenList;
+    tl->filename = filename;
+
+    // slurp the whole file into a rope
+    crope cr;
+    int fd = open(filename.c_str(), O_RDONLY);
+    assert(fd >= 0);   // 0 is a valid descriptor; only a negative value means failure
+    char buf[100];
+    while (1) {
+      int r = read(fd, buf, sizeof(buf));
+      if (r == 0) break;
+      assert(r > 0);
+      cr.append(buf, r);
+    }
+    close(fd);
+
+    // copy into a flat buffer, appending a '\n' sentinel so the scan
+    // below always finds a terminator
+    tl->len = cr.length()+1;
+    tl->data = new char[tl->len];
+    memcpy(tl->data, cr.c_str(), cr.length());
+    tl->data[tl->len-1] = '\n';
+
+    // index: turn every newline into a NUL and remember the start of
+    // each non-empty line
+    int o = 0;
+    while (o < tl->len) {
+      char *n = tl->data + o;
+
+      // find newline (the sentinel guarantees there is one)
+      while (tl->data[o] != '\n') o++;
+      assert(tl->data[o] == '\n');
+      tl->data[o] = 0;
+
+      if (tl->data + o > n) tl->tokens.push_back(n);
+      o++;
+    }
+
+    dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl;
+    traces[filename] = tl;
+  }
+
+  tl->ref++;
+
+  trace_lock.Unlock();
+}
+
+// Drop this object's reference on the shared token list.
+Trace::~Trace()
+{
+  trace_lock.Lock();
+
+  // the last Trace for a file also retires the shared TokenList
+  if (--tl->ref == 0) {
+    traces.erase(tl->filename);
+    delete tl;
+  }
+
+  trace_lock.Unlock();
+}
+
+
+list<const char*>& Trace::get_list() // returns the shared TokenList's token list by reference (no copy, no locking here)
+{
+ return tl->tokens;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __CLIENT_TRACE_H
+#define __CLIENT_TRACE_H
+
+#include <cassert>
+#include <list>
+#include <string>
+using namespace std;
+
+/*
+
+ this class is more like an iterator over a constant tokenlist (which
+ is protected by a mutex, see Trace.cc)
+
+ */
+
+// Cursor over the token list of one trace file.  start() must be called
+// before get_string()/get_int()/end(): it is what initialises _cur, _end
+// and ns.
+class Trace {
+  class TokenList *tl;
+
+ public:
+  Trace(const char* filename);
+  ~Trace();
+
+  list<const char*>& get_list();
+
+  list<const char*>::iterator _cur;
+  list<const char*>::iterator _end;
+
+  // Rewind to the first token.
+  void start() {
+    _cur = get_list().begin();
+    _end = get_list().end();
+    ns = 0;
+  }
+
+  // Ring of scratch buffers for prefix-substituted tokens.  A pointer
+  // returned by get_string(prefix) stays valid until ten further
+  // substitutions have reused its slot.
+  char strings[10][200];
+  int ns;
+
+  // Return the next token and advance.  If `prefix` is given and the
+  // token starts with "/prefix" (optionally after one leading character,
+  // which is dropped), that marker is replaced by `prefix`.
+  const char *get_string(const char *prefix = 0) {
+    assert(_cur != _end);
+    const char *s = *_cur;
+    _cur++;
+    if (prefix) {
+      const char *hit = strstr(s, "/prefix");
+      if (hit == s || hit == s+1) {
+        // continue right after the marker wherever it was found; using
+        // a fixed offset from s left a stray 'x' in the s+1 case
+        const char *rest = hit + strlen("/prefix");
+        size_t plen = strlen(prefix);
+        // refuse to overrun the fixed-size scratch slot
+        assert(plen + strlen(rest) < sizeof(strings[ns]));
+        strcpy(strings[ns], prefix);
+        strcpy(strings[ns] + plen, rest);
+        s = (const char*)strings[ns];
+        ns++;
+        if (ns == 10) ns = 0;
+      }
+    }
+    return s;
+  }
+
+  // Next token parsed as a decimal integer (atoll semantics).
+  __int64_t get_int() {
+    return atoll(get_string());
+  }
+
+  // True once every token has been consumed.
+  bool end() {
+    return _cur == _end;
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+
+// fuse crap
+#ifdef linux
+/* For pread()/pwrite() */
+#define _XOPEN_SOURCE 500
+#endif
+
+#define FUSE_USE_VERSION 25
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#ifdef DARWIN
+#include <sys/statvfs.h>
+#else
+#include <sys/statfs.h>
+#endif // DARWIN
+
+
+// ceph stuff
+#include "include/types.h"
+
+#include "Client.h"
+
+#include "config.h"
+
+// stl
+#include <map>
+using namespace std;
+
+
+// globals
+Client *client; // the ceph client
+
+
+
+// ------
+// fuse hooks
+
+static int ceph_getattr(const char *path, struct stat *stbuf) // FUSE getattr hook: returns client->lstat()'s result unchanged
+{
+ return client->lstat(path, stbuf);
+}
+
+// FUSE readlink hook: fetch the link target into buf and NUL-terminate
+// it.  Returns 0 on success or the client's negative result.
+static int ceph_readlink(const char *path, char *buf, size_t size)
+{
+  // ask for one byte less than buf holds so the terminator below fits
+  const int len = client->readlink(path, buf, size - 1);
+  if (len < 0)
+    return len;
+
+  buf[len] = '\0';
+  return 0;
+}
+
+
+// FUSE getdir hook: list `path` through the client and feed every entry
+// to fuse's filler callback.
+static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler)
+{
+  map<string, inode_t> contents;
+
+  int rc = client->getdir(path, contents);
+  if (rc < 0) return rc;
+
+  // hand each entry (name, type bits, ino) back to fuse
+  map<string, inode_t>::iterator p = contents.begin();
+  while (p != contents.end()) {
+    rc = filler(h,                                    // fuse's handle
+                p->first.c_str(),                     // dentry as char*
+                p->second.mode & INODE_TYPE_MASK,     // only the type bits of mode
+                p->second.ino);                       // ino.. 64->32 bit issue here? FIXME
+    if (rc != 0) break;                               // fuse has had enough
+    ++p;
+  }
+  return rc;
+}
+
+static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) // FUSE mknod hook; rdev is ignored, only path and mode reach client->mknod()
+{
+ return client->mknod(path, mode);
+}
+
+static int ceph_mkdir(const char *path, mode_t mode) // FUSE mkdir hook: returns client->mkdir()'s result unchanged
+{
+ return client->mkdir(path, mode);
+}
+
+static int ceph_unlink(const char *path) // FUSE unlink hook: returns client->unlink()'s result unchanged
+{
+ return client->unlink(path);
+}
+
+static int ceph_rmdir(const char *path) // FUSE rmdir hook: returns client->rmdir()'s result unchanged
+{
+ return client->rmdir(path);
+}
+
+static int ceph_symlink(const char *from, const char *to) // FUSE symlink hook: arguments are passed to client->symlink() in the same order
+{
+ return client->symlink(from, to);
+}
+
+static int ceph_rename(const char *from, const char *to) // FUSE rename hook: returns client->rename()'s result unchanged
+{
+ return client->rename(from, to);
+}
+
+static int ceph_link(const char *from, const char *to) // FUSE link hook: returns client->link()'s result unchanged
+{
+ return client->link(from, to);
+}
+
+static int ceph_chmod(const char *path, mode_t mode) // FUSE chmod hook: returns client->chmod()'s result unchanged
+{
+ return client->chmod(path, mode);
+}
+
+static int ceph_chown(const char *path, uid_t uid, gid_t gid) // FUSE chown hook: returns client->chown()'s result unchanged
+{
+ return client->chown(path, uid, gid);
+}
+
+static int ceph_truncate(const char *path, off_t size) // FUSE truncate hook: returns client->truncate()'s result unchanged
+{
+ return client->truncate(path, size);
+}
+
+static int ceph_utime(const char *path, struct utimbuf *buf) // FUSE utime hook: returns client->utime()'s result unchanged
+{
+ return client->utime(path, buf);
+}
+
+
+// FUSE open hook: open `path` through the client and remember the
+// resulting handle in fi->fh for the read/write/release/fsync hooks.
+static int ceph_open(const char *path, struct fuse_file_info *fi)
+{
+  const int fh = client->open(path, fi->flags);
+  if (fh < 0)
+    return fh;      // negative result is handed straight back to fuse
+
+  fi->fh = fh;
+  return 0;         // fuse wants 0 on success
+}
+
+static int ceph_read(const char *path, char *buf, size_t size, off_t offset, // FUSE read hook; path is unused
+ struct fuse_file_info *fi)
+{
+ fh_t fh = fi->fh; // handle stored by ceph_open
+ return client->read(fh, buf, size, offset);
+}
+
+static int ceph_write(const char *path, const char *buf, size_t size, // FUSE write hook; path is unused
+ off_t offset, struct fuse_file_info *fi)
+{
+ fh_t fh = fi->fh; // handle stored by ceph_open
+ return client->write(fh, buf, size, offset);
+}
+
+/*
+static int ceph_flush(const char *path, struct fuse_file_info *fi)
+{
+ fh_t fh = fi->fh;
+ return client->flush(fh);
+}
+*/
+
+
+#ifdef DARWIN
+static int ceph_statfs(const char *path, struct statvfs *stbuf) // DARWIN build: FUSE statfs hook taking struct statvfs
+{
+ return client->statfs(path, stbuf);
+}
+#else
+static int ceph_statfs(const char *path, struct statfs *stbuf) // non-DARWIN build: same hook taking struct statfs
+{
+ return client->statfs(path, stbuf);
+}
+#endif
+
+
+
+static int ceph_release(const char *path, struct fuse_file_info *fi) // FUSE release hook; path is unused
+{
+ fh_t fh = fi->fh; // handle stored by ceph_open
+ int r = client->close(fh); // close the file
+ return r;
+}
+
+static int ceph_fsync(const char *path, int isdatasync, // FUSE fsync hook; path is unused
+ struct fuse_file_info *fi)
+{
+ fh_t fh = fi->fh; // handle stored by ceph_open
+ return client->fsync(fh, isdatasync ? true:false); // any non-zero isdatasync is passed on as true
+}
+
+
+static struct fuse_operations ceph_oper = { // callback table passed to fuse_main() in ceph_fuse_main(); written in GNU "field: value" initializer syntax
+ getattr: ceph_getattr,
+ readlink: ceph_readlink,
+ getdir: ceph_getdir,
+ mknod: ceph_mknod,
+ mkdir: ceph_mkdir,
+ unlink: ceph_unlink,
+ rmdir: ceph_rmdir,
+ symlink: ceph_symlink,
+ rename: ceph_rename,
+ link: ceph_link,
+ chmod: ceph_chmod,
+ chown: ceph_chown,
+ truncate: ceph_truncate,
+ utime: ceph_utime,
+ open: ceph_open,
+ read: ceph_read,
+ write: ceph_write,
+ statfs: ceph_statfs,
+ flush: 0, //ceph_flush,  (no flush hook installed: ceph_flush is commented out in this file)
+ release: ceph_release,
+ fsync: ceph_fsync
+};
+
+
+// Start the FUSE glue on top of Client `c`.
+//  - argc/argv: the caller's command line (normally containing the mount
+//    point); argv[1..] is appended after the options injected here.
+// Returns fuse_main()'s result, or -ENOMEM if the argument vector cannot
+// be allocated.
+int ceph_fuse_main(Client *c, int argc, char *argv[])
+{
+  // init client
+  client = c;
+
+  // set up fuse argc/argv.  at most 10 words are placed here (argv[0]
+  // plus 9 option words), followed by argc-1 caller words and the NULL
+  // terminator: argc + 10 slots in total.
+  int newargc = 0;
+  char **newargv = (char **) malloc((argc + 10) * sizeof(char *));
+  if (!newargv)
+    return -ENOMEM;
+  newargv[newargc++] = argv[0];
+
+  // allow other (all!) users to see my file system
+  // NOTE: echo user_allow_other >> /etc/fuse.conf
+  // NB: seems broken on Darwin
+#ifndef DARWIN
+  newargv[newargc++] = (char *)"-o";
+  newargv[newargc++] = (char *)"allow_other";
+#endif // DARWIN
+
+  // use inos
+  newargv[newargc++] = (char *)"-o";
+  newargv[newargc++] = (char *)"use_ino";
+
+  // large reads, direct_io (no kernel caching)
+  //newargv[newargc++] = "-o";
+  //newargv[newargc++] = "large_read";
+  if (g_conf.fuse_direct_io) {
+    newargv[newargc++] = (char *)"-o";
+    newargv[newargc++] = (char *)"direct_io";
+  }
+
+  // disable stupid fuse unlink hiding thing
+  newargv[newargc++] = (char *)"-o";
+  newargv[newargc++] = (char *)"hard_remove";
+
+  // force into foreground
+  //   -> we can watch stdout this way!!
+  newargv[newargc++] = (char *)"-f";
+
+  // copy rest of cmdline (hopefully, the mount point!)
+  for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr];
+
+  // keep the usual argv[argc] == NULL convention for the callee
+  newargv[newargc] = 0;
+
+  // go fuse go
+  cout << "ok, calling fuse_main" << endl;
+  int r = fuse_main(newargc, newargv, &ceph_oper);
+
+  // only the pointer array is ours; the strings belong to the caller or are literals
+  free(newargv);
+  return r;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+/* ceph_fuse_main
+ * - start up fuse glue, attached to Client* cl.
+ * - argc, argv should include a mount point, and
+ * any weird fuse options you want. by default,
+ * we will put fuse in the foreground so that it
+ * won't fork and we can see stdout.
+ */
+int ceph_fuse_main(Client *cl, int argc, char *argv[]);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <iostream>
+using namespace std;
+
+// ceph stuff
+#include "config.h"
+#include "client/Client.h"
+#include "msg/TCPMessenger.h"
+
+// syscall fun
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <sys/types.h>
+//#include <sys/stat.h>
+
+#define _FCNTL_H
+#include <bits/fcntl.h>
+
+#define CEPH_FD_OFF 50000
+
+
+/****** startup etc *******/
+
+// State behind the libc wrappers defined later in this file: a ceph
+// Client, the mount point it is presented under, and a cached copy of
+// the working directory.  A single global instance (ldceph) is built
+// during static initialisation.
+class LdCeph {
+public:
+  // globals
+  bool started;               // true once client->mount() succeeded
+  char *mount_point;          // path prefix that is served by ceph
+  char *mount_point_parent;   // real directory chdir() parks the process in while the cwd is inside ceph
+  int mount_point_len;        // strlen(mount_point)
+
+  Client *client;             // 0 unless mounted
+
+  filepath fp_mount_point;
+  filepath cwd;               // cached working directory
+  bool cwd_above_mp, cwd_in_mp;   // is cwd a path prefix of / inside the mount point?
+
+  // Map a path given by the application to a path inside ceph.
+  // Returns 0 if the path is not under the mount point (or ceph is not
+  // started); otherwise a pointer to the ceph path, which may point into
+  // `orig`, into `buf`, or at a literal.
+  // NOTE: buf's size is not passed in; the callers in this file provide
+  // 255 bytes and the strcpy below is not bounds-checked.
+  const char *get_ceph_path(const char *orig, char *buf) {
+    if (!started) return 0;
+
+    // relative path?  BUG: this won't catch "blah/../../asdf"
+    if (orig[0] &&
+        orig[0] != '/' &&
+        !(orig[0] == '.' && orig[1] == '.')) {
+
+      if (cwd_in_mp) return orig;    // inside mount point, definitely ceph
+      if (!cwd_above_mp) return 0;   // not above mount point, definitely not ceph
+
+      // relative, above mp: resolve against cwd component by component
+      filepath o = orig;
+      filepath p = cwd;
+      for (unsigned b = 0; b < o.depth(); b++) {
+        if (o[b] == "..")
+          p.pop_dentry();
+        else
+          p.add_dentry(o[b]);
+      }
+
+      // FIXME rewrite
+      if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) {
+        if (p.c_str()[mount_point_len] == 0)
+          return "/";
+        if (p.c_str()[mount_point_len] == '/') {
+          strcpy(buf, p.c_str() + mount_point_len);
+          return buf;
+        }
+      }
+      return 0;
+    } else {
+      // absolute (or empty, or starting with "..")
+      if (strncmp(orig, mount_point, mount_point_len) == 0) {
+        if (orig[mount_point_len] == 0)
+          return "/";
+        if (orig[mount_point_len] == '/')
+          return orig + mount_point_len;
+      }
+      return 0;
+    }
+  }
+
+  // Re-read the real working directory from the kernel and recompute
+  // cwd_in_mp / cwd_above_mp from it.
+  void refresh_cwd() {
+    char buf[255];
+    if (syscall(SYS_getcwd, buf, 255) < 0)
+      return;   // buf was not filled in (e.g. cwd longer than buf): keep the previous state
+    cwd = buf;
+
+    // start from a clean slate so a flag left over from an earlier
+    // directory cannot survive
+    cwd_in_mp = false;
+    cwd_above_mp = false;
+
+    if (strncmp(buf, mount_point, mount_point_len) == 0 &&
+        (buf[mount_point_len] == 0 ||
+         buf[mount_point_len] == '/')) {
+      cwd_in_mp = true;
+    } else if (cwd.depth() <= fp_mount_point.depth()) {
+      // "above" means every component of cwd matches the mount point's
+      cwd_above_mp = true;
+      for (unsigned i=0; i<cwd.depth(); i++) {
+        if (cwd[i] != fp_mount_point[i]) {
+          cwd_above_mp = false;
+          break;
+        }
+      }
+    }
+    //cout << "refresh_cwd '" << cwd << "', above=" << cwd_above_mp << ", in=" << cwd_in_mp << endl;
+  }
+
+
+  // Parse options from the environment, bring up the TCP messenger and
+  // try to mount.  On any failure the object stays in the !started state.
+  LdCeph() :
+    started(false),
+    mount_point(0), mount_point_parent(0),
+    mount_point_len(0),
+    client(0),     // stays null on every early-exit path below
+    cwd_above_mp(false), cwd_in_mp(false) {
+
+    // args
+    vector<char *> args;
+    env_to_vec(args);
+    parse_config_options(args);
+
+
+    tcpaddr_t nsa;
+    if (tcpmessenger_findns(nsa) < 0)
+      return;
+    tcpmessenger_init();
+    tcpmessenger_start();
+    tcpmessenger_start_rankserver(nsa);
+
+    client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));
+    client->init();
+    int r = client->mount();
+    if (r < 0) {
+      // failure
+      cerr << "ldceph init: mount failed " << r << endl;
+      delete client;
+      client = 0;
+    } else {
+      // success
+      started = true;
+      mount_point = (char *)"/ceph";
+      mount_point_parent = (char *)"/";
+      mount_point_len = strlen(mount_point);
+
+      fp_mount_point = mount_point;
+
+      cerr << "ldceph init: mounted on " << mount_point << " as " << client->get_myaddr() << endl;
+
+      refresh_cwd();
+    }
+  }
+
+  // Teardown is currently disabled by the `false &&` guard; only the
+  // message is printed.
+  ~LdCeph() {
+    cout << "ldceph fini" << endl;
+    if (false && client) {
+      client->unmount();
+      client->shutdown();
+      delete client;
+      client = 0;
+      tcpmessenger_wait();
+      tcpmessenger_shutdown();
+    }
+  }
+
+} ldceph;
+
+
+
+/****** original functions ****/
+
+
+
+/****** captured functions ****/
+
+
+// Ceph file handles are handed to the application shifted up by
+// CEPH_FD_OFF so they can be told apart from kernel descriptors.
+//  MYFD(f)   - does descriptor f belong to ceph?
+//  TO_FD(h)  - ceph handle -> application descriptor (negative values,
+//              i.e. errors, pass through untouched)
+//  FROM_FD(d)- application descriptor -> ceph handle
+// NOTE: TO_FD expands its argument more than once; pass a plain
+// variable, never an expression with side effects.
+#define MYFD(f)     ((f) >= CEPH_FD_OFF && ldceph.started)
+#define TO_FD(fd)   ((fd) >= 0 ? (fd)+CEPH_FD_OFF : (fd))
+#define FROM_FD(fd) ((fd) - CEPH_FD_OFF)
+
+extern "C" {
+
+ // open/close
+ //int open(const char *pathname, int flags) {
+ // open(2) wrapper: paths that resolve into the ceph mount are opened
+ // through the client (mode is not passed on there); everything else
+ // goes to the kernel.
+ int open(const char *pathname, int flags, mode_t mode) {
+   char buf[255];
+   if (const char *c = ldceph.get_ceph_path(pathname, buf)) {
+     // open exactly once: TO_FD() expands its argument more than once,
+     // so wrapping the call itself would open the file twice
+     int fh = ldceph.client->open(c, flags);
+     return TO_FD(fh);
+   }
+   return syscall(SYS_open, pathname, flags, mode);
+ }
+
+ int creat(const char *pathname, mode_t mode) { // creat(2) wrapper: delegates to the open() wrapper above with O_CREAT|O_WRONLY|O_TRUNC
+ return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode);
+ }
+ int close(int fd) { // close(2) wrapper: descriptors accepted by MYFD() are closed via the ceph client, all others via the raw syscall
+ if (MYFD(fd))
+ return ldceph.client->close(FROM_FD(fd));
+ else
+ return syscall(SYS_close, fd);
+ }
+
+
+ // read/write
+ ssize_t write(int fd, const void *buf, size_t count) { // write(2) wrapper: ceph descriptors go to client->write(), others to the raw syscall
+ if (MYFD(fd))
+ return ldceph.client->write(FROM_FD(fd), (char*)buf, count); // no offset argument is supplied on this path
+ else
+ return syscall(SYS_write, fd, buf, count);
+ }
+
+ ssize_t read(int fd, void *buf, size_t count) { // read(2) wrapper: ceph descriptors go to client->read(), others to the raw syscall
+ if (MYFD(fd))
+ return ldceph.client->read(FROM_FD(fd), (char*)buf, count); // no offset argument is supplied on this path
+ else
+ return syscall(SYS_read, fd, buf, count);
+ }
+
+ //int fsync(int fd);
+ //int fdatasync(int fd);
+
+
+ // namespace
+ int rmdir(const char *pathname) { // rmdir(2) wrapper: ceph paths go to client->rmdir(), others to the raw syscall
+ char buf[255]; // scratch space get_ceph_path() may use for the translated path
+ if (const char *c = ldceph.get_ceph_path(pathname, buf))
+ return ldceph.client->rmdir(c);
+ else
+ return syscall(SYS_rmdir, pathname);
+ }
+ int mkdir(const char *pathname, mode_t mode) { // mkdir(2) wrapper: ceph paths go to client->mkdir(), others to the raw syscall
+ char buf[255]; // scratch space get_ceph_path() may use for the translated path
+ if (const char *c = ldceph.get_ceph_path(pathname, buf))
+ return ldceph.client->mkdir(c, mode);
+ else
+ return syscall(SYS_mkdir, pathname, mode);
+ }
+ int unlink(const char *pathname) { // unlink(2) wrapper: ceph paths go to client->unlink(), others to the raw syscall
+ char buf[255]; // scratch space get_ceph_path() may use for the translated path
+ if (const char *c = ldceph.get_ceph_path(pathname, buf))
+ return ldceph.client->unlink(c);
+ else
+ return syscall(SYS_unlink, pathname);
+ }
+
+ int stat(const char *pathname, struct stat *st) { // stat(2) wrapper: ceph paths are answered by the client, others by the raw syscall
+ //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC
+ //struct stat *st = (struct stat*)st64;
+ char buf[255]; // scratch space get_ceph_path() may use for the translated path
+ if (const char *c = ldceph.get_ceph_path(pathname, buf))
+ return ldceph.client->lstat(c, st); // FIXME  (note: calls the client's lstat, not a stat-specific call)
+ else
+ return syscall(SYS_stat, pathname, st);
+ }
+ //int fstat(int filedes, struct stat *buf);
+ //int lstat(const char *file_name, struct stat *buf);
+
+ // chdir(2) wrapper.  For a ceph path the client's cwd is changed and the
+ // real process cwd is parked in the mount point's parent; for any other
+ // path the kernel does the work and the cached cwd is refreshed.
+ int chdir(const char *pathname) {
+   char buf[255];
+   if (const char *c = ldceph.get_ceph_path(pathname, buf)) {
+     int r = ldceph.client->chdir(c);
+     if (r == 0) {
+       if (!ldceph.cwd_in_mp)
+         syscall(SYS_chdir, ldceph.mount_point_parent);
+       ldceph.cwd_in_mp = true;
+       ldceph.cwd_above_mp = false;
+       ldceph.cwd = ldceph.mount_point;
+       filepath fpc = c;
+       ldceph.cwd.append(fpc);
+     }
+     return r;
+   } else {
+     int r = syscall(SYS_chdir, pathname);
+     // the syscall returns 0 on success; only then has the directory
+     // actually changed and the cached cwd gone stale
+     if (r == 0) {
+       ldceph.refresh_cwd();
+     }
+     return r;
+   }
+ }
+ // getcwd(3) wrapper: reports the cached working directory, which may
+ // lie inside the ceph mount.  Returns buf, or 0 if the path (with its
+ // terminating NUL) does not fit in `size` bytes or buf is null.
+ // NOTE: errno is not set on the ceph path.
+ char *getcwd(char *buf, size_t size) {
+   if (!ldceph.started) {
+     // ceph is not mounted, so ldceph.cwd was never filled in: ask the kernel
+     return syscall(SYS_getcwd, buf, size) < 0 ? 0 : buf;
+   }
+
+   const char *cur = ldceph.cwd.c_str();
+   if (!buf || strlen(cur) + 1 > size)
+     return 0;        // never hand back a truncated, unterminated path
+   strcpy(buf, cur);
+   return buf;
+ }
+ //int fchdir(int fd);
+
+
+
+
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "msg/Message.h"
+
+// send the message, expecting no response.  threads other than the
+// MPI thread use this function; if the MPI thread uses this function
+// it could deadlock: this function could wait for the out queue to be
+// emptied, but only the MPI thread can empty it.
+void obfsmpi_send(Message *m);
+
+// send the message to a server and wait for the response.  threads
+// other than the MPI thread use this function.
+Message *obfsmpi_sendrecv(Message *m);
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+#include "mds/MDS.h"
+
+#include "msg/SimpleMessenger.h"
+
+#include "common/Timer.h"
+
+
+class C_Die : public Context { // timer callback scheduled by main() when g_conf.kill_after is set
+public:
+ void finish(int) { // prints "die" to stderr and terminates the process with exit status 1
+ cerr << "die" << endl;
+ exit(1);
+ }
+};
+
+// Timer callback scheduled by main() when g_conf.debug_after is set: it
+// overwrites the fields of g_conf from `debug` up to (not including)
+// `debug_after` with the same span of g_debug_after_conf.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    // memcpy needs a byte count.  subtracting the typed member pointers
+    // yields a count of elements instead, so measure in chars.
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+int main(int argc, char **argv) // entry point: parse options, load the monitor map, start the messenger and run one MDS until rank.wait() returns
+{
+ vector<char*> args;
+ argv_to_vec(argc, argv, args);
+
+ parse_config_options(args);
+
+ if (g_conf.kill_after)
+ g_timer.add_event_after(g_conf.kill_after, new C_Die); // exit(1) after kill_after has elapsed
+ if (g_conf.debug_after)
+ g_timer.add_event_after(g_conf.debug_after, new C_Debug); // swap debug settings after debug_after has elapsed
+
+
+ // load monmap
+ MonMap monmap;
+ int r = monmap.read(".ceph_monmap"); // file name is fixed; relative to the current directory
+ assert(r >= 0);
+
+ // start up network
+ rank.start_rank();
+
+ // start mds
+ Messenger *m = rank.register_entity(MSG_ADDR_MDS_NEW);
+ assert(m);
+
+ MDS *mds = new MDS(m->get_myaddr().num(), m, &monmap); // the MDS number is taken from the address the messenger was given
+ mds->init();
+
+ // wait
+ rank.wait();
+
+ // done
+ delete mds;
+
+ return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+#include "mon/Monitor.h"
+
+#include "msg/SimpleMessenger.h"
+
+#include "common/Timer.h"
+
+
+class C_Die : public Context { // timer callback scheduled by main() when g_conf.kill_after is set
+public:
+ void finish(int) { // prints "die" to stderr and terminates the process with exit status 1
+ cerr << "die" << endl;
+ exit(1);
+ }
+};
+
+// Timer callback scheduled by main() when g_conf.debug_after is set: it
+// overwrites the fields of g_conf from `debug` up to (not including)
+// `debug_after` with the same span of g_debug_after_conf.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    // memcpy needs a byte count.  subtracting the typed member pointers
+    // yields a count of elements instead, so measure in chars.
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+// Monitor entry point.
+//   --mon <n>        run as monitor n, using the address recorded in the monmap
+//   --monmap <file>  monmap file to read/write (default .ceph_monmap)
+// Without --mon a standalone mon0 is started and a fresh single-monitor
+// monmap is written.  Returns -1 on a bad command line.
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+
+  if (g_conf.kill_after)
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after)
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+  // args
+  int whoami = -1;
+  char *monmap_fn = ".ceph_monmap";
+  for (unsigned i=0; i<args.size(); i++) {
+    const bool is_mon = strcmp(args[i], "--mon") == 0;
+    const bool is_monmap = strcmp(args[i], "--monmap") == 0;
+    if (!is_mon && !is_monmap) {
+      cerr << "unrecognized arg " << args[i] << endl;
+      return -1;
+    }
+    // both options take a value; don't read past the end of args
+    if (i+1 >= args.size()) {
+      cerr << "missing value for " << args[i] << endl;
+      return -1;
+    }
+    if (is_mon)
+      whoami = atoi(args[++i]);
+    else
+      monmap_fn = args[++i];
+  }
+
+  MonMap monmap;
+
+  if (whoami < 0) {
+    // let's assume a standalone monitor
+    cout << "starting standalone mon0" << endl;
+    whoami = 0;
+
+    // start messenger
+    rank.start_rank();
+    cout << "bound to " << rank.get_listen_addr() << endl;
+
+    // add single mon0
+    monmap.add_mon(rank.my_inst);
+
+    // write monmap
+    cout << "writing monmap to " << monmap_fn << endl;
+    int r = monmap.write(monmap_fn);
+    assert(r >= 0);
+  } else {
+    // i am specific monitor.
+
+    // read monmap (report the file actually used, which --monmap may have changed)
+    cout << "reading monmap from " << monmap_fn << endl;
+    int r = monmap.read(monmap_fn);
+    assert(r >= 0);
+
+    // bind to a specific port
+    cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl;
+    tcpaddr_t addr = monmap.get_inst(whoami).addr;
+    rank.set_listen_addr(addr);
+    rank.start_rank();
+  }
+
+  // start monitor
+  Messenger *m = rank.register_entity(MSG_ADDR_MON(whoami));
+  Monitor *mon = new Monitor(whoami, m, &monmap);
+  mon->init();
+
+  // wait
+  cout << "waiting for shutdown ..." << endl;
+  rank.wait();
+
+  // done
+  delete mon;
+
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "Clock.h"
+
+// public
+Clock g_clock;
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CLOCK_H
+#define __CLOCK_H
+
+#include <iostream>
+#include <iomanip>
+
+#include <sys/time.h>
+#include <time.h>
+#include <math.h>
+
+#include "Mutex.h"
+
+
+// --------
+// utime_t
+
+// A point in time or a duration, stored as a struct timeval
+// (seconds + microseconds).
+class utime_t {
+ private:
+  struct timeval tv;
+
+  // raw access for Clock, which fills tv via gettimeofday()
+  struct timeval& timeval() { return tv; }
+  friend class Clock;
+
+
+ public:
+  // Fold whole seconds out of the microsecond field so that afterwards
+  // usec() < 1000000.  (Exactly 1000000 usec is a full second too, hence
+  // >=.)  Negative microsecond values are left as they are.
+  void normalize() {
+    if (tv.tv_usec >= 1000*1000) {
+      tv.tv_sec += tv.tv_usec / (1000*1000);
+      tv.tv_usec %= 1000*1000;
+    }
+  }
+
+  // cons
+  utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); }
+  utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); }
+
+  // accessors
+  time_t sec() const { return tv.tv_sec; }
+  long usec() const { return tv.tv_usec; }
+  int nsec() const { return tv.tv_usec*1000; }   // microsecond part expressed in nanoseconds
+
+  // ref accessors/modifiers
+  time_t& sec_ref() { return tv.tv_sec; }
+  // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin.
+  // is just casting it to long& OK?
+  long& usec_ref() { return (long&) tv.tv_usec; }
+
+  // cast to double: seconds, with the microseconds as the fraction
+  operator double() const {
+    return (double)sec() + ((double)usec() / 1000000.0L);
+  }
+};
+
+// arithmetic operators
+// Sum of two times; microsecond overflow is carried into the seconds.
+inline utime_t operator+(const utime_t& l, const utime_t& r) {
+  const long usum = l.usec() + r.usec();
+  return utime_t( l.sec() + r.sec() + usum/1000000L,
+                  usum%1000000L );
+}
+// Add r to l in place; microsecond overflow is carried into the seconds.
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  // take the microsecond sum first, before either field of l changes
+  const long usum = l.usec() + r.usec();
+  l.sec_ref() += r.sec() + usum/1000000L;
+  l.usec_ref() = usum % 1000000L;
+  return l;
+}
+// Add f seconds (may be fractional) to l in place.
+inline utime_t& operator+=(utime_t& l, double f) {
+  double fs = trunc(f);
+  // fractional seconds -> microseconds: multiply by 1e6 (dividing, as
+  // before, always produced 0)
+  double us = (f - fs) * (double)1000000.0;
+  l.sec_ref() += (long)fs;
+  l.usec_ref() += (long)us;
+
+  // bring the microsecond field back into [0, 1000000)
+  if (l.usec() >= 1000000L) {
+    l.sec_ref() += l.usec() / 1000000L;
+    l.usec_ref() %= 1000000L;
+  } else if (l.usec() < 0) {
+    // only reachable for negative f; |us| < 1e6, so one borrow suffices
+    l.usec_ref() += 1000000L;
+    l.sec_ref()--;
+  }
+  return l;
+}
+
+// Difference l - r, borrowing one second when l has fewer microseconds.
+inline utime_t operator-(const utime_t& l, const utime_t& r) {
+  const bool borrow = l.usec() < r.usec();
+  return utime_t( l.sec() - r.sec() - (borrow ? 1:0),
+                  l.usec() - r.usec() + (borrow ? 1000000:0) );
+}
+// Subtract r from l in place, borrowing one second when l has fewer
+// microseconds than r.
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  // decide on the borrow before any field of l is modified
+  const bool borrow = l.usec() < r.usec();
+  l.sec_ref() -= r.sec();
+  l.usec_ref() -= r.usec();
+  if (borrow) {
+    l.usec_ref() += 1000000L;
+    l.sec_ref()--;
+  }
+  return l;
+}
+
+// Strictly-later comparison: seconds first, microseconds as tie-breaker.
+inline bool operator>(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() > b.sec();
+  return a.usec() > b.usec();
+}
+// Strictly-earlier comparison: seconds first, microseconds as tie-breaker.
+inline bool operator<(const utime_t& a, const utime_t& b)
+{
+  if (a.sec() != b.sec())
+    return a.sec() < b.sec();
+  return a.usec() < b.usec();
+}
+
+// ostream
+// Print as "<sec>.<usec>" with the microseconds zero-padded to six
+// digits.  The stream's fill character and format flags are restored
+// afterwards so later output is not silently zero-padded.
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  out << (long)t.sec() << ".";
+
+  const char old_fill = out.fill('0');
+  const std::ios::fmtflags old_flags = out.setf(std::ios::right, std::ios::adjustfield);
+  out << std::setw(6) << t.usec();
+  out.flags(old_flags);
+  out.fill(old_fill);
+
+  return out;
+}
+
+
+
+
+// -- clock --
+// -- clock --
+// Hands out times relative to the moment tare() was last called (the
+// constructor calls it once), and converts them back to wall-clock time.
+// Note that `lock` is declared but not taken by any method below.
+class Clock {
+ protected:
+  utime_t last;   // most recent value returned by now()
+  utime_t zero;   // wall-clock time captured by tare()
+
+  Mutex lock;
+
+ public:
+  Clock() {
+    // set offset
+    tare();
+  }
+
+  // real (wall-clock) time: relative now() shifted back by zero
+  utime_t real_now() {
+    utime_t t = now();
+    t += zero;
+    return t;
+  }
+
+  // make "now" the origin of relative time
+  void tare() {
+    gettimeofday(&zero.timeval(), NULL);
+  }
+
+  // relative time since tare(); never goes backwards: if the system
+  // clock jumped back, the previously returned value is repeated
+  utime_t now() {
+    utime_t n;
+    gettimeofday(&n.timeval(), NULL);
+    n -= zero;
+    if (n < last)
+      return last;
+    last = n;
+    return n;
+  }
+
+  // value of the latest now() call, without touching the system clock
+  utime_t recent_now() {
+    return last;
+  }
+
+  // convert a relative time to wall-clock time, in place
+  void realify(utime_t& t) {
+    t += zero;
+  }
+
+  // fill *ts with the wall-clock equivalent of relative time t
+  void make_timespec(utime_t& t, struct timespec *ts) {
+    utime_t wall = t;
+    realify(wall);
+
+    memset(ts, 0, sizeof(*ts));
+    ts->tv_sec = wall.sec();
+    ts->tv_nsec = wall.nsec();
+  }
+
+  // absolute time, whole seconds
+  time_t gettime() {
+    return real_now().sec();
+  }
+
+};
+
+extern Clock g_clock;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __COND_H
+#define __COND_H
+
+#include <time.h>
+
+#include "Mutex.h"
+#include "Clock.h"
+
+#include "include/Context.h"
+
+#include <pthread.h>
+#include <cassert>
+
+class Cond { // wrapper around a pthread condition variable; every call returns the pthread function's result code
+ // my bits
+ pthread_cond_t _c;
+
+ // don't allow copying.
+ void operator=(Cond &C) {} // private with an empty body
+ Cond( const Cond &C ) {} // private with an empty body (note: _c is not initialised here)
+
+ public:
+ Cond() {
+ int r = pthread_cond_init(&_c,NULL); // NULL attributes
+ assert(r == 0);
+ }
+ virtual ~Cond() {
+ pthread_cond_destroy(&_c);
+ }
+
+ int Wait(Mutex &mutex) { // blocks in pthread_cond_wait on mutex._m
+ int r = pthread_cond_wait(&_c, &mutex._m);
+ return r;
+ }
+
+ int Wait(Mutex &mutex, char* s) { // same wait as above; s is only used by the commented-out debug print
+ //cout << "Wait: " << s << endl;
+ int r = pthread_cond_wait(&_c, &mutex._m);
+ return r;
+ }
+
+ int WaitUntil(Mutex &mutex, utime_t when) { // timed wait; `when` is converted with g_clock.make_timespec before the call
+ struct timespec ts;
+ g_clock.make_timespec(when, &ts);
+ //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl;
+ int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+ return r;
+ }
+ int WaitInterval(Mutex &mutex, utime_t interval) { // timed wait until g_clock.now() + interval
+ utime_t when = g_clock.now();
+ when += interval;
+ return WaitUntil(mutex, when);
+ }
+
+ int Signal() { // NOTE: this broadcasts (wakes all waiters), exactly like SignalAll(); use SignalOne() to wake one
+ //int r = pthread_cond_signal(&_c);
+ int r = pthread_cond_broadcast(&_c);
+ return r;
+ }
+ int SignalOne() { // pthread_cond_signal
+ int r = pthread_cond_signal(&_c);
+ return r;
+ }
+ int SignalAll() { // pthread_cond_broadcast
+ //int r = pthread_cond_signal(&_c);
+ int r = pthread_cond_broadcast(&_c);
+ return r;
+ }
+};
+
+class C_Cond : public Context { // completion callback that records the result, sets a done flag and signals a Cond
+ Cond *cond;
+ bool *done;
+ int *rval; // optional; may be 0
+public:
+ C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) {
+ *done = false; // the flag is reset at construction time
+ }
+ void finish(int r) { // no mutex is taken here; C_SafeCond is the variant that locks
+ if (rval) *rval = r;
+ *done = true;
+ cond->Signal();
+ }
+};
+
+class C_SafeCond : public Context { // like C_Cond, but finish() holds *lock while it updates the flag and signals
+ Mutex *lock;
+ Cond *cond;
+ bool *done;
+ int *rval; // optional; may be 0
+public:
+ C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) {
+ *done = false; // the flag is reset at construction time (without taking the lock)
+ }
+ void finish(int r) {
+ lock->Lock();
+ if (rval) *rval = r;
+ *done = true;
+ cond->Signal();
+ lock->Unlock();
+ }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __DECAYCOUNTER_H
+#define __DECAYCOUNTER_H
+
+#include <math.h>
+#include "Clock.h"
+
+#include "config.h"
+
+// A counter whose value decays exponentially over time with a
+// configurable half life.  Time is taken from g_clock.recent_now().
+class DecayCounter {
+ protected:
+  double val;           // value
+
+  double half_life;     // in seconds
+  double k;             // k = ln(.5)/half_life
+
+  utime_t last_decay;   // time of last decay
+
+ public:
+  // half life comes from g_conf.mds_decay_halflife
+  DecayCounter() : val(0) {
+    set_halflife( g_conf.mds_decay_halflife );
+    reset();
+  }
+  /*
+  DecayCounter(double hl) : val(0) {
+    set_halflife(hl);
+    reset();
+  }
+  */
+
+  // decay first, then add a
+  void adjust(double a) {
+    decay();
+    val += a;
+  }
+
+  // subtract other's current value without decaying either counter
+  void adjust_down(const DecayCounter& other) {
+    // assume other has same time stamp as us...
+    val -= other.val;
+  }
+
+  void set_halflife(double hl) {
+    half_life = hl;
+    k = log(.5) / hl;
+  }
+
+  // become a copy of other, then clear other
+  void take(DecayCounter& other) {
+    *this = other;
+    other.reset();
+  }
+
+  // zero both the value and the decay timestamp
+  void reset() {
+    last_decay.sec_ref() = 0;
+    last_decay.usec_ref() = 0;
+    val = 0;
+  }
+
+  // apply the decay accumulated since last_decay; nothing happens until
+  // at least one whole second has elapsed
+  void decay() {
+    utime_t elapsed = g_clock.recent_now();
+    elapsed -= last_decay;
+    if (elapsed.sec() < 1)
+      return;
+
+    val = val * exp((double)elapsed * k);
+    if (val < .01) val = 0;    // values below .01 are snapped to zero
+    last_decay = g_clock.recent_now();
+  }
+
+  // current (decayed) value
+  double get() {
+    decay();
+    return val;
+  }
+
+  // decay, add v, and return the new value
+  double hit(double v = 1.0) {
+    adjust(v);
+    return val;
+  }
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __LOGTYPE_H
+#define __LOGTYPE_H
+
+#include "include/types.h"
+
+#include <string>
+#include <fstream>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+#include "Mutex.h"
+
+
+// Registry of log keys.  Each key gets a small integer index; lookups by
+// the key's pointer value are cached so the string comparison is only
+// needed the first time a given pointer is seen.
+class LogType {
+ protected:
+  hash_map<__uint64_t, int> keymap;   // key pointer value -> index
+  vector<const char*> keys;           // index -> key string (pointer is stored, not copied)
+  set<int> inc_keys;                  // indices registered through add_inc()
+
+  int version;                        // bumped every time a key is added
+
+  // HACK to avoid the hash table as often as possible...
+  // cache recent key name lookups in a small ring buffer
+  const static int cache_keys = 10;
+  __uint64_t kc_ptr[cache_keys];
+  int kc_val[cache_keys];
+  int kc_pos;
+
+  friend class Logger;
+
+ public:
+  LogType() {
+    version = 1;
+
+    for (int i=0;i<cache_keys;i++)
+      kc_ptr[i] = 0;
+    kc_pos = 0;
+  }
+
+  // Register key (if not already known) and return its index.
+  int add_key(const char* key, bool is_inc) {
+    int i = lookup_key(key);
+    if (i >= 0) return i;
+
+    i = keys.size();
+    keys.push_back(key);
+
+#ifdef __LP64__
+    __uint64_t p = (__uint64_t)key;
+#else
+    __uint64_t p = (__uint32_t)key;
+#endif
+    keymap[p] = i;
+    if (is_inc) inc_keys.insert(i);
+
+    version++;
+    return i;
+  }
+  int add_inc(const char* key) {
+    return add_key(key, true);
+  }
+  int add_set(const char *key) {
+    return add_key(key, false);
+  }
+
+  // True if key has been registered.  lookup_key() returns -1 for
+  // unknown keys, so "known" means a non-negative index.
+  bool have_key(const char* key) {
+    return lookup_key(key) >= 0;
+  }
+
+  // Return key's index, or -1 if it is unknown.  Tries the pointer map,
+  // then the ring buffer, then a string comparison against every key; a
+  // match found by string comparison is cached under this pointer.
+  int lookup_key(const char* key) {
+#ifdef __LP64__
+    __uint64_t p = (__uint64_t)key;
+#else
+    __uint64_t p = (__uint32_t)key;
+#endif
+
+    if (keymap.count(p))
+      return keymap[p];
+
+    // try kc ringbuffer, newest entry first
+    int pos = kc_pos-1;
+    for (int j=0; j<cache_keys; j++) {
+      if (pos < 0) pos = cache_keys - 1;
+      if (kc_ptr[pos] == p) return kc_val[pos];
+      pos--;
+    }
+
+    for (unsigned i=0; i<keys.size(); i++)
+      if (strcmp(keys[i], key) == 0) {
+        keymap[p] = i;
+
+        // put in kc ringbuffer
+        kc_ptr[kc_pos] = p;
+        kc_val[kc_pos] = i;
+        kc_pos++;
+        if (kc_pos == cache_keys) kc_pos = 0;
+
+        return i;
+      }
+    return -1;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <string>
+
+#include "LogType.h"
+#include "Logger.h"
+
+#include <iostream>
+#include "Clock.h"
+
+#include "config.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+
+// per-process lock taken by every Logger method. lame (one lock for all loggers), but this way the LogType calls made under it are protected too!
+Mutex logger_lock;
+
+// Build the output path "log/[<g_conf.log_name>/]<fn>", reset all
+// bookkeeping, then run a first (unforced) flush.
+Logger::Logger(string fn, LogType *type)
+{
+  logger_lock.Lock();
+
+  // bookkeeping
+  this->type = type;
+  version = 0;
+  open = false;
+  interval = g_conf.log_interval;
+  //start = g_clock.now();  // time 0!
+  last_logged = 0;
+  wrote_header = -1;
+  wrote_header_last = 0;
+
+  // output path
+  filename.assign("log/");
+  if (g_conf.log_name != 0) {
+    filename.append(g_conf.log_name);
+    ::mkdir( filename.c_str(), 0755 );  // make sure dir exists
+    filename.append("/");
+  }
+  filename.append(fn);
+  //cout << "log " << filename << endl;
+
+  logger_lock.Unlock();
+
+  flush(false);
+}
+
+// Write one final, forced line and release the output stream.
+Logger::~Logger()
+{
+  this->flush(true);
+  this->out.close();
+}
+
+// Add v to the integer value stored for key, registering the key as an
+// "inc" key if it is new. Returns the updated value, or 0 when logging is
+// disabled.
+long Logger::inc(const char *key, long v)
+{
+  if (!g_conf.log)
+    return 0;
+
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0)
+    idx = type->add_inc(key);
+  flush();    // also sizes vals/fvals for a key that was just added
+  const long total = (vals[idx] += v);
+  logger_lock.Unlock();
+  return total;
+}
+
+// Floating-point counterpart of inc(): add v to the double value stored
+// for key and return the result (0 when logging is disabled).
+double Logger::finc(const char *key, double v)
+{
+  if (!g_conf.log)
+    return 0;
+
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0)
+    idx = type->add_inc(key);
+  flush();    // also sizes vals/fvals for a key that was just added
+  const double total = (fvals[idx] += v);
+  logger_lock.Unlock();
+  return total;
+}
+
+// Overwrite the integer value stored for key, registering the key as a
+// "set" key if it is new. Returns v, or 0 when logging is disabled.
+long Logger::set(const char *key, long v)
+{
+  if (!g_conf.log)
+    return 0;
+
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0)
+    idx = type->add_set(key);
+  flush();    // also sizes vals/fvals for a key that was just added
+  vals[idx] = v;
+  const long stored = vals[idx];
+  logger_lock.Unlock();
+  return stored;
+}
+
+
+// Floating-point counterpart of set(): overwrite the double value stored
+// for key and return it (0 when logging is disabled).
+double Logger::fset(const char *key, double v)
+{
+  if (!g_conf.log)
+    return 0;
+
+  logger_lock.Lock();
+  int idx = type->lookup_key(key);
+  if (idx < 0)
+    idx = type->add_set(key);
+  flush();    // also sizes vals/fvals for a key that was just added
+  fvals[idx] = v;
+  const double stored = fvals[idx];
+  logger_lock.Unlock();
+  return stored;
+}
+
+// Read the integer value currently stored for key. Returns 0 when logging
+// is disabled, the key is unknown, or this logger has no slot for it yet.
+long Logger::get(const char* key)
+{
+  if (!g_conf.log)
+    return 0;
+
+  long found = 0;
+  logger_lock.Lock();
+  const int idx = type->lookup_key(key);
+  if (idx >= 0 && idx < (int)vals.size())
+    found = vals[idx];
+  logger_lock.Unlock();
+  return found;
+}
+
+// Write every log line that has come due since the last call.
+//
+// One line is written per elapsed `interval` seconds (measured from
+// `start`); with force == true one line is written even if none is due.
+// After each line, values of keys registered as "inc" keys are zeroed.
+// Does nothing when g_conf.log is off.
+void Logger::flush(bool force)
+{
+  if (!g_conf.log) return;
+  logger_lock.Lock();
+
+  // pick up keys added to the LogType since we last looked
+  if (version != type->version) {
+    while (type->keys.size() > vals.size())
+      vals.push_back(0);
+    while (type->keys.size() > fvals.size())
+      fvals.push_back(0);
+    version = type->version;
+  }
+
+  // open the file on first use (the only place the stream is opened, so
+  // no second check is needed inside the loop below)
+  if (!open) {
+    out.open(filename.c_str(), ofstream::out);
+    open = true;
+    //cout << "opening log file " << filename << endl;
+  }
+
+  utime_t fromstart = g_clock.now();
+  if (fromstart < start) {
+    cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl;
+    assert(0);
+    start = fromstart;   // only reached when asserts are compiled out
+  }
+  fromstart -= start;
+
+  // A zero or negative interval would never move last_logged forward and
+  // the loop below would spin forever while holding logger_lock; fall
+  // back to one-second steps in that case.
+  const int step = (interval > 0) ? interval : 1;
+
+  while (force ||
+         ((fromstart.sec() > last_logged) &&
+          (fromstart.sec() - last_logged >= step))) {
+    last_logged += step;
+    force = false;
+
+    //cout << "logger " << this << " advancing from " << last_logged << " now " << now << endl;
+
+    // header?  (re)written when the key set changed, and again once more
+    // than 10 lines have gone by since the last one
+    wrote_header_last++;
+    if (wrote_header != type->version ||
+        wrote_header_last > 10) {
+      out << "#" << type->keymap.size();
+      for (unsigned i=0; i<type->keys.size(); i++)
+        out << "\t" << type->keys[i];
+      out << endl;  //out << "\t (" << type->keymap.size() << ")" << endl;
+      wrote_header = type->version;
+      wrote_header_last = 0;
+    }
+
+    // write line to log: the double value is printed only when it is
+    // positive and the integer value is zero
+    out << last_logged;
+    for (unsigned i=0; i<type->keys.size(); i++) {
+      if (fvals[i] > 0 && vals[i] == 0)
+        out << "\t" << fvals[i];
+      else
+        out << "\t" << vals[i];
+    }
+    out << endl;
+
+    // reset the counters
+    for (unsigned i=0; i<type->keys.size(); i++) {
+      if (type->inc_keys.count(i)) {
+        this->vals[i] = 0;
+        this->fvals[i] = 0;
+      }
+    }
+  }
+
+  logger_lock.Unlock();
+}
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __LOGGER_H
+#define __LOGGER_H
+
+#include "include/types.h"
+#include "Clock.h"
+#include "Mutex.h"
+
+#include <string>
+#include <fstream>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "LogType.h"
+
+
+
+
+class Logger { // writes the values of a LogType's keys to a text file, one line per interval
+ protected:
+ //hash_map<const char*, long, hash<const char*>, eqstr> vals;
+ //hash_map<const char*, double, hash<const char*>, eqstr> fvals;
+ vector<long> vals; // integer values, indexed by LogType key index
+ vector<double> fvals; // floating-point values, same indexing as vals
+ 
+ //Mutex lock;
+ LogType *type; // key registry used to turn key names into indices
+
+ utime_t start; // time origin; flush() works with now - start
+ int last_logged; // timestamp (seconds since start) of the most recent line written
+ int interval; // seconds between lines; taken from g_conf.log_interval
+ int wrote_header; // LogType version the header line was last written for (-1 = never)
+ int wrote_header_last; // lines written since the last header; a header is repeated after more than 10
+
+ string filename; // "log/[<g_conf.log_name>/]<fn>"
+
+ int version; // LogType version that vals/fvals were last sized for
+
+ ofstream out; // output stream for filename
+ bool open; // true once flush() has called out.open()
+
+ public:
+ Logger(string fn, LogType *type);
+ ~Logger();
+
+ void set_start(const utime_t& a) { start = a; }
+ utime_t& get_start() { return start; }
+
+ long inc(const char *s, long v = 1); // add v to the integer value of key s; returns the new value
+ long set(const char *s, long v); // overwrite the integer value of key s
+ long get(const char *s); // integer value of key s, 0 if unknown
+
+ double fset(const char *s, double v); // overwrite the double value of key s
+ double finc(const char *s, double v); // add v to the double value of key s
+
+ void flush(bool force = false); // write any lines that are due; force writes one regardless
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MUTEX_H
+#define __MUTEX_H
+
+#include <pthread.h>
+#include <cassert>
+
+// Thin wrapper around a pthread mutex that also counts how many times it
+// is currently held. Recursive by default.
+class Mutex {
+private:
+  pthread_mutex_t _m;
+  int nlock;        // current hold count
+  bool recursive;   // may the owner lock again while already holding it?
+
+  // don't allow copying.
+  void operator=(Mutex &M) {}
+  Mutex( const Mutex &M ) {}
+
+public:
+  Mutex(bool r = true) : nlock(0), recursive(r) {
+    if (!recursive) {
+      pthread_mutex_init(&_m, NULL);
+      return;
+    }
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&_m, &attr);
+    pthread_mutexattr_destroy(&attr);
+  }
+
+  // Must not be destroyed while held.
+  virtual ~Mutex() {
+    assert(nlock == 0);
+    pthread_mutex_destroy(&_m);
+  }
+
+  bool is_locked() {
+    return nlock > 0;
+  }
+
+  void Lock() {
+    const int r = pthread_mutex_lock(&_m);
+    assert(r == 0);
+    ++nlock;
+    assert(recursive || nlock == 1);
+  }
+
+  void Unlock() {
+    assert(nlock > 0);
+    nlock -= 1;
+    const int r = pthread_mutex_unlock(&_m);
+    assert(r == 0);
+  }
+
+  friend class Cond;
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef _Sem_Posix_
+#define _Sem_Posix_
+
+#include <cassert>
+
+// Counting semaphore built from a Mutex and a Cond.
+class Semaphore
+{
+  Mutex m;
+  Cond c;
+  int count;   // number of Put()s not yet consumed by a Get()
+
+ public:
+
+  Semaphore() : count(0) {}
+
+  // Add one unit and signal the condition.
+  void Put()
+  {
+    m.Lock();
+    ++count;
+    c.Signal();
+    m.Unlock();
+  }
+
+  // Block until a unit is available, then consume it.
+  void Get()
+  {
+    m.Lock();
+    while (!(count > 0))
+      c.Wait(m);
+    --count;
+    m.Unlock();
+  }
+};
+
+#endif // !_Sem_Posix_
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __THREAD_H
+#define __THREAD_H
+
+#include <pthread.h>
+
+// Base class for an object that runs entry() on its own pthread.
+class Thread {
+ private:
+  pthread_t thread_id;   // 0 while no thread is running
+
+  // pthread start routine: forwards to the object's entry().
+  static void *_entry_func(void *arg) {
+    Thread *self = (Thread*)arg;
+    return self->entry();
+  }
+
+ public:
+  Thread() : thread_id(0) {}
+  virtual ~Thread() {}
+
+  // Body of the thread; its return value is what join() reports.
+  virtual void *entry() = 0;
+
+  pthread_t &get_thread_id() { return thread_id; }
+  bool is_started() { return !(thread_id == 0); }
+
+  bool am_self() {
+    return thread_id == pthread_self();
+  }
+
+  // Start the thread; returns pthread_create()'s result.
+  int create() {
+    return pthread_create( &thread_id, NULL, _entry_func, (void*)this );
+  }
+
+  // Wait for the thread to finish. Returns -1 if it was never started,
+  // otherwise pthread_join()'s status (non-zero also trips an assert).
+  int join(void **prval = 0) {
+    if (!is_started()) return -1;  // never started.
+
+    const int status = pthread_join(thread_id, prval);
+    if (status != 0) {
+      cout << "join status = " << status << endl;
+      assert(0);
+      return status;
+    }
+    thread_id = 0;
+    return status;
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef THREADPOOL
+#define THREADPOOL
+
+#include <list>
+using namespace std;
+
+
+#include <pthread.h>
+#include <common/Mutex.h>
+#include <common/Cond.h>
+#include <common/Semaphore.h>
+
+
+// debug output: tpdout(x) streams to cout, prefixed with `myname`, when x <= g_conf.debug. It expands to a bare `if`, so do not put it directly in front of an `else`.
+#include "config.h"
+#define tpdout(x) if (x <= g_conf.debug) cout << myname
+#define DBLVL 15
+
+
+using namespace std;
+
+#define MAX_THREADS 1000 // NOTE(review): not referenced anywhere else in this header
+
+// Fixed-size pool of worker threads. Each op passed to put_op() is handed
+// to one worker, which calls prefunc(u, op) (if set, and if op is non-zero)
+// under the queue lock and then func(u, op) outside it.
+template <class U, class T>
+class ThreadPool {
+
+ private:
+  list<T> q;           // pending ops; only touched while holding q_lock
+  Mutex q_lock;
+  Semaphore q_sem;     // one Put() per queued op, plus one per thread at shutdown
+
+  int num_ops;
+  int num_threads;
+  vector<pthread_t> thread;
+
+  U u;
+  void (*func)(U,T);
+  void (*prefunc)(U,T);
+  string myname;
+
+  // pthread start routine: arg is the pool itself.
+  static void *foo(void *arg)
+  {
+    ThreadPool *t = (ThreadPool *)arg;
+    t->do_ops(arg);
+    return 0;
+  }
+
+  // Worker loop: wait for a semaphore unit, take one op, run it. A wakeup
+  // that finds the queue empty is the shutdown signal.
+  void *do_ops(void *nothing)
+  {
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl;
+    while (1) {
+      q_sem.Get();
+
+      T op;
+      if (!get_op(op)) break;
+
+      tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl;
+      func(u, op);
+    }
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl;
+    return 0;
+  }
+
+  // Pop the next op into `op`. Returns false (leaving `op` untouched) when
+  // the queue is empty. The emptiness test and the pop happen under the
+  // same q_lock hold, so two workers can never both decide to pop the last
+  // element, and q is never read while another thread is modifying it.
+  bool get_op(T& op)
+  {
+    bool have = false;
+    q_lock.Lock();
+    if (!q.empty()) {
+      op = q.front();
+      q.pop_front();
+      num_ops--;
+      have = true;
+
+      if (prefunc && op) {
+        tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << endl;
+        prefunc(u, op);
+      }
+    }
+    q_lock.Unlock();
+
+    return have;
+  }
+
+ public:
+
+  // Start `howmany` worker threads. f is run for every op; pf, if given,
+  // is run first while the queue lock is held.
+  ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) :
+    num_ops(0), num_threads(howmany),
+    thread(num_threads),
+    u(obj),
+    func(f), prefunc(pf),
+    myname(myname) {
+    tpdout(DBLVL) << ".cons num_threads " << num_threads << endl;
+
+    // start threads
+    int status;
+    for(int i = 0; i < howmany; i++) {
+      status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this);
+      assert(status == 0);
+    }
+  }
+
+  ~ThreadPool() {
+    // bump sem to make threads exit cleanly
+    for(int i = 0; i < num_threads; i++)
+      q_sem.Put();
+
+    // wait for them to die
+    for(int i = 0; i < num_threads; i++) {
+      tpdout(DBLVL) << ".des joining thread " << thread[i] << endl;
+      void *rval = 0;  // we don't actually care
+      pthread_join(thread[i], &rval);
+    }
+  }
+
+  // Queue op and wake one worker.
+  void put_op(T op) {
+    tpdout(DBLVL) << ".put_op " << op << endl;
+    q_lock.Lock();
+    q.push_back(op);
+    num_ops++;
+    q_sem.Put();
+    q_lock.Unlock();
+  }
+
+};
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+
+#include "Timer.h"
+#include "Cond.h"
+
+#include "config.h"
+#include "include/Context.h"
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug) cout << "Timer: "
+
+#define DBL 10 // debug level passed to dout() by most Timer messages
+
+#include <signal.h>
+#include <sys/time.h>
+#include <math.h>
+
+// single global instance (declared extern in Timer.h)
+Timer g_timer;
+
+
+/**** thread solution *****/
+
+void Timer::timer_entry() // timer thread body: run due callbacks, otherwise sleep until kicked or until the next event time; loops until thread_stop is set
+{
+ lock.Lock();
+ 
+ while (!thread_stop) {
+ 
+ // sample the current time once per iteration
+ utime_t now = g_clock.now();
+
+ // any events due? 'next' receives the earliest scheduled time; 'event' is 0 when nothing is scheduled
+ utime_t next;
+ Context *event = get_next_scheduled(next);
+ 
+ list<Context*> pending;
+ 
+ if (event && now >= next) {
+ // move every event scheduled at or before 'now' to the pending list
+ map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+ while (it != scheduled.end()) {
+ if (it->first > now) break;
+ 
+ utime_t t = it->first;
+ dout(DBL) << "queueing event(s) scheduled at " << t << endl;
+ 
+ for (multiset<Context*>::iterator cit = it->second.begin();
+ cit != it->second.end();
+ cit++) {
+ pending.push_back(*cit);
+ event_times.erase(*cit); // drop the event -> time mapping as well
+ num_event--;
+ }
+
+ map< utime_t, multiset<Context*> >::iterator previt = it;
+ it++; // step past the entry before erasing it
+ scheduled.erase(previt);
+ }
+
+ if (!pending.empty()) {
+ sleeping = false;
+ lock.Unlock();
+ { // make sure we're not holding any locks while we do callbacks
+ // make the callbacks myself.
+ for (list<Context*>::iterator cit = pending.begin();
+ cit != pending.end();
+ cit++) {
+ dout(DBL) << "doing callback " << *cit << endl;
+ (*cit)->finish(0); // NOTE(review): the Context is not deleted here; presumably finish() or its owner disposes of it -- confirm
+ }
+ pending.clear();
+ assert(pending.empty());
+ }
+ lock.Lock();
+ }
+
+ }
+
+ else {
+ // sleep: with a deadline if something is scheduled, indefinitely otherwise
+ if (event) {
+ dout(DBL) << "sleeping until " << next << endl;
+ timed_sleep = true; // tells wakers to signal timeout_cond
+ sleeping = true; // NOTE(review): only cleared on the callback path above, so it is still true after this wait returns
+ timeout_cond.WaitUntil(lock, next); // wait for waker or time
+ utime_t now = g_clock.now(); // shadows the outer 'now'; used only by the debug line below
+ dout(DBL) << "kicked or timed out at " << now << endl;
+ } else {
+ dout(DBL) << "sleeping" << endl;
+ timed_sleep = false; // tells wakers to signal sleep_cond
+ sleeping = true;
+ sleep_cond.Wait(lock); // wait for waker
+ utime_t now = g_clock.now(); // shadows the outer 'now'; used only by the debug line below
+ dout(DBL) << "kicked at " << now << endl;
+ }
+ }
+ }
+
+ lock.Unlock();
+}
+
+
+
+/**
+ * Timer bits
+ */
+
+// Make sure the timer thread will notice newly scheduled work: start the
+// thread if it is not running, or signal it if it is marked as sleeping.
+void Timer::register_timer()
+{
+  if (!timer_thread.is_started()) {
+    dout(DBL) << "register_timer starting thread" << endl;
+    timer_thread.create();
+    return;
+  }
+
+  if (!sleeping) {
+    // it's probably doing callbacks.
+    dout(DBL) << "register_timer doing nothing; thread is alive but not sleeping" << endl;
+    return;
+  }
+
+  dout(DBL) << "register_timer kicking thread" << endl;
+  // signal whichever condition the thread last said it would wait on
+  if (!timed_sleep) {
+    sleep_cond.SignalAll();
+  } else {
+    timeout_cond.SignalAll();
+  }
+}
+
+// Ask the timer thread to stop and wait for it to exit. Does nothing if
+// the thread is not started.
+void Timer::cancel_timer()
+{
+  if (!timer_thread.is_started())
+    return;
+
+  dout(10) << "setting thread_stop flag" << endl;
+  lock.Lock();
+  thread_stop = true;
+  // wake the thread on whichever condition it last said it would wait on
+  if (!timed_sleep) {
+    sleep_cond.SignalAll();
+  } else {
+    timeout_cond.SignalAll();
+  }
+  lock.Unlock();
+
+  dout(10) << "waiting for thread to finish" << endl;
+  void *ptr;
+  timer_thread.join(&ptr);
+
+  dout(10) << "thread finished, exit code " << ptr << endl;
+}
+
+
+/*
+ * schedule
+ */
+
+
+// Schedule callback to run `seconds` from now.
+//
+// The fractional part of `seconds` is honoured: previously the delay was
+// truncated to whole seconds, so any delay below one second fired
+// immediately.
+void Timer::add_event_after(float seconds,
+                            Context *callback)
+{
+  utime_t when = g_clock.now();
+
+  // split the delay into whole seconds and microseconds
+  long add_sec = (long)seconds;
+  const long add_usec = (long)((seconds - (float)add_sec) * 1000000.0);
+
+  // assumes usec_ref() holds microseconds in [0, 1000000) -- TODO confirm against utime_t
+  long usec = (long)when.usec_ref() + add_usec;
+  if (usec >= 1000000) {
+    usec -= 1000000;
+    add_sec++;
+  } else if (usec < 0) {   // negative fractional delay
+    usec += 1000000;
+    add_sec--;
+  }
+
+  when.sec_ref() += add_sec;
+  when.usec_ref() = usec;
+  add_event_at(when, callback);
+}
+
+// Schedule callback to run at the absolute time `when`, then make sure the
+// timer thread is running/awake. A given Context may only be scheduled once.
+void Timer::add_event_at(utime_t when,
+                         Context *callback)
+{
+  dout(DBL) << "add_event " << callback << " at " << when << endl;
+
+  lock.Lock();
+  {
+    // time -> events
+    multiset<Context*>& bucket = scheduled[ when ];
+    bucket.insert(callback);
+
+    // event -> time
+    assert(event_times.count(callback) == 0);  // err.. there can be only one (for now!)
+    event_times[callback] = when;
+
+    ++num_event;
+
+    // make sure i wake up
+    register_timer();
+  }
+  lock.Unlock();
+}
+
+// Remove a scheduled callback. Returns false if it was not scheduled.
+// The Context itself is not deleted.
+//
+// Two pieces of bookkeeping are kept consistent here:
+//  - a time slot whose last event is cancelled is removed from `scheduled`,
+//    so get_next_scheduled() never sees an empty multiset;
+//  - num_event is decremented, mirroring the increment in add_event_at().
+bool Timer::cancel_event(Context *callback)
+{
+  lock.Lock();
+
+  dout(DBL) << "cancel_event " << callback << endl;
+
+  hash_map< Context*, utime_t >::iterator et = event_times.find(callback);
+  if (et == event_times.end()) {
+    dout(DBL) << "cancel_event " << callback << " wasn't scheduled?" << endl;
+    lock.Unlock();
+    //assert(0);
+    return false;     // wasn't scheduled.
+  }
+
+  const utime_t tp = et->second;
+  map< utime_t, multiset<Context*> >::iterator bucket = scheduled.find(tp);
+  assert(bucket != scheduled.end());
+
+  multiset<Context*>::iterator p = bucket->second.find(callback);  // there may be more than one?
+  assert(p != bucket->second.end());
+  bucket->second.erase(p);
+  if (bucket->second.empty()) {
+    scheduled.erase(bucket);
+  }
+
+  event_times.erase(et);
+  num_event--;
+
+  lock.Unlock();
+  return true;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __TIMER_H
+#define __TIMER_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "Clock.h"
+
+#include "Mutex.h"
+#include "Cond.h"
+#include "Thread.h"
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+/*** Timer
+ * schedule callbacks
+ */
+
+//class Messenger;
+
+
+namespace __gnu_cxx {
+  // Hash a Context pointer by its address so it can key a hash_map.
+  template<> struct hash<Context*> {
+    size_t operator()(const Context *p) const {
+      const unsigned long addr = (unsigned long)p;
+      return hash<unsigned long>()(addr);
+    }
+  };
+}
+
+
+// Runs Context callbacks at scheduled times on a dedicated thread.
+class Timer {
+ private:
+  map< utime_t, multiset<Context*> > scheduled;    // time -> (context ...)
+  hash_map< Context*, utime_t > event_times;       // event -> time
+
+  // Get the earliest scheduled time (stored in `when`) and one event
+  // scheduled at that time. Returns 0 if nothing is scheduled.
+  // Time slots with no events left in them are discarded on the way, so
+  // begin() of an empty multiset is never dereferenced.
+  Context* get_next_scheduled(utime_t& when) {
+    while (!scheduled.empty()) {
+      map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+      if (it->second.empty()) {
+        scheduled.erase(it);
+        continue;
+      }
+      when = it->first;
+      return *it->second.begin();
+    }
+    return 0;
+  }
+
+  void register_timer();  // start the timer thread, or kick it if it is sleeping
+  void cancel_timer();    // set thread_stop, wake the timer thread and join it
+
+  //pthread_t thread_id;
+  bool thread_stop;       // set by cancel_timer(); ends the loop in timer_entry()
+  Mutex lock;
+  bool timed_sleep;       // which condition the thread last chose: true = timeout_cond, false = sleep_cond
+  bool sleeping;
+  Cond sleep_cond;
+  Cond timeout_cond;
+
+ public:
+  void timer_entry();    // waiter thread (that wakes us up)
+
+  // Thread object whose entry() runs the owning Timer's timer_entry().
+  class TimerThread : public Thread {
+    Timer *t;
+  public:
+    void *entry() {
+      t->timer_entry();
+      return 0;
+    }
+    TimerThread(Timer *_t) : t(_t) {}
+  } timer_thread;
+
+
+  int num_event;          // number of events currently scheduled
+
+
+ public:
+  Timer() :
+    thread_stop(false),
+    timed_sleep(false),
+    sleeping(false),
+    timer_thread(this),
+    num_event(0)
+  {
+  }
+  ~Timer() {
+    // stop.
+    cancel_timer();
+
+    // delete whatever is still scheduled
+    for (map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+         it != scheduled.end();
+         it++) {
+      for (multiset<Context*>::iterator sit = it->second.begin();
+           sit != it->second.end();
+           sit++)
+        delete *sit;
+    }
+    scheduled.clear();
+    event_times.clear();  // these pointers were just deleted
+    num_event = 0;
+  }
+
+  void init() {
+    register_timer();
+  }
+  void shutdown() {
+    cancel_timer();
+  }
+
+  // schedule events
+  void add_event_after(float seconds,
+                       Context *callback);
+  void add_event_at(utime_t when,
+                    Context *callback);
+  bool cancel_event(Context *callback);
+
+  // execute pending events
+  // NOTE(review): no definition of this is visible in Timer.cc -- confirm it exists or drop it
+  void execute_pending();
+
+};
+
+
+// single global instance (defined in Timer.cc)
+extern Timer g_timer;
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "config.h"
+#include "include/types.h"
+
+//#define MDS_CACHE_SIZE 4*10000 -> <20mb
+//#define MDS_CACHE_SIZE 80000 62mb
+
+#define AVG_PER_INODE_SIZE 450 // presumably bytes of memory per cached inode -- TODO confirm
+#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) // x * 1000000 / AVG_PER_INODE_SIZE
+
+//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 )
+//#define MDS_CACHE_SIZE 1500000
+#define MDS_CACHE_SIZE 150000 // default for g_conf.mds_cache_size; g_conf.mds_log_max_len is derived from it
+
+
+// hack hack hack ugly FIXME: buffer bookkeeping globals are defined here, next to the configuration
+#include "common/Mutex.h"
+long buffer_total_alloc = 0; // NOTE(review): presumably a running total maintained by the buffer code under bufferlock -- confirm
+Mutex bufferlock;
+
+
+
+FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 ); // stripe over 1M objects, 2x replication
+//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4
+
+// ??
+//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 ); // this is stupid, but can bring out an ebofs table bug?
+FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication
+
+// mds log layout. the active one uses 1M objects; the 128/256 byte striping variants (which had to match mds_log_pad_entry below) are commented out
+FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects
+//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 ); // 256 byte bits
+//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 ); // 128 byte stripes over 32 1M objects
+//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 ); // pathological case to test striping buffer mapping
+//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); // old way
+
+// fake osd failures: osd -> time
+std::map<int,float> g_fake_osd_down; // filled by --fake_osd_down <osd> <when>
+std::map<int,float> g_fake_osd_out; // filled by --fake_osd_out <osd> <when>
+
+md_config_t g_debug_after_conf; // receives the --debug* values that are parsed while g_conf.debug_after is set
+
+md_config_t g_conf = { // built-in defaults; parse_config_options() overrides individual fields
+ num_mon: 1,
+ num_mds: 1,
+ num_osd: 4,
+ num_client: 1,
+
+ mkfs: false,
+
+ // profiling and debugging
+ log: true, // when false, every Logger call returns immediately
+ log_interval: 1, // seconds between Logger output lines
+ log_name: (char*)0, // optional subdirectory of log/ for Logger output
+
+ log_messages: true,
+ log_pins: true,
+
+ fake_clock: false,
+ fakemessenger_serialize: true,
+
+ fake_osdmap_expand: 0,
+ fake_osdmap_updates: 0,
+ fake_osd_mttf: 0,
+ fake_osd_mttr: 0,
+
+ osd_remount_at: 0,
+
+ kill_after: 0,
+
+ tick: 0,
+
+ debug: 0,
+ debug_mds: 1,
+ debug_mds_balancer: 1,
+ debug_mds_log: 1,
+ debug_buffer: 0,
+ debug_filer: 0,
+ debug_objecter: 0,
+ debug_objectcacher: 0,
+ debug_client: 0,
+ debug_osd: 0,
+ debug_ebofs: 1,
+ debug_bdev: 1, // block device
+ debug_ns: 0,
+ debug_ms: 0,
+ debug_mon: 0,
+ 
+ debug_after: 0,
+ 
+ // --- clock ---
+ clock_lock: false,
+ 
+ // --- messenger ---
+ ms_single_dispatch: false,
+ ms_requeue_on_sender_fail: false,
+
+ ms_stripe_osds: false,
+ ms_skip_rank0: false,
+ ms_overlay_clients: false,
+
+ ms_die_on_failure: false,
+
+ /*tcp_skip_rank0: false,
+ tcp_overlay_clients: false, // over osds!
+ tcp_log: false,
+ tcp_serial_marshall: true,
+ tcp_serial_out: false,
+ tcp_multi_out: true,
+ tcp_multi_dispatch: false, // not fully implemented yet
+ */
+
+ // --- mon ---
+ mon_tick_interval: 5,
+ mon_osd_down_out_interval: 5, // seconds
+ mon_lease: 2.000, // seconds
+
+ // --- client ---
+ client_cache_size: 300,
+ client_cache_mid: .5,
+ client_cache_stat_ttl: 0, // seconds until cached stat results become invalid
+ client_cache_readdir_ttl: 1, // 1 second only
+ client_use_random_mds: false,
+
+ client_sync_writes: 0,
+
+ client_oc: true,
+ client_oc_size: 1024*1024* 5, // MB * n (5 MB)
+ client_oc_max_dirty: 1024*1024* 5, // MB * n (5 MB)
+ client_oc_max_sync_write: 128*1024, // writes >= this use wrlock
+
+ client_trace: 0,
+ fuse_direct_io: 0,
+ 
+ // --- objecter ---
+ objecter_buffer_uncommitted: true,
+
+ // --- journaler ---
+ journaler_allow_split_entries: false,
+
+ // --- mds ---
+ mds_cache_size: MDS_CACHE_SIZE,
+ mds_cache_mid: .7,
+
+ mds_decay_halflife: 30, // seconds; half-life used by DecayCounter's default constructor
+
+ mds_log: true,
+ mds_log_max_len: MDS_CACHE_SIZE / 3,
+ mds_log_max_trimming: 10000,
+ mds_log_read_inc: 1<<20,
+ mds_log_pad_entry: 128,//256,//64,
+ mds_log_before_reply: true,
+ mds_log_flush_on_shutdown: true,
+
+ mds_bal_replicate_threshold: 2000,
+ mds_bal_unreplicate_threshold: 0,//500,
+ mds_bal_hash_rd: 10000,
+ mds_bal_unhash_rd: 1000,
+ mds_bal_hash_wr: 10000,
+ mds_bal_unhash_wr: 1000,
+ mds_bal_interval: 30, // seconds
+ mds_bal_hash_interval: 5, // seconds
+ mds_bal_idle_threshold: .1,
+ mds_bal_max: -1,
+ mds_bal_max_until: -1,
+
+ mds_bal_mode: 0,
+ mds_bal_min_start: .2, // if we need less than this, we don't do anything
+ mds_bal_need_min: .8, // take within this range of what we need
+ mds_bal_need_max: 1.2,
+ mds_bal_midchunk: .3, // any sub bigger than this taken in full
+ mds_bal_minchunk: .001, // never take anything smaller than this
+
+ mds_commit_on_shutdown: true,
+ mds_shutdown_check: 0, //30,
+
+ mds_verify_export_dirauth: true,
+
+ mds_local_osd: false,
+
+
+ // --- osd ---
+ osd_rep: OSD_REP_PRIMARY,
+ osd_balance_reads: false,
+ osd_pg_bits: 0, // 0 == let osdmonitor decide
+ osd_object_layout: OBJECT_LAYOUT_HASHINO,
+ osd_pg_layout: PG_LAYOUT_CRUSH,
+ osd_max_rep: 4,
+ osd_maxthreads: 2, // 0 == no threading
+ osd_max_opq: 10,
+ osd_mkfs: false,
+ osd_age: .8,
+ osd_age_time: 0,
+ osd_heartbeat_interval: 5, // shut up while i'm debugging
+ osd_replay_window: 5,
+ osd_max_pull: 2,
+ osd_pad_pg_log: false,
+ 
+ // --- fakestore ---
+ fakestore_fake_sync: 2, // 2 seconds
+ fakestore_fsync: false,//true,
+ fakestore_writesync: false,
+ fakestore_syncthreads: 4,
+ fakestore_fakeattr: true,
+ fakestore_dev: 0,
+
+ // --- ebofs ---
+ ebofs: 1,
+ ebofs_cloneable: false,
+ ebofs_verify: false,
+ ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing)
+ ebofs_idle_commit_ms: 100, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms
+ ebofs_oc_size: 10000, // onode cache
+ ebofs_cc_size: 10000, // cnode cache
+ ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB
+ ebofs_bc_max_dirty: (60 *256), // before write() will block
+ ebofs_max_prefetch: 1000, // 4k blocks
+ ebofs_realloc: true,
+ 
+ ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind)
+ ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation)
+
+ // --- obfs ---
+ uofs: 0,
+ uofs_fake_sync: 2, // 2 seconds
+ uofs_cache_size: 1 << 28, //256MB
+ uofs_onode_size: (int)1024,
+ uofs_small_block_size: (int)4096, //4KB
+ uofs_large_block_size: (int)524288, //512KB
+ uofs_segment_size: (int)268435456, //256MB
+ uofs_block_meta_ratio: (int)10,
+ uofs_sync_write: (int)0,
+ uofs_nr_hash_buckets: (int)1023,
+ uofs_flush_interval: (int)5, //seconds
+ uofs_min_flush_pages: (int)1024, // NOTE(review): value is 1024; the earlier comment read "4096 4k-pages" -- confirm which is intended
+ uofs_delay_allocation: (int)1, //true
+
+ // --- block device ---
+ bdev_lock: true,
+ bdev_iothreads: 1, // number of ios to queue with kernel
+ bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this seems to break things, not sure why yet **
+ bdev_el_fw_max_ms: 10000, // restart elevator at least once every 10000 ms
+ bdev_el_bw_max_ms: 3000, // restart elevator at least once every 3000 ms
+ bdev_el_bidir: true, // bidirectional elevator?
+ bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call
+ bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps
+ bdev_fake_mb: 0,
+ bdev_fake_max_mb: 0,
+
+ // --- fakeclient (mds regression testing) (ancient history) ---
+ num_fakeclient: 100,
+ fakeclient_requests: 100,
+ fakeclient_deterministic: false,
+
+ fakeclient_op_statfs: false,
+
+ // loosely based on Roselli workload paper numbers
+ fakeclient_op_stat: 610,
+ fakeclient_op_lstat: false,
+ fakeclient_op_utime: 0,
+ fakeclient_op_chmod: 1,
+ fakeclient_op_chown: 1,
+
+ fakeclient_op_readdir: 2,
+ fakeclient_op_mknod: 30,
+ fakeclient_op_link: false,
+ fakeclient_op_unlink: 20,
+ fakeclient_op_rename: 0,//40,
+
+ fakeclient_op_mkdir: 10,
+ fakeclient_op_rmdir: 20,
+ fakeclient_op_symlink: 20,
+
+ fakeclient_op_openrd: 200,
+ fakeclient_op_openwr: 0,
+ fakeclient_op_openwrc: 0,
+ fakeclient_op_read: false, // osd!
+ fakeclient_op_write: false, // osd!
+ fakeclient_op_truncate: false,
+ fakeclient_op_fsync: false,
+ fakeclient_op_close: 200
+};
+
+
+#include <stdlib.h>
+#include <string.h>
+
+
+// Split the CEPH_ARGS environment variable on single spaces and append
+// each piece to args. Does nothing if CEPH_ARGS is not set.
+//
+// The pieces point into a heap copy of the variable that is deliberately
+// never freed, because the pointers stored in args must stay valid for as
+// long as the caller keeps them. (The previous fixed 1000-byte static
+// buffer overflowed on longer values and was overwritten by a second call.)
+//
+// Splitting is unchanged: consecutive, leading or trailing spaces yield
+// empty strings, and an empty CEPH_ARGS yields one empty argument.
+void env_to_vec(std::vector<char*>& args)
+{
+  const char *p = getenv("CEPH_ARGS");
+  if (!p) return;
+
+  const size_t len = strlen(p);
+  char *buf = (char*)malloc(len + 1);
+  if (!buf) return;            // out of memory: leave args untouched
+  memcpy(buf, p, len + 1);     // includes the terminating NUL
+  //cout << "CEPH_ARGS " << buf << endl;
+
+  size_t l = 0;                // start of the piece being scanned
+  for (size_t i = 0; i < len; i++) {
+    if (buf[i] == ' ') {
+      buf[i] = 0;
+      args.push_back(buf+l);
+      //cout << "arg " << (buf+l) << endl;
+      l = i+1;
+    }
+  }
+  args.push_back(buf+l);
+  //cout << "arg " << (buf+l) << endl;
+}
+
+
+// Append argv[1] .. argv[argc-1] to args (argv[0] is skipped).
+void argv_to_vec(int argc, char **argv,
+                 std::vector<char*>& args)
+{
+  if (argc > 1)
+    args.insert(args.end(), argv + 1, argv + argc);
+}
+
+// Build a malloc()ed argv array from args.
+//
+// On return argv[0] is the placeholder program name "asdf", argv[1..] are
+// the entries of args (pointers are shared, not copied), argc is
+// args.size() + 1 and argv[argc] is NULL. The caller owns the array.
+// If the allocation fails, argv is NULL and argc is 0.
+//
+// The incoming value of argc is ignored; the array used to be sized from
+// it, which overflowed the allocation whenever it was smaller than
+// args.size() + 1.
+void vec_to_argv(std::vector<char*>& args,
+                 int& argc, char **&argv)
+{
+  static char progname[] = "asdf";   // writable storage; a literal is not a char*
+
+  argc = 0;
+  argv = (char**)malloc(sizeof(char*) * (args.size() + 2));
+  if (!argv) return;
+
+  argv[argc++] = progname;
+  for (unsigned i=0; i<args.size(); i++)
+    argv[argc++] = args[i];
+  argv[argc] = 0;
+}
+
+/*
+ * Return the value that follows option args[i], advancing i onto it.
+ *
+ * Asserts that such a value exists: a value-taking option given as the very
+ * last argument would otherwise index one past the end of args.  assert() is
+ * what this parser already uses for malformed arguments (see the layout names
+ * below).
+ */
+static char *option_value(std::vector<char*>& args, unsigned& i)
+{
+  assert(i + 1 < args.size());
+  return args[++i];
+}
+
+/*
+ * parse_config_options
+ *
+ * Scan args for the "--option [value]" arguments known here and store them
+ * into g_conf, g_debug_after_conf, the g_fake_osd_* maps and the g_OSD_*
+ * file layouts.  Recognised options (and their values) are consumed; every
+ * other argument is kept, in order, and handed back to the caller in args.
+ *
+ * Numeric values are converted with atoi(), or atof() when the destination
+ * member is a float.  String values (--log_name, --fakestore_dev) are stored
+ * as pointers into the argument strings, not copied.
+ */
+void parse_config_options(std::vector<char*>& args)
+{
+  std::vector<char*> nargs;
+
+  for (unsigned i=0; i<args.size(); i++) {
+    char *opt = args[i];
+
+    // Once --debug_after has been seen (with a nonzero delay), the --debug*
+    // options describe the levels to switch to later instead of the live ones.
+    md_config_t& dbg = g_conf.debug_after ? g_debug_after_conf : g_conf;
+
+    if (strcmp(opt, "--nummon") == 0)
+      g_conf.num_mon = atoi(option_value(args, i));
+    else if (strcmp(opt, "--nummds") == 0)
+      g_conf.num_mds = atoi(option_value(args, i));
+    else if (strcmp(opt, "--numclient") == 0)
+      g_conf.num_client = atoi(option_value(args, i));
+    else if (strcmp(opt, "--numosd") == 0)
+      g_conf.num_osd = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--ms_single_dispatch") == 0)
+      g_conf.ms_single_dispatch = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ms_stripe_osds") == 0)
+      g_conf.ms_stripe_osds = true;
+    else if (strcmp(opt, "--ms_skip_rank0") == 0)
+      g_conf.ms_skip_rank0 = true;
+    else if (strcmp(opt, "--ms_overlay_clients") == 0)
+      g_conf.ms_overlay_clients = true;
+    else if (strcmp(opt, "--ms_die_on_failure") == 0)
+      g_conf.ms_die_on_failure = true;
+
+    /*else if (strcmp(opt, "--tcp_log") == 0)
+      g_conf.tcp_log = true;
+    else if (strcmp(opt, "--tcp_multi_out") == 0)
+      g_conf.tcp_multi_out = atoi(option_value(args, i));
+    */
+
+    else if (strcmp(opt, "--mkfs") == 0)
+      g_conf.osd_mkfs = g_conf.mkfs = 1;   // flag only, takes no value
+
+    else if (strcmp(opt, "--fake_osdmap_expand") == 0)
+      g_conf.fake_osdmap_expand = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fake_osdmap_updates") == 0)
+      g_conf.fake_osdmap_updates = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fake_osd_mttf") == 0)
+      g_conf.fake_osd_mttf = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fake_osd_mttr") == 0)
+      g_conf.fake_osd_mttr = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fake_osd_down") == 0) {
+      // two values: <osd> <when>
+      int osd = atoi(option_value(args, i));
+      float when = atof(option_value(args, i));
+      g_fake_osd_down[osd] = when;
+    }
+    else if (strcmp(opt, "--fake_osd_out") == 0) {
+      // two values: <osd> <when>
+      int osd = atoi(option_value(args, i));
+      float when = atof(option_value(args, i));
+      g_fake_osd_out[osd] = when;
+    }
+    else if (strcmp(opt, "--osd_remount_at") == 0)
+      g_conf.osd_remount_at = atoi(option_value(args, i));
+    //else if (strcmp(opt, "--fake_osd_sync") == 0)
+    //g_conf.fake_osd_sync = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--debug") == 0)
+      dbg.debug = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_mds") == 0)
+      dbg.debug_mds = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_mds_balancer") == 0)
+      dbg.debug_mds_balancer = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_mds_log") == 0)
+      dbg.debug_mds_log = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_buffer") == 0)
+      dbg.debug_buffer = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_filer") == 0)
+      dbg.debug_filer = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_objecter") == 0)
+      dbg.debug_objecter = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_objectcacher") == 0)
+      dbg.debug_objectcacher = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_client") == 0)
+      dbg.debug_client = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_osd") == 0)
+      dbg.debug_osd = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_ebofs") == 0)
+      dbg.debug_ebofs = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_bdev") == 0)
+      dbg.debug_bdev = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_ms") == 0)
+      dbg.debug_ms = atoi(option_value(args, i));
+    else if (strcmp(opt, "--debug_mon") == 0)
+      dbg.debug_mon = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--debug_after") == 0) {
+      g_conf.debug_after = atoi(option_value(args, i));
+      // start the "after" settings from a snapshot of everything parsed so far
+      g_debug_after_conf = g_conf;
+    }
+
+    else if (strcmp(opt, "--log") == 0)
+      g_conf.log = atoi(option_value(args, i));
+    else if (strcmp(opt, "--log_name") == 0)
+      g_conf.log_name = option_value(args, i);
+
+    else if (strcmp(opt, "--fakemessenger_serialize") == 0)
+      g_conf.fakemessenger_serialize = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--clock_lock") == 0)
+      g_conf.clock_lock = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--objecter_buffer_uncommitted") == 0)
+      g_conf.objecter_buffer_uncommitted = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_cache_size") == 0)
+      g_conf.mds_cache_size = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_log") == 0)
+      g_conf.mds_log = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_log_before_reply") == 0)
+      g_conf.mds_log_before_reply = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_log_max_len") == 0)
+      g_conf.mds_log_max_len = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_log_read_inc") == 0)
+      g_conf.mds_log_read_inc = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_log_max_trimming") == 0)
+      g_conf.mds_log_max_trimming = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_commit_on_shutdown") == 0)
+      g_conf.mds_commit_on_shutdown = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_shutdown_check") == 0)
+      g_conf.mds_shutdown_check = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_log_flush_on_shutdown") == 0)
+      g_conf.mds_log_flush_on_shutdown = atoi(option_value(args, i));
+
+    // The members below marked atof() are declared float in md_config_t;
+    // atoi() would silently drop the fractional part of e.g. "0.5".
+    else if (strcmp(opt, "--mds_decay_halflife") == 0)
+      g_conf.mds_decay_halflife = atof(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_bal_interval") == 0)
+      g_conf.mds_bal_interval = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_rep") == 0)
+      g_conf.mds_bal_replicate_threshold = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_unrep") == 0)
+      g_conf.mds_bal_unreplicate_threshold = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_max") == 0)
+      g_conf.mds_bal_max = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_max_until") == 0)
+      g_conf.mds_bal_max_until = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_bal_hash_rd") == 0)
+      g_conf.mds_bal_hash_rd = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_hash_wr") == 0)
+      g_conf.mds_bal_hash_wr = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_unhash_rd") == 0)
+      g_conf.mds_bal_unhash_rd = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_unhash_wr") == 0)
+      g_conf.mds_bal_unhash_wr = atof(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_bal_mode") == 0)
+      g_conf.mds_bal_mode = atoi(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_min_start") == 0)
+      g_conf.mds_bal_min_start = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_need_min") == 0)
+      g_conf.mds_bal_need_min = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_need_max") == 0)
+      g_conf.mds_bal_need_max = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_midchunk") == 0)
+      g_conf.mds_bal_midchunk = atof(option_value(args, i));
+    else if (strcmp(opt, "--mds_bal_minchunk") == 0)
+      g_conf.mds_bal_minchunk = atof(option_value(args, i));
+
+    else if (strcmp(opt, "--mds_local_osd") == 0)
+      g_conf.mds_local_osd = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--client_cache_size") == 0)
+      g_conf.client_cache_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_cache_stat_ttl") == 0)
+      g_conf.client_cache_stat_ttl = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_cache_readdir_ttl") == 0)
+      g_conf.client_cache_readdir_ttl = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_trace") == 0)
+      g_conf.client_trace = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fuse_direct_io") == 0)
+      g_conf.fuse_direct_io = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--mon_osd_down_out_interval") == 0)
+      g_conf.mon_osd_down_out_interval = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--client_sync_writes") == 0)
+      g_conf.client_sync_writes = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_oc") == 0)
+      g_conf.client_oc = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_oc_size") == 0)
+      g_conf.client_oc_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--client_oc_max_dirty") == 0)
+      g_conf.client_oc_max_dirty = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--ebofs") == 0)
+      g_conf.ebofs = 1;
+    else if (strcmp(opt, "--ebofs_cloneable") == 0)
+      g_conf.ebofs_cloneable = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_verify") == 0)
+      g_conf.ebofs_verify = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_commit_ms") == 0)
+      g_conf.ebofs_commit_ms = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_idle_commit_ms") == 0)
+      g_conf.ebofs_idle_commit_ms = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_oc_size") == 0)
+      g_conf.ebofs_oc_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_cc_size") == 0)
+      g_conf.ebofs_cc_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_bc_size") == 0)
+      g_conf.ebofs_bc_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_bc_max_dirty") == 0)
+      g_conf.ebofs_bc_max_dirty = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_abp_max_alloc") == 0)
+      g_conf.ebofs_abp_max_alloc = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_max_prefetch") == 0)
+      g_conf.ebofs_max_prefetch = atoi(option_value(args, i));
+    else if (strcmp(opt, "--ebofs_realloc") == 0)
+      g_conf.ebofs_realloc = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--fakestore") == 0) {
+      g_conf.ebofs = 0;
+      //g_conf.osd_pg_bits = 5;
+      //g_conf.osd_maxthreads = 1;
+    }
+    else if (strcmp(opt, "--fakestore_fsync") == 0)
+      g_conf.fakestore_fsync = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fakestore_writesync") == 0)
+      g_conf.fakestore_writesync = atoi(option_value(args, i));
+    else if (strcmp(opt, "--fakestore_dev") == 0)
+      g_conf.fakestore_dev = option_value(args, i);
+
+    else if (strcmp(opt, "--obfs") == 0) {
+      g_conf.uofs = 1;
+      g_conf.osd_maxthreads = 1;  // until feng merges joel's fixes
+    }
+
+    else if (strcmp(opt, "--osd_balance_reads") == 0)
+      g_conf.osd_balance_reads = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_rep") == 0)
+      g_conf.osd_rep = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_rep_chain") == 0)
+      g_conf.osd_rep = OSD_REP_CHAIN;
+    else if (strcmp(opt, "--osd_rep_splay") == 0)
+      g_conf.osd_rep = OSD_REP_SPLAY;
+    else if (strcmp(opt, "--osd_rep_primary") == 0)
+      g_conf.osd_rep = OSD_REP_PRIMARY;
+    else if (strcmp(opt, "--osd_mkfs") == 0)
+      g_conf.osd_mkfs = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_age") == 0)
+      g_conf.osd_age = atof(option_value(args, i));
+    else if (strcmp(opt, "--osd_age_time") == 0)
+      g_conf.osd_age_time = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_pg_bits") == 0)
+      g_conf.osd_pg_bits = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_max_rep") == 0)
+      g_conf.osd_max_rep = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_maxthreads") == 0)
+      g_conf.osd_maxthreads = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_max_pull") == 0)
+      g_conf.osd_max_pull = atoi(option_value(args, i));
+    else if (strcmp(opt, "--osd_pad_pg_log") == 0)
+      g_conf.osd_pad_pg_log = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--bdev_lock") == 0)
+      g_conf.bdev_lock = atoi(option_value(args, i));
+    else if (strcmp(opt, "--bdev_el_bidir") == 0)
+      g_conf.bdev_el_bidir = atoi(option_value(args, i));
+    else if (strcmp(opt, "--bdev_iothreads") == 0)
+      g_conf.bdev_iothreads = atoi(option_value(args, i));
+    else if (strcmp(opt, "--bdev_idle_kick_after_ms") == 0)
+      g_conf.bdev_idle_kick_after_ms = atoi(option_value(args, i));
+    else if (strcmp(opt, "--bdev_fake_mb") == 0)
+      g_conf.bdev_fake_mb = atoi(option_value(args, i));
+    else if (strcmp(opt, "--bdev_fake_max_mb") == 0)
+      g_conf.bdev_fake_max_mb = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--osd_object_layout") == 0) {
+      const char *layout = option_value(args, i);
+      if (strcmp(layout, "linear") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_LINEAR;
+      else if (strcmp(layout, "hashino") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASHINO;
+      else if (strcmp(layout, "hash") == 0) g_conf.osd_object_layout = OBJECT_LAYOUT_HASH;
+      else assert(0);   // unknown layout name
+    }
+
+    else if (strcmp(opt, "--osd_pg_layout") == 0) {
+      const char *layout = option_value(args, i);
+      if (strcmp(layout, "linear") == 0) g_conf.osd_pg_layout = PG_LAYOUT_LINEAR;
+      else if (strcmp(layout, "hash") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HASH;
+      else if (strcmp(layout, "hybrid") == 0) g_conf.osd_pg_layout = PG_LAYOUT_HYBRID;
+      else if (strcmp(layout, "crush") == 0) g_conf.osd_pg_layout = PG_LAYOUT_CRUSH;
+      else assert(0);   // unknown layout name
+    }
+
+    else if (strcmp(opt, "--kill_after") == 0)
+      g_conf.kill_after = atoi(option_value(args, i));
+    else if (strcmp(opt, "--tick") == 0)
+      g_conf.tick = atoi(option_value(args, i));
+
+    else if (strcmp(opt, "--file_layout_ssize") == 0)
+      g_OSD_FileLayout.stripe_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--file_layout_scount") == 0)
+      g_OSD_FileLayout.stripe_count = atoi(option_value(args, i));
+    else if (strcmp(opt, "--file_layout_osize") == 0)
+      g_OSD_FileLayout.object_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--file_layout_num_rep") == 0)
+      g_OSD_FileLayout.num_rep = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_dir_layout_ssize") == 0)
+      g_OSD_MDDirLayout.stripe_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_dir_layout_scount") == 0)
+      g_OSD_MDDirLayout.stripe_count = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_dir_layout_osize") == 0)
+      g_OSD_MDDirLayout.object_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_dir_layout_num_rep") == 0)
+      g_OSD_MDDirLayout.num_rep = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_log_layout_ssize") == 0)
+      g_OSD_MDLogLayout.stripe_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_log_layout_scount") == 0)
+      g_OSD_MDLogLayout.stripe_count = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_log_layout_osize") == 0)
+      g_OSD_MDLogLayout.object_size = atoi(option_value(args, i));
+    else if (strcmp(opt, "--meta_log_layout_num_rep") == 0) {
+      g_OSD_MDLogLayout.num_rep = atoi(option_value(args, i));
+      // zero log replicas switches the mds log off altogether
+      if (!g_OSD_MDLogLayout.num_rep)
+        g_conf.mds_log = false;
+    }
+
+    else {
+      // not ours: leave it for the caller
+      nargs.push_back(opt);
+    }
+  }
+
+  args = nargs;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __CONFIG_H
+#define __CONFIG_H
+
+// Default layouts; class FileLayout is only forward-declared here.
+// parse_config_options() overrides their stripe_size / stripe_count /
+// object_size / num_rep members from the --file_layout_*, --meta_dir_layout_*
+// and --meta_log_layout_* options respectively.
+extern class FileLayout g_OSD_FileLayout;
+extern class FileLayout g_OSD_MDDirLayout;
+extern class FileLayout g_OSD_MDLogLayout;
+
+#include <vector>
+#include <map>
+
+// Keyed by osd number; filled by "--fake_osd_down <osd> <when>" and
+// "--fake_osd_out <osd> <when>" in parse_config_options().
+extern std::map<int,float> g_fake_osd_down;
+extern std::map<int,float> g_fake_osd_out;
+
+// Values for md_config_t::osd_rep; selected on the command line with
+// --osd_rep_primary / --osd_rep_splay / --osd_rep_chain (or "--osd_rep <n>").
+#define OSD_REP_PRIMARY 0
+#define OSD_REP_SPLAY 1
+#define OSD_REP_CHAIN 2
+
+// Process-wide tunables. Two instances are declared below: g_conf and
+// g_debug_after_conf. parse_config_options() fills many of these members
+// from "--<name> [value]" command-line arguments; members it has no option
+// for keep whatever value the instance was defined with.
+//
+// NOTE: member order matters for the debug* block -- see the comment there.
+struct md_config_t {
+ // cluster size (--nummon, --nummds, --numosd, --numclient)
+ int num_mon;
+ int num_mds;
+ int num_osd;
+ int num_client;
+
+ // set (together with osd_mkfs) by the --mkfs flag
+ bool mkfs;
+
+ // profiling
+ bool log;
+ int log_interval;
+ // --log_name stores a pointer into the argument string; it is not copied
+ char *log_name;
+
+ bool log_messages;
+ bool log_pins;
+
+ bool fake_clock;
+ bool fakemessenger_serialize;
+
+ int fake_osdmap_expand;
+ int fake_osdmap_updates;
+ int fake_osd_mttf;
+ int fake_osd_mttr;
+
+ int osd_remount_at;
+
+ // if nonzero, the osd main arms a timer that exits the process after this delay
+ int kill_after;
+
+ int tick;
+
+ // Debug levels. Keep debug .. debug_mon contiguous and debug_after directly
+ // behind them: C_Debug::finish() in the osd main copies the raw memory from
+ // &debug up to &debug_after out of g_debug_after_conf into g_conf.
+ int debug;
+ int debug_mds;
+ int debug_mds_balancer;
+ int debug_mds_log;
+ int debug_buffer;
+ int debug_filer;
+ int debug_objecter;
+ int debug_objectcacher;
+ int debug_client;
+ int debug_osd;
+ int debug_ebofs;
+ int debug_bdev;
+ int debug_ns;
+ int debug_ms;
+ int debug_mon;
+
+ // --debug_after <delay>: once nonzero, later --debug* options are stored in
+ // g_debug_after_conf instead of g_conf
+ int debug_after;
+
+ // clock
+ bool clock_lock;
+
+ // messenger
+
+ /*bool tcp_skip_rank0;
+ bool tcp_overlay_clients;
+ bool tcp_log;
+ bool tcp_serial_marshall;
+ bool tcp_serial_out;
+ bool tcp_multi_out;
+ bool tcp_multi_dispatch;
+ */
+
+ bool ms_single_dispatch;
+ bool ms_requeue_on_sender_fail;
+
+ // plain flags: --ms_stripe_osds etc. take no value and set these to true
+ bool ms_stripe_osds;
+ bool ms_skip_rank0;
+ bool ms_overlay_clients;
+ bool ms_die_on_failure;
+
+ // mon
+ int mon_tick_interval;
+ int mon_osd_down_out_interval;
+ float mon_lease;
+
+ // client
+ int client_cache_size;
+ float client_cache_mid;
+ int client_cache_stat_ttl;
+ int client_cache_readdir_ttl;
+ bool client_use_random_mds; // debug flag
+
+ bool client_sync_writes;
+
+ // client object cache (--client_oc, --client_oc_size, --client_oc_max_dirty)
+ bool client_oc;
+ int client_oc_size;
+ int client_oc_max_dirty;
+ size_t client_oc_max_sync_write;
+
+
+
+ /*
+ bool client_bcache;
+ int client_bcache_alloc_minsize;
+ int client_bcache_alloc_maxsize;
+ int client_bcache_ttl;
+ off_t client_bcache_size;
+ int client_bcache_lowater;
+ int client_bcache_hiwater;
+ size_t client_bcache_align;
+ */
+
+ int client_trace;
+ int fuse_direct_io;
+
+ // objecter
+ bool objecter_buffer_uncommitted;
+
+ // journaler
+ bool journaler_allow_split_entries;
+
+ // mds
+ int mds_cache_size;
+ float mds_cache_mid;
+
+ float mds_decay_halflife;
+
+ // also forced to false by "--meta_log_layout_num_rep 0"
+ bool mds_log;
+ int mds_log_max_len;
+ int mds_log_max_trimming;
+ int mds_log_read_inc;
+ int mds_log_pad_entry;
+ bool mds_log_before_reply;
+ bool mds_log_flush_on_shutdown;
+
+ // mds balancer; the two thresholds below are set by --mds_bal_rep / --mds_bal_unrep
+ float mds_bal_replicate_threshold;
+ float mds_bal_unreplicate_threshold;
+ float mds_bal_hash_rd;
+ float mds_bal_unhash_rd;
+ float mds_bal_hash_wr;
+ float mds_bal_unhash_wr;
+ int mds_bal_interval;
+ int mds_bal_hash_interval;
+ float mds_bal_idle_threshold;
+ int mds_bal_max;
+ int mds_bal_max_until;
+
+ int mds_bal_mode;
+ float mds_bal_min_start;
+ float mds_bal_need_min;
+ float mds_bal_need_max;
+ float mds_bal_midchunk;
+ float mds_bal_minchunk;
+
+ bool mds_commit_on_shutdown;
+ int mds_shutdown_check;
+ bool mds_verify_export_dirauth; // debug flag
+
+ bool mds_local_osd;
+
+
+ // osd
+ // one of OSD_REP_PRIMARY / OSD_REP_SPLAY / OSD_REP_CHAIN
+ int osd_rep;
+ bool osd_balance_reads;
+ int osd_pg_bits;
+ // OBJECT_LAYOUT_* value chosen by --osd_object_layout linear|hashino|hash
+ int osd_object_layout;
+ // PG_LAYOUT_* value chosen by --osd_pg_layout linear|hash|hybrid|crush
+ int osd_pg_layout;
+ int osd_max_rep;
+ // also forced to 1 by the --obfs flag
+ int osd_maxthreads;
+ int osd_max_opq;
+ bool osd_mkfs;
+ float osd_age;
+ int osd_age_time;
+ int osd_heartbeat_interval;
+ int osd_replay_window;
+ int osd_max_pull;
+ bool osd_pad_pg_log;
+
+ int fakestore_fake_sync;
+ bool fakestore_fsync;
+ bool fakestore_writesync;
+ int fakestore_syncthreads; // such crap
+ bool fakestore_fakeattr;
+ // --fakestore_dev stores a pointer into the argument string; it is not copied
+ char *fakestore_dev;
+
+ // ebofs
+ // set to 1 by the --ebofs flag and to 0 by the --fakestore flag
+ int ebofs;
+ bool ebofs_cloneable;
+ bool ebofs_verify;
+ int ebofs_commit_ms;
+ int ebofs_idle_commit_ms;
+ int ebofs_oc_size;
+ int ebofs_cc_size;
+ off_t ebofs_bc_size;
+ off_t ebofs_bc_max_dirty;
+ unsigned ebofs_max_prefetch;
+ bool ebofs_realloc;
+
+ bool ebofs_abp_zero;
+ size_t ebofs_abp_max_alloc;
+
+ // set to 1 by the --obfs flag
+ int uofs;
+ int uofs_fake_sync;
+ int uofs_cache_size;
+ int uofs_onode_size;
+ int uofs_small_block_size;
+ int uofs_large_block_size;
+ int uofs_segment_size;
+ int uofs_block_meta_ratio;
+ int uofs_sync_write;
+
+ int uofs_nr_hash_buckets;
+ int uofs_flush_interval;
+ int uofs_min_flush_pages;
+ int uofs_delay_allocation;
+
+ // block device
+ bool bdev_lock;
+ int bdev_iothreads;
+ int bdev_idle_kick_after_ms;
+ int bdev_el_fw_max_ms;
+ int bdev_el_bw_max_ms;
+ bool bdev_el_bidir;
+ int bdev_iov_max;
+ bool bdev_debug_check_io_overlap;
+ int bdev_fake_mb;
+ int bdev_fake_max_mb;
+
+ // fake client
+ int num_fakeclient;
+ unsigned fakeclient_requests;
+ bool fakeclient_deterministic; // debug flag
+
+ // fakeclient_op_*: one integer per operation type; no command-line option
+ // for them is visible in parse_config_options()
+ int fakeclient_op_statfs;
+
+ int fakeclient_op_stat;
+ int fakeclient_op_lstat;
+ int fakeclient_op_utime;
+ int fakeclient_op_chmod;
+ int fakeclient_op_chown;
+
+ int fakeclient_op_readdir;
+ int fakeclient_op_mknod;
+ int fakeclient_op_link;
+ int fakeclient_op_unlink;
+ int fakeclient_op_rename;
+
+ int fakeclient_op_mkdir;
+ int fakeclient_op_rmdir;
+ int fakeclient_op_symlink;
+
+ int fakeclient_op_openrd;
+ int fakeclient_op_openwr;
+ int fakeclient_op_openwrc;
+ int fakeclient_op_read;
+ int fakeclient_op_write;
+ int fakeclient_op_truncate;
+ int fakeclient_op_fsync;
+ int fakeclient_op_close;
+
+};
+
+// g_conf holds the live settings. g_debug_after_conf is overwritten with a
+// copy of g_conf when --debug_after is parsed and then receives any later
+// --debug* options; the osd main copies its debug levels back into g_conf
+// from a timer callback.
+extern md_config_t g_conf;
+extern md_config_t g_debug_after_conf;
+
+#define dout(x) if ((x) <= g_conf.debug) std::cout
+#define dout2(x) if ((x) <= g_conf.debug) std::cout
+
+// Helpers for shuffling command-line arguments between argc/argv form and a
+// vector of C strings.
+// env_to_vec: presumably appends arguments taken from the environment --
+//   definition not shown here, confirm in config.cc.
+// argv_to_vec: the osd main calls it as argv_to_vec(argc, argv, args) to obtain
+//   the vector it then passes to parse_config_options().
+// vec_to_argv: builds a malloc()ed argv whose slot 0 is a placeholder program
+//   name, followed by every element of args; argc receives the resulting count.
+void env_to_vec(std::vector<char*>& args);
+void argv_to_vec(int argc, char **argv,
+ std::vector<char*>& args);
+void vec_to_argv(std::vector<char*>& args,
+ int& argc, char **&argv);
+
+// Consumes the options it recognises (with their values) from args and stores
+// them in g_conf and related globals; unrecognised arguments are left in args.
+void parse_config_options(std::vector<char*>& args);
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+
+#include "osd/OSD.h"
+#include "ebofs/Ebofs.h"
+
+#include "msg/NewMessenger.h"
+
+#include "common/Timer.h"
+
+
+// Callback scheduled by main() when --kill_after is set: announces itself on
+// stderr and terminates the whole process with status 1.
+class C_Die : public Context {
+public:
+  void finish(int /*unused*/)
+  {
+    cerr << "die" << endl;
+    exit(1);
+  }
+};
+
+// Callback scheduled by main() when --debug_after is set: replaces the live
+// debug levels in g_conf with the ones collected in g_debug_after_conf.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    // The debug levels are the members from 'debug' up to (not including)
+    // 'debug_after'.  Measure that span in BYTES: subtracting the int*
+    // addresses directly yields the number of ints, which made memcpy copy
+    // only a fraction of the block.
+    size_t size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+/*
+ * osd daemon entry point.
+ *
+ *   --dev <device>   passed to the object store and to the OSD
+ *   --osd <id>       osd number; when absent (or negative) the id is read from
+ *                    the superblock object stored on the device
+ *
+ * Generic options are consumed by parse_config_options() first; anything else
+ * is rejected.  Returns -1 on a bad command line, 0 after the messenger has
+ * shut down.
+ */
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+
+  if (g_conf.kill_after)
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after)
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+
+  char *dev = 0;     // stays null when --dev is not given (was left uninitialized)
+  int whoami = -1;
+  for (unsigned i=0; i<args.size(); i++) {
+    const bool is_dev = strcmp(args[i],"--dev") == 0;
+    const bool is_osd = strcmp(args[i],"--osd") == 0;
+    if (!is_dev && !is_osd) {
+      cerr << "unrecognized arg " << args[i] << endl;
+      return -1;
+    }
+    // both options take a value; don't read past the end of args
+    if (i+1 >= args.size()) {
+      cerr << "missing value for arg " << args[i] << endl;
+      return -1;
+    }
+    if (is_dev)
+      dev = args[++i];
+    else
+      whoami = atoi(args[++i]);
+  }
+  cout << "dev " << (dev ? dev : "(none)") << endl;
+
+
+  if (whoami < 0) {
+    // who am i?  peek at superblock!
+    OSDSuperblock sb;
+    ObjectStore *store = new Ebofs(dev);
+    bufferlist bl;
+    store->mount();
+    int r = store->read(object_t(0,0), 0, sizeof(sb), bl);
+    if (r < 0) {
+      cerr << "couldn't read superblock object on " << (dev ? dev : "(none)") << endl;
+      exit(1);   // this is a failure; don't report success to the caller
+    }
+    bl.copy(0, sizeof(sb), (char*)&sb);
+    store->umount();
+    delete store;
+    whoami = sb.whoami;
+
+    cout << "osd fs says i am osd" << whoami << endl;
+  } else {
+    cout << "command line arg says i am osd" << whoami << endl;
+  }
+
+  // load monmap
+  MonMap monmap;
+  int r = monmap.read(".ceph_monmap");
+  assert(r >= 0);
+
+  // start up network
+  rank.start_rank();
+
+  // start osd
+  Messenger *m = rank.register_entity(MSG_ADDR_OSD(whoami));
+  assert(m);
+  OSD *osd = new OSD(whoami, m, &monmap, dev);
+  osd->init();
+
+  // wait
+  rank.wait();
+
+  // done
+  delete osd;
+
+  return 0;
+}
+
--- /dev/null
+#ifndef __crush_BINARYTREE_H
+#define __crush_BINARYTREE_H
+
+#include <cassert>
+#include <iostream>
+#include <map>
+#include <vector>
+//#include <set>
+using namespace std;
+
+#include "include/buffer.h"
+
+namespace crush {
+
+  /*
+   * Binary tree stored implicitly in flat arrays indexed by node number.
+   *
+   * Node numbering: odd numbers are leaves; the height of a node is the
+   * number of trailing zero bits in its number.  The children of node n at
+   * height h are n -/+ 2^(h-1), its parent is n +/- 2^h.  Node 0 is never
+   * used; root_node == 0 means the tree is empty.
+   *
+   * For every node the tree tracks how many leaves live beneath it
+   * (node_nested, nonzero == node exists), their summed weight (node_weight)
+   * and whether the subtree is full (node_complete).
+   */
+  class BinaryTree {
+  private:
+    // tree def
+    int root_node;               // 0 for empty tree.
+    int alloc;                   // node numbers [0, alloc) have array slots
+    vector<int> node_nested;     // leaf count below each node; 0 == absent
+    vector<float> node_weight;   // summed weight below each node
+    vector<int> node_complete;   // nonzero only for nodes with all possible children
+
+  public:
+    BinaryTree() : root_node(0), alloc(0) {}
+
+    // Append root_node, alloc and the three per-node arrays to bl.
+    void _encode(bufferlist& bl) {
+      bl.append((char*)&root_node, sizeof(root_node));
+      bl.append((char*)&alloc, sizeof(alloc));
+      ::_encode(node_nested, bl);
+      ::_encode(node_weight, bl);
+      ::_encode(node_complete, bl);
+    }
+    // Inverse of _encode: read the same fields from bl at off, advancing off.
+    void _decode(bufferlist& bl, int& off) {
+      bl.copy(off, sizeof(root_node), (char*)&root_node);
+      off += sizeof(root_node);
+      bl.copy(off, sizeof(alloc), (char*)&alloc);
+      off += sizeof(alloc);
+      ::_decode(node_nested, bl, off);
+      ::_decode(node_weight, bl, off);
+      ::_decode(node_complete, bl, off);
+    }
+
+    // accessors -- all of them answer 0/false for a node that does not exist
+    bool empty() const { return root_node == 0; }
+    bool exists(int n) const {
+      if (n >= alloc) return false;
+      return node_nested[n] != 0;
+    }
+    int nested(int n) const {
+      if (!exists(n)) return 0;
+      return node_nested[n];
+    }
+    float weight(int n) const {
+      if (!exists(n)) return 0;
+      return node_weight[n];
+    }
+    bool complete(int n) const { return exists(n) && node_complete[n]; }
+
+    int root() const { return root_node; }
+
+    // Make sure node number n has a (zeroed) slot in every array.
+    void realloc(int n) {
+      if (n < alloc) return;
+      const int grow = n - alloc + 1;
+      node_nested.insert(node_nested.end(), grow, 0);
+      node_weight.insert(node_weight.end(), grow, 0);
+      node_complete.insert(node_complete.end(), grow, 0);
+      alloc = n+1;
+    }
+
+    // tree navigation
+    bool terminal(int n) const { return (n & 1) != 0; }  // odd nodes are leaves.
+    int height(int n) const {
+      assert(n);
+      int levels = 0;
+      for (; (n & 1) == 0; n >>= 1) {
+        assert(n > 0);
+        ++levels;
+      }
+      return levels;
+    }
+    // left/right child of an interior node (height >= 1)
+    int left(int n) const {
+      const int half = 1 << (height(n) - 1);
+      return n - half;
+    }
+    int right(int n) const {
+      const int half = 1 << (height(n) - 1);
+      return n + half;
+    }
+    // true when n is the right-hand child of its parent; h may pass in a
+    // height that the caller has already computed.
+    bool on_right(int n, int h = -1) const {
+      const int level = (h < 0) ? height(n) : h;
+      return (n & (1 << (level + 1))) != 0;
+    }
+    bool on_left(int n) const { return !on_right(n); }
+    int parent(int n) const {
+      const int h = height(n);
+      const int step = 1 << h;
+      return on_right(n, h) ? n - step : n + step;
+    }
+
+    // modifiers
+
+    // Add w to the weight of node n and of every ancestor up to the root.
+    void adjust_node_weight(int n, float w) {
+      assert(exists(n));
+      node_weight[n] += w;
+
+      for (int cur = n; cur != root_node; ) {
+        cur = parent(cur);
+        node_weight[cur] += w;
+      }
+    }
+
+    // Remove node n, fix up its ancestors, and pull the root down while it
+    // has an empty side.
+    void remove_node(int n) {
+      assert(exists(n));
+
+      // erase node
+      node_nested[n] = 0;
+      node_weight[n] = 0;
+
+      // adjust parents (!complete, -weight)
+      for (int cur = n; cur != root_node; ) {
+        cur = parent(cur);
+
+        node_complete[cur] = 0;
+        node_weight[cur] = weight(left(cur)) + weight(right(cur));
+        node_nested[cur]--;
+
+        if (nested(cur) == 0) {
+          node_weight[cur] = 0;
+          node_nested[cur] = 0;
+        }
+      }
+
+      // hose root?  descend while one side of the root is empty.
+      while (!terminal(root_node)) {
+        const int lchild = left(root_node);
+        const int rchild = right(root_node);
+        const bool left_empty = nested(lchild) == 0;
+        if (!left_empty && nested(rchild) != 0)
+          break;   // both sides populated, root stays
+
+        node_weight[root_node] = 0;
+        node_nested[root_node] = 0;
+        root_node = left_empty ? rchild : lchild;
+      }
+
+      // ended on a leaf that holds nothing?  then the tree is empty.
+      if (terminal(root_node) && nested(root_node) == 0) {
+        node_weight[root_node] = 0;
+        node_nested[root_node] = 0;
+        root_node = 0;
+      }
+    }
+
+    // Add a leaf, always growing a new root first (unless the tree is empty).
+    int add_node_root(float w) {
+      return add_node(w, true);
+    }
+
+    // Add a leaf of weight w and return its node number.  A new root is grown
+    // when the tree is complete or force_root is set; otherwise the leaf goes
+    // into the first incomplete subtree found walking down from the root,
+    // preferring the left side.
+    int add_node(float w, bool force_root=false) {
+      int n;
+      if (!root_node) {
+        // empty tree!
+        root_node = n = 1;
+      }
+      else if (force_root || complete(root_node)) {
+        // expand: the old root becomes one child of a new root
+        const int newroot = parent(root_node);
+        realloc(newroot);
+        node_weight[newroot] = node_weight[root_node];
+        node_nested[newroot] = nested(root_node);
+
+        // start on the side the old root does not occupy...
+        n = (left(newroot) == root_node) ? right(newroot) : left(newroot);
+        root_node = newroot;
+
+        // ...then go left until terminal
+        while (!terminal(n))
+          n = left(n);
+      }
+      else {
+        // tree isn't complete.  (complete() is false for absent nodes too.)
+        n = root_node;
+        while (!terminal(n)) {
+          if (!complete(left(n))) {
+            n = left(n);
+          } else {
+            assert(!complete(right(n)));
+            n = right(n);
+          }
+        }
+      }
+
+      // create at n
+      realloc(n);
+      node_weight[n] = w;
+      node_nested[n] = 1;
+      node_complete[n] = 1;
+
+      // ancestors: create, adjust weight, complete as appropriate
+      for (int cur = n; cur != root_node; ) {
+        cur = parent(cur);
+        realloc(cur);
+
+        // complete?
+        if (!complete(cur) &&
+            complete(left(cur)) &&
+            complete(right(cur)))
+          node_complete[cur] = 1;
+
+        // weight (and implicitly create)
+        node_weight[cur] += w;
+        node_nested[cur]++;
+      }
+
+      return n;
+    }
+
+  };
+
+
+ // print it out
+  // Print node n of tree and, recursively, its existing children.  i is the
+  // number of indent strings written in front of the line; children are
+  // printed with i+2.
+  inline void print_binary_tree_node(ostream& out, const BinaryTree& tree, int n, int i) {
+    for (int pad = 0; pad < i; ++pad)
+      out << " ";
+
+    // on_left() is only consulted for non-root nodes
+    const char *label;
+    if (tree.root() == n)
+      label = "root ";
+    else
+      label = tree.on_left(n) ? "left " : "right ";
+    out << label;
+
+    out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n);
+    if (tree.complete(n)) out << " complete";
+    out << endl;
+
+    if (tree.terminal(n))
+      return;   // leaves have no children to descend into
+
+    if (tree.exists(tree.left(n)))
+      print_binary_tree_node(out, tree, tree.left(n), i+2);
+    if (tree.exists(tree.right(n)))
+      print_binary_tree_node(out, tree, tree.right(n), i+2);
+  }
+
+  // Stream a whole tree: a fixed message when empty, otherwise one line per
+  // node starting at the root.
+  inline ostream& operator<<(ostream& out, const BinaryTree& tree) {
+    if (!tree.empty()) {
+      print_binary_tree_node(out, tree, tree.root(), 0);
+      return out;
+    }
+    out << "tree is empty";
+    return out;
+  }
+
+}
+
+#endif
--- /dev/null
+#ifndef __crush_BUCKET_H
+#define __crush_BUCKET_H
+
+#include "BinaryTree.h"
+#include "Hash.h"
+
+#include <list>
+#include <vector>
+#include <map>
+#include <set>
+using namespace std;
+
+#include <math.h>
+
+#include "include/buffer.h"
+
+namespace crush {
+
+
+ // Bucket kind tags. UniformBucket::_encode and ListBucket::_encode write
+ // their tag, narrowed to a single char, as the first byte of the encoding.
+ const int CRUSH_BUCKET_UNIFORM = 1;
+ const int CRUSH_BUCKET_TREE = 2;
+ const int CRUSH_BUCKET_LIST = 3;
+ const int CRUSH_BUCKET_STRAW = 4;
+
+  /**
+   * abstract bucket
+   *
+   * Common state of every bucket kind (id, parent, type, total weight) plus
+   * the interface the concrete kinds implement: item bookkeeping, choose_r()
+   * and _encode().
+   **/
+  class Bucket {
+  protected:
+    int id;
+    int parent;
+    int type;
+    float weight;
+
+  public:
+    // New bucket of the given type and total weight; id and parent start at 0.
+    Bucket(int bucket_type, float total_weight)
+      : id(0),
+        parent(0),
+        type(bucket_type),
+        weight(total_weight)
+    { }
+
+    // Decode id, parent, type and weight (raw bytes, in that order) from bl
+    // starting at off; off is advanced past them.
+    // NOTE(review): the subclasses' _encode() emits a one-byte kind tag before
+    // these fields, so the caller presumably consumes that tag first -- confirm.
+    Bucket(bufferlist& bl, int& off) {
+      bl.copy(off, sizeof(id), (char*)&id);
+      off += sizeof(id);
+
+      bl.copy(off, sizeof(parent), (char*)&parent);
+      off += sizeof(parent);
+
+      bl.copy(off, sizeof(type), (char*)&type);
+      off += sizeof(type);
+
+      bl.copy(off, sizeof(weight), (char*)&weight);
+      off += sizeof(weight);
+    }
+
+    virtual ~Bucket() { }
+
+    virtual const char *get_bucket_type() const = 0;
+    virtual bool is_uniform() const = 0;
+
+    // plain accessors
+    int get_id() const { return id; }
+    int get_type() const { return type; }
+    float get_weight() const { return weight; }
+    int get_parent() const { return parent; }
+    virtual int get_size() const = 0;
+
+    void set_id(int i) { id = i; }
+    void set_parent(int p) { parent = p; }
+    void set_weight(float w) { weight = w; }
+
+    // item bookkeeping
+    virtual void get_items(vector<int>& i) const = 0;
+    virtual float get_item_weight(int item) const = 0;
+    virtual void add_item(int item, float w, bool back=false) = 0;
+    virtual void adjust_item_weight(int item, float w) = 0;
+    // default: express "set to w" as an adjustment by the difference
+    virtual void set_item_weight(int item, float w) {
+      const float delta = w - get_item_weight(item);
+      adjust_item_weight(item, delta);
+    }
+
+    virtual int choose_r(int x, int r, Hash& h) const = 0;
+
+    virtual void _encode(bufferlist& bl) = 0;
+  };
+
+
+
+
+  /**
+   * uniform bucket
+   *
+   * Every item has the same weight (item_weight); the bucket weight is kept
+   * as item_weight times the number of items.  choose_r() picks an item by
+   * stepping through the item list with a stride taken from a per-bucket
+   * table of primes.
+   **/
+  class UniformBucket : public Bucket {
+  public:
+    vector<int> items;
+    int item_type;
+    float item_weight;
+
+    // primes
+    vector<unsigned> primes;
+
+    // j-th entry of the prime table, wrapping around its size.
+    int get_prime(int j) const {
+      return primes[ j % primes.size() ];
+    }
+
+    // Rebuild the prime table: as many primes as there are items, starting
+    // from an odd number above items.size() whose exact value is derived from
+    // a hash seeded with the bucket id.  Does nothing for an empty bucket.
+    void make_primes() {
+      if (items.empty()) return;
+
+      //cout << "make_primes " << get_id() << " " << items.size() << endl;
+      Hash h(123+get_id());
+      primes.clear();
+
+      // start with odd number > num_items
+      unsigned x = items.size() + 1;             // this is the minimum!
+      x += h(items.size()) % (3*items.size());   // bump it up some
+      x |= 1;                                    // make it odd
+
+      while (primes.size() < items.size()) {
+        // trial division: x is prime when no j with j*j <= x divides it
+        unsigned j;
+        for (j=2; j*j<=x; j++)
+          if (x % j == 0) break;
+        if (j*j > x) {
+          primes.push_back(x);
+          //cout << "prime " << x << endl;
+        }
+        x += 2;
+      }
+    }
+
+  public:
+    // Empty bucket.  item_weight starts at 0 (it used to be left
+    // uninitialized, yet get_item_weight() and _encode() read it); the first
+    // add_item() call sets the real per-item weight.
+    UniformBucket(int _type, int _item_type) :
+      Bucket(_type, 0),
+      item_type(_item_type),
+      item_weight(0) { }
+
+    // Bucket pre-filled with _items, each weighing _item_weight.
+    UniformBucket(int _type, int _item_type,
+                  float _item_weight, vector<int>& _items) :
+      Bucket(_type, _item_weight*_items.size()),
+      item_type(_item_type),
+      item_weight(_item_weight) {
+      items = _items;
+      make_primes();
+    }
+
+    // Decode: base-class fields, then item_type, item_weight and the item
+    // list; the prime table is not stored and is rebuilt here.
+    UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+      bl.copy(off, sizeof(item_type), (char*)&item_type);
+      off += sizeof(item_type);
+      bl.copy(off, sizeof(item_weight), (char*)&item_weight);
+      off += sizeof(item_weight);
+      ::_decode(items, bl, off);
+      make_primes();
+    }
+
+    // Encode: one-byte kind tag, base-class fields, item_type, item_weight, items.
+    void _encode(bufferlist& bl) {
+      char t = CRUSH_BUCKET_UNIFORM;
+      bl.append((char*)&t, sizeof(t));
+      bl.append((char*)&id, sizeof(id));
+      bl.append((char*)&parent, sizeof(parent));
+      bl.append((char*)&type, sizeof(type));
+      bl.append((char*)&weight, sizeof(weight));
+
+      bl.append((char*)&item_type, sizeof(item_type));
+      bl.append((char*)&item_weight, sizeof(item_weight));
+
+      ::_encode(items, bl);
+    }
+
+    const char *get_bucket_type() const { return "uniform"; }
+    bool is_uniform() const { return true; }
+
+    int get_size() const { return items.size(); }
+
+    // items
+    void get_items(vector<int>& i) const {
+      i = items;
+    }
+    int get_item_type() const { return item_type; }
+    // same weight for every item; the argument is ignored
+    float get_item_weight(int item) const { return item_weight; }
+
+    // Append item.  Only the first item's w is used (it becomes item_weight);
+    // later items get that same weight regardless of w, and 'back' is ignored.
+    void add_item(int item, float w, bool back=false) {
+      if (items.empty())
+        item_weight = w;
+      items.push_back(item);
+      weight += item_weight;
+      make_primes();
+    }
+
+    // per-item weights cannot be changed in a uniform bucket
+    void adjust_item_weight(int item, float w) {
+      assert(0);
+    }
+
+    // Choose the item for input x, replica rank r.
+    int choose_r(int x, int r, Hash& hash) const {
+      // an empty bucket has nothing to choose from (and no primes); fail
+      // loudly instead of dividing by zero below
+      assert(!items.empty());
+
+      unsigned v = hash(x, get_id());
+      unsigned p = get_prime( hash(get_id(), x) );  // choose a prime based on hash(get_id(), x)
+      unsigned s = (x + v + (r+1)*p) % get_size();
+      return items[s];
+    }
+
+  };
+
+
+
+
+
+ // list bucket.. RUSH_P sorta
+
+ // Bucket that keeps its items in a linked list, described by three parallel
+ // lists that are always the same length:
+ //   items       - item ids, in list order
+ //   item_weight - weight of the item at the same position
+ //   sum_weight  - weight of that item plus every item after it in the list,
+ //                 so the front entry equals the total bucket weight
+ class ListBucket : public Bucket {
+ protected:
+   list<int>   items;
+   list<float> item_weight;
+   list<float> sum_weight;
+
+ public:
+   ListBucket(int _type) : Bucket(_type, 0) { }
+
+   // Decoding constructor; reads the three lists in the order _encode wrote them.
+   ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+     ::_decode(items, bl, off);
+     ::_decode(item_weight, bl, off);
+     ::_decode(sum_weight, bl, off);
+   }
+
+   // Append this bucket's encoding to bl: one-byte kind tag, then the raw
+   // bytes of id, parent, type and weight, then the three lists.
+   void _encode(bufferlist& bl) {
+     char t = CRUSH_BUCKET_LIST;
+     bl.append((char*)&t, sizeof(t));
+     bl.append((char*)&id, sizeof(id));
+     bl.append((char*)&parent, sizeof(parent));
+     bl.append((char*)&type, sizeof(type));
+     bl.append((char*)&weight, sizeof(weight));
+
+     ::_encode(items, bl);
+     ::_encode(item_weight, bl);
+     ::_encode(sum_weight, bl);
+   }
+
+   const char *get_bucket_type() const { return "list"; }
+   bool is_uniform() const { return false; }
+
+   int get_size() const { return items.size(); }
+
+   // Appends every item id to i, in list order (i is not cleared first).
+   void get_items(vector<int>& i) const {
+     for (list<int>::const_iterator it = items.begin();
+          it != items.end();
+          ++it)
+       i.push_back(*it);
+   }
+
+   // Weight of the given item; asserts (and returns 0) if it is not in the bucket.
+   float get_item_weight(int item) const {
+     list<int>::const_iterator i = items.begin();
+     list<float>::const_iterator w = item_weight.begin();
+     for (; i != items.end(); ++i, ++w)
+       if (*i == item) return *w;
+     assert(0);
+     return 0;
+   }
+
+   // Add item with weight w at the head of the list (default) or, when back
+   // is true, at the tail.
+   void add_item(int item, float w, bool back=false) {
+     if (back) {
+       items.push_back(item);
+       item_weight.push_back(w);
+
+       // a new tail changes every suffix sum, so rebuild them all
+       sum_weight.clear();
+       float s = 0.0;
+       for (list<float>::reverse_iterator i = item_weight.rbegin();
+            i != item_weight.rend();
+            ++i) {
+         s += *i;
+         sum_weight.push_front(s);
+       }
+
+       // Take the freshly summed total as the bucket weight.  The previous
+       // code did `weight += w` and asserted exact equality with s, but the
+       // two floats are accumulated in different orders and can differ by
+       // rounding; assigning keeps weight == sum_weight.front() by construction.
+       weight = s;
+     } else {
+       items.push_front(item);
+       item_weight.push_front(w);
+       weight += w;
+       sum_weight.push_front(weight);   // new head's suffix sum is the new total
+     }
+   }
+
+   // Change the weight of item by dw, keeping the suffix sums and the bucket
+   // weight in step.  Asserts if item is not in the bucket; in that case
+   // nothing is modified.
+   void adjust_item_weight(int item, float dw) {
+     // make sure the item exists before touching any sums
+     bool found = false;
+     for (list<int>::const_iterator q = items.begin(); q != items.end(); ++q)
+       if (*q == item) {
+         found = true;
+         break;
+       }
+     assert(found);
+     if (!found) return;
+
+     list<int>::iterator p = items.begin();
+     list<float>::iterator pw = item_weight.begin();
+     list<float>::iterator ps = sum_weight.begin();
+
+     // every entry ahead of the item counts it in its suffix sum
+     while (*p != item) {
+       *ps += dw;
+       ++p; ++pw; ++ps;   // next!
+     }
+
+     *pw += dw;
+     *ps += dw;
+
+     // the bucket total changes too (as TreeBucket and StrawBucket do);
+     // a later head insert pushes `weight` as the new front suffix sum
+     weight += dw;
+   }
+
+
+   // Walk the list from the head.  At each item draw f in [0, tw), where tw
+   // is that item's suffix sum, and accept the item when f < its own weight.
+   // Asserts if the walk falls off the end without accepting anything.
+   int choose_r(int x, int r, Hash& h) const {
+     //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl;
+
+     list<int>::const_iterator p = items.begin();
+     list<float>::const_iterator pw = item_weight.begin();
+     list<float>::const_iterator ps = sum_weight.begin();
+
+     while (p != items.end()) {
+       const int item = *p;
+       const float iw = *pw;
+       const float tw = *ps;
+       const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0;
+       //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl;
+       if (f < iw) {
+         //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl;
+         return item;
+       }
+       ++p; ++pw; ++ps;   // next!
+     }
+     assert(0);
+     return 0;
+   }
+
+
+ };
+
+
+
+
+ // mixed bucket, based on RUSH_T type binary tree
+
+ // Bucket that stores its items as the nodes of a weighted BinaryTree and
+ // selects by descending from the root.
+ class TreeBucket : public Bucket {
+ protected:
+   //vector<float> item_weight;
+
+   // public:
+   BinaryTree tree;
+   map<int,int> node_item;      // node id -> item
+   vector<int> node_item_vec;   // fast version of above
+   map<int,int> item_node;      // item -> node id
+   map<int,float> item_weight;
+
+ public:
+   TreeBucket(int _type) : Bucket(_type, 0) { }
+
+   // Decoding constructor; reads the tree and the four tables in the order
+   // _encode wrote them.
+   TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+     tree._decode(bl, off);
+
+     ::_decode(node_item, bl, off);
+     ::_decode(node_item_vec, bl, off);
+     ::_decode(item_node, bl, off);
+     ::_decode(item_weight, bl, off);
+   }
+
+   // Append this bucket's encoding to bl: one-byte kind tag, the raw bytes of
+   // id, parent, type and weight, the tree, then the four tables.
+   void _encode(bufferlist& bl) {
+     char t = CRUSH_BUCKET_TREE;
+     bl.append((char*)&t, sizeof(t));
+     bl.append((char*)&id, sizeof(id));
+     bl.append((char*)&parent, sizeof(parent));
+     bl.append((char*)&type, sizeof(type));
+     bl.append((char*)&weight, sizeof(weight));
+
+     tree._encode(bl);
+
+     ::_encode(node_item, bl);
+     ::_encode(node_item_vec, bl);
+     ::_encode(item_node, bl);
+     ::_encode(item_weight, bl);
+   }
+
+   const char *get_bucket_type() const { return "tree"; }
+   bool is_uniform() const { return false; }
+
+   int get_size() const { return node_item.size(); }
+
+   // items
+   // Appends every item id to i, ordered by node id (i is not cleared first).
+   void get_items(vector<int>& i) const {
+     for (map<int,int>::const_iterator it = node_item.begin();
+          it != node_item.end();
+          ++it)
+       i.push_back(it->second);
+   }
+
+   // Weight of item i.  Asserts if the item is unknown (and yields 0 when
+   // asserts are compiled out).  Uses find() so the lookup works on a const
+   // object without copying the whole map, which the earlier cast-and-index
+   // form did on every call.
+   float get_item_weight(int i) const {
+     map<int,float>::const_iterator p = item_weight.find(i);
+     assert(p != item_weight.end());
+     if (p == item_weight.end()) return 0;
+     return p->second;
+   }
+
+
+   // Add item with weight w as a new tree node.  The back flag is not used by
+   // this bucket type.
+   void add_item(int item, float w, bool back=false) {
+     item_weight[item] = w;
+     weight += w;
+
+     unsigned n = tree.add_node(w);
+     node_item[n] = item;
+     item_node[item] = n;
+
+     // grow the flat lookup table (padding with 0) until slot n exists
+     while (node_item_vec.size() <= n)
+       node_item_vec.push_back(0);
+     node_item_vec[n] = item;
+   }
+
+   // Change the weight of item by dw in the bucket total, the per-item table
+   // and the tree.
+   void adjust_item_weight(int item, float dw) {
+     // adjust my weight
+     weight += dw;
+     item_weight[item] += dw;
+
+     // adjust tree weights
+     tree.adjust_node_weight(item_node[item], dw);
+   }
+
+   // Descend from the root.  At each inner node draw f in [0, w), where w is
+   // the node's weight, and go left when the left child exists and f is
+   // below its weight; otherwise go right.  Returns the item stored at the
+   // terminal node reached.
+   int choose_r(int x, int r, Hash& h) const {
+     //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl;
+     int n = tree.root();
+     while (!tree.terminal(n)) {
+       // pick a point in [0,w)
+       float w = tree.weight(n);
+       float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0;
+
+       // left or right?
+       int l = tree.left(n);
+       if (tree.exists(l) &&
+           f < tree.weight(l))
+         n = l;
+       else
+         n = tree.right(n);
+     }
+     //assert(node_item.count(n));
+     //return ((map<int,int>)node_item)[n];
+     return node_item_vec[n];
+   }
+ };
+
+
+
+
+
+ // straw bucket.. new thing!
+
+ // Bucket in which every item draws a hashed "straw" scaled by a per-item
+ // straw length; the item with the longest draw is chosen.
+ class StrawBucket : public Bucket {
+ protected:
+   map<int, float> item_weight;   // item -> weight (the only table that is encoded)
+   map<int, float> item_straw;    // item -> straw length, derived by calc_straws()
+
+   // the same (item, straw length) pairs flattened into the order choose_r scans
+   list<int> _items;
+   list<float> _straws;
+
+ public:
+   StrawBucket(int _type) : Bucket(_type, 0) { }
+
+   // Decoding constructor: only item_weight is stored, so the straw lengths
+   // are recomputed after reading it.
+   StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) {
+     ::_decode(item_weight, bl, off);
+     calc_straws();
+   }
+
+   // Append this bucket's encoding to bl: one-byte kind tag, the raw bytes of
+   // id, parent, type and weight, then item_weight.  The tag must be
+   // CRUSH_BUCKET_STRAW: decode_bucket() dispatches on it, and the
+   // CRUSH_BUCKET_TREE tag written here before made a straw bucket decode as
+   // a TreeBucket.
+   void _encode(bufferlist& bl) {
+     char t = CRUSH_BUCKET_STRAW;
+     bl.append((char*)&t, sizeof(t));
+     bl.append((char*)&id, sizeof(id));
+     bl.append((char*)&parent, sizeof(parent));
+     bl.append((char*)&type, sizeof(type));
+     bl.append((char*)&weight, sizeof(weight));
+
+     ::_encode(item_weight, bl);
+   }
+
+   const char *get_bucket_type() const { return "straw"; }
+   bool is_uniform() const { return false; }
+
+   int get_size() const { return item_weight.size(); }
+
+
+   // items
+   // Appends every item id to i in ascending id order (i is not cleared first).
+   void get_items(vector<int>& i) const {
+     for (map<int,float>::const_iterator it = item_weight.begin();
+          it != item_weight.end();
+          ++it)
+       i.push_back(it->first);
+   }
+
+   // Weight of item.  Asserts if the item is unknown (and yields 0 when
+   // asserts are compiled out).  Uses find() instead of copying the whole map
+   // just to index it from a const method.
+   float get_item_weight(int item) const {
+     map<int,float>::const_iterator p = item_weight.find(item);
+     assert(p != item_weight.end());
+     if (p == item_weight.end()) return 0;
+     return p->second;
+   }
+
+   // Set item's weight to w and recompute all straws.  The back flag is not
+   // used by this bucket type.
+   void add_item(int item, float w, bool back=false) {
+     item_weight[item] = w;
+     weight += w;
+     calc_straws();
+   }
+
+   // Change item's weight by dw and recompute all straws.
+   void adjust_item_weight(int item, float dw) {
+     //cout << "adjust " << item << " " << dw << endl;
+     weight += dw;
+     item_weight[item] += dw;
+     calc_straws();
+   }
+
+
+   /* calculate straw lengths.
+      this is kind of ugly.  not sure if there's a closed form way to calculate this or not!
+
+      Items are grouped by weight and visited from the lightest group to the
+      heaviest; every item in a group gets the same straw length, and the
+      length grows from one group to the next.  Items with weight <= 0 get no
+      straw and are therefore never returned by choose_r.
+   */
+   void calc_straws() {
+     //cout << get_id() << ": calc_straws ============" << endl;
+
+     item_straw.clear();
+     _items.clear();
+     _straws.clear();
+
+     // group items by weight (ascending); skip zero weight items
+     map<float, set<int> > reverse;
+     for (map<int, float>::iterator p = item_weight.begin();
+          p != item_weight.end();
+          ++p) {
+       //cout << get_id() << ":" << p->first << " " << p->second << endl;
+       if (p->second > 0) {
+         //p->second /= minw;
+         reverse[p->second].insert(p->first);
+       }
+     }
+
+     // Nothing with positive weight (e.g. a freshly decoded empty bucket):
+     // there is no first group to visit, so stop before touching begin().
+     if (reverse.empty())
+       return;
+
+     /* 1:2:7
+        item_straw[0] = 1.0;
+        item_straw[1] = item_straw[0]*sqrt(1.0/.6);
+        item_straw[2] = item_straw[1]*2.0;
+     */
+
+     // work from low to high weights
+     float straw = 1.0;
+     float numleft = item_weight.size();
+     float wbelow = 0.0;
+     float lastw = 0.0;
+
+     map<float, set<int> >::iterator cur = reverse.begin();
+     while (1) {
+       //cout << "hi " << cur->first << endl;
+
+       // set straw length for this set
+       for (set<int>::iterator s = cur->second.begin();
+            s != cur->second.end();
+            ++s) {
+         item_straw[*s] = straw;
+         //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl;
+         _items.push_back(*s);
+         _straws.push_back(straw);
+       }
+
+       map<float, set<int> >::iterator next = cur;
+       ++next;
+       if (next == reverse.end()) break;
+
+       wbelow += (cur->first-lastw) * numleft;
+       //cout << "wbelow " << wbelow << endl;
+
+       numleft -= 1.0 * (float)cur->second.size();
+       //cout << "numleft now " << numleft << endl;
+
+       float wnext = numleft * (next->first - cur->first);
+       //cout << "wnext " << wnext << endl;
+
+       float pbelow = wbelow / (wbelow+wnext);
+       //cout << "pbelow " << pbelow << endl;
+
+       straw *= pow((double)(1.0/pbelow), (double)1.0/numleft);
+
+       lastw = cur->first;
+       cur = next;
+     }
+     //cout << "============" << endl;
+   }
+
+   // Give every item with a straw a draw of (straw length * rnd), rnd being a
+   // hash of (x, item, r) scaled into [0, 1), and return the item with the
+   // largest draw.  Returns 0 when no item has a straw.
+   int choose_r(int x, int r, Hash& h) const {
+     //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl;
+
+     float high_draw = -1;
+     int high = 0;
+
+     list<int>::const_iterator pi = _items.begin();
+     list<float>::const_iterator ps = _straws.begin();
+     while (pi != _items.end()) {
+       const int item = *pi;
+       const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0;
+       const float straw = *ps * rnd;
+
+       if (high_draw < 0 ||
+           straw > high_draw) {
+         high = *pi;
+         high_draw = straw;
+       }
+
+       ++pi;
+       ++ps;
+     }
+     return high;
+   }
+ };
+
+
+
+
+
+ // Read the one-byte bucket kind tag at off, advance off past it, and hand
+ // the rest of the encoding to the matching bucket's decoding constructor.
+ // The caller receives a heap-allocated bucket.  An unrecognised tag asserts
+ // and yields a null pointer.
+ inline Bucket* decode_bucket(bufferlist& bl, int& off) {
+   char kind;
+   bl.copy(off, sizeof(kind), (char*)&kind);
+   off += sizeof(kind);
+
+   if (kind == CRUSH_BUCKET_UNIFORM)
+     return new UniformBucket(bl, off);
+   if (kind == CRUSH_BUCKET_LIST)
+     return new ListBucket(bl, off);
+   if (kind == CRUSH_BUCKET_TREE)
+     return new TreeBucket(bl, off);
+   if (kind == CRUSH_BUCKET_STRAW)
+     return new StrawBucket(bl, off);
+
+   assert(0);
+   return 0;
+ }
+
+
+
+}
+
+
+
+
+
+
+
+
+#endif
--- /dev/null
+
+// Robert Jenkins' function for mixing 32-bit values
+// http://burtleburtle.net/bob/hash/evahash.html
+// a, b = random bits, c = input and output
+//
+// All three arguments must be modifiable lvalues: each one is evaluated many
+// times and overwritten.  The body is wrapped in do { } while (0) so that the
+// macro expands to exactly one statement and stays in one piece under an
+// unbraced if/else or loop; invoke it with a trailing semicolon.
+#define hashmix(a,b,c) \
+ do { \
+ a=a-b; a=a-c; a=a^(c>>13); \
+ b=b-c; b=b-a; b=b^(a<<8); \
+ c=c-a; c=c-b; c=c^(b>>13); \
+ a=a-b; a=a-c; a=a^(c>>12); \
+ b=b-c; b=b-a; b=b^(a<<16); \
+ c=c-a; c=c-b; c=c^(b>>5); \
+ a=a-b; a=a-c; a=a^(c>>3); \
+ b=b-c; b=b-a; b=b^(a<<10); \
+ c=c-a; c=c-b; c=c^(b>>15); \
+ } while (0)
+
+namespace crush {
+
+ // Seeded integer hash built on the hashmix macro.  Each operator() overload
+ // folds its arguments and the seed into a 32-bit value and returns it with
+ // the top bit cleared, so results are never negative.
+ // NOTE(review): the mixing runs on signed int operands (subtraction and
+ // shifts); changing any operand type or the order of the hashmix calls
+ // would change the values produced.
+ class Hash {
+ int seed;
+
+ public:
+ int get_seed() { return seed; }
+ // Stores s as the seed directly; unlike the constructor, no mixing is applied.
+ void set_seed(int s) { seed = s; }
+
+ // The two hashmix calls overwrite s in place (it is the first argument of
+ // the first call and the second argument of the second); the mixed s
+ // becomes the seed and the local `hash` value is discarded.
+ Hash(int s) {
+ unsigned int hash = 1315423911;
+ int x = 231232;
+ int y = 1232;
+ hashmix(s, x, hash);
+ hashmix(y, s, hash);
+ seed = s;
+ }
+
+ // Hash of one value.  b is a scratch copy of a for the first round; the
+ // second round passes a itself, which hashmix overwrites.
+ inline int operator()(int a) {
+ unsigned int hash = seed ^ a;
+ int b = a;
+ int x = 231232;
+ int y = 1232;
+ hashmix(b, x, hash);
+ hashmix(y, a, hash);
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // Hash of two values.
+ inline int operator()(int a, int b) {
+ unsigned int hash = seed ^ a ^ b;
+ int x = 231232;
+ int y = 1232;
+ hashmix(a, b, hash);
+ hashmix(x, a, hash);
+ hashmix(b, y, hash);
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // Hash of three values.
+ inline int operator()(int a, int b, int c) {
+ unsigned int hash = seed ^ a ^ b ^ c;
+ int x = 231232;
+ int y = 1232;
+ hashmix(a, b, hash);
+ hashmix(c, x, hash);
+ hashmix(y, a, hash);
+ hashmix(b, x, hash);
+ hashmix(y, c, hash);
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // Hash of four values.
+ inline int operator()(int a, int b, int c, int d) {
+ unsigned int hash = seed ^a ^ b ^ c ^ d;
+ int x = 231232;
+ int y = 1232;
+ hashmix(a, b, hash);
+ hashmix(c, d, hash);
+ hashmix(a, x, hash);
+ hashmix(y, b, hash);
+ hashmix(c, x, hash);
+ hashmix(y, d, hash);
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // Hash of five values.
+ inline int operator()(int a, int b, int c, int d, int e) {
+ unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e;
+ int x = 231232;
+ int y = 1232;
+ hashmix(a, b, hash);
+ hashmix(c, d, hash);
+ hashmix(e, x, hash);
+ hashmix(y, a, hash);
+ hashmix(b, x, hash);
+ hashmix(y, c, hash);
+ hashmix(d, x, hash);
+ hashmix(y, e, hash);
+ return (hash & 0x7FFFFFFF);
+ }
+ };
+
+}
+
+
+
+#if 0
+
+
+ //return myhash(a) ^ seed;
+ return myhash(a, seed);
+ }
+ int operator()(int a, int b) {
+ //return myhash( myhash(a) ^ myhash(b) ^ seed );
+ return myhash(a, b, seed);
+ }
+ int operator()(int a, int b, int c) {
+ //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed );
+ return myhash(a, b, c, seed);
+ }
+ int operator()(int a, int b, int c, int d) {
+ //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed );
+ return myhash(a, b, c, d, seed);
+ }
+
+ // ethan's rush hash?
+ if (0)
+ return (n ^ 0xdead1234) * (884811920 * 3 + 1);
+
+ if (1) {
+
+ // before
+ hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+ hashmix(a, b, hash);
+ n = n >> 8;
+ hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+ hashmix(a, b, hash);
+ n = n >> 8;
+ hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+ hashmix(a, b, hash);
+ n = n >> 8;
+ hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+ hashmix(a, b, hash);
+ n = n >> 8;
+
+ //return hash;
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // JS
+ // a little better than RS
+ // + jenkin's mixing thing (which sucks on its own but helps tons here)
+ // best so far
+ if (1) {
+ unsigned int hash = 1315423911;
+ int a = 231232;
+ int b = 1232;
+
+ for(unsigned int i = 0; i < 4; i++)
+ {
+ hash ^= ((hash << 5) + (n&255) + (hash >> 2));
+ hashmix(a, b, hash);
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+
+ // Robert jenkins' 96 bit mix
+ // sucks
+ if (0) {
+ int c = n;
+ int a = 12378912;
+ int b = 2982827;
+ a=a-b; a=a-c; a=a^(c>>13);
+ b=b-c; b=b-a; b=b^(a<<8);
+ c=c-a; c=c-b; c=c^(b>>13);
+ a=a-b; a=a-c; a=a^(c>>12);
+ b=b-c; b=b-a; b=b^(a<<16);
+ c=c-a; c=c-b; c=c^(b>>5);
+ a=a-b; a=a-c; a=a^(c>>3);
+ b=b-c; b=b-a; b=b^(a<<10);
+ c=c-a; c=c-b; c=c^(b>>15);
+ return c;
+ }
+ // robert jenkins 32-bit
+ // sucks
+ if (0) {
+ n += (n << 12);
+ n ^= (n >> 22);
+ n += (n << 4);
+ n ^= (n >> 9);
+ n += (n << 10);
+ n ^= (n >> 2);
+ n += (n << 7);
+ n ^= (n >> 12);
+ return n;
+ }
+
+ // djb2
+ if (0) {
+ unsigned int hash = 5381;
+ for (int i=0; i<4; i++) {
+ hash = ((hash << 5) + hash) + ((n&255) ^ 123);
+ n = n >> 8;
+ }
+ return hash;
+ }
+
+
+ // SDBM
+ if (1) {
+ unsigned int hash = 0;
+
+ for(unsigned int i = 0; i < 4; i++)
+ {
+ hash = (n&255) + (hash << 6) + (hash << 16) - hash;
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // PJW
+ // horrid
+ if (0) {
+ unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
+ unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4);
+ unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8);
+ unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
+ unsigned int hash = 0;
+ unsigned int test = 0;
+
+ for(unsigned int i = 0; i < 4; i++)
+ {
+ hash = (hash << OneEighth) + (n&255);
+
+ if((test = hash & HighBits) != 0)
+ {
+ hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
+ }
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // RS hash function, from Robert Sedgewick's "Algorithms in C" book, with some changes.
+ if (0) {
+ unsigned int b = 378551;
+ unsigned int a = 63689;
+ unsigned int hash = 0;
+
+ for(unsigned int i=0; i<4; i++)
+ {
+ hash = hash * a + (n&0xff);
+ a = a * b;
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // DJB
+ // worse than rs
+ if (0) {
+ unsigned int hash = 5381;
+
+ for(unsigned int i = 0; i < 4; i++)
+ {
+ hash = ((hash << 5) + hash) + (n&255);
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+ // AP
+ // even worse
+ if (1) {
+ unsigned int hash = 0;
+
+ for(unsigned int i = 0; i < 4; i++)
+ {
+ hash ^= ((i & 1) == 0) ? ( (hash << 7) ^ (n&255) ^ (hash >> 3)) :
+ (~((hash << 11) ^ (n&255) ^ (hash >> 5)));
+ n = n >> 8;
+ }
+
+ return (hash & 0x7FFFFFFF);
+ }
+
+
+#endif
--- /dev/null
+#ifndef __crush_CRUSH_H
+#define __crush_CRUSH_H
+
+#include <iostream>
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "Bucket.h"
+
+#include "include/buffer.h"
+
+
+namespace crush {
+
+
+ // *** RULES ***
+
+ // One instruction of a placement rule: an opcode plus its integer operands.
+ class RuleStep {
+ public:
+   int cmd;            // opcode (one of the CRUSH_RULE_* values)
+   vector<int> args;   // operands, in the order given to the constructor
+
+   // Constructors for steps taking zero to three operands.
+   RuleStep(int c) : cmd(c) {}
+   RuleStep(int c, int a) : cmd(c) {
+     const int operands[] = { a };
+     args.assign(operands, operands + 1);
+   }
+   RuleStep(int c, int a, int b) : cmd(c) {
+     const int operands[] = { a, b };
+     args.assign(operands, operands + 2);
+   }
+   RuleStep(int o, int a, int b, int c) : cmd(o) {
+     const int operands[] = { a, b, c };
+     args.assign(operands, operands + 3);
+   }
+
+   // Append the raw bytes of cmd, then the operand vector, to bl.
+   void _encode(bufferlist& bl) {
+     bl.append((char*)&cmd, sizeof(cmd));
+     ::_encode(args, bl);
+   }
+
+   // Read cmd and the operand vector back from bl at off, advancing off.
+   void _decode(bufferlist& bl, int& off) {
+     bl.copy(off, sizeof(cmd), (char*)&cmd);
+     off += sizeof(cmd);
+     ::_decode(args, bl, off);
+   }
+ };
+
+
+ // Rule operations (values of RuleStep::cmd, as interpreted by Crush::do_rule)
+ const int CRUSH_RULE_TAKE = 0; // args[0]: id that becomes the working set
+ const int CRUSH_RULE_CHOOSE = 1; // first n by default
+ const int CRUSH_RULE_CHOOSE_FIRSTN = 1; // args[0]: numrep, args[1]: item type; same value as CRUSH_RULE_CHOOSE
+ const int CRUSH_RULE_CHOOSE_INDEP = 2; // same operands as FIRSTN, but choose() runs with firstn=false
+ const int CRUSH_RULE_EMIT = 3; // append the working set to the result and clear it
+
+ // An ordered list of RuleSteps, executed front to back by Crush::do_rule.
+ class Rule {
+ public:
+   vector< RuleStep > steps;
+
+   // Append the step count (raw int bytes) followed by each step to bl.
+   void _encode(bufferlist& bl) {
+     int count = steps.size();
+     bl.append((char*)&count, sizeof(count));
+     for (int idx = 0; idx != count; ++idx) {
+       RuleStep& step = steps[idx];
+       step._encode(bl);
+     }
+   }
+
+   // Replace the current steps with the ones encoded in bl at off,
+   // advancing off past them.
+   void _decode(bufferlist& bl, int& off) {
+     steps.clear();
+
+     int count;
+     bl.copy(off, sizeof(count), (char*)&count);
+     off += sizeof(count);
+
+     for (int remaining = count; remaining > 0; --remaining) {
+       // add a placeholder step, then let it overwrite itself from the buffer
+       steps.push_back(RuleStep(0));
+       steps.back()._decode(bl, off);
+     }
+   }
+ };
+
+
+
+
+ // *** CRUSH ***
+
+ // The CRUSH map: a set of buckets addressed by id, the hash used for every
+ // choice, and the placement rules that are run against them.
+ class Crush {
+ protected:
+   map<int, Bucket*> buckets;   // bucket id -> bucket; deleted in ~Crush
+   int bucketno;                // next id add_bucket() hands out; counts down from -1
+   Hash h;
+
+   hash_map<int, int> parent_map;   // what bucket each leaf/bucket lives in
+
+   // Look a bucket up without inserting anything.  Returns 0 when id is not
+   // a bucket.  Lookups must not go through buckets[id]: operator[] would
+   // insert a null entry for every leaf id asked about, and code that tests
+   // buckets.count() or walks the map (choose, _encode) would then
+   // dereference that null.
+   Bucket *find_bucket(int id) const {
+     map<int, Bucket*>::const_iterator p = buckets.find(id);
+     if (p == buckets.end())
+       return 0;
+     return p->second;
+   }
+
+ public:
+   map<int, Rule> rules;
+
+   //map<int,int> collisions;
+   //map<int,int> bumps;
+
+   // Append the whole map to bl: bucket count, each (id, bucket) pair,
+   // bucketno, the hash seed, rule count, each (id, rule) pair.  Scalars are
+   // written as their raw in-memory bytes.
+   void _encode(bufferlist& bl) {
+     // buckets
+     int n = buckets.size();
+     bl.append((char*)&n, sizeof(n));
+     for (map<int, Bucket*>::const_iterator it = buckets.begin();
+          it != buckets.end();
+          ++it) {
+       bl.append((char*)&it->first, sizeof(it->first));
+       it->second->_encode(bl);
+     }
+     bl.append((char*)&bucketno, sizeof(bucketno));
+
+     // hash
+     int s = h.get_seed();
+     bl.append((char*)&s, sizeof(s));
+
+     //::_encode(out, bl);
+     //::_encode(overload, bl);
+
+     // rules
+     n = rules.size();
+     bl.append((char*)&n, sizeof(n));
+     for (map<int, Rule>::iterator it = rules.begin();
+          it != rules.end();
+          ++it) {
+       bl.append((char*)&it->first, sizeof(it->first));
+       it->second._encode(bl);
+     }
+   }
+
+   // Read a map written by _encode from bl at off, advancing off, then
+   // rebuild parent_map.  Decoded buckets and rules are stored under their
+   // encoded ids.
+   // NOTE(review): existing buckets are not deleted first, so decoding over
+   // a bucket id that is already present drops the old pointer.
+   void _decode(bufferlist& bl, int& off) {
+     int n;
+     bl.copy(off, sizeof(n), (char*)&n);
+     off += sizeof(n);
+     for (int i=0; i<n; i++) {
+       int bid;
+       bl.copy(off, sizeof(bid), (char*)&bid);
+       off += sizeof(bid);
+       Bucket *b = decode_bucket(bl, off);
+       buckets[bid] = b;
+     }
+     bl.copy(off, sizeof(bucketno), (char*)&bucketno);
+     off += sizeof(bucketno);
+
+     int s;
+     bl.copy(off, sizeof(s), (char*)&s);
+     off += sizeof(s);
+     h.set_seed(s);
+
+     //::_decode(out, bl, off);
+     //::_decode(overload, bl, off);
+
+     // rules
+     bl.copy(off, sizeof(n), (char*)&n);
+     off += sizeof(n);
+     for (int i=0; i<n; i++) {
+       int r;
+       bl.copy(off, sizeof(r), (char*)&r);
+       off += sizeof(r);
+       rules[r]._decode(bl,off);
+     }
+
+     // index
+     build_parent_map();
+   }
+
+   // Rebuild parent_map from scratch: every item of every bucket is mapped
+   // to the id of the bucket that holds it.
+   void build_parent_map() {
+     parent_map.clear();
+
+     // index every bucket
+     for (map<int, Bucket*>::iterator bp = buckets.begin();
+          bp != buckets.end();
+          ++bp) {
+       // index bucket items
+       vector<int> items;
+       bp->second->get_items(items);
+       for (vector<int>::iterator ip = items.begin();
+            ip != items.end();
+            ++ip)
+         parent_map[*ip] = bp->first;
+     }
+   }
+
+
+
+ public:
+   Crush(int seed=123) : bucketno(-1), h(seed) {}
+   ~Crush() {
+     // hose buckets
+     for (map<int, Bucket*>::iterator it = buckets.begin();
+          it != buckets.end();
+          ++it) {
+       delete it->second;
+     }
+   }
+
+   // Print the subtree rooted at bucket `root` to out, one bucket per line,
+   // indented by depth.  A bucket whose first item is itself a bucket is
+   // printed recursively; otherwise its items are listed in brackets (an
+   // empty bucket prints "[]").  Asserts if root is not a bucket.  Always
+   // returns 0.
+   int print(ostream& out, int root, int indent=0) {
+     for (int i=0; i<indent; i++) out << " ";
+     Bucket *b = find_bucket(root);
+     assert(b);
+     out << b->get_weight() << "\t" << b->get_id() << "\t";
+     for (int i=0; i<indent; i++) out << " ";
+     out << b->get_bucket_type() << ": ";
+
+     vector<int> items;
+     b->get_items(items);
+
+     // only look at items[0] when there is one
+     if (!items.empty() && buckets.count(items[0])) {
+       out << endl;
+       for (unsigned i=0; i<items.size(); i++)
+         print(out, items[i], indent+1);
+     } else {
+       out << "[";
+       for (unsigned i=0; i<items.size(); i++) {
+         if (i) out << " ";
+         out << items[i];
+       }
+       out << "]";
+     }
+     return 0;
+   }
+
+
+   // Register b under the next free (negative) id, store that id in the
+   // bucket, and return it.
+   int add_bucket( Bucket *b ) {
+     int n = bucketno;
+     bucketno--;
+     b->set_id(n);
+     buckets[n] = b;
+     return n;
+   }
+
+   // Add item (a leaf or another bucket) with weight w to bucket `parent`,
+   // record the parent on the item if it is a bucket, and add w to the
+   // item weight of every ancestor of `parent`.  Asserts that parent is a
+   // known, non-uniform bucket.
+   void add_item(int parent, int item, float w, bool back=false) {
+     // add item
+     Bucket *p = find_bucket(parent);
+     assert(p);
+     assert(!p->is_uniform());
+
+     p->add_item(item, w, back);
+
+     // set item's parent (leaf ids are simply not in the bucket map)
+     Bucket *n = find_bucket(item);
+     if (n)
+       n->set_parent(parent);
+
+     // update weights
+     for (Bucket *up = find_bucket(p->get_parent());
+          up;
+          up = find_bucket(up->get_parent())) {
+       up->adjust_item_weight(p->get_id(), w);
+       p = up;
+     }
+   }
+
+
+   /*
+     this is a hack, fix me!  weights should be consistent throughout hierarchy!
+
+     Applies the difference between w and the bucket's current weight to the
+     item weight recorded by each ancestor.  The bucket's own weight is not
+     modified here.  Asserts that item is a known bucket.
+   */
+   void set_bucket_weight(int item, float w) {
+     Bucket *b = find_bucket(item);
+     assert(b);
+     float adj = w - b->get_weight();
+
+     for (Bucket *p = find_bucket(b->get_parent());
+          p;
+          p = find_bucket(p->get_parent())) {
+       p->adjust_item_weight(b->get_id(), adj);
+       b = p;
+     }
+   }
+
+
+   /*
+    * choose numrep distinct items of type type
+    *
+    *  x            input value being placed
+    *  inbucket     bucket to start descending from
+    *  outvec       results are appended here
+    *  firstn       selects how r is perturbed after a failed attempt
+    *  outset       items of type 0 found in this set are rejected
+    *  overloadmap  item -> acceptance threshold in [0,1]; an item is
+    *               rejected when hash(x, item) scaled to [0,1) exceeds it
+    *  forcefeed    when true, the first replica is forcefeedval, unchecked
+    *
+    * A replica that still collides or is rejected after 10 failed attempts
+    * is skipped, so fewer than numrep items may be appended.
+    */
+   void choose(int x,
+               int numrep,
+               int type,
+               Bucket *inbucket,
+               vector<int>& outvec,
+               bool firstn,
+               set<int>& outset, map<int,float>& overloadmap,
+               bool forcefeed=false,
+               int forcefeedval=-1) {
+     int off = outvec.size();
+
+     // for each replica
+     for (int rep=0; rep<numrep; rep++) {
+       int outv = -1;   // my result
+
+       // forcefeed?
+       if (forcefeed) {
+         forcefeed = false;
+         outvec.push_back(forcefeedval);
+         continue;
+       }
+
+       // keep trying until we get a non-out, non-colliding item
+       int ftotal = 0;
+       bool skip_rep = false;
+
+       while (1) {
+         // start with the input bucket
+         Bucket *in = inbucket;
+
+         // choose through intervening buckets
+         int flocal = 0;
+         bool retry_rep = false;
+
+         while (1) {
+           // r may be twiddled to (try to) avoid past collisions
+           int r = rep;
+           if (in->is_uniform()) {
+             // uniform bucket; be careful!
+             if (firstn || numrep >= in->get_size()) {
+               // uniform bucket is too small; just walk thru elements
+               r += ftotal;   // r' = r + f_total (first n)
+             } else {
+               // make sure numrep is not a multiple of bucket size
+               int add = numrep*flocal;   // r' = r + n*f_local
+               if (in->get_size() % numrep == 0) {
+                 add += add/in->get_size();   // shift seq once per pass through the bucket
+               }
+               r += add;
+             }
+           } else {
+             // mixed bucket; just make a distinct-ish r sequence
+             if (firstn)
+               r += ftotal;             // r' = r + f_total
+             else
+               r += numrep * flocal;    // r' = r + n*f_local
+           }
+
+           // choose
+           outv = in->choose_r(x, r, h);
+
+           // did we get the type we want?
+           int itemtype = 0;     // 0 is terminal type
+           Bucket *newin = 0;    // remember bucket we hit
+           if (in->is_uniform()) {
+             itemtype = ((UniformBucket*)in)->get_item_type();
+           } else {
+             newin = find_bucket(outv);   // another bucket?
+             if (newin)
+               itemtype = newin->get_type();
+           }
+           if (itemtype == type) {   // this is what we want!
+             // collision?  Compare against what this call has actually
+             // appended so far; that can be fewer than rep entries when an
+             // earlier replica was skipped, so bound the scan by the vector.
+             bool collide = false;
+             for (unsigned prev = off; prev < outvec.size(); prev++) {
+               if (outvec[prev] == outv) {
+                 collide = true;
+                 break;
+               }
+             }
+
+             // ok choice?
+             bool bad = false;
+             if (type == 0 && outset.count(outv))
+               bad = true;
+             if (overloadmap.count(outv)) {
+               float f = (float)(h(x, outv) % 1000) / 1000.0;
+               if (f > overloadmap[outv])
+                 bad = true;
+             }
+
+             if (collide || bad) {
+               ftotal++;
+               flocal++;
+
+               if (collide && flocal < 3)
+                 continue;   // try locally a few times!
+
+               if (ftotal >= 10) {
+                 // ok fine, just ignore dup.  FIXME.
+                 skip_rep = true;
+                 break;
+               }
+
+               retry_rep = true;
+             }
+
+             break;   // ok then!
+           }
+
+           // next
+           in = newin;
+           // we hit something that is neither the wanted type nor a bucket
+           // to descend into; there is nowhere left to go
+           assert(in);
+         }
+
+         if (retry_rep) continue;   // try again
+
+         break;
+       }
+
+       // skip this rep? (e.g. too many collisions, we give up)
+       if (skip_rep) continue;
+
+       // output this value
+       outvec.push_back(outv);
+     } // for rep
+
+     // double check!
+     if (0) {
+       for (unsigned i=1; i<outvec.size(); i++)
+         for (unsigned j=0; j<i; j++)
+           assert(outvec[i] != outvec[j]);
+     }
+   }
+
+
+   // Run rule for input x and leave the chosen items in result (cleared
+   // first).  outset and overloadmap are passed through to choose().  When
+   // forcefeed >= 0, the chain of parents from parent_map leading to that
+   // item is followed: each TAKE must name the next id in the chain and each
+   // CHOOSE step is primed with the next one.
+   void do_rule(Rule& rule, int x, vector<int>& result,
+                set<int>& outset, map<int,float>& overloadmap,
+                int forcefeed=-1) {
+     //int numresult = 0;
+     result.clear();
+
+     // determine hierarchical context for first.
+     list<int> force_stack;
+     if (forcefeed >= 0) {
+       int t = forcefeed;
+       while (1) {
+         force_stack.push_front(t);
+         if (parent_map.count(t) == 0) break;   // reached root, presumably.
+         //cout << " " << t << " parent is " << parent_map[t] << endl;
+         t = parent_map[t];
+       }
+     }
+
+     // working vector
+     vector<int> w;   // working variable
+
+     // go through each statement
+     for (vector<RuleStep>::iterator pc = rule.steps.begin();
+          pc != rule.steps.end();
+          ++pc) {
+       // move input?
+
+       // do it
+       switch (pc->cmd) {
+       case CRUSH_RULE_TAKE:
+         {
+           const int arg = pc->args[0];
+           //cout << "take " << arg << endl;
+
+           if (!force_stack.empty()) {
+             int forceval = force_stack.front();
+             force_stack.pop_front();
+             assert(arg == forceval);
+           }
+
+           w.clear();
+           w.push_back(arg);
+         }
+         break;
+
+       case CRUSH_RULE_CHOOSE_FIRSTN:
+       case CRUSH_RULE_CHOOSE_INDEP:
+         {
+           const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN;
+           const int numrep = pc->args[0];
+           const int type = pc->args[1];
+
+           //cout << "choose " << numrep << " of type " << type << endl;
+
+           assert(!w.empty());
+
+           // reset output
+           vector<int> out;
+
+           // forcefeeding?
+           bool forcing = false;
+           int forceval = -1;   // always initialized: it is passed by value even when not forcing
+           if (!force_stack.empty()) {
+             forceval = force_stack.front();
+             force_stack.pop_front();
+             //cout << "priming out with " << forceval << endl;
+             forcing = true;
+           }
+
+           // do each row independently
+           for (vector<int>::iterator i = w.begin();
+                i != w.end();
+                ++i) {
+             assert(buckets.count(*i));
+             Bucket *b = buckets[*i];
+             choose(x, numrep, type, b, out, firstn,
+                    outset, overloadmap,
+                    forcing,
+                    forceval);
+             forcing = false;   // only once
+           } // for inrow
+
+           // put back into w
+           w.swap(out);
+           out.clear();
+         }
+         break;
+
+       case CRUSH_RULE_EMIT:
+         {
+           for (unsigned i=0; i<w.size(); i++)
+             result.push_back(w[i]);
+           //result[numresult++] = w[i];
+           w.clear();
+         }
+         break;
+
+       default:
+         assert(0);
+       }
+     }
+
+   }
+
+
+ };
+
+}
+
+#endif
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Run rule for every input x in [1, numpg] and record the chosen items in
+// placement[x].  numrep only sizes the scratch vector; the number of items
+// actually returned is whatever do_rule produces.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;   // item -> number of times it was chosen
+
+  // do_rule() takes the "out" set and the overload map by reference; this
+  // test marks nothing out and nothing overloaded.
+  set<int> outset;
+  map<int,float> overloadmap;
+
+  for (int x=1; x<=numpg; x++) {
+
+    //cout << H(x) << "\t" << h(x) << endl;
+    c.do_rule(rule, x, v, outset, overloadmap);
+    //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+    // do_rule may hand back fewer than numrep items when it gives up on a
+    // replica, so walk what is actually there
+    for (unsigned i=0; i<v.size(); i++)
+      ocount[v[i]]++;
+
+    placement[x] = v;
+
+    //cout << v << "\t" << ocount << endl;
+  }
+
+  // per-item usage dump, disabled
+  if (0)
+    for (map<int,int>::iterator it = ocount.begin();
+         it != ocount.end();
+         ++it)
+      cout << it->first << "\t" << it->second << endl;
+
+}
+
+
+// Measure how much data moves when one item is added to a bucket of n items.
+//
+//  n           number of items in the bucket before the addition
+//  f           unused
+//  buckettype  0 = tree, 1 = list (item added to the existing bucket),
+//              2 = list (a fresh bucket with n+1 items is built instead),
+//              3 = straw, 4 = uniform
+//
+// Places numpg inputs before and after the addition, counts the replica
+// slots whose item changed, and returns (and prints, tab-prefixed) the ratio
+// of the observed moved fraction to the ideal fraction 1/(n+1).
+float testmovement(int n, float f, int buckettype)
+{
+  // crush
+  Crush c;
+
+  int ndisks = 0;
+
+  // bucket
+  Bucket *b = 0;
+  if (buckettype == 0)
+    b = new TreeBucket(1);
+  else if (buckettype == 1 || buckettype == 2)
+    b = new ListBucket(1);
+  else if (buckettype == 3)
+    b = new StrawBucket(1);
+  else if (buckettype == 4)
+    b = new UniformBucket(0,0);
+  assert(b);   // unknown buckettype
+
+  for (int i=0; i<n; i++)
+    b->add_item(ndisks++,1);
+
+  c.add_bucket(b);
+  int root = b->get_id();
+
+  //c.print(cout,root);
+
+  // rule
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 1000;
+  int numpg = pg_per*ndisks/numrep;
+
+  /*
+  cout << ndisks << " disks, " << endl;
+  cout << pg_per << " pgs per disk" << endl;
+  cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+  */
+  map<int, vector<int> > placement1, placement2;
+
+  //c.print(cout, root);
+
+
+  // ORIGINAL
+  place(c, rule, numpg, numrep, placement1);
+
+  int olddisks = ndisks;
+
+  // add item
+  if (buckettype == 2) {
+    // start over!
+    ndisks = 0;
+    b = new ListBucket(1);
+    for (int i=0; i<=n; i++)
+      b->add_item(ndisks++,1);
+    c.add_bucket(b);
+    root = b->get_id();
+
+    rule.steps.clear();
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  }
+  else
+    b->add_item(ndisks++, 1);
+
+
+  // ADDED
+  //c.print(cout, root);
+  place(c, rule, numpg, numrep, placement2);
+
+  // Count replica slots whose item differs.  A placement can hold fewer
+  // than numrep items (the rule may give up on a replica); a missing slot
+  // is compared as -1 rather than read past the end of the vector.
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) {
+    const vector<int>& before = placement1[x];
+    const vector<int>& after = placement2[x];
+    for (int j=0; j<numrep; j++) {
+      const int was = j < (int)before.size() ? before[j] : -1;
+      const int now = j < (int)after.size() ? after[j] : -1;
+      if (was != now)
+        moved++;
+    }
+  }
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = actual/ideal;
+  //cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+// Print one row per bucket size s: s itself followed by the movement factor
+// of each of the five bucket types (testmovement prints each factor).
+// Sizes run 2, 3, 4, 8, 12, ... up to 64.
+int main()
+{
+  //cout << "// " << depth << ", modifydepth " << modifydepth << ", branching " << branching << ", disks " << n << endl;
+  cout << "n\ttree\tlhead\tltail\tstraw\tuniform" << endl;
+
+  //for (int s=2; s<=64; s+= (s<4?1:(s<16?2:4))) {
+  int s = 2;
+  while (s <= 64) {
+    const float f = 1.0 / (float)s;
+    //cout << f << "\t" << s;
+    cout << s;
+
+    int buckettype = 0;
+    while (buckettype < 5) {
+      testmovement(s, f, buckettype);
+      ++buckettype;
+    }
+    cout << endl;
+
+    // step by 1 below 4, by 4 from there on
+    if (s < 4)
+      s += 1;
+    else
+      s += 4;
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively build one bucket of the test hierarchy and register it with c.
+//  wid     wid[h] is the number of children a bucket at level h gets
+//  h       level to build: 0 yields a UniformBucket of wid[0] new disks,
+//          anything higher a StrawBucket of wid[h] buckets of level h-1
+//  ndisks  running disk counter; each leaf takes the next value
+// Returns the new bucket.
+// Children are attached with Bucket::add_item directly rather than through
+// Crush::add_item, so no parent is recorded on them, and each child is
+// registered (and gets its id) before its parent is.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+ if (h == 0) {
+ // uniform
+ Hash hash(123);
+ vector<int> disks;
+ for (int i=0; i<wid[h]; i++)
+ disks.push_back(ndisks++);
+ UniformBucket *b = new UniformBucket(1, 0, 10, disks);
+ // NOTE(review): UniformBucket's constructors call make_primes() with no
+ // argument; confirm an overload taking a Hash exists and is accessible.
+ b->make_primes(hash);
+ c.add_bucket(b);
+ //cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+ return b;
+ } else {
+ // mixed
+ //Bucket *b = new MixedBucket(h+1);
+ Bucket *b = new StrawBucket(h+1);
+ for (int i=0; i<wid[h]; i++) {
+ Bucket *n = make_bucket(c, wid, h-1, ndisks);
+ b->add_item(n->get_id(), n->get_weight());
+ }
+ c.add_bucket(b);
+ //cout << h << " mixedbucket with " << wid[h] << endl;
+ return b;
+ }
+}
+
+// Build the whole hierarchy described by wid, starting at the top level
+// (wid.size()-1), and return the id of the root bucket.
+// NOTE(review): wid must be non-empty; with an empty wid the top level is -1.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+ Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks);
+ return b->get_id();
+}
+
+
+// Build a test hierarchy selected by dep, place inputs on it with a
+// one-replica rule, and return the variance of the per-disk placement
+// counts, averaged over `times` passes.
+//  dep 0: a single level of 1000 disks
+//  dep 1: widths {1, 1000}
+//  dep 2: widths {5, 5, 8, 5}
+// The if (0) / if (1) blocks are hand-toggled alternatives; only the
+// if (1) ones run.
+float go(int dep)
+{
+ Hash h(73232313);
+
+ // crush
+ Crush c;
+
+
+ // buckets
+ int root = -1;
+ int ndisks = 0;
+
+ vector<int> wid;
+ if (0) {
+ for (int d=0; d<dep; d++)
+ wid.push_back(10);
+ }
+ if (1) {
+ if (dep == 0)
+ wid.push_back(1000);
+ if (dep == 1) {
+ wid.push_back(1);
+ wid.push_back(1000);
+ }
+ if (dep == 2) {
+ wid.push_back(5);
+ wid.push_back(5);
+ wid.push_back(8);
+ wid.push_back(5);
+ }
+ }
+
+ if (1) {
+ root = make_hierarchy(c, wid, ndisks);
+ }
+ // NOTE(review): disabled, but still has to compile -- confirm MixedBucket
+ // still exists; decode_bucket only knows uniform/list/tree/straw.
+ if (0) {
+ MixedBucket *b = new MixedBucket(1);
+ for (int i=0; i<10000; i++)
+ b->add_item(ndisks++, 10);
+ root = c.add_bucket(b);
+ }
+ // NOTE(review): disabled, but still has to compile -- confirm
+ // make_primes accepts a Hash argument and is accessible from here.
+ if (0) {
+ vector<int> disks;
+ for (int i=0; i<10000; i++)
+ disks.push_back(ndisks++);
+ UniformBucket *b = new UniformBucket(1, 0, 10000, disks);
+ Hash h(123);
+ b->make_primes(h);
+ root = c.add_bucket(b);
+ }
+
+
+
+ // rule
+ int numrep = 1;
+ Rule rule;
+ rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+ //c.overload[10] = .1;
+
+
+ int pg_per = 100;
+ int numpg = pg_per*ndisks/numrep;
+
+ vector<int> ocount(ndisks); // per-disk placement count for the current pass
+ //cout << ndisks << " disks, " << endl;
+ //cout << pg_per << " pgs per disk" << endl;
+ // cout << numpg << " logical pgs" << endl;
+ //cout << "numrep is " << numrep << endl;
+
+
+ // aim for roughly `place` placements in total, but run at least one pass
+ int place = 100000;
+ int times = place / numpg;
+ if (!times) times = 1;
+
+ cout << "#looping " << times << " times" << endl;
+
+ float tvar = 0;
+ int tvarnum = 0;
+
+ int x = 0; // input value; keeps increasing across passes
+ for (int t=0; t<times; t++) {
+ vector<int> v(numrep);
+
+ for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+ // note: starts at 1, so each pass places numpg-1 inputs
+ for (int xx=1; xx<numpg; xx++) {
+ x++;
+
+ //cout << H(x) << "\t" << h(x) << endl;
+ // NOTE(review): Crush::do_rule as declared in crush.h also takes an
+ // outset and an overloadmap argument (neither has a default); confirm
+ // this three-argument call still matches an overload.
+ c.do_rule(rule, x, v);
+ //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+ // tally the result and flag any input whose replicas repeat an item
+ bool bad = false;
+ for (int i=0; i<numrep; i++) {
+ //int d = b.choose_r(x, i, h);
+ //v[i] = d;
+ ocount[v[i]]++;
+ for (int j=i+1; j<numrep; j++) {
+ if (v[i] == v[j])
+ bad = true;
+ }
+ }
+ // NOTE(review): streaming v needs an operator<< for vector<int>;
+ // confirm one is in scope.
+ if (bad)
+ cout << "bad set " << x << ": " << v << endl;
+
+ //cout << v << "\t" << ocount << endl;
+ }
+
+ /*
+ for (int i=0; i<ocount.size(); i++) {
+ cout << "disk " << i << " has " << ocount[i] << endl;
+ }
+ */
+
+ //cout << "collisions: " << c.collisions << endl;
+ //cout << "r bumps: " << c.bumps << endl;
+
+
+ // mean and (population) variance of the per-disk counts for this pass
+ float avg = 0.0;
+ for (int i=0; i<ocount.size(); i++)
+ avg += ocount[i];
+ avg /= ocount.size();
+ float var = 0.0;
+ for (int i=0; i<ocount.size(); i++)
+ var += (ocount[i] - avg) * (ocount[i] - avg);
+ var /= ocount.size();
+
+ //cout << "avg " << avg << " var " << var << " sd " << sqrt(var) << endl;
+ //cout << avg << "\t";
+
+ tvar += var;
+ tvarnum++;
+ }
+
+ tvar /= tvarnum;
+
+ //cout << "total variance " << tvar << endl;
+
+ return tvar;
+}
+
+
+// Run go() for hierarchy shapes 0..2 and print one
+// "<shape>\t<variance>\t<sqrt(variance)>" row per shape.
+int main()
+{
+  int d = 0;
+  while (d <= 2) {
+    float var = go(d);
+    cout << d << "\t" << var << "\t" << sqrt(var) << endl;
+    d++;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// Selects the interior bucket class built by make_bucket():
+// 0 = TreeBucket, 1 or 2 = ListBucket, 3 = StrawBucket.
+// testmovement() additionally uses the value 2 to attach the new bucket under
+// the front (instead of the back) bucket recorded at the modified depth.
+int buckettype = 0;
+
+// Recursively build level `h` of the test hierarchy, registering every bucket
+// with `c` and appending it to `buckets[h]`.
+//
+//  h == 0:    a UniformBucket over the next wid[0] disk ids (ndisks is
+//             advanced once per disk).
+//  otherwise: an interior bucket whose class is chosen by the global
+//             `buckettype` (0 = tree, 1 or 2 = list, 3 = straw) with wid[h]
+//             recursively built children; each child's parent is set to it.
+//
+// Returns the bucket built for this level.  An unrecognized `buckettype`
+// used to leave the bucket pointer uninitialized before it was dereferenced;
+// it now warns on stderr and falls back to a StrawBucket.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h == 0) {
+    // uniform leaf
+    Hash hash(123);   // only needed if the make_primes() call is re-enabled
+    vector<int> disks;
+    for (int i=0; i<wid[h]; i++)
+      disks.push_back(ndisks++);
+    UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+    //b->make_primes(hash);
+    c.add_bucket(b);
+    buckets[h].push_back(b);
+    return b;
+  }
+
+  // interior node
+  Bucket *b = 0;
+  switch (buckettype) {
+  case 0:
+    b = new TreeBucket(h+1);
+    break;
+  case 1:
+  case 2:
+    b = new ListBucket(h+1);
+    break;
+  case 3:
+    b = new StrawBucket(h+1);
+    break;
+  default:
+    cerr << "make_bucket: unknown buckettype " << buckettype
+         << ", using StrawBucket" << endl;
+    b = new StrawBucket(h+1);
+    break;
+  }
+
+  // the node is registered before its children are built and attached
+  c.add_bucket(b);
+  for (int i=0; i<wid[h]; i++) {
+    Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+    b->add_item(n->get_id(), n->get_weight());
+    n->set_parent(b->get_id());
+  }
+  buckets[h].push_back(b);
+  return b;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), recording
+// the buckets of every level in `buckets`, and return the root bucket's id.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, buckets, ndisks);
+  return root->get_id();
+}
+
+
+// Run `rule` for every input x in [1, numpg] and store each result vector in
+// placement[x].  A duplicate-entry flag and a per-value hit count are
+// computed along the way but are not reported (the dump is compiled out).
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> result(numrep);
+  map<int,int> hits;   // rule output value -> number of times it was chosen
+
+  int x = 1;
+  while (x <= numpg) {
+    c.do_rule(rule, x, result);
+
+    bool bad = false;   // set when two entries of the result coincide
+    for (int a = 0; a < numrep; a++) {
+      hits[result[a]]++;
+      for (int b = a + 1; b < numrep; b++) {
+        if (result[a] == result[b])
+          bad = true;
+      }
+    }
+
+    placement[x] = result;
+    x++;
+  }
+
+  if (0) {
+    // debug dump of the hit counts
+    map<int,int>::iterator it = hits.begin();
+    while (it != hits.end()) {
+      cout << it->first << "\t" << it->second << endl;
+      ++it;
+    }
+  }
+}
+
+
+// Build a regular hierarchy (udisks disks per leaf, `branching` children per
+// interior node, `depth` levels), record placements, attach a new uniform
+// bucket of `add` disks under a bucket at level `modifydepth`, place again,
+// and compare.  Prints "\t<factor>" and returns the factor, i.e. the moved
+// fraction of replica slots divided by the fraction add/(total disks after
+// the addition).
+float testmovement(int depth, int branching, int udisks, int add, int modifydepth)
+{
+  Hash h(73232313);   // not referenced below
+
+  Crush c;
+
+  int root = -1;
+  int ndisks = 0;
+
+  // level widths, leaf level first
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int lvl = 1; lvl < depth; lvl++)
+    wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // rule: take the root, choose, emit
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;   // fixed before the disks are added
+
+  vector<int> ocount(ndisks);         // not referenced below
+
+  map<int, vector<int> > placement1, placement2;
+
+  // placements on the original hierarchy
+  place(c, rule, numpg, numrep, placement1);
+
+  int olddisks = ndisks;
+
+  // new uniform bucket holding `add` freshly numbered disks
+  vector<int> disks;
+  for (int k = 0; k < add; k++)
+    disks.push_back(ndisks++);
+  UniformBucket *fresh = new UniformBucket(1, 0, 1, disks);
+
+  // bucket type 2 attaches under the first bucket at that level, all other
+  // types under the last one
+  list<Bucket*>& level = buckets[modifydepth];
+  Bucket *target = (buckettype == 2) ? level.front() : level.back();
+
+  c.add_bucket(fresh);
+  c.add_item(target->get_id(), fresh->get_id(), fresh->get_weight());
+
+  // placements after the addition
+  place(c, rule, numpg, numrep, placement2);
+
+  // count replica slots whose entry changed
+  int moved = 0;
+  for (int x = 1; x <= numpg; x++) {
+    vector<int>& before = placement1[x];
+    vector<int>& after = placement2[x];
+    if (before == after)
+      continue;
+    for (int j = 0; j < numrep; j++) {
+      if (before[j] != after[j])
+        moved++;
+    }
+  }
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = actual/ideal;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+// Print a table of movement factors: one row per number of added disks
+// (udisks, udisks*bfac, ... up to n) and one column per bucket type 0..3.
+int main()
+{
+  int udisks = 10;
+  int add = udisks;   // first row adds one leaf's worth of disks
+
+  int depth = 4;
+  int branching = 9;
+
+  int modifydepth = 1;
+  int bfac = (int)(sqrt((double)branching));
+  int n = (int)(udisks * pow((float)branching, (float)depth-1));
+
+  cout << "// depth " << depth << ", modifydepth " << modifydepth << ", branching " << branching << ", disks " << n << endl;
+  cout << "n\ttree\tlhead\tltail\tstraw" << endl;
+
+  while (add <= n) {
+    cout << add;
+    buckettype = 0;
+    while (buckettype < 4) {
+      testmovement(depth, branching, udisks, add, modifydepth);
+      buckettype++;
+    }
+    cout << endl;
+    add *= bfac;
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Interior bucket class built by make_bucket(); main() sets it again before
+// running the tests.
+int buckettype = 2; // 0 = tree, 1 = linear (list), 2 = straw
+
+// "Big one" bookkeeping used by make_bucket(): big_one_skip is decremented
+// for each leaf bucket built; the first leaf built once it has reached zero
+// (while big_one is still unset) is given big_one_size disks and remembered
+// in big_one.
+int big_one_skip = 255;
+int big_one_size;
+Bucket *big_one = 0;
+
+// Recursively build level `h` of the test hierarchy, registering every bucket
+// with `c` and appending it to `buckets[h]`.
+//
+//  h == 0:    a UniformBucket over freshly numbered disks (ndisks is advanced
+//             once per disk).  It normally holds wid[0] disks; the first leaf
+//             built after the global big_one_skip has counted down to zero
+//             (while big_one is unset) holds big_one_size disks instead and
+//             is remembered in big_one.
+//  otherwise: an interior bucket whose class is chosen by the global
+//             `buckettype` (0 = tree, 1 = list, 2 = straw) with wid[h]
+//             recursively built children; each child's parent is set to it.
+//
+// Returns the bucket built for this level.  An unrecognized `buckettype`
+// used to leave the bucket pointer uninitialized before it was dereferenced;
+// it now warns on stderr and falls back to a StrawBucket.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h == 0) {
+    // uniform leaf
+    Hash hash(123);
+    vector<int> disks;
+
+    int s = wid[h];
+    if (big_one_skip > 0)
+      big_one_skip--;
+    // evaluated once: big_one is only assigned further down
+    const bool is_big_one = (!big_one_skip && !big_one);
+    if (is_big_one)
+      s = big_one_size;
+
+    for (int i=0; i<s; i++)
+      disks.push_back(ndisks++);
+    UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+    if (is_big_one)
+      big_one = b;
+    b->make_primes(hash);
+    c.add_bucket(b);
+    buckets[h].push_back(b);
+    return b;
+  }
+
+  // interior node
+  Bucket *b = 0;
+  switch (buckettype) {
+  case 0:
+    b = new TreeBucket(h+1);
+    break;
+  case 1:
+    b = new ListBucket(h+1);
+    break;
+  case 2:
+    b = new StrawBucket(h+1);
+    break;
+  default:
+    cerr << "make_bucket: unknown buckettype " << buckettype
+         << ", using StrawBucket" << endl;
+    b = new StrawBucket(h+1);
+    break;
+  }
+
+  // the node is registered before its children are built and attached
+  c.add_bucket(b);
+  for (int i=0; i<wid[h]; i++) {
+    Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+    b->add_item(n->get_id(), n->get_weight());
+    n->set_parent(b->get_id());
+  }
+  buckets[h].push_back(b);
+  return b;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), recording
+// the buckets of every level in `buckets`, and return the root bucket's id.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, buckets, ndisks);
+  return root->get_id();
+}
+
+
+// Run `rule` for every input x in [1, numpg] and store each result vector in
+// placement[x].  Prints a "bad set" line for any result in which two entries
+// coincide.  A per-value hit count is kept, but its dump is compiled out.
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> result(numrep);
+  map<int,int> hits;   // rule output value -> number of times it was chosen
+
+  int x = 1;
+  while (x <= numpg) {
+    c.do_rule(rule, x, result);
+
+    bool bad = false;
+    for (int a = 0; a < numrep; a++) {
+      hits[result[a]]++;
+      for (int b = a + 1; b < numrep; b++) {
+        if (result[a] == result[b])
+          bad = true;
+      }
+    }
+    if (bad)
+      cout << "bad set " << x << ": " << result << endl;
+
+    placement[x] = result;
+    x++;
+  }
+
+  if (0) {
+    // debug dump of the hit counts
+    map<int,int>::iterator it = hits.begin();
+    while (it != hits.end()) {
+      cout << it->first << "\t" << it->second << endl;
+      ++it;
+    }
+  }
+}
+
+
+// Build a hierarchy in which one leaf bucket (the "big one") holds `add`
+// disks, record placements, call c.adjust_item(<big one id>, 0) (the step
+// the original comment labels "remove disks"), place again, and compare.
+// Prints one tab-separated row
+//   add, olddisks, ndisks, moved, total, actual, ideal, factor
+// and returns the factor (moved fraction divided by add/olddisks).
+float testmovement(int depth, int branching, int udisks, int add)
+{
+  Hash h(73232313);   // not referenced below
+
+  Crush c;
+
+  int root = -1;
+  int ndisks = 0;
+
+  // level widths, leaf level first; level 2 gets one extra child
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int lvl = 1; lvl < depth; lvl++) {
+    int width = branching;
+    if (lvl == 2)
+      width += 1;
+    wid.push_back(width);
+  }
+
+  map< int, list<Bucket*> > buckets;
+
+  // make_bucket() picks one leaf to become the big one while building
+  big_one_size = add;
+  big_one = 0;
+
+  root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // rule: take the root, choose, emit
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+
+  vector<int> ocount(ndisks);   // not referenced below
+
+  map<int, vector<int> > placement1, placement2;
+
+  int olddisks = ndisks;
+
+  // placements with the big bucket in place
+  place(c, rule, numpg, numrep, placement1);
+
+  if (1) {
+    // remove disks
+    assert(big_one);
+    c.adjust_item(big_one->get_id(), 0);
+  }
+
+  int newdisks = ndisks - add;
+
+  // placements after the adjustment
+  place(c, rule, numpg, numrep, placement2);
+
+  // count replica slots whose entry changed
+  int moved = 0;
+  for (int x = 1; x <= numpg; x++) {
+    vector<int>& before = placement1[x];
+    vector<int>& after = placement2[x];
+    if (before == after)
+      continue;
+    for (int j = 0; j < numrep; j++) {
+      if (before[j] != after[j])
+        moved++;
+    }
+  }
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(olddisks-newdisks) / (float)(olddisks);
+  float fac = actual/ideal;
+  cout << add << "\t" << olddisks << "\t" << ndisks << "\t" << moved << "\t" << total << "\t" << actual << "\t" << ideal << "\t" << fac << endl;
+  return fac;
+}
+
+
+// Run the removal experiment with straw buckets for a growing "big one"
+// size: add = udisks, 3*udisks, 9*udisks, ... while add <= n.
+// testmovement() prints one result row per run.
+int main()
+{
+  const int udisks = 10;
+  const int depth = 4;
+  const int branching = 9;
+
+  buckettype = 2; // 0 = tree, 1 = linear, 2 = straw
+
+  // upper bound for `add`: udisks * branching^(depth-1)
+  const int n = (int)(udisks * pow((float)branching, (float)depth-1));
+  for (int add = udisks; add <= n; add *= 3) {
+    // reset for every run: the 9th leaf built becomes the big one
+    big_one_skip = 9;
+    testmovement(depth, branching, udisks, add);
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// Selects the interior bucket class built by make_bucket():
+// 0 = TreeBucket, 1 or 2 = ListBucket, 3 = StrawBucket.
+// testmovement() additionally uses the value 2 to attach the new bucket under
+// the front (instead of the back) bucket recorded at the modified depth, and
+// passes `buckettype == 2` as the last argument of Crush::add_item.
+int buckettype = 0;
+
+// Recursively build level `h` of the test hierarchy, registering every bucket
+// with `c` and appending it to `buckets[h]`.
+//
+//  h == 0:    a UniformBucket over the next wid[0] disk ids (ndisks is
+//             advanced once per disk).
+//  otherwise: an interior bucket whose class is chosen by the global
+//             `buckettype` (0 = tree, 1 or 2 = list, 3 = straw) with wid[h]
+//             recursively built children; each child's parent is set to it.
+//
+// Returns the bucket built for this level.  An unrecognized `buckettype`
+// used to leave the bucket pointer uninitialized before it was dereferenced;
+// it now warns on stderr and falls back to a StrawBucket.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h == 0) {
+    // uniform leaf
+    Hash hash(123);   // only needed if the make_primes() call is re-enabled
+    vector<int> disks;
+    for (int i=0; i<wid[h]; i++)
+      disks.push_back(ndisks++);
+    UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+    //b->make_primes(hash);
+    c.add_bucket(b);
+    buckets[h].push_back(b);
+    return b;
+  }
+
+  // interior node
+  Bucket *b = 0;
+  switch (buckettype) {
+  case 0:
+    b = new TreeBucket(h+1);
+    break;
+  case 1:
+  case 2:
+    b = new ListBucket(h+1);
+    break;
+  case 3:
+    b = new StrawBucket(h+1);
+    break;
+  default:
+    cerr << "make_bucket: unknown buckettype " << buckettype
+         << ", using StrawBucket" << endl;
+    b = new StrawBucket(h+1);
+    break;
+  }
+
+  // the node is registered before its children are built and attached
+  c.add_bucket(b);
+  for (int i=0; i<wid[h]; i++) {
+    Bucket *n = make_bucket(c, wid, h-1, buckets, ndisks);
+    b->add_item(n->get_id(), n->get_weight());
+    n->set_parent(b->get_id());
+  }
+  buckets[h].push_back(b);
+  return b;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), recording
+// the buckets of every level in `buckets`, and return the root bucket's id.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, buckets, ndisks);
+  return root->get_id();
+}
+
+
+// Run `rule` for every input x in [1, numpg] and store each result vector in
+// placement[x].  A duplicate-entry flag and a per-value hit count are
+// computed along the way but are not reported (the dump is compiled out).
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> result(numrep);
+  map<int,int> hits;   // rule output value -> number of times it was chosen
+
+  int x = 1;
+  while (x <= numpg) {
+    c.do_rule(rule, x, result);
+
+    bool bad = false;   // set when two entries of the result coincide
+    for (int a = 0; a < numrep; a++) {
+      hits[result[a]]++;
+      for (int b = a + 1; b < numrep; b++) {
+        if (result[a] == result[b])
+          bad = true;
+      }
+    }
+
+    placement[x] = result;
+    x++;
+  }
+
+  if (0) {
+    // debug dump of the hit counts
+    map<int,int>::iterator it = hits.begin();
+    while (it != hits.end()) {
+      cout << it->first << "\t" << it->second << endl;
+      ++it;
+    }
+  }
+}
+
+
+// Build a regular hierarchy (udisks disks per leaf, `branching` children per
+// interior node, `depth` levels), record placements, attach a new uniform
+// bucket of `add` disks under a bucket at level `modifydepth`, place again,
+// and compare.  Prints "\t<factor>" and returns the factor, i.e. the moved
+// fraction of replica slots divided by the fraction add/(total disks after
+// the addition).
+float testmovement(int depth, int branching, int udisks, int add, int modifydepth)
+{
+  Hash h(73232313);   // not referenced below
+
+  Crush c;
+
+  int root = -1;
+  int ndisks = 0;
+
+  // level widths, leaf level first
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int lvl = 1; lvl < depth; lvl++)
+    wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // rule: take the root, choose, emit
+  int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;   // fixed before the disks are added
+
+  vector<int> ocount(ndisks);         // not referenced below
+
+  map<int, vector<int> > placement1, placement2;
+
+  // placements on the original hierarchy
+  place(c, rule, numpg, numrep, placement1);
+
+  int olddisks = ndisks;
+
+  // new uniform bucket holding `add` freshly numbered disks
+  vector<int> disks;
+  for (int k = 0; k < add; k++)
+    disks.push_back(ndisks++);
+  UniformBucket *fresh = new UniformBucket(1, 0, 1, disks);
+
+  // bucket type 2 attaches under the first bucket at that level, all other
+  // types under the last one
+  list<Bucket*>& level = buckets[modifydepth];
+  Bucket *target = (buckettype == 2) ? level.front() : level.back();
+
+  c.add_bucket(fresh);
+  c.add_item(target->get_id(), fresh->get_id(), fresh->get_weight(), buckettype == 2);
+
+  // placements after the addition
+  place(c, rule, numpg, numrep, placement2);
+
+  // count replica slots whose entry changed
+  int moved = 0;
+  for (int x = 1; x <= numpg; x++) {
+    vector<int>& before = placement1[x];
+    vector<int>& after = placement2[x];
+    if (before == after)
+      continue;
+    for (int j = 0; j < numrep; j++) {
+      if (before[j] != after[j])
+        moved++;
+    }
+  }
+
+  int total = numpg*numrep;
+  float actual = (float)moved / (float)(total);
+  float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  float fac = actual/ideal;
+  cout << "\t" << fac;
+  return fac;
+}
+
+
+// Print a table of movement factors for a two-level hierarchy: one row per
+// number of added disks (udisks, 3*udisks, ... up to n) and one column per
+// bucket type 0..2.  The header still names four columns, but the straw
+// type (3) is not run here.
+int main()
+{
+  int udisks = 10;
+  int add = udisks;   // first row adds one leaf's worth of disks
+
+  int depth = 2;
+  int branching = 9*9*9;
+
+  int modifydepth = 1;
+  int bfac = (int)(sqrt((double)branching));
+  bfac = 3;           // overrides the square-root step above
+  int n = (int)(udisks * pow((float)branching, (float)depth-1));
+
+  cout << "// depth " << depth << ", modifydepth " << modifydepth << ", branching " << branching << ", disks " << n << endl;
+  cout << "n\ttree\tlhead\tltail\tstraw" << endl;
+
+  while (add <= n) {
+    cout << add;
+    buckettype = 0;
+    while (buckettype < 3) {
+      testmovement(depth, branching, udisks, add, modifydepth);
+      buckettype++;
+    }
+    cout << endl;
+    add *= bfac;
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include "../../common/Clock.h"
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Clock instance; go() calls g_clock.now() to time its placement passes.
+Clock g_clock;
+
+
+// Recursively build level `h` of a test hierarchy.  Level 0 is a uniform
+// bucket over the next wid[0] disk ids (ndisks is advanced once per disk);
+// any other level is a tree bucket with wid[h] recursively built children.
+// Every bucket created is registered with `c`; the bucket for this level is
+// returned.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior node: children are built first, then the node is registered
+    Bucket *parent = new TreeBucket(h+1);
+    for (int left = wid[h]; left > 0; left--) {
+      Bucket *child = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(child->get_id(), child->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf: uniform bucket over freshly numbered disks
+  Hash hash(123);   // only needed if make_primes() is re-enabled
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; left--)
+    disks.push_back(ndisks++);
+  float w = 10;
+  UniformBucket *leaf = new UniformBucket(1, 0, w, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), starting
+// from its topmost level, and return the id of the resulting root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, ndisks);
+  return root->get_id();
+}
+
+
+
+// Failure experiment: build a hierarchy of `dep` levels (width 10 each), then
+// for every round time one placement pass with no disks in c.out and a second
+// pass over the same inputs after failpc percent of the disks (picked with
+// rand()) have been inserted into c.out.
+//
+// Prints one tab-separated row: failpc, number of failed disks, average
+// slowdown (pass-2 time / pass-1 time), flat variance over active disks, its
+// square root scaled by pg_per_base, "..", the scaled "stair" deviation, "-",
+// then (average, standard deviation) for each of ten equal disk groups.
+// Returns the averaged variance of the first group.
+//
+// The locals h, pg_med, overloadsum, adjustsum, afteroverloadsum and chooses
+// are only referenced from commented-out code.
+float go(int dep, int failpc)
+{
+ Hash h(73232313);
+
+ //int overloadcutoff = (int)((float)10000.0 / (float)utilization);
+
+ //cout << "util " << utilization << " cutoff " << overloadcutoff << endl;
+ // crush
+ Crush c;
+
+
+ // buckets
+ int root = -1;
+ int ndisks = 0;
+
+ // `dep` levels, each of width 10
+ vector<int> wid;
+ for (int d=0; d<dep; d++)
+ wid.push_back(10);
+
+ if (1) {
+ root = make_hierarchy(c, wid, ndisks);
+ }
+
+ //cout << ndisks << " disks" << endl;
+
+
+ // number of disks to mark out in each round (integer division)
+ int numf = ndisks * failpc / 100;
+
+
+
+
+ // rule
+ int numrep = 1;
+ Rule rule;
+ rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+ //c.overload[10] = .1;
+
+ int pg_per_base = 100;//20;
+ int pg_med = 10*pg_per_base;
+ // 5.5 * pg_per_base, truncated to int
+ int pg_per = pg_per_base*5.5;//100;
+ int numpg = pg_per*ndisks/numrep;
+
+ vector<int> ocount(ndisks);
+ //cout << ndisks << " disks, " << endl;
+ //cout << pg_per << " pgs per disk" << endl;
+ // cout << numpg << " logical pgs" << endl;
+ //cout << "numrep is " << numrep << endl;
+
+
+ // number of rounds: about `place` placements per pass in total, at least one
+ int place = 1000000;
+ int times = place / numpg;
+ if (!times) times = 1;
+
+
+ //cout << "looping " << times << " times" << endl;
+
+ // per-group sums over all rounds (divided by tvarnum after the loop)
+ float tavg[10];
+ float tvar[10];
+ for (int j=0;j<10;j++) {
+ tvar[j] = 0;
+ tavg[j] = 0;
+ }
+ int tvarnum = 0;
+ float trvar = 0.0;
+
+ float overloadsum = 0.0;
+ float adjustsum = 0.0;
+ float afteroverloadsum = 0.0;
+ float aslowdown = 0.0;
+ int chooses = 0;
+ // first placement input of the current round; advanced by numpg per round
+ int xs = 1;
+ for (int t=0; t<times; t++) {
+ vector<int> v(numrep);
+
+ c.out.clear();
+
+ for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+ // pass 1: c.out is empty
+ utime_t t1a = g_clock.now();
+ for (int x=xs; x<numpg+xs; x++) {
+ c.do_rule(rule, x, v);
+ //chooses += numrep;
+ for (int i=0; i<v.size(); i++) {
+ //if (v[i] >= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl;
+ //assert(v[i] < ndisks);
+ ocount[v[i]]++;
+ }
+ }
+ utime_t t1b = g_clock.now();
+
+ // add in numf failed disks
+ // (re-draws until the disk is not yet in c.out, so numf distinct disks)
+ for (int f = 0; f < numf; f++) {
+ int d = rand() % ndisks;
+ while (c.out.count(d)) d = rand() % ndisks;
+ c.out.insert(d);
+ }
+
+ // pass 2: same inputs with the failed disks in c.out.
+ // NOTE(review): ocount is not reset here, so the counts used below are the
+ // sum of both passes - confirm this is intended.
+ utime_t t3a = g_clock.now();
+ for (int x=xs; x<numpg+xs; x++) {
+ c.do_rule(rule, x, v);
+ //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+ for (int i=0; i<v.size(); i++) {
+ //int d = b.choose_r(x, i, h);
+ //v[i] = d;
+ ocount[v[i]]++;
+ }
+
+ //cout << v << "\t" << ocount << endl;
+ }
+ xs += numpg;
+
+ utime_t t3b = g_clock.now();
+
+ // slowdown = elapsed time of pass 2 / elapsed time of pass 1
+ t1b -= t1a;
+ double t1 = (double)t1b;
+ t3b -= t3a;
+ double t3 = (double)t3b;
+ double slowdown = t3/t1;
+ //cout << "slowdown " << slowdown << endl;
+ aslowdown += slowdown;
+
+ //cout << "collisions: " << c.collisions << endl;
+ //cout << "r bumps: " << c.bumps << endl;
+
+ // stair var calc
+ // disks are split into 10 consecutive groups of n = ndisks/10; average and
+ // variance are taken per group over the disks that are not in c.out.
+ // NOTE(review): (n-nf) is zero if every disk of a group is out.
+ int n = ndisks/10;
+ float avg[10];
+ float var[10];
+ for (int i=0;i<10;i++) {
+ int s = n*i;
+ avg[i] = 0.0;
+ int nf = 0;
+ for (int j=0; j<n; j++) {
+ if (c.out.count(j+s)) { nf++; continue; }
+ avg[i] += ocount[j+s];
+ }
+ avg[i] /= (n-nf);//ocount.size();
+ var[i] = 0.0;
+ for (int j=0; j<n; j++) {
+ if (c.out.count(j+s)) continue;
+ var[i] += (ocount[j+s] - avg[i]) * (ocount[j+s] - avg[i]);
+ }
+ var[i] /= (n-nf);//ocount.size();
+
+ tvar[i] += var[i];
+ tavg[i] += avg[i];
+ }
+ //cout << "avg " << avg << " var " << var << " sd " << sqrt(var) << endl;
+
+ tvarnum++;
+
+ // flat var calc
+ // average and variance over all disks that are not in c.out
+ int na = ndisks - numf; // num active
+ float ravg = 0.0;
+ for (int i=0;i<ndisks;i++) {
+ if (c.out.count(i)) continue;
+ ravg += ocount[i];
+ }
+ ravg /= (float)na;
+ float rvar = 0.0;
+ for (int i=0; i<ndisks; i++) {
+ if (c.out.count(i)) continue;
+ rvar += (ravg-(float)ocount[i])*(ravg-(float)ocount[i]);
+ }
+ rvar /= (float)na;
+
+ trvar += rvar;
+ }
+
+
+ // turn the per-round sums into averages over the rounds
+ trvar /= (float)tvarnum;
+
+ //overloadsum /= tvarnum;
+ //adjustsum /= tvarnum;
+ // avar = sqrt(mean of the ten group variances) / pg_per_base
+ float avar = 0.0;
+ for (int j=0;j<10;j++) {
+ tvar[j] /= tvarnum;
+ tavg[j] /= tvarnum;
+ avar += tvar[j];
+ }
+ avar /= 10;
+ avar = sqrt(avar);
+ avar /= /*5.5 **/ (float)pg_per_base;
+ //afteroverloadsum /= tvarnum;
+ aslowdown /= tvarnum;
+
+ //int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+ //float crate = (float) collisions / (float)chooses;
+ //cout << "collisions: " << c.collisions << endl;
+
+
+ //cout << "total variance " << tvar << endl;
+ //cout << " overlaod " << overloadsum << endl;
+
+ // result row
+ cout << failpc
+ << "\t" << numf
+ //<< "\t" << adjustsum
+ //<< "\t" << afteroverloadsum
+ << "\t" << aslowdown
+ << "\t" << trvar
+ << "\t" << sqrt(trvar) / (float)pg_per_base
+ << "\t..\t" << avar
+ << "\t-";
+
+ for (int i=0;i<10;i++)
+ cout << "\t" << tavg[i] << "\t" << sqrt(tvar[i]);// << "\t" << tvar[i]/tavg[i];
+
+ cout << endl;
+ return tvar[0];
+}
+
+
+// Run the failure experiment on a 3-level hierarchy for failure percentages
+// 0, 5, ..., 85; go() prints one row per run.
+int main()
+{
+  int pc = 0;
+  while (pc < 90) {
+    float var = go(3, pc);   // result is not used; go() does the printing
+    pc += 5;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively build level `h` of a test hierarchy.  Level 0 is a uniform
+// bucket over the next wid[0] disk ids (ndisks is advanced once per disk);
+// any other level is a mixed bucket with wid[h] recursively built children.
+// Every bucket created is registered with `c`; the bucket for this level is
+// returned.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior node: children are built first, then the node is registered
+    MixedBucket *parent = new MixedBucket(h+1);
+    for (int left = wid[h]; left > 0; left--) {
+      Bucket *child = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(child->get_id(), child->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf: uniform bucket over freshly numbered disks
+  Hash hash(123);   // only needed if make_primes() is re-enabled
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; left--)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), starting
+// from its topmost level, and return the id of the resulting root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, ndisks);
+  return root->get_id();
+}
+
+
+// Build a randomly shaped subtree and return its top bucket.  The fan-out of
+// every node is drawn as rand() % (wid-1) + 2.  With height 0 the node is a
+// uniform bucket over freshly numbered disks; otherwise it is a mixed bucket
+// whose level h is drawn as rand() % height + 1 and whose children are built
+// with height h-1.  Every bucket created is registered with `c`.
+Bucket *make_random(Crush& c, int wid, int height, int& ndisks)
+{
+  // drawn first for both kinds of node, keeping the rand() sequence stable
+  const int fanout = rand() % (wid-1) + 2;
+
+  if (height != 0) {
+    // interior node
+    const int h = rand() % height + 1;
+    MixedBucket *node = new MixedBucket(h+1);
+    for (int k = 0; k < fanout; k++) {
+      Bucket *child = make_random(c, wid, h-1, ndisks);
+      node->add_item(child->get_id(), child->get_weight());
+    }
+    c.add_bucket(node);
+    return node;
+  }
+
+  // leaf
+  Hash hash(123);
+  vector<int> disks;
+  for (int k = 0; k < fanout; k++)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+
+// Overload experiment: build a hierarchy of `dep` levels (width 10 each) and,
+// for every round, place the same inputs twice.  After the first pass every
+// disk whose count exceeds 100+(overloadcutoff-100)/2 gets an entry in
+// c.overload (100 / count) and all counts are reset; the second pass is then
+// checked for disks still above `overloadcutoff`.
+//
+// Prints one tab-separated row: overloadcutoff, 10000/overloadcutoff, average
+// second-pass variance, average fraction of overloaded disks, average
+// fraction of adjusted disks, average fraction still overloaded, and
+// collisions per choose.  Returns the average second-pass variance.
+float go(int dep, int overloadcutoff)
+{
+ // not used below; the disabled uniform-bucket setup declares its own h
+ Hash h(73232313);
+
+ // crush
+ Crush c;
+
+
+ // buckets
+ int root = -1;
+ int ndisks = 0;
+
+ // `dep` levels, each of width 10
+ vector<int> wid;
+ for (int d=0; d<dep; d++)
+ wid.push_back(10);
+
+ // exactly one of the following setups is enabled
+ if (1) {
+ root = make_hierarchy(c, wid, ndisks);
+ }
+ if (0) {
+ Bucket *r = make_random(c, 20, 4, ndisks);
+ root = r->get_id();
+ //c.print(cout, root);
+ }
+ if (0) {
+ MixedBucket *b = new MixedBucket(1);
+ for (int i=0; i<10000; i++)
+ b->add_item(ndisks++, 10);
+ root = c.add_bucket(b);
+ }
+ if (0) {
+ vector<int> disks;
+ for (int i=0; i<10000; i++)
+ disks.push_back(ndisks++);
+ UniformBucket *b = new UniformBucket(1, 0, 10000, disks);
+ Hash h(123);
+ b->make_primes(h);
+ root = c.add_bucket(b);
+ }
+ //cout << ndisks << " disks" << endl;
+
+
+
+ // rule
+ int numrep = 1;
+ Rule rule;
+ rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+ rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+ //c.overload[10] = .1;
+
+
+ int pg_per = 100;
+ int numpg = pg_per*ndisks/numrep;
+
+ vector<int> ocount(ndisks);
+ //cout << ndisks << " disks, " << endl;
+ //cout << pg_per << " pgs per disk" << endl;
+ // cout << numpg << " logical pgs" << endl;
+ //cout << "numrep is " << numrep << endl;
+
+
+ // number of rounds: about `place` placements per pass in total, at least one
+ int place = 1000000;
+ int times = place / numpg;
+ if (!times) times = 1;
+
+
+ //cout << "looping " << times << " times" << endl;
+
+ // sums over all rounds (divided by tvarnum after the loop)
+ float tvar = 0;
+ int tvarnum = 0;
+
+ float overloadsum = 0.0;
+ float adjustsum = 0.0;
+ float afteroverloadsum = 0.0;
+ // incremented by numrep per first-pass do_rule call; denominator of crate
+ int chooses = 0;
+ // first placement input of the current round; advanced by numpg per round
+ int xs = 1;
+ for (int t=0; t<times; t++) {
+ vector<int> v(numrep);
+
+ c.overload.clear();
+
+ for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+ // pass 1: c.overload is empty
+ for (int x=xs; x<numpg+xs; x++) {
+
+ //cout << H(x) << "\t" << h(x) << endl;
+ c.do_rule(rule, x, v);
+ chooses += numrep;
+ //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+ // tally the chosen entries and flag results with a repeated entry
+ bool bad = false;
+ for (int i=0; i<numrep; i++) {
+ //int d = b.choose_r(x, i, h);
+ //v[i] = d;
+ ocount[v[i]]++;
+ for (int j=i+1; j<numrep; j++) {
+ if (v[i] == v[j])
+ bad = true;
+ }
+ }
+ if (bad)
+ cout << "bad set " << x << ": " << v << endl;
+
+ //cout << v << "\t" << ocount << endl;
+ }
+
+ // overloaded?
+ // counts disks above the cutoff, sets c.overload for disks above the
+ // halfway threshold, and resets every count for the second pass
+ int overloaded = 0;
+ int adjusted = 0;
+ for (int i=0; i<ocount.size(); i++) {
+ if (ocount[i] > overloadcutoff)
+ overloaded++;
+
+ if (ocount[i] > 100+(overloadcutoff-100)/2) {
+ adjusted++;
+ c.overload[i] = 100.0 / (float)ocount[i];
+ //cout << "disk " << i << " has " << ocount[i] << endl;
+ }
+ ocount[i] = 0;
+ }
+ //cout << overloaded << " overloaded" << endl;
+ overloadsum += (float)overloaded / (float)ndisks;
+ adjustsum += (float)adjusted / (float)ndisks;
+
+
+ // pass 2: same inputs, with the c.overload entries set above
+ for (int x=xs; x<numpg+xs; x++) {
+
+ //cout << H(x) << "\t" << h(x) << endl;
+ c.do_rule(rule, x, v);
+ //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+ bool bad = false;
+ for (int i=0; i<numrep; i++) {
+ //int d = b.choose_r(x, i, h);
+ //v[i] = d;
+ ocount[v[i]]++;
+ for (int j=i+1; j<numrep; j++) {
+ if (v[i] == v[j])
+ bad = true;
+ }
+ }
+ if (bad)
+ cout << "bad set " << x << ": " << v << endl;
+
+ //cout << v << "\t" << ocount << endl;
+ }
+ xs += numpg;
+
+ // disks that are still above the cutoff after the adjustment
+ int still = 0;
+ for (int i=0; i<ocount.size(); i++) {
+ if (ocount[i] > overloadcutoff) {
+ still++;
+ //c.overload[ocount[i]] = 100.0 / (float)ocount[i];
+ //cout << "disk " << i << " has " << ocount[i] << endl;
+ }
+ }
+ //if (still) cout << "overload was " << overloaded << " now " << still << endl;
+ afteroverloadsum += (float)still / (float)ndisks;
+
+ //cout << "collisions: " << c.collisions << endl;
+ //cout << "r bumps: " << c.bumps << endl;
+
+ // mean and variance of the second-pass counts
+ float avg = 0.0;
+ for (int i=0; i<ocount.size(); i++)
+ avg += ocount[i];
+ avg /= ocount.size();
+ float var = 0.0;
+ for (int i=0; i<ocount.size(); i++)
+ var += (ocount[i] - avg) * (ocount[i] - avg);
+ var /= ocount.size();
+
+ //cout << "avg " << avg << " var " << var << " sd " << sqrt(var) << endl;
+
+ tvar += var;
+ tvarnum++;
+ }
+
+ // turn the per-round sums into averages over the rounds
+ overloadsum /= tvarnum;
+ adjustsum /= tvarnum;
+ tvar /= tvarnum;
+ afteroverloadsum /= tvarnum;
+
+ // collisions per choose; chooses counts first-pass calls only
+ int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+ float crate = (float) collisions / (float)chooses;
+ //cout << "collisions: " << c.collisions << endl;
+
+
+ //cout << "total variance " << tvar << endl;
+ //cout << " overlaod " << overloadsum << endl;
+
+ cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << tvar << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum << "\t" << crate << endl;
+ return tvar;
+}
+
+
+// Run the overload experiment on a 3-level hierarchy for cutoffs
+// 140, 135, ..., 105; go() prints one row per run.
+int main()
+{
+  int cutoff = 140;
+  while (cutoff > 100) {
+    float var = go(3, cutoff);   // result is not used; go() does the printing
+    cutoff -= 5;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively build level `h` of a test hierarchy.  Level 0 is a uniform
+// bucket over the next wid[0] disk ids (ndisks is advanced once per disk);
+// any other level is a tree bucket with wid[h] recursively built children.
+// Every bucket created is registered with `c`; the bucket for this level is
+// returned.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior node: children are built first, then the node is registered
+    Bucket *parent = new TreeBucket(h+1);
+    for (int left = wid[h]; left > 0; left--) {
+      Bucket *child = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(child->get_id(), child->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf: uniform bucket over freshly numbered disks
+  Hash hash(123);   // only needed if make_primes() is re-enabled
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; left--)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Build the whole hierarchy described by `wid` (leaf level first), starting
+// from its topmost level, and return the id of the resulting root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size() - 1;
+  Bucket *root = make_bucket(c, wid, top, ndisks);
+  return root->get_id();
+}
+
+
+// Measure how evenly single-replica placements are spread over the disks of
+// a hierarchy with `dep` levels of width 10.
+// Prints a "#looping N times" header, a per-round line (only when there are
+// fewer than 10 rounds) and a closing summary line, and returns the per-disk
+// count variance averaged over all rounds.
+float go(int dep)
+{
+  Hash h(73232313);   // not referenced below
+
+  Crush c;
+
+  int root = -1;
+  int ndisks = 0;
+
+  // widths per level, leaf level first
+  vector<int> wid;
+  if (1) {
+    for (int lvl = 0; lvl < dep; lvl++)
+      wid.push_back(10);
+  }
+  if (0) {
+    // alternative shapes selected by `dep`
+    switch (dep) {
+    case 0:
+      wid.push_back(1000);
+      break;
+    case 1:
+      wid.push_back(1);
+      wid.push_back(1000);
+      break;
+    case 2:
+      wid.push_back(5);
+      wid.push_back(5);
+      wid.push_back(8);
+      wid.push_back(5);
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (1) {
+    root = make_hierarchy(c, wid, ndisks);
+  }
+
+  // rule: take the root, choose, emit
+  int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+
+  vector<int> ocount(ndisks);   // per-disk hit counter, reset every round
+
+  // number of rounds: about `place` placements in total, at least one round
+  int place = 100000;
+  int times = place / numpg;
+  if (times == 0)
+    times = 1;
+
+  cout << "#looping " << times << " times" << endl;
+
+  float tvar = 0;     // sum of per-round variances
+  int tvarnum = 0;    // number of rounds summed
+  float tavg = 0;     // sum of per-round means
+
+  int x = 0;          // placement input; keeps increasing across rounds
+  for (int round = 0; round < times; round++) {
+    vector<int> v(numrep);
+
+    ocount.assign(ocount.size(), 0);
+
+    // note: starts at 1, so each round performs numpg-1 placements
+    for (int n = 1; n < numpg; n++) {
+      ++x;
+      c.do_rule(rule, x, v);
+
+      // tally every chosen entry
+      for (int i = 0; i < numrep; i++)
+        ocount[v[i]]++;
+
+      // duplicate check; the flag is computed but not reported here
+      bool bad = false;
+      for (int i = 0; i < numrep; i++)
+        for (int j = i+1; j < numrep; j++)
+          if (v[i] == v[j])
+            bad = true;
+    }
+
+    // mean and variance of the per-disk counts for this round
+    float avg = 0.0;
+    for (vector<int>::iterator it = ocount.begin(); it != ocount.end(); ++it)
+      avg += *it;
+    avg /= ocount.size();
+
+    float var = 0.0;
+    for (vector<int>::iterator it = ocount.begin(); it != ocount.end(); ++it)
+      var += (*it - avg) * (*it - avg);
+    var /= ocount.size();
+
+    if (times < 10)
+      cout << "avg " << avg << " evar " << sqrt(avg) << " sd " << sqrt(var) << endl;
+
+    tvar += var;
+    tavg += avg;
+    tvarnum++;
+  }
+
+  tavg /= tvarnum;
+  tvar /= tvarnum;
+
+  // both printed values are square roots (of the mean variance / mean count)
+  cout << "total variance " << sqrt(tvar) << " expected " << sqrt(tavg) << endl;
+
+  return tvar;
+}
+
+
+// Runs the distribution test for hierarchies 2 to 5 levels deep.
+int main()
+{
+  int depth = 2;
+  while (depth <= 5) {
+    go(depth);
+    ++depth;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively builds the subtree for level `h` of the hierarchy and adds
+// every bucket it creates to `c`.
+//
+//   h == 0: a UniformBucket over the next wid[0] disk ids (taken from
+//           `ndisks`, which is advanced).  Its third constructor argument
+//           grows with the disk ids: 10 while the last id is below 100,
+//           20 below 200, and so on.
+//   h != 0: a TreeBucket holding wid[h] children built for level h-1, each
+//           added with the child's own id and weight.
+//
+// Returns the bucket created for level `h`.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior level: children first, then the parent goes into the map
+    Bucket *parent = new TreeBucket(h+1);
+    for (int child = 0; child < wid[h]; ++child) {
+      Bucket *sub = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; --left)
+    disks.push_back(ndisks++);
+  const int last_id = ndisks-1;
+  float w = (last_id/100+1)*10;
+  UniformBucket *leaf = new UniformBucket(1, 0, w, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Builds the whole hierarchy described by `wid` (one fan-out per level,
+// leaf level first) and returns the id of its top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, ndisks);
+  return top->get_id();
+}
+
+
+
+// Maps inputs first .. first+count-1 through `rule` and bumps the tally in
+// `ocount` for every device that was chosen.
+static void tally_placements(Crush& c, Rule& rule, int first, int count,
+                             int numrep, vector<int>& ocount)
+{
+  vector<int> v(numrep);
+  for (int x=first; x<first+count; x++) {
+    c.do_rule(rule, x, v);
+    for (int i=0; i<numrep; i++)
+      ocount[v[i]]++;
+  }
+}
+
+// Evaluates overload correction on a hierarchy with a fan-out of 10 at each
+// of `dep` levels.
+//
+// Each round maps the same `numpg` inputs three times:
+//   1. without corrections; disks whose count exceeds three quarters of the
+//      way from their target to their cutoff get c.overload[i] set to
+//      target/count,
+//   2. with those corrections; disks at or above the cutoff get their
+//      factor multiplied by target/count again (starting from 1.0 if they
+//      had none),
+//   3. with the final corrections; disks still above the cutoff are listed
+//      on stdout.
+// A disk's target is (i/100+1)*10 and its cutoff is `overloadcutoff` percent
+// of the target.
+//
+// Prints one tab-separated summary line (cutoff, 10000/cutoff, fraction of
+// disks overloaded, adjusted, and still overloaded, then mean and variance
+// of the final counts for each tenth of the disks) and returns the variance
+// of the first tenth.
+float go(int dep, int overloadcutoff)
+{
+  Crush c;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+    wid.push_back(10);
+
+  int ndisks = 0;
+  const int root = make_hierarchy(c, wid, ndisks);
+
+  // rule: take the root, choose numrep, emit
+  const int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int pg_per_base = 10;
+  const int pg_per = pg_per_base*5.5;
+  const int numpg = pg_per*ndisks/numrep;
+
+  // aim for about `place` mappings per pass overall, at least one round
+  const int place = 100000;
+  int times = place / numpg;
+  if (!times) times = 1;
+
+  vector<int> ocount(ndisks);
+
+  float tavg[10];
+  float tvar[10];
+  for (int j=0; j<10; j++) {
+    tvar[j] = 0;
+    tavg[j] = 0;
+  }
+
+  float overloadsum = 0.0;
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  int xs = 1;   // first input of the current round
+  for (int t=0; t<times; t++) {
+    c.overload.clear();
+    ocount.assign(ndisks, 0);
+
+    // pass 1: uncorrected placement
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+
+    int overloaded = 0;
+    int adjusted = 0;
+    for (int i=0; i<ndisks; i++) {
+      const int target = (i/100+1)*10;
+      const int cutoff = target * overloadcutoff / 100;
+      const int adjoff = target + (cutoff - target)*3/4;
+      if (ocount[i] > cutoff)
+        overloaded++;
+      if (ocount[i] > adjoff) {
+        adjusted++;
+        c.overload[i] = (float)target / (float)ocount[i];
+      }
+      ocount[i] = 0;
+    }
+    overloadsum += (float)overloaded / (float)ndisks;
+    adjustsum += (float)adjusted / (float)ndisks;
+
+    // pass 2: refine the factor of every disk that still reaches its cutoff
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    for (int i=0; i<ndisks; i++) {
+      const int target = (i/100+1)*10;
+      const int cutoff = target * overloadcutoff / 100;
+      if (ocount[i] >= cutoff) {
+        if (c.overload.count(i) == 0)
+          c.overload[i] = 1.0;
+        c.overload[i] *= (float)target / (float)ocount[i];
+      }
+      ocount[i] = 0;
+    }
+
+    // pass 3: placement with the final corrections
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    xs += numpg;
+
+    int still = 0;
+    for (int i=0; i<ndisks; i++) {
+      const int target = (i/100+1)*10;
+      const int cutoff = target * overloadcutoff / 100;
+      if (ocount[i] > cutoff) {
+        still++;
+        if (c.overload.count(i)) cout << "[adjusted] ";
+        cout << "disk " << i << " has " << ocount[i] << endl;
+      }
+    }
+    afteroverloadsum += (float)still / (float)ndisks;
+
+    // mean and variance of the final counts, per tenth of the disks
+    const int n = ndisks/10;
+    for (int g=0; g<10; g++) {
+      const int s = n*g;
+      float avg = 0.0;
+      for (int j=0; j<n; j++)
+        avg += ocount[j+s];
+      avg /= n;
+      float var = 0.0;
+      for (int j=0; j<n; j++)
+        var += (ocount[j+s] - avg) * (ocount[j+s] - avg);
+      var /= n;
+
+      tvar[g] += var;
+      tavg[g] += avg;
+    }
+  }
+
+  overloadsum /= times;
+  adjustsum /= times;
+  for (int j=0; j<10; j++) {
+    tvar[j] /= times;
+    tavg[j] /= times;
+  }
+  afteroverloadsum /= times;
+
+  cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum;
+  for (int i=0; i<10; i++)
+    cout << "\t" << tavg[i] << "\t" << tvar[i];
+  cout << endl;
+  return tvar[0];
+}
+
+
+// Runs the overload test on a 3-level hierarchy, first with a cutoff of
+// 200 and then with cutoffs 140, 135, ..., 105.
+int main()
+{
+  go(3, 200);
+
+  int cutoff = 140;
+  while (cutoff > 100) {
+    go(3, cutoff);
+    cutoff -= 5;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively builds the subtree for level `h` of the hierarchy, adds every
+// bucket it creates to `c`, and records it in buckets[h].
+//
+//   h == 0: a UniformBucket over the next wid[0] disk ids (taken from
+//           `ndisks`, which is advanced).
+//   h != 0: a MixedBucket holding wid[h] children built for level h-1; each
+//           child is added with its own id and weight and gets the new
+//           bucket set as its parent.
+//
+// Returns the bucket created for level `h`.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h != 0) {
+    // the parent goes into the map before any of its children are built
+    MixedBucket *parent = new MixedBucket(h+1);
+    c.add_bucket(parent);
+    for (int child = 0; child < wid[h]; ++child) {
+      Bucket *sub = make_bucket(c, wid, h-1, buckets, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+      sub->set_parent(parent->get_id());
+    }
+    buckets[h].push_back(parent);
+    return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; --left)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+// Builds the whole hierarchy described by `wid` (one fan-out per level,
+// leaf level first), filling `buckets` with the buckets of every level, and
+// returns the id of the top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, buckets, ndisks);
+  return top->get_id();
+}
+
+
+// Maps inputs 1..numpg through `rule` and stores each result vector in
+// placement[x].  A result that names the same device twice is reported on
+// stdout as a "bad set".
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, vector<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;   // per-device tally, only read by the disabled dump below
+
+  int x = 1;
+  while (x <= numpg) {
+    c.do_rule(rule, x, v);
+
+    // tally the devices and look for duplicates within this result
+    bool bad = false;
+    for (int i=0; i<numrep; i++) {
+      ocount[v[i]]++;
+      for (int j=i+1; j<numrep; j++)
+        bad = bad || (v[i] == v[j]);
+    }
+    if (bad)
+      cout << "bad set " << x << ": " << v << endl;
+
+    placement[x] = v;
+    ++x;
+  }
+
+  // debugging aid: flip to true to print the per-device tally
+  const bool dump_counts = false;
+  if (dump_counts) {
+    for (map<int,int>::iterator it = ocount.begin(); it != ocount.end(); ++it)
+      cout << it->first << "\t" << it->second << endl;
+  }
+}
+
+
+// Measures how many placements change when a hierarchy grows.
+//
+// Builds a hierarchy with `udisks` disks per leaf bucket and a fan-out of
+// `branching` on each of the depth-1 levels above, and maps numpg inputs
+// with two replicas each.  Then device 0 is inserted into c.failed, one more
+// leaf bucket of `udisks` disks is added under the last level-1 bucket, and
+// the same inputs are mapped again.
+//
+// Returns the fraction of replica slots whose device changed, divided by
+// the ideal fraction (new disks / total disks).
+//
+// `depth` must be at least 2: the new bucket is attached to a level-1
+// bucket taken from buckets[1].
+float testmovement(int depth, int branching, int udisks)
+{
+  Crush c;
+
+  // fan-outs, leaf level first
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+    wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  int ndisks = 0;
+  const int root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // rule: take the root, choose numrep, emit
+  const int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int pg_per = 100;
+  const int numpg = pg_per*ndisks/numrep;
+
+  map<int, vector<int> > placement1, placement2;
+
+  // before
+  place(c, rule, numpg, numrep, placement1);
+
+  c.failed.insert(0);
+
+  // grow: one more leaf bucket under the last level-1 bucket
+  const int olddisks = ndisks;
+  {
+    vector<int> disks;
+    for (int i=0; i<udisks; i++)
+      disks.push_back(ndisks++);
+    UniformBucket *b = new UniformBucket(1, 0, 1, disks);
+    Hash h(123);
+    b->make_primes(h);
+    Bucket *o = buckets[1].back();
+    c.add_bucket(b);
+    c.add_item(o->get_id(), b->get_id(), b->get_weight());
+  }
+
+  // after
+  place(c, rule, numpg, numrep, placement2);
+
+  // count replica slots whose device differs between the two runs
+  int moved = 0;
+  for (int x=1; x<=numpg; x++) {
+    for (int j=0; j<numrep; j++)
+      if (placement1[x][j] != placement2[x][j])
+        moved++;
+  }
+
+  const float f = (float)moved / (float)(numpg*numrep);
+  const float ideal = (float)(ndisks-olddisks) / (float)(ndisks);
+  return f/ideal;
+}
+
+
+// For depths 2..4 prints one line: the depth, then for every branching
+// factor 3..15 the initial number of disks and the movement factor
+// reported by testmovement(), all tab-separated.
+int main()
+{
+  const int udisks = 10;
+  for (int depth = 2; depth <= 4; depth++) {
+    cout << depth;
+    for (int branching = 3; branching < 16; branching += 1) {
+      const float fac = testmovement(depth, branching, udisks);
+
+      // Disks in the initial hierarchy: udisks * branching^(depth-1).
+      // Multiplied out in integers so that a floating-point pow() result
+      // just below the exact value cannot be truncated to n-1.
+      int n = udisks;
+      for (int level = 1; level < depth; level++)
+        n *= branching;
+
+      cout << "\t" << n;
+      cout << "\t" << fac;
+    }
+    cout << endl;
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively builds the subtree for level `h` of the hierarchy, adds every
+// bucket it creates to `c`, and records it in buckets[h].
+//
+//   h == 0: a UniformBucket over the next wid[0] disk ids (taken from
+//           `ndisks`, which is advanced).
+//   h != 0: a MixedBucket holding wid[h] children built for level h-1; each
+//           child is added with its own id and weight and gets the new
+//           bucket set as its parent.
+//
+// Returns the bucket created for level `h`.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h != 0) {
+    // the parent goes into the map before any of its children are built
+    MixedBucket *parent = new MixedBucket(h+1);
+    c.add_bucket(parent);
+    for (int child = 0; child < wid[h]; ++child) {
+      Bucket *sub = make_bucket(c, wid, h-1, buckets, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+      sub->set_parent(parent->get_id());
+    }
+    buckets[h].push_back(parent);
+    return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; --left)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+// Builds the whole hierarchy described by `wid` (one fan-out per level,
+// leaf level first), filling `buckets` with the buckets of every level, and
+// returns the id of the top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, buckets, ndisks);
+  return top->get_id();
+}
+
+
+// Maps inputs 1..numpg through `rule` and records, for every device that
+// was chosen, the set of inputs placed on it (placement[device]).  A result
+// that names the same device twice is reported on stdout as a "bad set".
+void place(Crush& c, Rule& rule, int numpg, int numrep, map<int, set<int> >& placement)
+{
+  vector<int> v(numrep);
+  map<int,int> ocount;   // per-device tally, only read by the disabled dump below
+
+  int x = 1;
+  while (x <= numpg) {
+    c.do_rule(rule, x, v);
+
+    // tally, index by device, and look for duplicates within this result
+    bool bad = false;
+    for (int i=0; i<numrep; i++) {
+      ocount[v[i]]++;
+      for (int j=i+1; j<numrep; j++)
+        bad = bad || (v[i] == v[j]);
+      placement[v[i]].insert(x);
+    }
+    if (bad)
+      cout << "bad set " << x << ": " << v << endl;
+
+    ++x;
+  }
+
+  // debugging aid: flip to true to print the per-device tally
+  const bool dump_counts = false;
+  if (dump_counts) {
+    for (map<int,int>::iterator it = ocount.begin(); it != ocount.end(); ++it)
+      cout << it->first << "\t" << it->second << endl;
+  }
+}
+
+
+// Measures how placements are redistributed when one device is marked as
+// overloaded.
+//
+// Builds a hierarchy with `udisks` disks per leaf bucket and a fan-out of
+// `branching` on each of the depth-1 levels above, and maps numpg inputs
+// with two replicas each.  Then c.overload[0] is set to 0.5 and the same
+// inputs are mapped again.  For every disk the number of inputs it holds in
+// the second run but not in the first is counted.
+//
+// Prints "ndisks <tab> expected <tab> mean <tab> variance" of those counts
+// and returns the variance.  (The function used to fall off its end without
+// returning anything although callers read the result.)
+float testmovement(int depth, int branching, int udisks)
+{
+  Crush c;
+
+  // fan-outs, leaf level first
+  vector<int> wid;
+  wid.push_back(udisks);
+  for (int d=1; d<depth; d++)
+    wid.push_back(branching);
+
+  map< int, list<Bucket*> > buckets;
+  int ndisks = 0;
+  const int root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // rule: take the root, choose numrep, emit
+  const int numrep = 2;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int pg_per = 100;
+  const int numpg = pg_per*ndisks/numrep;
+
+  map<int, set<int> > placement1, placement2;
+
+  // before
+  place(c, rule, numpg, numrep, placement1);
+
+  const float over = .5;
+  c.overload[0] = over;
+
+  // after
+  place(c, rule, numpg, numrep, placement2);
+
+  // leave in placement2[d] only the inputs that are new on disk d
+  for (int d=0; d<ndisks; d++) {
+    for (set<int>::iterator it = placement1[d].begin();
+         it != placement1[d].end();
+         ++it) {
+      placement2[d].erase(*it);
+    }
+  }
+
+  // mean and variance of the number of new inputs per disk
+  vector<int> moved(ndisks);
+  float avg = 0;
+  for (int d=0; d<ndisks; d++) {
+    moved[d] = placement2[d].size();
+    avg += moved[d];
+  }
+  avg /= (float)ndisks;
+  float var = 0;
+  for (int d=0; d<ndisks; d++) {
+    var += (moved[d]-avg)*(moved[d]-avg);
+  }
+  var /= (float)ndisks;
+
+  float expected = over * 100.0 / (float)(ndisks-1);
+
+  cout << ndisks << "\t" << expected << "\t" << avg << "\t" << var << endl;
+
+  return var;
+}
+
+
+// For depths 2..4 prints the depth and then runs testmovement() for every
+// branching factor 3..15; testmovement() prints the result lines itself.
+int main()
+{
+  const int udisks = 10;
+  for (int depth = 2; depth <= 4; ++depth) {
+    cout << depth;
+    int branching = 3;
+    while (branching < 16) {
+      testmovement(depth, branching, udisks);
+      ++branching;
+    }
+  }
+}
+
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include "../../common/Clock.h"
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+Clock g_clock;  // clock that go() reads via now() to time its placement passes
+
+
+// Recursively builds the subtree for level `h` of the hierarchy and adds
+// every bucket it creates to `c`.
+//
+//   h == 0: a UniformBucket over the next wid[0] disk ids (taken from
+//           `ndisks`, which is advanced).  Its third constructor argument
+//           grows with the disk ids: 10 while the last id is below 100,
+//           20 below 200, and so on.
+//   h != 0: a TreeBucket holding wid[h] children built for level h-1, each
+//           added with the child's own id and weight.
+//
+// Returns the bucket created for level `h`.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior level: children first, then the parent goes into the map
+    Bucket *parent = new TreeBucket(h+1);
+    for (int child = 0; child < wid[h]; ++child) {
+      Bucket *sub = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; --left)
+    disks.push_back(ndisks++);
+  const int last_id = ndisks-1;
+  float w = (last_id/100+1)*10;
+  UniformBucket *leaf = new UniformBucket(1, 0, w, disks);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Builds the whole hierarchy described by `wid` (one fan-out per level,
+// leaf level first) and returns the id of its top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, ndisks);
+  return top->get_id();
+}
+
+
+
+// Maps inputs first .. first+count-1 through `rule` and bumps the tally in
+// `ocount` for every device that was chosen.
+static void tally_placements(Crush& c, Rule& rule, int first, int count,
+                             int numrep, vector<int>& ocount)
+{
+  vector<int> v(numrep);
+  for (int x=first; x<first+count; x++) {
+    c.do_rule(rule, x, v);
+    for (int i=0; i<numrep; i++)
+      ocount[v[i]]++;
+  }
+}
+
+// Evaluates iterative overload correction, and what it costs in mapping
+// time, on a hierarchy with a fan-out of 10 at each of `dep` levels.
+//
+// `utilization` is turned into an overload cutoff of 10000/utilization
+// percent of a disk's target, where the target of disk i is
+// (i/100+1)*pg_per_base.  Each round:
+//   1. maps numpg inputs without corrections (timed); disks whose count
+//      exceeds three quarters of the way from target to cutoff get
+//      c.overload[i] set to target/count,
+//   2. re-maps the same inputs up to five times, each time multiplying the
+//      factor of every disk at or above its cutoff by target/count, and
+//      stops early once no disk reaches its cutoff,
+//   3. maps the inputs once more (timed) and lists the disks that are still
+//      above their cutoff on stdout.
+//
+// Prints one tab-separated summary line (cutoff, utilization, fraction of
+// disks overloaded, adjusted, and still overloaded, the mean ratio of the
+// final to the first pass time, a normalised standard deviation, "-", then
+// mean and variance of the final counts for each tenth of the disks) and
+// returns the variance of the first tenth.
+float go(int dep, int utilization )
+{
+  int overloadcutoff = (int)((float)10000.0 / (float)utilization);
+
+  Crush c;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+    wid.push_back(10);
+
+  int ndisks = 0;
+  const int root = make_hierarchy(c, wid, ndisks);
+
+  // rule: take the root, choose numrep, emit
+  const int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int pg_per_base = 20;
+  const int pg_per = pg_per_base*5.5;
+  const int numpg = pg_per*ndisks/numrep;
+
+  // aim for about `place` mappings per pass overall, at least one round
+  const int place = 100000;
+  int times = place / numpg;
+  if (!times) times = 1;
+
+  vector<int> ocount(ndisks);
+
+  float tavg[10];
+  float tvar[10];
+  for (int j=0; j<10; j++) {
+    tvar[j] = 0;
+    tavg[j] = 0;
+  }
+
+  float overloadsum = 0.0;
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  float aslowdown = 0.0;
+  int xs = 1;   // first input of the current round
+  for (int t=0; t<times; t++) {
+    c.overload.clear();
+    ocount.assign(ndisks, 0);
+
+    // pass 1: uncorrected placement, timed
+    utime_t t1a = g_clock.now();
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    utime_t t1b = g_clock.now();
+
+    int overloaded = 0;
+    int adjusted = 0;
+    for (int i=0; i<ndisks; i++) {
+      const int target = (i/100+1)*pg_per_base;
+      const int cutoff = target * overloadcutoff / 100;
+      const int adjoff = target + (cutoff - target)*3/4;
+      if (ocount[i] > cutoff)
+        overloaded++;
+      if (ocount[i] > adjoff) {
+        adjusted++;
+        c.overload[i] = (float)target / (float)ocount[i];
+      }
+      ocount[i] = 0;
+    }
+    overloadsum += (float)overloaded / (float)ndisks;
+    adjustsum += (float)adjusted / (float)ndisks;
+
+    // keep adjusting until no disk reaches its cutoff (at most 5 tries)
+    for (int bla=0; bla<5; bla++) {
+      tally_placements(c, rule, xs, numpg, numrep, ocount);
+
+      int numover = 0;
+      for (int i=0; i<ndisks; i++) {
+        const int target = (i/100+1)*pg_per_base;
+        const int cutoff = target * overloadcutoff / 100;
+        if (ocount[i] >= cutoff) {
+          numover++;
+          if (c.overload.count(i) == 0)
+            c.overload[i] = 1.0;
+          c.overload[i] *= (float)target / (float)ocount[i];
+        }
+        ocount[i] = 0;
+      }
+      if (!numover) break;
+      cout << "readjusting" << endl;
+    }
+
+    // final pass with the corrections in place, timed
+    utime_t t3a = g_clock.now();
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    xs += numpg;
+    utime_t t3b = g_clock.now();
+
+    // how much slower the corrected mapping is than the uncorrected one
+    t1b -= t1a;
+    double t1 = (double)t1b;
+    t3b -= t3a;
+    double t3 = (double)t3b;
+    double slowdown = t3/t1;
+    aslowdown += slowdown;
+
+    int still = 0;
+    for (int i=0; i<ndisks; i++) {
+      const int target = (i/100+1)*pg_per_base;
+      const int cutoff = target * overloadcutoff / 100;
+      if (ocount[i] > cutoff) {
+        still++;
+        if (c.overload.count(i)) cout << "[adjusted] ";
+        cout << "disk " << i << " has " << ocount[i] << endl;
+      }
+    }
+    afteroverloadsum += (float)still / (float)ndisks;
+
+    // mean and variance of the final counts, per tenth of the disks
+    const int n = ndisks/10;
+    for (int g=0; g<10; g++) {
+      const int s = n*g;
+      float avg = 0.0;
+      for (int j=0; j<n; j++)
+        avg += ocount[j+s];
+      avg /= n;
+      float var = 0.0;
+      for (int j=0; j<n; j++)
+        var += (ocount[j+s] - avg) * (ocount[j+s] - avg);
+      var /= n;
+
+      tvar[g] += var;
+      tavg[g] += avg;
+    }
+  }
+
+  overloadsum /= times;
+  adjustsum /= times;
+  float avar = 0.0;
+  for (int j=0; j<10; j++) {
+    tvar[j] /= times;
+    tavg[j] /= times;
+    avar += tvar[j];
+  }
+  avar /= 10;
+  avar = sqrt(avar);
+  avar /= 5.5 * (float)pg_per_base;
+  afteroverloadsum /= times;
+  aslowdown /= times;
+
+  cout << overloadcutoff << "\t" << utilization
+       << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum
+       << "\t" << aslowdown << "\t" << avar << "\t-";
+
+  for (int i=0; i<10; i++)
+    cout << "\t" << tavg[i] << "\t" << tvar[i];
+  cout << endl;
+  return tvar[0];
+}
+
+
+// Runs the overload/slowdown test on a 3-level hierarchy for utilizations
+// of 50, 96, 97, 98 and 99.
+int main()
+{
+  static const int utilizations[] = { 50, 96, 97, 98, 99 };
+  const int count = sizeof(utilizations) / sizeof(utilizations[0]);
+
+  for (int i = 0; i < count; ++i)
+    go(3, utilizations[i]);
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively builds the subtree for level `h` of the hierarchy and adds
+// every bucket it creates to `c`.
+//
+//   h == 0: a UniformBucket over the next wid[0] disk ids (taken from
+//           `ndisks`, which is advanced).
+//   h != 0: a MixedBucket holding wid[h] children built for level h-1, each
+//           added with the child's own id and weight.
+//
+// Returns the bucket created for level `h`.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // interior level: children first, then the parent goes into the map
+    MixedBucket *parent = new MixedBucket(h+1);
+    for (int child = 0; child < wid[h]; ++child) {
+      Bucket *sub = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf level
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = wid[h]; left > 0; --left)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Builds the whole hierarchy described by `wid` (one fan-out per level,
+// leaf level first) and returns the id of its top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, ndisks);
+  return top->get_id();
+}
+
+
+// Builds a randomly shaped subtree and adds every bucket it creates to `c`.
+//
+// Each bucket gets a random fan-out in [2, wid].  With height == 0 the
+// bucket is a UniformBucket over that many fresh disk ids (taken from
+// `ndisks`, which is advanced).  Otherwise a value h is drawn from
+// [1, height], the bucket is a MixedBucket constructed with h+1, and its
+// children are built recursively with height h-1.
+//
+// Requires wid >= 2, because the fan-out is drawn with rand() % (wid-1).
+// Returns the bucket created.
+Bucket *make_random(Crush& c, int wid, int height, int& ndisks)
+{
+  // drawn first, so the sequence of rand() calls is fan-out, then h
+  const int fanout = rand() % (wid-1) + 2;
+
+  if (height != 0) {
+    const int h = rand() % height + 1;
+    MixedBucket *parent = new MixedBucket(h+1);
+    for (int child = 0; child < fanout; ++child) {
+      Bucket *sub = make_random(c, wid, h-1, ndisks);
+      parent->add_item(sub->get_id(), sub->get_weight());
+    }
+    c.add_bucket(parent);
+    return parent;
+  }
+
+  // leaf
+  Hash hash(123);
+  vector<int> disks;
+  for (int left = fanout; left > 0; --left)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+
+// Maps inputs first .. first+count-1 through `rule`, bumps the tally in
+// `ocount` for every device chosen, and reports any result that names the
+// same device twice on stdout as a "bad set".
+static void tally_placements(Crush& c, Rule& rule, int first, int count,
+                             int numrep, vector<int>& ocount)
+{
+  vector<int> v(numrep);
+  for (int x=first; x<first+count; x++) {
+    c.do_rule(rule, x, v);
+
+    bool bad = false;
+    for (int i=0; i<numrep; i++) {
+      ocount[v[i]]++;
+      for (int j=i+1; j<numrep; j++) {
+        if (v[i] == v[j])
+          bad = true;
+      }
+    }
+    if (bad)
+      cout << "bad set " << x << ": " << v << endl;
+  }
+}
+
+// Evaluates a single round of overload correction on a hierarchy with a
+// fan-out of 10 at each of `dep` levels (make_random() can be substituted
+// for make_hierarchy() to try an irregular tree).
+//
+// Each round maps numpg inputs without corrections, sets c.overload[i] to
+// 100/count for every disk whose count exceeds the midpoint between 100 and
+// `overloadcutoff`, and maps the same inputs again.
+//
+// Prints one tab-separated line (cutoff, 10000/cutoff, variance of the
+// corrected per-disk counts, fraction of disks overloaded, adjusted, and
+// still overloaded, and collisions per uncorrected choose) and returns the
+// variance.
+float go(int dep, int overloadcutoff)
+{
+  Crush c;
+
+  vector<int> wid;
+  for (int d=0; d<dep; d++)
+    wid.push_back(10);
+
+  int ndisks = 0;
+  const int root = make_hierarchy(c, wid, ndisks);
+
+  // rule: take the root, choose numrep, emit
+  const int numrep = 1;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int pg_per = 100;
+  const int numpg = pg_per*ndisks/numrep;
+
+  // aim for about `place` mappings per pass overall, at least one round
+  const int place = 1000000;
+  int times = place / numpg;
+  if (!times) times = 1;
+
+  vector<int> ocount(ndisks);
+
+  float tvar = 0;
+  float overloadsum = 0.0;
+  float adjustsum = 0.0;
+  float afteroverloadsum = 0.0;
+  int chooses = 0;   // device choices made by the uncorrected passes
+  int xs = 1;        // first input of the current round
+  for (int t=0; t<times; t++) {
+    c.overload.clear();
+    ocount.assign(ndisks, 0);
+
+    // pass 1: uncorrected placement
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    chooses += numpg*numrep;
+
+    int overloaded = 0;
+    int adjusted = 0;
+    for (int i=0; i<ndisks; i++) {
+      if (ocount[i] > overloadcutoff)
+        overloaded++;
+
+      if (ocount[i] > 100+(overloadcutoff-100)/2) {
+        adjusted++;
+        c.overload[i] = 100.0 / (float)ocount[i];
+      }
+      ocount[i] = 0;
+    }
+    overloadsum += (float)overloaded / (float)ndisks;
+    adjustsum += (float)adjusted / (float)ndisks;
+
+    // pass 2: placement with the corrections in place
+    tally_placements(c, rule, xs, numpg, numrep, ocount);
+    xs += numpg;
+
+    int still = 0;
+    for (int i=0; i<ndisks; i++) {
+      if (ocount[i] > overloadcutoff)
+        still++;
+    }
+    afteroverloadsum += (float)still / (float)ndisks;
+
+    // mean and variance of the corrected per-disk counts
+    float avg = 0.0;
+    for (size_t i=0; i<ocount.size(); i++)
+      avg += ocount[i];
+    avg /= ocount.size();
+    float var = 0.0;
+    for (size_t i=0; i<ocount.size(); i++)
+      var += (ocount[i] - avg) * (ocount[i] - avg);
+    var /= ocount.size();
+
+    tvar += var;
+  }
+
+  overloadsum /= times;
+  adjustsum /= times;
+  tvar /= times;
+  afteroverloadsum /= times;
+
+  int collisions = c.collisions[0] + c.collisions[1] + c.collisions[2] + c.collisions[3];
+  float crate = (float) collisions / (float)chooses;
+
+  cout << overloadcutoff << "\t" << (10000.0 / (float)overloadcutoff) << "\t" << tvar << "\t" << overloadsum << "\t" << adjustsum << "\t" << afteroverloadsum << "\t" << crate << endl;
+  return tvar;
+}
+
+
+int main()
+{
+  // Sweep the second argument of go() from 140 down to 105 in steps of 5.
+  // go() prints one tab-separated result row per call; the value it
+  // returns is only consumed by the disabled debug output below.
+  int d = 140;
+  while (d > 100) {
+    float var = go(3, d);
+    //cout << "## depth = " << d << endl;
+    //cout << d << "\t" << var << endl;
+    d -= 5;
+  }
+  return 0;
+}
--- /dev/null
+
+#include "include/types.h"
+#include "include/Distribution.h"
+#include "osd/OSDMap.h"
+
+
+Distribution file_size_distn; // weighted file-size classes, in kb; filled in by main()
+
+
+list<int> object_queue; // object sizes (kb) queued by get_object() when it splits a file
+int max_object_size = 1024*1024*100; // largest object size, in kb; main() overwrites it for each run
+
+off_t no; // number of files sampled by get_object(); testpgs() resets it to 0
+
+int get_object()  //kb
+{
+  // Refill the queue when it has run dry: sample a size class, derive a
+  // file size of roughly between half and all of that class, and split
+  // the file into objects of at most max_object_size kb.
+  if (object_queue.empty()) {
+    const int size_class = file_size_distn.sample();
+    no++;
+    int remaining = size_class/2 + (rand() % 100) * size_class/200 + 1;
+    //cout << "file " << remaining << endl;
+    for (; remaining > max_object_size; remaining -= max_object_size)
+      object_queue.push_back(max_object_size);
+    object_queue.push_back(remaining);
+  }
+
+  // Hand out the oldest queued object size.
+  const int next = object_queue.front();
+  object_queue.pop_front();
+  //cout << "object " << next << endl;
+  return next;
+}
+
+// Compute the mean and the population variance of the values in v.
+//   v   - samples to summarise (not modified)
+//   avg - receives the arithmetic mean
+//   var - receives the population variance (sum of squared deviations / N)
+// An empty vector yields avg == var == 0 instead of the NaN that a
+// division by a zero element count would produce.
+void getdist(vector<off_t>& v, float& avg, float& var)
+{
+  avg = 0.0;
+  var = 0.0;
+  if (v.empty())
+    return;
+
+  // Accumulate in double: sums of kb-sized values and of their squared
+  // deviations exceed what a float can hold without rounding.
+  double sum = 0.0;
+  for (vector<off_t>::size_type i = 0; i < v.size(); i++)
+    sum += v[i];
+  const double mean = sum / v.size();
+
+  double sq = 0.0;
+  for (vector<off_t>::size_type i = 0; i < v.size(); i++) {
+    const double d = v[i] - mean;
+    sq += d * d;
+  }
+
+  avg = (float)mean;
+  var = (float)(sq / v.size());
+}
+
+
+// Simulate filling n placement groups with pggb GB each by assigning
+// objects from get_object() to uniformly random PGs, then report the
+// mean and variance of the per-PG totals (kb) and the number of files used.
+void testpgs(int n,  // numpg
+             off_t pggb,
+             float& avg,
+             float& var,
+             off_t& numo
+             )
+{
+  const off_t target = (off_t)n * 1024LL*1024LL * (off_t)pggb;  //kb
+  vector<off_t> pgs(n);
+
+  no = 0;
+
+  // Keep placing objects until the target volume has been reached.
+  for (off_t placed = 0; placed < target; ) {
+    const off_t sz = get_object();
+    pgs[rand()%n] += sz;
+    placed += sz;
+  }
+
+  // Place the remaining pieces of the last file so no file is left partial.
+  while (!object_queue.empty())
+    pgs[rand()%n] += get_object();
+
+  numo = no;
+  //for (int i=0; i<n; i++) cout << pgs[i] << endl;
+
+  getdist(pgs, avg, var);
+  //cout << "avg " << avg << " var " << var << " dev " << sqrt(var) << endl;
+}
+
+
+
+int main()
+{
+ /*
+
+// File Size
+//cate count_mean size_mean
+1b -0.5 0.65434375 0
+1k 0.5 19.0758125 0.00875
+512K 1.5 35.6566 2.85875
+1M 2.5 27.7271875 25.0084375
+2M 3.5 16.63503125 20.8046875
+4M 4.5 106.82384375 296.053125
+8M 5.5 81.493375 335.77625
+16M 6.5 14.13553125 185.9775
+32M 7.5 2.176 52.921875
+256M 8.5 0.655938 47.8066
+512M 9.5 0.1480625 57.83375
+2G 10.5 0.020125 19.2888
+ */
+  // Size classes (kb) weighted by the count_mean column above; the 1b
+  // and 1k rows are merged into a single 1 kb class.
+  file_size_distn.add(1, 19.0758125+0.65434375);
+  file_size_distn.add(512, 35.6566);
+  file_size_distn.add(1024, 27.7271875);
+  file_size_distn.add(2*1024, 16.63503125);
+  file_size_distn.add(4*1024, 106.82384375);
+  file_size_distn.add(8*1024, 81.493375);
+  file_size_distn.add(16*1024, 14.13553125);
+  file_size_distn.add(32*1024, 2.176);
+  file_size_distn.add(256*1024, 0.655938);
+  file_size_distn.add(512*1024, 0.1480625);
+  file_size_distn.add(1*1024*1024, 0.020125);  // actually 2, but 32bit
+  file_size_distn.normalize();
+
+  // One output row per PG size (GB); within a row, one
+  // (file count, max object MB, std deviation) triple per object-size cap.
+  const int runs = 3;
+  for (int pggb = 1; pggb < 16; pggb++) {
+    cout << pggb;
+    for (int max = 1; max <= 1024; max *= 2) {
+      max_object_size = max*1024;
+
+      float avg;
+      float var[runs];
+      off_t no;
+      for (int r = 0; r < runs; r++)
+        testpgs(100, pggb, avg, var[r], no);
+
+      // Deviation from the variance averaged over the three runs.
+      float dev = sqrt((var[0]+var[1]+var[2])/3.0);
+      cout << "\t" << no << "\t" << max << "\t" << dev;
+    }
+    cout << endl;
+  }
+}
--- /dev/null
+
+
+#include "../crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Recursively build the subtree for level h: level 0 is a uniform bucket
+// holding wid[0] newly numbered disks, any higher level is a tree bucket
+// with wid[h] children of level h-1. Every bucket created is registered
+// with c and recorded in buckets[h]; ndisks is advanced per disk created.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  if (h != 0) {
+    // mixed
+    Bucket *parent = new TreeBucket(h+1);
+    c.add_bucket(parent);
+    for (int i=0; i<wid[h]; i++) {
+      Bucket *child = make_bucket(c, wid, h-1, buckets, ndisks);
+      parent->add_item(child->get_id(), child->get_weight());
+      child->set_parent(parent->get_id());
+    }
+    buckets[h].push_back(parent);
+    //cout << parent->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl;
+    return parent;
+  }
+
+  // uniform
+  Hash hash(123);
+  vector<int> disks;
+  for (int i=0; i<wid[h]; i++)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 1, disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  //cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+  buckets[h].push_back(leaf);
+  return leaf;
+}
+
+// Build the whole hierarchy described by wid (one width per level, the
+// last entry being the root level) and return the id of the root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, map< int, list<Bucket*> >& buckets, int& ndisks)
+{
+  const int top = wid.size()-1;
+  return make_bucket(c, wid, top, buckets, ndisks)->get_id();
+}
+
+
+// Run the rule for inputs 1..numpg and count, in ocount, how often each
+// device is chosen. A result that contains the same device twice is
+// reported on stdout as a "bad set".
+void place(Crush& c, Rule& rule, int numpg, int numrep, vector<int>& ocount)
+{
+  vector<int> result(numrep);
+
+  for (int x=1; x<=numpg; x++) {
+    c.do_rule(rule, x, result);
+    //cout << "v = " << result << endl;
+
+    bool dup = false;
+    for (int i=0; i<numrep; i++) {
+      ocount[result[i]]++;
+      // compare against every later replica to spot duplicates
+      for (int j=i+1; j<numrep; j++)
+        dup = dup || (result[i] == result[j]);
+    }
+
+    if (dup)
+      cout << "bad set " << x << ": " << result << endl;
+  }
+}
+
+
+int main()//float testmovement(int depth, int branching, int udisks)
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+  // Two-level hierarchy: 10 disks per uniform leaf bucket, 2 leaf
+  // buckets under the root.
+  int ndisks = 0;
+  vector<int> wid;
+  wid.push_back(10);
+  wid.push_back(2);
+
+  map< int, list<Bucket*> > buckets;
+  const int root = make_hierarchy(c, wid, buckets, ndisks);
+
+  // Add one extra, smaller uniform bucket (3 disks) beneath the last
+  // bucket recorded at level 1.
+  vector<int> disks;
+  while (disks.size() < 3)
+    disks.push_back(ndisks++);
+  UniformBucket *small = new UniformBucket(1, 0, 1, disks);
+  small->make_primes(h);
+  Bucket *over = buckets[1].back();
+  c.add_bucket(small);
+  //cout << " adding under " << over->get_id() << endl;
+  c.add_item(over->get_id(), small->get_id(), small->get_weight());
+
+  // rule: take root, choose numrep items, emit
+  const int numrep = 6;
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  //c.overload[10] = .1;
+
+  // Enough inputs that each disk is expected to be chosen pg_per times.
+  const int pg_per = 10000;
+  const int numpg = pg_per*ndisks/numrep;
+
+  vector<int> ocount(ndisks);
+
+  c.print(cout, root);
+
+  place(c, rule, numpg, numrep, ocount);
+
+  int disk = 0;
+  for (vector<int>::iterator p = ocount.begin(); p != ocount.end(); ++p, ++disk)
+    cout << "disk " << disk << " = " << *p << endl;
+
+  return 0;
+}
+
+
--- /dev/null
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock; // clock go() reads before and after the placement loop
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int numrep = 1; // replica count: argument of the CHOOSE step and size of the result vector in go()
+
+
+// Time one million rule evaluations against a single bucket of n disks.
+//   n      - number of disks (each with weight 1) added to the bucket
+//   bucket - bucket implementation: 0 uniform, 1 tree, 2 list, 3 straw
+// Returns the elapsed time as reported by g_clock, or -1.0 when bucket is
+// not one of the values above (previously that left the bucket pointer
+// uninitialized and then dereferenced it).
+double go(int n, int bucket)
+{
+  Hash h(73232313);  // only referenced by the disabled make_primes() call below
+
+  // crush
+  Crush c;
+
+  // buckets
+  int ndisks = 0;
+
+  Bucket *b = 0;
+  vector<int> items;
+  switch (bucket) {
+  case 0: b = new UniformBucket(1,0,10,items); break;
+  case 1: b = new TreeBucket(1); break;
+  case 2: b = new ListBucket(1); break;
+  case 3: b = new StrawBucket(1); break;
+  default:
+    cerr << "go: unknown bucket type " << bucket << endl;
+    return -1.0;
+  }
+
+  for (int d=0; d<n; d++)
+    b->add_item(ndisks++, 1);
+
+  //if (!bucket) ((UniformBucket*)b)->make_primes(h);
+
+  const int root = c.add_bucket(b);
+
+  // rule: take root, choose numrep items, emit
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int place = 1000000;
+
+  vector<int> v(numrep);
+  set<int> out;             // passed empty to do_rule
+  map<int,float> overload;  // passed empty to do_rule
+
+  // Only the placement loop itself is timed.
+  utime_t start = g_clock.now();
+
+  for (int x=1; x <= place; x++)
+    c.do_rule(rule, x, v, out, overload);
+
+  utime_t end = g_clock.now();
+
+  end -= start;
+  double el = (double)end;
+
+  //cout << "\t" << ndisks;
+
+  return el;
+}
+
+
+int main()
+{
+  // One row per bucket size (4, 8, ... 48 disks); columns are the
+  // elapsed times for bucket types 0..3 as returned by go().
+  int n = 4;
+  while (n <= 50) {
+    cout << n;
+    for (int type = 0; type < 4; ++type) {
+      const double el = go(n, type);
+      cout << "\t" << el;
+    }
+    cout << endl;
+    n += 4;
+  }
+}
--- /dev/null
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock; // clock go() reads before and after the placement loop
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int uniform = 10; // width of the leaf (uniform bucket) level built by go(); main() sets it to 8
+int branching = 10; // fan-out main() passes to go() for the interior levels; main() sets it to 8
+int buckettype = 0; // interior bucket class used by make_bucket(): 0 tree, 1 or 2 list, 3 straw
+int numrep = 1; // replica count for the CHOOSE rule step; main() sets it to 2
+
+// Recursively build the subtree for level h and register every bucket
+// with c.
+//   wid    - width of each level; wid[0] is the number of disks per leaf
+//   h      - level to build; 0 builds a uniform bucket of disks
+//   ndisks - running disk counter, advanced for every disk created
+// Interior buckets are chosen by the global buckettype (0 tree, 1 or 2
+// list, 3 straw). Any other value falls back to a tree bucket with a
+// warning; previously it left the pointer uninitialized before use.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h == 0) {
+    // uniform
+    Hash hash(123);  // only referenced by the disabled make_primes() call
+    vector<int> disks;
+    for (int i=0; i<wid[h]; i++)
+      disks.push_back(ndisks++);
+    UniformBucket *b = new UniformBucket(1, 0, 10, disks);
+    //b->make_primes(hash);
+    c.add_bucket(b);
+    //cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+    return b;
+  }
+
+  // mixed
+  Bucket *b = 0;
+  switch (buckettype) {
+  case 0:
+    b = new TreeBucket(h+1);
+    break;
+  case 1:
+  case 2:
+    b = new ListBucket(h+1);
+    break;
+  case 3:
+    b = new StrawBucket(h+1);
+    break;
+  default:
+    cerr << "make_bucket: unknown buckettype " << buckettype
+         << ", using a tree bucket" << endl;
+    b = new TreeBucket(h+1);
+    break;
+  }
+
+  // Children are attached first; the parent is registered afterwards.
+  for (int i=0; i<wid[h]; i++) {
+    Bucket *n = make_bucket(c, wid, h-1, ndisks);
+    b->add_item(n->get_id(), n->get_weight());
+  }
+  c.add_bucket(b);
+  //cout << h << " mixedbucket with " << wid[h] << endl;
+  return b;
+}
+
+// Build the hierarchy described by wid, starting from its last (root)
+// level, and return the id of the root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size()-1;
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+
+
+// Build a hierarchy of dep levels (leaf width `uniform`, every other
+// level `per` wide), then time one million rule evaluations against it
+// and return the elapsed time reported by g_clock.
+double go(int dep, int per)
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+  // buckets
+  int ndisks = 0;
+
+  // Level widths: leaves first, then dep-1 interior levels.
+  vector<int> wid;
+  wid.push_back(uniform);
+  int level = 1;
+  while (level < dep) {
+    wid.push_back(per);
+    ++level;
+  }
+
+  // Alternative hand-written layouts, kept disabled.
+  if (0) {
+    if (dep == 0)
+      wid.push_back(1000);
+    if (dep == 1) {
+      wid.push_back(1);
+      wid.push_back(1000);
+    }
+    if (dep == 2) {
+      wid.push_back(5);
+      wid.push_back(5);
+      wid.push_back(8);
+      wid.push_back(5);
+    }
+  }
+
+  const int root = make_hierarchy(c, wid, ndisks);
+
+  // rule: take root, choose numrep items, emit
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  const int place = 1000000;
+
+  vector<int> v(numrep);
+
+  utime_t start = g_clock.now();
+
+  set<int> out;
+  map<int,float> overload;
+
+  int x = 1;
+  while (x <= place) {
+    c.do_rule(rule, x, v, out, overload);
+    ++x;
+  }
+
+  utime_t end = g_clock.now();
+
+  end -= start;
+  //cout << "\t" << ndisks;
+
+  return (double)end;
+}
+
+
+int main()
+{
+  uniform = branching = 8;
+
+  cout << "// dep\tuniform\tbranch\tndisks" << endl;
+
+  // One row per hierarchy depth.
+  for (int d=2; d<=5; d++) {
+    cout << d;// << "\t" << branching;
+    cout << "\t" << uniform;
+    cout << "\t" << branching;
+
+    // n = branching^d
+    int n = 1;
+    for (int i=d; i>0; i--)
+      n *= branching;
+    cout << "\t" << n;
+
+    numrep = 2;
+
+    // crush: time each interior bucket type (type 2 is skipped)
+    for (buckettype = 0; buckettype <= 3; buckettype++) {
+      if (buckettype == 2)
+        continue;
+
+      if (buckettype == 0)
+        cout << "\ttree";
+      else if (buckettype == 1)
+        cout << "\tlist";
+      else if (buckettype == 3)
+        cout << "\tstraw";
+
+      //for (numrep = 1; numrep <= 3; numrep++) {
+      //cout << "\t" << numrep;
+
+      double el = go(d, branching);
+      cout << "\t" << el;
+    }
+
+    // rush: two-level layouts with n/uniform leaf buckets under one root
+
+    buckettype = 0;
+    cout << "\trush_T\t" << go(2, n/uniform);
+
+    buckettype = 1;
+    cout << "\trush_P\t" << go(2, n/uniform);
+
+    cout << endl;
+  }
+}
--- /dev/null
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock; // clock go() reads before and after the placement loop
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int branching = 10; // set to 8 by main(); not read by the other functions in this file
+bool linear = false; // make_bucket() builds ListBucket interior nodes when true, TreeBucket otherwise
+int numrep = 1; // replica count for the CHOOSE rule step; main() sets it to 2
+
+// Recursively build the subtree for level h and register every bucket
+// with c. Level 0 is a uniform bucket of wid[0] newly numbered disks;
+// higher levels are list buckets when the global `linear` is set and
+// tree buckets otherwise.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks)
+{
+  if (h != 0) {
+    // mixed
+    Bucket *parent = 0;
+    if (!linear)
+      parent = new TreeBucket(h+1);
+    else
+      parent = new ListBucket(h+1);
+
+    for (int i=0; i<wid[h]; i++) {
+      Bucket *child = make_bucket(c, wid, h-1, ndisks);
+      parent->add_item(child->get_id(), child->get_weight());
+    }
+    c.add_bucket(parent);
+    //cout << h << " mixedbucket with " << wid[h] << endl;
+    return parent;
+  }
+
+  // uniform
+  Hash hash(123);
+  vector<int> disks;
+  for (int i=0; i<wid[h]; i++)
+    disks.push_back(ndisks++);
+  UniformBucket *leaf = new UniformBucket(1, 0, 10, disks);
+  //leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  //cout << h << " uniformbucket with " << wid[h] << " disks" << endl;
+  return leaf;
+}
+
+// Build the hierarchy described by wid, starting from its last (root)
+// level, and return the id of the root bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks)
+{
+  const int top = wid.size()-1;
+  return make_bucket(c, wid, top, ndisks)->get_id();
+}
+
+
+// Build a two-level hierarchy of s disks (uniform leaves of 8 disks,
+// s/8 leaves under the root), time one million rule evaluations against
+// it, print "\t<ndisks>" and return the elapsed time from g_clock.
+//   s   - total number of disks; expected to be a multiple of 8
+//   dep - depth selector; only consulted by the disabled alternative
+//         layouts below. Defaults to 2, the value that used to be
+//         hard-coded, so both go(s) and the go(s, d) call made by main()
+//         compile.
+double go(int s, int dep = 2)
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+  // buckets
+  int root = -1;
+  int ndisks = 0;
+
+  vector<int> wid;
+  if (1) {
+    //for (int d=0; d<dep; d++)
+    wid.push_back(8);
+    wid.push_back(s/8);
+  }
+  // Alternative hand-written layouts, kept disabled.
+  if (0) {
+    if (dep == 0)
+      wid.push_back(1000);
+    if (dep == 1) {
+      wid.push_back(1);
+      wid.push_back(1000);
+    }
+    if (dep == 2) {
+      wid.push_back(5);
+      wid.push_back(5);
+      wid.push_back(8);
+      wid.push_back(5);
+    }
+  }
+
+  if (1) {
+    root = make_hierarchy(c, wid, ndisks);
+  }
+
+  // rule: take root, choose numrep items, emit
+  Rule rule;
+  rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+
+  int place = 1000000;
+
+  vector<int> v(numrep);
+
+  // Only the placement loop itself is timed.
+  utime_t start = g_clock.now();
+
+  for (int x=1; x <= place; x++)
+    c.do_rule(rule, x, v);
+
+  utime_t end = g_clock.now();
+
+  end -= start;
+  double el = (double)end;
+
+  cout << "\t" << ndisks;
+
+  return el;
+}
+
+
+int main()
+{
+  branching = 8;
+
+  numrep = 2;
+
+  // For 64, 512, 4096 and 32768 disks, time the tree ("t") and list
+  // ("p") variants of the same two-level hierarchy. go() takes the disk
+  // count as its only required argument and builds a depth-2 layout; the
+  // former go(s, d) call passed a second argument that the one-argument
+  // definition in this file does not accept.
+  for (int s = 64; s <= 32768; s *= 8) {
+    cout << "t";
+    linear = false;
+    double el = go(s);
+    cout << "\t" << el;
+
+    cout << "\tp";
+    linear = true;
+    el = go(s);
+    cout << "\t" << el;
+
+    cout << endl;
+  }
+}
--- /dev/null
+
+#include "../../common/Clock.h"
+#include "../crush.h"
+using namespace crush;
+
+
+Clock g_clock; // not referenced by main() in this file
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+int branching = 10; // not referenced by main() in this file
+bool linear = false; // not referenced by main() in this file
+int numrep = 1; // not referenced by main() in this file
+
+int main() {
+
+  // Only constructs a UniformBucket through a base-class pointer; the
+  // object is neither used nor freed before the program exits.
+  Bucket *b = new UniformBucket(1, 0);
+  //b = new TreeBucket(1);
+  return 0;
+}
+
--- /dev/null
+
+
+#include "../Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Print a vector of ints as "[a b c]": elements separated by a single
+// space, "[]" for an empty vector. Returns the stream for chaining.
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  out << "[";
+  for (vector<int>::iterator p = v.begin(); p != v.end(); ++p) {
+    if (p != v.begin())
+      out << " ";
+    out << *p;
+  }
+  out << "]";
+  return out;
+}
+
+
+int main()
+{
+  Hash h(73);
+
+  int ndisks = 0;
+  const int numrep = 3;
+
+  // Straw bucket with six items of weight 1, 1, 10, 10, 100 and 1000.
+  StrawBucket mb(1);
+  /*for (int i=0;i<10;i++)
+    mb.add_item(ndisks++, 10);
+  */
+  mb.add_item(ndisks++, 1);
+  mb.add_item(ndisks++, 1);
+  mb.add_item(ndisks++, 10);
+  mb.add_item(ndisks++, 10);
+  mb.add_item(ndisks++, 100);
+  mb.add_item(ndisks++, 1000);
+
+  vector<int> ocount(ndisks);
+
+  // Choose numrep items for each input and tally how often every item
+  // is picked.
+  vector<int> v(numrep);
+  int nplace = 0;
+  for (int x=1; x<1000000; x++) {
+    for (int r=0; r<numrep; r++) {
+      const int d = mb.choose_r(x, r, h);
+      v[r] = d;
+      ocount[d]++;
+      nplace++;
+    }
+    //cout << v << "\t" << endl;//ocount << endl;
+  }
+
+  // Report each item's count and its share of all placements.
+  int disk = 0;
+  for (vector<int>::iterator p = ocount.begin(); p != ocount.end(); ++p, ++disk) {
+    float f = *p / (float)nplace;
+    cout << "disk " << disk << " has " << *p << " " << f << endl;
+  }
+
+}
--- /dev/null
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Compute the mean and the population variance of the values in v.
+//   v   - samples to summarise (not modified)
+//   avg - receives the arithmetic mean
+//   var - receives the population variance (sum of squared deviations / N)
+// An empty vector yields avg == var == 0 instead of the NaN that a
+// division by a zero element count would produce.
+void getdist(vector<int>& v, float& avg, float& var)
+{
+  avg = 0.0;
+  var = 0.0;
+  if (v.empty())
+    return;
+
+  // Accumulate in double so large sums of squares are not rounded on
+  // every addition.
+  double sum = 0.0;
+  for (vector<int>::size_type i = 0; i < v.size(); i++)
+    sum += v[i];
+  const double mean = sum / v.size();
+
+  double sq = 0.0;
+  for (vector<int>::size_type i = 0; i < v.size(); i++) {
+    const double d = v[i] - mean;
+    sq += d * d;
+  }
+
+  avg = (float)mean;
+  var = (float)(sq / v.size());
+}
+
+int main()
+{
+  const int n = 50;
+
+  // a: n*n random increments spread over n bins
+  vector<int> a(n);
+  vector<int> b(n);
+  for (int k = n*n; k > 0; k--)
+    a[rand()%n]++;
+
+  float aavg, avar;
+  getdist(a, aavg, avar);
+
+  // b: seven times as many increments over the same number of bins
+  for (int k = 7*n*n; k > 0; k--)
+    b[rand()%n]++;
+
+  float bavg, bvar;
+  getdist(b, bavg, bvar);
+
+  cout << "a avg " << aavg << " var " << avar << endl;
+  cout << "b avg " << bavg << " var " << bvar << endl;
+
+  // c: element-wise product of a and b
+  vector<int> c(n);
+  for (int i = n-1; i >= 0; i--)
+    c[i] = a[i] * b[i];
+
+  float cavg, cvar;
+  getdist(c, cavg, cvar);
+
+  cout << "c avg " << cavg << " var " << cvar << endl;
+
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "client/SyntheticClient.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "msg/SimpleMessenger.h"
+
+#include "common/Timer.h"
+
+#ifndef DARWIN
+#include <envz.h>
+#endif // DARWIN
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Entry point: parse options, read the monitor map, start the messenger,
+// mount a Client, run a SyntheticClient workload on it to completion,
+// then unmount and shut down. Returns 0 on success and 1 when the
+// monitor map cannot be read.
+int main(int argc, char **argv, char *envp[]) {
+
+  //cerr << "cfuse starting " << myrank << "/" << world << endl;
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+  parse_syn_options(args);   // for SyntheticClient
+
+  // write args back into argc/argv (originally labelled "args for fuse")
+  vec_to_argv(args, argc, argv);
+
+  // load monmap
+  MonMap monmap;
+  int r = monmap.read(".ceph_monmap");
+  if (r < 0) {
+    // Checked explicitly rather than with assert(): the check must not
+    // disappear when the program is built with NDEBUG.
+    cerr << "unable to read .ceph_monmap (error " << r << ")" << endl;
+    return 1;
+  }
+
+  // start up network
+  rank.start_rank();
+
+  // start client
+  Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap);
+  client->init();
+
+  // start syntheticclient
+  SyntheticClient *syn = new SyntheticClient(client);
+
+  // mount the client before the workload starts
+  cout << "mounting" << endl;
+  client->mount();
+
+  cout << "starting syn client" << endl;
+  syn->start_thread();
+
+  // wait
+  syn->join_thread();
+
+  // unmount
+  client->unmount();
+  cout << "unmounted" << endl;
+  client->shutdown();
+
+  delete client;
+
+  // wait for messenger to finish
+  rank.wait();
+
+  return 0;
+}
+
--- /dev/null
+
+How Directory Committing Works:
+
+Each CDir has:
+ version - current version of directory
+ committing_version - which version was sent to stable storage
+ last_committed_version - last version to be safely stored
+
+Each Inode has:
+ parent_dir_version - what dir version i was in when i was dirtied. (*)
+
+ (*) note that if you change an inode, mark_dirty() again, even if it's already dirty!
+
+
+How committing works:
+
+A call to commit_dir(dir, context) will ensure that the _current_ version is stored safely on disk before the context is finished.
+
+When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_.
+
+
+
--- /dev/null
+
+Primary copy replication.
+
+Inodes:
+
+- The primary's list of replicas (cached_by) is inclusive at all times.
+- The primary's list never includes the local node.
+- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight.
+
+- Replicas can be created in two ways:
+ - via a Discover + DiscoverReply
+ - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.)
+
+
+Directories (and their dentries):
+
+- The primary has an open_by list that is inclusive at all times.
+- ..Never includes local node
+- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list.
\ No newline at end of file
--- /dev/null
+
+
+AUTHORITY
+
+The authority maintains a list of what nodes cache each inode.
+Additionally, each replica is assigned a serial (normally 0) to
+disambiguate multiple replicas of the same item (see below).
+
+ set<int> cached_by;
+ map<int, int> cached_by_serial;
+
+The cached_by set _always_ includes all nodes that cache the
+particular inode, but may additionally include nodes that used to
+cache it but no longer do. In those cases, an expire message should
+be in transit.
+
+
+REPLICA
+
+The replica maintains a notion of who it believes is the authority for
+each replicated inode. There are two possibilities:
+
+ - Ordinarily, this notion is correct.
+ - If the part of the file system in question was recently exported to
+ a new MDS, the inode's old authority is acting as a CACHEPROXY,
+ and will forward relevant messages on to the authority.
+
+When a replica is expired from cache, an expire is sent to the
+authority. The expire includes the serial number issued when the
+replica was originally created to disambiguate potentially concurrent
+replication activity.
+
+
+EXPORTS
+
+- The old authority suddenly becomes a replica. Its serial is well
+ defined. It also becomes a CACHEPROXY, which means its cached_by
+ remains defined (with an alternate meaning!). While a proxy, the
+ node will forward relevant messages from the replica to the
+ authority (but not the other way around--the authority knows all
+ replicas).
+
+- Once the export is acked, the old authority sends a
+ message to the replica notifying it of the new authority. As soon
+ as all replicas acknowledge receipt of this notice, the old authority
+ can cease CACHEPROXY responsibilities and become a regular replica.
+ At this point its cached_by is no longer defined.
+
+- Replicas always know who the authority for the inode is, OR they
+ know a prior owner acting as a CACHEPROXY. (They don't know which it
+ is.)
+
+
+CACHED_BY
+
+The authority always has an inclusive list of nodes who cache an item.
+As such it can confidently send updates to replicas for locking,
+invalidating, etc. When a replica is expired from cache, an expire is
+sent to the authority. If the serial matches, the node is removed
+from the cached_by list.
+
+
+
+
+
+SUBTREE AUTHORITY DELEGATION: imports versus hashing
+
+Authority is generally defined recursively: an inode's authority
+matches the containing directory, and a directory's authority matches
+the directory inode's. Thus the authority delegation chain can be
+broken/redefined in two ways:
+
+ - Imports and exports redefine the directory inode -> directory
+ linkage, such that the directory authority is explicitly specified
+ via dir.dir_auth:
+
+ dir.dir_auth == -1 -> directory matches its inode
+ dir.dir_auth >= 0 -> directory authority is dir.dir_auth
+
+ - Hashed directories redefine the directory -> inode linkage. In
+ non-hashed directories, inodes match their containing directory.
+ In hashed directories, each dentry's authority is defined by a hash
+ function.
+
+ inode.hash_seed == 0 -> inode matches containing directory
+ inode.hash_seed > 0 -> defined by hash(hash_seed, dentry)
+
+A directory's "containing_import" (bad name, FIXME) is either the
+import or hashed directory that is responsible for delegating a
+subtree. Note that the containing_import of a directory may be itself
+because it is an import, but it cannot be itself because it is hashed.
+
+Thus:
+
+ - Import and export operations' manipulation of dir_auth is
+ completely orthogonal to hashing operations. Hashing methods can
+ ignore dir_auth, except when they create imports/exports (and break
+ the inode<->dir auth linkage).
+
+ - Hashdirs act sort of like imports in that they bound an
+ authoritative region. That is, either hashdirs or imports can be
+ the key for nested_exports. In some cases, a dir may be both an
+ import and a hash.
+
+ - Export_dir won't export a hashdir. This is because it's tricky
+ (tho not necessarily impossible) due to the way nested_exports is
+ used with imports versus hashdirs.
+
+
+
+
+FREEZING
+
+There are two types of freezing:
+
+ - TREE: recursively freezes everything nested beneath a directory,
+ until an export or edge of cache is reached.
+ - DIR: freezes the contents of a single directory.
+
+Some notes:
+
+ - Occurs on the authoritative node only.
+
+ - Used for suspending critical operations while migrating authority
+ between nodes or hashing/unhashing directories.
+
+ - Freezes the contents of the cache such that items may not be added,
+ items cannot be auth pinned, and/or subsequently reexported. The
+ namespace of the affected portions of the hierarchy may not change.
+ The content of inodes and other orthogonal operations
+ (e.g. replication, inode locking and modification) are unaffected.
+
+Two states are defined: freezing and frozen. The freezing state is
+used while waiting for auth_pins to be removed. Once all auth_pins
+are gone, the state is changed to frozen. New auth_pins cannot be
+added while freezing or frozen.
+
+
+AUTH PINS
+
+An auth pin keeps a given item on the authoritative node until it is
+removed. The pins are tracked recursively, so that a subtree cannot
+be frozen if it contains any auth pins.
+
+If a pin is placed on a non-authoritative item, the item is allowed to
+become authoritative; the specific restriction is it cannot be frozen,
+which only happens during export-type operations.
+
+
+TYPES OF EXPORTS
+
+- Actual export of a subtree from one node to another
+- A rename between directories on different nodes exports the renamed
+_inode_. (If it is a directory, it becomes an export such that the
+directory itself does not move.)
+- A hash or unhash operation will migrate inodes within the directory
+either to or from the directory's main authority.
+
+EXPORT PROCESS
+
+
+
+
+HASHING
+
+- All nodes discover and open directory
+
+- Prep message distributes subdir inode replicas for exports so that
+ peers can open those dirs. This is necessary because subdirs are
+ converted into exports or imports as needed to avoid migrating
+ anything except the hashed dir itself. The prep is needed for the
+ same reasons it's important with exports: the inode authority must
+ always have the exported dir open so that it gets accurate dir
+ authority updates, and can keep the inode->dir_auth up to date.
+
+- MHashDir message distributes the directory contents.
+
+- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the
+ Prep messages won't be inclusive of all dirs, and the
+ imports/exports won't get set up properly.
+
+TODO
+readdir
+
+
+- subtrees stop at hashed dir. hashed dir's dir_auth follows parent
+ subtree, unless the dir is also an explicit import. thus a hashed
+ dir can also be an import dir.
+
+
+bananas
+apples
+blueberries
+green pepper
+carrots
+celery
+
+
+
+
--- /dev/null
+
+null dentries only exist
+ - on auth
+ - on replica, if they are xlock
\ No newline at end of file
--- /dev/null
+
+underlying client capabilities:
+
+- read + cache
+- read sync
+- write sync
+- write + buffer
+ (...potentially eventually augmented by byte ranges)
+
+whatever system of modes, tokens, etc. has to satisfy the basic
+constraint that no conflicting capabilities are ever in the
+hands of clients.
+
+
+questions:
+- is there any use to clients writing to a replica?
+ - reading, yes.. 100,000 open same file..
+
+
+------
+
+simplest approach:
+- all readers, writers go through authority
+- all open, close traffic at replicas forwarded to auth
+
+- fh state migrates with exports.
+
+
+
+--------
+
+less simple:
+- all writers go through authority
+ - open, close traffic fw
+- readers from any replica
+ - need token from auth
+- weird auth <-> replica <-> client interactions ensue!
+
+
+--------
+
+even more complex (and totally FLAWED, ignore this!)
+
+- clients can open a file with any replica (for read or write).
+- replica gets a read or write token from the primary
+ - primary thus knows if it's all read, all write, mixed, or none.
+- once replica has a token it can service as many clients (of given type(s)) as it wants.
+- on export, tokens are moved too.
+ - primary gives _itself_ a token too! much simpler.
+
+- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock
+- globally, the mode is controlled by the primary, based on the mixture of
+ read and write tokens issued
+
+
+
+- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can
+ request to read or write from the mds (which might twiddle the mode for performance
+ reasons.. e.g. lots of ppl rdwr but no actual reading)
+
+
+
+
+--------
+
+
--- /dev/null
+// -*- mode:C++; tab-width:4; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
--- /dev/null
+
+inodeno_t namespace
+ - relevant both for ino's, and for the (ino) input for Filer and object storage namespace...
+
+1 - root inode
+
+100+mds - mds log/journal
+200+mds - mds ino, fh allocation tables
+300+mds - mds inode files (for non-embedded inodes)
+
+1000+ - regular files and directories
\ No newline at end of file
--- /dev/null
+
+
+journal is distributed among different nodes. because authority changes over time, it's not immediately clear to a recovering node replaying the journal whether the data is "real" or not (it might be exported later in the journal).
+
+
+possibilities:
+
+
+ONE.. bloat the journal!
+
+- journal entry includes full trace of dirty data (dentries, inodes) up until import point
+ - local renames implicit.. cache is reattached on replay
+ - exports are a list of exported dirs.. which are then dumped
+ ...
+
+recovery phase 1
+- each entry includes full trace (inodes + dentries) up until the import point
+- cache during recovery is fragmented/dangling beneath import points
+- when export is encountered items are discarded (marked clean)
+
+recovery phase 2
+- import roots ping store to determine attachment points (if not already known)
+ - if it was imported during period, attachment point is already known.
+ - renames affecting imports are logged too
+- import roots discovered from other nodes, attached to hierarchy
+
+then
+- maybe resume normal operations
+- if recovery is a background process on a takeover mds, "export" everything to that node.
+
+
+-> journal contains lots of clean data.. maybe 5+ times bigger as a result!
+
+possible fixes:
+ - collect dir traces into journal chunks so they aren't repeated as often
+ - each chunk summarizes traces in previous chunk
+ - hopefully next chunk will include many of the same traces
+ - if not, then the entry will include it
+
+
+
+
+=== log entry types ===
+- all inode, dentry, dir items include a dirty flag.
+- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm
+
+ImportPath - log change in import path
+Import - log import addition (w/ path, dirino)
+
+InoAlloc - allocate ino
+InoRelease - release ino
+
+Inode - inode info, along with dentry+inode trace up to import point
+Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed)
+Link - (new) dentry + inode + trace
+
+
+-----------------------------
+
+TWO..
+- directories in store contain path at time of commit (relative to import, and root)
+- replay without attaching anything to hierarchy
+- after replay, directories pinged in store to attach to hierarchy
+
+-> phase 2 too slow!
+-> and nested dirs may reattach... that won't be apparent from journal.
+ - put just parent dir+dentry in dir store.. even worse on phase 2!
+
+
+THREE
+-
+
+
+
+
+
+
+
+metadata journal/log
+
+
+event types:
+
+chown, chmod, utime
+ InodeUpdate
+
+mknod, mkdir, symlink
+ Mknod .. new inode + link
+
+unlink, rmdir
+ Unlink
+
+rename
+ Link + Unlink (foreign)
+or Rename (local)
+
+link
+ Link .. link existing inode
+
+
+
+
+InodeUpdate
+DentryLink
+DentryUnlink
+InodeCreate
+InodeDestroy
+Mkdir?
--- /dev/null
+
+http://www.usenix.org/events/fast05/wips/slides/welch.pdf
+
+
+
+-- STATLITE
+ statlite(const char *filename, struct statlite *buf);
+ fstatlite(int fd, struct statlite *buf);
+ lstatlite(const char *filename, struct statlite *buf);
+
+ * file size, mtime are optionally not guaranteed to be correct
+ * mask field to specify which fields you need to be correct
+
+
+-- READDIR+
+
+ struct dirent_plus *readdirplus(DIR *dirp);
+ int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result);
+ struct dirent_lite *readdirlite(DIR *dirp);
+ int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result);
+
+ * plus returns lstat
+ * lite returns lstatlite
+
+
+-- lazy i/o integrity
+
+ O_LAZY to open(2)
+
+ * relax data coherency
+ * writes may not be visible until lazyio_propagate, fsync, close
+
+ lazyio_propagate(int fd, off_t offset, size_t count);
+ * my writes are safe
+
+ lazyio_synchronize(int fd, off_t offset, size_t count);
+ * i will see everyone else's propagated writes
+
+-- read/write non-serial vectors
+
+ ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count);
+ ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count);
+
+ * like readv/writev, but serial
+ *
+
+
+int lockg(int fd, int cmd, lgid_t *lgid)
+ group locks
+
+int openg(char *path, int mode, fh_t *handle);
+ portable file handle
+int sutoc(fh_t *fh);
\ No newline at end of file
--- /dev/null
+
+intro
+
+osd cluster map
+ requirements
+ desirable properties
+ (c)rush
+
+failure detection
+ distributed ping or heartbeat
+ central filter, notifier
+
+design
+ placement seed, class/superset, groups
+
+normal operation
+ reads
+ writes
+
+recovery
+ triggers: failed disk, or total cluster reorganization
+
+ notify
+ peering
+ pull
+ push
+ clean
+
+writes during recovery
+
+graceful data loss + recovery?
+
+
+
+
+
+
--- /dev/null
+
+
+SOME GENERAL REQUIREMENTS
+
+- cluster expansion:
+ - any or all of the replicas may move to new OSDs.
+
+- cluster map may change frequently
+ - map change should translate into pending replication/migration
+ state quickly (or better yet, instantly), so that we could push
+ through a series of (say, botched) maps quickly and be fine, so long
+ as the final map is correct.
+
+- ideally, unordered osd<->osd, client<->osd communication
+ (mds<->mds, client<->mds communication is ordered, but file i/o
+ would be too slow that way?)
+
+
+
+
+PRIMARY ONLY PICTURE
+
+let's completely ignore replication for a while, and see how
+complicated the picture needs to be to reliably support cluster expansion.
+
+typedef __uint64_t version_t;
+
+
+per-Object metadata:
+- version #. incremented when an object is modified.
+ e.g. version_t version;
+- on primary, keep list of stray replicas
+ e.g. map<int,version_t> stray_replicas; // osds w/ stray replicas
+ includes old primary osd(s), until deletion is confirmed. used while rg
+ is importing.
+
+
+per-RG metadata
+- object list. well, a method to fetch it by querying a collection or whatever.
+- negative <object,version> list
+ e.g. map<object_t, version_t> deleted_objects;
+ - used to enumerate deleted objects, when in "importing" state.
+- a RG "state" (enum/int)
+
+
+
+
+
+
+Normal RG state:
+- role=primary
+ clean - i am primary, all is well. no stray copies. i can
+ discard my negative object list, since my local
+ object store tells me everything.
+
+
+After a map change:
+- new primary
+ undef - initially; i don't know RG exists.
+- old primary
+ homeless - i was primary, still have unmolested data. new primary is not yet migrating
+ (presumably it's state=undef.) i need to contact new primary and tell them
+ this RG exists.
+
+- new primary
+ importing - i am migrating data from old primary. keep negative dir entries for deletions.
+ write locally. proxy reads (force immediate migration). do whole objects
+ initially (on write, block until i migrate the object). later we can do
+ sub-object state (where "live" object data is spread across new/old primaries..
+- old primary
+ exporting - primary is migrating my data.
+ undef - when it finishes. (i will forget this RG existed.)
+
+
+After a second map change (scenario 1):
+ as above, if we were clean again.
+
+After a second map change (scenario 2):
+ we weren't clean yet.
+- new primary
+ undef - initially (until i learn RG exists)
+- old primary
+ importing - i'm still migrating from old old primary
+- old old primary
+ exporting - ...
+- old primary
+?? importing+exporting - proxy reads as before. continue migrating from old old primary.
+
+
+After a second map change (scenario 3):
+ we weren't clean yet, and old old primary is also new primary
+- new primary (and old old primary)
+ exporting - change state to importing. be sure to compare object versions, and neg dir
+ entries (as we always should do, really!).
+- old primary
+ importing - note that the old import source matches new primary, and change
+ state to exporting, and stop importing. (unlike scenario 2)
+
+-> this approach could mean that a series of fast map changes could
+ force data to migrate down a "chain" of old primaries to reach the
+ new one. maybe old primary should go from importing -> exporting,
+ and pass along old old primary id to new primary such that the
+ import is a many-to-one thing, instead of one-to-one. version
+ numbers and neg entries will make it easy to pick out correct versions.
+
+
+
+For the importing process on a given RG:
+
+- metadata for each source
+ - each source has a state:
+ 'starting' - don't know anything about source yet. query source!
+ this probably induces the source to change from
+ 'homeless' or something similar to 'exporting'.
+ 'importing' - i've fetched the source's object list (and neg
+ object list). i'm busy reading them! these lists
+ will shrink as the process continues. after i fetch
+ an object, i will erase it from the source.
+ (object metadata will include stray copy info
+ until i confirm that it's removed.)
+ 'finishing' - i've read all my data, and i'm telling the old person
+ to discard any remaining RG metadata (RG contents
+ should already be gone)
+ - unmigrated object list
+ - migrated but not deleted object list
+ - stray osd is also listed in per-object MD during this stage
+ - negative object list
+ - i can remove these items if i see a newer object version (say,
+ from a different import source or something).
+ - i can remove any local objects or ignore imported ones if it is
+ older than deleted version
+
+- the lists should be sets or otherwise queryable so that while i'm
+ importing and a real op comes through I can quickly determine if a
+ given object_id is pending migration etc or if my local store is to
+ be trusted.
+
+
+
+
+
+SOME CODE BITS
+
+
+typedef __uint64_t version_t;
+class Object { // per-object metadata sketch (see "per-Object metadata" earlier in these notes)
+ version_t version; // version #; incremented when the object is modified
+ map<int, version_t> stray_replicas; // osds w/ stray replicas; kept on the primary until deletion is confirmed
+};
+
+
+class ReplicaGroup { // per-RG metadata sketch (see "per-RG metadata" earlier in these notes)
+ int enumerate_objects(list<object_t>& ls); // fetch the RG's object list into ls; presumably by querying a collection -- TODO confirm
+
+ int state; // one of the RG_STATE_* values defined just below
+
+ // for unstable states,
+ map<object_t, version_t> deleted_objects; // locally; negative <object,version> list, used while importing
+ map<int, RGExporter_t> exporters; // importing from these guys.
+};
+
+// primary
+#define RG_STATE_CLEAN 1 // i am primary, all is well; no stray copies
+#define RG_STATE_IMPORTING 2 // pulling data
+
+// non-primary
+#define RG_STATE_HOMELESS 5 // old primary; new primary not yet
+ // notified; not yet exporting.
+#define RG_STATE_EXPORTING 6 // a newer primary is extracting my
+ // data.
+
+
+struct RGExporter_t { // per-import-source bookkeeping sketch
+ int import_state; // per-source state: 'starting', 'importing' or 'finishing' as described in the importing-process notes
+
+ set<object_t> remaining_objects; // remote object list (not yet migrated)
+ set<object_t> stray_objects; // imported but not deleted.
+
+};
+
+
+
+
+
+----
+all crap from here on down
+
+
+
+
+REPLICAS
+-
+
+
+
+
+OSD STATES
+- primary, up to date.
+- replica, up to date.
+
+- primary, proxy to old primary (primaries?)
+
+- replica, not up to date.
+
+
+REPLICATION STUFF
+
+Per-RG metadata
+- primary
+ - per-replica state: clean, catching up?
+- replica
+
+Per-object metadata
+- primary and replica
+ - version number/mtime
+ - rg (reverse indexed)
+- primary
+ - replication level and state.
+ - committed to memory and/or disk, on which replicas (#1, #2, etc.)
+- replica
+
+
+
+
+
+->
\ No newline at end of file
--- /dev/null
+
+
+quick performance test 2005-05-11. fakemds, 100x100, asdf/asdf, debug 13
+ -g marshalling
+real 3m8.697s
+user 2m53.282s
+sys 0m6.291s
+
+real 3m3.337s
+user 2m49.467s
+sys 0m6.243s
+
+ -g no marshalling
+real 2m1.464s
+user 1m42.680s
+sys 0m8.128s
+
+real 1m49.469s
+user 1m34.523s
+sys 0m6.410s
+
+ -O3 marshalling
+real 1m29.833s
+user 1m11.474s
+sys 0m7.588s
+
+real 1m9.439s
+user 0m56.071s
+sys 0m5.643s
+
+
+ -O3 no marshalling
+real 1m2.739s
+user 0m46.578s
+sys 0m7.882s
+
--- /dev/null
+
+// stable states // ------auth----- -----replica-----   (the "2" variants below reuse the value of their base state)
+#define LOCK_SYNC 0 // R . / . . . WB same ... for stat()
+#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync()
+#define LOCK_RDONLY 2 // R . / RC R . . same
+#define LOCK_MIXED 3 // . . / . R W . same
+#define LOCK_WRONLY 4 // . . / . . W WB same
+
+// transition states
+#define LOCK_GSYNCR 8 // R . / RC . . . same
+#define LOCK_GSYNCMW 9 // . . / RC . . WB same
+#define LOCK_GSYNCMW2 9 // . . / RC . . WB same
+
+#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . .
+#define LOCK_GLOCKMW 7 // . . / RC . . . same
+
+#define LOCK_GRDONLYM 10 // . . / . R . . same
+#define LOCK_GRDONLYM2 10 // --- . . / . R . .
+#define LOCK_GRDONLYW 11 // . . / . . . . same
+#define LOCK_GRDONLYW2 11 // --- . . / . . . .
+#define LOCK_GRDONLYS 12 // R . / RC . . . same
+#define LOCK_GRDONLYL 13 // R . / RC . . . ---
+
+#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . .
+#define LOCK_GMIXEDR2 15 // --- . . / . R . .
+#define LOCK_GMIXEDW 16 // . . / . . W . same
+#define LOCK_GMIXEDW2 16 // --- . . / . . W .
+#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . .   NOTE(review): same value (16) as LOCK_GMIXEDW -- confirm this is intended
+#define LOCK_GMIXEDS2 16 // --- . . / . . . .
+#define LOCK_GMIXEDL 17 // R . / . . . . ---
+
+#define LOCK_GWRONLYR 18 // R . / . . . . same
+#define LOCK_GWRONLYR2 18 // --- . . / . . . .
+#define LOCK_GWRONLYM 19 // . . / . . . . same
+#define LOCK_GWRONLYM2 19 // --- . . / . . . .
+#define LOCK_GWRONLYS 20 // R . / . . . WB same
+#define LOCK_GWRONLYS2 20 // --- . . / . . . .
+#define LOCK_GWRONLYL 21 // NOTE(review): capability columns were never filled in for this state
+
--- /dev/null
+
+- mds0 triggers shutdown by sending a shutdown_start to all nodes.
+
+- from here on out, all client requests are discarded (unless they are a file close?)
+
+- each mds checks for outstanding inter-mds transactions, e.g. imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0
+
+- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty)
+
+- when the cache is empty, send shutdown_done to mds0 and exit.
+
+- mds0 exits when all mdss have finished.
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "Allocator.h"
+#include "Ebofs.h"
+
+
+#undef dout  // drop any earlier dout definition before installing the allocator-specific one below
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
+
+
+/*
+ * Debug helper: log (at level 0) every extent in the free-bucket tables,
+ * the limbo table and the alloc table, checking consistency on the way:
+ *  - every extent has a non-zero length,
+ *  - no free/limbo extent is already contained in the running union,
+ *  - the free-bucket lengths sum to fs->free_blocks.
+ * Any violation trips an assert.
+ */
+void Allocator::dump_freelist()
+{
+  interval_set<block_t> seen;  // union of all free + limbo extents seen so far
+  block_t n = 0;               // blocks counted in the free buckets only (limbo excluded)
+
+  // index EBOFS_NUM_FREE_BUCKETS is a pseudo-bucket that stands for limbo_tab
+  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab;
+    if (b < EBOFS_NUM_FREE_BUCKETS) {
+      tab = fs->free_tab[b];
+      dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl;
+    } else {
+      tab = fs->limbo_tab;
+      dout(0) << "dump limbo " << tab->get_num_keys() << endl;
+    }
+
+    if (tab->get_num_keys() == 0)
+      continue;
+
+    Table<block_t,block_t>::Cursor cursor(tab);
+    // find() must run outside assert(): with NDEBUG the assert expression is
+    // not evaluated and the cursor would never be positioned.
+    int r = tab->find(0, cursor);
+    assert(r >= 0);
+    while (1) {
+      dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl;
+      assert(cursor.current().value > 0);
+
+      if (b < EBOFS_NUM_FREE_BUCKETS)
+        n += cursor.current().value;
+
+      if (seen.contains( cursor.current().key, cursor.current().value )) {
+        dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl;
+      }
+      assert(!seen.contains( cursor.current().key, cursor.current().value ));
+      seen.insert( cursor.current().key, cursor.current().value );
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+
+  assert(n == fs->free_blocks);
+  dout(0) << "dump combined freelist is " << seen << endl;
+
+  // alloc_tab: start -> (length, refcount)
+  if (fs->alloc_tab->get_num_keys() > 0) {
+    Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+    int r = fs->alloc_tab->find(0, cursor);  // kept out of assert() for the same NDEBUG reason
+    assert(r >= 0);
+    while (1) {
+      dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref "
+              << cursor.current().value.second
+              << endl;
+      assert(cursor.current().value.first > 0);
+
+      if (cursor.move_right() <= 0) break;
+    }
+  }
+}
+
+
+/*
+ * Search one free bucket for an extent of at least `num` blocks, starting
+ * at position `near`.
+ *
+ *  dir == DIR_FWD  : scan rightwards only
+ *  dir == DIR_BACK : scan leftwards only
+ *  dir == DIR_ANY  : scan rightwards, then leftwards if nothing was found
+ *
+ * On success the extent is copied into `ex` and 0 is returned; the extent
+ * is NOT removed from the table.  Returns -1 if nothing suitable exists.
+ */
+int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir)
+{
+  Table<block_t,block_t>::Cursor cursor(fs->free_tab[bucket]);
+  bool hit = false;
+
+  // rightward scan
+  const bool scan_fwd = (dir == DIR_ANY || dir == DIR_FWD);
+  if (scan_fwd && fs->free_tab[bucket]->find( near, cursor ) >= 0) {
+    for (;;) {
+      if (cursor.current().value >= num) {
+        hit = true;
+        break;
+      }
+      if (cursor.move_right() <= 0)
+        break;
+    }
+  }
+
+  // leftward scan, only if we are still empty-handed
+  const bool scan_back = (dir == DIR_ANY || dir == DIR_BACK);
+  if (!hit && scan_back) {
+    fs->free_tab[bucket]->find( near, cursor );  // re-seat the cursor at `near`
+
+    while (cursor.move_left() >= 0) {
+      if (cursor.current().value >= num) {
+        hit = true;
+        break;
+      }
+    }
+  }
+
+  if (!hit)
+    return -1;
+
+  ex.start = cursor.current().key;
+  ex.length = cursor.current().value;
+  return 0;
+}
+
+/*
+ * Allocate up to `num` blocks close to `near` and describe them in `ex`.
+ *
+ * `near` may be one of the sentinels NEAR_LAST / NEAR_LAST_FWD, in which
+ * case last_pos is used (NEAR_LAST_FWD additionally tries a forward-only
+ * search first, then a backward one).
+ *
+ * Returns the number of blocks actually allocated: `num` when a contiguous
+ * extent was found, otherwise the length of a smaller partial extent.
+ * If nothing at all can be found the function asserts (and returns -1 when
+ * asserts are compiled out).
+ */
+int Allocator::allocate(Extent& ex, block_t num, block_t near)
+{
+  // resolve the `near` sentinels
+  int dir = DIR_ANY;
+  if (near == NEAR_LAST_FWD) {
+    dir = DIR_FWD;
+    near = last_pos;
+  } else if (near == NEAR_LAST) {
+    near = last_pos;
+  }
+
+  // pass 1 (and pass 2 if the first one was forward-only): full-size extent
+  for (;;) {
+    for (int bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; ++bucket) {
+      if (find(ex, bucket, num, near, dir) < 0)
+        continue;
+
+      // take the whole extent off the freelist; surplus is handed back below
+      fs->free_tab[bucket]->remove( ex.start );
+      fs->free_blocks -= ex.length;
+
+      if (ex.length > num) {
+        if (ex.start >= near) {
+          // extent lies at/after near: keep the head, return the tail
+          Extent tail;
+          tail.start = ex.start + num;
+          tail.length = ex.length - num;
+          ex.length = num;
+          _release_loner(tail);
+        } else if (ex.start + ex.length - num <= near) {
+          // extent lies well before near: keep the last `num` blocks
+          Extent head;
+          head.start = ex.start;
+          head.length = ex.length - num;
+          ex.start += head.length;
+          ex.length -= head.length;
+          assert(ex.length == num);
+          _release_loner(head);
+        } else {
+          // near falls inside the extent: carve `num` blocks starting at near
+          Extent head, tail;
+          head.start = ex.start;
+          head.length = near - ex.start;
+          tail.start = near + num;
+          tail.length = ex.length - head.length - num;
+          ex.start = near;
+          ex.length = num;
+          _release_loner(head);
+          _release_loner(tail);
+        }
+      }
+
+      dout(20) << "allocate " << ex << " near " << near << endl;
+      last_pos = ex.end();
+      if (g_conf.ebofs_cloneable) {
+        alloc_inc(ex);
+      }
+      return num;
+    }
+
+    // only a forward-only search earns a second (backward) attempt
+    if (dir != DIR_FWD)
+      break;
+    dir = DIR_BACK;
+  }
+
+  // no contiguous extent: settle for the biggest piece we can find,
+  // halving the request each round
+  for (block_t want = num/2; want >= 1; want /= 2) {
+    const int bucket = pick_bucket(want);
+    if (find(ex, bucket, want, near) < 0)
+      continue;
+
+    assert(ex.length < num);
+
+    fs->free_tab[bucket]->remove(ex.start);
+    fs->free_blocks -= ex.length;
+    last_pos = ex.end();
+    dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl;
+    if (g_conf.ebofs_cloneable) {
+      alloc_inc(ex);
+    }
+    return ex.length;
+  }
+
+  dout(1) << "allocate failed, fs completely full!  " << fs->free_blocks << endl;
+  assert(0);
+  return -1;
+}
+
+/*
+ * Park a released extent in the in-memory limbo set and account for it in
+ * fs->limbo_blocks.  The extent must be non-empty.  Always returns 0.
+ */
+int Allocator::_release_into_limbo(Extent& ex)
+{
+  dout(10) << "_release_into_limbo " << ex << endl;
+  dout(10) << "limbo is " << limbo << endl;
+
+  assert(ex.length > 0);
+
+  limbo.insert(ex.start, ex.length);
+  fs->limbo_blocks += ex.length;
+
+  return 0;
+}
+
+/*
+ * Release an extent.  With cloning enabled this drops one reference via
+ * alloc_dec() and returns its result; otherwise the extent goes directly
+ * into limbo and 0 is returned.
+ */
+int Allocator::release(Extent& ex)
+{
+  if (!g_conf.ebofs_cloneable) {
+    _release_into_limbo(ex);
+    return 0;
+  }
+
+  return alloc_dec(ex);
+}
+
+/*
+ * Copy every extent of the in-memory limbo set into fs->limbo_tab, then
+ * empty the in-memory set.  Block counters are left untouched.
+ * Always returns 0.
+ */
+int Allocator::commit_limbo()
+{
+  dout(20) << "commit_limbo" << endl;
+
+  map<block_t,block_t>::iterator it = limbo.m.begin();
+  while (it != limbo.m.end()) {
+    fs->limbo_tab->insert(it->first, it->second);
+    ++it;
+  }
+
+  limbo.clear();
+  return 0;
+}
+
+/*
+ * Move every extent recorded in fs->limbo_tab back onto the freelist
+ * (merging with free neighbours), decrementing fs->limbo_blocks as it
+ * goes, and finally clear limbo_tab.  Always returns 0.
+ */
+int Allocator::release_limbo()
+{
+  if (fs->limbo_tab->get_num_keys() > 0) {
+    Table<block_t,block_t>::Cursor cursor(fs->limbo_tab);
+    fs->limbo_tab->find(0, cursor);
+
+    do {
+      Extent ex(cursor.current().key, cursor.current().value);
+      dout(20) << "release_limbo  ex " << ex << endl;
+
+      fs->limbo_blocks -= ex.length;
+      _release_merge(ex);
+    } while (cursor.move_right() > 0);
+  }
+
+  fs->limbo_tab->clear();
+  return 0;
+}
+
+
+
+/*
+int Allocator::_alloc_loner_inc(Extent& ex)
+{
+ Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+ if (fs->alloc_tab->find( ex.start, cursor )
+ == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
+ assert(cursor.current().value.first == ex.length);
+ pair<block_t,int>& v = cursor.dirty_current_value();
+ v.second++;
+ dout(10) << "_alloc_loner_inc " << ex << " "
+ << (v.second-1) << " -> " << v.second
+ << endl;
+ } else {
+ // insert it, @1
+ fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+ dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl;
+ }
+ return 0;
+}
+
+int Allocator::_alloc_loner_dec(Extent& ex)
+{
+ Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+ if (fs->alloc_tab->find( ex.start, cursor )
+ == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
+ assert(cursor.current().value.first == ex.length);
+ if (cursor.current().value.second == 1) {
+ dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl;
+ fs->alloc_tab->remove( cursor.current().key );
+ } else {
+ pair<block_t,int>& v = cursor.dirty_current_value();
+ --v.second;
+ dout(10) << "_alloc_loner_dec " << ex << " "
+ << (v.second+1) << " -> " << v.second
+ << endl;
+ }
+ } else {
+ assert(0);
+ }
+ return 0;
+}
+*/
+
+
+/*
+ * Add one reference to every block of `ex` in fs->alloc_tab
+ * (start -> (length, refcount)).
+ *
+ * Parts of `ex` not yet present are inserted with refcount 1; existing
+ * entries that only partly overlap `ex` are split so that the overlapping
+ * piece can carry refcount+1.  `ex` is taken by value and consumed from
+ * the left as the walk proceeds.  Always returns 0.
+ */
+int Allocator::alloc_inc(Extent ex)
+{
+  dout(10) << "alloc_inc " << ex << endl;
+
+  // empty table?
+  if (fs->alloc_tab->get_num_keys() == 0) {
+    // easy.
+    fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+    dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl;
+    return 0;
+  }
+
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+  // try to move to left (to check for overlap)
+  int r = fs->alloc_tab->find( ex.start, cursor );
+  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
+      cursor.current().key > ex.start) {
+    r = cursor.move_left();
+    dout(10) << "alloc_inc move_left r = " << r << endl;
+  }
+
+  while (1) {
+    dout(10) << "alloc_inc loop at " << cursor.current().key
+             << "~" << cursor.current().value.first
+             << " ref " << cursor.current().value.second
+             << endl;
+
+    // too far left?
+    if (cursor.current().key < ex.start &&
+        cursor.current().key + cursor.current().value.first <= ex.start) {
+      // adjacent?  (only an entry with refcount 1 can simply be extended)
+      bool adjacent = false;
+      if (cursor.current().key + cursor.current().value.first == ex.start &&
+          cursor.current().value.second == 1)
+        adjacent = true;
+
+      // no overlap.
+      r = cursor.move_right();
+      dout(10) << "alloc_inc move_right r = " << r << endl;
+
+      // at end?
+      if (r <= 0) {
+        // hmm!
+        if (adjacent) {
+          // adjust previous entry
+          cursor.move_left();
+          pair<block_t,int> &v = cursor.dirty_current_value();
+          v.first += ex.length;    // yay!
+          dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl;
+        } else {
+          // insert at end, finish.  (reuses r instead of shadowing it)
+          r = fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
+          dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl;
+          //dump_freelist();
+        }
+        return 0;
+      }
+    }
+
+    if (cursor.current().key > ex.start) {
+      // gap.
+      //     oooooo
+      // nnnnn.....
+      block_t l = MIN(ex.length, cursor.current().key - ex.start);
+
+      fs->alloc_tab->insert(ex.start, pair<block_t,int>(l,1));
+      dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl;
+      ex.start += l;
+      ex.length -= l;
+      if (ex.length == 0) break;
+      fs->alloc_tab->find( ex.start, cursor );
+    }
+    else if (cursor.current().key < ex.start) {
+      block_t end = cursor.current().value.first + cursor.current().key;
+
+      if (end <= ex.end()) {
+        // single split
+        // oooooo
+        //    nnnnn
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.first = ex.start - cursor.current().key;
+        int ref = v.second;
+
+        block_t l = end - ex.start;
+        fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, 1+ref));
+
+        dout(10) << "alloc_inc   " << ex.start << "~" << l
+                 << " " << ref << " -> " << ref+1
+                 << " (right split)" << endl;
+
+        ex.start += l;
+        ex.length -= l;
+        if (ex.length == 0) break;
+        fs->alloc_tab->find( ex.start, cursor );
+
+      } else {
+        // double split, finish.
+        // -------------
+        //    ------
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.first = ex.start - cursor.current().key;
+        int ref = v.second;
+
+        fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, 1+ref));
+
+        // block_t, not int: a block count must not be narrowed
+        block_t rl = end - ex.end();
+        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
+
+        dout(10) << "alloc_inc   " << ex
+                 << " " << ref << " -> " << ref+1
+                 << " (double split finish)"
+                 << endl;
+
+        break;
+      }
+    }
+    else {
+      assert(cursor.current().key == ex.start);
+
+      if (cursor.current().value.first <= ex.length) {
+        // inc.
+        // oooooo
+        // nnnnnnnn
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.second++;
+        dout(10) << "alloc_inc   " << ex.start << "~" << cursor.current().value.first
+                 << " " << cursor.current().value.second-1 << " -> "
+                 << cursor.current().value.second
+                 << " (left split)" << endl;
+        ex.start += v.first;
+        ex.length -= v.first;
+        if (ex.length == 0) break;
+        cursor.move_right();
+      } else {
+        // single split, finish.
+        // oooooo
+        // nnn
+        block_t l = cursor.current().value.first - ex.length;
+        int ref = cursor.current().value.second;
+
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.first = ex.length;
+        v.second++;
+
+        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
+
+        dout(10) << "alloc_inc   " << ex
+                 << " " << ref << " -> " << ref+1
+                 << " (left split finish)"
+                 << endl;
+
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+/*
+ * Drop one reference from every block of `ex` in fs->alloc_tab
+ * (start -> (length, refcount)).
+ *
+ * Every block of `ex` must already be covered by the table (gaps trip an
+ * assert).  Entries that only partly overlap `ex` are split; pieces whose
+ * refcount was 1 are removed from the table and handed to
+ * _release_into_limbo().  `ex` is taken by value and consumed from the
+ * left as the walk proceeds.  Always returns 0.
+ */
+int Allocator::alloc_dec(Extent ex)
+{
+  dout(10) << "alloc_dec " << ex << endl;
+
+  // the walk below dereferences the cursor, so there must be an entry
+  // (the previous ">= 0" check could never fail)
+  assert(fs->alloc_tab->get_num_keys() > 0);
+
+  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
+
+  // try to move to left (to check for overlap)
+  int r = fs->alloc_tab->find( ex.start, cursor );
+  dout(10) << "alloc_dec find r = " << r << endl;
+
+  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
+      cursor.current().key > ex.start) {
+    r = cursor.move_left();
+    dout(10) << "alloc_dec move_left r = " << r << endl;
+
+    // too far left?
+    if (cursor.current().key < ex.start &&
+        cursor.current().key + cursor.current().value.first <= ex.start) {
+      // no overlap.
+      dump_freelist();
+      assert(0);
+    }
+  }
+
+  while (1) {
+    dout(10) << "alloc_dec ? " << cursor.current().key
+             << "~" << cursor.current().value.first
+             << " " << cursor.current().value.second
+             << ", ex is " << ex
+             << endl;
+
+    assert(cursor.current().key <= ex.start);  // no gap allowed.
+
+    if (cursor.current().key < ex.start) {
+      block_t end = cursor.current().value.first + cursor.current().key;
+
+      if (end <= ex.end()) {
+        // single split
+        // oooooo
+        //    -----
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.first = ex.start - cursor.current().key;
+        int ref = v.second;
+        dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
+                 << " " << ref
+                 << " shortened left bit of single" << endl;
+
+        block_t l = end - ex.start;
+        if (ref > 1) {
+          fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, ref-1));
+          dout(10) << "alloc_dec . " << ex.start << "~" << l
+                   << " " << ref << " -> " << ref-1
+                   << endl;
+        } else {
+          // last reference gone: this piece is free again
+          Extent freed(ex.start, l);
+          _release_into_limbo(freed);
+        }
+
+        ex.start += l;
+        ex.length -= l;
+        if (ex.length == 0) break;
+        fs->alloc_tab->find( ex.start, cursor );
+
+      } else {
+        // double split, finish.
+        // ooooooooooooo
+        //    ------
+        pair<block_t,int>& v = cursor.dirty_current_value();
+        v.first = ex.start - cursor.current().key;
+        int ref = v.second;
+        dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
+                 << " " << ref
+                 << " shorted left bit of double split" << endl;
+
+        if (ref > 1) {
+          fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, ref-1));
+          // label fixed: this message used to claim to come from alloc_inc
+          dout(10) << "alloc_dec s " << ex
+                   << " " << ref << " -> " << ref-1
+                   << " reinserted middle bit of double split"
+                   << endl;
+        } else {
+          _release_into_limbo(ex);
+        }
+
+        // block_t, not int: a block count must not be narrowed
+        block_t rl = end - ex.end();
+        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
+        dout(10) << "alloc_dec s " << ex.end() << "~" << rl
+                 << " " << ref
+                 << " reinserted right bit of double split" << endl;
+        break;
+      }
+    }
+    else {
+      assert(cursor.current().key == ex.start);
+
+      if (cursor.current().value.first <= ex.length) {
+        // dec.
+        // oooooo
+        // nnnnnnnn
+        if (cursor.current().value.second > 1) {
+          pair<block_t,int>& v = cursor.dirty_current_value();
+          v.second--;
+          dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first
+                   << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second
+                   << endl;
+          ex.start += v.first;
+          ex.length -= v.first;
+          if (ex.length == 0) break;
+          cursor.move_right();
+        } else {
+          // last reference gone: free the whole entry
+          Extent freed(cursor.current().key, cursor.current().value.first);
+          _release_into_limbo(freed);
+
+          ex.start += cursor.current().value.first;
+          ex.length -= cursor.current().value.first;
+          cursor.remove();
+
+          if (ex.length == 0) break;
+          fs->alloc_tab->find( ex.start, cursor );
+        }
+      } else {
+        // single split, finish.
+        // oooooo
+        // nnn
+        block_t l = cursor.current().value.first - ex.length;
+        int ref = cursor.current().value.second;
+
+        if (ref > 1) {
+          pair<block_t,int>& v = cursor.dirty_current_value();
+          v.first = ex.length;
+          v.second--;
+          // label fixed: this message used to claim to come from alloc_inc
+          dout(10) << "alloc_dec . " << ex
+                   << " " << ref << " -> " << ref-1
+                   << endl;
+        } else {
+          _release_into_limbo(ex);
+          cursor.remove();
+        }
+
+        dout(10) << "alloc_dec s " << ex.end() << "~" << l
+                 << " " << ref
+                 << " reinserted right bit of single split" << endl;
+        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
+        break;
+      }
+    }
+
+
+  }
+
+  return 0;
+}
+
+
+/*
+ * Put an extent straight onto the freelist, in the bucket that matches its
+ * length, and add it to fs->free_blocks.
+ * WARNING: no merging is attempted, so *ONLY* call this when you _know_
+ * there are no adjacent free extents.  Always returns 0.
+ */
+int Allocator::_release_loner(Extent& ex)
+{
+  assert(ex.length > 0);
+
+  fs->free_tab[ pick_bucket(ex.length) ]->insert(ex.start, ex.length);
+  fs->free_blocks += ex.length;
+
+  return 0;
+}
+
+/*
+ * Put an extent onto the freelist, first coalescing it with a free extent
+ * that starts right after it and/or one that ends right before it (at
+ * most one of each; every bucket is searched).  Absorbed neighbours are
+ * removed from their bucket and from fs->free_blocks; the merged extent is
+ * then inserted via _release_loner().  Always returns 0.
+ */
+int Allocator::_release_merge(Extent& orig)
+{
+  dout(15) << "_release_merge " << orig << endl;
+  assert(orig.length > 0);
+
+  Extent merged = orig;
+  int b;
+
+  // successor: a free extent keyed exactly at our end
+  b = 0;
+  while (b < EBOFS_NUM_FREE_BUCKETS) {
+    Table<block_t,block_t>::Cursor cursor(fs->free_tab[b]);
+
+    const int rc = fs->free_tab[b]->find( merged.start+merged.length, cursor );
+    if (rc == Table<block_t,block_t>::Cursor::MATCH) {
+      // swallow it
+      merged.length += cursor.current().value;
+
+      // and take it off the freelist
+      fs->free_blocks -= cursor.current().value;
+      fs->free_tab[b]->remove( cursor.current().key );
+      break;
+    }
+    b++;
+  }
+
+  // predecessor: the entry just left of our end, if it stops where we start
+  for (b = 0; b < EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t>::Cursor cursor(fs->free_tab[b]);
+    fs->free_tab[b]->find( merged.start+merged.length, cursor );
+
+    if (cursor.move_left() < 0)
+      continue;
+    if (cursor.current().key + cursor.current().value != merged.start)
+      continue;
+
+    // swallow it
+    merged.start = cursor.current().key;
+    merged.length += cursor.current().value;
+
+    // and take it off the freelist
+    fs->free_blocks -= cursor.current().value;
+    fs->free_tab[b]->remove( cursor.current().key );
+    break;
+  }
+
+  // whatever we ended up with has no free neighbours left
+  _release_loner(merged);
+  return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_ALLOCATOR_H
+#define __EBOFS_ALLOCATOR_H
+
+#include "types.h"
+
+#include "include/interval_set.h"
+
+class Ebofs;
+
+/*
+ * Block allocator for an Ebofs instance.
+ *
+ * Free extents live in fs->free_tab[], bucketed by size; released extents
+ * pass through a "limbo" stage (in-memory set, then fs->limbo_tab) before
+ * release_limbo() returns them to the free tables.  alloc_inc()/alloc_dec()
+ * maintain per-extent reference counts in fs->alloc_tab.
+ */
+class Allocator {
+public:
+  // sentinel values for the `near` argument of allocate(): both mean
+  // "use last_pos"; the _FWD flavour starts with a forward-only search.
+  static const block_t NEAR_LAST = 0;
+  static const block_t NEAR_LAST_FWD = 1;
+
+  // search directions understood by find()
+  static const int DIR_ANY = 0;
+  static const int DIR_FWD = 2;
+  static const int DIR_BACK = 1;
+
+protected:
+  Ebofs *fs;            // filesystem whose tables we manage
+  block_t last_pos;     // end of the most recently allocated extent
+
+
+  interval_set<block_t> limbo;   // released, not yet committed to fs->limbo_tab
+
+  // Map an extent length to its free-table bucket: one bucket per
+  // EBOFS_FREE_BUCKET_BITS-bit shift, clamped to the last bucket.
+  static int pick_bucket(block_t num) {
+    int b;
+    for (b = 0; num > 1; num = num >> EBOFS_FREE_BUCKET_BITS)
+      b++;
+    if (b < EBOFS_NUM_FREE_BUCKETS)
+      return b;
+    return EBOFS_NUM_FREE_BUCKETS-1;
+  }
+
+  // locate (without removing) an extent of >= num blocks in one bucket
+  int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY);
+
+  // debug: log and sanity-check all tables
+  void dump_freelist();
+
+public:
+  int _release_into_limbo(Extent& ex);   // park extent in the in-memory limbo set
+
+  int _release_loner(Extent& ex);  // insert into freelist; caller guarantees no free neighbours
+  int _release_merge(Extent& ex);  // insert into freelist, merging with adjacent free extents
+
+  //int _alloc_loner_inc(Extent& ex);
+  //int _alloc_loner_dec(Extent& ex);
+
+
+  Allocator(Ebofs *f) : fs(f), last_pos(0) {}
+
+  int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST);
+  int release(Extent& ex);  // alloc_dec() when cloning is enabled, else straight to limbo
+
+  int alloc_inc(Extent ex);  // add a reference to every block in ex
+  int alloc_dec(Extent ex);  // drop a reference from every block in ex
+
+
+  /*int unallocate(Extent& ex) {  // skip limbo
+    return _release_merge(ex);
+  }
+  */
+
+  int commit_limbo();   // limbo -> fs->limbo_tab
+  int release_limbo();  // fs->limbo_tab -> free_tabs
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "BlockDevice.h"
+
+#include "config.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+
+#include <sys/uio.h>
+
+#include <sys/ioctl.h>
+
+#ifndef __CYGWIN__
+#ifndef DARWIN
+#include <linux/fs.h>
+#endif
+#endif
+
+
+/*******************************************
+ * biovec
+ */
+
+// Pretty-print a biovec as "bio(<rd |wr >start~length[ note] address)".
+inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio)
+{
+  out << "bio(";
+
+  // direction tag
+  if (bio.type == BlockDevice::biovec::IO_READ) {
+    out << "rd ";
+  }
+  if (bio.type == BlockDevice::biovec::IO_WRITE) {
+    out << "wr ";
+  }
+
+  out << bio.start << "~" << bio.length;
+
+  // optional free-form note
+  if (bio.note) {
+    out << " " << bio.note;
+  }
+
+  // the object's address tells otherwise identical bios apart
+  out << " " << &bio << ")";
+  return out;
+}
+
+
+
+/*******************************************
+ * ElevatorQueue
+ */
+
+// Debug output helpers for the ElevatorQueue code below.
+// dout(x)/derr(x) guard the stream expression that follows them: it is only
+// evaluated when x <= g_conf.debug_bdev, and is prefixed with the device name
+// (`dev` is resolved at the point of use).
+// Both expand to a bare `if`, so take care when using them inside an
+// unbraced if/else.
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq."
+#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq."
+
+
+/** dequeue_io
+ * Remove the next run of contiguous, same-type ios from the elevator queue,
+ * continuing the sweep in the current direction (and turning around or
+ * restarting the sweep when nothing is left in that direction).
+ *
+ * biols      - receives the dequeued biovecs; pushed at the back on a forward
+ *              sweep and at the front on a backward sweep.
+ * start      - set to the first candidate's block, then lowered with MIN()
+ *              as bios are added.
+ * length     - set to the sum of the lengths of the dequeued bios.
+ * block_lock - block ranges that may not be touched; the run stops at the
+ *              first bio that intersects it.
+ *
+ * Returns the number of bios dequeued.  This can be 0 if the very first
+ * candidate intersects block_lock or exceeds the iov limit.
+ * The queue must not be empty (asserted).
+ */
+int BlockDevice::ElevatorQueue::dequeue_io(list<biovec*>& biols,
+ block_t& start, block_t& length,
+ interval_set<block_t>& block_lock)
+{
+ // queue empty?
+ assert(!io_map.empty());
+
+ dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl;
+
+ // find our position: i >= pos
+ map<block_t,biovec*>::iterator i;
+
+ // one attempt in the current direction, plus one more after a turn-around
+ // when bidirectional sweeping is enabled.
+ int tries = g_conf.bdev_el_bidir + 1;
+ while (tries > 0) {
+ if (el_dir_forward) {
+ i = io_map.lower_bound(el_pos);
+ if (i != io_map.end()) {
+ break; // not at end. good.
+ }
+ } else {
+ i = io_map.upper_bound(el_pos);
+ if (i != io_map.begin()) {
+ i--; // and back down one (to get i <= pos). good.
+ break;
+ }
+ }
+
+ // nothing left in this direction.
+ // reverse (or initial startup)?
+ if (g_conf.bdev_el_bidir || !el_dir_forward) {
+ // dout(20) << "restart reversing" << endl;
+ el_dir_forward = !el_dir_forward;
+ }
+
+ if (el_dir_forward) {
+ // forward
+ el_pos = 0;
+
+ // el_stop is only written here; nothing in this function reads it back.
+ if (g_conf.bdev_el_fw_max_ms) {
+ el_stop = g_clock.now();
+ utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us!
+ el_stop += max;
+ // dout(20) << "restart forward sweep for " << max << endl;
+ } else {
+ // dout(20) << "restart fowrard sweep" << endl;
+ }
+ } else {
+ // reverse
+ el_pos = bdev->get_num_blocks();
+
+ if (g_conf.bdev_el_bw_max_ms) {
+ el_stop = g_clock.now();
+ utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us!
+ el_stop += max;
+ // dout(20) << "restart reverse sweep for " << max << endl;
+ } else {
+ // dout(20) << "restart reverse sweep" << endl;
+ }
+ }
+
+ tries--;
+ }
+
+ assert(tries > 0); // this shouldn't happen if the queue is non-empty.
+
+ // get some biovecs
+ int num_bio = 0;
+
+ dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << endl;
+
+ // merge contiguous ops
+ char type = i->second->type; // read or write
+ int num_iovs = 0; // count eventual iov's for readv/writev
+
+ start = i->first;
+ length = 0;
+
+ // el_pos marks the edge the next bio must touch to count as contiguous:
+ // its start when going forward, its end when going backward.
+ if (el_dir_forward)
+ el_pos = start;
+ else
+ el_pos = i->first + i->second->length;
+
+ // while (contiguous)
+ while ((( el_dir_forward && el_pos == i->first) ||
+ (!el_dir_forward && el_pos == i->first + i->second->length)) &&
+ type == i->second->type) {
+ biovec *bio = i->second;
+
+ // allowed? (not already submitted to kernel?)
+ if (block_lock.intersects(bio->start, bio->length)) {
+ // dout(20) << "dequeue_io " << bio->start << "~" << bio->length
+ // << " intersects block_lock " << block_lock << endl;
+ break; // stop, or go with what we've got so far
+ }
+
+ // add to biols
+ int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist?
+ if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many!
+ num_iovs += nv;
+
+ start = MIN(start, bio->start);
+ length += bio->length;
+
+ if (el_dir_forward) {
+ //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl;
+ biols.push_back(bio); // add at back
+ } else {
+ // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl;
+ biols.push_front(bio); // add at front
+ }
+ num_bio++;
+
+ // move elevator pointer
+ // (step the iterator to the neighbour *before* erasing the current
+ // entry, which is remembered in prev.)
+ bool at_end = false;
+ map<block_t,biovec*>::iterator prev = i;
+ if (el_dir_forward) {
+ el_pos += bio->length; // cont. next would start right after us
+ i++;
+ if (i == io_map.end()) {
+ at_end = true;
+ }
+ } else {
+ el_pos -= bio->length;
+ if (i == io_map.begin()) {
+ at_end = true;
+ } else {
+ i--;
+ }
+ }
+
+ // dequeue
+ io_map.erase(prev);
+ bio->in_queue = 0;
+
+ // ran off the edge of the map: i must not be dereferenced again.
+ if (at_end) break;
+ }
+
+ return num_bio;
+}
+
+
+
+/*******************************************
+ * BarrierQueue
+ */
+// From here on dout() tags its output as coming from the barrier queue
+// (same debug_bdev level check as before).
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq."
+
+// Insert a barrier: ios submitted from now on go into a fresh elevator queue
+// appended behind the existing ones.  If the front queue is already empty a
+// new queue would be pointless, so nothing is added in that case.
+void BlockDevice::BarrierQueue::barrier()
+{
+  const bool front_is_empty = !qls.empty() && qls.front()->empty();
+
+  if (front_is_empty) {
+    // an empty front queue is expected to be the only queue
+    assert(qls.size() == 1);
+    dout(10) << "barrier not adding new queue, front is empty" << endl;
+    return;
+  }
+
+  Queue *fresh = new ElevatorQueue(bdev, dev);
+  qls.push_back(fresh);
+  dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has "
+           << qls.front()->size() << " ios left" << endl;
+}
+
+// Retire the front sub-queue once it has drained, provided another queue is
+// waiting behind it.  Returns true if the front queue was replaced, false if
+// nothing changed.
+bool BlockDevice::BarrierQueue::bump()
+{
+  assert(!qls.empty());
+
+  Queue *head = qls.front();
+
+  // keep the front queue while it still has ios, or while it is the last one
+  if (!head->empty() || head == qls.back())
+    return false;
+
+  delete head;
+  qls.pop_front();
+  dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl;
+  return true;
+}
+
+// Dequeue from the front sub-queue only (ios behind a barrier must wait),
+// then retire that sub-queue if this emptied it.
+// Returns whatever the front queue's dequeue_io returned.
+int BlockDevice::BarrierQueue::dequeue_io(list<biovec*>& biols,
+                                          block_t& start, block_t& length,
+                                          interval_set<block_t>& locked)
+{
+  assert(!qls.empty());
+
+  Queue *head = qls.front();
+  const int dequeued = head->dequeue_io(biols, start, length, locked);
+
+  bump();  // in case we emptied the front queue
+
+  return dequeued;
+}
+
+
+
+
+/*******************************************
+ * BlockDevice
+ */
+
+// From here on dout() tags its output with just the device name
+// (same debug_bdev level check as before).
+#undef dout
+#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")."
+
+
+
+/** get_num_blocks
+ * Return the device size in EBOFS blocks.
+ *
+ * The size is worked out on the first call (the device must be open) and
+ * cached in num_blocks.  The BLKGETSIZE64 ioctl is tried first where it is
+ * available; if that yields nothing, fstat()'s st_size is used.  The
+ * bdev_fake_mb / bdev_fake_max_mb options then override or cap the result.
+ *
+ * If the size cannot be determined, 0 is returned and the next call tries
+ * again.
+ */
+block_t BlockDevice::get_num_blocks()
+{
+  if (!num_blocks) {
+    assert(fd > 0);
+
+    // size in bytes.  a dedicated 64-bit local is used so the ioctl always
+    // writes into storage of the width it expects.
+    __uint64_t bytes = 0;
+
+#ifdef BLKGETSIZE64
+    // ioctl block device?
+    if (ioctl(fd, BLKGETSIZE64, &bytes) < 0)
+      bytes = 0;  // not a block device (or ioctl unsupported)
+#endif
+
+    if (!bytes) {
+      // hmm, try stat!
+      struct stat st;
+      if (fstat(fd, &st) == 0) {
+        bytes = st.st_size;
+      } else {
+        // don't derive a size from an uninitialised stat buffer
+        dout(0) << "get_num_blocks fstat failed: " << strerror(errno) << endl;
+      }
+    }
+
+    num_blocks = bytes / (__uint64_t)EBOFS_BLOCK_SIZE;
+
+    // NOTE(review): 256 is blocks-per-MB, which presumes 4096-byte blocks -- confirm
+    // against EBOFS_BLOCK_SIZE.  the multiplications are widened first so a
+    // large mb value cannot overflow int.
+    if (g_conf.bdev_fake_mb) {
+      num_blocks = (block_t)g_conf.bdev_fake_mb * 256ULL;
+      dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl;
+    }
+    if (g_conf.bdev_fake_max_mb &&
+        num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) {
+      dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl;
+      num_blocks = (block_t)g_conf.bdev_fake_max_mb * 256ULL;
+    }
+
+  }
+  return num_blocks;
+}
+
+
+
+/** io thread
+ * each worker thread dequeues ios from the root_queue and submits them to the kernel.
+ *
+ * `lock` is held for the whole loop except while do_io() runs.  A thread
+ * goes to sleep on io_wakeup when the queue is empty or when nothing
+ * dequeueable is available (everything overlaps a range another thread has
+ * in flight).  io_threads_running counts the threads that are not asleep.
+ *
+ * When this is the last thread to fall asleep and an idle kicker is
+ * configured, it first waits bdev_idle_kick_after_ms; if nothing happened in
+ * the meantime it kicks the idle_kicker and then sleeps for real.
+ *
+ * Returns 0 once io_stop has been set.
+ */
+void* BlockDevice::io_thread_entry()
+{
+  lock.Lock();
+
+  int whoami = io_threads_started++;
+  io_threads_running++;
+  assert(io_threads_running <= g_conf.bdev_iothreads);
+  dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl;
+
+  // get my own fd (and file position pointer)
+  int fd = open_fd();
+  assert(fd > 0);
+
+  while (!io_stop) {
+    bool do_sleep = false;
+
+    // queue empty?
+    if (root_queue.empty()) {
+      // sleep
+      do_sleep = true;
+    } else {
+      dout(20) << "io_thread" << whoami << " going" << endl;
+
+      block_t start, length;
+      list<biovec*> biols;
+      int n = root_queue.dequeue_io(biols, start, length, io_block_lock);
+
+      if (n == 0) {
+        // failed to dequeue a do-able op, sleep for now
+        dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl;
+        assert(io_threads_running > 1);   // there must be someone else, if we couldn't dequeue something doable.
+        do_sleep = true;
+      }
+      else {
+        // lock blocks
+        assert(start == biols.front()->start);
+        io_block_lock.insert(start, length);
+
+        // drop lock to do the io
+        lock.Unlock();
+        do_io(fd, biols);
+        lock.Lock();
+
+        // unlock blocks
+        io_block_lock.erase(start, length);
+
+        // someone might have blocked on our block_lock?
+        if (io_threads_running < g_conf.bdev_iothreads &&
+            (int)root_queue.size() > io_threads_running)
+          io_wakeup.SignalAll();
+      }
+    }
+
+    if (do_sleep) {
+      do_sleep = false;
+
+      // sleep
+      io_threads_running--;
+      dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running,"
+               << " queue has " << root_queue.size() << endl;
+
+      if (g_conf.bdev_idle_kick_after_ms > 0 &&
+          io_threads_running == 0 &&
+          idle_kicker) {
+        // first wait for signal | timeout
+        io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000));
+
+        // should we still be sleeping?  (did we get woken up, or did timer expire?)
+        // io_stop and idle_kicker are re-checked because close() may have
+        // run while we waited: it clears idle_kicker and its SignalAll has
+        // already been consumed, so kicking would dereference a null pointer
+        // and the Wait() below would never be woken (close() would then hang
+        // in join()).
+        if (!io_stop && idle_kicker &&
+            root_queue.empty() && io_threads_running == 0) {
+          idle_kicker->kick();    // kick
+          io_wakeup.Wait(lock);   // and wait
+        }
+      } else {
+        // normal, just wait.
+        io_wakeup.Wait(lock);
+      }
+
+      io_threads_running++;
+      assert(io_threads_running <= g_conf.bdev_iothreads);
+      dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl;
+    }
+  }
+
+  // clean up
+  ::close(fd);
+  io_threads_running--;
+
+  lock.Unlock();
+
+  dout(10) << "io_thread" << whoami << " finish" << endl;
+  return 0;
+}
+
+
+
+/** do_io
+ * perform one merged io for a run of biovecs handed over by an io thread.
+ * (lock is NOT held, but we own the biovecs in biols)
+ *
+ * The buffers of all bios are claimed into one bufferlist, a single
+ * _read/_write covering the whole range is issued starting at the first
+ * bio's block, the result is stored in every bio's rval, and the bios are
+ * moved onto the completion queue (which leaves biols empty).
+ */
+void BlockDevice::do_io(int fd, list<biovec*>& biols)
+{
+  assert(!biols.empty());
+
+  // the front bio supplies the starting block and the io type
+  biovec *head = biols.front();
+  const block_t start = head->start;
+  const char type = head->type;
+  const char *opname = (type == biovec::IO_WRITE) ? "write" : "read";
+
+  // gather all buffers and total the length
+  bufferlist bl;
+  block_t length = 0;
+  int numbio = 0;
+  list<biovec*>::iterator p;
+  for (p = biols.begin(); p != biols.end(); ++p) {
+    biovec *bio = *p;
+    if (numbio == 0)
+      bl.claim(bio->bl);
+    else
+      bl.claim_append(bio->bl);
+    length += bio->length;
+    ++numbio;
+  }
+
+  // do it
+  int r;
+  dout(20) << "do_io start " << opname
+           << " " << start << "~" << length
+           << " " << numbio << " bits" << endl;
+  if (type == biovec::IO_READ) {
+    r = _read(fd, start, length, bl);
+  } else if (type == biovec::IO_WRITE) {
+    r = _write(fd, start, length, bl);
+  } else assert(0);
+  dout(20) << "do_io finish " << opname
+           << " " << start << "~" << length << endl;
+
+  // every bio in the run gets the same result
+  for (p = biols.begin(); p != biols.end(); ++p)
+    (*p)->rval = r;
+
+  // set to false to be slow and finish synchronously in this thread
+  const bool use_complete_thread = true;
+  if (!use_complete_thread) {
+    for (p = biols.begin(); p != biols.end(); ++p)
+      finish_io(*p);
+    return;
+  }
+
+  // put in completion queue
+  complete_lock.Lock();
+  complete_queue.splice( complete_queue.end(), biols );
+  complete_queue_len += numbio;
+  complete_wakeup.Signal();
+  complete_lock.Unlock();
+}
+
+
+/** finish_io
+ *
+ * finish an io by signaling the cond or performing a callback.
+ * called by completion thread, unless that's disabled in do_io.
+ * must be called WITHOUT `lock` held.
+ *
+ * cond variant (blocking read/write): the bio and its Cond live on the
+ *   waiter's stack, and the waiter tests bio->done while holding `lock`.
+ *   done is therefore set and the Cond signalled under `lock`, so the waiter
+ *   can neither miss the wakeup nor return (destroying bio and Cond) while
+ *   we are still touching them.  the bio is not touched after Unlock().
+ * callback variant: the callback is invoked with the io handle and rval,
+ *   then both the callback and the bio are deleted.
+ */
+void BlockDevice::finish_io(biovec *bio)
+{
+  if (bio->cond) {
+    lock.Lock();
+    bio->done = true;
+    bio->cond->Signal();
+    lock.Unlock();
+  }
+  else {
+    bio->done = true;
+    if (bio->cb) {
+      bio->cb->finish((ioh_t)bio, bio->rval);
+      delete bio->cb;
+      delete bio;
+    }
+  }
+}
+
+/*** completion_thread
+ * handle Cond signals or callbacks for completed ios
+ *
+ * Loops until io_stop is set.  complete_lock protects complete_queue and
+ * complete_queue_len; it is dropped while finish_io() runs so that io
+ * threads can keep appending completed bios in the meantime.
+ * Returns 0.
+ */
+void* BlockDevice::complete_thread_entry()
+{
+ complete_lock.Lock();
+ dout(10) << "complete_thread start" << endl;
+
+ while (!io_stop) {
+
+ // drain: new completions may arrive while we are finishing a batch,
+ // hence the loop rather than a single pass.
+ while (!complete_queue.empty()) {
+ // take the whole batch in one go so the lock is held only briefly
+ list<biovec*> ls;
+ ls.swap(complete_queue);
+ dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl;
+ complete_queue_len = 0;
+
+ complete_lock.Unlock();
+
+ // finish
+ for (list<biovec*>::iterator p = ls.begin();
+ p != ls.end();
+ p++) {
+ biovec *bio = *p;
+ dout(20) << "complete_thread finishing " << *bio << endl;
+ finish_io(bio);
+ }
+
+ complete_lock.Lock();
+ }
+ // re-check before sleeping: io_stop may have been set while the lock was dropped
+ if (io_stop) break;
+
+ /*
+ if (io_threads_running == 0 && idle_kicker) {
+ complete_lock.Unlock();
+ idle_kicker->kick();
+ complete_lock.Lock();
+ if (!complete_queue.empty() || io_stop)
+ continue;
+ }
+ */
+
+ dout(25) << "complete_thread sleeping" << endl;
+ complete_wakeup.Wait(complete_lock);
+ }
+
+ dout(10) << "complete_thread finish" << endl;
+ complete_lock.Unlock();
+ return 0;
+}
+
+
+
+
+// io queue
+
+// Queue an io on the root queue and wake io thread(s) if the backlog calls
+// for it.  The wake-up decision is based on the queue size *before* the new
+// io is added.
+void BlockDevice::_submit_io(biovec *b)
+{
+  // NOTE: lock must be held
+  dout(15) << "_submit_io " << *b << endl;
+
+  // wake up io_thread(s)?
+  const int backlog = (int)root_queue.size();
+  if (backlog >= io_threads_running) {
+    if (backlog == io_threads_running)
+      io_wakeup.SignalOne();   // exactly one thread short
+    else
+      io_wakeup.SignalAll();   // more work than running threads
+  }
+
+  // queue
+  root_queue.submit_io(b);
+
+  /*
+  // [DEBUG] check for overlapping ios
+  // BUG: this doesn't detect all overlaps w/ the next queue thing.
+  if (g_conf.bdev_debug_check_io_overlap) {
+    // BUG: this doesn't catch everything!  eg 1~10000000 will be missed....
+    multimap<block_t, biovec*>::iterator p = io_queue.lower_bound(b->start);
+    if ((p != io_queue.end() &&
+         p->first < b->start+b->length) ||
+        (p != io_queue.begin() &&
+         (p--, p->second->start + p->second->length > b->start))) {
+      dout(1) << "_submit_io new io " << *b
+              << " overlaps with existing " << *p->second << endl;
+      cerr << "_submit_io new io " << *b
+           << " overlaps with existing " << *p->second << endl;
+    }
+  }
+  */
+
+}
+
+// Try to pull a not-yet-dispatched io back out of its queue.
+// Returns 0 on success, -1 if the io is no longer queued (already dequeued
+// by an io thread, or never queued).
+int BlockDevice::_cancel_io(biovec *bio)
+{
+  // NOTE: lock must be held
+
+  // too late?
+  if (bio->in_queue == 0) {
+    dout(15) << "_cancel_io " << *bio << " FAILED" << endl;
+    return -1;
+  }
+
+  dout(15) << "_cancel_io " << *bio << endl;
+  bio->in_queue->cancel_io(bio);
+
+  // removing the io may have emptied the front sub-queue
+  const bool advanced = root_queue.bump();
+  if (advanced)
+    io_wakeup.SignalAll();   // something happened!
+
+  return 0;
+}
+
+
+
+// low level io
+
+/** _read
+ * Read num blocks starting at block bno from fd into the buffers of bl.
+ *
+ * bl must hold at least num*EBOFS_BLOCK_SIZE bytes, and every buffer in it
+ * must be a multiple of the block size (both asserted).  The buffers are
+ * filled in order with a single readv().
+ *
+ * Returns 0 on success, or -errno if readv() fails.  A short read is logged
+ * but still reported as success.
+ */
+int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl)
+{
+  dout(10) << "_read " << bno << "~" << num << endl;
+
+  assert(fd > 0);
+
+  off_t offset = (off_t)bno * (off_t)EBOFS_BLOCK_SIZE;
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  assert(actual == offset);
+
+  size_t len = (size_t)num * EBOFS_BLOCK_SIZE;
+  assert(bl.length() >= len);
+
+  // one iovec per buffer, until len bytes are covered
+  struct iovec iov[ bl.buffers().size() ];
+  int n = 0;
+  size_t left = len;
+  for (list<bufferptr>::const_iterator i = bl.buffers().begin();
+       i != bl.buffers().end();
+       i++) {
+    assert(i->length() % EBOFS_BLOCK_SIZE == 0);
+
+    iov[n].iov_base = (void*)i->c_str();
+    iov[n].iov_len = MIN(left, i->length());
+
+    left -= iov[n].iov_len;
+    n++;
+    if (left == 0) break;
+  }
+
+  ssize_t got = ::readv(fd, iov, n);
+  if (got < 0) {
+    // report the failure instead of pretending the buffers were filled
+    int err = errno;   // save before the stream output can disturb it
+    dout(1) << "couldn't read bno " << bno << " num " << num
+            << " (" << len << " bytes) in " << n << " iovs, r=" << got
+            << " errno " << err << " " << strerror(err) << endl;
+    return -err;
+  }
+  assert(got <= (ssize_t)len);
+  if ((size_t)got < len) {
+    dout(1) << "short read at bno " << bno << ": got " << got
+            << " of " << len << " bytes" << endl;
+  }
+
+  return 0;
+}
+
+// Write num blocks starting at block bno to fd, taking the data from the
+// buffers of bl with a single writev().  Every buffer must be a multiple of
+// the block size and 4096-byte aligned (asserted).
+// Returns 0; a failed or short write trips an assert.
+int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl)
+{
+  dout(10) << "_write " << bno << "~" << num << endl;
+
+  assert(fd > 0);
+
+  // position the fd at the first byte of block bno
+  const off_t want = (off_t)bno << EBOFS_BLOCK_BITS;
+  assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == want);
+  const off_t landed = lseek(fd, want, SEEK_SET);
+  assert(landed == want);
+
+  // write buffers
+  const size_t len = num*EBOFS_BLOCK_SIZE;
+
+  struct iovec iov[ bl.buffers().size() ];
+
+  // one iovec per buffer, until len bytes are covered
+  int n = 0;
+  size_t left = len;
+  list<bufferptr>::const_iterator bp = bl.buffers().begin();
+  while (bp != bl.buffers().end()) {
+    assert(bp->length() % EBOFS_BLOCK_SIZE == 0);
+
+    struct iovec &v = iov[n];
+    v.iov_base = (void*)bp->c_str();
+    v.iov_len = MIN(left, bp->length());
+
+    assert((((unsigned long long)v.iov_base) & 4095ULL) == 0);
+    assert((v.iov_len & 4095) == 0);
+
+    left -= v.iov_len;
+    n++;
+    if (left == 0) break;
+    ++bp;
+  }
+
+  const int r = ::writev(fd, iov, n);
+
+  if (r >= 0) {
+    assert(r == (int)len);
+    return 0;
+  }
+
+  dout(1) << "couldn't write bno " << bno << " num " << num
+          << " (" << len << " bytes) in " << n << " iovs, r=" << r
+          << " errno " << errno << " " << strerror(errno) << endl;
+  dout(1) << "bl is " << bl << endl;
+  assert(0);
+
+  return 0;
+}
+
+
+
+// open/close
+
+// Open a new file descriptor on the device, read/write and synchronous,
+// bypassing the page cache (O_DIRECT, or F_NOCACHE on Darwin).
+// Returns the descriptor, or the negative result of ::open() on failure.
+int BlockDevice::open_fd()
+{
+#ifdef DARWIN
+  int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0);
+  // F_NOCACHE takes an int argument (non-zero turns data caching off);
+  // there is also no point calling it if the open failed.
+  if (fd >= 0)
+    ::fcntl(fd, F_NOCACHE, 1);
+  return fd;
+#else
+  return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0);
+#endif
+}
+
+/** open
+ * Open the device, optionally take an exclusive flock on it
+ * (g_conf.bdev_lock), work out its size, and start the io threads and the
+ * completion thread.
+ *
+ * idle - optional kicker that an io thread notifies when the device goes idle.
+ *
+ * Returns the device fd on success, -1 on failure (fd is left at 0 so that
+ * open() can be called again).
+ */
+int BlockDevice::open(kicker *idle)
+{
+  assert(fd == 0);
+
+  // open?
+  fd = open_fd();
+  if (fd < 0) {
+    dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl;
+    fd = 0;
+    return -1;
+  }
+
+  // lock
+  if (g_conf.bdev_lock) {
+    int r = ::flock(fd, LOCK_EX|LOCK_NB);
+    if (r < 0) {
+      derr(1) << "open " << dev << " failed to get LOCK_EX" << endl;
+      assert(0);
+      // when asserts are compiled out: don't leak the fd or leave the
+      // object looking half-open
+      ::close(fd);
+      fd = 0;
+      return -1;
+    }
+  }
+
+  // figure size
+  __uint64_t bsize = (__uint64_t)get_num_blocks() * (__uint64_t)EBOFS_BLOCK_SIZE;
+
+  dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl;
+
+  // idle kicker?
+  // set this before any io thread exists; they read it once they are running.
+  idle_kicker = idle;
+
+  // start thread
+  io_threads_started = 0;
+  io_threads.clear();
+  for (int i=0; i<g_conf.bdev_iothreads; i++) {
+    io_threads.push_back(new IOThread(this));
+    io_threads.back()->create();
+  }
+  complete_thread.create();
+
+  return fd;
+}
+
+
+/** close
+ * Stop and join the io threads and the completion thread, release the flock
+ * (if one was taken) and close the device fd.  The device must be open.
+ * Afterwards the object can be open()ed again.  Returns 0.
+ */
+int BlockDevice::close()
+{
+  assert(fd>0);
+
+  // shut down io thread
+  dout(10) << "close stopping io+complete threads" << endl;
+  lock.Lock();
+  idle_kicker = 0;   // cleared under lock: the io threads read it under lock
+  complete_lock.Lock();
+  io_stop = true;
+  io_wakeup.SignalAll();
+  complete_wakeup.SignalAll();
+  complete_lock.Unlock();
+  lock.Unlock();
+
+
+  // join exactly the threads that were started, whatever the config says now
+  for (unsigned i=0; i<io_threads.size(); i++) {
+    io_threads[i]->join();
+    delete io_threads[i];
+  }
+  io_threads.clear();
+
+  complete_thread.join();
+
+  io_stop = false;   // in case we start again
+
+  dout(2) << "close " << endl;
+
+  if (g_conf.bdev_lock)
+    ::flock(fd, LOCK_UN);
+
+  ::close(fd);
+  fd = 0;
+
+  return 0;
+}
+
+// Public cancel: try to withdraw a queued io identified by its handle.
+// Returns 0 if the io was removed from the queue, -1 if it was too late.
+// On success a callback-style bio is freed together with its callback
+// (the callback itself is NOT invoked).
+int BlockDevice::cancel_io(ioh_t ioh)
+{
+  biovec *bio = static_cast<biovec*>(ioh);
+
+  lock.Lock();
+  const int result = _cancel_io(bio);
+  lock.Unlock();
+
+  if (result != 0)
+    return result;
+
+  // FIXME?
+  if (bio->cb) {
+    //bio->cb->finish(ioh, 0);
+    delete bio->cb;
+    delete bio;
+  }
+
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_BLOCKDEVICE_H
+#define __EBOFS_BLOCKDEVICE_H
+
+#include "include/buffer.h"
+#include "include/interval_set.h"
+#include "include/Context.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "types.h"
+
+
+typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*)
+
+
+/*
+ * BlockDevice -- queued, threaded access to a device file.
+ *
+ * ios are described by biovecs and placed on root_queue.  a set of io
+ * threads dequeues runs of them and issues the actual reads/writes; a
+ * separate completion thread then signals the waiter's Cond or runs its
+ * callback.  both a blocking and a callback-based interface are offered.
+ */
+class BlockDevice {
+ public:
+  // callback type for io completion notification
+  class callback {
+  public:
+    virtual ~callback() {}
+    virtual void finish(ioh_t ioh, int rval) = 0;
+  };
+
+  // kicker for idle notification
+  class kicker {
+  public:
+    virtual ~kicker() {}
+    virtual void kick() = 0;
+  };
+
+
+  /********************************************************/
+
+  class Queue;
+
+  // io item
+  // two variants: one with Cond*, one with callback*.
+  class biovec {
+  public:
+    static const char IO_WRITE = 1;
+    static const char IO_READ = 2;
+
+    char type;               // IO_WRITE or IO_READ
+    block_t start, length;   // block range
+    bufferlist bl;           // data to write / buffers to read into
+    callback *cb;            // completion callback (callback variant), else 0
+    Cond *cond;              // cond to signal on completion (blocking variant), else 0
+    int rval;                // result of the io
+    char *note;              // optional label, printed in debug output
+    bool done;               // set once the io has been finished
+
+    Queue *in_queue;         // queue currently holding this bio; 0 once dequeued or canceled
+
+    biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) :
+      type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {}
+    biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) :
+      type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {}
+  };
+  friend ostream& operator<<(ostream& out, biovec &bio);
+
+
+  /********************************************************/
+
+  /*
+   * Queue -- abstract IO queue interface
+   */
+  class Queue {
+  public:
+    virtual ~Queue() {}
+    virtual void submit_io(biovec *b) = 0;
+    virtual void cancel_io(biovec *b) = 0;
+    virtual int dequeue_io(list<biovec*>& biols,
+                           block_t& start, block_t& length,
+                           interval_set<block_t>& locked) = 0;
+    virtual int size() = 0;
+    virtual bool empty() { return size() == 0; }
+  };
+
+  /*
+   * ElevatorQueue - simple elevator scheduler queue
+   *   ios are kept in a map keyed by start block; at most one io per
+   *   start block may be queued at a time (asserted in submit_io).
+   */
+  class ElevatorQueue : public Queue {
+    BlockDevice *bdev;
+    const char *dev;               // device name, for debug output
+    map<block_t, biovec*> io_map;  // queued ios, by start block
+    bool el_dir_forward;           // current sweep direction
+    block_t el_pos;                // current elevator position
+    utime_t el_stop;               // time limit computed when a sweep is (re)started
+
+  public:
+    ElevatorQueue(BlockDevice *bd, const char *d) :
+      bdev(bd), dev(d),
+      el_dir_forward(false),
+      el_pos(0) {}
+    void submit_io(biovec *b) {
+      b->in_queue = this;
+      assert(io_map.count(b->start) == 0);
+      io_map[b->start] = b;
+    }
+    void cancel_io(biovec *b) {
+      assert(b->in_queue == this);
+      assert(io_map.count(b->start) &&
+             io_map[b->start] == b);
+      io_map.erase(b->start);
+      b->in_queue = 0;
+    }
+    int dequeue_io(list<biovec*>& biols,
+                   block_t& start, block_t& length,
+                   interval_set<block_t>& locked);
+    int size() {
+      return io_map.size();
+    }
+  };
+
+  /*
+   * BarrierQueue - lets you specify io "barriers"
+   *   barrier() - force completion of all prior IOs before
+   *               future ios are started.
+   *   bump()    - must be called after cancel_io to properly
+   *               detect empty subqueue.
+   *
+   *   internally a list of sub-queues: new ios go to the back queue,
+   *   ios are dequeued from the front queue only.
+   */
+  class BarrierQueue : public Queue {
+    BlockDevice *bdev;
+    const char *dev;
+    list<Queue*> qls;   // sub-queues, owned by us (allocated in barrier())
+  public:
+    BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) {
+      barrier();
+    }
+    ~BarrierQueue() {
+      // bump() only frees sub-queues that have drained; release the rest
+      // here.  queued biovecs are not owned by the queues and are left alone.
+      for (list<Queue*>::iterator p = qls.begin(); p != qls.end(); ++p)
+        delete *p;
+      qls.clear();
+    }
+    int size() {
+      // this isn't perfectly accurate: only the front queue is counted.
+      if (!qls.empty())
+        return qls.front()->size();
+      return 0;
+    }
+    void submit_io(biovec *b) {
+      assert(!qls.empty());
+      qls.back()->submit_io(b);
+    }
+    void cancel_io(biovec *b) {
+      // callers cancel via the bio's own in_queue (see _cancel_io), never here.
+      assert(0); // shouldn't happen.
+    }
+    int dequeue_io(list<biovec*>& biols,
+                   block_t& start, block_t& length,
+                   interval_set<block_t>& locked);
+    void barrier();
+    bool bump();
+  };
+
+
+ private:
+  string dev;          // my device file
+  int fd;              // 0 when closed
+  block_t num_blocks;  // cached device size in blocks; 0 until computed
+
+  Mutex lock;          // protects the io queue and io thread state
+
+  /** the root io queue.
+   * currently assumed to be a barrier queue, but this can be changed
+   * with some minor rearchitecting.
+   */
+  BarrierQueue root_queue;
+
+  kicker *idle_kicker; // kicked by an io thread when the device goes idle (may be 0)
+
+  /* io_block_lock - block ranges currently dispatched to the kernel.
+   * once a bio is dispatched, it cannot be canceled, so an overlapping
+   * io can be submitted.  the overlapping io cannot be dispatched
+   * to the kernel, however, until the original io finishes, or else
+   * there will be a race condition.
+   */
+  interval_set<block_t> io_block_lock; // blocks currently dispatched to kernel
+
+  // io threads
+  Cond io_wakeup;
+  bool io_stop;
+  int io_threads_started, io_threads_running;
+
+  void *io_thread_entry();
+
+  class IOThread : public Thread {
+    BlockDevice *dev;
+  public:
+    IOThread(BlockDevice *d) : dev(d) {}
+    void *entry() { return (void*)dev->io_thread_entry(); }
+  } ;
+
+  vector<IOThread*> io_threads;
+
+  // private io interface
+  int open_fd(); // get an fd (for a thread)
+
+  void _submit_io(biovec *b);       // lock must be held
+  int _cancel_io(biovec *bio);      // lock must be held
+  void do_io(int fd, list<biovec*>& biols); // called by an io thread
+
+  // low level io
+  // NOTE(review): _write takes an `unsigned` block number while _read takes
+  // block_t -- confirm whether the narrowing is intended.
+  int _read(int fd, block_t bno, unsigned num, bufferlist& bl);
+  int _write(int fd, unsigned bno, unsigned num, bufferlist& bl);
+
+
+  // completion callback queue
+  Mutex complete_lock;              // protects complete_queue and complete_queue_len
+  Cond complete_wakeup;
+  list<biovec*> complete_queue;
+  int complete_queue_len;
+
+  void finish_io(biovec *bio);
+
+  // complete thread
+  void *complete_thread_entry();
+  class CompleteThread : public Thread {
+    BlockDevice *dev;
+  public:
+    CompleteThread(BlockDevice *d) : dev(d) {}
+    void *entry() { return (void*)dev->complete_thread_entry(); }
+  } complete_thread;
+
+
+ public:
+  // note: root_queue is handed dev.c_str(); dev is declared (and therefore
+  // initialized) before root_queue, so the pointer is valid.
+  BlockDevice(const char *d) :
+    dev(d), fd(0), num_blocks(0),
+    root_queue(this, dev.c_str()),
+    idle_kicker(0),
+    io_stop(false), io_threads_started(0), io_threads_running(0),
+    complete_queue_len(0),
+    complete_thread(this) { }
+  ~BlockDevice() {
+    if (fd > 0) close();
+  }
+
+  // get size in blocks
+  block_t get_num_blocks();
+  const char *get_device_name() const { return dev.c_str(); }
+
+  // open/close
+  int open(kicker *idle = 0);
+  int close();
+
+  // state stuff
+  // idle == no io thread is awake and nothing is queued
+  bool is_idle() {
+    lock.Lock();
+    bool idle = (io_threads_running == 0) && root_queue.empty();
+    lock.Unlock();
+    return idle;
+  }
+  void barrier() {
+    lock.Lock();
+    root_queue.barrier();
+    lock.Unlock();
+  }
+
+  // ** blocking interface **
+  // these submit the io, then wait on a local Cond until the bio is marked
+  // done, and return the bio's rval.
+  // NOTE(review): barrier() takes `lock` again while it is already held
+  // here; this presumably relies on Mutex being recursive -- confirm.
+
+  // read
+  int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) {
+    bufferlist bl;
+    bl.push_back(bptr);
+    return read(bno, num, bl, n);
+  }
+  int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) {
+    Cond c;
+    biovec bio(biovec::IO_READ, bno, num, bl, &c, n);
+
+    lock.Lock();
+    _submit_io(&bio);
+    barrier(); // need this, to prevent starvation!
+    while (!bio.done)
+      c.Wait(lock);
+    lock.Unlock();
+    return bio.rval;
+  }
+
+  // write
+  int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) {
+    bufferlist bl;
+    bl.push_back(bptr);
+    return write(bno, num, bl, n);
+  }
+  int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) {
+    Cond c;
+    biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n);
+
+    lock.Lock();
+    _submit_io(&bio);
+    barrier(); // need this, to prevent starvation!
+    while (!bio.done)
+      c.Wait(lock);
+    lock.Unlock();
+    return bio.rval;
+  }
+
+  // ** non-blocking interface **
+  // these allocate a biovec, queue it and return it as an opaque handle.
+  // when the io completes, fin->finish(handle, rval) is called and both the
+  // callback and the biovec are deleted.
+  ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) {
+    biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n);
+    lock.Lock();
+    _submit_io(pbio);
+    lock.Unlock();
+    return (ioh_t)pbio;
+  }
+  ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) {
+    biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n);
+    lock.Lock();
+    _submit_io(pbio);
+    lock.Unlock();
+    return (ioh_t)pbio;
+  }
+  // returns 0 if the io was still queued and has been withdrawn, -1 otherwise
+  int cancel_io(ioh_t ioh);
+
+};
+
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "BufferCache.h"
+#include "Onode.h"
+
+
+/*********** BufferHead **************/
+
+
+// dout() for BufferHead code: output is emitted only when
+// x <= g_conf.debug_ebofs, and is tagged "ebofs.bh."
+#undef dout
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh."
+
+
+
+
+
+
+/************ ObjectCache **************/
+
+
+// dout() for ObjectCache code: output is emitted only when
+// x <= g_conf.debug_ebofs, and is tagged "ebofs.oc."
+#undef dout
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc."
+
+
+
+/*
+ * rx_finish - a read of blocks start~length has completed; bl holds the data.
+ *
+ * walks the bufferheads that start at or after `start` and lie entirely
+ * within start~length, and for each one:
+ *  - clears rx_ioh if it refers to this io
+ *  - rx:      takes its slice of bl and is marked clean
+ *  - partial: gets the block data copied in, its partial applied on top,
+ *             and is then marked dirty and written out
+ *  - otherwise (dirty/tx/clean): the read data is ignored
+ * contexts waiting on those bufferheads are collected and finished at the end.
+ */
+void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl)
+{
+ list<Context*> waiters;
+
+ dout(10) << "rx_finish " << start << "~" << length << endl;
+ for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+ p != data.end();
+ p++) {
+ BufferHead *bh = p->second;
+ dout(10) << "rx_finish ?" << *bh << endl;
+ assert(p->first == bh->start());
+
+ // past?
+ if (p->first >= start+length) break;
+ // a bh that sticks out past the end of the read also ends the walk
+ if (bh->end() > start+length) break; // past
+
+ assert(p->first >= start);
+ assert(bh->end() <= start+length);
+
+ dout(10) << "rx_finish !" << *bh << endl;
+
+ if (bh->rx_ioh == ioh)
+ bh->rx_ioh = 0;
+
+ if (bh->is_rx()) {
+ assert(bh->get_version() == 0);
+ assert(bh->end() <= start+length);
+ assert(bh->start() >= start);
+ dout(10) << "rx_finish rx -> clean on " << *bh << endl;
+ // slice out this bh's part of the read, at its byte offset within bl
+ bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE);
+ bc->mark_clean(bh);
+ }
+ else if (bh->is_partial()) {
+ dout(10) << "rx_finish partial -> tx on " << *bh << endl;
+
+ if (1) {
+ // double-check what block i am
+ vector<Extent> exv;
+ on->map_extents(bh->start(), 1, exv);
+ assert(exv.size() == 1);
+ block_t cur_block = exv[0].start;
+ assert(cur_block == bh->partial_tx_to);
+ }
+
+ // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves)
+ bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch );
+
+ // apply partial to myself
+ // NOTE(review): one block is copied from offset 0 of bl, which presumes
+ // this read covered exactly the partial bh's block -- confirm.
+ assert(bh->data.length() == 0);
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+ bh->data.push_back( bp );
+ bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl);
+ bh->apply_partial();
+
+ // write "normally"
+ bc->mark_dirty(bh);
+ bc->bh_write(on, bh, bh->partial_tx_to);//cur_block);
+
+ // clean up a bit
+ bh->partial_tx_to = 0;
+ bh->partial_tx_epoch = 0;
+ bh->partial.clear();
+ }
+ else {
+ dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl;
+ assert(bh->is_dirty() || // was overwritten
+ bh->is_tx() || // was overwritten and queued
+ bh->is_clean()); // was overwritten, queued, _and_ flushed to disk
+ }
+
+ // trigger waiters
+ // (this inner p shadows the outer bufferhead iterator)
+ for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();
+ p != bh->waitfor_read.end();
+ p++) {
+ assert(p->first >= bh->start() && p->first < bh->end());
+ waiters.splice(waiters.begin(), p->second);
+ }
+ bh->waitfor_read.clear();
+ }
+
+ // run the collected contexts only after all bufferheads have been updated
+ finish_contexts(waiters);
+}
+
+
+/*
+ * tx_finish - a write of blocks start~length (at the given version) has
+ * completed.
+ *
+ * every bufferhead starting inside the range has its tx_ioh cleared if it
+ * refers to this io.  bufferheads still in tx state whose version matches
+ * are recorded as flushed and marked clean; those with a newer version are
+ * left in tx, since a later write is still outstanding.
+ */
+void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length,
+                            version_t version, version_t epoch)
+{
+  const block_t range_end = start+length;
+
+  dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl;
+
+  map<block_t, BufferHead*>::iterator it;
+  for (it = data.lower_bound(start); it != data.end(); it++) {
+    BufferHead *bh = it->second;
+    dout(30) << "tx_finish ?bh " << *bh << endl;
+    assert(it->first == bh->start());
+
+    // past?
+    if (it->first >= range_end)
+      break;
+
+    // this io is no longer in flight for the bh
+    if (bh->tx_ioh == ioh)
+      bh->tx_ioh = 0;
+
+    if (!bh->is_tx()) {
+      dout(10) << "tx_finish bh not marked tx, skipping" << endl;
+      continue;
+    }
+    assert(bh->is_tx());
+
+    if (version != bh->version) {
+      // rewritten since this io was issued; a newer tx will clean it
+      dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version
+               << " on " << *bh << endl;
+      assert(bh->version > version);
+      continue;
+    }
+
+    dout(10) << "tx_finish tx -> clean on " << *bh << endl;
+    assert(bh->end() <= range_end);
+    bh->set_last_flushed(version);
+    bc->mark_clean(bh);
+  }
+}
+
+
+
+/*
+ * return any bh's that are fully contained in this range and are TX.
+ *
+ * only bufferheads that start at or after `start` and end at or before
+ * start+len are considered; bufferheads that merely overlap the range are
+ * deliberately ignored.  matches are appended to tx in block order.
+ * always returns 0.
+ *
+ * the scan stops at the first bufferhead that starts at or beyond the end of
+ * the range.  (the previous bookkeeping with an unsigned `left` counter
+ * wrapped around when the next bufferhead began past the range, which made
+ * the loop walk the remainder of the map for nothing.)
+ */
+int ObjectCache::find_tx(block_t start, block_t len,
+                         list<BufferHead*>& tx)
+{
+  const block_t end = start + len;
+
+  for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+       p != data.end() && p->first < end;
+       ++p) {
+    BufferHead *e = p->second;
+
+    // must lie _fully_ within start~len
+    if (e->end() <= end &&
+        e->is_tx())
+      tx.push_back(e);
+  }
+
+  return 0;
+}
+
+
+
+/*
+ * map a range of blocks into buffer_heads.
+ * - create missing buffer_heads as necessary.
+ * - fragment along disk extent boundaries
+ *
+ * each bufferhead touching start~len is filed, keyed by the block at which
+ * the walk reached it, into exactly one of the output maps:
+ *   hits    - clean, dirty or tx: data is readable now
+ *   rx      - a read is already in flight
+ *   partial - partial bufferhead
+ *   missing - newly created here for blocks that had no bufferhead
+ * always returns 0.
+ */
+int ObjectCache::map_read(block_t start, block_t len,
+ map<block_t, BufferHead*>& hits,
+ map<block_t, BufferHead*>& missing,
+ map<block_t, BufferHead*>& rx,
+ map<block_t, BufferHead*>& partial) {
+
+ map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+
+ // cur walks forward through the range; left is what remains of it.
+ block_t cur = start;
+ block_t left = len;
+
+ // lower_bound skips a bufferhead that begins before start but reaches
+ // into the range, so peek at the previous entry.
+ if (p != data.begin() &&
+ (p == data.end() || p->first > cur)) {
+ p--; // might overlap!
+ if (p->first + p->second->length() <= cur)
+ p++; // doesn't overlap.
+ }
+
+ while (left > 0) {
+ // at end?
+ if (p == data.end()) {
+ // rest is a miss.
+ vector<Extent> exv;
+ //on->map_extents(cur, left, exv); // we might consider some prefetch here.
+ on->map_extents(cur,
+ //MIN(left + g_conf.ebofs_max_prefetch, // prefetch
+ //on->object_blocks-cur),
+ left, // no prefetch
+ exv);
+ // one new bufferhead per disk extent
+ for (unsigned i=0; i<exv.size() && left > 0; i++) {
+ BufferHead *n = new BufferHead(this);
+ n->set_start( cur );
+ n->set_length( exv[i].length );
+ bc->add_bh(n);
+ missing[cur] = n;
+ dout(20) << "map_read miss " << left << " left, " << *n << endl;
+ cur += MIN(left,exv[i].length);
+ left -= MIN(left,exv[i].length);
+ }
+ assert(left == 0);
+ assert(cur == start+len);
+ break;
+ }
+
+ if (p->first <= cur) {
+ // have it (or part of it)
+ BufferHead *e = p->second;
+
+ if (e->is_clean() ||
+ e->is_dirty() ||
+ e->is_tx()) {
+ hits[cur] = e; // readable!
+ dout(20) << "map_read hit " << *e << endl;
+ bc->touch(e);
+ }
+ else if (e->is_rx()) {
+ rx[cur] = e; // missing, not readable.
+ dout(20) << "map_read rx " << *e << endl;
+ }
+ else if (e->is_partial()) {
+ partial[cur] = e;
+ dout(20) << "map_read partial " << *e << endl;
+ }
+ else {
+ dout(0) << "map_read ??? " << *e << endl;
+ assert(0);
+ }
+
+ // skip over the part of this bufferhead that lies within the range
+ block_t lenfromcur = MIN(e->end() - cur, left);
+ cur += lenfromcur;
+ left -= lenfromcur;
+ p++;
+ continue; // more?
+ } else if (p->first > cur) {
+ // gap.. miss
+ block_t next = p->first;
+ vector<Extent> exv;
+ on->map_extents(cur,
+ //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch
+ // on->object_blocks-cur)),
+ MIN(next-cur, left), // no prefetch
+ exv);
+
+ // fill the gap up to the next existing bufferhead, one new
+ // bufferhead per disk extent.  p is not advanced; the next
+ // iteration deals with the bufferhead at `next`.
+ for (unsigned i=0; i<exv.size() && left>0; i++) {
+ BufferHead *n = new BufferHead(this);
+ n->set_start( cur );
+ n->set_length( exv[i].length );
+ bc->add_bh(n);
+ missing[cur] = n;
+ cur += MIN(left, n->length());
+ left -= MIN(left, n->length());
+ dout(20) << "map_read gap " << *n << endl;
+ }
+ continue; // more?
+ }
+ else
+ assert(0);
+ }
+
+ assert(left == 0);
+ assert(cur == start+len);
+ return 0;
+}
+
+
+/*
+ * map a range of pages on an object's buffer cache.
+ *
+ * - break up bufferheads that don't fall completely within the range
+ * - cancel rx ops we obsolete.
+ * - resubmit rx ops if we split bufferheads
+ *
+ * - leave potentially obsoleted tx ops alone (for now)
+ * - don't worry about disk extent boundaries (yet)
+ *
+ * parameters:
+ *  start, len  - block range [start, start+len) within the object
+ *  alloc       - interval set consulted to decide whether a piece is a
+ *                "new alloc on disk" (see the newalloc flag below)
+ *  hits        - output: one bufferhead per piece of the range, keyed by the
+ *                piece's first block.  pieces are either existing bufferheads
+ *                (split where needed) or bufferheads created here.
+ *  super_epoch - passed through to bh_cancel_write()
+ *
+ * always returns 0.
+ */
+int ObjectCache::map_write(block_t start, block_t len,
+ interval_set<block_t>& alloc,
+ map<block_t, BufferHead*>& hits,
+ version_t super_epoch)
+{
+ map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+
+ dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl;
+ // p->first >= start
+
+ // cur is the next block to map, left the number of blocks still to map.
+ block_t cur = start;
+ block_t left = len;
+
+ // the bufferhead just before p may still extend over 'start'; step back
+ // onto it if so.
+ if (p != data.begin() &&
+ (p == data.end() || p->first > cur)) {
+ p--; // might overlap!
+ if (p->first + p->second->length() <= cur)
+ p++; // doesn't overlap.
+ }
+
+ //dump();
+
+ while (left > 0) {
+ // max for this bh (bc of (re)alloc on disk)
+ block_t max = left;
+ bool newalloc = false;
+
+ // based on alloc/no-alloc boundary ...
+ // NOTE(review): the boundary refinement below only runs when
+ // alloc.contains(cur, left) is true -- confirm that this outer test is the
+ // intended one (vs. an "intersects"-style test).
+ if (alloc.contains(cur, left)) {
+ if (alloc.contains(cur)) {
+ block_t ends = alloc.end_after(cur);
+ max = MIN(left, ends-cur);
+ newalloc = true;
+ } else {
+ if (alloc.starts_after(cur)) {
+ block_t st = alloc.start_after(cur);
+ max = MIN(left, st-cur);
+ }
+ }
+ }
+
+ // based on disk extent boundary ...
+ // if the piece maps to more than one extent, stop at the end of the first.
+ vector<Extent> exv;
+ on->map_extents(cur, max, exv);
+ if (exv.size() > 1)
+ max = exv[0].length;
+
+ if (newalloc) {
+ dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl;
+ } else {
+ dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl;
+ }
+
+ // at end?
+ // nothing cached at or after cur: create a new bufferhead of 'max' blocks.
+ if (p == data.end()) {
+ BufferHead *n = new BufferHead(this);
+ n->set_start( cur );
+ n->set_length( max );
+ bc->add_bh(n);
+ hits[cur] = n;
+ left -= max;
+ cur += max;
+ continue;
+ }
+
+ dout(10) << "p is " << *p->second << endl;
+
+
+ if (p->first <= cur) {
+ // an existing bufferhead covers cur.  carve out the piece that lies in
+ // [cur, cur+max); 'bh' ends up pointing at that piece.  when an rx or
+ // (non-newalloc) tx could be cancelled, the io is reissued for the
+ // piece(s) we are not taking.
+ BufferHead *bh = p->second;
+ dout(10) << "map_write bh " << *bh << " intersected" << endl;
+
+ if (p->first < cur) {
+ if (cur+max >= p->first+p->second->length()) {
+ // we want right bit (one splice)
+ if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+ BufferHead *right = bc->split(bh, cur);
+ bc->bh_read(on, bh); // reread left bit
+ bh = right;
+ } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+ BufferHead *right = bc->split(bh, cur);
+ bc->bh_write(on, bh); // rewrite left bit
+ bh = right;
+ } else {
+ bh = bc->split(bh, cur); // just split it
+ }
+ // the split inserted the right piece directly after p.
+ p++;
+ assert(p->second == bh);
+ } else {
+ // we want middle bit (two splices)
+ if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+ BufferHead *middle = bc->split(bh, cur);
+ bc->bh_read(on, bh); // reread left
+ p++;
+ assert(p->second == middle);
+ BufferHead *right = bc->split(middle, cur+max);
+ bc->bh_read(on, right); // reread right
+ bh = middle;
+ } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+ BufferHead *middle = bc->split(bh, cur);
+ bc->bh_write(on, bh); // redo left
+ p++;
+ assert(p->second == middle);
+ BufferHead *right = bc->split(middle, cur+max);
+ bc->bh_write(on, right); // redo right
+ bh = middle;
+ } else {
+ BufferHead *middle = bc->split(bh, cur);
+ p++;
+ assert(p->second == middle);
+ bc->split(middle, cur+max);
+ bh = middle;
+ }
+ }
+ } else if (p->first == cur) {
+ if (p->second->length() <= max) {
+ // whole bufferhead, piece of cake.
+ } else {
+ // we want left bit (one splice)
+ if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+ BufferHead *right = bc->split(bh, cur+max);
+ bc->bh_read(on, right); // re-rx the right bit
+ } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
+ BufferHead *right = bc->split(bh, cur+max);
+ bc->bh_write(on, right); // re-tx the right bit
+ } else {
+ bc->split(bh, cur+max); // just split
+ }
+ }
+ }
+
+ // try to cancel tx?
+ if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch);
+
+ // put in our map
+ hits[cur] = bh;
+
+ // keep going.
+ // advance past the piece we took; p moves on to whatever follows it.
+ block_t lenfromcur = bh->end() - cur;
+ cur += lenfromcur;
+ left -= lenfromcur;
+ p++;
+ continue;
+ } else {
+ // gap!
+ // the next cached bufferhead starts after cur: fill the hole (up to
+ // 'max' blocks) with a new bufferhead.  p is left where it is.
+ block_t next = p->first;
+ block_t glen = MIN(next-cur, max);
+ dout(10) << "map_write gap " << cur << "~" << glen << endl;
+ BufferHead *n = new BufferHead(this);
+ n->set_start( cur );
+ n->set_length( glen );
+ bc->add_bh(n);
+ hits[cur] = n;
+
+ cur += glen;
+ left -= glen;
+ continue; // more?
+ }
+ }
+
+ assert(left == 0);
+ assert(cur == start+len);
+ return 0;
+}
+
+/* don't need this.
+int ObjectCache::scan_versions(block_t start, block_t len,
+ version_t& low, version_t& high)
+{
+ map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+ // p->first >= start
+
+ if (p != data.begin() && p->first > start) {
+ p--; // might overlap?
+ if (p->first + p->second->length() <= start)
+ p++; // doesn't overlap.
+ }
+ if (p->first >= start+len)
+ return -1; // to the right. no hits.
+
+ // start
+ low = high = p->second->get_version();
+
+ for (p++; p != data.end(); p++) {
+ // past?
+ if (p->first >= start+len) break;
+
+ const version_t v = p->second->get_version();
+ if (low > v) low = v;
+ if (high < v) high = v;
+ }
+
+ return 0;
+}
+*/
+
+/*
+ * drop cached bufferheads beyond a new object size.
+ *
+ * works backwards from the last bufferhead in the map: every bufferhead whose
+ * end() is greater than 'blocks' is removed from the cache and deleted.  a
+ * bufferhead that straddles 'blocks' is split there first, and only its right
+ * piece is removed.  read waiters on a removed bufferhead are finished with
+ * result -1.
+ *
+ *  blocks      - bufferheads ending at or before this block are kept
+ *  super_epoch - passed through to bh_cancel_write()
+ */
+void ObjectCache::truncate(block_t blocks, version_t super_epoch)
+{
+ dout(7) << "truncate " << object_id
+ << " " << blocks << " blocks"
+ << endl;
+
+ while (!data.empty()) {
+ // always look at the highest-offset bufferhead.
+ block_t bhoff = data.rbegin()->first;
+ BufferHead *bh = data.rbegin()->second;
+
+ // everything left is entirely within the new size; done.
+ if (bh->end() <= blocks) break;
+
+ // a pending tx / partial write is only cancelled when the bufferhead's
+ // whole range is in the onode's 'uncommitted' set.
+ bool uncom = on->uncommitted.contains(bh->start(), bh->length());
+ dout(10) << "truncate " << *bh << " uncom " << uncom
+ << " of " << on->uncommitted
+ << endl;
+
+ if (bhoff < blocks) {
+ // we want right bit (one splice)
+ // if io on the bufferhead could be cancelled, reissue it for the left
+ // piece, which stays in the cache.
+ if (bh->is_rx() && bc->bh_cancel_read(bh)) {
+ BufferHead *right = bc->split(bh, blocks);
+ bc->bh_read(on, bh); // reread left bit
+ bh = right;
+ } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) {
+ BufferHead *right = bc->split(bh, blocks);
+ bc->bh_write(on, bh); // rewrite left bit
+ bh = right;
+ } else {
+ bh = bc->split(bh, blocks); // just split it
+ }
+ // no worries about partials up here, they're always 1 block (and thus never split)
+ } else {
+ // whole thing
+ // cancel any pending/queued io, if possible.
+ if (bh->is_rx())
+ bc->bh_cancel_read(bh);
+ if (bh->is_tx() && uncom)
+ bc->bh_cancel_write(bh, super_epoch);
+ if (bh->shadow_of) {
+ dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl;
+ // shadow
+ // detach from the bufferhead we shadow, and drop our shadow_partials
+ // registration if we were waiting on a partial.
+ bh->shadow_of->remove_shadow(bh);
+ if (bh->is_partial())
+ bc->cancel_shadow_partial(bh->rx_from.start, bh);
+ } else {
+ // normal
+ if (bh->is_partial() && uncom)
+ bc->bh_cancel_partial_write(bh);
+ }
+ }
+
+ // fail everyone waiting to read from the bufferhead we are dropping.
+ for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();
+ p != bh->waitfor_read.end();
+ p++) {
+ finish_contexts(p->second, -1);
+ }
+
+ // bh is now the last entry of 'data'; removing it lets the loop proceed
+ // to the next-highest bufferhead.
+ bc->remove_bh(bh);
+ delete bh;
+ }
+}
+
+
+/*
+ * duplicate this object's dirty, tx and partial bufferheads into the object
+ * cache of 'other'.
+ *
+ * the duplicates share the underlying data buffers (only references are
+ * copied).  a duplicate of a partial bufferhead is marked partial and
+ * registered as a shadow partial; any other duplicate is added as a shadow of
+ * its source and marked clean.  the target ObjectCache is only fetched from
+ * 'other' once the first bufferhead needs duplicating.
+ */
+void ObjectCache::clone_to(Onode *other)
+{
+  ObjectCache *dest = 0;   // fetched lazily
+
+  map<block_t, BufferHead*>::iterator it = data.begin();
+  for ( ; it != data.end(); ++it) {
+    BufferHead *src = it->second;
+    dout(10) << "clone_to ? " << *src << endl;
+
+    const bool wanted = src->is_dirty() || src->is_tx() || src->is_partial();
+    if (!wanted)
+      continue;
+
+    if (dest == 0)
+      dest = other->get_oc(bc);
+
+    // new bufferhead over the same range, pointing at the same buffers
+    BufferHead *dup = new BufferHead(dest);
+    dup->set_start( src->start() );
+    dup->set_length( src->length() );
+    dup->data = src->data;
+    bc->add_bh(dup);
+
+    if (!src->is_partial()) {
+      // the (clean) duplicate shadows its source
+      src->add_shadow(dup);
+      bc->mark_clean(dup);
+    } else {
+      dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl;
+      dup->partial = src->partial;
+      bc->mark_partial(dup);
+      // wait for the same disk read the source is waiting for
+      bc->add_shadow_partial(src->rx_from.start, dup);
+    }
+
+    dout(10) << "clone_to dup " << *src << " -> " << *dup << endl;
+  }
+}
+
+
+
+/************** BufferCache ***************/
+
+// from here on debug output is tagged for the BufferCache methods: the
+// previous dout definition is dropped and replaced.
+#undef dout
+// dout(x) expands to an if-statement that streams to cout, prefixed with
+// "ebofs.bc.", only when x <= g_conf.debug_ebofs.
+// NOTE(review): because this expands to a bare `if`, an unbraced dout(...)
+// immediately followed by `else` would bind that else to the macro's if.
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc."
+
+
+
+/*
+ * split a bufferhead in two at object block 'after'.
+ *
+ * 'orig' is shortened to [orig->start(), after) and a new bufferhead covering
+ * [after, old end) is created, added to the cache and returned.  the new
+ * bufferhead inherits orig's version, epoch_modified, last_flushed and state;
+ * for rx bufferheads rx_from is divided between the two; orig's shadows are
+ * added to the right half; data buffers (if any) are divided at the split
+ * point; and read waiters keyed at or beyond 'after' move to the right half.
+ *
+ * returns the new right-hand bufferhead.
+ */
+BufferHead *BufferCache::split(BufferHead *orig, block_t after)
+{
+  dout(20) << "split " << *orig << " at " << after << endl;
+
+  // split off right
+  BufferHead *right = new BufferHead(orig->get_oc());
+  right->set_version(orig->get_version());
+  right->epoch_modified = orig->epoch_modified;
+  right->last_flushed = orig->last_flushed;
+  right->set_state(orig->get_state());
+
+  block_t newleftlen = after - orig->start();
+  right->set_start( after );
+  right->set_length( orig->length() - newleftlen );
+
+  // shorten left (stats are kept per state by length, so re-account it)
+  stat_sub(orig);
+  orig->set_length( newleftlen );
+  stat_add(orig);
+
+  // add right
+  add_bh(right);
+
+  // adjust rx_from
+  if (orig->is_rx()) {
+    right->rx_from = orig->rx_from;
+    orig->rx_from.length = newleftlen;
+    right->rx_from.length -= newleftlen;
+    right->rx_from.start += newleftlen;
+  }
+
+  // dup shadows
+  for (set<BufferHead*>::iterator p = orig->shadows.begin();
+       p != orig->shadows.end();
+       ++p)
+    right->add_shadow(*p);
+
+  // split buffers too
+  bufferlist bl;
+  bl.claim(orig->data);
+  if (bl.length()) {
+    assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE);
+    right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE);
+    orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE);
+  }
+
+  // move read waiters
+  // every waiter keyed at or beyond the split point belongs to the right
+  // half.  (the previous backwards walk stopped on reaching begin() without
+  // examining it, so the lowest-keyed waiter was never moved even when it
+  // lay in the right half.)
+  map<block_t, list<Context*> >::iterator first_moved =
+    orig->waitfor_read.lower_bound(right->start());
+  for (map<block_t, list<Context*> >::iterator p = first_moved;
+       p != orig->waitfor_read.end();
+       ++p) {
+    dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl;
+    right->waitfor_read[p->first].swap( p->second );
+  }
+  orig->waitfor_read.erase(first_moved, orig->waitfor_read.end());
+
+  dout(20) << "split left is " << *orig << endl;
+  dout(20) << "split right is " << *right << endl;
+  return right;
+}
+
+
+/*
+ * start a device read for a bufferhead.
+ *
+ * a missing bufferhead is marked rx; otherwise it must be partial.  the
+ * bufferhead's range must map to exactly one disk extent.  a non-zero 'from'
+ * overrides the source: a single block is read from disk block 'from'
+ * (used for reading partials).  the io handle and source extent are recorded
+ * in bh->rx_ioh / bh->rx_from, and a reference is taken on the onode's
+ * ObjectCache.
+ */
+void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from)
+{
+  dout(10) << "bh_read " << *on << " on " << *bh << endl;
+
+  if (!bh->is_missing()) {
+    assert(bh->is_partial());
+  } else {
+    mark_rx(bh);
+  }
+
+  // where does it live on disk?  there should be only one extent!
+  vector<Extent> extents;
+  on->map_extents(bh->start(), bh->length(), extents);
+  assert(extents.size() == 1);
+  Extent src = extents[0];
+
+  if (from) {
+    // forced source block, used for reading partials
+    dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl;
+    src.start = from;
+    src.length = 1;
+  }
+
+  // no read may already be in flight for this bufferhead
+  assert(bh->rx_ioh == 0);
+
+  dout(20) << "bh_read " << *bh << " from " << src << endl;
+
+  // completion context; it carries the buffer the device reads into
+  C_OC_RxFinish *done = new C_OC_RxFinish(ebofs_lock, on->oc,
+                                          bh->start(), bh->length(),
+                                          src.start);
+  done->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+
+  bh->rx_ioh = dev.read(src.start, src.length, done->bl, done);
+  bh->rx_from = src;
+  on->oc->get();
+}
+
+/*
+ * try to cancel the read in flight for a bufferhead.
+ *
+ * returns false if there is no read handle or the device's cancel_io()
+ * returns a negative value.  on success the handle is cleared, the bufferhead
+ * is marked missing, the ObjectCache reference held for the read is dropped,
+ * and true is returned.
+ */
+bool BufferCache::bh_cancel_read(BufferHead *bh)
+{
+  if (!bh->rx_ioh || dev.cancel_io(bh->rx_ioh) < 0)
+    return false;
+
+  dout(10) << "bh_cancel_read on " << *bh << endl;
+  bh->rx_ioh = 0;
+  mark_missing(bh);
+
+  // the object cache is expected to still hold other references
+  int refs = bh->oc->put();
+  assert(refs);
+  return true;
+}
+
+/*
+ * start a device write for a dirty bufferhead.
+ *
+ * the bufferhead is marked tx and written to the single disk extent its range
+ * maps to.  if 'shouldbe' is non-zero, that extent is asserted to be exactly
+ * one block starting at 'shouldbe'.  the target block and io handle are
+ * recorded in bh->tx_block / bh->tx_ioh, a reference is taken on the onode's
+ * ObjectCache, and the unflushed bh-write counter for the bufferhead's epoch
+ * is incremented.
+ */
+void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe)
+{
+  dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl;
+
+  assert(bh->get_version() > 0);
+  assert(bh->is_dirty());
+  mark_tx(bh);
+
+  // destination on disk; there must be exactly one extent
+  vector<Extent> extents;
+  on->map_extents(bh->start(), bh->length(), extents);
+  assert(extents.size() == 1);
+  Extent dest = extents[0];
+
+  assert(!shouldbe || (dest.length == 1 && dest.start == shouldbe));
+
+  dout(20) << "bh_write " << *bh << " to " << dest << endl;
+
+  //assert(bh->tx_ioh == 0);
+
+  // there must be something newer than what was last flushed
+  assert(bh->get_last_flushed() < bh->get_version());
+
+  bh->tx_block = dest.start;
+  bh->tx_ioh = dev.write(dest.start, dest.length, bh->data,
+                         new C_OC_TxFinish(ebofs_lock, on->oc,
+                                           bh->start(), bh->length(),
+                                           bh->get_version(),
+                                           bh->epoch_modified),
+                         "bh_write");
+
+  on->oc->get();
+  inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );
+
+  /*
+  // disabled: hose any partial write queued on the same block
+  if (bh->partial_write.count(ex.start)) {
+    dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl;
+    dec_unflushed( bh->partial_write[ex.start].epoch );
+    bh->partial_write.erase(ex.start);
+  }
+  */
+}
+
+
+/*
+ * try to cancel the write in flight for a bufferhead.
+ *
+ * returns false if there is no write handle or the device's cancel_io()
+ * returns a negative value.  on success the handle is cleared, the bufferhead
+ * goes back to dirty, the unflushed bh-write counter for its epoch is
+ * decremented, the ObjectCache reference held for the write is dropped, and
+ * true is returned.  the bufferhead's epoch is asserted to equal cur_epoch.
+ */
+bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch)
+{
+  if (!bh->tx_ioh || dev.cancel_io(bh->tx_ioh) < 0)
+    return false;
+
+  dout(10) << "bh_cancel_write on " << *bh << endl;
+  bh->tx_ioh = 0;
+  mark_dirty(bh);
+
+  // the write being cancelled should belong to the current epoch
+  assert(bh->epoch_modified == cur_epoch);
+  assert(bh->epoch_modified > 0);
+  dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );
+
+  // the object cache is expected to still hold other references
+  int refs = bh->oc->put();
+  assert(refs);
+  return true;
+}
+
+/*
+ * completion of a bufferhead write.  takes ebofs_lock for its duration.
+ *
+ * drops the ObjectCache reference that was taken for the write; if that was
+ * the last reference the ObjectCache is deleted, otherwise the completion is
+ * forwarded to it.  then the unflushed bh-write counter for 'epoch' is
+ * decremented.
+ */
+void BufferCache::tx_finish(ObjectCache *oc,
+                            ioh_t ioh, block_t start, block_t length,
+                            version_t version, version_t epoch)
+{
+  ebofs_lock.Lock();
+
+  // finish oc
+  const int refs = oc->put();
+  if (refs != 0) {
+    oc->tx_finish(ioh, start, length, version, epoch);
+  } else {
+    delete oc;
+  }
+
+  // one fewer write outstanding for this epoch
+  assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0);
+  dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch);
+
+  ebofs_lock.Unlock();
+}
+
+/*
+ * completion of a bufferhead read.  takes ebofs_lock for its duration.
+ *
+ *  oc        - object cache the read was issued for; the reference taken for
+ *              the read is dropped (and the cache deleted if it was the last)
+ *  ioh       - io handle of the finished read
+ *  start, length - object block range of the read
+ *  diskstart - first device block that was read
+ *  bl        - the data that was read, starting at device block diskstart
+ *
+ * besides forwarding the completion to the ObjectCache, this resolves
+ * everything that was waiting on the device blocks
+ * [diskstart, diskstart+length):
+ *  - queued partial writes: the block's on-disk content is overlaid with the
+ *    partial content and written to the destination block;
+ *  - shadow partials: the shadow bufferhead gets the block content with its
+ *    partial applied, becomes clean, and its read waiters are finished.
+ */
+void BufferCache::rx_finish(ObjectCache *oc,
+                            ioh_t ioh, block_t start, block_t length,
+                            block_t diskstart,
+                            bufferlist& bl)
+{
+  ebofs_lock.Lock();
+  dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length
+           << ", at device block " << diskstart << endl;
+
+  // oc
+  if (oc->put() == 0)
+    delete oc;
+  else
+    oc->rx_finish(ioh, start, length, bl);
+
+  // finish any partials?
+  //  note: these are partials that were re-written after a commit,
+  //  or for whom the OC was destroyed (eg truncated after a commit)
+  map<block_t, map<block_t, PartialWrite> >::iterator sp = partial_write.lower_bound(diskstart);
+  while (sp != partial_write.end()) {
+    if (sp->first >= diskstart+length) break;
+    assert(sp->first >= diskstart);
+
+    // take the queued writes for this source block out of the map first
+    block_t pblock = sp->first;
+    map<block_t, PartialWrite> writes;
+    writes.swap( sp->second );
+
+    map<block_t, map<block_t, PartialWrite> >::iterator t = sp;
+    sp++;
+    partial_write.erase(t);
+
+    // the one block of 'bl' that holds device block pblock
+    bufferlist blockdata;
+    blockdata.substr_of(bl, (pblock-diskstart)*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
+
+    for (map<block_t, PartialWrite>::iterator p = writes.begin();
+         p != writes.end();
+         p++) {
+      dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
+               << " for epoch " << p->second.epoch
+               //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")"
+               << endl;
+      // this had better be a past epoch
+      //assert(p->epoch == epoch_modified - 1);   // ??
+
+      // make the combined block: on-disk content first, partial on top.
+      // copy_in() takes the offset within the *destination*, which is a
+      // single block, so the copy always lands at offset 0.
+      bufferlist combined;
+      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+      combined.push_back( bp );
+      combined.copy_in(0, EBOFS_BLOCK_SIZE, blockdata);
+      BufferHead::apply_partial( combined, p->second.partial );
+
+      // write it!
+      dev.write( pblock, 1, combined,
+                 new C_OC_PartialTxFinish( this, p->second.epoch ),
+                 "finish_partials");
+    }
+  }
+
+  // shadow partials?
+  {
+    list<Context*> waiters;
+    map<block_t, set<BufferHead*> >::iterator sp = shadow_partials.lower_bound(diskstart);
+    while (sp != shadow_partials.end()) {
+      if (sp->first >= diskstart+length) break;
+      assert(sp->first >= diskstart);
+
+      block_t pblock = sp->first;
+      set<BufferHead*> ls;
+      ls.swap( sp->second );
+
+      map<block_t, set<BufferHead*> >::iterator t = sp;
+      sp++;
+      shadow_partials.erase(t);
+
+      // the one block of 'bl' that holds device block pblock
+      bufferlist blockdata;
+      blockdata.substr_of(bl, (pblock-diskstart)*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
+
+      for (set<BufferHead*>::iterator p = ls.begin();
+           p != ls.end();
+           ++p) {
+        BufferHead *bh = *p;
+        dout(10) << "rx_finish applying shadow_partial for " << pblock
+                 << " to " << *bh << endl;
+        // fresh one-block buffer: disk content, then the partial on top
+        bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+        bh->data.clear();
+        bh->data.push_back( bp );
+        bh->data.copy_in(0, EBOFS_BLOCK_SIZE, blockdata);
+        bh->apply_partial();
+        bh->set_state(BufferHead::STATE_CLEAN);
+
+        // trigger waiters
+        for (map<block_t,list<Context*> >::iterator w = bh->waitfor_read.begin();
+             w != bh->waitfor_read.end();
+             w++) {
+          assert(w->first >= bh->start() && w->first < bh->end());
+          waiters.splice(waiters.begin(), w->second);
+        }
+        bh->waitfor_read.clear();
+      }
+    }
+
+    // kick waiters
+    finish_contexts(waiters);
+  }
+
+  // done.
+  ebofs_lock.Unlock();
+}
+
+/*
+ * completion of a partial-block write: under ebofs_lock, decrement the
+ * unflushed partial-write counter for 'epoch' (asserted to be positive).
+ */
+void BufferCache::partial_tx_finish(version_t epoch)
+{
+  ebofs_lock.Lock();
+  dout(10) << "partial_tx_finish in epoch " << epoch << endl;
+
+  // one fewer partial write outstanding for this epoch
+  assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0);
+  dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch);
+
+  ebofs_lock.Unlock();
+}
+
+
+
+
+/*
+ * queue the partial content of a (one-block, partial) bufferhead to be
+ * written once the block it is being read from arrives.
+ *
+ * the destination is the single disk block the bufferhead maps to; it and the
+ * bufferhead's epoch are remembered in partial_tx_to / partial_tx_epoch so
+ * the queued write can be found again later.
+ */
+void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh)
+{
+  assert(bh->get_version() > 0);
+  assert(bh->is_partial());
+  assert(bh->length() == 1);
+
+  // which disk block does it go to?
+  vector<Extent> extents;
+  on->map_extents(bh->start(), bh->length(), extents);
+  assert(extents.size() == 1);
+  const block_t b = extents[0].start;
+  assert(extents[0].length == 1);
+
+  bh->partial_tx_to = b;
+  bh->partial_tx_epoch = bh->epoch_modified;
+
+  dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl;
+
+  // copy map state, queue for this block
+  assert(bh->rx_from.length == 1);
+  queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch );
+}
+
+/*
+ * withdraw the partial write queued for a (one-block, partial) bufferhead,
+ * identified by its read source block and the destination/epoch remembered
+ * on the bufferhead.
+ */
+void BufferCache::bh_cancel_partial_write(BufferHead *bh)
+{
+  assert(bh->is_partial());
+  assert(bh->length() == 1);
+
+  cancel_partial( bh->rx_from.start,
+                  bh->partial_tx_to,
+                  bh->partial_tx_epoch );
+}
+
+
+/*
+ * record a partial write: once block 'from' has been read, its content
+ * overlaid with 'partial' is to be written to block 'to'.
+ *
+ * a write must not already be queued for the same (from, to) pair (asserted).
+ * increments the unflushed partial-write counter for 'epoch'.
+ */
+void BufferCache::queue_partial(block_t from, block_t to,
+                                map<off_t, bufferlist>& partial, version_t epoch)
+{
+  dout(10) << "queue_partial " << from << " -> " << to
+           << " in epoch " << epoch
+           << endl;
+
+  map<block_t, PartialWrite>& targets = partial_write[from];
+
+  if (targets.count(to) == 0) {
+    inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
+  } else {
+    // this should be in the same epoch.
+    assert( targets[to].epoch == epoch);
+    assert(0);  // actually.. no!
+  }
+
+  PartialWrite& pw = targets[to];
+  pw.partial = partial;
+  pw.epoch = epoch;
+}
+
+/*
+ * remove the partial write queued for (from -> to).  it must exist and
+ * belong to 'epoch' (asserted).  the per-source map is dropped once it is
+ * empty, and the unflushed partial-write counter for 'epoch' is decremented.
+ */
+void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch)
+{
+  assert(partial_write.count(from));
+  assert(partial_write[from].count(to));
+  assert(partial_write[from][to].epoch == epoch);
+
+  dout(10) << "cancel_partial " << from << " -> " << to
+           << " (was epoch " << partial_write[from][to].epoch << ")"
+           << endl;
+
+  map<block_t, PartialWrite>& targets = partial_write[from];
+  targets.erase(to);
+  if (targets.empty())
+    partial_write.erase(from);
+
+  dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
+}
+
+
+/*
+ * register bh as a shadow partial waiting on the read of disk block 'from'.
+ */
+void BufferCache::add_shadow_partial(block_t from, BufferHead *bh)
+{
+  dout(10) << "add_shadow_partial from " << from << " " << *bh << endl;
+
+  set<BufferHead*>& waiting = shadow_partials[from];
+  waiting.insert(bh);
+}
+
+/*
+ * unregister bh as a shadow partial waiting on the read of disk block 'from'.
+ *
+ * does nothing if nothing is registered for 'from'.  the per-block set is
+ * removed once it becomes empty, mirroring cancel_partial(); the lookup uses
+ * find() so an absent 'from' no longer creates an empty entry.
+ */
+void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh)
+{
+  dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl;
+
+  map<block_t, set<BufferHead*> >::iterator p = shadow_partials.find(from);
+  if (p == shadow_partials.end())
+    return;
+
+  p->second.erase(bh);
+  if (p->second.empty())
+    shadow_partials.erase(p);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_BUFFERCACHE_H
+#define __EBOFS_BUFFERCACHE_H
+
+#include "include/lru.h"
+#include "include/Context.h"
+
+#include "common/Clock.h"
+
+#include "types.h"
+#include "BlockDevice.h"
+
+#include "include/interval_set.h"
+
+class ObjectCache;
+class BufferCache;
+class Onode;
+
+/*
+ * BufferHead - a contiguous run of blocks of one object held in the cache.
+ *
+ * the bufferhead tracks its state (missing/clean/dirty/tx/rx/partial), the
+ * data buffers, in-flight io handles, read waiters, and "shadow" bufferheads
+ * that were cloned from it.  entering the partial, rx or tx state takes a
+ * reference on the bufferhead (pinning it in the LRU); leaving drops it.
+ *
+ * a partial bufferhead additionally carries 'partial': a map from byte offset
+ * within the block to dirty content that is to be overlaid onto the block
+ * once it has been read from disk.
+ */
+class BufferHead : public LRUObject {
+ public:
+  /*
+   * - buffer_heads should always break across disk extent boundaries
+   * - partial buffer_heads are always 1 block.
+   */
+  const static int STATE_MISSING = 0; //    missing; data is on disk, but not loaded.
+  const static int STATE_CLEAN = 1;   // Rw clean
+  const static int STATE_DIRTY = 2;   // RW dirty
+  const static int STATE_TX = 3;      // Rw flushing to disk
+  const static int STATE_RX = 4;      //  w reading from disk
+  const static int STATE_PARTIAL = 5; // reading from disk, + partial content map.  always 1 block.
+
+ public:
+  ObjectCache *oc;
+
+  bufferlist data;
+
+  ioh_t     rx_ioh;           // handle of the read in flight (0 if none)
+  Extent    rx_from;          // disk extent being read from
+  ioh_t     tx_ioh;           // handle of the write in flight (0 if none)
+  block_t   tx_block;         // disk block being written to
+  block_t   partial_tx_to;    // destination block of the queued partial write
+  version_t partial_tx_epoch; // epoch of the queued partial write
+
+  map<off_t, bufferlist> partial;   // partial dirty content overlayed onto incoming data
+
+  map< block_t, list<Context*> > waitfor_read;
+
+  set<BufferHead*> shadows;     // shadow bh's that clone()ed me.
+  BufferHead*      shadow_of;
+
+ private:
+  int ref;
+  int state;
+
+ public:
+  version_t epoch_modified;
+
+  version_t version;        // current version in cache
+  version_t last_flushed;   // last version flushed to disk
+
+  Extent object_loc;        // block position _in_object_
+
+  utime_t dirty_stamp;
+
+ public:
+  BufferHead(ObjectCache *o) :
+    oc(o), //cancellable_ioh(0), tx_epoch(0),
+    rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0),
+    shadow_of(0),
+    ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0)
+  {}
+  ~BufferHead() {
+    unpin_shadows();
+  }
+
+  ObjectCache *get_oc() { return oc; }
+
+  // reference counting.  the first reference pins the bufferhead in the LRU,
+  // dropping the last one unpins it.  both return the new count.
+  int get() {
+    assert(ref >= 0);
+    if (ref == 0) lru_pin();
+    return ++ref;
+  }
+  int put() {
+    assert(ref > 0);
+    if (ref == 1) lru_unpin();
+    --ref;
+    return ref;
+  }
+
+  // position within the object, in blocks: [start(), end())
+  block_t start() { return object_loc.start; }
+  void set_start(block_t s) { object_loc.start = s; }
+  block_t length() { return object_loc.length; }
+  void set_length(block_t l) { object_loc.length = l; }
+  block_t end() { return start() + length(); }
+  block_t last() { return end()-1; }
+
+  version_t get_version() { return version; }
+  void set_version(version_t v) { version = v; }
+  version_t get_last_flushed() { return last_flushed; }
+  // last_flushed must strictly increase.
+  void set_last_flushed(version_t v) {
+    if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl;
+    assert(v > last_flushed);
+    last_flushed = v;
+  }
+
+  utime_t get_dirty_stamp() { return dirty_stamp; }
+  void set_dirty_stamp(utime_t t) { dirty_stamp = t; }
+
+  // change state.  the new reference (if any) is taken before the old one is
+  // dropped, so moving between two pinned states never unpins in between.
+  // shadows are released when a tx or partial state is left.
+  void set_state(int s) {
+    if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get();
+    if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put();
+
+    if ((state == STATE_TX && s != STATE_TX) ||
+        (state == STATE_PARTIAL && s != STATE_PARTIAL))
+      unpin_shadows();
+
+    state = s;
+  }
+  int get_state() { return state; }
+
+  bool is_missing() { return state == STATE_MISSING; }
+  bool is_dirty() { return state == STATE_DIRTY; }
+  bool is_clean() { return state == STATE_CLEAN; }
+  bool is_tx() { return state == STATE_TX; }
+  bool is_rx() { return state == STATE_RX; }
+  bool is_partial() { return state == STATE_PARTIAL; }
+
+  //bool is_partial_writes() { return !partial_write.empty(); }
+  //void finish_partials();
+  //void cancel_partials();
+  //void queue_partial_write(block_t b);
+
+  // shadows: each registered shadow holds a reference on itself for as long
+  // as it is registered, and points back at us through shadow_of.
+  void add_shadow(BufferHead *dup) {
+    shadows.insert(dup);
+    dup->shadow_of = this;
+    dup->get();
+  }
+  void remove_shadow(BufferHead *dup) {
+    shadows.erase(dup);
+    dup->shadow_of = 0;
+    dup->put();
+  }
+  void unpin_shadows() {
+    for (set<BufferHead*>::iterator p = shadows.begin();
+         p != shadows.end();
+         ++p) {
+      //cout << "unpin shadow " << *p << endl;
+      (*p)->shadow_of = 0;
+      (*p)->put();
+    }
+    shadows.clear();
+  }
+
+  /*
+   * copy the bytes [start, end) of the partial content into bl.
+   * the range must be fully and contiguously covered by partial fragments
+   * (asserted); use have_partial_range() to check first.
+   */
+  void copy_partial_substr(off_t start, off_t end, bufferlist& bl) {
+    map<off_t, bufferlist>::iterator i = partial.begin();
+
+    // skip fragments that end at or before start.  the end() test comes
+    // first so an exhausted iterator is never dereferenced.
+    while (i != partial.end() &&
+           i->first + (off_t)i->second.length() <= start)
+      i++;
+    assert(i != partial.end());
+    assert(i->first <= start);
+
+    // first fragment: begin at 'start' within it, and take no more than the
+    // fragment has left beyond that point.
+    unsigned bhoff = MAX(start, i->first) - i->first;
+    unsigned bhlen = MIN(end-start, (off_t)(i->second.length() - bhoff));
+    bl.substr_of( i->second, bhoff, bhlen );
+
+    off_t pos = i->first + i->second.length();
+
+    // have continuous to end?
+    for (i++; i != partial.end(); i++) {
+      if (pos >= end) break;
+      assert(pos == i->first);
+
+      pos = i->first + i->second.length();
+
+      if (pos <= end) {      // this whole frag
+        bl.append( i->second );
+      } else {               // partial end
+        unsigned bhlen = end-start-bl.length();
+        bufferlist frag;
+        frag.substr_of( i->second, 0, bhlen );
+        bl.claim_append(frag);
+        break;               // done.
+      }
+    }
+
+    assert(pos >= end);
+    assert(bl.length() == (unsigned)(end-start));
+  }
+
+  /*
+   * true if the partial fragments cover [start, end) without a hole.
+   */
+  bool have_partial_range(off_t start, off_t end) {
+    map<off_t, bufferlist>::iterator i = partial.begin();
+
+    // skip first bits (fully to left).  the end() test comes first so an
+    // exhausted iterator is never dereferenced.
+    while (i != partial.end() &&
+           (i->first + i->second.length() < start))
+      i++;
+    if (i == partial.end()) return false;
+
+    // have start?
+    if (i->first > start) return false;
+    off_t pos = i->first + i->second.length();
+
+    // have continuous to end?  stop as soon as the range is covered: a later
+    // fragment that is not adjacent must not turn a covered range into a miss.
+    for (i++; pos < end && i != partial.end(); i++) {
+      assert(pos <= i->first);
+      if (pos < i->first) return false;   // hole
+      pos = i->first + i->second.length();
+    }
+
+    return pos >= end;
+  }
+
+  // true if the partial content covers the whole block (or the first 'size'
+  // bytes of it, if that is less).
+  bool partial_is_complete(off_t size) {
+    return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) );
+    //(off_t)(start()*EBOFS_BLOCK_SIZE),
+    //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) );
+  }
+
+  // overlay our partial content onto our data, then forget it.
+  void apply_partial() {
+    apply_partial(data, partial);
+    partial.clear();
+  }
+  // overlay every fragment of pm onto the one-block buffer bl at the
+  // fragment's byte offset; pm is emptied.
+  static void apply_partial(bufferlist& bl, map<off_t, bufferlist>& pm) {
+    assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE);
+    //assert(partial_is_complete());
+    //cout << "apply_partial" << endl;
+    for (map<off_t, bufferlist>::iterator i = pm.begin();
+         i != pm.end();
+         i++) {
+      int pos = i->first;
+      //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl;
+      bl.copy_in(pos, i->second.length(), i->second);
+    }
+    pm.clear();
+  }
+
+  /*
+   * add partial content p at byte offset off within the block.
+   * existing fragments are trimmed (or removed) wherever the new content
+   * overlaps them, so fragments never overlap each other.
+   */
+  void add_partial(off_t off, bufferlist& p) {
+    unsigned len = p.length();
+    assert(len <= (unsigned)EBOFS_BLOCK_SIZE);
+    //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE));
+    //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE));
+    assert(off >= 0);
+    assert(off + len <= EBOFS_BLOCK_SIZE);
+
+    // trim any existing that overlaps
+    for (map<off_t, bufferlist>::iterator i = partial.begin();
+         i != partial.end();
+         ) {
+      if (i->first + i->second.length() <= off) {  // before
+        i++;
+        continue;
+      }
+      if (i->first >= off+len) break;   // past affected area.
+
+      // overlap all?
+      if (off <= i->first && i->first + i->second.length() <= off+len) {
+        // erase it and move on.
+        off_t dead = i->first;
+        i++;
+        partial.erase(dead);
+        continue;
+      }
+      // overlap tail?  (existing starts before us and reaches into us)
+      else if (i->first < off && off < i->first + i->second.length()) {
+        off_t oldend = i->first + i->second.length();
+        unsigned newlen = off - i->first;
+        bufferlist o;
+        o.claim( i->second );
+        if (oldend > off+len) {
+          // the existing fragment also sticks out past our end: keep that
+          // remainder as its own fragment instead of dropping it.
+          off_t tailoff = off+len;
+          partial[tailoff].substr_of(o, tailoff - i->first, oldend - tailoff);
+        }
+        // shorten.
+        i->second.substr_of(o, 0, newlen);
+        i++;      // lands on the remainder (if any), which ends the scan
+        continue;
+      }
+      // overlap head?  (existing starts at or after us and sticks out past
+      // our end.  '<=' because a longer fragment at the very same offset is
+      // not caught by "overlap all" above.)
+      else if (off <= i->first && off+len < i->first + i->second.length()) {
+        // move.
+        off_t oldoff = i->first;
+        off_t newoff = off+len;
+        unsigned trim = newoff - oldoff;
+        partial[newoff].substr_of(i->second, trim, i->second.length()-trim);
+        i++;  // should be at newoff!
+        partial.erase( oldoff );
+        i++;
+        continue;
+      } else
+        assert(0);
+    }
+
+    // insert
+    partial[off] = p;
+  }
+
+
+};
+
+// print a bufferhead as
+//   bufferhead(<start>~<length> v<version>/<last_flushed> <state> <address>)
+// the state word is omitted if the state is not one of the known ones.
+inline ostream& operator<<(ostream& out, BufferHead& bh)
+{
+  out << "bufferhead(" << bh.start() << "~" << bh.length();
+  out << " v" << bh.get_version() << "/" << bh.get_last_flushed();
+
+  switch (bh.get_state()) {
+  case BufferHead::STATE_MISSING: out << " missing"; break;
+  case BufferHead::STATE_DIRTY:   out << " dirty";   break;
+  case BufferHead::STATE_CLEAN:   out << " clean";   break;
+  case BufferHead::STATE_RX:      out << " rx";      break;
+  case BufferHead::STATE_TX:      out << " tx";      break;
+  case BufferHead::STATE_PARTIAL: out << " partial"; break;
+  default: break;
+  }
+
+  //out << " " << bh.data.length();
+  out << " " << &bh;
+  out << ")";
+  return out;
+}
+
+
+/*
+ * ObjectCache - the cached bufferheads of a single object.
+ *
+ * bufferheads are kept in 'data', keyed by their first block within the
+ * object.  the cache is reference counted by hand via get()/put(); it must
+ * be empty and unreferenced when destroyed.
+ */
+class ObjectCache {
+ public:
+  object_t object_id;
+  Onode *on;
+  BufferCache *bc;
+
+ private:
+  map<block_t, BufferHead*> data;   // start block -> bufferhead
+  int ref;
+
+ public:
+  version_t write_count;
+
+
+ public:
+  ObjectCache(object_t o, Onode *_on, BufferCache *b) :
+    object_id(o), on(_on), bc(b), ref(0),
+    write_count(0) { }
+  ~ObjectCache() {
+    assert(data.empty());
+    assert(ref == 0);
+  }
+
+  // take / drop a reference; both return the new count.
+  int get() {
+    //cout << "oc.get " << object_id << " " << ref << endl;
+    return ++ref;
+  }
+  int put() {
+    assert(ref > 0);
+    //cout << "oc.put " << object_id << " " << ref << endl;
+    return --ref;
+  }
+
+  object_t get_object_id() { return object_id; }
+
+  // insert a bufferhead under its start block; the slot must be free.
+  void add_bh(BufferHead *bh) {
+    const block_t key = bh->start();
+    assert(data.count(key) == 0);
+
+    if (0) {  // sanity check FIXME DEBUG: no overlap with either neighbour
+      //cout << "add_bh " << bh->start() << "~" << bh->length() << endl;
+      map<block_t,BufferHead*>::iterator nb = data.lower_bound(key);
+      if (nb != data.end()) {
+        //cout << " after " << *nb->second << endl;
+        //cout << " after starts at " << nb->first << endl;
+        assert(nb->first >= bh->end());
+      }
+      if (nb != data.begin()) {
+        --nb;
+        //cout << " before starts at " << nb->second->start()
+        //<< " and ends at " << nb->second->end() << endl;
+        //cout << " before " << *nb->second << endl;
+        assert(nb->second->end() <= bh->start());
+      }
+    }
+
+    data[key] = bh;
+  }
+  // remove a bufferhead (by its start block); it must be present.
+  void remove_bh(BufferHead *bh) {
+    const block_t key = bh->start();
+    assert(data.count(key));
+    data.erase(key);
+  }
+  bool is_empty() { return data.empty(); }
+
+  int find_tx(block_t start, block_t len,
+              list<BufferHead*>& tx);
+
+  int map_read(block_t start, block_t len,
+               map<block_t, BufferHead*>& hits,     // hits
+               map<block_t, BufferHead*>& missing,  // read these from disk
+               map<block_t, BufferHead*>& rx,       // wait for these to finish reading from disk
+               map<block_t, BufferHead*>& partial); // (maybe) wait for these to read from disk
+
+  int map_write(block_t start, block_t len,
+                interval_set<block_t>& alloc,
+                map<block_t, BufferHead*>& hits,    // can write to these.
+                version_t super_epoch);
+
+  BufferHead *split(BufferHead *bh, block_t off);
+
+  /*int scan_versions(block_t start, block_t len,
+                      version_t& low, version_t& high);
+  */
+
+  void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl);
+  void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch);
+
+  void truncate(block_t blocks, version_t super_epoch);
+  //  void tear_down();
+
+  void clone_to(Onode *other);
+
+  // print every bufferhead, in block order, to cout.
+  void dump() {
+    map<block_t,BufferHead*>::iterator it = data.begin();
+    while (it != data.end()) {
+      cout << "dump: " << it->first << ": " << *it->second << endl;
+      ++it;
+    }
+  }
+
+};
+
+
+
+/*
+ * BufferCache
+ *
+ * Cache-wide bookkeeping for BufferHeads: two LRU lists (dirty heads vs.
+ * everything else), the set of dirty heads, per-state length totals, and
+ * per-epoch counters of unflushed items.  The wait helpers block on
+ * condition variables paired with ebofs_lock.
+ */
+class BufferCache {
+ public:
+  Mutex &ebofs_lock;          // hack: this is a ref to global ebofs_lock
+  BlockDevice &dev;
+
+  set<BufferHead*> dirty_bh;
+
+  LRU lru_dirty, lru_rest;
+
+ private:
+  Cond stat_cond;
+  Cond flush_cond;
+  int stat_waiter;            // number of callers currently inside waitfor_stat()
+
+  // running totals of bh->length(), one per BufferHead state
+  off_t stat_clean;
+  off_t stat_dirty;
+  off_t stat_rx;
+  off_t stat_tx;
+  off_t stat_partial;
+  off_t stat_missing;
+
+  // indices into epoch_unflushed[]
+#define EBOFS_BC_FLUSH_BHWRITE 0
+#define EBOFS_BC_FLUSH_PARTIAL 1
+
+  map<version_t, int> epoch_unflushed[2];
+
+  /* partial writes - incomplete blocks that can't be written until
+   * their prior content is read and overlayed with the new data.
+   *
+   * we put partial block management here because objects may be deleted
+   * before the read completes, but the write may have been committed in a
+   * prior epoch.
+   *
+   * we map: src block -> dest block -> PartialWrite
+   *
+   * really, at most there will only ever be two of these, for current+previous epochs.
+   */
+  class PartialWrite {
+  public:
+    map<off_t, bufferlist> partial;  // partial dirty content overlayed onto incoming data
+    version_t epoch;
+  };
+
+  map<block_t, map<block_t, PartialWrite> > partial_write;  // queued writes w/ partial content
+  map<block_t, set<BufferHead*> > shadow_partials;
+
+  // Map a BufferHead state to the counter that tracks it; 0 for any state
+  // that has no counter.
+  off_t *stat_slot(int state) {
+    switch (state) {
+    case BufferHead::STATE_MISSING: return &stat_missing;
+    case BufferHead::STATE_CLEAN:   return &stat_clean;
+    case BufferHead::STATE_DIRTY:   return &stat_dirty;
+    case BufferHead::STATE_TX:      return &stat_tx;
+    case BufferHead::STATE_RX:      return &stat_rx;
+    case BufferHead::STATE_PARTIAL: return &stat_partial;
+    }
+    return 0;
+  }
+
+ public:
+  BufferCache(BlockDevice& d, Mutex& el)
+    : ebofs_lock(el),
+      dev(d),
+      stat_waiter(0),
+      stat_clean(0),
+      stat_dirty(0),
+      stat_rx(0),
+      stat_tx(0),
+      stat_partial(0),
+      stat_missing(0)
+  {}
+
+
+  // Sum of every per-state total except "missing".
+  off_t get_size() {
+    return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial;
+  }
+  // Only clean data is reported as trimmable.
+  off_t get_trimmable() {
+    return stat_clean;
+  }
+
+
+  // bh's in cache
+
+  // Register bh with its ObjectCache, the matching LRU list and the stats.
+  void add_bh(BufferHead *bh) {
+    bh->get_oc()->add_bh(bh);
+    if (!bh->is_dirty()) {
+      lru_rest.lru_insert_mid(bh);
+    } else {
+      lru_dirty.lru_insert_mid(bh);
+      dirty_bh.insert(bh);
+    }
+    stat_add(bh);
+  }
+
+  // Touch bh in whichever LRU list currently holds it.
+  void touch(BufferHead *bh) {
+    LRU &owner = bh->is_dirty() ? lru_dirty : lru_rest;
+    owner.lru_touch(bh);
+  }
+
+  // Inverse of add_bh().
+  void remove_bh(BufferHead *bh) {
+    bh->get_oc()->remove_bh(bh);
+    stat_sub(bh);
+    if (!bh->is_dirty()) {
+      lru_rest.lru_remove(bh);
+      return;
+    }
+    lru_dirty.lru_remove(bh);
+    dirty_bh.erase(bh);
+  }
+
+  // stats
+
+  // Add bh's length to the total for its current state, then wake any
+  // waitfor_stat() callers.
+  void stat_add(BufferHead *bh) {
+    off_t *total = stat_slot(bh->get_state());
+    if (total)
+      *total += bh->length();
+    if (stat_waiter)
+      stat_cond.Signal();
+  }
+
+  // Subtract bh's length from the total for its current state.
+  void stat_sub(BufferHead *bh) {
+    off_t *total = stat_slot(bh->get_state());
+    if (total)
+      *total -= bh->length();
+  }
+
+  off_t get_stat_tx()      { return stat_tx; }
+  off_t get_stat_rx()      { return stat_rx; }
+  off_t get_stat_dirty()   { return stat_dirty; }
+  off_t get_stat_clean()   { return stat_clean; }
+  off_t get_stat_partial() { return stat_partial; }
+
+
+  // Per-epoch unflushed counters; `what` is one of the EBOFS_BC_FLUSH_* indices.
+  map<version_t, int> &get_unflushed(int what) {
+    return epoch_unflushed[what];
+  }
+
+  int get_unflushed(int what, version_t epoch) {
+    return epoch_unflushed[what][epoch];
+  }
+  void inc_unflushed(int what, version_t epoch) {
+    ++epoch_unflushed[what][epoch];
+  }
+  // Decrement, and signal flush_cond when the counter reaches zero.
+  void dec_unflushed(int what, version_t epoch) {
+    int &remaining = epoch_unflushed[what][epoch];
+    --remaining;
+    if (remaining == 0)
+      flush_cond.Signal();
+  }
+
+  // Block on stat_cond (paired with ebofs_lock) until stat_add() signals.
+  void waitfor_stat() {
+    ++stat_waiter;
+    stat_cond.Wait(ebofs_lock);
+    --stat_waiter;
+  }
+  // Block on flush_cond (paired with ebofs_lock).
+  void waitfor_flush() {
+    flush_cond.Wait(ebofs_lock);
+  }
+
+
+  // bh state
+
+  // Change bh's state to s, moving it between the dirty and non-dirty LRU
+  // lists when dirtiness changes, and keeping the stats in step.
+  void set_state(BufferHead *bh, int s) {
+    const bool was_dirty = (bh->get_state() == BufferHead::STATE_DIRTY);
+    const bool becomes_dirty = (s == BufferHead::STATE_DIRTY);
+
+    if (becomes_dirty != was_dirty) {
+      if (becomes_dirty) {
+        lru_rest.lru_remove(bh);
+        lru_dirty.lru_insert_top(bh);
+        dirty_bh.insert(bh);
+      } else {
+        lru_dirty.lru_remove(bh);
+        lru_rest.lru_insert_mid(bh);
+        dirty_bh.erase(bh);
+      }
+    }
+
+    // re-account under the new state
+    stat_sub(bh);
+    bh->set_state(s);
+    stat_add(bh);
+  }
+
+  // Give bh2 the same state as bh1.
+  void copy_state(BufferHead *bh1, BufferHead *bh2) {
+    set_state(bh2, bh1->get_state());
+  }
+
+  void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }
+  void mark_clean(BufferHead *bh)   { set_state(bh, BufferHead::STATE_CLEAN); }
+  void mark_rx(BufferHead *bh)      { set_state(bh, BufferHead::STATE_RX); }
+  void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }
+  void mark_tx(BufferHead *bh)      { set_state(bh, BufferHead::STATE_TX); }
+  // Dirtying also records the time it happened.
+  void mark_dirty(BufferHead *bh) {
+    set_state(bh, BufferHead::STATE_DIRTY);
+    bh->set_dirty_stamp(g_clock.now());
+  }
+
+
+  // io
+  void bh_read(Onode *on, BufferHead *bh, block_t from=0);
+  void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0);
+
+  bool bh_cancel_read(BufferHead *bh);
+  bool bh_cancel_write(BufferHead *bh, version_t cur_epoch);
+
+  void bh_queue_partial_write(Onode *on, BufferHead *bh);
+  void bh_cancel_partial_write(BufferHead *bh);
+
+  void queue_partial(block_t from, block_t to, map<off_t, bufferlist>& partial, version_t epoch);
+  void cancel_partial(block_t from, block_t to, version_t epoch);
+
+  void add_shadow_partial(block_t from, BufferHead *bh);
+  void cancel_shadow_partial(block_t from, BufferHead *bh);
+
+  void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl);
+  void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e);
+  void partial_tx_finish(version_t epoch);
+
+  friend class C_E_FlushPartial;
+
+  // bh fun
+  BufferHead *split(BufferHead *orig, block_t after);
+};
+
+
+// Block-device completion callback for a read: forwards the filled
+// bufferlist and the recorded block range to BufferCache::rx_finish()
+// via the ObjectCache's bc pointer.
+class C_OC_RxFinish : public BlockDevice::callback {
+  Mutex &lock;
+  ObjectCache *oc;
+  block_t start, length;
+  block_t diskstart;
+
+public:
+  bufferlist bl;   // public so the issuer of the read can reach the buffer
+
+  C_OC_RxFinish(Mutex &mutex, ObjectCache *cache, block_t first, block_t count, block_t disk_first)
+    : lock(mutex),
+      oc(cache),
+      start(first),
+      length(count),
+      diskstart(disk_first)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl);
+  }
+};
+
+// Block-device completion callback for a write: forwards the recorded block
+// range, version and epoch to BufferCache::tx_finish() via the ObjectCache's
+// bc pointer.
+class C_OC_TxFinish : public BlockDevice::callback {
+  Mutex &lock;
+  ObjectCache *oc;
+  block_t start, length;
+  version_t version;
+  version_t epoch;
+
+ public:
+  C_OC_TxFinish(Mutex &mutex, ObjectCache *cache, block_t first, block_t count, version_t ver, version_t ep)
+    : lock(mutex),
+      oc(cache),
+      start(first),
+      length(count),
+      version(ver),
+      epoch(ep)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    oc->bc->tx_finish(oc, ioh, start, length, version, epoch);
+  }
+};
+
+// Block-device completion callback for a partial-block write: reports the
+// epoch back to BufferCache::partial_tx_finish().
+class C_OC_PartialTxFinish : public BlockDevice::callback {
+  BufferCache *bc;
+  version_t epoch;
+
+public:
+  C_OC_PartialTxFinish(BufferCache *cache, version_t ep)
+    : bc(cache),
+      epoch(ep)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    bc->partial_tx_finish(epoch);
+  }
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_CNODE_H
+#define __EBOFS_CNODE_H
+
+#include "Onode.h"
+
+/*
+ * collection node
+ *
+ * holds attribute metadata for collections.
+ * collection membership is stored in b+tree tables, independent of the cnode.
+ */
+
+// In-memory collection node: the collection id, the on-disk location of
+// the cnode, and its attribute map.  A non-zero reference count pins the
+// object in its LRU; being dirty holds one reference of its own.
+class Cnode : public LRUObject
+{
+ private:
+  int ref;       // reference count; the LRU pin follows 0 <-> non-zero transitions
+  bool dirty;
+
+ public:
+  coll_t coll_id;
+  Extent cnode_loc;
+
+  map<string,bufferptr> attr;
+
+ public:
+  Cnode(coll_t cid)
+    : ref(0),
+      dirty(false),
+      coll_id(cid)
+  {
+    cnode_loc.length = 0;
+  }
+  ~Cnode() {}
+
+  block_t get_cnode_id() { return cnode_loc.start; }
+  int get_cnode_len() { return cnode_loc.length; }
+
+  // Take a reference; the first one pins the object in the LRU.
+  void get() {
+    const bool was_unreferenced = (ref == 0);
+    if (was_unreferenced)
+      lru_pin();
+    ++ref;
+  }
+  // Drop a reference; the last one unpins the object.
+  void put() {
+    if (--ref == 0)
+      lru_unpin();
+  }
+  int get_ref_count() { return ref; }
+
+  // Becoming dirty takes a reference ...
+  void mark_dirty() {
+    if (dirty)
+      return;
+    dirty = true;
+    get();
+  }
+  // ... and becoming clean gives it back.
+  void mark_clean() {
+    if (!dirty)
+      return;
+    dirty = false;
+    put();
+  }
+  bool is_dirty() { return dirty; }
+
+
+  // Bytes needed to serialize the attributes: for each one, the key plus a
+  // terminating NUL, then an int length, then the value bytes.
+  int get_attr_bytes() {
+    int total = 0;
+    map<string, bufferptr>::iterator a = attr.begin();
+    while (a != attr.end()) {
+      total += a->first.length() + 1;
+      total += a->second.length() + sizeof(int);
+      ++a;
+    }
+    return total;
+  }
+
+  //
+  //???void clear();
+
+
+};
+
+// Stream a short description of a Cnode: "cnode(<hex coll_id>[ dirty])".
+inline ostream& operator<<(ostream& out, Cnode& cn)
+{
+  out << "cnode(" << hex << cn.coll_id << dec;
+  const bool show_dirty = cn.is_dirty();
+  if (show_dirty) {
+    out << " dirty";
+  }
+  return out << ")";
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "Ebofs.h"
+
+#include <errno.h>
+#include <sys/vfs.h>
+
+// *******************
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")."
+#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")."
+
+/*
+ * nice_blocks - format a block count as a human-readable size string.
+ *
+ * Each block is counted as 4 KB (the b*4 below).  Sizes above 10 GB are
+ * printed in GB, above 10 MB in MB, otherwise in KB.
+ *
+ * Returns a pointer to a function-local static buffer: the text is
+ * overwritten by the next call, so copy it if it must outlive that, and do
+ * not call this concurrently from several threads.
+ */
+char *nice_blocks(block_t b)
+{
+  static char s[20];
+  float sz = b*4.0;   // size in KB
+
+  // snprintf bounds every write to the fixed-size buffer; output that would
+  // not fit is truncated instead of running past the end of s.
+  if (sz > (10 << 20))
+    snprintf(s, sizeof(s), "%.1f GB", sz / (1024.0*1024.0));
+  else if (sz > (10 << 10))
+    snprintf(s, sizeof(s), "%.1f MB", sz / (1024.0));
+  else
+    snprintf(s, sizeof(s), "%llu KB", b*4ULL);
+  return s;
+}
+
+/*
+ * mount - open the device, pick the newest valid superblock, load the node
+ * pool and tables it describes, and start the commit and finisher threads.
+ *
+ * Returns 0 on success, the (negative) result of dev.open() if the device
+ * cannot be opened, or -EINVAL if neither superblock copy carries
+ * EBOFS_MAGIC.  Asserts that the fs is not already mounted.
+ */
+int Ebofs::mount()
+{
+  ebofs_lock.Lock();
+  assert(!mounted);
+
+  int r = dev.open(&idle_kicker);
+  if (r < 0) {
+    ebofs_lock.Unlock();
+    return r;
+  }
+
+  dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+
+  // read both superblock copies (blocks 0 and 1)
+  bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  dev.read(0, 1, bp1);
+  dev.read(1, 1, bp2);
+
+  struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str();
+  struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str();
+  dout(3) << "mount super @0 epoch " << sb1->epoch << endl;
+  dout(3) << "mount super @1 epoch " << sb2->epoch << endl;
+
+  // Refuse to interpret blocks that were never written by prepare_super():
+  // without this check an unformatted device would be mounted from garbage.
+  bool valid1 = (sb1->s_magic == EBOFS_MAGIC);
+  bool valid2 = (sb2->s_magic == EBOFS_MAGIC);
+  if (!valid1 && !valid2) {
+    derr(0) << "mount no valid superblock (bad magic) on " << dev.get_device_name() << endl;
+    dev.close();
+    ebofs_lock.Unlock();
+    return -EINVAL;
+  }
+
+  // pick newest valid super (when both are valid and the epochs tie, @1 wins,
+  // as before)
+  struct ebofs_super *sb = sb2;
+  if (valid1 && (!valid2 || sb1->epoch > sb2->epoch))
+    sb = sb1;
+  super_epoch = sb->epoch;
+  dout(3) << "mount epoch " << super_epoch << endl;
+  assert(super_epoch == sb->epoch);
+
+  free_blocks = sb->free_blocks;
+  limbo_blocks = sb->limbo_blocks;
+
+  // init node pools
+  dout(3) << "mount nodepool" << endl;
+  nodepool.init( &sb->nodepool );
+  nodepool.read_usemap( dev, super_epoch );
+  nodepool.read_clean_nodes( dev );
+
+  // open tables
+  dout(3) << "mount opening tables" << endl;
+  object_tab = new Table<object_t, Extent>( nodepool, sb->object_tab );
+  for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+    free_tab[i] = new Table<block_t, block_t>( nodepool, sb->free_tab[i] );
+  limbo_tab = new Table<block_t, block_t>( nodepool, sb->limbo_tab );
+  alloc_tab = new Table<block_t, pair<block_t,int> >( nodepool, sb->alloc_tab );
+
+  collection_tab = new Table<coll_t, Extent>( nodepool, sb->collection_tab );
+  co_tab = new Table<coll_object_t, bool>( nodepool, sb->co_tab );
+
+  allocator.release_limbo();
+
+  // The new threads cannot observe a half-mounted fs: commit_thread_entry
+  // takes ebofs_lock first, and we hold it until `mounted` is set below.
+  dout(3) << "mount starting commit+finisher threads" << endl;
+  commit_thread.create();
+  finisher_thread.create();
+
+  dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+  mounted = true;
+
+  ebofs_lock.Unlock();
+  return 0;
+}
+
+
+// mkfs - lay out a fresh ebofs on the device.
+//
+// Creates the first node region starting at block 2, places the two usemaps
+// directly after it, builds empty tables, hands every block after the odd
+// usemap to the allocator as free space, and then writes the node pool plus
+// a superblock twice (epochs 0 and 1).  Tables and device are closed again
+// before returning.
+//
+// Returns 0 on success, or the negative result of dev.open().
+// Asserts that the fs is not mounted.
+int Ebofs::mkfs()
+{
+ ebofs_lock.Lock();
+ assert(!mounted);
+
+ int r = dev.open();
+ if (r < 0) {
+ ebofs_lock.Unlock();
+ return r;
+ }
+
+ block_t num_blocks = dev.get_num_blocks();
+
+ free_blocks = 0;
+ limbo_blocks = 0;
+
+ // create first noderegion
+ // (starts at block 2; write_super() uses blocks 0 and 1 for the supers)
+ Extent nr;
+ nr.start = 2;
+ nr.length = 20+ (num_blocks / 1000);
+ // NOTE(review): nr.length is at least 20 after the line above (absent
+ // arithmetic wrap), so this clamp looks unreachable — confirm.
+ if (nr.length < 10) nr.length = 10;
+ nodepool.add_region(nr);
+ dout(10) << "mkfs: first node region at " << nr << endl;
+
+ // allocate two usemaps
+ // (laid out back to back immediately after the node region)
+ block_t usemap_len = nodepool.get_usemap_len();
+ nodepool.usemap_even.start = nr.end();
+ nodepool.usemap_even.length = usemap_len;
+ nodepool.usemap_odd.start = nodepool.usemap_even.end();
+ nodepool.usemap_odd.length = usemap_len;
+ dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl;
+ dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl;
+
+ // init tables
+ // (one empty descriptor is reused as the starting state of every table)
+ struct ebofs_table empty;
+ empty.num_keys = 0;
+ empty.root = -1;
+ empty.depth = 0;
+
+ object_tab = new Table<object_t, Extent>( nodepool, empty );
+ collection_tab = new Table<coll_t, Extent>( nodepool, empty );
+
+ for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+ free_tab[i] = new Table<block_t,block_t>( nodepool, empty );
+ limbo_tab = new Table<block_t,block_t>( nodepool, empty );
+ alloc_tab = new Table<block_t,pair<block_t,int> >( nodepool, empty );
+
+ co_tab = new Table<coll_object_t, bool>( nodepool, empty );
+
+ // add free space
+ // NOTE(review): there is no check that the device is larger than
+ // left.start; on a too-small device this subtraction would not yield a
+ // sensible length — confirm callers guarantee a minimum size.
+ Extent left;
+ left.start = nodepool.usemap_odd.end();
+ left.length = num_blocks - left.start;
+ dout(10) << "mkfs: free data blocks at " << left << endl;
+ allocator._release_into_limbo( left );
+ if (g_conf.ebofs_cloneable) {
+ allocator.alloc_inc(nr);
+ allocator.alloc_inc(nodepool.usemap_even);
+ allocator.alloc_inc(nodepool.usemap_odd);
+ }
+ allocator.commit_limbo(); // -> limbo_tab
+ allocator.release_limbo(); // -> free_tab
+
+ // write nodes, super, 2x
+ // (once for epoch 0 and once for epoch 1, so both superblock slots are
+ // populated: write_super() selects the slot with epoch & 1)
+ dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl;
+
+ nodepool.commit_start( dev, 0 );
+ nodepool.commit_wait();
+ bufferptr superbp0;
+ prepare_super(0, superbp0);
+ write_super(0, superbp0);
+
+ nodepool.commit_start( dev, 1 );
+ nodepool.commit_wait();
+ bufferptr superbp1;
+ prepare_super(1, superbp1);
+ write_super(1, superbp1);
+
+ // free memory
+ dout(10) << "mkfs: cleaning up" << endl;
+ close_tables();
+
+ dev.close();
+
+ // NOTE(review): dev.get_num_blocks() is queried here after dev.close();
+ // confirm the device still reports its size once closed.
+ dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl;
+ ebofs_lock.Unlock();
+ return 0;
+}
+
+// Delete every table object created by mount()/mkfs(), then close the node
+// pool they were built on.
+void Ebofs::close_tables()
+{
+  delete object_tab;
+
+  int bucket = 0;
+  while (bucket < EBOFS_NUM_FREE_BUCKETS) {
+    delete free_tab[bucket];
+    ++bucket;
+  }
+
+  delete limbo_tab;
+  delete alloc_tab;
+  delete collection_tab;
+  delete co_tab;
+
+  // the tables are gone; now release the pool itself
+  nodepool.close();
+}
+
+// umount - shut the filesystem down.
+//
+// Flags the fs readonly + unmounting, kicks the commit thread and joins it,
+// stops and joins the finisher thread, trims the buffer cache and inode
+// caches to zero, reports any onodes still left in onode_map, then closes
+// the tables and the device and clears the mounted/readonly/unmounting
+// flags.  Always returns 0.
+int Ebofs::umount()
+{
+ ebofs_lock.Lock();
+
+ // mark unmounting
+ dout(1) << "umount start" << endl;
+ readonly = true;
+ unmounting = true;
+
+ // kick commit thread
+ // (ebofs_lock is dropped around the join: commit_thread_entry runs with
+ // ebofs_lock held and needs it to make its final pass and exit)
+ dout(5) << "umount stopping commit thread" << endl;
+ commit_cond.Signal();
+ ebofs_lock.Unlock();
+ commit_thread.join();
+ ebofs_lock.Lock();
+
+ // kick finisher thread
+ dout(5) << "umount stopping finisher thread" << endl;
+ finisher_lock.Lock();
+ finisher_stop = true;
+ finisher_cond.Signal();
+ finisher_lock.Unlock();
+
+ // NOTE(review): this join happens with ebofs_lock held; if a queued
+ // Context ever takes ebofs_lock this would block — confirm none do.
+ finisher_thread.join();
+
+ // drop everything from the caches
+ trim_bc(0);
+ trim_inodes(0);
+
+ // anything still in onode_map at this point was not trimmed; log it
+ for (hash_map<object_t,Onode*>::iterator i = onode_map.begin();
+ i != onode_map.end();
+ i++) {
+ dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << endl;
+ }
+
+ // free memory
+ dout(5) << "umount cleaning up" << endl;
+ close_tables();
+ dev.close();
+ readonly = unmounting = mounted = false;
+
+ dout(1) << "umount done on " << dev.get_device_name() << endl;
+ ebofs_lock.Unlock();
+ return 0;
+}
+
+
+
+/*
+ * prepare_super - snapshot the current filesystem state into a superblock.
+ *
+ * Fills an ebofs_super with the magic, the given epoch, the device size,
+ * the free/limbo block counts, the (num_keys, root, depth) of every table,
+ * and the node pool layout, then copies it into a freshly allocated
+ * page-aligned, block-sized buffer returned through bp.  Nothing is
+ * written to disk here; see write_super().
+ *
+ * epoch: the epoch number to record in the superblock.
+ * bp:    out-parameter; replaced with the new buffer.
+ */
+void Ebofs::prepare_super(version_t epoch, bufferptr& bp)
+{
+  struct ebofs_super sb;
+
+  dout(10) << "prepare_super v" << epoch << endl;
+
+  // fill in super
+  memset(&sb, 0, sizeof(sb));
+  sb.s_magic = EBOFS_MAGIC;
+  sb.epoch = epoch;
+  sb.num_blocks = dev.get_num_blocks();
+
+  sb.free_blocks = free_blocks;
+  sb.limbo_blocks = limbo_blocks;
+
+
+  // tables
+  sb.object_tab.num_keys = object_tab->get_num_keys();
+  sb.object_tab.root = object_tab->get_root();
+  sb.object_tab.depth = object_tab->get_depth();
+
+  for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++) {
+    sb.free_tab[i].num_keys = free_tab[i]->get_num_keys();
+    sb.free_tab[i].root = free_tab[i]->get_root();
+    sb.free_tab[i].depth = free_tab[i]->get_depth();
+  }
+  sb.limbo_tab.num_keys = limbo_tab->get_num_keys();
+  sb.limbo_tab.root = limbo_tab->get_root();
+  sb.limbo_tab.depth = limbo_tab->get_depth();
+
+  sb.alloc_tab.num_keys = alloc_tab->get_num_keys();
+  sb.alloc_tab.root = alloc_tab->get_root();
+  sb.alloc_tab.depth = alloc_tab->get_depth();
+
+  sb.collection_tab.num_keys = collection_tab->get_num_keys();
+  sb.collection_tab.root = collection_tab->get_root();
+  sb.collection_tab.depth = collection_tab->get_depth();
+
+  sb.co_tab.num_keys = co_tab->get_num_keys();
+  sb.co_tab.root = co_tab->get_root();
+  sb.co_tab.depth = co_tab->get_depth();
+
+  // pools
+  sb.nodepool.num_regions = nodepool.region_loc.size();
+  for (unsigned i=0; i<nodepool.region_loc.size(); i++) {
+    sb.nodepool.region_loc[i] = nodepool.region_loc[i];
+  }
+  sb.nodepool.node_usemap_even = nodepool.usemap_even;
+  sb.nodepool.node_usemap_odd = nodepool.usemap_odd;
+
+  // put in a buffer
+  // The struct must fit in one block, or the memcpy below would overrun.
+  assert(sizeof(sb) <= (size_t)EBOFS_BLOCK_SIZE);
+  bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+  // Zero the whole block first: only sizeof(sb) bytes are copied in, and the
+  // remainder of the buffer would otherwise go to disk with whatever the
+  // allocation happened to contain.
+  memset(bp.c_str(), 0, EBOFS_BLOCK_SIZE);
+  memcpy(bp.c_str(), (const char*)&sb, sizeof(sb));
+}
+
+// Write a prepared superblock buffer to one of the two superblock slots:
+// the low bit of the epoch selects block 0 or block 1.
+void Ebofs::write_super(version_t epoch, bufferptr& bp)
+{
+  block_t slot = epoch & 1;
+  dout(10) << "write_super v" << epoch << " to b" << slot << endl;
+  dev.write(slot, 1, bp, "write_super");
+}
+
+// commit_thread_entry - body of the commit thread.
+//
+// Takes ebofs_lock on entry and loops while `mounted` is set.  Each pass:
+//  1. sleeps on commit_cond until kicked, until the commit interval
+//     expires, or (optionally) until the block device is seen idle;
+//  2. if the fs is dirty or has limbo blocks, bumps super_epoch and flushes
+//     inodes, allocator limbo state and btree nodes, waits for all of it,
+//     writes the superblock, releases limbo space, grows node space if it
+//     is running low, and hands the commit waiters to the finisher thread;
+//  3. trims the buffer cache and inode caches.
+// An unmount request forces one last commit pass and ends the loop.
+// Always returns 0.
+int Ebofs::commit_thread_entry()
+{
+ ebofs_lock.Lock();
+ dout(10) << "commit_thread start" << endl;
+
+ assert(!commit_thread_started); // there can be only one
+ commit_thread_started = true;
+ sync_cond.Signal();
+
+ while (mounted) {
+
+ // wait for kick, or timeout
+ if (g_conf.ebofs_commit_ms) {
+ if (g_conf.ebofs_idle_commit_ms > 0) {
+ // periodically check for idle block device
+ dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, "
+ << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl;
+ long left = g_conf.ebofs_commit_ms;
+ while (left > 0) {
+ // sleep in slices of at most ebofs_idle_commit_ms
+ // NOTE(review): assumes utime_t's second argument is microseconds,
+ // so next*1000 is `next` milliseconds — confirm.
+ long next = MIN(left, g_conf.ebofs_idle_commit_ms);
+ if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT)
+ break; // we got kicked
+ if (dev.is_idle()) {
+ dout(20) << "commit_thread bdev is idle, early commit" << endl;
+ break; // dev is idle
+ }
+ left -= next;
+ dout(20) << "commit_thread " << left << " ms left" << endl;
+
+ // hack hack
+ //if (!left) g_conf.debug_ebofs = 10;
+ // /hack hack
+ }
+ } else {
+ // normal wait+timeout
+ dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl;
+ commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
+ }
+
+ } else {
+ // DEBUG.. wait until kicked
+ dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl;
+ commit_cond.Wait(ebofs_lock);
+ }
+
+ // umount() sets `unmounting`; force a commit below and clear `mounted`
+ // so that this is the last iteration of the loop.
+ if (unmounting) {
+ dout(10) << "commit_thread unmounting: final commit pass" << endl;
+ assert(readonly);
+ unmounting = false;
+ mounted = false;
+ dirty = true;
+ }
+
+ if (!dirty && !limbo_blocks) {
+ dout(10) << "commit_thread not dirty" << endl;
+ }
+ else {
+ // start a new epoch; whatever is dirtied from here on belongs to it
+ super_epoch++;
+ dirty = false;
+
+ dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl;
+ dout(2) << "commit_thread data: "
+ << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, "
+ << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks()
+ << "%) free in " << get_free_extents()
+ << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks()
+ << "%) limbo in " << get_limbo_extents()
+ << endl;
+ dout(2) << "commit_thread nodes: "
+ << 100*nodepool.num_used()/nodepool.num_total() << "% used, "
+ << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, "
+ << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, "
+ << nodepool.num_total() << " total." << endl;
+ dout(2) << "commit_thread bc: "
+ << "size " << bc.get_size()
+ << ", trimmable " << bc.get_trimmable()
+ << ", max " << g_conf.ebofs_bc_size
+ << "; dirty " << bc.get_stat_dirty()
+ << ", tx " << bc.get_stat_tx()
+ << ", max dirty " << g_conf.ebofs_bc_max_dirty
+ << endl;
+
+
+ // (async) write onodes+cnodes (do this first; it currently involves inode reallocation)
+ commit_inodes_start();
+
+ allocator.commit_limbo(); // limbo -> limbo_tab
+
+ // (async) write btree nodes
+ nodepool.commit_start( dev, super_epoch );
+
+ // blockdev barrier (prioritize our writes!)
+ dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << endl;
+ dev.barrier();
+
+ // prepare super (before any changes get made!)
+ // (the waits below drop ebofs_lock, so the snapshot has to be taken now)
+ bufferptr superbp;
+ prepare_super(super_epoch, superbp);
+
+ // wait for it all to flush (drops global lock)
+ commit_bc_wait(super_epoch-1);
+ dout(30) << "commit_thread bc flushed" << endl;
+ commit_inodes_wait();
+ dout(30) << "commit_thread inodes flushed" << endl;
+ nodepool.commit_wait();
+ dout(30) << "commit_thread btree nodes flushed" << endl;
+
+ // ok, now (synchronously) write the prior super!
+ // NOTE(review): the super written here is labelled with super_epoch
+ // (already incremented above) while the waits and waiters use
+ // super_epoch-1; "prior" presumably refers to the state being committed
+ // — confirm.
+ dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl;
+ ebofs_lock.Unlock();
+ write_super(super_epoch, superbp);
+ ebofs_lock.Lock();
+
+ dout(10) << "commit_thread wrote super" << endl;
+
+ // free limbo space now
+ // (since we're done allocating things,
+ // AND we've flushed all previous epoch data)
+ allocator.release_limbo(); // limbo_tab -> free_tabs
+
+ // do we need more node space?
+ // (grow when less than a third of all nodes is free)
+ if (nodepool.num_free() < nodepool.num_total() / 3) {
+ dout(2) << "commit_thread running low on node space, allocating more." << endl;
+ alloc_more_node_space();
+ }
+
+ // kick waiters
+ dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl;
+
+ // move the waiters registered under epoch super_epoch-1 onto the
+ // finisher queue; the finisher thread runs them
+ finisher_lock.Lock();
+ finisher_queue.splice(finisher_queue.end(), commit_waiters[super_epoch-1]);
+ commit_waiters.erase(super_epoch-1);
+ finisher_cond.Signal();
+ finisher_lock.Unlock();
+
+ sync_cond.Signal();
+
+ dout(10) << "commit_thread commit finish" << endl;
+ }
+
+ // trim bc?
+ trim_bc();
+ trim_inodes();
+
+ }
+
+ dout(10) << "commit_thread finish" << endl;
+ commit_thread_started = false;
+ ebofs_lock.Unlock();
+ return 0;
+}
+
+
+// Try to grow the node pool by one more region.
+//
+// Asks the allocator for as many blocks as the pool currently has nodes,
+// plus two new usemaps sized for the enlarged pool.  If both usemaps came
+// back at full length, the old usemaps are released and the new region and
+// usemaps are installed; otherwise everything just allocated is released
+// again.  Asserts if the pool already has EBOFS_MAX_NODE_REGIONS regions.
+void Ebofs::alloc_more_node_space()
+{
+  dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl;
+
+  // no room for another region descriptor
+  if (!(nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS)) {
+    dout(1) << "alloc_more_node_space already have max node regions!" << endl;
+    assert(0);
+    return;
+  }
+
+  // new node region
+  int wanted = nodepool.num_total();
+  Extent region;
+  allocator.allocate(region, wanted, 2);
+  dout(1) << "alloc_more_node_space wants " << wanted << " more, got " << region << endl;
+
+  // replacement usemaps, sized for old + new nodes
+  Extent even, odd;
+  unsigned map_len = nodepool.get_usemap_len(nodepool.num_total() + region.length);
+  allocator.allocate(even, map_len, 2);
+  allocator.allocate(odd, map_len, 2);
+  dout(1) << "alloc_more_node_space maps need " << map_len << " x2, got " << even << " " << odd << endl;
+
+  bool got_both_maps = (even.length == map_len && odd.length == map_len);
+  if (!got_both_maps) {
+    // give it all back
+    dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl;
+    allocator.release(region);
+    allocator.release(even);
+    allocator.release(odd);
+    //assert(0);
+    return;
+  }
+
+  // swap in the new layout
+  dout(1) << "alloc_more_node_space got " << region << ", new usemaps at even " << even << " odd " << odd << endl;
+  allocator.release(nodepool.usemap_even);
+  allocator.release(nodepool.usemap_odd);
+  nodepool.add_region(region);
+  nodepool.usemap_even = even;
+  nodepool.usemap_odd = odd;
+}
+
+
+/*
+ * finisher_thread_entry - body of the finisher thread.
+ *
+ * Holds finisher_lock except while running callbacks.  Repeatedly swaps the
+ * shared finisher_queue into a local list, runs finish_contexts() on it with
+ * the lock dropped, and then sleeps on finisher_cond until signalled.  The
+ * loop ends once finisher_stop is observed.  Always returns 0.
+ *
+ * NOTE(review): if finisher_stop is set while the thread is asleep, the
+ * outer loop exits without another drain pass, so contexts queued at that
+ * moment are not run — confirm that is acceptable at unmount.
+ */
+void *Ebofs::finisher_thread_entry()
+{
+  finisher_lock.Lock();
+  dout(10) << "finisher_thread start" << endl;
+
+  while (!finisher_stop) {
+    while (!finisher_queue.empty()) {
+      // grab the whole batch so callbacks run without finisher_lock held
+      list<Context*> ls;
+      ls.swap(finisher_queue);
+
+      finisher_lock.Unlock();
+
+      //ebofs_lock.Lock(); // um.. why lock this? -sage
+      finish_contexts(ls, 0);
+      //ebofs_lock.Unlock();
+
+      finisher_lock.Lock();
+    }
+    if (finisher_stop) break;
+
+    dout(30) << "finisher_thread sleeping" << endl;
+    finisher_cond.Wait(finisher_lock);
+  }
+
+  // was logged as "start" a second time, making thread exit
+  // indistinguishable from thread startup in the debug output
+  dout(10) << "finisher_thread finish" << endl;
+  finisher_lock.Unlock();
+  return 0;
+}
+
+
+// *** onodes ***
+
+// Create a brand-new onode for oid, register it in the onode cache and in
+// object_tab, mark it dirty, and return it with one reference held for the
+// caller.  Asserts that oid exists neither in the cache nor in the table.
+Onode* Ebofs::new_onode(object_t oid)
+{
+  Onode* fresh = new Onode(oid);
+
+  // cache registration
+  assert(onode_map.count(oid) == 0);
+  onode_map[oid] = fresh;
+  onode_lru.lru_insert_top(fresh);
+
+  // table registration (the location is a placeholder; nothing is on disk yet)
+  assert(object_tab->lookup(oid) < 0);
+  object_tab->insert( oid, fresh->onode_loc );   // even tho i'm not placed yet
+
+  // caller's reference
+  fresh->get();
+
+  // zero length == "no blocks allocated for this onode yet"
+  fresh->onode_loc.start = 0;
+  fresh->onode_loc.length = 0;
+
+  dirty_onode(fresh);
+
+  dout(7) << "new_onode " << *fresh << endl;
+  return fresh;
+}
+
+
+// get_onode - look up the onode for oid, loading it from disk if needed.
+//
+// Returns the onode with a reference taken via on->get(), or 0 when
+// object_tab has no entry for oid.  A cache hit in onode_map returns at
+// once.  Otherwise the onode block(s) are read from the location found in
+// object_tab (ebofs_lock is dropped for the duration of the read) and
+// parsed into a new Onode.  While one caller is loading, other callers for
+// the same oid park on a Cond registered in waitfor_onode and retry the
+// whole lookup once signalled.
+//
+// NOTE(review): the counts and lengths parsed from the block are used
+// as-is, with no check against the size of the buffer that was read.
+Onode* Ebofs::get_onode(object_t oid)
+{
+ while (1) {
+ // in cache?
+ if (onode_map.count(oid)) {
+ // yay
+ Onode *on = onode_map[oid];
+ on->get();
+ //cout << "get_onode " << *on << endl;
+ return on;
+ }
+
+ // on disk?
+ Extent onode_loc;
+ if (object_tab->lookup(oid, onode_loc) < 0) {
+ dout(10) << "onode lookup failed on " << oid << endl;
+ // object dne.
+ return 0;
+ }
+
+ // already loading?
+ if (waitfor_onode.count(oid)) {
+ // yep, just wait.
+ // (the Cond lives on this stack frame; the loader signals it and then
+ // erases the whole waiter list)
+ Cond c;
+ waitfor_onode[oid].push_back(&c);
+ dout(10) << "get_onode " << oid << " already loading, waiting" << endl;
+ c.Wait(ebofs_lock);
+ continue;
+ }
+
+ dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl;
+
+ // creating the (empty) entry is what marks this oid as "loading"
+ assert(waitfor_onode.count(oid) == 0);
+ waitfor_onode[oid].clear(); // this should be empty initially.
+
+ // read it!
+ bufferlist bl;
+ bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) );
+
+ // drop the global lock for the duration of the device read
+ ebofs_lock.Unlock();
+ dev.read( onode_loc.start, onode_loc.length, bl );
+ ebofs_lock.Lock();
+
+ // add onode
+ Onode *on = new Onode(oid);
+ onode_map[oid] = on;
+ onode_lru.lru_insert_top(on);
+
+ // parse data block
+ struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str();
+ if (eo->object_id != oid) {
+ // dump what was found before the assert below fires
+ cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl;
+ cerr << " onode_loc is " << eo->onode_loc << endl;
+ cerr << " object_size " << eo->object_size << endl;
+ cerr << " object_blocks " << eo->object_blocks << endl;
+ cerr << " " << eo->num_collections << " coll + "
+ << eo->num_attr << " attr + "
+ << eo->num_extents << " extents" << endl;
+ assert(eo->object_id == oid);
+ }
+ on->readonly = eo->readonly;
+ on->onode_loc = eo->onode_loc;
+ on->object_size = eo->object_size;
+ on->object_blocks = eo->object_blocks;
+
+ // parse
+ // (p walks the variable-length data that follows the fixed header, in
+ // the same order encode_onode() writes it)
+ char *p = bl.c_str() + sizeof(*eo);
+
+ // parse collection list
+ for (int i=0; i<eo->num_collections; i++) {
+ coll_t c = *((coll_t*)p);
+ p += sizeof(c);
+ on->collections.insert(c);
+ }
+
+ // parse attributes
+ // (each one: NUL-terminated key, int length, then that many value bytes)
+ for (int i=0; i<eo->num_attr; i++) {
+ string key = p;
+ p += key.length() + 1;
+ int len = *(int*)(p);
+ p += sizeof(len);
+ on->attr[key] = buffer::copy(p, len);
+ p += len;
+ dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl;
+ }
+
+ // parse extents
+ // (extent_map is keyed by the running sum of the preceding extent lengths)
+ on->extent_map.clear();
+ block_t n = 0;
+ for (int i=0; i<eo->num_extents; i++) {
+ Extent ex = *((Extent*)p);
+ on->extent_map[n] = ex;
+ dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl;
+ n += ex.length;
+ p += sizeof(Extent);
+ }
+ assert(n == on->object_blocks);
+
+ // wake up other waiters
+ for (list<Cond*>::iterator i = waitfor_onode[oid].begin();
+ i != waitfor_onode[oid].end();
+ i++)
+ (*i)->Signal();
+ waitfor_onode.erase(oid); // remove Cond list
+
+ on->get();
+ //cout << "get_onode " << *on << " (loaded)" << endl;
+ return on;
+ }
+}
+
+
+// Block-device completion callback for an inode write: notifies the owning
+// Ebofs through flush_inode_finish().
+class C_E_InodeFlush : public BlockDevice::callback {
+  Ebofs *ebofs;
+
+public:
+  C_E_InodeFlush(Ebofs *fs)
+    : ebofs(fs)
+  {}
+
+  void finish(ioh_t ioh, int r) {
+    ebofs->flush_inode_finish();
+  }
+};
+
+
+// Serialize an onode into bl, starting at byte offset off.
+//
+// Written in order with bl.copy_in(): the fixed ebofs_onode header, every
+// collection id, every attribute as (NUL-terminated key, int length, value
+// bytes), and every Extent in extent_map order.  off is advanced past each
+// piece, so on return it is the offset just after the encoded onode.
+void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off)
+{
+  // fixed-size header
+  struct ebofs_onode hdr;
+  hdr.readonly = on->readonly;
+  hdr.onode_loc = on->onode_loc;
+  hdr.object_id = on->object_id;
+  hdr.object_size = on->object_size;
+  hdr.object_blocks = on->object_blocks;
+  hdr.num_collections = on->collections.size();
+  hdr.num_attr = on->attr.size();
+  hdr.num_extents = on->extent_map.size();
+  bl.copy_in(off, sizeof(hdr), (char*)&hdr);
+  off += sizeof(hdr);
+
+  // collection ids
+  set<coll_t>::iterator c = on->collections.begin();
+  while (c != on->collections.end()) {
+    bl.copy_in(off, sizeof(*c), (char*)&(*c));
+    off += sizeof(*c);
+    ++c;
+  }
+
+  // attributes
+  map<string, bufferptr>::iterator a = on->attr.begin();
+  while (a != on->attr.end()) {
+    const string &key = a->first;
+    bufferptr &value = a->second;
+
+    // key, including its terminating NUL
+    bl.copy_in(off, key.length()+1, key.c_str());
+    off += key.length()+1;
+
+    // value length, then the value itself
+    int l = value.length();
+    bl.copy_in(off, sizeof(int), (char*)&l);
+    off += sizeof(int);
+    bl.copy_in(off, l, value.c_str());
+    off += l;
+
+    dout(15) << "write_onode " << *on << " attr " << key << " len " << l << endl;
+    ++a;
+  }
+
+  // extents
+  map<block_t,Extent>::iterator e = on->extent_map.begin();
+  while (e != on->extent_map.end()) {
+    bl.copy_in(off, sizeof(Extent), (char*)&(e->second));
+    off += sizeof(Extent);
+    dout(15) << "write_onode " << *on << " ex " << e->first << ": " << e->second << endl;
+    ++e;
+  }
+}
+
+// Encode an onode and queue it for writing.
+//
+// The onode is relocated on every write: its old blocks (if any) are
+// released, new ones are allocated using the start of its first data extent
+// as the third argument to allocate() (0 when it has no extents), and
+// object_tab is updated to the new location.  Completion is reported
+// through a C_E_InodeFlush callback.
+void Ebofs::write_onode(Onode *on)
+{
+  // size the buffer in whole blocks
+  unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes();
+  unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
+
+  bufferlist bl;
+  bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
+
+  // (always) relocate onode: give back the old location ...
+  if (on->onode_loc.length)
+    allocator.release(on->onode_loc);
+
+  // ... and allocate a new one
+  block_t first = on->extent_map.size() ? on->extent_map.begin()->second.start : 0;
+  allocator.allocate(on->onode_loc, blocks, first);
+
+  // re-point the object table at the new location
+  object_tab->remove( on->object_id );
+  object_tab->insert( on->object_id, on->onode_loc );
+  //object_tab->verify();
+
+  dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl;
+
+  // serialize; the encoder must produce exactly the size computed above
+  unsigned off = 0;
+  encode_onode(on, bl, off);
+  assert(off == bytes);
+
+  // write
+  dev.write( on->onode_loc.start, on->onode_loc.length, bl,
+             new C_E_InodeFlush(this), "write_onode" );
+}
+
+// remove_onode - delete an object and everything that hangs off its onode.
+//
+// Tears down the object's buffer cache (if open), unlinks the onode from
+// onode_map and the LRU, marks it deleted + dangling, removes it from
+// object_tab, releases its onode block and all data extents to the
+// allocator, removes its collection memberships from co_tab, clears its
+// dirty state, and finally drops one reference with put_onode().  Marks the
+// filesystem dirty.
+//
+// The caller must hold at least one reference (asserted).
+// NOTE(review): the put_onode() at the end appears to consume the caller's
+// reference — confirm callers do not put the onode again afterwards.
+void Ebofs::remove_onode(Onode *on)
+{
+ dout(8) << "remove_onode " << *on << endl;
+
+ assert(on->get_ref_count() >= 1); // caller
+
+ // tear down buffer cache
+ if (on->oc) {
+ on->oc->truncate(0, super_epoch); // this will kick readers along the way.
+ on->close_oc();
+ }
+
+ // remove from onode map, mark dangling/deleted
+ // (dangling is what lets put_onode() delete the object at refcount 0)
+ onode_map.erase(on->object_id);
+ onode_lru.lru_remove(on);
+ on->deleted = true;
+ on->dangling = true;
+
+ // remove from object table
+ //dout(0) << "remove_onode on " << *on << endl;
+ object_tab->remove(on->object_id);
+
+ // free onode space
+ // (zero length means the onode was never placed on disk)
+ if (on->onode_loc.length)
+ allocator.release(on->onode_loc);
+
+ // free data space
+ for (map<block_t,Extent>::iterator i = on->extent_map.begin();
+ i != on->extent_map.end();
+ i++)
+ allocator.release(i->second);
+ on->extent_map.clear();
+
+ // remove from collections
+ for (set<coll_t>::iterator i = on->collections.begin();
+ i != on->collections.end();
+ i++) {
+ co_tab->remove(coll_object_t(*i,on->object_id));
+ }
+ on->collections.clear();
+
+ // dirty -> clean?
+ if (on->is_dirty()) {
+ on->mark_clean(); // this unpins *on
+ dirty_onodes.erase(on);
+ }
+
+ // more than the caller's own reference left: the object outlives this call
+ if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl;
+ // `on` may be deleted by this call; it must not be touched afterwards
+ put_onode(on);
+
+ dirty = true;
+}
+
+// Drop one reference on an onode.  If that was the last reference and the
+// onode is dangling (no longer reachable through the cache), it is deleted.
+void Ebofs::put_onode(Onode *on)
+{
+  on->put();
+
+  if (on->get_ref_count() != 0)
+    return;               // still in use
+  if (!on->dangling)
+    return;               // still owned by the cache
+
+  delete on;
+}
+
+// Flag the filesystem as dirty and make sure the onode is marked dirty and
+// tracked in dirty_onodes.
+void Ebofs::dirty_onode(Onode *on)
+{
+  dirty = true;
+
+  if (on->is_dirty())
+    return;               // already tracked
+
+  on->mark_dirty();
+  dirty_onodes.insert(on);
+}
+
+// Shrink the onode and cnode caches.
+//
+// max >= 0 overrides both LRU limits with that value; a negative max uses
+// each LRU's own configured maximum.  Entries are expired from each LRU
+// until it is within its limit or nothing more can be expired.
+void Ebofs::trim_inodes(int max)
+{
+  unsigned omax = onode_lru.lru_get_max();
+  unsigned cmax = cnode_lru.lru_get_max();
+  if (max >= 0) {
+    omax = cmax = max;
+  }
+  dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, "
+           << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl;
+
+  // onodes
+  for (;;) {
+    if (!(onode_lru.lru_get_size() > omax))
+      break;              // within limit
+
+    Onode *victim = (Onode*)onode_lru.lru_expire();
+    if (!victim)
+      break;              // nothing to expire
+
+    dout(20) << "trim_inodes removing onode " << *victim << endl;
+    onode_map.erase(victim->object_id);
+    victim->dangling = true;
+
+    if (victim->get_ref_count() != 0) {
+      // an expired onode is not expected to be referenced
+      dout(-20) << "trim_inodes still active: " << *victim << endl;
+      assert(0);          // huh?
+    } else {
+      assert(victim->oc == 0);   // an open oc pins the onode!
+      delete victim;
+    }
+  }
+
+
+  // cnodes
+  for (;;) {
+    if (!(cnode_lru.lru_get_size() > cmax))
+      break;              // within limit
+
+    Cnode *victim = (Cnode*)cnode_lru.lru_expire();
+    if (!victim)
+      break;              // nothing to expire
+
+    dout(20) << "trim_inodes removing cnode " << *victim << endl;
+    cnode_map.erase(victim->coll_id);
+
+    delete victim;
+  }
+
+  dout(10) << "trim_inodes finish "
+           << onode_lru.lru_get_size() << " / " << omax << " onodes, "
+           << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl;
+}
+
+
+
+// *** cnodes ****
+
+ // Create an empty cnode for collection cid, which must not exist yet either
+ // in the in-memory cache or in collection_tab.
+ //
+ // The new cnode is registered in cnode_map and the cnode LRU, entered into
+ // collection_tab with a zero-length location (it has no disk blocks until
+ // write_cnode places it), marked dirty, and returned with one reference
+ // taken via get().
+ Cnode* Ebofs::new_cnode(coll_t cid)
+ {
+   Cnode* cn = new Cnode(cid);
+
+   // Not placed on disk yet.  Zero the location *before* it is copied into
+   // collection_tab below, so the table entry never depends on whatever the
+   // Cnode constructor left in cnode_loc.
+   cn->cnode_loc.start = 0;
+   cn->cnode_loc.length = 0;
+
+   assert(cnode_map.count(cid) == 0);
+   cnode_map[cid] = cn;
+   cnode_lru.lru_insert_top(cn);
+
+   assert(collection_tab->lookup(cid) < 0);
+   collection_tab->insert( cid, cn->cnode_loc );  // even tho i'm not placed yet
+
+   cn->get();
+
+   dirty_cnode(cn);
+
+   return cn;
+ }
+
+ Cnode* Ebofs::get_cnode(coll_t cid)
+ { /*
+ * Look up the cnode for collection cid, loading it from disk if needed.
+ *
+ * Returns the cnode with one extra reference taken via cn->get(), or 0 when
+ * collection_tab has no entry for cid.
+ *
+ * Each pass of the loop:
+ *  1. returns the cached cnode if cnode_map has it;
+ *  2. otherwise asks collection_tab where the cnode lives on disk;
+ *  3. if waitfor_cnode already has an entry for cid, queues a local Cond
+ *     on that list, waits on it, and starts over;
+ *  4. otherwise creates the waitfor_cnode entry, reads the cnode blocks from
+ *     dev, builds the Cnode from the raw block data, signals every queued
+ *     Cond and erases the waiter list.
+ *
+ * ebofs_lock is released around dev.read() and re-taken afterwards, so it is
+ * presumably held by the caller on entry -- confirm against callers.
+ */
+ while (1) {
+ // in cache?
+ if (cnode_map.count(cid)) {
+ // yay
+ Cnode *cn = cnode_map[cid];
+ cn->get();
+ return cn;
+ }
+
+ // on disk?
+ Extent cnode_loc;
+ if (collection_tab->lookup(cid, cnode_loc) < 0) {
+ // object dne.
+ return 0;
+ }
+
+ // already loading?
+ if (waitfor_cnode.count(cid)) {
+ // yep, just wait.  (the Cond lives on this stack frame until we are signalled.)
+ Cond c;
+ waitfor_cnode[cid].push_back(&c);
+ dout(10) << "get_cnode " << cid << " already loading, waiting" << endl;
+ c.Wait(ebofs_lock);
+ continue;
+ }
+
+ dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl;
+
+ assert(waitfor_cnode.count(cid) == 0);
+ waitfor_cnode[cid].clear(); // this should be empty initially.  operator[] creates the entry that marks cid as "loading".
+
+ // read it!
+ bufferlist bl;
+ //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl );
+ bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) );
+
+ ebofs_lock.Unlock(); // the lock is not held across the device read
+ dev.read( cnode_loc.start, cnode_loc.length, bl );
+ ebofs_lock.Lock();
+
+ // parse data block
+ Cnode *cn = new Cnode(cid);
+
+ cnode_map[cid] = cn;
+ cnode_lru.lru_insert_top(cn);
+
+ struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str();
+ cn->cnode_loc = ec->cnode_loc;
+
+ // parse attributes: num_attr records of { NUL-terminated key, int length, value bytes },
+ // the same layout encode_cnode writes.
+ char *p = bl.c_str() + sizeof(*ec);
+ for (int i=0; i<ec->num_attr; i++) {
+ string key = p;
+ p += key.length() + 1;
+ int len = *(int*)(p);
+ p += sizeof(len);
+ cn->attr[key] = buffer::copy(p, len);
+ p += len;
+ dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl;
+ }
+
+ // wake up other waiters
+ for (list<Cond*>::iterator i = waitfor_cnode[cid].begin();
+ i != waitfor_cnode[cid].end();
+ i++)
+ (*i)->Signal();
+ waitfor_cnode.erase(cid); // remove Cond list
+
+ cn->get();
+ return cn;
+ }
+ }
+
+ // Serialize a cnode into bl starting at byte offset off, advancing off past
+ // everything written.  Layout: one struct ebofs_cnode header followed, for
+ // each attribute, by the NUL-terminated key, an int value length and the
+ // value bytes.
+ void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off)
+ {
+   // fixed-size header
+   struct ebofs_cnode header;
+   header.cnode_loc = cn->cnode_loc;
+   header.coll_id = cn->coll_id;
+   header.num_attr = cn->attr.size();
+   bl.copy_in(off, sizeof(header), (char*)&header);
+   off += sizeof(header);
+
+   // attributes, in map order
+   map<string, bufferptr >::iterator a = cn->attr.begin();
+   while (a != cn->attr.end()) {
+     // key, including its terminating NUL
+     bl.copy_in(off, a->first.length()+1, a->first.c_str());
+     off += a->first.length()+1;
+
+     // value length
+     int len = a->second.length();
+     bl.copy_in(off, sizeof(int), (char*)&len);
+     off += sizeof(int);
+
+     // value bytes
+     bl.copy_in(off, len, a->second.c_str());
+     off += len;
+
+     dout(15) << "write_cnode " << *cn << " attr " << a->first << " len " << len << endl;
+     ++a;
+   }
+ }
+
+ // Encode a cnode and queue it to be written to the device.  The cnode is
+ // relocated on every write: its current extent (if any) is released, a new
+ // one is allocated, and collection_tab is updated to point at it.  A
+ // C_E_InodeFlush context is handed to dev.write() along with the buffer.
+ void Ebofs::write_cnode(Cnode *cn)
+ {
+   // size of the encoded cnode, rounded up to whole blocks
+   unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes();
+   unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1;
+
+   bufferlist bl;
+   bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
+
+   // give back the old location, if the cnode had one
+   if (cn->cnode_loc.length) {
+     allocator.release(cn->cnode_loc);
+   }
+
+   // pick a new location and repoint the collection table at it
+   allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD);
+   collection_tab->remove( cn->coll_id );
+   collection_tab->insert( cn->coll_id, cn->cnode_loc );
+
+   dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl;
+
+   // fill the buffer; the encoder must produce exactly the size computed above
+   unsigned encoded = 0;
+   encode_cnode(cn, bl, encoded);
+   assert(encoded == bytes);
+
+   dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl,
+              new C_E_InodeFlush(this), "write_cnode" );
+ }
+
+ // Destroy a collection node: drop its collection_tab entry, release its disk
+ // extent, detach it from the dirty set, cnode_map and the LRU, then delete it.
+ // After the put() here the reference count must be zero.
+ void Ebofs::remove_cnode(Cnode *cn)
+ {
+   dout(10) << "remove_cnode " << *cn << endl;
+
+   // on-disk bookkeeping: table entry and cnode blocks
+   collection_tab->remove(cn->coll_id);
+   if (cn->cnode_loc.length) {
+     allocator.release(cn->cnode_loc);
+   }
+
+   // in-memory bookkeeping: dirty set, lookup map, lru
+   if (cn->is_dirty()) {
+     dirty_cnodes.erase(cn);
+   }
+   cnode_map.erase(cn->coll_id);
+   cnode_lru.lru_remove(cn);
+
+   // count down refs; nobody else may still hold one
+   cn->mark_clean();
+   cn->put();
+   assert(cn->get_ref_count() == 0);
+
+   delete cn;
+
+   dirty = true;
+ }
+
+ // Release one reference on a cnode.  Unlike put_onode, nothing is freed
+ // here; this only forwards to Cnode::put().
+ void Ebofs::put_cnode(Cnode *cn)
+ {
+   cn->put();
+ }
+
+ // Record that a cnode has been modified: it is added to dirty_cnodes (which
+ // commit_inodes_start walks and writes out) unless it is already marked
+ // dirty, and the store-wide dirty flag is raised.
+ void Ebofs::dirty_cnode(Cnode *cn)
+ {
+   const bool already_dirty = cn->is_dirty();
+   if (!already_dirty) {
+     cn->mark_dirty();
+     dirty_cnodes.insert(cn);
+   }
+
+   dirty = true;
+ }
+
+
+
+
+
+ // Account for one finished onode/cnode write.  Takes ebofs_lock itself,
+ // decrements inodes_flushing, and signals inode_commit_cond (which
+ // commit_inodes_wait waits on) once the count reaches zero.
+ void Ebofs::flush_inode_finish()
+ {
+   ebofs_lock.Lock();
+
+   --inodes_flushing;
+
+   // only log once fewer than 1000 writes remain
+   if (inodes_flushing < 1000) {
+     dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl;
+   }
+
+   if (inodes_flushing == 0) {
+     inode_commit_cond.Signal();
+   }
+
+   ebofs_lock.Unlock();
+ }
+
+ // Start writing out every dirty onode and cnode.  Each one bumps
+ // inodes_flushing, is handed to write_onode / write_cnode and marked clean;
+ // both dirty sets are emptied afterwards.  No inode flush may be outstanding
+ // on entry.
+ void Ebofs::commit_inodes_start()
+ {
+   dout(10) << "commit_inodes_start" << endl;
+
+   assert(inodes_flushing == 0);
+
+   // onodes
+   for (set<Onode*>::iterator op = dirty_onodes.begin();
+        op != dirty_onodes.end();
+        ++op) {
+     Onode *onode = *op;
+     ++inodes_flushing;
+     write_onode(onode);
+     onode->mark_clean();
+     onode->uncommitted.clear();     // commit allocated blocks
+     onode->commit_waiters.clear();  // these guys are gonna get taken care of, bc we committed.
+   }
+   dirty_onodes.clear();
+
+   // cnodes
+   for (set<Cnode*>::iterator cp = dirty_cnodes.begin();
+        cp != dirty_cnodes.end();
+        ++cp) {
+     Cnode *cnode = *cp;
+     ++inodes_flushing;
+     write_cnode(cnode);
+     cnode->mark_clean();
+   }
+   dirty_cnodes.clear();
+
+   dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl;
+ }
+
+ // Block until every inode write counted in inodes_flushing has finished.
+ // The caller must hold ebofs_lock; it is the lock passed to the condition
+ // variable wait.
+ void Ebofs::commit_inodes_wait()
+ {
+   for (;;) {
+     if (inodes_flushing <= 0) break;
+     dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl;
+     inode_commit_cond.Wait(ebofs_lock);
+   }
+
+   dout(10) << "commit_inodes_wait all flushed" << endl;
+ }
+
+
+
+
+
+
+
+// *** buffer cache ***
+
+ // Locked entry point: trims the buffer cache with a target size of 0, i.e.
+ // trim_bc keeps expiring buffers for as long as any are trimmable.
+ void Ebofs::trim_buffer_cache()
+ {
+   ebofs_lock.Lock();
+
+   trim_bc(0);
+
+   ebofs_lock.Unlock();
+ }
+
+ // Expire clean buffer heads from the cache's lru_rest list until the cache is
+ // no larger than max or nothing trimmable is left.  A negative max selects
+ // g_conf.ebofs_bc_size.  When an object's cache becomes empty, the owning
+ // onode's object cache is closed.
+ void Ebofs::trim_bc(off_t max)
+ {
+   if (max < 0) {
+     max = g_conf.ebofs_bc_size;
+   }
+   dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl;
+
+   while (bc.get_size() > max &&
+          bc.get_trimmable()) {
+     BufferHead *victim = (BufferHead*) bc.lru_rest.lru_expire();
+     if (victim == 0) break;
+
+     dout(25) << "trim_bc trimming " << *victim << endl;
+     assert(victim->is_clean());
+
+     // remember the owning object cache before the bh is destroyed
+     ObjectCache *owner = victim->oc;
+     bc.remove_bh(victim);
+     delete victim;
+
+     if (!owner->is_empty()) continue;
+
+     // that was the object's last cached buffer
+     Onode *on = owner->on;
+     dout(10) << "trim_bc closing oc on " << *on << endl;
+     on->close_oc();
+   }
+
+   dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl;
+ }
+
+
+ // Signal commit_cond unconditionally; no lock is taken and no state is
+ // checked here.
+ void Ebofs::kick_idle()
+ {
+   dout(10) << "kick_idle" << endl;
+
+   commit_cond.Signal();
+
+   // Disabled variant, kept for reference, that only signalled when mounted,
+   // not unmounting and dirty:
+   /*
+   ebofs_lock.Lock();
+   if (mounted && !unmounting && dirty) {
+     dout(0) << "kick_idle dirty, doing commit" << endl;
+     commit_cond.Signal();
+   } else {
+     dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl;
+   }
+   ebofs_lock.Unlock();
+   */
+ }
+
+ // Non-blocking sync: when a callback is given, queue it on the commit
+ // waiters of the current super_epoch and mark the store dirty.  With a null
+ // onsafe this only takes and drops the lock.
+ void Ebofs::sync(Context *onsafe)
+ {
+   ebofs_lock.Lock();
+
+   if (onsafe != 0) {
+     dirty = true;
+     commit_waiters[super_epoch].push_back(onsafe);
+   }
+
+   ebofs_lock.Unlock();
+ }
+
+ // Blocking sync: if the store is dirty, repeatedly signal commit_cond and
+ // wait on sync_cond until super_epoch has moved past the value it had on
+ // entry.  Returns immediately (after logging) when nothing is dirty.
+ void Ebofs::sync()
+ {
+   ebofs_lock.Lock();
+
+   if (dirty) {
+     epoch_t entry_epoch = super_epoch;
+     dout(7) << "sync start in " << entry_epoch << endl;
+
+     while (super_epoch == entry_epoch) {
+       dout(7) << "sync kicking commit in " << super_epoch << endl;
+       dirty = true;
+       commit_cond.Signal();
+       sync_cond.Wait(ebofs_lock);
+     }
+
+     dout(10) << "sync finish in " << super_epoch << endl;
+   } else {
+     dout(7) << "sync in " << super_epoch << ", not dirty" << endl;
+   }
+
+   ebofs_lock.Unlock();
+ }
+
+
+
+ // Wait until the buffer cache reports no unflushed BHWRITE or PARTIAL work
+ // for the given epoch, then erase that epoch from both unflushed maps.
+ void Ebofs::commit_bc_wait(version_t epoch)
+ {
+   dout(10) << "commit_bc_wait on epoch " << epoch << endl;
+
+   for (;;) {
+     // same short-circuit order as a plain "bhwrite || partial" test
+     if (!(bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 ||
+           bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0))
+       break;
+
+     dout(10) << "commit_bc_wait epoch " << epoch
+              << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE)
+              << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL)
+              << endl;
+     bc.waitfor_flush();
+   }
+
+   // this epoch is done; forget its counters
+   bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch);
+   bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch);
+
+   dout(10) << "commit_bc_wait all flushed for epoch " << epoch
+            << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE)
+            << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL)
+            << endl;
+ }
+
+
+
+ // Fill a struct statfs with this store's figures.  Always returns 0.
+ // f_fsid is left untouched.
+ int Ebofs::statfs(struct statfs *buf)
+ {
+   dout(7) << "statfs" << endl;
+
+   // identity / geometry
+   buf->f_type = EBOFS_MAGIC;             // type of filesystem
+   buf->f_bsize = 4096;                   // optimal transfer block size
+   buf->f_namelen = 8;                    // maximum length of filenames
+
+   // block accounting: "free" includes limbo blocks, "avail" (usable for
+   // writing right now) does not
+   buf->f_blocks = dev.get_num_blocks();  // total data blocks in file system
+   buf->f_bfree = get_free_blocks()
+                  + get_limbo_blocks();   // free blocks in fs
+   buf->f_bavail = get_free_blocks();     // free blocks avail for writing
+
+   // node accounting comes from the node pool
+   buf->f_files = nodepool.num_total();   // total file nodes in file system
+   buf->f_ffree = nodepool.num_free();    // free file nodes in fs
+
+   return 0;
+ }
+
+
+
+
+/*
+ * allocate a write to blocks on disk.
+ * - take care to not overwrite any "safe" data blocks.
+ * - allocate/map new extents on disk as necessary
+ */
+void Ebofs::alloc_write(Onode *on,
+ block_t start, block_t len,
+ interval_set<block_t>& alloc,
+ block_t& old_bfirst, block_t& old_blast)
+{
+ // first decide what pages to (re)allocate
+ alloc.insert(start, len); // start with whole range
+
+ // figure out what bits are already uncommitted
+ interval_set<block_t> already_uncom;
+ already_uncom.intersection_of(alloc, on->uncommitted);
+
+ // subtract those off, so we're left with the committed bits (that must be reallocated).
+ alloc.subtract(already_uncom);
+
+ dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl;
+
+ // release it (into limbo)
+ for (map<block_t,block_t>::iterator i = alloc.m.begin();
+ i != alloc.m.end();
+ i++) {
+ // get old region
+ vector<Extent> old;
+ on->map_extents(i->first, i->second, old);
+ for (unsigned o=0; o<old.size(); o++)
+ allocator.release(old[o]);
+
+ // take note if first/last blocks in write range are remapped.. in case we need to do a partial read/write thing
+ // these are for partial, so we don't care about TX bh's, so don't worry about bits canceling stuff below.
+ if (!old.empty()) {
+ if (i->first == start) {
+ old_bfirst = old[0].start;
+ dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << endl;
+ }
+ if (i->first+i->second == start+len) {
+ old_blast = old[old.size()-1].last();
+ dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << endl;
+ }
+ }
+ }
+
+ // reallocate uncommitted too?
+ // ( --> yes. we can always make better allocation decisions later, with more information. )
+ if (g_conf.ebofs_realloc) {
+ list<BufferHead*> tx;
+
+ ObjectCache *oc = on->get_oc(&bc);
+ oc->find_tx(start, len, tx);
+
+ for (list<BufferHead*>::reverse_iterator p = tx.rbegin();
+ p != tx.rend();
+ p++) {
+ BufferHead *bh = *p;
+
+ // cancelable/moveable?
+ if (alloc.contains(bh->start(), bh->length())) {
+ dout(10) << "alloc_write " << *bh << " already in " << alloc << endl;
+ continue;
+ }
+
+ vector<Extent> old;
+ on->map_extents(bh->start(), bh->length(), old);
+ assert(old.size() == 1);
+
+ if (bh->start() >= start && bh->end() <= start+len) {
+ assert(bh->epoch_modified == super_epoch);
+ if (bc.bh_cancel_write(bh, super_epoch)) {
+ if (bh->length() == 1)
+ dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl;
+ // no, this isn't compatible with clone() and extent reference counting.
+ //allocator.unallocate(old[0]); // release (into free)
+ allocator.release(old[0]);
+ alloc.insert(bh->start(), bh->length());
+ } else {
+ if (bh->length() == 1)
+ dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl;
+ allocator.release(old[0]); // release (into limbo)
+ alloc.insert(bh->start(), bh->length());
+ }
+ } else {
+ if (bh->length() == 1)
+ dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within "
+ << start << "~" << len
+ << " bh " << *bh << endl;
+ }
+ }
+
+ dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl;
+ }
+
+ if (alloc.empty()) return; // no need to dirty the onode below!
+
+
+ // merge alloc into onode uncommitted map
+ //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl;
+ interval_set<block_t> old = on->uncommitted;
+ on->uncommitted.union_of(alloc);
+
+ dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl;
+
+ if (0) {
+ // verify
+ interval_set<block_t> ta;
+ ta.intersection_of(on->uncommitted, alloc);
+ cout << " ta " << ta << endl;
+ assert(alloc == ta);
+
+ interval_set<block_t> tb;
+ tb.intersection_of(on->uncommitted, old);
+ cout << " tb " << tb << endl;
+ assert(old == tb);
+ }
+
+ dirty_onode(on);
+
+ // allocate the space
+ for (map<block_t,block_t>::iterator i = alloc.m.begin();
+ i != alloc.m.end();
+ i++) {
+ dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl;
+
+ // allocate new space
+ block_t left = i->second;
+ block_t cur = i->first;
+ while (left > 0) {
+ Extent ex;
+ allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD);
+ dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl;
+ on->set_extent(cur, ex); // map object to new region
+ left -= ex.length;
+ cur += ex.length;
+ }
+ }
+}
+
+
+
+
+ void Ebofs::apply_write(Onode *on, off_t off, size_t len, bufferlist& bl)
+ { /*
+ * Apply a write of len bytes at byte offset off of object *on to the buffer
+ * cache, after (re)allocating the affected disk blocks through alloc_write().
+ *
+ * - bl supplies the data; when bl.length() == 0 the whole range is written
+ *   as zeros instead.
+ * - When off lies beyond the current object_size, the gap between the old
+ *   size and off is zero-filled as well (counted in zleft) and the block
+ *   range starts at the old end of the object.
+ * - object_size is raised to off+len, and the onode dirtied, when the write
+ *   extends the object.
+ * - With fake_writes set, on->uncommitted is cleared and the function
+ *   returns before anything is copied into the cache.
+ *
+ * Every buffer head returned by oc->map_write() gets version
+ * ++oc->write_count and epoch_modified = super_epoch.  A first or last block
+ * that is only partly covered is handled in one of two ways: if the bh is
+ * missing, rx or partial, the fragment is stored with add_partial() and the
+ * block is written once the partial is complete; if the bh is clean, dirty
+ * or tx, its buffer is copied and the affected bytes are overwritten in the
+ * copy.  Fully covered bhs get fresh page-aligned buffers.  bc.bh_write()
+ * is called for every bh that ends up with complete data.
+ *
+ * NOTE(review): blast is computed as (len+off-1) / EBOFS_BLOCK_SIZE, so this
+ * presumably expects len > 0 -- confirm with callers.
+ */
+ ObjectCache *oc = on->get_oc(&bc);
+
+ // map into blocks
+ off_t opos = off; // byte pos in object
+ size_t zleft = 0; // zeros left to write
+ size_t left = len; // bytes left
+
+ block_t bstart = off / EBOFS_BLOCK_SIZE;
+
+ if (off > on->object_size) {
+ zleft = off - on->object_size;
+ opos = on->object_size;
+ bstart = on->object_size / EBOFS_BLOCK_SIZE;
+ }
+ if (off+(off_t)len > on->object_size) {
+ dout(10) << "apply_write extending size on " << *on << ": " << on->object_size
+ << " -> " << off+len << endl;
+ on->object_size = off+len;
+ dirty_onode(on);
+ }
+ if (bl.length() == 0) {
+ zleft += len;
+ left = 0;
+ }
+ if (zleft)
+ dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl;
+
+ block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
+ block_t blen = blast-bstart+1;
+
+ // allocate write on disk.
+ interval_set<block_t> alloc;
+ block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read)
+ block_t old_blast = 0;
+ alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast);
+ dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl;
+
+ if (fake_writes) {
+ on->uncommitted.clear(); // worst case!
+ return;
+ }
+
+ // map b range onto buffer_heads
+ map<block_t, BufferHead*> hits;
+ oc->map_write(bstart, blen, alloc, hits, super_epoch);
+
+ // get current versions
+ //version_t lowv, highv;
+ //oc->scan_versions(bstart, blen, lowv, highv);
+ //highv++;
+ version_t highv = ++oc->write_count;
+
+ // copy from bl into buffer cache
+ unsigned blpos = 0; // byte pos in input buffer
+
+ // write data into buffers
+ for (map<block_t, BufferHead*>::iterator i = hits.begin();
+ i != hits.end();
+ i++) {
+ BufferHead *bh = i->second;
+ bh->set_version(highv);
+ bh->epoch_modified = super_epoch;
+
+ // old write in progress?
+ if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write
+ dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl;
+ bufferlist temp;
+ temp.claim(bh->data);
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+ bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
+ }
+
+ // need to split off partial? (partials can only be ONE block)
+ if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) {
+ if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) {
+ BufferHead *right = bc.split(bh, bh->start()+1);
+ hits[right->start()] = right;
+ dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl;
+ }
+ if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) &&
+ ((off_t)len+off < on->object_size)) {
+ BufferHead *right = bc.split(bh, bh->last());
+ hits[right->start()] = right;
+ dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl;
+ }
+ }
+
+ // partial at head or tail?
+ if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing...
+ (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) {
+ // locate ourselves in bh
+ unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE;
+ assert(off_in_bh >= 0);
+ unsigned len_in_bh = MIN( (off_t)(zleft+left),
+ (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos );
+
+ if (bh->is_partial() || bh->is_rx() || bh->is_missing()) {
+ assert(bh->is_partial() || bh->is_rx() || bh->is_missing());
+ assert(bh->length() == 1);
+
+ // add frag to partial
+ dout(10) << "apply_write writing into partial " << *bh << ":"
+ << " off_in_bh " << off_in_bh
+ << " len_in_bh " << len_in_bh
+ << endl;
+ unsigned z = MIN( zleft, len_in_bh );
+ if (z) {
+ bufferptr zp(z);
+ zp.zero();
+ bufferlist zb;
+ zb.push_back(zp);
+ bh->add_partial(off_in_bh, zb);
+ zleft -= z;
+ opos += z;
+ }
+
+ bufferlist sb;
+ sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer
+ bufferlist cp;
+ cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit!
+ bh->add_partial(off_in_bh, cp);
+ left -= len_in_bh-z;
+ blpos += len_in_bh-z;
+ opos += len_in_bh-z;
+
+ if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) {
+ dout(10) << "apply_write completed partial " << *bh << endl;
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers!
+ bh->data.clear();
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+ bh->data.zero();
+ bh->apply_partial();
+ bc.mark_dirty(bh);
+ bc.bh_write(on, bh);
+ }
+ else if (bh->is_rx()) {
+ dout(10) << "apply_write rx -> partial " << *bh << endl;
+ assert(bh->length() == 1);
+ bc.mark_partial(bh);
+ bc.bh_queue_partial_write(on, bh); // queue the eventual write
+ }
+ else if (bh->is_missing()) {
+ dout(10) << "apply_write missing -> partial " << *bh << endl;
+ assert(bh->length() == 1);
+ bc.mark_partial(bh);
+
+ // take care to read from _old_ disk block locations!
+ if (bh->start() == bstart)
+ bc.bh_read(on, bh, old_bfirst);
+ else if (bh->start() == blast)
+ bc.bh_read(on, bh, old_blast);
+ else assert(0);
+
+ bc.bh_queue_partial_write(on, bh); // queue the eventual write
+ }
+ else if (bh->is_partial()) {
+ dout(10) << "apply_write already partial, no need to submit rx on " << *bh << endl;
+ if (bh->partial_tx_epoch == super_epoch)
+ bc.bh_cancel_partial_write(bh);
+ bc.bh_queue_partial_write(on, bh); // queue the eventual write
+ }
+
+
+ } else {
+ assert(bh->is_clean() || bh->is_dirty() || bh->is_tx());
+
+ // just write into the bh!
+ dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":"
+ << " off_in_bh " << off_in_bh
+ << " len_in_bh " << len_in_bh
+ << endl;
+
+ // copy data into new buffers first (copy on write!)
+ // FIXME: only do the modified pages? this might be a big bh!
+ bufferlist temp;
+ temp.claim(bh->data);
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+ bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp);
+
+ unsigned z = MIN( zleft, len_in_bh );
+ if (z) {
+ bufferptr zp(z);
+ zp.zero();
+ bufferlist zb;
+ zb.push_back(zp);
+ bh->data.copy_in(off_in_bh, z, zb);
+ zleft -= z;
+ opos += z;
+ }
+
+ bufferlist sub;
+ sub.substr_of(bl, blpos, len_in_bh-z);
+ bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub);
+ blpos += len_in_bh-z;
+ left -= len_in_bh-z;
+ opos += len_in_bh-z;
+
+ if (!bh->is_dirty())
+ bc.mark_dirty(bh);
+
+ bc.bh_write(on, bh);
+ }
+ continue;
+ }
+
+ // ok, we're talking full block(s) now (modulo last block of the object)
+ assert(opos % EBOFS_BLOCK_SIZE == 0);
+ assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) ||
+ opos+(off_t)(zleft+left) == on->object_size);
+
+ // alloc new buffers.
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data);
+ bh->data.clear();
+ bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
+
+ // copy!
+ unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left);
+ assert(len_in_bh <= zleft+left);
+
+ dout(10) << "apply_write writing into " << *bh << ":"
+ << " len_in_bh " << len_in_bh
+ << endl;
+
+ unsigned z = MIN(len_in_bh, zleft);
+ if (z) {
+ bufferptr zp(z);
+ zp.zero();
+ bufferlist zb;
+ zb.push_back(zp);
+ bh->data.copy_in(0, z, zb);
+ zleft -= z;
+ }
+
+ bufferlist sub;
+ sub.substr_of(bl, blpos, len_in_bh-z);
+ bh->data.copy_in(z, len_in_bh-z, sub);
+
+ blpos += len_in_bh-z;
+ left -= len_in_bh-z;
+ opos += len_in_bh;
+
+ // old partial?
+ if (bh->is_partial() &&
+ bh->partial_tx_epoch == super_epoch)
+ bc.bh_cancel_partial_write(bh);
+
+ // mark dirty
+ if (!bh->is_dirty())
+ bc.mark_dirty(bh);
+
+ bc.bh_write(on, bh);
+ }
+
+ // everything supplied (zeros and data) must have been consumed, ending exactly at off+len
+ assert(zleft == 0);
+ assert(left == 0);
+ assert(opos == off+(off_t)len);
+ //assert(blpos == bl.length());
+ }
+
+
+
+
+// *** file i/o ***
+
+ // Try to satisfy a read of len bytes at byte offset off of object *on purely
+ // from the buffer cache.
+ //
+ // Returns true when every block of the range is available; bl then holds
+ // exactly len bytes.  Returns false when the caller has to wait: reads are
+ // started for all missing buffers, and a C_Cond built from will_wait_on /
+ // will_wait_on_bool is queued on the first buffer head the read is blocked
+ // on (missing, then insufficient partial, then rx).  The caller is expected
+ // to wait on that condition and call again.
+ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl,
+                          Cond *will_wait_on, bool *will_wait_on_bool)
+ {
+   dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl;
+   ObjectCache *oc = on->get_oc(&bc);
+
+   // map byte range onto blocks
+   block_t bstart = off / EBOFS_BLOCK_SIZE;
+   block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
+   block_t blen = blast-bstart+1;
+
+   map<block_t, BufferHead*> hits;
+   map<block_t, BufferHead*> missing;   // read these
+   map<block_t, BufferHead*> rx;        // wait for these
+   map<block_t, BufferHead*> partials;  // usable only if they cover our byte range
+   oc->map_read(bstart, blen, hits, missing, rx, partials);
+
+   // missing buffers?  start reads for all of them, wait on the first.
+   if (!missing.empty()) {
+     for (map<block_t,BufferHead*>::iterator i = missing.begin();
+          i != missing.end();
+          i++) {
+       dout(10) << "attempt_read missing buffer " << *(i->second) << endl;
+       bc.bh_read(on, i->second);
+     }
+     BufferHead *wait_on = missing.begin()->second;
+     block_t b = MAX(wait_on->start(), bstart);
+     wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool));
+     return false;
+   }
+
+   // are partials sufficient?
+   bool partials_ok = true;
+   for (map<block_t,BufferHead*>::iterator i = partials.begin();
+        i != partials.end();
+        i++) {
+     BufferHead *bh = i->second;
+     off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE);
+     off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE);
+     off_t start = MAX( off, bhstart );
+     off_t end = MIN( off+(off_t)len, bhend );
+
+     // Both bounds are byte offsets relative to the start of the bh, the same
+     // convention copy_partial_substr is called with further down.  (The end
+     // bound used to be computed relative to bhend, which is never positive.)
+     if (!bh->have_partial_range(start-bhstart, end-bhstart)) {
+       if (partials_ok) {
+         // wait on this one (only the first insufficient partial gets a waiter)
+         Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
+         dout(10) << "attempt_read insufficient partial buffer " << *bh << " c " << c << endl;
+         bh->waitfor_read[bh->start()].push_back(c);
+       }
+       partials_ok = false;
+     }
+   }
+   if (!partials_ok) return false;
+
+   // wait on rx?
+   if (!rx.empty()) {
+     BufferHead *wait_on = rx.begin()->second;
+     Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
+     dout(1) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl;
+     block_t b = MAX(wait_on->start(), bstart);
+     wait_on->waitfor_read[b].push_back(c);
+     return false;
+   }
+
+   // yay, we have it all!
+   // concurrently walk thru hits, partials.
+   map<block_t,BufferHead*>::iterator h = hits.begin();
+   map<block_t,BufferHead*>::iterator p = partials.begin();
+
+   bl.clear();
+   off_t pos = off;
+   block_t curblock = bstart;
+   while (curblock <= blast) {
+     // Pick whichever map has a bh starting at curblock.  Either iterator may
+     // already be at end() (one map empty or exhausted), so check before
+     // dereferencing.
+     BufferHead *bh = 0;
+     if (h != hits.end() && h->first == curblock) {
+       bh = h->second;
+       h++;
+     } else if (p != partials.end() && p->first == curblock) {
+       bh = p->second;
+       p++;
+     } else {
+       assert(0);
+     }
+
+     off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE);
+     off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE);
+     off_t start = MAX( pos, bhstart );
+     off_t end = MIN( off+(off_t)len, bhend );
+
+     if (bh->is_partial()) {
+       // copy from a partial block. yuck!
+       bufferlist frag;
+       bh->copy_partial_substr( start-bhstart, end-bhstart, frag );
+       bl.claim_append( frag );
+       pos += frag.length();
+     } else {
+       // copy from a full block.
+       if (bhstart == start && bhend == end) {
+         bl.append( bh->data );
+         pos += bh->data.length();
+       } else {
+         bufferlist frag;
+         dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl;
+         frag.substr_of(bh->data, start-bhstart, end-start);
+         pos += frag.length();
+         bl.claim_append( frag );
+       }
+     }
+
+     curblock = bh->end();
+     /* this assert is more trouble than it's worth
+     assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos ||  // should be aligned with next block
+            end != bhend ||                               // or we ended midway through bh
+            (bh->last() == blast && end == bhend));       // ended last block ** FIXME WRONG???
+     */
+   }
+
+   assert(bl.length() == len);
+   return true;
+ }
+
+
+/*
+ * is_cached -- query whether a object extent is in our cache
+ * return value of -1 if onode isn't loaded. otherwise, the number
+ * of extents that need to be read (i.e. # of seeks)
+ */
+ // Locked wrapper around _is_cached(): takes ebofs_lock for the duration of
+ // the query and passes the result through unchanged.
+ int Ebofs::is_cached(object_t oid, off_t off, size_t len)
+ {
+   ebofs_lock.Lock();
+
+   int result = _is_cached(oid, off, len);
+
+   ebofs_lock.Unlock();
+   return result;
+ }
+
+ // Report how much of an object extent would need device reads.
+ //
+ // Returns -1 when the onode for oid is not in onode_map.  When the onode
+ // has no object cache, returns the number of extents in the object.
+ // Otherwise returns the number of buffer heads in the requested range that
+ // map_read classifies as missing, rx or partial.
+ //
+ // NOTE(review): the block range is computed from (len+off-1), so this
+ // presumably expects len > 0 -- confirm with callers.
+ int Ebofs::_is_cached(object_t oid, off_t off, size_t len)
+ {
+   if (onode_map.count(oid) == 0) {
+     dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... onode " << endl;
+     return -1;  // object dne?
+   }
+
+   // Use the cached onode.  (This pointer used to be left at 0 and was then
+   // dereferenced below.)  No reference is taken, so no put_onode is needed.
+   Onode *on = onode_map[oid];
+
+   if (!on->have_oc()) {
+     // nothing is cached. return # of extents in file.
+     return on->extent_map.size();
+   }
+
+   // map byte range onto blocks
+   block_t bstart = off / EBOFS_BLOCK_SIZE;
+   block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
+   block_t blen = blast-bstart+1;
+
+   map<block_t, BufferHead*> hits;
+   map<block_t, BufferHead*> missing;   // read these
+   map<block_t, BufferHead*> rx;        // wait for these
+   map<block_t, BufferHead*> partials;  // ??
+   on->get_oc(&bc)->map_read(bstart, blen, hits, missing, rx, partials);
+
+   // FIXME: actually, we should calculate if these extents are contiguous.
+   // and not using map_read, probably...
+   return missing.size() + rx.size() + partials.size();
+ }
+
+ // Locked wrapper around _read(): takes ebofs_lock for the duration of the
+ // read and passes the result through unchanged.
+ int Ebofs::read(object_t oid,
+                 off_t off, size_t len,
+                 bufferlist& bl)
+ {
+   ebofs_lock.Lock();
+
+   int result = _read(oid, off, len, bl);
+
+   ebofs_lock.Unlock();
+   return result;
+ }
+
+ // Read up to len bytes at byte offset off of object oid into bl, blocking on
+ // the buffer cache as needed.  len == 0 means "up to the end of the object".
+ // The read is clipped at the current object size.
+ //
+ // Returns the number of bytes read, -ENOENT if the object does not exist or
+ // is deleted while waiting, or -ESPIPE if off is at or past the end of the
+ // object.  The cond wait uses ebofs_lock, so the caller must hold it.
+ int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl)
+ {
+   dout(7) << "_read " << oid << " " << off << "~" << len << endl;
+
+   Onode *on = get_onode(oid);
+   if (!on) {
+     dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl;
+     return -ENOENT;  // object dne?
+   }
+
+   // read data into bl. block as necessary.
+   Cond cond;
+
+   int r = 0;
+   while (1) {
+     // check size bound (re-checked every pass; the size is re-read after each wait)
+     if (off >= on->object_size) {
+       dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl;
+       r = -ESPIPE;  // FIXME better errno?
+       break;
+     }
+
+     size_t try_len = len ? len:on->object_size;
+     size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off;
+
+     // Start from a known value: the wait loop below reads this flag, and
+     // nothing in this function sets it before attempt_read returns.
+     bool done = false;
+     if (attempt_read(on, off, will_read, bl, &cond, &done))
+       break;  // yay
+
+     // wait for the completion queued by attempt_read
+     while (!done)
+       cond.Wait(ebofs_lock);
+
+     if (on->deleted) {
+       dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl;
+       r = -ENOENT;
+       break;
+     }
+   }
+
+   put_onode(on);
+
+   trim_bc();
+
+   if (r < 0) return r;   // return error,
+   dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl;
+   return bl.length();    // or bytes read.
+ }
+
+
+ // True when the buffer cache's dirty and tx statistics together exceed
+ // g_conf.ebofs_bc_max_dirty.  Takes no lock itself.
+ bool Ebofs::_write_will_block()
+ {
+   if (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty)
+     return true;
+   return false;
+ }
+
+ // Locked wrapper around _write_will_block(): takes ebofs_lock for the
+ // duration of the check and passes the result through unchanged.
+ bool Ebofs::write_will_block()
+ {
+   ebofs_lock.Lock();
+
+   bool would_block = _write_will_block();
+
+   ebofs_lock.Unlock();
+   return would_block;
+ }
+
+
+unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe)
+{
+ ebofs_lock.Lock();
+ dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl;
+
+ // do ops
+ unsigned r = 0; // bit fields indicate which ops failed.
+ int bit = 1;
+ for (list<int>::iterator p = t.ops.begin();
+ p != t.ops.end();
+ p++) {
+ switch (*p) {
+ case Transaction::OP_READ:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ off_t offset = t.offsets.front(); t.offsets.pop_front();
+ size_t len = t.lengths.front(); t.lengths.pop_front();
+ bufferlist *pbl = t.pbls.front(); t.pbls.pop_front();
+ if (_read(oid, offset, len, *pbl) < 0) {
+ dout(7) << "apply_transaction fail on _read" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_STAT:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ struct stat *st = t.psts.front(); t.psts.pop_front();
+ if (_stat(oid, st) < 0) {
+ dout(7) << "apply_transaction fail on _stat" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_GETATTR:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+ pair<void*,int*> pattrval = t.pattrvals.front(); t.pattrvals.pop_front();
+ if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) {
+ dout(7) << "apply_transaction fail on _getattr" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_GETATTRS:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ map<string,bufferptr> *pset = t.pattrsets.front(); t.pattrsets.pop_front();
+ if (_getattrs(oid, *pset) < 0) {
+ dout(7) << "apply_transaction fail on _getattrs" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+
+ case Transaction::OP_WRITE:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ off_t offset = t.offsets.front(); t.offsets.pop_front();
+ size_t len = t.lengths.front(); t.lengths.pop_front();
+ bufferlist bl = t.bls.front(); t.bls.pop_front();
+ if (_write(oid, offset, len, bl) < 0) {
+ dout(7) << "apply_transaction fail on _write" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ off_t len = t.offsets.front(); t.offsets.pop_front();
+ if (_truncate(oid, len) < 0) {
+ dout(7) << "apply_transaction fail on _truncate" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ if (_remove(oid) < 0) {
+ dout(7) << "apply_transaction fail on _remove" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+ //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ bufferlist bl;
+ bl.claim( t.attrbls.front() );
+ t.attrbls.pop_front();
+ if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) {
+ dout(7) << "apply_transaction fail on _setattr" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ map<string,bufferptr> *pattrset = t.pattrsets.front(); t.pattrsets.pop_front();
+ if (_setattrs(oid, *pattrset) < 0) {
+ dout(7) << "apply_transaction fail on _setattrs" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+ if (_rmattr(oid, attrname) < 0) {
+ dout(7) << "apply_transaction fail on _rmattr" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ object_t noid = t.oids.front(); t.oids.pop_front();
+ if (_clone(oid, noid) < 0) {
+ dout(7) << "apply_transaction fail on _clone" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ if (_create_collection(cid) < 0) {
+ dout(7) << "apply_transaction fail on _create_collection" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ if (_destroy_collection(cid) < 0) {
+ dout(7) << "apply_transaction fail on _destroy_collection" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ if (_collection_add(cid, oid) < 0) {
+ //dout(7) << "apply_transaction fail on _collection_add" << endl;
+ //r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ object_t oid = t.oids.front(); t.oids.pop_front();
+ if (_collection_remove(cid, oid) < 0) {
+ dout(7) << "apply_transaction fail on _collection_remove" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+ //pair<const void*,int> attrval = t.attrvals.front(); t.attrvals.pop_front();
+ bufferlist bl;
+ bl.claim( t.attrbls.front() );
+ t.attrbls.pop_front();
+ if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) {
+ //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) {
+ dout(7) << "apply_transaction fail on _collection_setattr" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ coll_t cid = t.cids.front(); t.cids.pop_front();
+ const char *attrname = t.attrnames.front(); t.attrnames.pop_front();
+ if (_collection_rmattr(cid, attrname) < 0) {
+ dout(7) << "apply_transaction fail on _collection_rmattr" << endl;
+ r &= bit;
+ }
+ }
+ break;
+
+ default:
+ cerr << "bad op " << *p << endl;
+ assert(0);
+ }
+
+ bit = bit << 1;
+ }
+
+ dout(7) << "apply_transaction finish (r = " << r << ")" << endl;
+
+ // set up commit waiter
+ //if (r == 0) {
+ if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+ //} else {
+ //if (onsafe) delete onsafe;
+ //}
+
+ ebofs_lock.Unlock();
+ return r;
+}
+
+
+
+int Ebofs::_write(object_t oid, off_t offset, size_t length, bufferlist& bl)
+{
+  // Apply a write of `length` bytes from `bl` at `offset` in object `oid`,
+  // creating the onode when it does not exist yet.  This variant does not
+  // take ebofs_lock itself; the public write() wrapper does that.
+  //
+  // Returns `length` on success, -ENOSPC when the conservative block
+  // estimate is not below free_blocks, -EACCES when the onode is readonly.
+  dout(7) << "_write " << oid << " " << offset << "~" << length << endl;
+  assert(bl.length() == length);
+
+  // Throttle: with too much unflushed dirty data we wait here first.
+  if (_write_will_block()) {
+    dout(10) << "_write blocking "
+             << oid << " " << offset << "~" << length
+             << " bc: "
+             << "size " << bc.get_size()
+             << ", trimmable " << bc.get_trimmable()
+             << ", max " << g_conf.ebofs_bc_size
+             << "; dirty " << bc.get_stat_dirty()
+             << ", tx " << bc.get_stat_tx()
+             << ", max dirty " << g_conf.ebofs_bc_max_dirty
+             << endl;
+
+    while (_write_will_block()) {
+      bc.waitfor_stat();  // waits on ebofs_lock
+    }
+
+    dout(10) << "_write unblocked "
+             << oid << " " << offset << "~" << length
+             << " bc: "
+             << "size " << bc.get_size()
+             << ", trimmable " << bc.get_trimmable()
+             << ", max " << g_conf.ebofs_bc_size
+             << "; dirty " << bc.get_stat_dirty()
+             << ", tx " << bc.get_stat_tx()
+             << ", max dirty " << g_conf.ebofs_bc_max_dirty
+             << endl;
+  }
+
+  // Space check.  Very conservative: assumes every block up to offset+length
+  // has to be rewritten, plus some slack and the count of dirty onodes/cnodes.
+  unsigned worst_case = (length+offset) / EBOFS_BLOCK_SIZE + 10;
+  worst_case += dirty_onodes.size() + dirty_cnodes.size();
+  if (!(worst_case < free_blocks)) {
+    dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << worst_case << endl;
+    return -ENOSPC;
+  }
+
+  // Look the onode up, or make a fresh one for a brand new object.
+  Onode *onode = get_onode(oid);
+  if (!onode) {
+    onode = new_onode(oid);
+  }
+
+  if (!onode->readonly) {
+    dirty_onode(onode);
+
+    // push the data into the buffer cache (nothing to do for empty writes)
+    if (length > 0) {
+      apply_write(onode, offset, length, bl);
+    }
+
+    put_onode(onode);
+    trim_bc();
+    return length;
+  }
+
+  // readonly onodes refuse writes
+  put_onode(onode);
+  return -EACCES;
+}
+
+
+/*int Ebofs::write(object_t oid,
+ off_t off, size_t len,
+ bufferlist& bl, bool fsync)
+{
+ // wait?
+ if (fsync) {
+ // wait for flush.
+ Cond cond;
+ bool done;
+ int flush = 1; // write never returns positive
+ Context *c = new C_Cond(&cond, &done, &flush);
+ int r = write(oid, off, len, bl, c);
+ if (r < 0) return r;
+
+ ebofs_lock.Lock();
+ {
+ while (!done)
+ cond.Wait(ebofs_lock);
+ assert(flush <= 0);
+ }
+ ebofs_lock.Unlock();
+ if (flush < 0) return flush;
+ return r;
+ } else {
+ // don't wait for flush.
+ return write(oid, off, len, bl, (Context*)0);
+ }
+}
+*/
+
+int Ebofs::write(object_t oid,
+                 off_t off, size_t len,
+                 bufferlist& bl, Context *onsafe)
+{
+  // Locked front end for _write().  A positive result must equal `len`;
+  // in that case `onsafe` (if any) is queued on the commit waiters of the
+  // current super_epoch.  Otherwise `onsafe` is deleted without being queued.
+  ebofs_lock.Lock();
+  assert(len > 0);
+
+  const int r = _write(oid, off, len, bl);
+
+  if (r <= 0) {
+    if (onsafe) delete onsafe;
+  } else {
+    assert((size_t)r == len);
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+int Ebofs::_remove(object_t oid)
+{
+  // Remove object `oid`.  Returns 0, or -ENOENT when no onode is found.
+  dout(7) << "_remove " << oid << endl;
+
+  Onode *victim = get_onode(oid);
+  if (victim) {
+    remove_onode(victim);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+
+int Ebofs::remove(object_t oid, Context *onsafe)
+{
+  // Locked wrapper around _remove().
+  ebofs_lock.Lock();
+
+  const int r = _remove(oid);
+
+  // Success: queue the callback on the current epoch's commit waiters.
+  // Failure: the callback is deleted without being queued.
+  if (onsafe) {
+    if (r < 0)
+      delete onsafe;
+    else
+      commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+// Shrink object `oid` to `size` bytes.
+//
+// Returns 0 when the object was shrunk or already had exactly that size,
+// -ENOENT when no onode is found, -EACCES when the onode is readonly, and
+// -EINVAL when `size` is larger than the current object_size (this routine
+// never grows an object).
+int Ebofs::_truncate(object_t oid, off_t size)
+{
+ dout(7) << "_truncate " << oid << " size " << size << endl;
+
+ Onode *on = get_onode(oid);
+ if (!on)
+ return -ENOENT;
+ if (on->readonly) {
+ put_onode(on);
+ return -EACCES;
+ }
+
+ int r = 0;
+ if (size > on->object_size) {
+ r = -EINVAL; // growing via truncate is rejected
+ }
+ else if (size < on->object_size) {
+ // change size
+ on->object_size = size;
+ dirty_onode(on);
+
+ // free blocks
+ // nblocks = number of whole blocks needed to hold `size` bytes (0 for size 0)
+ block_t nblocks = 0;
+ if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE;
+ if (on->object_blocks > nblocks) {
+ // extents cut off by truncate_extents() are handed back to the allocator
+ vector<Extent> extra;
+ on->truncate_extents(nblocks, extra);
+ for (unsigned i=0; i<extra.size(); i++)
+ allocator.release(extra[i]);
+ }
+
+ // truncate buffer cache
+ // (uses on->object_blocks as it stands after the extent truncation above)
+ if (on->oc) {
+ on->oc->truncate(on->object_blocks, super_epoch);
+ if (on->oc->is_empty())
+ on->close_oc();
+ }
+
+ // update uncommitted
+ // keep only the part of on->uncommitted that lies within [0, nblocks);
+ // for nblocks == 0 the set becomes empty
+ interval_set<block_t> uncom;
+ if (nblocks > 0) {
+ interval_set<block_t> left;
+ left.insert(0, nblocks);
+ uncom.intersection_of(left, on->uncommitted);
+ }
+ dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << endl;
+ on->uncommitted = uncom;
+
+ }
+ else {
+ // same size: nothing to do
+ assert(size == on->object_size);
+ }
+
+ put_onode(on);
+ return r;
+}
+
+
+int Ebofs::truncate(object_t oid, off_t size, Context *onsafe)
+{
+  // Locked wrapper around _truncate().
+  ebofs_lock.Lock();
+
+  const int r = _truncate(oid, size);
+
+  // commit waiter: queued when r >= 0, otherwise deleted
+  if (r < 0) {
+    if (onsafe) delete onsafe;
+  } else {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+
+
+int Ebofs::clone(object_t from, object_t to, Context *onsafe)
+{
+  // Locked wrapper around _clone().
+  ebofs_lock.Lock();
+
+  const int r = _clone(from, to);
+
+  // A non-negative result queues `onsafe` on the commit waiters of the
+  // current super_epoch; a negative one deletes it.
+  if (onsafe) {
+    if (r >= 0)
+      commit_waiters[super_epoch].push_back(onsafe);
+    else
+      delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::_clone(object_t from, object_t to)
+{
+  // Create `to` as a readonly copy of `from` that shares its extents.
+  //
+  // Returns 0 on success, -1 when g_conf.ebofs_cloneable is off, -ENOENT when
+  // `from` has no onode, -EEXIST when `to` already has one.
+  dout(7) << "_clone " << from << " -> " << to << endl;
+
+  if (!g_conf.ebofs_cloneable)
+    return -1;  // no!
+
+  Onode *src = get_onode(from);
+  if (!src) return -ENOENT;
+
+  // the destination must not exist yet
+  if (Onode *existing = get_onode(to)) {
+    put_onode(src);
+    put_onode(existing);
+    return -EEXIST;
+  }
+  Onode *dst = new_onode(to);
+  assert(dst);
+
+  // copy easy bits
+  dst->readonly = true;
+  dst->object_size = src->object_size;
+  dst->object_blocks = src->object_blocks;
+  dst->attr = src->attr;
+
+  // the clone joins every collection the source is in
+  set<coll_t>::iterator c = src->collections.begin();
+  while (c != src->collections.end()) {
+    _collection_add(*c, to);
+    c++;
+  }
+
+  // extents: copy the map, then alloc_inc() each extent now referenced twice
+  dst->extent_map = src->extent_map;
+  map<block_t, Extent>::iterator e = dst->extent_map.begin();
+  while (e != dst->extent_map.end()) {
+    allocator.alloc_inc(e->second);
+    ++e;
+  }
+
+  // clear uncommitted on the source
+  src->uncommitted.clear();
+
+  // muck with ObjectCache
+  if (src->oc) {
+    src->oc->clone_to( dst );
+  }
+
+  // ok!
+  put_onode(dst);
+  put_onode(src);
+  return 0;
+}
+
+
+
+
+/*
+ * pick object revision with rev < specified rev.
+ * (oid.rev is a noninclusive upper bound.)
+ *
+ * On entry oid.rev must be non-zero.  On a match, `oid` is overwritten with
+ * the chosen key (either an entry from object_tab or the "live" object with
+ * rev 0) and 0 is returned.  When nothing matches, `oid` is left untouched
+ * and -EEXIST is returned.  Takes ebofs_lock for the duration of the search.
+ */
+int Ebofs::pick_object_revision_lt(object_t& oid)
+{
+ assert(oid.rev > 0); // this is only useful for non-zero oid.rev
+
+ int r = -EEXIST; // return code
+ ebofs_lock.Lock();
+ {
+ object_t orig = oid; // kept only for the debug message below
+ object_t live = oid; // same object, rev 0
+ live.rev = 0;
+
+ if (object_tab->get_num_keys() > 0) {
+ Table<object_t, Extent>::Cursor cursor(object_tab);
+
+ object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev
+ if (cursor.move_left() > 0) {
+ // walk left through the table while we stay on the same (ino, bno)
+ bool firstpass = true;
+ while (1) {
+ object_t t = cursor.current().key;
+ if (t.ino != oid.ino ||
+ t.bno != oid.bno) // passed to previous object
+ break;
+ // NOTE(review): the test below is oid.rev < t.rev, while the comment
+ // talks about "rev < desired" -- confirm which direction is intended.
+ if (oid.rev < t.rev) { // rev < desired. possible match.
+ r = 0;
+ oid = t;
+ break;
+ }
+ if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live.
+ r = 0;
+ oid = live;
+ break;
+ }
+ if (cursor.move_left() <= 0) break;
+ firstpass = false;
+ }
+ }
+ }
+
+ dout(8) << "find_object_revision " << orig << " -> " << oid
+ << " r=" << r << endl;
+ }
+ ebofs_lock.Unlock();
+ return r;
+}
+
+
+
+
+bool Ebofs::exists(object_t oid)
+{
+  // True when object_tab->lookup() reports 0 for `oid`; done under ebofs_lock.
+  ebofs_lock.Lock();
+  dout(8) << "exists " << oid << endl;
+  bool found = false;
+  if (object_tab->lookup(oid) == 0) {
+    found = true;
+  }
+  ebofs_lock.Unlock();
+  return found;
+}
+
+int Ebofs::stat(object_t oid, struct stat *st)
+{
+  // Locked wrapper: holds ebofs_lock for the duration of _stat().
+  ebofs_lock.Lock();
+  const int result = _stat(oid, st);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_stat(object_t oid, struct stat *st)
+{
+  // Fill in `st` for object `oid`.  Only st_size is written.
+  // Returns 0, or -ENOENT when no onode is found.
+  dout(7) << "_stat " << oid << endl;
+
+  Onode *on = get_onode(oid);
+  if (on) {
+    st->st_size = on->object_size;
+    put_onode(on);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+
+int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size)
+{
+  // Set attribute `name` on object `oid` to a copy of the `size` bytes at
+  // `value`.  Returns 0, -ENOENT when no onode is found, -EACCES when the
+  // onode is readonly.
+  dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  if (!on->readonly) {
+    string key(name);
+    on->attr[key] = buffer::copy((char*)value, size);
+    dirty_onode(on);
+    put_onode(on);
+
+    dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl;
+    return 0;
+  }
+
+  put_onode(on);
+  return -EACCES;
+}
+
+int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe)
+{
+  // Locked wrapper around _setattr().
+  ebofs_lock.Lock();
+  const int r = _setattr(oid, name, value, size);
+
+  // commit waiter: queue on success, delete on failure
+  if (r < 0) {
+    if (onsafe) delete onsafe;
+  } else {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::_setattrs(object_t oid, map<string,bufferptr>& attrset)
+{
+  // Replace the whole attribute map of object `oid` with `attrset`.
+  // Returns 0, -ENOENT when no onode is found, -EACCES when it is readonly.
+  dout(8) << "setattrs " << oid << endl;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  int r = -EACCES;
+  if (!on->readonly) {
+    on->attr = attrset;
+    dirty_onode(on);
+    r = 0;
+  }
+  put_onode(on);
+  return r;
+}
+
+int Ebofs::setattrs(object_t oid, map<string,bufferptr>& attrset, Context *onsafe)
+{
+  // Locked wrapper around _setattrs().
+  ebofs_lock.Lock();
+  const int r = _setattrs(oid, attrset);
+
+  // `onsafe` joins the commit waiters of the current super_epoch when the
+  // call succeeded; it is deleted when it failed.
+  if (onsafe) {
+    if (r < 0)
+      delete onsafe;
+    else
+      commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size)
+{
+  // Locked wrapper: holds ebofs_lock for the duration of _getattr().
+  ebofs_lock.Lock();
+  const int result = _getattr(oid, name, value, size);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size)
+{
+  // Copy attribute `name` of object `oid` into `value`, at most `size` bytes.
+  // Returns the number of bytes copied, -ENOENT when no onode is found, or
+  // -1 when the attribute does not exist.
+  dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  int r = 0;
+  map<string,bufferptr>::iterator hit = on->attr.find(string(name));
+  if (hit != on->attr.end()) {
+    // longer values are cut to the caller's buffer size
+    r = MIN( hit->second.length(), size );
+    dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << endl;
+    memcpy(value, hit->second.c_str(), r );
+  } else {
+    dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl;
+    r = -1;
+  }
+
+  put_onode(on);
+  return r;
+}
+
+int Ebofs::getattrs(object_t oid, map<string,bufferptr> &aset)
+{
+  // Locked wrapper: holds ebofs_lock for the duration of _getattrs().
+  ebofs_lock.Lock();
+  const int result = _getattrs(oid, aset);
+  ebofs_lock.Unlock();
+  return result;
+}
+
+int Ebofs::_getattrs(object_t oid, map<string,bufferptr> &aset)
+{
+  // Copy the whole attribute map of object `oid` into `aset`.
+  // Returns 0, or -ENOENT when no onode is found (aset is then untouched).
+  dout(8) << "_getattrs " << oid << endl;
+
+  Onode *on = get_onode(oid);
+  if (on) {
+    aset = on->attr;
+    put_onode(on);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+
+
+int Ebofs::_rmattr(object_t oid, const char *name)
+{
+  // Erase attribute `name` from object `oid`.
+  // Returns 0 (also when the attribute was not present), -ENOENT when no
+  // onode is found, -EACCES when the onode is readonly.
+  dout(8) << "_rmattr " << oid << " '" << name << "'" << endl;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  int r = -EACCES;
+  if (!on->readonly) {
+    string key(name);
+    on->attr.erase(key);
+    dirty_onode(on);
+    r = 0;
+  }
+  put_onode(on);
+  return r;
+}
+
+int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe)
+{
+  // Locked wrapper around _rmattr().
+  ebofs_lock.Lock();
+
+  const int r = _rmattr(oid, name);
+
+  // commit waiter: queue on success, delete on failure
+  if (r < 0) {
+    if (onsafe) delete onsafe;
+  } else {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::listattr(object_t oid, vector<string>& attrs)
+{
+  // Replace `attrs` with the attribute names of object `oid`.
+  // Returns 0, or -ENOENT when no onode is found (attrs is then untouched).
+  ebofs_lock.Lock();
+  dout(8) << "listattr " << oid << endl;
+
+  Onode *on = get_onode(oid);
+  if (on) {
+    attrs.clear();
+    map<string,bufferptr>::iterator it = on->attr.begin();
+    while (it != on->attr.end()) {
+      attrs.push_back(it->first);
+      it++;
+    }
+    put_onode(on);
+    ebofs_lock.Unlock();
+    return 0;
+  }
+
+  ebofs_lock.Unlock();
+  return -ENOENT;
+}
+
+
+
+/***************** collections ******************/
+
+int Ebofs::list_collections(list<coll_t>& ls)
+{
+  // Append the key of every collection_tab entry to `ls` (which is not
+  // cleared first) and return how many were appended.
+  ebofs_lock.Lock();
+  dout(9) << "list_collections " << endl;
+
+  Table<coll_t, Extent>::Cursor cursor(collection_tab);
+
+  int count = 0;
+  if (collection_tab->find(0, cursor) >= 0) {
+    do {
+      ls.push_back(cursor.current().key);
+      count++;
+    } while (cursor.move_right() > 0);
+  }
+
+  ebofs_lock.Unlock();
+  return count;
+}
+
+int Ebofs::_create_collection(coll_t cid)
+{
+  // Create collection `cid`.  Returns 0, or -EEXIST when it already exists.
+  dout(9) << "_create_collection " << hex << cid << dec << endl;
+
+  if (!_collection_exists(cid)) {
+    Cnode *fresh = new_cnode(cid);
+    put_cnode(fresh);
+    return 0;
+  }
+  return -EEXIST;
+}
+
+int Ebofs::create_collection(coll_t cid, Context *onsafe)
+{
+  // Locked wrapper around _create_collection().
+  ebofs_lock.Lock();
+
+  const int r = _create_collection(cid);
+
+  // `onsafe` is queued on the current epoch's commit waiters on success and
+  // deleted on failure.
+  if (onsafe) {
+    if (r >= 0)
+      commit_waiters[super_epoch].push_back(onsafe);
+    else
+      delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::_destroy_collection(coll_t cid)
+{
+  // Destroy collection `cid`: drop every membership record, then the cnode.
+  // Returns 0, or -ENOENT when the collection does not exist.
+  dout(9) << "_destroy_collection " << hex << cid << dec << endl;
+
+  if (!_collection_exists(cid))
+    return -ENOENT;
+
+  Cnode *cn = get_cnode(cid);
+  assert(cn);
+
+  // hose mappings
+  // NOTE(review): collection_list() takes ebofs_lock itself, and the callers
+  // visible here already hold it -- presumably the Mutex allows that; confirm.
+  list<object_t> members;
+  collection_list(cid, members);
+  list<object_t>::iterator m = members.begin();
+  while (m != members.end()) {
+    co_tab->remove(coll_object_t(cid,*m));
+
+    // also forget the collection on the object's own onode, if it has one
+    if (Onode *on = get_onode(*m)) {
+      on->collections.erase(cid);
+      dirty_onode(on);
+      put_onode(on);
+    }
+    m++;
+  }
+
+  remove_cnode(cn);
+  return 0;
+}
+
+int Ebofs::destroy_collection(coll_t cid, Context *onsafe)
+{
+  // Locked wrapper around _destroy_collection().
+  ebofs_lock.Lock();
+
+  const int r = _destroy_collection(cid);
+
+  // commit waiter: queue on success, delete on failure
+  if (r < 0) {
+    if (onsafe) delete onsafe;
+  } else {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  }
+
+  ebofs_lock.Unlock();
+  return r;
+}
+
+bool Ebofs::collection_exists(coll_t cid)
+{
+  // Locked wrapper around _collection_exists().
+  ebofs_lock.Lock();
+  dout(10) << "collection_exists " << hex << cid << dec << endl;
+  const bool present = _collection_exists(cid);
+  ebofs_lock.Unlock();
+  return present;
+}
+bool Ebofs::_collection_exists(coll_t cid)
+{
+  // A lookup() result of 0 in collection_tab is treated as "exists".
+  if (collection_tab->lookup(cid) == 0)
+    return true;
+  return false;
+}
+
+int Ebofs::_collection_add(coll_t cid, object_t oid)
+{
+  // Add object `oid` to collection `cid`.
+  // Returns 0 on success and -ENOENT when the collection does not exist,
+  // when the object has no onode, or when it is already a member.
+  dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl;
+
+  if (!_collection_exists(cid))
+    return -ENOENT;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  int r = 0;
+  if (on->collections.count(cid) != 0) {
+    r = -ENOENT;  // FIXME?  already in collection.
+  } else {
+    // record the membership on the onode and in the (collection, object) table
+    on->collections.insert(cid);
+    dirty_onode(on);
+    co_tab->insert(coll_object_t(cid,oid), true);
+  }
+
+  put_onode(on);
+  return r;
+}
+
+int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe)
+{
+  // Locked wrapper around _collection_add().
+  //
+  // Returns the result of _collection_add(): 0 on success, negative on
+  // failure.  On success `onsafe` (if any) is queued on the commit waiters
+  // of the current super_epoch; on failure it is deleted.
+  ebofs_lock.Lock();
+
+  int r = _collection_add(cid, oid);
+
+  // set up commit waiter
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+
+  // Report the real outcome: on failure the callback has just been deleted,
+  // so telling the caller "0" would leave it waiting for a callback that can
+  // never fire.
+  return r;
+}
+
+int Ebofs::_collection_remove(coll_t cid, object_t oid)
+{
+  // Remove object `oid` from collection `cid`.
+  // Returns 0 on success and -ENOENT when the collection does not exist,
+  // when the object has no onode, or when it is not a member.
+  dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl;
+
+  if (!_collection_exists(cid))
+    return -ENOENT;
+
+  Onode *on = get_onode(oid);
+  if (!on) return -ENOENT;
+
+  int r = -ENOENT;  // FIXME?
+  if (on->collections.count(cid)) {
+    // drop the membership from the onode and from the (collection, object) table
+    on->collections.erase(cid);
+    dirty_onode(on);
+    co_tab->remove(coll_object_t(cid,oid));
+    r = 0;
+  }
+
+  put_onode(on);
+  return r;
+}
+
+int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe)
+{
+  // Locked wrapper around _collection_remove().
+  //
+  // Returns the result of _collection_remove(): 0 on success, negative on
+  // failure.  On success `onsafe` (if any) is queued on the commit waiters
+  // of the current super_epoch; on failure it is deleted.
+  ebofs_lock.Lock();
+
+  int r = _collection_remove(cid, oid);
+
+  // set up commit waiter
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+
+  // Report the real outcome: on failure the callback has just been deleted,
+  // so a hard-coded 0 would claim success for an operation that did nothing.
+  return r;
+}
+
+int Ebofs::collection_list(coll_t cid, list<object_t>& ls)
+{
+  // Append every object in collection `cid` to `ls` (which is not cleared
+  // first).  Returns the number appended, or -ENOENT when the collection
+  // does not exist.  Takes ebofs_lock itself.
+  ebofs_lock.Lock();
+  dout(9) << "collection_list " << hex << cid << dec << endl;
+
+  if (!_collection_exists(cid)) {
+    ebofs_lock.Unlock();
+    return -ENOENT;
+  }
+
+  Table<coll_object_t, bool>::Cursor cursor(co_tab);
+
+  // Start at (cid, default object_t) and walk right until the collection
+  // half of the key changes or the cursor cannot move any further.
+  int count = 0;
+  if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) {
+    for (;;) {
+      const coll_t key_coll = cursor.current().key.first;
+      const object_t key_obj = cursor.current().key.second;
+      if (key_coll != cid) break;   // end!
+      dout(10) << "collection_list " << hex << cid << " includes " << key_obj << dec << endl;
+      ls.push_back(key_obj);
+      count++;
+      if (cursor.move_right() < 0) break;
+    }
+  }
+
+  ebofs_lock.Unlock();
+  return count;
+}
+
+
+int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size)
+{
+  // Set attribute `name` on collection `cid` to a copy of the `size` bytes
+  // at `value`.  Returns 0, or -ENOENT when no cnode is found.
+  dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl;
+
+  Cnode *cn = get_cnode(cid);
+  if (cn) {
+    string key(name);
+    cn->attr[key] = buffer::copy((char*)value, size);
+    dirty_cnode(cn);
+    put_cnode(cn);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe)
+{
+  // Locked wrapper around _collection_setattr().
+  //
+  // Returns the result of _collection_setattr(): 0 on success, negative on
+  // failure.  On success `onsafe` (if any) is queued on the commit waiters
+  // of the current super_epoch; on failure it is deleted.
+  ebofs_lock.Lock();
+  dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl;
+
+  int r = _collection_setattr(cid, name, value, size);
+
+  // set up commit waiter
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+
+  // Report the real outcome: on failure the callback has just been deleted,
+  // so a hard-coded 0 would claim success for an attribute that was not set.
+  return r;
+}
+
+int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size)
+{
+  // Copy attribute `name` of collection `cid` into `value`, at most `size`
+  // bytes.  Returns the number of bytes copied, -ENOENT when no cnode is
+  // found, or -1 when the attribute does not exist.
+  ebofs_lock.Lock();
+  // (the message used to say "collection_setattr", a copy-paste slip)
+  dout(10) << "collection_getattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl;
+
+  Cnode *cn = get_cnode(cid);
+  if (!cn) {
+    ebofs_lock.Unlock();
+    return -ENOENT;
+  }
+
+  string n(name);
+  int r;
+  if (cn->attr.count(n) == 0) {
+    r = -1;
+  } else {
+    // longer values are cut to the caller's buffer size
+    r = MIN( cn->attr[n].length(), size );
+    memcpy(value, cn->attr[n].c_str(), r);
+  }
+
+  put_cnode(cn);
+  ebofs_lock.Unlock();
+  return r;
+}
+
+int Ebofs::_collection_rmattr(coll_t cid, const char *name)
+{
+  // Erase attribute `name` from collection `cid`.
+  // Returns 0 (also when the attribute was not present), or -ENOENT when no
+  // cnode is found.
+  dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl;
+
+  Cnode *cn = get_cnode(cid);
+  if (cn) {
+    string key(name);
+    cn->attr.erase(key);
+
+    dirty_cnode(cn);
+    put_cnode(cn);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe)
+{
+  // Locked wrapper around _collection_rmattr().
+  //
+  // Returns the result of _collection_rmattr(): 0 on success, negative on
+  // failure.  On success `onsafe` (if any) is queued on the commit waiters
+  // of the current super_epoch; on failure it is deleted.
+  ebofs_lock.Lock();
+
+  int r = _collection_rmattr(cid, name);
+
+  // set up commit waiter
+  if (r >= 0) {
+    if (onsafe) commit_waiters[super_epoch].push_back(onsafe);
+  } else {
+    if (onsafe) delete onsafe;
+  }
+
+  ebofs_lock.Unlock();
+
+  // Report the real outcome: on failure the callback has just been deleted,
+  // so a hard-coded 0 would claim success for an operation that did nothing.
+  return r;
+}
+
+int Ebofs::collection_listattr(coll_t cid, vector<string>& attrs)
+{
+  // Replace `attrs` with the attribute names of collection `cid`.
+  // Returns 0, or -ENOENT when no cnode is found (attrs is then untouched).
+  ebofs_lock.Lock();
+  dout(10) << "collection_listattr " << hex << cid << dec << endl;
+
+  Cnode *cn = get_cnode(cid);
+  if (cn) {
+    attrs.clear();
+    map<string,bufferptr>::iterator it = cn->attr.begin();
+    while (it != cn->attr.end()) {
+      attrs.push_back(it->first);
+      it++;
+    }
+    put_cnode(cn);
+    ebofs_lock.Unlock();
+    return 0;
+  }
+
+  ebofs_lock.Unlock();
+  return -ENOENT;
+}
+
+
+
+void Ebofs::_export_freelist(bufferlist& bl)
+{
+  // Append every extent recorded in the free tables and in limbo_tab to `bl`
+  // as raw Extent structs (sizeof(Extent) bytes each).
+  //
+  // The loop runs one step past the last bucket on purpose: indices below
+  // EBOFS_NUM_FREE_BUCKETS select free_tab[b], the final index selects
+  // limbo_tab.
+  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+    Table<block_t,block_t> *tab;
+    if (b < EBOFS_NUM_FREE_BUCKETS) {
+      tab = free_tab[b];
+    } else {
+      tab = limbo_tab;
+    }
+
+    if (tab->get_num_keys() > 0) {
+      Table<block_t,block_t>::Cursor cursor(tab);
+
+      // Position the cursor outside of assert(): the argument of assert() is
+      // not evaluated when NDEBUG is defined, which would leave the cursor
+      // unpositioned for the walk below.
+      int found = tab->find(0, cursor);
+      assert(found >= 0);
+      (void)found;  // only read by the assert
+
+      while (1) {
+        assert(cursor.current().value > 0);
+
+        // key and value of each entry are passed to Extent's constructor
+        Extent ex(cursor.current().key, cursor.current().value);
+        dout(10) << "_export_freelist " << ex << endl;
+        bl.append((char*)&ex, sizeof(ex));
+        if (cursor.move_right() <= 0) break;
+      }
+    }
+  }
+}
+
+void Ebofs::_import_freelist(bufferlist& bl)
+{
+  // Rebuild the free tables from `bl`, read as a packed array of Extent
+  // structs.  Trailing bytes that do not make up a whole Extent are ignored.
+
+  // start from empty tables
+  for (int bucket=0; bucket<EBOFS_NUM_FREE_BUCKETS; bucket++) {
+    free_tab[bucket]->clear();
+  }
+  limbo_tab->clear();
+
+  // hand each extent to the allocator
+  const int count = bl.length() / sizeof(Extent);
+  Extent *cur = (Extent*)bl.c_str();
+  Extent *const stop = cur + count;
+  for (; cur < stop; ++cur) {
+    dout(10) << "_import_freelist " << *cur << endl;
+    allocator._release_loner(*cur);
+  }
+}
+
+// Fill `st` with fragmentation statistics.
+//
+// Free-space side: only the totals are filled in; the per-extent free
+// distribution code is disabled (commented out below), so free_extent_dist
+// stays empty and num_free_extent / avg_free_extent stay 0.
+// Used-space side: walks every object in object_tab and aggregates extent
+// counts and lengths from each onode's extent_map.
+// Holds ebofs_lock for the whole walk.
+void Ebofs::_get_frag_stat(FragmentationStat& st)
+{
+ ebofs_lock.Lock();
+
+ // free list is easy
+ st.total = dev.get_num_blocks();
+ st.total_free = get_free_blocks() + get_limbo_blocks();
+ st.free_extent_dist.clear();
+ st.num_free_extent = 0;
+ st.avg_free_extent = 0;
+/*
+ __uint64_t tfree = 0;
+ for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
+ Table<block_t,block_t> *tab;
+ if (b < EBOFS_NUM_FREE_BUCKETS) {
+ tab = free_tab[b];
+ dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl;
+ } else {
+ tab = limbo_tab;
+ dout(30) << "dump limbo " << tab->get_num_keys() << endl;;
+ }
+
+ if (tab->get_num_keys() > 0) {
+ Table<block_t,block_t>::Cursor cursor(tab);
+ assert(tab->find(0, cursor) >= 0);
+ while (1) {
+ assert(cursor.current().value > 0);
+
+ block_t l = cursor.current().value;
+ tfree += l;
+ int b = 0;
+ do {
+ l = l >> 1;
+ b++;
+ } while (l);
+ st.free_extent_dist[b]++;
+ st.free_extent_dist_sum[b] += cursor.current().value;
+ st.num_free_extent++;
+
+ if (cursor.move_right() <= 0) break;
+ }
+ }
+ }
+ st.avg_free_extent = tfree / st.num_free_extent;
+*/
+
+ // used extents is harder. :(
+ st.num_extent = 0;
+ st.avg_extent = 0;
+ st.extent_dist.clear();
+ st.extent_dist_sum.clear();
+ st.avg_extent_per_object = 0;
+ st.avg_extent_jump = 0;
+
+ Table<object_t,Extent>::Cursor cursor(object_tab);
+ object_tab->find(object_t(), cursor);
+ int nobj = 0; // objects visited
+ int njump = 0; // extents whose extent_map key is > 0
+ // the condition only guards against an empty table; the loop itself ends
+ // via the move_right() check at the bottom
+ while (object_tab->get_num_keys() > 0) {
+ Onode *on = get_onode(cursor.current().key);
+ assert(on);
+
+ nobj++;
+ // running sums here; turned into averages after the loop
+ st.avg_extent_per_object += on->extent_map.size();
+
+ for (map<block_t,Extent>::iterator p = on->extent_map.begin();
+ p != on->extent_map.end();
+ p++) {
+ block_t l = p->second.length;
+
+ st.num_extent++;
+ st.avg_extent += l;
+ if (p->first > 0) {
+ njump++;
+ st.avg_extent_jump += l;
+ }
+
+ // histogram bucket b = number of right shifts needed to bring the
+ // length down to zero (always at least 1, since this is a do/while)
+ int b = 0;
+ do {
+ l = l >> 1;
+ b++;
+ } while (l);
+ st.extent_dist[b]++;
+ st.extent_dist_sum[b] += p->second.length;
+ }
+ put_onode(on);
+ if (cursor.move_right() <= 0) break;
+ }
+ // convert sums to averages, skipping any division by zero
+ if (njump) st.avg_extent_jump /= njump;
+ if (nobj) st.avg_extent_per_object /= (float)nobj;
+ if (st.num_extent) st.avg_extent /= st.num_extent;
+
+ ebofs_lock.Unlock();
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "include/Context.h"
+#include "include/buffer.h"
+
+// Stream a pair as "first,second" (no surrounding brackets, no spaces).
+template<typename U,typename V>
+inline ostream& operator<<(ostream& out, const pair<U,V>& p) {
+  out << p.first;
+  out << ",";
+  out << p.second;
+  return out;
+}
+
+#include "types.h"
+#include "Onode.h"
+#include "Cnode.h"
+#include "BlockDevice.h"
+#include "nodes.h"
+#include "Allocator.h"
+#include "Table.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+
+#include "osd/ObjectStore.h"
+
+//typedef pair<object_t,coll_t> object_coll_t;
+typedef pair<coll_t,object_t> coll_object_t;
+
+
+class Ebofs : public ObjectStore {
+ protected:
+ Mutex ebofs_lock; // a beautiful global lock
+
+ // ** debuggy **
+ bool fake_writes;
+
+ // ** super **
+ BlockDevice dev;
+ bool mounted, unmounting, dirty;
+ bool readonly;
+ version_t super_epoch;
+ bool commit_thread_started, mid_commit;
+ Cond commit_cond; // to wake up the commit thread
+ Cond sync_cond;
+
+ map<version_t, list<Context*> > commit_waiters;
+
+ void prepare_super(version_t epoch, bufferptr& bp);
+ void write_super(version_t epoch, bufferptr& bp);
+ int commit_thread_entry();
+
+ class CommitThread : public Thread {
+ Ebofs *ebofs;
+ public:
+ CommitThread(Ebofs *e) : ebofs(e) {}
+ void *entry() {
+ ebofs->commit_thread_entry();
+ return 0;
+ }
+ } commit_thread;
+
+
+
+
+ // ** allocator **
+ block_t free_blocks, limbo_blocks;
+ Allocator allocator;
+ friend class Allocator;
+
+ block_t get_free_blocks() { return free_blocks; }
+ block_t get_limbo_blocks() { return limbo_blocks; }
+ block_t get_free_extents() {
+ int n = 0;
+ for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+ n += free_tab[i]->get_num_keys();
+ return n;
+ }
+ block_t get_limbo_extents() { return limbo_tab->get_num_keys(); }
+
+
+ // ** tables and sets **
+ // nodes
+ NodePool nodepool; // for all tables...
+
+ // tables
+ Table<object_t, Extent> *object_tab;
+ Table<block_t,block_t> *free_tab[EBOFS_NUM_FREE_BUCKETS];
+ Table<block_t,block_t> *limbo_tab;
+ Table<block_t,pair<block_t,int> > *alloc_tab;
+
+ // collections
+ Table<coll_t, Extent> *collection_tab;
+ Table<coll_object_t, bool> *co_tab;
+
+ void close_tables();
+
+
+ // ** onodes **
+ hash_map<object_t, Onode*> onode_map; // onode cache
+ LRU onode_lru;
+ set<Onode*> dirty_onodes;
+ map<object_t, list<Cond*> > waitfor_onode;
+
+ Onode* new_onode(object_t oid); // make new onode. ref++.
+ Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++.
+ void remove_onode(Onode *on);
+ void put_onode(Onode* o); // put it back down. ref--.
+ void dirty_onode(Onode* o);
+ void encode_onode(Onode *on, bufferlist& bl, unsigned& off);
+ void write_onode(Onode *on);
+
+ // ** cnodes **
+ hash_map<coll_t, Cnode*> cnode_map;
+ LRU cnode_lru;
+ set<Cnode*> dirty_cnodes;
+ map<coll_t, list<Cond*> > waitfor_cnode;
+
+ Cnode* new_cnode(coll_t cid);
+ Cnode* get_cnode(coll_t cid);
+ void remove_cnode(Cnode *cn);
+ void put_cnode(Cnode *cn);
+ void dirty_cnode(Cnode *cn);
+ void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off);
+ void write_cnode(Cnode *cn);
+
+ // ** onodes+cnodes = inodes **
+ int inodes_flushing;
+ Cond inode_commit_cond;
+
+ void flush_inode_finish();
+ void commit_inodes_start();
+ void commit_inodes_wait();
+ friend class C_E_InodeFlush;
+
+ void trim_inodes(int max = -1);
+
+ // ** buffer cache **
+ BufferCache bc;
+ pthread_t flushd_thread_id;
+
+ version_t trigger_commit();
+ void commit_bc_wait(version_t epoch);
+ void trim_bc(off_t max = -1);
+
+ public:
+ void kick_idle();
+ void sync();
+ void sync(Context *onsafe);
+ void trim_buffer_cache();
+
+ class IdleKicker : public BlockDevice::kicker {
+ Ebofs *ebo;
+ public:
+ IdleKicker(Ebofs *t) : ebo(t) {}
+ void kick() { ebo->kick_idle(); }
+ } idle_kicker;
+
+
+ protected:
+ //void zero(Onode *on, size_t len, off_t off, off_t write_thru);
+ void alloc_write(Onode *on,
+ block_t start, block_t len,
+ interval_set<block_t>& alloc,
+ block_t& old_bfirst, block_t& old_blast);
+ void apply_write(Onode *on, off_t off, size_t len, bufferlist& bl);
+ bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl,
+ Cond *will_wait_on, bool *will_wait_on_bool);
+
+ // ** finisher **
+ // async write notification to users
+ Mutex finisher_lock;
+ Cond finisher_cond;
+ bool finisher_stop;
+ list<Context*> finisher_queue;
+
+ void *finisher_thread_entry();
+ class FinisherThread : public Thread {
+ Ebofs *ebofs;
+ public:
+ FinisherThread(Ebofs *e) : ebofs(e) {}
+ void* entry() { return (void*)ebofs->finisher_thread_entry(); }
+ } finisher_thread;
+
+
+ void alloc_more_node_space();
+
+ void do_csetattrs(map<coll_t, map<const char*, pair<void*,int> > > &cmods);
+ void do_setattrs(Onode *on, map<const char*, pair<void*,int> > &setattrs);
+
+
+ public:
+ Ebofs(char *devfn) :
+ fake_writes(false),
+ dev(devfn),
+ mounted(false), unmounting(false), dirty(false), readonly(false),
+ super_epoch(0), commit_thread_started(false), mid_commit(false),
+ commit_thread(this),
+ free_blocks(0), limbo_blocks(0),
+ allocator(this),
+ nodepool(ebofs_lock),
+ object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0),
+ onode_lru(g_conf.ebofs_oc_size),
+ cnode_lru(g_conf.ebofs_cc_size),
+ inodes_flushing(0),
+ bc(dev, ebofs_lock),
+ idle_kicker(this),
+ finisher_stop(false), finisher_thread(this) {
+ for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
+ free_tab[i] = 0;
+ }
+ ~Ebofs() {
+ }
+
+ int mkfs();
+ int mount();
+ int umount();
+
+ int statfs(struct statfs *buf);
+
+ // atomic transaction
+ unsigned apply_transaction(Transaction& t, Context *onsafe=0);
+
+ int pick_object_revision_lt(object_t& oid);
+
+ // object interface
+ bool exists(object_t);
+ int stat(object_t, struct stat*);
+ int read(object_t, off_t off, size_t len, bufferlist& bl);
+ int is_cached(object_t oid, off_t off, size_t len);
+
+ int write(object_t oid, off_t off, size_t len, bufferlist& bl, Context *onsafe);
+ int truncate(object_t oid, off_t size, Context *onsafe=0);
+ int truncate_front(object_t oid, off_t size, Context *onsafe=0);
+ int remove(object_t oid, Context *onsafe=0);
+ bool write_will_block();
+
+ int rename(object_t from, object_t to);
+ int clone(object_t from, object_t to, Context *onsafe);
+
+
+ // object attr
+ int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0);
+ int setattrs(object_t oid, map<string,bufferptr>& attrset, Context *onsafe=0);
+ int getattr(object_t oid, const char *name, void *value, size_t size);
+ int getattrs(object_t oid, map<string,bufferptr> &aset);
+ int rmattr(object_t oid, const char *name, Context *onsafe=0);
+ int listattr(object_t oid, vector<string>& attrs);
+
+ // collections
+ int list_collections(list<coll_t>& ls);
+ bool collection_exists(coll_t c);
+
+ int create_collection(coll_t c, Context *onsafe);
+ int destroy_collection(coll_t c, Context *onsafe);
+ int collection_add(coll_t c, object_t o, Context *onsafe);
+ int collection_remove(coll_t c, object_t o, Context *onsafe);
+
+ int collection_list(coll_t c, list<object_t>& o);
+
+ int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context *onsafe);
+ int collection_getattr(coll_t oid, const char *name, void *value, size_t size);
+ int collection_rmattr(coll_t cid, const char *name, Context *onsafe);
+ int collection_listattr(coll_t oid, vector<string>& attrs);
+
+ // maps
+ int map_lookup(object_t o, bufferlist& key, bufferlist& val);
+ int map_insert(object_t o, bufferlist& key, bufferlist& val);
+ int map_remove(object_t o, bufferlist& key);
+ int map_list(object_t o, list<bufferlist>& keys);
+ int map_list(object_t o, map<bufferlist,bufferlist>& vals);
+ int map_list(object_t o,
+ bufferlist& start, bufferlist& end,
+ map<bufferlist,bufferlist>& vals);
+
+ // crap
+ void _fake_writes(bool b) { fake_writes = b; }
+ void _get_frag_stat(FragmentationStat& st);
+
+ void _import_freelist(bufferlist& bl);
+ void _export_freelist(bufferlist& bl);
+
+
+private:
+ // private interface -- use if caller already holds lock
+ int _read(object_t oid, off_t off, size_t len, bufferlist& bl);
+ int _is_cached(object_t oid, off_t off, size_t len);
+ int _stat(object_t oid, struct stat *st);
+ int _getattr(object_t oid, const char *name, void *value, size_t size);
+ int _getattrs(object_t oid, map<string,bufferptr> &aset);
+
+ bool _write_will_block();
+ int _write(object_t oid, off_t off, size_t len, bufferlist& bl);
+ int _truncate(object_t oid, off_t size);
+ int _truncate_front(object_t oid, off_t size);
+ int _remove(object_t oid);
+ int _clone(object_t from, object_t to);
+ int _setattr(object_t oid, const char *name, const void *value, size_t size);
+ int _setattrs(object_t oid, map<string,bufferptr>& attrset);
+ int _rmattr(object_t oid, const char *name);
+ bool _collection_exists(coll_t c);
+ int _create_collection(coll_t c);
+ int _destroy_collection(coll_t c);
+ int _collection_add(coll_t c, object_t o);
+ int _collection_remove(coll_t c, object_t o);
+ int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size);
+ int _collection_rmattr(coll_t cid, const char *name);
+
+
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_ONODE_H
+#define __EBOFS_ONODE_H
+
+#include "include/lru.h"
+
+#include "types.h"
+#include "BufferCache.h"
+
+#include "include/interval_set.h"
+
+
+/*
+ * object node (like an inode)
+ *
+ * holds object metadata, including
+ * size
+ * allocation (extent list)
+ * attributes
+ *
+ */
+
+class Onode : public LRUObject {
+private:
+ int ref;
+
+public:
+ object_t object_id;
+ version_t version; // incremented on each modify.
+
+ // data
+ bool readonly;
+ Extent onode_loc;
+ off_t object_size;
+ unsigned object_blocks;
+
+ // onode
+ set<coll_t> collections;
+ map<string, bufferptr> attr;
+ //vector<Extent> extents;
+ map<block_t, Extent> extent_map;
+
+ interval_set<block_t> uncommitted;
+
+ ObjectCache *oc;
+
+ bool dirty;
+ bool dangling; // not in onode_map
+ bool deleted; // deleted
+
+ list<Context*> commit_waiters;
+
+ public:
+ // Build a clean, unreferenced onode for oid: version 0, zero size and
+ // block count, no object cache, and none of the dirty/dangling/deleted
+ // flags set.
+ Onode(object_t oid) : ref(0), object_id(oid), version(0),
+ readonly(false),
+ object_size(0), object_blocks(0), oc(0),
+ dirty(false), dangling(false), deleted(false) {
+ // only the length is reset here; onode_loc.start is not assigned
+ onode_loc.length = 0;
+ }
+ ~Onode() {
+ if (oc) delete oc;
+ }
+
+ block_t get_onode_id() { return onode_loc.start; }
+ int get_onode_len() { return onode_loc.length; }
+
+ int get_ref_count() { return ref; }
+ // Take a reference; the 0 -> 1 transition pins this onode in the LRU.
+ void get() {
+   const bool was_unreferenced = (ref == 0);
+   if (was_unreferenced)
+     lru_pin();
+   ++ref;
+ }
+ // Drop a reference; the 1 -> 0 transition unpins this onode from the LRU.
+ void put() {
+   --ref;
+   if (ref != 0)
+     return;
+   lru_unpin();
+ }
+
+ // Flag the onode dirty.  A dirty onode holds one reference on itself,
+ // taken only on the clean -> dirty transition.
+ void mark_dirty() {
+   if (dirty)
+     return;
+   dirty = true;
+   get();
+ }
+ // Clear the dirty flag and release the reference taken by mark_dirty().
+ // Does nothing if the onode is already clean.
+ void mark_clean() {
+   if (!dirty)
+     return;
+   dirty = false;
+   put();
+ }
+ bool is_dirty() { return dirty; }
+ bool is_deleted() { return deleted; }
+ bool is_dangling() { return dangling; }
+
+
+ bool have_oc() {
+ return oc != 0;
+ }
+ // Return this onode's ObjectCache, creating it on first use.
+ // Creation takes one reference on the new cache and one on the onode.
+ ObjectCache *get_oc(BufferCache *bc) {
+   if (oc)
+     return oc;
+
+   oc = new ObjectCache(object_id, this, bc);
+   oc->get();
+   get();
+   return oc;
+ }
+ // Detach the ObjectCache, if any.  The cache must already be empty
+ // (asserted).  It is deleted when our put() drops its count to zero, and
+ // the onode reference taken in get_oc() is released.
+ void close_oc() {
+   if (!oc)
+     return;
+
+   assert(oc->is_empty());
+   const bool last_ref = (oc->put() == 0);
+   if (last_ref)
+     delete oc;
+   oc = 0;
+   put();
+ }
+
+
+ // allocation
+ // Debug consistency check of extent_map, currently switched off.
+ // When enabled it asserts that the extents are keyed contiguously from
+ // logical block 0, that no disk block is mapped twice, and that the total
+ // equals object_blocks.
+ void verify_extents() {
+   const bool sanity_checks_enabled = false;   // flip by hand when debugging
+   if (!sanity_checks_enabled)
+     return;
+
+   block_t expected_offset = 0;
+   set<block_t> seen;
+   cout << "verifying" << endl;
+
+   map<block_t,Extent>::iterator it = extent_map.begin();
+   while (it != extent_map.end()) {
+     Extent& e = it->second;
+     cout << " " << it->first << ": " << e << endl;
+     assert(expected_offset == it->first);
+     expected_offset += e.length;
+     for (unsigned k = 0; k < e.length; k++) {
+       assert(seen.count(e.start + k) == 0);
+       seen.insert(e.start + k);
+     }
+     ++it;
+   }
+
+   assert(seen.size() == expected_offset);
+   assert(expected_offset == object_blocks);
+ }
+ // Map logical blocks [offset, offset+ex.length) of this object onto the
+ // disk extent ex.  offset may not lie past the current end (asserted).
+ //  - Writing at exactly object_blocks appends; if ex.start equals the
+ //    end() of the last extent, that extent is simply lengthened.
+ //  - Otherwise every existing mapping overlapped by the new range is
+ //    trimmed, split, or erased before ex is installed at key offset.
+ // object_blocks grows when the new range reaches past the old end.
+ // The disk blocks displaced from the map are not reported to the caller.
+ void set_extent(block_t offset, Extent ex) {
+ //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl;
+ assert(offset <= object_blocks);
+ verify_extents();
+
+ // at the end?
+ if (offset == object_blocks) {
+ //cout << " appending " << ex << endl;
+ if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) {
+ //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl;
+ extent_map.rbegin()->second.length += ex.length;
+ } else
+ extent_map[object_blocks] = ex;
+ object_blocks += ex.length;
+ return;
+ }
+
+ // removing any extent bits we overwrite
+ if (!extent_map.empty()) {
+ // preceding extent?
+ map<block_t,Extent>::iterator p = extent_map.lower_bound(offset);
+ if (p != extent_map.begin()) {
+ p--;
+ if (p->first + p->second.length > offset) {
+ //cout << " preceeding was " << p->second << endl;
+ if (p->first + p->second.length > offset+ex.length) {
+ // cutting chunk out of middle, add last bit
+ Extent &n = extent_map[offset+ex.length] = p->second;
+ n.start += offset+ex.length - p->first;
+ n.length -= offset+ex.length - p->first;
+ //cout << " tail frag is " << n << endl;
+ }
+ p->second.length = offset - p->first; // cut tail off preceding extent
+ //cout << " preceeding now " << p->second << endl;
+ }
+ // step forward again: p is now the first entry keyed after the preceding one
+ p++;
+ }
+
+ // overlapping extents
+ while (p != extent_map.end() &&
+ p->first < offset + ex.length) {
+ // remember the successor before p is possibly erased
+ map<block_t,Extent>::iterator next = p;
+ next++;
+
+ // completely subsumed?
+ if (p->first + p->second.length <= offset+ex.length) {
+ //cout << " erasing " << p->second << endl;
+ extent_map.erase(p);
+ p = next;
+ continue;
+ }
+
+ // spans new extent, cut off head
+ Extent &n = extent_map[ offset+ex.length ] = p->second;
+ //cout << " cut head off " << p->second;
+ n.start += offset+ex.length - p->first;
+ n.length -= offset+ex.length - p->first;
+ extent_map.erase(p);
+ //cout << ", now " << n << endl;
+ // this extent reached past the new range, so nothing later can overlap
+ break;
+ }
+ }
+
+ extent_map[ offset ] = ex;
+
+ // extend object?
+ if (offset + ex.length > object_blocks)
+ object_blocks = offset+ex.length;
+
+ verify_extents();
+ }
+
+
+ /* map_extents(start, len, ls)
+ * map the given page range into extents on disk.
+ */
+ // Translate the logical block range starting at start, len blocks long,
+ // into the on-disk extents backing it, appended to ls in logical order.
+ // Stops early if the extent map runs out.  Always returns 0.
+ int map_extents(block_t start, block_t len, vector<Extent>& ls) {
+   verify_extents();
+
+   map<block_t,Extent>::iterator cur = extent_map.lower_bound(start);
+
+   // No extent is keyed exactly at start, so the range may begin part-way
+   // through the extent keyed just before it.
+   const bool look_behind =
+     cur != extent_map.begin() &&
+     (cur == extent_map.end() || (cur->first > start && cur->first != 0));
+   if (look_behind) {
+     --cur;
+     const Extent& prev = cur->second;
+     const block_t skip = start - cur->first;   // blocks of prev before start
+     if (prev.length > skip) {
+       Extent piece;
+       piece.start = prev.start + skip;
+       piece.length = MIN(len, prev.length - skip);
+       ls.push_back(piece);
+
+       start += piece.length;
+       len -= piece.length;
+     }
+     ++cur;
+   }
+
+   // Every remaining extent must begin exactly where the previous one ended.
+   for (; len > 0 && cur != extent_map.end(); ++cur) {
+     assert(cur->first == start);
+     Extent piece = cur->second;
+     piece.length = MIN(len, piece.length);
+     ls.push_back(piece);
+     start += piece.length;
+     len -= piece.length;
+   }
+
+   return 0;
+ }
+
+ // Shrink the mapping to its first len logical blocks.  Disk extents that
+ // fall past the cut are appended to extra; an extent straddling the cut is
+ // shortened and its tail reported.  object_blocks is set to len.
+ // Always returns 0.
+ int truncate_extents(block_t len, vector<Extent>& extra) {
+   verify_extents();
+
+   map<block_t,Extent>::iterator cur = extent_map.lower_bound(len);
+
+   // No extent is keyed exactly at len: the one before it may straddle the cut.
+   const bool look_behind =
+     cur != extent_map.begin() &&
+     (cur == extent_map.end() || (cur->first > len && cur->first != 0));
+   if (look_behind) {
+     --cur;
+     Extent& straddler = cur->second;
+     if (straddler.length > len - cur->first) {
+       Extent cut;
+       cut.start = straddler.start + (len - cur->first);
+       cut.length = straddler.length - (len - cur->first);
+       extra.push_back(cut);
+
+       straddler.length = len - cur->first;
+       assert(straddler.length > 0);
+     }
+     ++cur;
+   }
+
+   // Everything from here on lies wholly past the cut.
+   while (cur != extent_map.end()) {
+     assert(cur->first >= len);
+     extra.push_back(cur->second);
+     extent_map.erase(cur++);
+   }
+
+   object_blocks = len;
+   verify_extents();
+   return 0;
+ }
+
+ // Drop the first len logical blocks of the object.
+ //
+ // The disk extents that backed the removed blocks are appended to extra.
+ // If the cut lands inside an extent, that extent is advanced past the cut
+ // and only its front part is reported.  Surviving extents are re-keyed so
+ // the map again starts at logical block 0, which is the layout that
+ // verify_extents() checks and set_extent()/map_extents() rely on.
+ // object_blocks is reduced by the number of blocks actually removed; if
+ // len exceeds what is mapped, everything is removed and the loop stops.
+ // Always returns 0.
+ int truncate_front_extents(block_t len, vector<Extent>& extra) {
+   verify_extents();
+
+   // Count removed blocks separately from len: the old code consumed len
+   // inside the loop and then subtracted the leftover from object_blocks.
+   block_t removed = 0;
+   while (removed < len && !extent_map.empty()) {
+     Extent& ex = extent_map.begin()->second;  // reference: edits land in the map
+     const block_t want = len - removed;
+     if (ex.length > want) {
+       // partial first extent
+       Extent frontbit( ex.start, want );
+       extra.push_back(frontbit);
+       ex.length -= want;
+       ex.start += want;
+       removed += want;
+       break;
+     }
+
+     // pull off entire first extent.
+     removed += ex.length;
+     extra.push_back(ex);
+     extent_map.erase(extent_map.begin());
+   }
+
+   // Shift the surviving keys down by the amount removed.  A partially
+   // trimmed first extent still has its old key (below removed) and must
+   // land on 0.
+   if (removed > 0 && !extent_map.empty()) {
+     map<block_t,Extent> shifted;
+     for (map<block_t,Extent>::iterator p = extent_map.begin();
+          p != extent_map.end();
+          ++p) {
+       const block_t newkey = (p->first > removed) ? (p->first - removed) : 0;
+       shifted[newkey] = p->second;
+     }
+     extent_map.swap(shifted);
+   }
+
+   object_blocks -= removed;
+   verify_extents();
+   return 0;
+ }
+
+
+
+ /* map_alloc_regions(start, len, map)
+ * map range into regions that need to be (re)allocated on disk
+ * because they overlap "safe" (or unallocated) parts of the object
+ */
+ /*
+ void map_alloc_regions(block_t start, block_t len,
+ interval_set<block_t>& alloc) {
+ interval_set<block_t> already_uncom;
+
+ alloc.insert(start, len); // start with whole range
+ already_uncom.intersection_of(alloc, uncommitted);
+ alloc.subtract(already_uncom); // take out the bits that aren't yet committed
+ }
+ */
+
+
+
+ // pack/unpack
+ int get_collection_bytes() {
+ return sizeof(coll_t) * collections.size();
+ }
+ // Bytes counted for all attributes: each name plus one extra byte, and
+ // each value plus sizeof(int).
+ int get_attr_bytes() {
+   int total = 0;
+   map<string, bufferptr>::iterator a = attr.begin();
+   while (a != attr.end()) {
+     total += a->first.length() + 1;
+     total += a->second.length() + sizeof(int);
+     ++a;
+   }
+   return total;
+ }
+ int get_extent_bytes() {
+ return sizeof(Extent) * extent_map.size();
+ }
+
+};
+
+
+// Print a one-line summary of an onode: id (hex), size, ref count, state
+// flags, and the uncommitted block set.
+inline ostream& operator<<(ostream& out, Onode& on)
+{
+  out << "onode(" << hex << on.object_id << dec
+      << " len=" << on.object_size
+      << " ref=" << on.get_ref_count();
+
+  if (on.is_dirty())
+    out << " dirty";
+  if (on.is_dangling())
+    out << " dangling";
+  if (on.is_deleted())
+    out << " deleted";
+
+  out << " uncom=" << on.uncommitted;
+  out << ")";
+  return out;
+}
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_TABLE_H
+#define __EBOFS_TABLE_H
+
+#include "types.h"
+#include "nodes.h"
+
+/** table **/
+
+#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")."
+
+
+template<class K, class V>
+class Table {
+ private:
+ NodePool &pool;
+
+ nodeid_t root;
+ int nkeys;
+ int depth;
+
+ public:
+ Table(NodePool &p,
+ struct ebofs_table& bts) :
+ pool(p),
+ root(bts.root), nkeys(bts.num_keys), depth(bts.depth) {
+ dbtout << "cons" << endl;
+ }
+
+ nodeid_t get_root() { return root; }
+ int get_num_keys() { return nkeys; }
+ int get_depth() { return depth; }
+
+
+ /*
+ */
+ class _IndexItem { // i just need a struct size for below
+ K k;
+ nodeid_t n;
+ };
+ class IndexItem {
+ public:
+ K key;
+ nodeid_t node;
+ static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem));
+ static const int MIN = MAX/2;
+ };
+ class _LeafItem { // i just need a struct size for below
+ K k;
+ V v;
+ };
+ class LeafItem {
+ public:
+ K key;
+ V value;
+ static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem));
+ static const int MIN = MAX/2;
+ };
+
+ class Nodeptr {
+ public:
+ Node *node;
+
+ Nodeptr() : node(0) {}
+ Nodeptr(Node *n) : node(n) {}
+ Nodeptr& operator=(Node *n) {
+ node = n;
+ return *this;
+ }
+
+ LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; }
+ IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; }
+ // Key stored in slot i, whichever kind of node this is.
+ K key(int i) {
+   return node->is_index() ? index_item(i).key : leaf_item(i).key;
+ }
+
+ bool is_leaf() { return node->is_leaf(); }
+ bool is_index() { return node->is_index(); }
+ void set_type(int t) { node->set_type(t); }
+
+ int max_items() const {
+ if (node->is_leaf())
+ return LeafItem::MAX;
+ else
+ return IndexItem::MAX;
+ }
+ int min_items() const { return max_items() / 2; }
+
+ nodeid_t get_id() { return node->get_id(); }
+
+ int size() { return node->size(); }
+ void set_size(int s) { node->set_size(s); }
+
+ // Delete slot p by sliding every later item one slot left, then shrink
+ // the node by one.
+ void remove_at_pos(int p) {
+   const bool is_idx = node->is_index();
+   for (int slot = p; slot < size()-1; slot++) {
+     if (is_idx)
+       index_item(slot) = index_item(slot+1);
+     else
+       leaf_item(slot) = leaf_item(slot+1);
+   }
+   set_size(size() - 1);
+ }
+ void insert_at_leaf_pos(int p, K key, V value) {
+ assert(is_leaf());
+ for (int i=size(); i>p; i--)
+ leaf_item(i) = leaf_item(i-1);
+ leaf_item(p).key = key;
+ leaf_item(p).value = value;
+ set_size(size() + 1);
+ }
+ void insert_at_index_pos(int p, K key, nodeid_t node) {
+ assert(is_index());
+ for (int i=size(); i>p; i--)
+ index_item(i) = index_item(i-1);
+ index_item(p).key = key;
+ index_item(p).node = node;
+ set_size(size() + 1);
+ }
+
+ void append_item(LeafItem& i) {
+ leaf_item(size()) = i;
+ set_size(size() + 1);
+ }
+ void append_item(IndexItem& i) {
+ index_item(size()) = i;
+ set_size(size() + 1);
+ }
+
+ // Move the items from slot min_items() onward to the end of right,
+ // leaving min_items() items in this node.
+ void split(Nodeptr& right) {
+   const bool is_idx = node->is_index();
+   for (int slot = min_items(); slot < size(); slot++) {
+     if (is_idx)
+       right.append_item( index_item(slot) );
+     else
+       right.append_item( leaf_item(slot) );
+   }
+   set_size(min_items());
+ }
+
+ // Append every item of right to this node and leave right with size 0.
+ // No capacity check is made here.
+ void merge(Nodeptr& right) {
+   const bool is_idx = node->is_index();
+   for (int slot = 0; slot < right.size(); slot++) {
+     if (is_idx)
+       append_item( right.index_item(slot) );
+     else
+       append_item( right.leaf_item(slot) );
+   }
+   right.set_size(0);
+ }
+
+ };
+
+ /*
+ */
+ class Cursor {
+ protected:
+ public:
+ static const int MATCH = 1; // on key
+ static const int INSERT = 0; // before key
+ static const int OOB = -1; // at end
+
+ Table *table;
+ vector<Nodeptr> open; // open nodes
+ vector<int> pos; // position within the node
+ //Nodeptr open[20];
+ //int pos[20];
+ int level;
+
+ Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {}
+
+ public:
+
+ const LeafItem& current() {
+ assert(open[level].is_leaf());
+ return open[level].leaf_item(pos[level]);
+ }
+ V& dirty_current_value() {
+ assert(open[level].is_leaf());
+ dirty();
+ return open[level].leaf_item(pos[level]).value;
+ }
+
+ // ** read-only bits **
+ // Step the cursor to the previous item.
+ // Returns OOB when the table depth is 0 or the cursor is already on the
+ // first item; returns 1 after a successful move.
+ int move_left() {
+ if (table->depth == 0) return OOB;
+
+ // work up around branch
+ int l;
+ for (l = level; l >= 0; l--)
+ if (pos[l] > 0) break;
+ if (l < 0)
+ return OOB; // we are the first item in the btree
+
+ // move left one
+ pos[l]--;
+
+ // work back down right side
+ // (re-open each child along the way and park on its last slot)
+ for (; l<level; l++) {
+ open[l+1] = table->pool.get_node( open[l].index_item(pos[l]).node );
+ pos[l+1] = open[l+1].size() - 1;
+ }
+ return 1;
+ }
+ // Step the cursor to the next item.
+ // Returns 1 after moving onto an item, INSERT (0) when the cursor steps
+ // off the last item into the append position, and OOB (-1) when the table
+ // depth is 0 or the cursor is already in the append position.
+ int move_right() {
+   if (table->depth == 0)
+     return OOB;
+
+   // climb until some level still has an entry to our right
+   int l = level;
+   while (l >= 0 && !(pos[l] < open[l].size() - 1))
+     l--;
+
+   if (l < 0) {
+     /* we are at last item in btree. */
+     if (!(pos[level] < open[level].size()))
+       return OOB;
+     pos[level]++;   /* move into add position! */
+     return INSERT;
+   }
+
+   /* move right one */
+   assert( pos[l] < open[l].size() );
+   pos[l]++;
+
+   /* work back down, taking the leftmost slot of each child */
+   while (l < level) {
+     open[l+1] = table->pool.get_node( open[l].index_item(pos[l]).node );
+     pos[l+1] = 0;
+     l++;
+   }
+   return 1;
+ }
+
+ // ** modifications **
+ // Dirty the node under the cursor and every ancestor up to the root.
+ // After each node is dirtied its id is written back into the parent's
+ // index slot (or into table->root for level 0).
+ // NOTE(review): the write-back suggests dirty_node() can change a node's
+ // id -- confirm in NodePool.
+ void dirty() {
+ for (int l=level; l>=0; l--) {
+ if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too)
+
+ table->pool.dirty_node(open[l].node);
+ if (l > 0)
+ open[l-1].index_item( pos[l-1] ).node = open[l].get_id();
+ else
+ table->root = open[0].get_id();
+ }
+ }
+ private:
+ // After a change at slot 0 of the current node, copy that node's new
+ // first key into the ancestors' index entries.  The walk stops at the
+ // first ancestor whose entry already holds the key, or right after
+ // updating an ancestor where the cursor is not at slot 0.
+ void repair_parents() {
+ // did i make a change at the start of a node?
+ if (pos[level] == 0) {
+ K key = open[level].key(0); // new key parents should have
+ for (int j=level-1; j>=0; j--) {
+ if (open[j].index_item(pos[j]).key == key)
+ break; /* it's the same key, we can stop fixing */
+ open[j].index_item(pos[j]).key = key;
+ if (pos[j] > 0) break; /* last in position 0.. */
+ }
+ }
+ }
+
+ public:
+ void remove() {
+ dirty();
+
+ // remove from node
+ open[level].remove_at_pos( pos[level] );
+ repair_parents();
+
+ // was it a key?
+ if (level == table->depth-1)
+ table->nkeys--;
+ }
+
+ void insert(K key, V value) {
+ dirty();
+
+ // insert
+ open[level].insert_at_leaf_pos(pos[level], key, value);
+ repair_parents();
+
+ // was it a key?
+ if (level == table->depth-1)
+ table->nkeys++;
+ }
+
+ // Move the first item of the current node to the end of its left sibling.
+ // Returns -1, changing nothing, if the cursor is at the root, there is no
+ // left sibling under the same parent, or that sibling is full.
+ // Returns 0 on success.
+ // NOTE(review): Table::insert() has its call to this disabled with a
+ // comment about an unlocated bug; the pos[level] == 0 case below is one
+ // place to look, since the cursor then stays on a slot whose old item has
+ // moved into the sibling.
+ int rotate_left() {
+ if (level == 0) return -1; // i am root
+ if (pos[level-1] == 0) return -1; // nothing to left
+
+ Nodeptr here = open[level];
+ Nodeptr parent = open[level-1];
+ Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node );
+ if (left.size() == left.max_items()) return -1; // it's full
+
+ // make both dirty
+ dirty();
+ if (!left.node->is_dirty()) {
+ table->pool.dirty_node(left.node);
+ parent.index_item(pos[level-1]-1).node = left.get_id();
+ }
+
+ dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl;
+
+ /* add */
+ if (here.node->is_leaf())
+ left.append_item(here.leaf_item(0));
+ else
+ left.append_item(here.index_item(0));
+
+ /* remove */
+ here.remove_at_pos(0);
+
+ /* fix parent index for me */
+ parent.index_item( pos[level-1] ).key = here.key(0);
+ // we never have to update past immediate parent, since we're not at pos 0
+
+ /* adjust cursor */
+ if (pos[level] > 0)
+ pos[level]--;
+ //else
+ //assert(1); /* if we were positioned here, we're equal */
+ /* if it was 0, then the shifted item == our key, and we can stay here safely. */
+ return 0;
+ }
+ int rotate_right() {
+ if (level == 0) return -1; // i am root
+ if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right
+
+ Nodeptr here = open[level];
+ Nodeptr parent = open[level-1];
+ Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node );
+ if (right.size() == right.max_items()) return -1; // it's full
+
+ // make both dirty
+ dirty();
+ if (!right.node->is_dirty()) {
+ table->pool.dirty_node(right.node);
+ parent.index_item( pos[level-1]+1 ).node = right.get_id();
+ }
+
+ if (pos[level] == here.size()) {
+ /* let's just move the cursor over! */
+ //if (sizeof(K) == 8)
+ dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl;
+ open[level] = right;
+ pos[level] = 0;
+ pos[level-1]++;
+ return 0;
+ }
+
+ //if (sizeof(K) == 8)
+ dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from "
+ << here.get_id() << " to " << right.get_id() << endl;
+
+ /* add */
+ if (here.is_index())
+ right.insert_at_index_pos(0,
+ here.index_item( here.size()-1 ).key,
+ here.index_item( here.size()-1 ).node);
+ else
+ right.insert_at_leaf_pos(0,
+ here.leaf_item( here.size()-1 ).key,
+ here.leaf_item( here.size()-1 ).value);
+
+ /* remove */
+ here.set_size(here.size() - 1);
+
+ /* fix parent index for right */
+ parent.index_item( pos[level-1] + 1 ).key = right.key(0);
+
+ return 0;
+ }
+ };
+
+
+ public:
+ // True when the node pool has fewer than 2*(depth+1) free nodes left,
+ // i.e. the worst case for one operation, plus some.
+ bool almost_full() {
+   return 2*(depth+1) > pool.num_free();
+ }
+
+ // Position cursor at key, or at the place key would be inserted.
+ // Returns Cursor::MATCH when the leaf slot holds key, Cursor::INSERT when
+ // the slot holds a different key (insertion point), and Cursor::OOB when
+ // the table has depth 0, the root is empty, or the position is one past
+ // the last slot of the leaf that was reached.
+ // Index levels err to the left and the leaf search errs to the right, as
+ // the inline examples show.
+ int find(K key, Cursor& cursor) {
+ dbtout << "find " << key << endl;
+
+ if (depth == 0)
+ return Cursor::OOB;
+
+ // init
+ cursor.level = 0;
+
+ // start at root
+ Nodeptr curnode( pool.get_node(root) );
+ cursor.open[0] = curnode;
+
+ if (curnode.size() == 0) return -1; // empty!
+
+ // find leaf
+ for (cursor.level = 0; cursor.level < depth-1; cursor.level++) {
+ /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */
+ int left = 0; /* i >= left */
+ int right = curnode.size()-1; /* i < right */
+ while (left < right) {
+ int i = left + (right - left) / 2;
+ if (curnode.index_item(i).key < key) {
+ left = i + 1;
+ } else if (i && curnode.index_item(i-1).key >= key) {
+ right = i;
+ } else {
+ left = right = i;
+ break;
+ }
+ }
+ int i = left;
+ // the search can land one slot too far right; back up so slot i's key is <= key
+ if (i && curnode.index_item(i).key > key) i--;
+
+#ifdef EBOFS_DEBUG_BTREE
+ // cross-check the binary search against a linear scan
+ int j;
+ for (j=0; j<curnode.size()-1; j++) {
+ if (curnode.index_item(j).key == key) break; /* perfect */
+ if (curnode.index_item(j+1).key > key) break;
+ }
+ if (i != j) {
+ dbtout << "btree binary search failed" << endl;
+ i = j;
+ }
+#endif
+
+ cursor.pos[cursor.level] = i;
+
+ /* get child node */
+ curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node );
+ cursor.open[cursor.level+1] = curnode;
+ }
+
+ /* search leaf */
+ /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */
+ int left = 0; /* i >= left */
+ int right = curnode.size(); /* i < right */
+ while (left < right) {
+ int i = left + (right - left) / 2;
+ if (curnode.leaf_item(i).key < key) {
+ left = i + 1;
+ } else if (i && curnode.leaf_item(i-1).key >= key) {
+ right = i;
+ } else {
+ left = right = i;
+ break;
+ }
+ }
+ int i = left;
+
+#ifdef EBOFS_DEBUG_BTREE
+ // cross-check: first slot whose key is >= key
+ int j;
+ for (j=0; j<curnode.size(); j++) {
+ if (curnode.leaf_item(j).key >= key) break;
+ }
+ if (i != j) {
+ dbtout << "btree binary search failed" << endl;
+ i = j;
+ }
+#endif
+
+ cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */
+
+ // i == size() means we ran off the end of this leaf
+ if (curnode.size() >= i+1) {
+ if (curnode.leaf_item(i).key == key) {
+ return Cursor::MATCH; /* it's the actual key */
+ } else {
+ return Cursor::INSERT; /* it's an insertion point */
+ }
+ }
+ return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */
+ }
+
+ // Existence test: 0 if key is present in the table, -1 otherwise.
+ int lookup(K key) {
+   dbtout << "lookup" << endl;
+   Cursor cursor(this);
+   return (find(key, cursor) == Cursor::MATCH) ? 0 : -1;
+ }
+
+ // Fetch the value stored under key.  Returns 0 and fills value on a
+ // match; returns -1 and leaves value untouched otherwise.
+ int lookup(K key, V& value) {
+   dbtout << "lookup" << endl;
+   Cursor cursor(this);
+   if (find(key, cursor) != Cursor::MATCH)
+     return -1;
+   value = cursor.current().value;
+   return 0;
+ }
+
+ // Insert key -> value.
+ // Returns -1 without changing anything when almost_full() reports too
+ // few free nodes; returns 0 once the item is stored.
+ // A leaf root is created on first use.  A full node is split and the new
+ // right-hand node is then inserted into the parent by the same loop; a
+ // full root is split under a new index root, which adds one level.
+ // No check for an existing equal key is made here.
+ int insert(K key, V value) {
+ dbtout << "insert " << key << " -> " << value << endl;
+ if (almost_full()) return -1;
+
+ // empty?
+ if (nkeys == 0) {
+ if (root == -1) {
+ // create a root node (leaf!)
+ assert(depth == 0);
+ Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) );
+ root = newroot.get_id();
+ depth++;
+ }
+ assert(depth == 1);
+ assert(root >= 0);
+ }
+
+ // start at/near key
+ Cursor cursor(this);
+ find(key, cursor);
+
+ // insert loop
+ // nodevalue is the child id to store when the loop is inserting into an
+ // index node; it is set after a split, before moving up a level
+ nodeid_t nodevalue = 0;
+ while (1) {
+
+ /* room in this node? */
+ if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) {
+ if (cursor.open[cursor.level].is_leaf())
+ cursor.insert( key, value ); // will dirty, etc.
+ else {
+ // indices are already dirty
+ cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
+ }
+ verify("insert 1");
+ return 0;
+ }
+
+ /* this node is full. */
+ assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() );
+
+ /* can we rotate? */
+ if (false) // NO! there's a bug in here somewhere, don't do it.
+ if (cursor.level > 0) {
+ if ((cursor.pos[cursor.level-1] > 0
+ && cursor.rotate_left() >= 0) ||
+ (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size()
+ && cursor.rotate_right() >= 0)) {
+
+ if (cursor.open[cursor.level].is_leaf())
+ cursor.insert( key, value ); // will dirty, etc.
+ else {
+ // indices are already dirty
+ cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
+ }
+ verify("insert 2");
+ return 0;
+ }
+ }
+
+ /** split node **/
+
+ if (cursor.level == depth-1) {
+ dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl;
+ } else {
+ dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl;
+ }
+
+ cursor.dirty();
+
+ // split
+ Nodeptr leftnode = cursor.open[cursor.level];
+ Nodeptr newnode( pool.new_node(leftnode.node->get_type()) );
+ leftnode.split( newnode );
+
+ /* insert our item */
+ // positions past the end of the shrunken left node belong in the new right node
+ if (cursor.pos[cursor.level] > leftnode.size()) {
+ // not with cursor, since this node isn't added yet!
+ if (newnode.is_leaf()) {
+ newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(),
+ key, value );
+ // Cursor::insert() was bypassed, so count the key here
+ nkeys++;
+ } else {
+ newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(),
+ key, nodevalue );
+ }
+ } else {
+ // with cursor (if leaf)
+ if (leftnode.is_leaf())
+ cursor.insert( key, value );
+ else
+ leftnode.insert_at_index_pos( cursor.pos[cursor.level],
+ key, nodevalue );
+ }
+
+ /* are we at the root? */
+ if (cursor.level == 0) {
+ /* split root. */
+ dbtout << "that split was the root " << root << endl;
+ Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) );
+
+ /* new root node */
+ newroot.set_size(2);
+ newroot.index_item(0).key = leftnode.key(0);
+ newroot.index_item(0).node = root;
+ newroot.index_item(1).key = newnode.key(0);
+ newroot.index_item(1).node = newnode.get_id();
+
+ /* heighten tree */
+ depth++;
+ root = newroot.get_id();
+ verify("insert 3");
+ return 0;
+ }
+
+ /* now insert newindex in level-1 */
+ // from here on the loop inserts (first key of newnode -> newnode id) into the parent
+ nodevalue = newnode.get_id();
+ key = newnode.key(0);
+ cursor.level--;
+ cursor.pos[cursor.level]++; // ...to the right of leftnode!
+ }
+ }
+
+
+ // Remove key from the table.  Returns 0 on success.
+ // Both failure paths (almost_full(), or key not found) hit assert(0)
+ // before their "return -1", so -1 is only reachable if assert is
+ // compiled out.
+ // After the removal a node left with min_items() or fewer entries first
+ // tries to borrow one item from a sibling; if the sibling cannot spare
+ // one, the two are merged into the left node and the loop goes up a level
+ // to delete the parent's entry for the right node.
+ int remove(K key) {
+ dbtout << "remove " << key << endl;
+
+ if (almost_full()) {
+ cout << "table almost full, failing" << endl;
+ assert(0);
+ return -1;
+ }
+
+ Cursor cursor(this);
+ if (find(key, cursor) <= 0) {
+ cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << endl;
+ // turn on verbose output and verification so the dump below is printed before asserting
+ g_conf.debug_ebofs = 33;
+ g_conf.ebofs_verify = true;
+ verify("remove dne");
+ assert(0);
+ return -1; // key dne
+ }
+
+
+ while (1) {
+ cursor.remove();
+
+ // balance + adjust
+
+ if (cursor.level == 0) {
+ // useless root index?
+ if (cursor.open[0].size() == 1 &&
+ depth > 1) {
+ depth--;
+ root = cursor.open[0].index_item(0).node;
+ pool.release( cursor.open[0].node );
+ }
+
+ // note: root can be small, but not empty
+ else if (nkeys == 0) {
+ assert(cursor.open[cursor.level].size() == 0);
+ assert(depth == 1);
+ root = -1;
+ depth = 0;
+ pool.release(cursor.open[0].node);
+ }
+ verify("remove 1");
+ return 0;
+ }
+
+ // still above the minimum fill: nothing to rebalance
+ if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) {
+ verify("remove 2");
+ return 0;
+ }
+
+ // borrow from siblings?
+ Nodeptr left;
+ Nodeptr right;
+
+ // left?
+ if (cursor.pos[cursor.level-1] > 0) {
+ int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node;
+ left = pool.get_node( left_loc );
+
+ if (left.size() > left.min_items()) {
+ /* move cursor left, shift right */
+ cursor.pos[cursor.level] = 0;
+ cursor.open[cursor.level] = left;
+ cursor.pos[cursor.level-1]--;
+ cursor.rotate_right();
+ verify("remove 3");
+ return 0;
+ }
+
+ /* combine to left */
+ right = cursor.open[cursor.level];
+ }
+ else {
+ // leftmost child: there must be a sibling on the right
+ assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1);
+ int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node;
+ right = pool.get_node( right_loc );
+
+ if (right.size() > right.min_items()) {
+ /* move cursor right, shift an item left */
+ cursor.pos[cursor.level] = 1;
+ cursor.open[cursor.level] = right;
+ cursor.pos[cursor.level-1]++;
+ cursor.rotate_left();
+ verify("remove 4");
+ return 0;
+ }
+
+ /* combine to left */
+ left = cursor.open[cursor.level];
+ cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */
+ }
+
+ // note: the parent position (pos[level-1]) now refers to the _right_ node.
+ // In the else branch above open[level] itself still holds the left node.
+
+ /* combine (towards left)
+ * (this makes it so our next delete will be in the index
+ * interior, which is less scary.)
+ */
+ dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl;
+
+ left.merge(right);
+
+ // dirty left + right
+ // cursor.remove() above already called dirty() on open[level] and its
+ // ancestors; this call repeats it for the same open[level]
+ cursor.dirty(); // right
+ if (!left.node->is_dirty()) {
+ pool.dirty_node(left.node);
+ cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id();
+ }
+
+ pool.release(right.node);
+
+ cursor.level--; // now point to the link to the obsolete (right-side) sib */
+ }
+
+ }
+
+ // Release the subtree rooted at node_loc, which sits at the given level:
+ // children first (for levels above the leaves), then the node itself.
+ // cursor records the path being walked.
+ void clear(Cursor& cursor, int node_loc, int level) {
+   dbtout << "clear" << endl;
+
+   Nodeptr victim = pool.get_node( node_loc );
+   cursor.open[level] = victim;
+
+   // only levels above the leaves have children to release
+   const bool has_children = (level < depth-1);
+   for (int i = 0; has_children && i < victim.size(); i++) {
+     cursor.pos[level] = i;
+     nodeid_t child = cursor.open[level].index_item(i).node;
+     clear( cursor, child, level+1 );
+   }
+
+   // finally release this node
+   pool.release( victim.node );
+ }
+
+ // Release every node of the table and reset it to the empty state
+ // (root -1, depth 0, no keys).  Does nothing if it is already empty.
+ void clear() {
+   Cursor cursor(this);
+   const bool already_empty = (root == -1 && depth == 0);
+   if (already_empty)
+     return;
+
+   clear(cursor, root, 0);
+   nkeys = 0;
+   depth = 0;
+   root = -1;
+ }
+
+ // Recursively check the subtree rooted at node_loc (at the given level)
+ // and return the number of problems found.
+ //   count - incremented once per leaf item visited
+ //   last  - key ordering cursor; reset to this node's first key on entry
+ //           and advanced as leaf items are visited
+ //   on    - caller label; only passed down, not used in this body
+ // Checks: keys within a node are strictly increasing relative to last,
+ // the parent's entry for this node equals the node's first key, and the
+ // node's largest key does not exceed the parent's next entry.
+ // A one-line summary per node is written through dbtout.
+ int verify_sub(Cursor& cursor, int node_loc, int level, int& count, K& last, const char *on) {
+ int err = 0;
+
+ Nodeptr node = pool.get_node( node_loc );
+ cursor.open[level] = node;
+
+ // identify max, min, and validate key range
+ K min = node.key(0);
+ last = min;
+ K max = min;
+ for (int i=0; i<node.size(); i++) {
+ if (i && node.key(i) <= last) {
+ dbtout << ":: key " << i << " " << hex << node.key(i) << dec << " in node " << node_loc
+ << " is out of order, last is " << hex << last << dec << endl;
+ err++;
+ }
+ if (node.key(i) > max)
+ max = node.key(i);
+
+ if (level < depth-1) {
+ // index
+ cursor.pos[level] = i;
+ err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on );
+ } else {
+ // leaf
+ count++;
+ last = node.key(i);
+ }
+ }
+
+ if (level) {
+ // verify that parent's keys are appropriate
+ if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) {
+ dbtout << ":: key in index node " << cursor.open[level-1].get_id()
+ << " != min in child " << node_loc
+ << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key
+ << ", min is " << min << ")" << dec << endl;
+ err++;
+ }
+ if (cursor.pos[level-1] < cursor.open[level-1].size()-1) {
+ if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) {
+ dbtout << ":: next key in index node " << cursor.open[level-1].get_id()
+ << " < max in child " << node_loc
+ << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key
+ << ", max is " << max << ")" << dec << endl;
+ err++;
+ }
+ }
+ }
+
+ //return err;
+
+ // print it
+ // s is the prefix printed before each line, cut off at level+1 characters
+ char s[1000];
+ strcpy(s," ");
+ s[level+1] = 0;
+ if (1) {
+ if (root == node_loc) {
+ dbtout << s << "root " << node_loc << ": "
+ << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+ } else if (level == depth-1) {
+ dbtout << s << "leaf " << node_loc << ": "
+ << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+ } else {
+ dbtout << s << "indx " << node_loc << ": "
+ << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl;
+ }
+
+ // per-item dump, disabled
+ if (0) {
+ for (int i=0; i<node.size(); i++) {
+ if (level < depth-1) { // index
+ dbtout << s << " " << hex << node.key(i) << " [" << node.index_item(i).node << "]" << dec << endl;
+ } else { // leaf
+ dbtout << s << " " << hex << node.key(i) << " -> " << node.leaf_item(i).value << dec << endl;
+ }
+ }
+ }
+ }
+
+ return err;
+ }
+
+ // Check the whole tree for consistency.  No-op unless g_conf.ebofs_verify
+ // is set, or when the table is empty.  'on' identifies the caller and is
+ // echoed in the failure message.  On failure the tree is walked a second
+ // time with the debug level raised, then the assert fires.
+ void verify(const char *on) {
+   const bool is_empty = (root == -1 && depth == 0);
+   if (!g_conf.ebofs_verify || is_empty)
+     return;
+
+   int seen = 0;
+   Cursor cursor(this);
+   K last;
+
+   // first pass runs with the ebofs debug level forced to 0
+   const int saved_debug = g_conf.debug_ebofs;
+   g_conf.debug_ebofs = 0;
+
+   int problems = verify_sub(cursor, root, 0, seen, last, on);
+   if (seen != nkeys) {
+     cerr << "** count " << seen << " != nkeys " << nkeys << endl;
+     problems++;
+   }
+
+   g_conf.debug_ebofs = saved_debug;
+
+   if (!problems)
+     return;   // all good
+
+   cerr << "verify failure, called by '" << on << "'" << endl;
+   g_conf.debug_ebofs = 30;
+   // walk it again, so we definitely get the dump.
+   int seen_again = 0;
+   Cursor dump_cursor(this);
+   K dump_last;
+   verify_sub(dump_cursor, root, 0, seen_again, dump_last, on);
+   assert(problems == 0);
+ }
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <iostream>
+#include "ebofs/Ebofs.h"
+
+
+// mkfs.ebofs: create an ebofs file system on the given device file.
+//
+// usage: mkfs.ebofs [options] <device file> [anything]
+//
+// Config options are consumed by parse_config_options(); the first
+// remaining argument is the device.  If any further argument is present the
+// freshly made file system is also mounted, synced and unmounted again.
+// Returns 0 on success, -1 on usage or mount errors; a failed mkfs exits
+// with the mkfs return code.
+int main(int argc, char **argv)
+{
+  // args
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  if (args.size() < 1) {
+    cerr << "usage: mkfs.ebofs [options] <device file>" << endl;
+    return -1;
+  }
+  char *filename = args[0];
+
+  // mkfs
+  Ebofs mfs(filename);
+  int r = mfs.mkfs();
+  if (r < 0) exit(r);
+
+  if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola
+    // test-o-rama!
+    Ebofs fs(filename);
+    // don't sync/umount a file system that never mounted
+    if (fs.mount() < 0) {
+      cerr << "mkfs.ebofs: unable to mount " << filename << " after mkfs" << endl;
+      return -1;
+    }
+
+ /*
+ if (1) {
+ // partial write tests
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
+
+ bufferlist small;
+ small.append(crap, 10);
+ bufferlist med;
+ med.append(crap, 1000);
+ bufferlist big;
+ big.append(crap, 1024*1024);
+
+ cout << "0" << endl;
+ fs.write(10, 0, 1024*1024, big, (Context*)0);
+ fs.sync();
+ fs.trim_buffer_cache();
+
+ cout << "1" << endl;
+ fs.write(10, 10, 10, small, 0);
+ fs.write(10, 1, 1000, med, 0);
+ fs.sync();
+ fs.trim_buffer_cache();
+
+ cout << "2" << endl;
+ fs.write(10, 10, 10, small, 0);
+ //fs.sync();
+ fs.write(10, 1, 1000, med, 0);
+ fs.sync();
+ fs.trim_buffer_cache();
+
+ cout << "3" << endl;
+ fs.write(10, 1, 1000, med, 0);
+ fs.write(10, 10000, 10, small, 0);
+ fs.truncate(10, 100, 0);
+ fs.sync();
+ fs.trim_buffer_cache();
+
+ cout << "4" << endl;
+ fs.remove(10);
+ fs.sync();
+ fs.write(10, 10, 10, small, 0);
+ fs.sync();
+ fs.write(10, 1, 1000, med, 0);
+ fs.sync();
+ fs.truncate(10, 100, 0);
+ fs.write(10, 10, 10, small, 0);
+ fs.trim_buffer_cache();
+
+
+
+ }
+
+ if (0) { // onode write+read test
+ bufferlist bl;
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
+ bl.append(crap, 10);
+
+ fs.write(10, 10, 0, bl, (Context*)0);
+ fs.umount();
+
+ Ebofs fs2(filename);
+ fs2.mount();
+ fs2.read(10, 10, 0, bl);
+ fs2.umount();
+
+ return 0;
+ }
+
+
+ if (0) { // small write + read test
+ bufferlist bl;
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
+
+ object_t oid = 10;
+ int n = 10000;
+ int l = 128;
+ bl.append(crap, l);
+
+
+ char *p = bl.c_str();
+ off_t o = 0;
+ for (int i=0; i<n; i++) {
+ cout << "write at " << o << endl;
+ for (int j=0;j<l;j++)
+ p[j] = (char)(oid^(o+j));
+ fs.write(oid, l, o, bl, (Context*)0);
+ o += l;
+ }
+
+ fs.sync();
+ fs.trim_buffer_cache();
+
+ o = 0;
+ for (int i=0; i<n; i++) {
+ cout << "read at " << o << endl;
+ bl.clear();
+ fs.read(oid, l, o, bl);
+
+ char b[l];
+ bl.copy(0, l, b);
+ char *p = b;
+ int left = l;
+ while (left--) {
+ assert(*p == (char)(o ^ oid));
+ o++;
+ p++;
+ }
+ }
+
+ }
+
+ if (0) { // big write speed test
+ bufferlist bl;
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
+ bl.append(crap, 1024*1024);
+
+ int megs = 1000;
+
+ utime_t start = g_clock.now();
+
+ for (off_t m=0; m<megs; m++) {
+ //if (m%100 == 0)
+ cout << m << " / " << megs << endl;
+ fs.write(10, bl.length(), 1024LL*1024LL*m, bl, (Context*)0);
+ }
+ fs.sync();
+
+ utime_t end = g_clock.now();
+ end -= start;
+
+ dout(1) << "elapsed " << end << endl;
+
+ float mbs = (float)megs / (float)end;
+ dout(1) << "mb/s " << mbs << endl;
+ }
+
+ if (0) { // test
+ bufferlist bl;
+ char crap[10000];
+ memset(crap, 0, 10000);
+ bl.append(crap, 10000);
+ fs.write(10, bl.length(), 200, bl, (Context*)0);
+ fs.trim_buffer_cache();
+ fs.write(10, bl.length(), 5222, bl, (Context*)0);
+ sleep(1);
+ fs.trim_buffer_cache();
+ fs.write(10, 5000, 3222, bl, (Context*)0);
+ }
+
+ // test small writes
+ if (0) {
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
+ bufferlist bl;
+ bl.append(crap, 1024*1024);
+
+ // reandom write
+ if (1) {
+ srand(0);
+ for (int i=0; i<10000; i++) {
+ off_t off = rand() % 1000000;
+ size_t len = 1+rand() % 10000;
+ cout << endl << i << " writing bit at " << off << " len " << len << endl;
+ fs.write(10, len, off, bl, (Context*)0);
+ //fs.sync();
+ //fs.trim_buffer_cache();
+ }
+ fs.remove(10);
+ for (int i=0; i<100; i++) {
+ off_t off = rand() % 1000000;
+ size_t len = 1+rand() % 10000;
+ cout << endl << i << " writing bit at " << off << " len " << len << endl;
+ fs.write(10, len, off, bl, (Context*)0);
+ //fs.sync();
+ //fs.trim_buffer_cache();
+ }
+ }
+
+ if (0) {
+ // sequential write
+ srand(0);
+ off_t off = 0;
+ for (int i=0; i<10000; i++) {
+ size_t len = 1024*1024;//1+rand() % 10000;
+ cout << endl << i << " writing bit at " << off << " len " << len << endl;
+ fs.write(10, len, off, bl, (Context*)0);
+ off += len;
+ }
+
+ }
+
+
+ if (0) {
+ // read
+ srand(0);
+ for (int i=0; i<100; i++) {
+ bufferlist bl;
+ off_t off = rand() % 1000000;
+ size_t len = rand() % 1000;
+ cout << endl << "read bit at " << off << " len " << len << endl;
+ int r = fs.read(10, len, off, bl);
+ assert(bl.length() == len);
+ assert(r == (int)len);
+ }
+ }
+
+ // flush
+ fs.sync();
+ fs.trim_buffer_cache();
+ //fs.trim_buffer_cache();
+
+ if (0) {
+ // read again
+ srand(0);
+ for (int i=0; i<100; i++) {
+ bufferlist bl;
+ off_t off = rand() % 1000000;
+ size_t len = 100;
+ cout << endl << "read bit at " << off << " len " << len << endl;
+ int r = fs.read(10, len, off, bl);
+ assert(bl.length() == len);
+ assert(r == (int)len);
+ }
+
+ // flush
+ fs.sync();
+ fs.trim_buffer_cache();
+ }
+
+ if (0) {
+ // write on empty cache
+ srand(0);
+ for (int i=0; i<100; i++) {
+ off_t off = rand() % 1000000;
+ size_t len = 100;
+ cout << endl << "writing bit at " << off << " len " << len << endl;
+ fs.write(10, len, off, bl, (Context*)0);
+ }
+ }
+
+ }
+ */
+
+    fs.sync();
+    fs.trim_buffer_cache();
+
+    fs.umount();
+  }
+
+  return 0;
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_NODES_H
+#define __EBOFS_NODES_H
+
+/** nodes, node regions **/
+
+#include "types.h"
+#include "BlockDevice.h"
+
+
+/*
+
+ disk wire memory
+
+ free free -> free can alloc
+ free used -> dirty can modify
+
+ free used used -> tx
+ free used free -> limbo
+
+ used used -> clean
+ used free -> limbo
+
+
+ // meaningless
+ used free free -> free can alloc
+ used free used __DNE__
+
+
+*/
+
+#undef debofs
+#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool."
+
+
+/*
+ * Node - one table node held in memory, backed by a buffer of
+ * EBOFS_NODE_BYTES bytes.
+ *
+ * The buffer begins with two ints: the record count, then the node type.
+ * Item data follows the header (see item_ptr()).
+ */
+class Node {
+ public:
+  // node states.  these are distinct values compared with ==, not flags
+  // that can be or'd together.
+  static const int STATE_CLEAN = 1;
+  static const int STATE_DIRTY = 2;
+  static const int STATE_TX = 3;
+
+  // presumably the bytes left for items after the header.
+  // NOTE(review): three ints are subtracted here, but only two header ints
+  // (nrecs, type) are mapped by this class -- confirm what the third is.
+  static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int);
+
+  static const int TYPE_INDEX = 1;
+  static const int TYPE_LEAF = 2;
+
+ protected:
+  nodeid_t id;
+  int state;   // one of the STATE_* values above
+
+  bufferptr bptr;          // current node contents
+  bufferptr shadow_bptr;   // previous buffer, kept by make_shadow()
+
+  // header fields; these point into bptr
+  int *type;
+  int *nrecs;
+
+  // (re)point the header fields at the start of the current bptr
+  void bind_header() {
+    nrecs = (int*)(bptr.c_str());
+    type = (int*)(bptr.c_str() + sizeof(*nrecs));
+  }
+
+ public:
+  // i: node id, b: buffer holding the node, s: initial STATE_* value
+  Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) {
+    bind_header();
+  }
+
+
+  // id
+  nodeid_t get_id() const { return id; }
+  void set_id(nodeid_t n) { id = n; }
+
+  // buffer
+  bufferptr& get_buffer() { return bptr; }
+
+  // first byte after the two header ints
+  char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); }
+
+  // size (number of records, stored in the buffer)
+  int size() const { return *nrecs; }
+  void set_size(int s) { *nrecs = s; }
+
+  // type (stored in the buffer)
+  int& get_type() { return *type; }
+  void set_type(int t) { *type = t; }
+  bool is_index() const { return *type == TYPE_INDEX; }
+  bool is_leaf() const { return *type == TYPE_LEAF; }
+
+
+  // state
+  bool is_dirty() const { return state == STATE_DIRTY; }
+  bool is_tx() const { return state == STATE_TX; }
+  bool is_clean() const { return state == STATE_CLEAN; }
+
+  void set_state(int s) { state = s; }
+
+  // Give a node that is in STATE_TX a private copy of its contents: the
+  // current buffer is kept as shadow_bptr and bptr becomes a freshly
+  // allocated buffer holding the same bytes.
+  void make_shadow() {
+    assert(is_tx());
+
+    shadow_bptr = bptr;
+
+    // new buffer
+    bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+    bind_header();
+
+    // copy contents!
+    memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES);
+  }
+
+};
+
+
+
+
+
+/*
+ * NodePool - bookkeeping for the table nodes stored in one or more on-disk
+ * regions.
+ *
+ * Every node id (region << 24 | offset) is in exactly one of the sets
+ * free / dirty / tx / clean / limbo.  Open nodes are owned through node_map.
+ * A usemap (one bit per node; each region starts on a fresh byte) is kept
+ * in two alternating on-disk locations, chosen by the low bit of the
+ * version/epoch.
+ *
+ * flushing counts outstanding usemap/node writes; it is decremented from
+ * the BlockDevice callbacks under ebofs_lock, and commit_wait() blocks on
+ * commit_cond until it reaches zero.
+ */
+class NodePool {
+ protected:
+  map<nodeid_t, Node*> node_map;      // open node map
+
+ public:
+  vector<Extent> region_loc;    // region locations
+  Extent usemap_even;
+  Extent usemap_odd;
+
+ protected:
+  // on-disk block states
+  int num_nodes;
+  set<nodeid_t> free;
+  set<nodeid_t> dirty;
+  set<nodeid_t> tx;
+  set<nodeid_t> clean;       // aka used
+  set<nodeid_t> limbo;
+
+  Mutex &ebofs_lock;
+  Cond commit_cond;
+  int flushing;
+
+  // node id <-> (region, offset): region in the high bits, offset in the low 24
+  static int make_nodeid(int region, int offset) {
+    return (region << 24) | offset;
+  }
+  static int nodeid_region(nodeid_t nid) {
+    return nid >> 24;
+  }
+  static int nodeid_offset(nodeid_t nid) {
+    return nid & ((1 << 24) - 1);
+  }
+
+
+ public:
+  NodePool(Mutex &el) :
+    num_nodes(0),
+    ebofs_lock(el),
+    flushing(0) {}
+  ~NodePool() {
+    // nodes
+    release_all();
+  }
+
+  int num_free() { return free.size(); }
+  int num_dirty() { return dirty.size(); }
+  int num_limbo() { return limbo.size(); }
+  int num_tx() { return tx.size(); }
+  int num_clean() { return clean.size(); }
+  int num_total() { return num_nodes; }
+  int num_used() { return num_clean() + num_dirty() + num_tx(); }
+
+  // usemap length in blocks for n nodes (n == 0 means the current node count)
+  int get_usemap_len(int n=0) {
+    if (n == 0) n = num_nodes;
+    return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1;
+  }
+
+  int num_regions() { return region_loc.size(); }
+
+  // add a new region; all of its nodes start out free.
+  // the caller had better adjust usemap locations...
+  void add_region(Extent ex) {
+    int region = region_loc.size();
+    assert(ex.length <= (1 << 24));   // offsets must fit in 24 bits
+    region_loc.push_back(ex);
+    for (unsigned o = 0; o < ex.length; o++) {
+      free.insert( make_nodeid(region, o) );
+    }
+    num_nodes += ex.length;
+  }
+
+  // load region and usemap locations from the superblock's nodepool record.
+  // the node state sets are not touched here; see read_usemap().
+  int init(struct ebofs_nodepool *np) {
+    // regions
+    assert(region_loc.empty());
+    num_nodes = 0;
+    for (int i=0; i<np->num_regions; i++) {
+      debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl;
+      region_loc.push_back( np->region_loc[i] );
+      num_nodes += np->region_loc[i].length;
+    }
+
+    // usemap
+    usemap_even = np->node_usemap_even;
+    usemap_odd = np->node_usemap_odd;
+    debofs(3) << "init even map at " << usemap_even << endl;
+    debofs(3) << "init odd map at " << usemap_odd << endl;
+
+    return 0;
+  }
+
+  // release all open nodes and forget all regions and node states
+  void close() {
+    release_all();
+
+    region_loc.clear();
+    num_nodes = 0;     // no regions left, so no nodes either
+    free.clear();
+    dirty.clear();
+    tx.clear();
+    clean.clear();
+    limbo.clear();
+    flushing = 0;
+    node_map.clear();
+  }
+
+
+  // *** blocking i/o routines ***
+
+  // read the usemap for the given epoch and sort every node id into
+  // clean (bit set) or free (bit clear).
+  int read_usemap(BlockDevice& dev, version_t epoch) {
+    // read map
+    Extent loc;
+    if (epoch & 1)
+      loc = usemap_odd;
+    else
+      loc = usemap_even;
+
+    bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
+    dev.read(loc.start, loc.length, bp);
+
+    // parse.  stop as soon as every region is covered (this also keeps us
+    // from indexing region_loc when there are no regions at all).
+    unsigned region = 0;  // current region
+    unsigned roff = 0;    // offset in region
+    for (unsigned byte = 0;
+         byte < bp.length() && region < region_loc.size();
+         byte++) {   // each byte
+      // get byte
+      int x = *(unsigned char*)(bp.c_str() + byte);
+      int mask = 0x80;  // left-most bit
+      for (unsigned bit=0; bit<8; bit++) {
+        nodeid_t nid = make_nodeid(region, roff);
+
+        if (x & mask)
+          clean.insert(nid);
+        else
+          free.insert(nid);
+
+        mask = mask >> 1;  // move one bit right.
+        roff++;
+        if (roff == region_loc[region].length) {
+          // next region!  (it starts on the next byte)
+          roff = 0;
+          region++;
+          break;
+        }
+      }
+    }
+    return 0;
+  }
+
+  // read every node marked clean from disk into node_map.
+  int read_clean_nodes(BlockDevice& dev) {
+    /*
+      this relies on the clean set begin defined so that we know which nodes
+      to read.  so it only really works when called from mount()!
+    */
+    for (unsigned r=0; r<region_loc.size(); r++) {
+      debofs(3) << "ebofs.nodepool.read region " << r << " at " << region_loc[r] << endl;
+
+      for (block_t boff = 0; boff < region_loc[r].length; boff++) {
+        nodeid_t nid = make_nodeid(r, boff);
+
+        if (!clean.count(nid)) continue;
+        debofs(20) << "ebofs.nodepool.read node " << nid << endl;
+
+        bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+        dev.read(region_loc[r].start + (block_t)boff, EBOFS_NODE_BLOCKS,
+                 bp);
+
+        Node *n = new Node(nid, bp, Node::STATE_CLEAN);
+        node_map[nid] = n;
+        debofs(10) << "ebofs.nodepool.read node " << nid << " at " << (void*)n << endl;
+      }
+    }
+    return 0;
+  }
+
+
+
+  // **** non-blocking i/o ****
+
+ private:
+  // completion callback for the usemap write
+  class C_NP_FlushUsemap : public BlockDevice::callback {
+    NodePool *pool;
+  public:
+    C_NP_FlushUsemap(NodePool *p) :
+      pool(p) {}
+    void finish(ioh_t ioh, int r) {
+      pool->flushed_usemap();
+    }
+  };
+
+  void flushed_usemap() {
+    ebofs_lock.Lock();
+    flushing--;
+    if (flushing == 0)
+      commit_cond.Signal();
+    ebofs_lock.Unlock();
+  }
+
+ public:
+  // queue a write of the usemap (a bit is set for every clean or dirty
+  // node) to the location selected by the low bit of version.  the caller
+  // accounts for the write in 'flushing'; the callback decrements it.
+  int write_usemap(BlockDevice& dev, version_t version) {
+    // alloc
+    Extent loc;
+    if (version & 1)
+      loc = usemap_odd;
+    else
+      loc = usemap_even;
+
+    bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length);
+    bp.zero();   // bytes past the last region must not hit disk uninitialized
+
+    // fill in.  stop once every region is covered (also guards against an
+    // empty region_loc).
+    unsigned region = 0;  // current region
+    unsigned roff = 0;    // offset in region
+    for (unsigned byte = 0;
+         byte < bp.length() && region < region_loc.size();
+         byte++) {   // each byte
+      int x = 0;        // start with empty byte
+      int mask = 0x80;  // left-most bit
+      for (unsigned bit=0; bit<8; bit++) {
+        nodeid_t nid = make_nodeid(region, roff);
+
+        if (clean.count(nid) ||
+            dirty.count(nid))
+          x |= mask;
+
+        roff++;
+        mask = mask >> 1;
+        if (roff == region_loc[region].length) {
+          // next region!  (it starts on the next byte)
+          roff = 0;
+          region++;
+          break;
+        }
+      }
+
+      *(unsigned char*)(bp.c_str() + byte) = x;
+    }
+
+
+    // write
+    bufferlist bl;
+    bl.append(bp);
+    dev.write(loc.start, loc.length, bl,
+              new C_NP_FlushUsemap(this), "usemap");
+    return 0;
+  }
+
+
+
+  // *** node commit ***
+ private:
+
+  // completion callback for a single node write
+  class C_NP_FlushNode : public BlockDevice::callback {
+    NodePool *pool;
+    nodeid_t nid;
+  public:
+    C_NP_FlushNode(NodePool *p, nodeid_t n) :
+      pool(p), nid(n) {}
+    void finish(ioh_t ioh, int r) {
+      pool->flushed_node(nid);
+    }
+  };
+
+  void flushed_node(nodeid_t nid) {
+    ebofs_lock.Lock();
+
+    // mark nid clean|limbo
+    if (tx.count(nid)) {  // tx -> clean
+      tx.erase(nid);
+      clean.insert(nid);
+
+      // make node itself clean.  get_node() asserts the node is open
+      // instead of silently inserting a null entry into node_map.
+      get_node(nid)->set_state(Node::STATE_CLEAN);
+    }
+    else {  // already limbo (was dirtied, or released)
+      assert(limbo.count(nid));
+    }
+
+    flushing--;
+    if (flushing == 0)
+      commit_cond.Signal();
+    ebofs_lock.Unlock();
+  }
+
+ public:
+  // start a commit: queue the usemap write, queue a write for every dirty
+  // node (dirty -> tx), and move all limbo ids to free.
+  void commit_start(BlockDevice& dev, version_t version) {
+    dout(20) << "ebofs.nodepool.commit_start start" << endl;
+
+    assert(flushing == 0);
+    /*if (0)
+      for (unsigned i=0; i<region_loc.size(); i++) {
+        int c = dev.count_io(region_loc[i].start, region_loc[i].length);
+        dout(20) << "ebofs.nodepool.commit_start  region " << region_loc[i] << " has " << c << " ios" << endl;
+        assert(c == 0);
+      }
+    */
+
+    // write map (write_usemap picks even/odd from the low bit itself)
+    flushing++;
+    write_usemap(dev, version);
+
+    // dirty -> tx  (write to disk)
+    assert(tx.empty());
+    set<block_t> didb;
+    for (set<nodeid_t>::iterator i = dirty.begin();
+         i != dirty.end();
+         ++i) {
+      Node *n = get_node(*i);
+      assert(n);
+      assert(n->is_dirty());
+      n->set_state(Node::STATE_TX);
+
+      unsigned region = nodeid_region(*i);
+      block_t off = nodeid_offset(*i);
+      block_t b = region_loc[region].start + off;
+
+      if (1) {  // sanity check debug FIXME
+        // no two dirty nodes may map to the same disk block
+        assert(didb.count(b) == 0);
+        didb.insert(b);
+      }
+
+      bufferlist bl;
+      bl.append(n->get_buffer());
+      dev.write(b, EBOFS_NODE_BLOCKS,
+                bl,
+                new C_NP_FlushNode(this, *i), "node");
+      flushing++;
+
+      tx.insert(*i);
+    }
+    dirty.clear();
+
+    // limbo -> free
+    for (set<nodeid_t>::iterator i = limbo.begin();
+         i != limbo.end();
+         ++i) {
+      free.insert(*i);
+    }
+    limbo.clear();
+
+    dout(20) << "ebofs.nodepool.commit_start finish" << endl;
+  }
+
+  // block on commit_cond (with ebofs_lock) until all queued writes finished
+  void commit_wait() {
+    while (flushing > 0)
+      commit_cond.Wait(ebofs_lock);
+    dout(20) << "ebofs.nodepool.commit_wait finish" << endl;
+  }
+
+
+
+
+
+
+
+
+
+  // *** nodes ***
+  // opened node
+  Node* get_node(nodeid_t nid) {
+    //dbtout << "pool.get " << nid << endl;
+    assert(node_map.count(nid));
+    return node_map[nid];
+  }
+
+  // unopened node
+  /* not implemented yet!!
+  Node* open_node(nodeid_t nid) {
+    Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid);
+    dbtout << "pool.open_node " << n->get_id() << endl;
+    node_map[n->get_id()] = n;
+    return n;
+  }
+  */
+
+  // allocate id/block on disk.  always free -> dirty.
+  nodeid_t alloc_id() {
+    // pick node id
+    assert(!free.empty());
+    nodeid_t nid = *(free.begin());
+    free.erase(nid);
+    dirty.insert(nid);
+    return nid;
+  }
+
+  // new node: allocates an id and an empty (size 0) dirty node of the given type
+  Node* new_node(int type) {
+    nodeid_t nid = alloc_id();
+    debofs(15) << "ebofs.nodepool.new_node " << nid << endl;
+
+    // alloc node
+    bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
+    Node *n = new Node(nid, bp, Node::STATE_DIRTY);
+    n->set_type(type);
+    n->set_size(0);
+
+    assert(node_map.count(nid) == 0);
+    node_map[nid] = n;
+    return n;
+  }
+
+  // close and delete a node.  a dirty node's id becomes free right away;
+  // a clean or tx node's id goes to limbo.
+  void release(Node *n) {
+    const nodeid_t nid = n->get_id();
+    debofs(15) << "ebofs.nodepool.release on " << nid << endl;
+    node_map.erase(nid);
+
+    if (n->is_dirty()) {
+      assert(dirty.count(nid));
+      dirty.erase(nid);
+      free.insert(nid);
+    } else if (n->is_clean()) {
+      assert(clean.count(nid));
+      clean.erase(nid);
+      limbo.insert(nid);
+    } else if (n->is_tx()) {
+      assert(tx.count(nid));    // i guess htis happens? -sage
+      tx.erase(nid);
+      limbo.insert(nid);
+    }
+
+    delete n;
+  }
+
+  // release every node still open
+  void release_all() {
+    while (!node_map.empty()) {
+      map<nodeid_t,Node*>::iterator i = node_map.begin();
+      debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl;
+      release( i->second );   // erases the entry from node_map
+    }
+    assert(node_map.empty());
+  }
+
+  // make a clean or tx node writable: it moves to a freshly allocated id
+  // and becomes dirty, while its old id goes to limbo.
+  void dirty_node(Node *n) {
+    // get new node id?
+    nodeid_t oldid = n->get_id();
+    nodeid_t newid = alloc_id();
+    debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl;
+
+    // release old block
+    if (n->is_clean()) {
+      assert(clean.count(oldid));
+      clean.erase(oldid);
+    } else {
+      assert(n->is_tx());
+      assert(tx.count(oldid));
+      tx.erase(oldid);
+
+      // move/copy current -> shadow buffer as necessary
+      n->make_shadow();
+    }
+    limbo.insert(oldid);
+    node_map.erase(oldid);
+
+    n->set_state(Node::STATE_DIRTY);
+
+    // move to new one!
+    n->set_id(newid);
+    node_map[newid] = n;
+  }
+
+
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <iostream>
+#include "ebofs/Ebofs.h"
+
+// Set to true by main() once the run time is over; every Tester thread
+// polls it at the top of its loop.
+// NOTE(review): plain bool shared between threads with no synchronization
+// visible here -- confirm that is acceptable for this test.
+bool stop = false;
+
+
+// Number of Testers constructed so far; each Tester takes the current
+// value as its index and then increments it.
+int nt = 0;
+// Worker thread that applies random operations (read, write, remove,
+// collection add/remove, attr set/rm/get, truncate, clone) to a shared
+// Ebofs until the global 'stop' flag is raised.
+class Tester : public Thread {
+  Ebofs &fs;
+  int t;                 // index of this tester, used to tag its output
+
+  char b[1024*1024];     // scratch space for write payloads and read checks
+
+public:
+  Tester(Ebofs &e) : fs(e), t(nt) { nt++; }
+
+  void *entry() {
+    for (;;) {
+      if (stop) break;
+
+      // draw the random target and arguments for this round
+      object_t oid;
+      oid.ino = (rand() % 10) + 0x10000000;
+      coll_t cid = rand() % 50;
+      off_t off = rand() % 10000;//0;//rand() % 1000000;
+      off_t len = 1+rand() % 100000;
+      char *attr = "one";
+      if (rand() % 2) attr = "two";
+      int attrlen = 3;//rand() % 10;
+
+      const int op = rand() % 10;
+      switch (op) {
+      case 0:
+        {
+          // read, then check every byte is either zero or the pattern
+          // written by case 1
+          oid.rev = rand() % 10;
+          cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl;
+          bufferlist bl;
+          fs.read(oid, off, len, bl);
+          int got = MIN(len,bl.length());
+          if (got != 0) {
+            cout << t << " got " << got << endl;
+            bl.copy(0, got, b);
+            for (char *p = b; got--; p++) {
+              assert(*p == 0 ||
+                     *p == (char)(off ^ oid.ino));
+              off++;
+            }
+          }
+        }
+        break;
+
+      case 1:
+        {
+          // write a pattern derived from the inode number and byte offset
+          cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl;
+          for (int k=0; k<len; k++)
+            b[k] = (char)(oid.ino^(off+k));
+          bufferptr wp(b, len);
+          bufferlist w;
+          w.append(wp);
+          fs.write(oid, off, len, w, 0);
+        }
+        break;
+
+      case 2:
+        cout << t << " remove " << hex << oid << dec << endl;
+        fs.remove(oid);
+        break;
+
+      case 3:
+        cout << t << " collection_add " << hex << oid << dec << " to " << cid << endl;
+        fs.collection_add(cid, oid, 0);
+        break;
+
+      case 4:
+        cout << t << " collection_remove " << hex << oid << dec << " from " << cid << endl;
+        fs.collection_remove(cid, oid, 0);
+        break;
+
+      case 5:
+        // the attribute's value is its own name
+        cout << t << " setattr " << hex << oid << dec << " " << attr << " len " << attrlen << endl;
+        fs.setattr(oid, attr, (void*)attr, attrlen, 0);
+        break;
+
+      case 6:
+        cout << t << " rmattr " << hex << oid << dec << " " << attr << endl;
+        fs.rmattr(oid,attr);
+        break;
+
+      case 7:
+        {
+          char v[4];
+          cout << t << " getattr " << hex << oid << dec << " " << attr << endl;
+          const bool found = (fs.getattr(oid,attr,(void*)v,3) == 0);
+          if (found) {
+            v[3] = 0;
+            assert(strcmp(v,attr) == 0);
+          }
+        }
+        break;
+
+      case 8:
+        {
+          // note: the offset is printed, but the truncate is always to 0
+          cout << t << " truncate " << hex << oid << dec << " " << off << endl;
+          fs.truncate(oid, 0);
+        }
+        break;
+
+      case 9:
+        {
+          // clone to the same object with a random revision
+          object_t newoid = oid;
+          newoid.rev = rand() % 10;
+          cout << t << " clone " << oid << " to " << newoid << endl;
+          fs.clone(oid, newoid, 0);
+        }
+      }
+
+
+    }
+    cout << t << " done" << endl;
+    return 0;
+  }
+};
+
+// ebofs stress test.
+//
+// After config options are stripped, exactly three arguments are expected:
+// <device file> <seconds> <threads>.  Mounts the file system, runs one
+// explicit clone/partial-write scenario, then lets <threads> Tester threads
+// run for <seconds> seconds before stopping them and unmounting.
+// Returns 0 on success, -1 on bad usage or mount failure.
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // args
+  if (args.size() != 3) {
+    // say what we expect instead of exiting silently
+    cerr << "usage: " << argv[0] << " [options] <device file> <seconds> <threads>" << endl;
+    return -1;
+  }
+  char *filename = args[0];
+  int seconds = atoi(args[1]);
+  int threads = atoi(args[2]);
+
+  cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << endl;
+
+  Ebofs fs(filename);
+  if (fs.mount() < 0) return -1;
+
+
+  // explicit tests
+  if (1) {
+    // verify that clone() plays nice with partial writes
+    object_t oid(1,1);
+    bufferptr bp(10000);
+    bp.zero();
+    bufferlist bl;
+    bl.push_back(bp);
+    fs.write(oid, 0, 10000, bl, 0);
+
+    fs.sync();
+    fs.trim_buffer_cache();
+
+    // induce a partial write
+    bufferlist bl2;
+    bl2.substr_of(bl, 0, 100);
+    fs.write(oid, 100, 100, bl2, 0);
+
+    // clone it
+    object_t oid2;
+    oid2 = oid;
+    oid2.rev = 1;
+    fs.clone(oid, oid2, 0);
+
+    // ...
+    if (0) {
+      // make sure partial still behaves after orig is removed...
+      fs.remove(oid, 0);
+
+      // or i read for oid2...
+      bufferlist rbl;
+      fs.read(oid2, 0, 200, rbl);
+    }
+    if (1) {
+      // make sure things behave if we remove the clone
+      fs.remove(oid2,0);
+    }
+  }
+  // /explicit tests
+
+  // start the worker threads
+  list<Tester*> ls;
+  for (int i=0; i<threads; i++) {
+    Tester *t = new Tester(fs);
+    t->create();
+    ls.push_back(t);
+  }
+
+  // let them run until the deadline, printing the time once a second
+  utime_t now = g_clock.now();
+  utime_t dur(seconds,0);
+  utime_t end = now + dur;
+  cout << "stop at " << end << endl;
+  while (now < end) {
+    sleep(1);
+    now = g_clock.now();
+    cout << now << endl;
+  }
+
+  cout << "stopping" << endl;
+  stop = true;
+
+  // join and free the workers
+  while (!ls.empty()) {
+    Tester *t = ls.front();
+    ls.pop_front();
+    t->join();
+    delete t;
+  }
+
+  fs.umount();
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_TYPES_H
+#define __EBOFS_TYPES_H
+
+#include <cassert>
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+
+#include <ext/hash_map>
+#include <set>
+#include <list>
+#include <vector>
+using namespace std;
+using namespace __gnu_cxx;
+
+
+#include "include/object.h"
+
+
+// Minimum / maximum of two values, defined only if not already provided.
+// These are macros: the selected argument is evaluated twice, so do not
+// pass expressions with side effects.
+#ifndef MIN
+# define MIN(a,b) ((a)<=(b) ? (a):(b))
+#endif
+#ifndef MAX
+# define MAX(a,b) ((a)>=(b) ? (a):(b))
+#endif
+
+
+/*
+namespace __gnu_cxx {
+ template<> struct hash<unsigned long long> {
+ size_t operator()(unsigned long long __x) const {
+ static hash<unsigned long> H;
+ return H((__x >> 32) ^ (__x & 0xffffffff));
+ }
+ };
+
+ template<> struct hash< std::string >
+ {
+ size_t operator()( const std::string& x ) const
+ {
+ static hash<const char*> H;
+ return H(x.c_str());
+ }
+ };
+}
+*/
+
+
+// disk
+typedef __uint64_t block_t; // disk location/sector/block
+
+// block size in bytes, and its log2; the two must be kept in sync
+static const int EBOFS_BLOCK_SIZE = 4096;
+static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096
+
+// A run of 'length' consecutive blocks beginning at block 'start'.
+class Extent {
+ public:
+  block_t start, length;
+
+  // empty extent at block 0
+  Extent() : start(0), length(0) {}
+  Extent(block_t s, block_t l) : start(s), length(l) {}
+
+  // first block past the extent
+  block_t end() const { return start + length; }
+  // final block inside the extent
+  block_t last() const { return end() - 1; }
+};
+
+// Print an extent as "start~length".  Takes the extent by const reference
+// so that const extents and temporaries can be printed too.
+inline ostream& operator<<(ostream& out, const Extent& ex)
+{
+  return out << ex.start << "~" << ex.length;
+}
+
+
+// tree/set nodes
+typedef int nodeid_t;
+
+// a node occupies EBOFS_NODE_BLOCKS blocks, i.e. EBOFS_NODE_BYTES bytes
+static const int EBOFS_NODE_BLOCKS = 1;
+static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE;
+// capacity of the region_loc array in struct ebofs_nodepool
+static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value!
+
+// Node pool description embedded in the superblock (struct ebofs_super):
+// where the two alternating usemaps live and which extents hold the node
+// regions.
+struct ebofs_nodepool {
+ Extent node_usemap_even; // for even sb versions
+ Extent node_usemap_odd; // for odd sb versions
+
+ // number of valid entries in region_loc
+ int num_regions;
+ Extent region_loc[EBOFS_MAX_NODE_REGIONS];
+};
+
+
+// objects
+
+// collection id
+typedef __uint64_t coll_t;
+
+// Object node header.
+// NOTE(review): presumably the fixed header at the start of an on-disk
+// onode, followed by the collections/attrs/extents counted below -- confirm
+// against the code that reads and writes onodes.
+struct ebofs_onode {
+ Extent onode_loc; /* this is actually the block we live in */
+
+ object_t object_id; /* for kicks */
+ off_t object_size; /* file size in bytes. should this be 64-bit? */
+ unsigned object_blocks;
+ bool readonly;
+
+ int num_collections;
+ int num_attr; // num attr in onode
+ int num_extents; /* number of extents used. if 0, data is in the onode */
+};
+
+// Collection node header: location, collection id and attribute count.
+// NOTE(review): presumably the collection counterpart of ebofs_onode --
+// confirm against the code that reads and writes cnodes.
+struct ebofs_cnode {
+ Extent cnode_loc; /* this is actually the block we live in */
+ coll_t coll_id;
+ int num_attr; // num attr in cnode
+};
+
+
+// table
+// Description of one btree table as stored in the superblock
+// (struct ebofs_super holds several of these).
+struct ebofs_table {
+ nodeid_t root; /* root node of btree */
+ int num_keys;
+ int depth;
+};
+
+
+// super
+// version/epoch counter type
+typedef __uint64_t version_t;
+
+// value expected in ebofs_super::s_magic -- presumably; the check itself
+// is not in this file
+static const unsigned EBOFS_MAGIC = 0x000EB0F5;
+
+// EBOFS_NUM_FREE_BUCKETS is the number of free-extent tables in the superblock
+static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */
+static const int EBOFS_FREE_BUCKET_BITS = 2;
+
+
+// Superblock: magic, epoch, block counts, the node pool description and
+// the root/size/depth of every table.
+// NOTE(review): epoch and num_blocks are plain 'unsigned' although
+// version_t and block_t are 64-bit typedefs -- confirm the narrower fields
+// are intended before relying on large values.
+struct ebofs_super {
+ unsigned s_magic;
+
+ unsigned epoch; // version of this superblock.
+
+ unsigned num_blocks; /* # blocks in filesystem */
+
+ // some basic stats, for kicks
+ unsigned free_blocks; /* unused blocks */
+ unsigned limbo_blocks; /* limbo blocks */
+ //unsigned num_objects;
+ //unsigned num_fragmented;
+
+ struct ebofs_nodepool nodepool;
+
+ // tables
+ struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS];
+ struct ebofs_table limbo_tab;
+ struct ebofs_table alloc_tab;
+ struct ebofs_table object_tab; // object directory
+ struct ebofs_table collection_tab; // collection directory
+ struct ebofs_table co_tab;
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/Monitor.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "common/Timer.h"
+
+#include "msg/FakeMessenger.h"
+
+
+
+
+// shorthands for the configured daemon/client counts; each use re-reads g_conf
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+
+// Test context: reports the result code it was finished with on stdout.
+struct C_Test : public Context {
+  void finish(int r) {
+    cout << "C_Test->finish(" << r << ")" << endl;
+  }
+};
+// Test context: reports its result code, then schedules a fresh C_Test on
+// the global timer two time units later.
+struct C_Test2 : public Context {
+  void finish(int r) {
+    cout << "C_Test2->finish(" << r << ")" << endl;
+    Context *followup = new C_Test;
+    g_timer.add_event_after(2, followup);
+  }
+};
+
+
+
+// fakefuse: run monitors, OSDs, MDSs and FUSE clients in one process on
+// top of FakeMessenger.
+//
+// Config options are stripped from the command line; whatever remains is
+// handed to ceph_fuse_main() (make sure a mount point is among them).
+// Daemon counts come from g_conf.  Returns 0 when everything has shut down.
+int main(int argc, char **argv) {
+  cerr << "fakefuse starting" << endl;
+
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+  parse_config_options(args);
+
+  // start messenger thread
+  fakemessenger_startthread();
+
+  //g_timer.add_event_after(5.0, new C_Test2);
+  //g_timer.add_event_after(10.0, new C_Test);
+
+  // rebuild argc/argv from the arguments left after config parsing
+  vec_to_argv(args, argc, argv);
+
+  MonMap *monmap = new MonMap(g_conf.num_mon);
+
+  // daemons live in vectors; variable-length arrays are not standard C++
+  vector<Monitor*> mon(g_conf.num_mon);
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)), monmap);
+  }
+
+  // create osd
+  vector<OSD*> osd(NUMOSD);
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)), monmap);
+  }
+
+  // create mds
+  vector<MDS*> mds(NUMMDS);
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i] = new MDS(i, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
+  }
+
+  // init
+  for (int i=0; i<g_conf.num_mon; i++) {
+    mon[i]->init();
+  }
+  for (int i=0; i<NUMMDS; i++) {
+    mds[i]->init();
+  }
+
+  for (int i=0; i<NUMOSD; i++) {
+    osd[i]->init();
+  }
+
+
+  // create client
+  // NOTE(review): every client is given MSG_ADDR_CLIENT(0), not (i).  The
+  // clients run one after another here, so this may be deliberate -- confirm.
+  vector<Client*> client(NUMCLIENT);
+  for (int i=0; i<NUMCLIENT; i++) {
+    client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(0)), monmap);
+    client[i]->init();
+
+
+    // start up fuse
+    // use my argc, argv (make sure you pass a mount point!)
+    cout << "starting fuse on pid " << getpid() << endl;
+    client[i]->mount();
+    ceph_fuse_main(client[i], argc, argv);
+    client[i]->unmount();
+    cout << "fuse finished on pid " << getpid() << endl;
+    client[i]->shutdown();
+  }
+
+
+
+  // wait for it to finish
+  cout << "DONE -----" << endl;
+  fakemessenger_wait();   // blocks until messenger stops
+
+
+  // cleanup
+  // NOTE(review): the monitors and monmap are never deleted; left as is
+  // because their teardown requirements are not visible here.
+  for (int i=0; i<NUMMDS; i++) {
+    delete mds[i];
+  }
+  for (int i=0; i<NUMOSD; i++) {
+    delete osd[i];
+  }
+  for (int i=0; i<NUMCLIENT; i++) {
+    delete client[i];
+  }
+
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+
+#include "client/SyntheticClient.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Minimal Context used for testing: finishing it just logs the result code.
+class C_Test : public Context {
+public:
+  // Prints "C_Test->finish(<r>)" to cout, followed by endl.
+  void finish(int r) {
+    cout << "C_Test->finish(";
+    cout << r;
+    cout << ")" << endl;
+  }
+};
+
+
+/*
+ * fakesyn entry point: builds monitors, MDSs, OSDs and clients that all
+ * talk through FakeMessenger instances, runs one SyntheticClient per
+ * client, then waits for the fake messenger to stop and tears down.
+ *
+ * Returns 0 on normal completion.
+ */
+int main(int argc, char **argv)
+{
+ cerr << "fakesyn start" << endl;
+
+ //cerr << "inode_t " << sizeof(inode_t) << endl;
+
+ vector<char*> args;
+ argv_to_vec(argc, argv, args);
+
+ parse_config_options(args);
+
+ // NOTE(review): 'start' is incremented below but never read in this function.
+ int start = 0;
+
+ parse_syn_options(args);
+
+ vector<char*> nargs;
+
+ // Whatever is still in args after the two parsers ran is reported as stray.
+ for (unsigned i=0; i<args.size(); i++) {
+ // unknown arg, pass it on.
+ cerr << " stray arg " << args[i] << endl;
+ nargs.push_back(args[i]);
+ }
+ // every leftover arg was pushed onto nargs, so any stray arg trips this assert.
+ assert(nargs.empty());
+
+
+ MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+
+ // NOTE(review): hostname is fetched but only referenced by commented-out logging.
+ char hostname[100];
+ gethostname(hostname,100);
+ //int pid = getpid();
+
+ // create mon
+ Monitor *mon[g_conf.num_mon];
+ for (int i=0; i<g_conf.num_mon; i++) {
+ mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)));
+ }
+
+ // create mds
+ MDS *mds[NUMMDS];
+ // mdsosd[i] is only assigned (and later init'ed) when g_conf.mds_local_osd is set.
+ OSD *mdsosd[NUMMDS];
+ for (int i=0; i<NUMMDS; i++) {
+ //cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ mds[i] = new MDS(mdc, i, new FakeMessenger(MSG_ADDR_MDS(i)));
+ if (g_conf.mds_local_osd)
+ mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)));
+ start++;
+ }
+
+ // create osd
+ OSD *osd[NUMOSD];
+ for (int i=0; i<NUMOSD; i++) {
+ //cerr << "osd" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)));
+ start++;
+ }
+
+ // create client
+ Client *client[NUMCLIENT];
+ SyntheticClient *syn[NUMCLIENT];
+ for (int i=0; i<NUMCLIENT; i++) {
+ //cerr << "client" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)));
+ start++;
+ }
+
+
+ // start message loop
+ fakemessenger_startthread();
+
+ // init: monitors first, then MDSs (plus their local OSDs, if any), then OSDs
+ for (int i=0; i<g_conf.num_mon; i++) {
+ mon[i]->init();
+ }
+ for (int i=0; i<NUMMDS; i++) {
+ mds[i]->init();
+ if (g_conf.mds_local_osd)
+ mdsosd[i]->init();
+ }
+
+ for (int i=0; i<NUMOSD; i++) {
+ osd[i]->init();
+ }
+
+
+ // init and mount each client, then start its synthetic workload thread
+ for (int i=0; i<NUMCLIENT; i++) {
+ client[i]->init();
+
+ // use my argc, argv (make sure you pass a mount point!)
+ //cout << "mounting" << endl;
+ client[i]->mount();
+
+ //cout << "starting synthetic client " << endl;
+ syn[i] = new SyntheticClient(client[i]);
+
+ syn[i]->start_thread();
+ }
+
+
+ // join the workload threads in client order, unmounting each client as it finishes
+ for (int i=0; i<NUMCLIENT; i++) {
+
+ cout << "waiting for synthetic client " << i << " to finish" << endl;
+ syn[i]->join_thread();
+ delete syn[i];
+
+ client[i]->unmount();
+ //cout << "unmounted" << endl;
+ client[i]->shutdown();
+ }
+
+
+ // wait for it to finish
+ fakemessenger_wait();
+
+ // cleanup
+ // NOTE(review): mon[] and mdsosd[] are not deleted here.
+ for (int i=0; i<NUMMDS; i++) {
+ delete mds[i];
+ }
+ for (int i=0; i<NUMOSD; i++) {
+ delete osd[i];
+ }
+ for (int i=0; i<NUMCLIENT; i++) {
+ delete client[i];
+ }
+ delete mdc;
+
+ cout << "fakesyn done" << endl;
+ return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+
+#include "client/SyntheticClient.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Minimal Context used for testing: finishing it just logs the result code.
+class C_Test : public Context {
+public:
+  // Prints "C_Test->finish(<r>)" to cout, followed by endl.
+  void finish(int r) {
+    cout << "C_Test->finish(";
+    cout << r;
+    cout << ")" << endl;
+  }
+};
+
+
+/*
+ * fakesyn entry point (MonMap variant): builds monitors, MDSs, OSDs and
+ * clients that share one MonMap and talk through FakeMessenger instances,
+ * runs one SyntheticClient per client, then waits for the fake messenger
+ * to stop and tears down.
+ *
+ * Returns 0 on normal completion.
+ */
+int main(int argc, char **argv)
+{
+ cerr << "fakesyn start" << endl;
+
+ //cerr << "inode_t " << sizeof(inode_t) << endl;
+
+ vector<char*> args;
+ argv_to_vec(argc, argv, args);
+
+ parse_config_options(args);
+
+ // NOTE(review): 'start' is incremented below but never read in this function.
+ int start = 0;
+
+ parse_syn_options(args);
+
+ vector<char*> nargs;
+
+ // Whatever is still in args after the two parsers ran is reported as stray.
+ for (unsigned i=0; i<args.size(); i++) {
+ // unknown arg, pass it on.
+ cerr << " stray arg " << args[i] << endl;
+ nargs.push_back(args[i]);
+ }
+ // every leftover arg was pushed onto nargs, so any stray arg trips this assert.
+ assert(nargs.empty());
+
+
+ g_clock.tare();
+
+ MonMap *monmap = new MonMap(g_conf.num_mon);
+ monmap->mon_inst[0].rank = 0; // hack ; see FakeMessenger.cc
+
+ // NOTE(review): hostname is fetched but only referenced by commented-out logging.
+ char hostname[100];
+ gethostname(hostname,100);
+ //int pid = getpid();
+
+ // create mon
+ Monitor *mon[g_conf.num_mon];
+ for (int i=0; i<g_conf.num_mon; i++) {
+ mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)), monmap);
+ }
+
+ // create mds
+ MDS *mds[NUMMDS];
+ // mdsosd[i] is only assigned (and later init'ed) when g_conf.mds_local_osd is set.
+ OSD *mdsosd[NUMMDS];
+ for (int i=0; i<NUMMDS; i++) {
+ //cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ // every MDS is constructed with -1 and MSG_ADDR_MDS_NEW rather than a per-index
+ // address -- presumably its identity is assigned later; TODO confirm.
+ mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS_NEW), monmap);
+ if (g_conf.mds_local_osd)
+ mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)), monmap);
+ start++;
+ }
+
+ // create osd
+ OSD *osd[NUMOSD];
+ for (int i=0; i<NUMOSD; i++) {
+ //cerr << "osd" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)), monmap);
+ start++;
+ }
+
+ // create client
+ Client *client[NUMCLIENT];
+ SyntheticClient *syn[NUMCLIENT];
+ for (int i=0; i<NUMCLIENT; i++) {
+ //cerr << "client" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
+ client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)), monmap);
+ start++;
+ }
+
+
+ // start message loop
+ fakemessenger_startthread();
+
+ // init: monitors first, then MDSs (plus their local OSDs, if any), then OSDs
+ for (int i=0; i<g_conf.num_mon; i++) {
+ mon[i]->init();
+ }
+ for (int i=0; i<NUMMDS; i++) {
+ mds[i]->init();
+ if (g_conf.mds_local_osd)
+ mdsosd[i]->init();
+ }
+
+ for (int i=0; i<NUMOSD; i++) {
+ osd[i]->init();
+ }
+
+
+ // init and mount each client, then start its synthetic workload thread
+ for (int i=0; i<NUMCLIENT; i++) {
+ client[i]->init();
+
+ // use my argc, argv (make sure you pass a mount point!)
+ //cout << "mounting" << endl;
+ client[i]->mount();
+
+ //cout << "starting synthetic client " << endl;
+ syn[i] = new SyntheticClient(client[i]);
+
+ syn[i]->start_thread();
+ }
+
+
+ // join the workload threads in client order, unmounting each client as it finishes
+ for (int i=0; i<NUMCLIENT; i++) {
+
+ cout << "waiting for synthetic client " << i << " to finish" << endl;
+ syn[i]->join_thread();
+ delete syn[i];
+
+ client[i]->unmount();
+ //cout << "unmounted" << endl;
+ client[i]->shutdown();
+ }
+
+
+ // wait for it to finish
+ fakemessenger_wait();
+
+ // cleanup
+ // NOTE(review): mon[], mdsosd[] and monmap are not deleted here.
+ for (int i=0; i<NUMMDS; i++) {
+ delete mds[i];
+ }
+ for (int i=0; i<NUMOSD; i++) {
+ delete osd[i];
+ }
+ for (int i=0; i<NUMCLIENT; i++) {
+ delete client[i];
+ }
+
+ cout << "fakesyn done" << endl;
+ return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __CONTEXT_H
+#define __CONTEXT_H
+
+#include "config.h"
+
+#include <assert.h>
+#include <list>
+#include <set>
+
+#include <iostream>
+
+
+/*
+ * Context - abstract callback class
+ */
+class Context {
+public:
+  // Virtual so that deleting a derived callback through a Context* runs
+  // the derived destructor.
+  virtual ~Context() {}
+
+  // The callback itself; subclasses implement it.  r is whatever result
+  // code the invoker chooses to pass.
+  virtual void finish(int r) = 0;
+};
+
+
+/*
+ * finish and destroy a list of Contexts
+ *
+ * Each Context in 'finished' has finish(result) called on it and is then
+ * deleted.  'finished' is emptied before any callback runs, so the caller
+ * is never left holding pointers to deleted objects, and a callback that
+ * appends to the same list does not disturb the iteration (anything it
+ * appends stays queued in 'finished' for a later call).
+ *
+ *  finished - contexts to complete; empty on return unless a callback
+ *             re-queued something
+ *  result   - value passed to every finish() call (default 0)
+ */
+inline void finish_contexts(std::list<Context*>& finished,
+                            int result = 0)
+{
+  using std::cout;
+  using std::endl;
+
+  if (finished.empty()) return;
+
+  // steal the contents: we iterate (and delete from) a private list.
+  std::list<Context*> ls;
+  ls.swap(finished);
+
+  dout(10) << ls.size() << " contexts to finish with " << result << endl;
+  for (std::list<Context*>::iterator it = ls.begin();
+       it != ls.end();
+       ++it) {
+    Context *c = *it;
+    dout(10) << "---- " << c << endl;
+    c->finish(result);
+    delete c;
+  }
+}
+
+/*
+ * C_Contexts - a Context that holds a list of other Contexts and, when
+ * finished, hands the whole list to finish_contexts().
+ */
+class C_Contexts : public Context {
+  std::list<Context*> clist;
+
+public:
+  // queue one more context at the tail.
+  void add(Context* c) {
+    clist.insert(clist.end(), c);
+  }
+
+  // move every element of ls onto our tail (ls is spliced, not copied).
+  void take(std::list<Context*>& ls) {
+    clist.splice(clist.end(), ls);
+  }
+
+  // forward r to every queued context.
+  void finish(int r) {
+    finish_contexts(clist, r);
+  }
+};
+
+
+/*
+ * C_Gather
+ *
+ * Hands out numbered sub-contexts via new_sub(); once every outstanding
+ * sub-context has finished, calls onfinish->finish(0) and deletes onfinish.
+ *
+ * BUG: does not report errors (the result passed to a sub-context is
+ * dropped; onfinish always receives 0).
+ * NOTE(review): nothing in this class deletes the C_Gather itself.
+ */
+class C_Gather : public Context {
+public:
+  // One outstanding piece of the gather.  Finishing it reports its own
+  // number (not the result code) back to the parent.
+  class C_GatherSub : public Context {
+    C_Gather *gather;
+    int num;
+  public:
+    C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {}
+    void finish(int r) {
+      gather->finish(num);
+    }
+  };
+
+private:
+  Context *onfinish;       // completed when waitfor drains
+  std::set<int> waitfor;   // numbers of subs that have not finished yet
+  int num;                 // last sub number handed out
+
+public:
+  C_Gather(Context *f) : onfinish(f), num(0) {}
+
+  // r is the number of the sub-context that just finished.
+  void finish(int r) {
+    assert(waitfor.count(r) > 0);
+    waitfor.erase(r);
+    if (!waitfor.empty())
+      return;              // still waiting on others
+    onfinish->finish(0);
+    delete onfinish;
+  }
+
+  // Allocate the next sub-context; the caller is expected to finish it.
+  Context *new_sub() {
+    const int id = ++num;
+    waitfor.insert(id);
+    return new C_GatherSub(this, id);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __DISTRIBUTION_H
+#define __DISTRIBUTION_H
+
+#include <cassert>
+#include <vector>
+using namespace std;
+
+/*
+ * Distribution - a discrete distribution over int values.
+ *
+ * Entry i has value v[i] and weight p[i].  sample() treats the weights as
+ * probabilities, so call normalize() (or random()) after add()ing raw
+ * weights.
+ */
+class Distribution {
+  std::vector<float> p;   // weight / probability of each entry
+  std::vector<int> v;     // value of each entry (parallel to p)
+
+ public:
+  // number of entries.
+  unsigned get_width() const {
+    return p.size();
+  }
+
+  // drop all entries.
+  void clear() {
+    p.clear();
+    v.clear();
+  }
+
+  // append an entry with value val and weight pr.
+  void add(int val, float pr) {
+    p.push_back(pr);
+    v.push_back(val);
+  }
+
+  // replace every weight with a random one and rescale so they sum to 1.
+  void random() {
+    float sum = 0.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      p[i] = (float)(rand() % 10000);
+      sum += p[i];
+    }
+    if (sum == 0.0) return;   // no entries, or every draw was 0: avoid 0/0
+    for (unsigned i=0; i<p.size(); i++)
+      p[i] /= sum;
+  }
+
+  // draw one value according to the weights.  requires at least one entry.
+  int sample() {
+    assert(!p.empty());
+    float s = (float)(rand() % 10000) / 10000.0;
+    for (unsigned i=0; i<p.size(); i++) {
+      if (s < p[i]) return v[i];
+      s -= p[i];
+    }
+    // the weights summed to slightly less than s (float rounding, or an
+    // un-normalized distribution): fall back to the last entry rather
+    // than aborting.
+    return v[p.size() - 1];
+  }
+
+  // rescale the weights so they sum to 1; returns the sum before rescaling.
+  // a zero sum leaves the weights untouched.
+  float normalize() {
+    float s = 0.0;
+    for (unsigned i=0; i<p.size(); i++)
+      s += p[i];
+    if (s != 0.0) {
+      for (unsigned i=0; i<p.size(); i++)
+        p[i] /= s;
+    }
+    return s;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __BUFFER_H
+#define __BUFFER_H
+
+#include "common/Mutex.h"
+
+#include <iostream>
+#include <list>
+
+using std::cout;
+using std::endl;
+
+#ifndef __CYGWIN__
+# include <sys/mman.h>
+#endif
+
+#define BUFFER_PAGE_SIZE 4096 // fixme.
+
+// <hack>
+// these are in config.o
+extern Mutex bufferlock;
+extern long buffer_total_alloc;
+// </hack>
+
+class buffer {
+private:
+
+ /* hack for memory utilization debugging. */
+ static void inc_total_alloc(unsigned len) {
+   // add len to the global byte counter while holding bufferlock.
+   bufferlock.Lock();
+   buffer_total_alloc = buffer_total_alloc + len;
+   bufferlock.Unlock();
+ }
+ static void dec_total_alloc(unsigned len) {
+   // subtract len from the global byte counter while holding bufferlock.
+   bufferlock.Lock();
+   buffer_total_alloc = buffer_total_alloc - len;
+   bufferlock.Unlock();
+ }
+
+ /*
+  * an abstract raw buffer.  with a reference count.
+  *
+  * Subclasses decide how 'data' is allocated and released.  nref is
+  * maintained by buffer::ptr; 'lock' guards it once the raw is shared.
+  */
+ class raw {
+ public:
+   char *data;      // start of the bytes (set by the subclass ctor)
+   unsigned len;    // number of bytes at data
+   int nref;        // number of ptrs referencing us
+   Mutex lock;      // we'll make it non-recursive.
+
+   // data starts out null so a subclass whose allocation fails never
+   // leaves an indeterminate pointer behind for its destructor.
+   raw(unsigned l) : data(0), len(l), nref(0), lock(false) {}
+   raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {}
+   virtual ~raw() {}
+
+   // no copying.  (declared, intentionally never defined.)
+   raw(const raw &other);
+   const raw& operator=(const raw &other);
+
+   // a new raw of the same length; contents are not copied.
+   virtual raw* clone_empty() = 0;
+
+   // a new raw of the same length holding a copy of our bytes.
+   raw *clone() {
+     raw *c = clone_empty();
+     memcpy(c->data, data, len);
+     return c;
+   }
+ };
+
+ friend std::ostream& operator<<(std::ostream& out, const raw &r);
+
+ /*
+ * primitive buffer types
+ */
+ class raw_char : public raw {
+ public:
+   // l bytes from new char[]; counted in buffer_total_alloc.
+   raw_char(unsigned l) : raw(l) {
+     this->data = new char[this->len];
+     inc_total_alloc(this->len);
+   }
+   // release the array and un-count it.
+   ~raw_char() {
+     delete[] this->data;
+     dec_total_alloc(this->len);
+   }
+   // same-sized, uninitialized raw_char.
+   raw* clone_empty() {
+     raw_char *fresh = new raw_char(len);
+     return fresh;
+   }
+ };
+
+ class raw_static : public raw {
+ public:
+   // wraps caller-provided memory; the destructor does not free it.
+   raw_static(const char *d, unsigned l) : raw(const_cast<char*>(d), l) { }
+   ~raw_static() {}
+   // clones go to a heap-backed raw_char of the same length.
+   raw* clone_empty() {
+     raw_char *fresh = new raw_char(len);
+     return fresh;
+   }
+ };
+
+#ifndef __CYGWIN__
+ class raw_mmap_pages : public raw {
+ public:
+   // l bytes from an anonymous private read/write mapping.
+   // NOTE(review): the mmap() result is not checked against MAP_FAILED.
+   raw_mmap_pages(unsigned l) : raw(l) {
+     void *mapping = ::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
+     data = (char*)mapping;
+     inc_total_alloc(len);
+   }
+   // unmap and un-count.
+   ~raw_mmap_pages() {
+     ::munmap(data, len);
+     dec_total_alloc(len);
+   }
+   // same-sized fresh mapping.
+   raw* clone_empty() {
+     raw_mmap_pages *fresh = new raw_mmap_pages(len);
+     return fresh;
+   }
+ };
+
+ /*
+  * raw buffer aligned to BUFFER_PAGE_SIZE, obtained from posix_memalign()
+  * (valloc() on DARWIN) and released with free().
+  */
+ class raw_posix_aligned : public raw {
+ public:
+   raw_posix_aligned(unsigned l) : raw(l) {
+     // start from null: if the allocation fails, the destructor's free()
+     // then sees a null pointer instead of an indeterminate one.
+     data = 0;
+#ifdef DARWIN
+     data = (char *) valloc (len);
+#else
+     if (::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len) != 0)
+       data = 0;
+#endif /* DARWIN */
+     inc_total_alloc(len);
+   }
+   ~raw_posix_aligned() {
+     ::free((void*)data);
+     dec_total_alloc(len);
+   }
+   // same-sized fresh aligned buffer.
+   raw* clone_empty() {
+     return new raw_posix_aligned(len);
+   }
+ };
+#endif
+
+#ifdef __CYGWIN__
+ class raw_hack_aligned : public raw {
+   char *realdata;   // what new[] returned; data points somewhere inside it
+ public:
+   // over-allocate by 4095 bytes and round data up to the next 4096 boundary.
+   raw_hack_aligned(unsigned l) : raw(l) {
+     realdata = new char[len+4095];
+     const unsigned misalign = ((unsigned)realdata) % 4096;
+     data = misalign ? (realdata + 4096 - misalign) : realdata;
+     inc_total_alloc(len+4095);
+     assert(((unsigned)data & 4095) == 0);
+   }
+   // free the real allocation and un-count the padded size.
+   ~raw_hack_aligned() {
+     delete[] realdata;
+     dec_total_alloc(len+4095);
+   }
+   // same-sized fresh aligned buffer.
+   raw* clone_empty() {
+     raw_hack_aligned *fresh = new raw_hack_aligned(len);
+     return fresh;
+   }
+ };
+#endif
+
+public:
+
+ /*
+ * named constructors
+ */
+
+ static raw* copy(const char *c, unsigned len) {
+   // new heap-backed raw holding a copy of the len bytes at c.
+   raw_char *dup = new raw_char(len);
+   memcpy(dup->data, c, len);
+   return dup;
+ }
+ static raw* create(unsigned len) {
+   // new heap-backed raw of len bytes; contents are not initialized here.
+   raw_char *fresh = new raw_char(len);
+   return fresh;
+ }
+
+ static raw* create_page_aligned(unsigned len) {
+   // mmap-backed everywhere except Cygwin, which uses the manual-alignment hack.
+   raw *fresh;
+#ifndef __CYGWIN__
+   fresh = new raw_mmap_pages(len);
+#else
+   fresh = new raw_hack_aligned(len);
+#endif
+   return fresh;
+ }
+
+
+ /*
+ * a buffer pointer. references (a subsequence of) a raw buffer.
+ *
+ * Holds one reference on the raw it points at (if any); the raw is
+ * deleted when the last ptr releases it. _off/_len select the window
+ * [_off, _off+_len) inside the raw.
+ */
+ class ptr {
+ raw *_raw;
+ unsigned _off, _len;
+
+ public:
+ // null ptr: references nothing.
+ ptr() : _raw(0), _off(0), _len(0) {}
+ // reference all of r. r must be non-null (r->len is read).
+ ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw.
+ ++r->nref;
+ }
+ // allocate a fresh raw of l bytes via create() and reference all of it.
+ ptr(unsigned l) : _off(0), _len(l) {
+ _raw = create(l);
+ ++_raw->nref;
+ }
+ // copy l bytes from d into a fresh raw via copy() and reference all of it.
+ ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto.
+ _raw = copy(d, l);
+ ++_raw->nref;
+ }
+ // share p's raw (same window); the refcount is bumped under the raw's lock.
+ ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) {
+ if (_raw) {
+ _raw->lock.Lock();
+ ++_raw->nref;
+ _raw->lock.Unlock();
+ }
+ }
+ // share p's raw, restricted to l bytes starting o bytes into p's window.
+ ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) {
+ assert(o+l <= p._len);
+ assert(_raw);
+ _raw->lock.Lock();
+ ++_raw->nref;
+ _raw->lock.Unlock();
+ }
+ ptr& operator= (const ptr& p) {
+ // be careful -- we need to properly handle self-assignment.
+ // (taking the new reference before dropping the old one keeps the
+ // raw alive when p is *this.)
+ if (p._raw) {
+ p._raw->lock.Lock();
+ ++p._raw->nref; // inc new
+ p._raw->lock.Unlock();
+ }
+ release(); // dec (+ dealloc) old (if any)
+ _raw = p._raw; // change my ref
+ _off = p._off;
+ _len = p._len;
+ return *this;
+ }
+ ~ptr() {
+ release();
+ }
+
+ // drop our reference (if any) and become null; deletes the raw when
+ // the count reaches zero. _off and _len are left as they were.
+ void release() {
+ if (_raw) {
+ _raw->lock.Lock();
+ if (--_raw->nref == 0) {
+ //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl;
+ // unlock before delete: the mutex is a member of the raw being destroyed.
+ _raw->lock.Unlock();
+ delete _raw; // dealloc old (if any)
+ } else
+ _raw->lock.Unlock();
+ _raw = 0;
+ }
+ }
+
+ // misc
+ bool at_buffer_head() const { return _off == 0; }
+ // NOTE: dereferences _raw without checking it; do not call on a null ptr.
+ bool at_buffer_tail() const { return _off + _len == _raw->len; }
+
+ // accessors
+ // c_str(): pointer to the first byte of our window (not NUL-terminated by this class).
+ const char *c_str() const { assert(_raw); return _raw->data + _off; }
+ char *c_str() { assert(_raw); return _raw->data + _off; }
+ unsigned length() const { return _len; }
+ unsigned offset() const { return _off; }
+ // bytes in the raw after the end of our window. dereferences _raw unchecked.
+ unsigned unused_tail_length() const { return _raw->len - (_off+_len); }
+ // byte n of our window.
+ const char& operator[](unsigned n) const {
+ assert(_raw);
+ assert(n < _len);
+ return _raw->data[_off + n];
+ }
+ char& operator[](unsigned n) {
+ assert(_raw);
+ assert(n < _len);
+ return _raw->data[_off + n];
+ }
+
+ // views of the underlying raw, ignoring our window.
+ const char *raw_c_str() const { assert(_raw); return _raw->data; }
+ unsigned raw_length() const { assert(_raw); return _raw->len; }
+ int raw_nref() const { assert(_raw); return _raw->nref; }
+
+ // copy l bytes starting o bytes into our window out to dest.
+ void copy_out(unsigned o, unsigned l, char *dest) const {
+ assert(_raw);
+ assert(o >= 0 && o <= _len);
+ assert(l >= 0 && o+l <= _len);
+ memcpy(dest, c_str()+o, l);
+ }
+
+ // bytes of the raw that lie outside our window.
+ unsigned wasted() {
+ assert(_raw);
+ return _raw->len - _len;
+ }
+
+ // modifiers
+ // these only move the window; no bounds check against the raw is made.
+ void set_offset(unsigned o) { _off = o; }
+ void set_length(unsigned l) { _len = l; }
+
+ // write l bytes from p just past the end of our window and grow the
+ // window to cover them; there must be that much unused tail in the raw.
+ void append(const char *p, unsigned l) {
+ assert(_raw);
+ assert(l <= unused_tail_length());
+ memcpy(c_str() + _len, p, l);
+ _len += l;
+ }
+
+ // overwrite l bytes starting o bytes into our window with bytes from src.
+ void copy_in(unsigned o, unsigned l, const char *src) {
+ assert(_raw);
+ assert(o >= 0 && o <= _len);
+ assert(l >= 0 && o+l <= _len);
+ memcpy(c_str()+o, src, l);
+ }
+
+ // zero every byte of our window.
+ void zero() {
+ memset(c_str(), 0, _len);
+ }
+
+ // currently a no-op (body is commented out).
+ void clean() {
+ //raw *newraw = _raw->makesib(_len);
+ }
+ };
+
+ friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp);
+
+ /*
+  * list - the useful bit!
+  *
+  * An ordered sequence of buffer::ptr segments treated as one logical
+  * byte string.  _len caches the sum of the segment lengths.
+  */
+ class list {
+   // my private bits
+   std::list<ptr> _buffers;
+   unsigned _len;
+
+ public:
+   // cons/des
+   list() : _len(0) {}
+   list(const list& other) : _buffers(other._buffers), _len(other._len) { }
+   // one freshly allocated segment of l bytes.
+   list(unsigned l) : _len(0) {
+     ptr bp(l);
+     push_back(bp);
+   }
+   ~list() {}
+
+   list& operator= (const list& other) {
+     _buffers = other._buffers;
+     _len = other._len;
+     return *this;
+   }
+
+   const std::list<ptr>& buffers() const { return _buffers; }
+
+   // total number of bytes.
+   unsigned length() const {
+#if 0
+     // DEBUG: verify _len
+     unsigned len = 0;
+     for (std::list<ptr>::iterator it = _buffers.begin();
+          it != _buffers.end();
+          it++) {
+       len += (*it).length();
+     }
+     assert(len == _len);
+#endif
+     return _len;
+   }
+
+
+   // modifiers
+   void clear() {
+     _buffers.clear();
+     _len = 0;
+   }
+   void push_front(ptr& bp) {
+     _buffers.push_front(bp);
+     _len += bp.length();
+   }
+   void push_front(raw *r) {
+     ptr bp(r);
+     _buffers.push_front(bp);
+     _len += bp.length();
+   }
+   void push_back(ptr& bp) {
+     _buffers.push_back(bp);
+     _len += bp.length();
+   }
+   void push_back(raw *r) {
+     ptr bp(r);
+     _buffers.push_back(bp);
+     _len += bp.length();
+   }
+   // zero every byte of every segment.
+   void zero() {
+     for (std::list<ptr>::iterator it = _buffers.begin();
+          it != _buffers.end();
+          it++)
+       it->zero();
+   }
+
+   // sort-of-like-assignment-op: drop my contents, then take bl's (bl ends up empty).
+   void claim(list& bl) {
+     // free my buffers
+     clear();
+     claim_append(bl);
+   }
+   // move bl's segments onto my tail (bl ends up empty).
+   void claim_append(list& bl) {
+     // steal the other guy's buffers
+     _len += bl._len;
+     _buffers.splice( _buffers.end(), bl._buffers );
+     bl._len = 0;
+   }
+
+   // crope lookalikes
+
+   // copy len bytes starting at byte off out to dest.
+   void copy(unsigned off, unsigned len, char *dest) {
+     assert(off + len <= length());
+
+     // advance to off
+     std::list<ptr>::iterator curbuf = _buffers.begin();
+
+     // skip off
+     while (off > 0) {
+       assert(curbuf != _buffers.end());
+       if (off >= (*curbuf).length()) {
+         // skip this buffer
+         off -= (*curbuf).length();
+         curbuf++;
+       } else {
+         // somewhere in this buffer!
+         break;
+       }
+     }
+
+     // copy
+     while (len > 0) {
+       // is the rest ALL in this buffer?
+       if (off + len <= (*curbuf).length()) {
+         (*curbuf).copy_out(off, len, dest);        // yup, last bit!
+         break;
+       }
+
+       // get as much as we can from this buffer.
+       unsigned howmuch = (*curbuf).length() - off;
+       (*curbuf).copy_out(off, howmuch, dest);
+
+       dest += howmuch;
+       len -= howmuch;
+       off = 0;
+       curbuf++;
+       assert(curbuf != _buffers.end());
+     }
+   }
+
+   // overwrite len bytes starting at byte off with bytes from src.
+   void copy_in(unsigned off, unsigned len, const char *src) {
+     assert(off + len <= length());
+
+     // advance to off
+     std::list<ptr>::iterator curbuf = _buffers.begin();
+
+     // skip off
+     while (off > 0) {
+       assert(curbuf != _buffers.end());
+       if (off >= (*curbuf).length()) {
+         // skip this buffer
+         off -= (*curbuf).length();
+         curbuf++;
+       } else {
+         // somewhere in this buffer!
+         break;
+       }
+     }
+
+     // copy
+     while (len > 0) {
+       // is the rest ALL in this buffer?
+       if (off + len <= (*curbuf).length()) {
+         (*curbuf).copy_in(off, len, src);        // yup, last bit!
+         break;
+       }
+
+       // get as much as we can from this buffer.
+       unsigned howmuch = (*curbuf).length() - off;
+       (*curbuf).copy_in(off, howmuch, src);
+
+       src += howmuch;
+       len -= howmuch;
+       off = 0;
+       curbuf++;
+       assert(curbuf != _buffers.end());
+     }
+   }
+   // overwrite up to len bytes starting at byte off with the leading bytes of bl.
+   void copy_in(unsigned off, unsigned len, const list& bl) {
+     unsigned left = len;
+     for (std::list<ptr>::const_iterator i = bl._buffers.begin();
+          i != bl._buffers.end();
+          i++) {
+       unsigned l = (*i).length();
+       if (left < l) l = left;
+       copy_in(off, l, (*i).c_str());
+       left -= l;
+       if (left == 0) break;
+       off += l;
+     }
+   }
+
+
+   // append a copy of the len bytes at data.
+   void append(const char *data, unsigned len) {
+     if (len == 0) return;
+
+     unsigned alen = 0;
+
+     // copy into the tail buffer?
+     if (!_buffers.empty()) {
+       unsigned avail = _buffers.back().unused_tail_length();
+       if (avail > 0) {
+         if (avail > len)
+           avail = len;
+         _buffers.back().append(data, avail);
+         _len += avail;
+         data += avail;
+         len -= avail;
+       }
+       alen = _buffers.back().length();
+     }
+     if (len == 0) return;
+
+     // just add another buffer.
+     // alloc a bit extra, in case we do a bunch of appends.   FIXME be smarter!
+     if (alen < 4096) alen = 4096;
+     // ...but never less than what we are about to copy in, or the
+     // copy_in below would write past the end of the new raw.
+     if (alen < len) alen = len;
+     ptr bp = create(alen);
+     bp.set_length(len);
+     bp.copy_in(0, len, data);
+     push_back(bp);
+   }
+   // append a reference to bp (no bytes are copied).
+   void append(ptr& bp) {
+     push_back(bp);
+   }
+   // append a reference to len bytes of bp starting at off.
+   void append(ptr& bp, unsigned off, unsigned len) {
+     assert(len+off <= bp.length());
+     ptr tempbp(bp, off, len);
+     push_back(tempbp);
+   }
+   // append references to all of bl's segments; bl is left untouched.
+   void append(const list& bl) {
+     list temp(bl);         // copy list
+     claim_append(temp);    // and append
+   }
+
+
+   /*
+    * get a char
+    */
+   const char& operator[](unsigned n) {
+     assert(n < _len);
+     // walk segments until n falls inside one.  n < _len guarantees we
+     // find it before running out of segments.
+     std::list<ptr>::iterator p = _buffers.begin();
+     while (n >= p->length()) {
+       n -= p->length();
+       ++p;
+       assert(p != _buffers.end());
+     }
+     return (*p)[n];
+   }
+
+   /*
+    * return a contiguous ptr to whole bufferlist contents.
+    * (with more than one segment this rebuilds the list as a single
+    * freshly allocated segment; with none it returns 0.)
+    */
+   char *c_str() {
+     if (_buffers.size() == 1) {
+       return _buffers.front().c_str();  // good, we're already contiguous.
+     }
+     else if (_buffers.size() == 0) {
+       return 0;                         // no buffers
+     }
+     else {
+       ptr newbuf = create(length());       // make one new contiguous buffer.
+       copy(0, length(), newbuf.c_str());   // copy myself into it.
+       clear();
+       push_back(newbuf);
+       return newbuf.c_str();               // now it'll work.
+     }
+   }
+
+   // become a view (shared references, no copy) of len bytes of other
+   // starting at off.
+   void substr_of(list& other, unsigned off, unsigned len) {
+     assert(off + len <= other.length());
+     clear();
+
+     // skip off
+     std::list<ptr>::iterator curbuf = other._buffers.begin();
+     while (off > 0) {
+       // curbuf walks other's segments, so compare against other's end.
+       assert(curbuf != other._buffers.end());
+       if (off >= (*curbuf).length()) {
+         // skip this buffer
+         off -= (*curbuf).length();
+         curbuf++;
+       } else {
+         // somewhere in this buffer!
+         break;
+       }
+     }
+
+     while (len > 0) {
+       assert(curbuf != other._buffers.end());
+
+       // partial?
+       if (off + len < (*curbuf).length()) {
+         _buffers.push_back( ptr( *curbuf, off, len ) );
+         _len += len;
+         break;
+       }
+
+       // through end
+       unsigned howmuch = (*curbuf).length() - off;
+       _buffers.push_back( ptr( *curbuf, off, howmuch ) );
+       _len += howmuch;
+       len -= howmuch;
+       off = 0;
+       curbuf++;
+     }
+   }
+
+
+   // funky modifier: remove len bytes starting at off.  if claim_by is
+   // given, references to the removed bytes are appended to it.
+   void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+     assert(off < length());
+     assert(len > 0);
+     assert(off + len <= length());   // otherwise the loop below walks off the end
+
+     // skip off
+     std::list<ptr>::iterator curbuf = _buffers.begin();
+     while (off > 0) {
+       assert(curbuf != _buffers.end());
+       if (off >= (*curbuf).length()) {
+         // skip this buffer
+         off -= (*curbuf).length();
+         curbuf++;
+       } else {
+         // somewhere in this buffer!
+         break;
+       }
+     }
+
+     if (off) {
+       // add a reference to the front bit
+       //  insert it before curbuf (which we'll hose)
+       _buffers.insert( curbuf, ptr( *curbuf, 0, off ) );
+       _len += off;   // counted twice for now; subtracted again below
+     }
+
+     while (len > 0) {
+       assert(curbuf != _buffers.end());
+
+       // partial?
+       if (off + len < (*curbuf).length()) {
+         if (claim_by)
+           claim_by->append( *curbuf, off, len );
+         (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning big
+         (*curbuf).set_length( (*curbuf).length() - (len+off) );
+         _len -= off+len;
+         break;
+       }
+
+       // hose though the end
+       unsigned howmuch = (*curbuf).length() - off;
+       if (claim_by)
+         claim_by->append( *curbuf, off, howmuch );
+       _len -= (*curbuf).length();
+       _buffers.erase( curbuf++ );
+       len -= howmuch;
+       off = 0;
+     }
+
+     // splice in *replace (implement me later?)
+   }
+
+ };
+
+};
+
+typedef buffer::ptr bufferptr;
+typedef buffer::list bufferlist;
+
+
+// Byte-wise lexicographic "greater than": at the first differing byte the
+// larger char wins; if one list is a prefix of the other, the longer one
+// is greater.
+inline bool operator>(bufferlist& l, bufferlist& r) {
+  // p advances exactly one byte per iteration (the loop header does it).
+  for (unsigned p = 0; ; p++) {
+    if (l.length() > p && r.length() == p) return true;   // r ran out first
+    if (l.length() == p) return false;                    // l ran out (maybe both did)
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+// Byte-wise lexicographic "greater than or equal": at the first differing
+// byte the larger char wins; if r is a prefix of l (or they are equal)
+// the answer is true; if l is a strict prefix of r it is false.
+inline bool operator>=(bufferlist& l, bufferlist& r) {
+  // p advances exactly one byte per iteration (the loop header does it).
+  for (unsigned p = 0; ; p++) {
+    if (r.length() == p) return true;    // r ran out: l is equal or longer
+    if (l.length() == p) return false;   // l ran out first: l is a strict prefix
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+// l < r is defined as r > l.
+inline bool operator<(bufferlist& l, bufferlist& r) {
+  const bool r_is_greater = (r > l);
+  return r_is_greater;
+}
+// l <= r is defined as r >= l.
+inline bool operator<=(bufferlist& l, bufferlist& r) {
+  const bool r_is_at_least = (r >= l);
+  return r_is_at_least;
+}
+
+
+// Debug print: data address, length and reference count of a raw buffer.
+inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {
+  out << "buffer::raw(" << (void*)r.data;
+  out << " len " << r.len;
+  out << " nref " << r.nref << ")";
+  return out;
+}
+
+// Debug print: "offset~length", the window address, and the address,
+// length and refcount of the underlying raw.  bp must not be null (the
+// accessors used here assert on that).
+inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+  out << "buffer::ptr(" << bp.offset() << "~" << bp.length();
+  out << " " << (void*)bp.c_str();
+  out << " in raw " << (void*)bp.raw_c_str();
+  out << " len " << bp.raw_length();
+  out << " nref " << bp.raw_nref() << ")";
+  return out;
+}
+
+// Debug print: total length, then one tab-indented line per segment,
+// separated by commas, closed with ")" on its own line.
+inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+  out << "buffer::list(len=" << bl.length() << "," << std::endl;
+
+  const std::list<buffer::ptr>& segs = bl.buffers();
+  for (std::list<buffer::ptr>::const_iterator seg = segs.begin(); seg != segs.end(); ) {
+    out << "\t" << *seg;
+    ++seg;
+    if (seg != segs.end())
+      out << "," << std::endl;   // separator only between segments
+  }
+  out << std::endl << ")";
+  return out;
+}
+
+
+
+
+// encoder/decode helpers
+
+// string
+// Append s to bl as its bytes followed by the terminating NUL.
+inline void _encode(const std::string& s, bufferlist& bl)
+{
+  const char *bytes = s.c_str();
+  bl.append(bytes, s.length()+1);
+}
+// Read a NUL-terminated string from bl at byte offset off and advance off
+// past the terminator.  Uses bl.c_str(), which may rebuild bl as one
+// contiguous segment.
+inline void _decode(std::string& s, bufferlist& bl, int& off)
+{
+  const char *start = bl.c_str() + off;
+  s = start;
+  off += s.length() + 1;
+}
+
+// bufferptr (encapsulated)
+// Append bp to bl as a size_t byte count followed by a reference to bp
+// itself (the payload is not copied).
+inline void _encode(bufferptr& bp, bufferlist& bl)
+{
+  const size_t nbytes = bp.length();
+  bl.append((const char*)&nbytes, sizeof(nbytes));
+  bl.append(bp);
+}
+// Read a size_t byte count and that many payload bytes from bl at off,
+// advancing off past both.  If the payload lies in a single segment of bl,
+// bp shares it; otherwise bp gets a fresh contiguous copy.
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  bufferlist payload;
+  payload.substr_of(bl, off, nbytes);
+  off += nbytes;
+
+  if (payload.buffers().size() != 1)
+    bp = buffer::copy(payload.c_str(), payload.length());
+  else
+    bp = payload.buffers().front();
+}
+
+// bufferlist (encapsulated)
+// Append s to bl as a size_t byte count followed by references to s's
+// segments (the payload is not copied).
+inline void _encode(const bufferlist& s, bufferlist& bl)
+{
+  const size_t nbytes = s.length();
+  bl.append((const char*)&nbytes, sizeof(nbytes));
+  bl.append(s);
+}
+// Read a size_t byte count from bl at off, then make s a view of that many
+// following bytes of bl; off is advanced past both.
+inline void _decode(bufferlist& s, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  s.substr_of(bl, off, nbytes);
+  off += nbytes;
+}
+
+#include <set>
+#include <map>
+#include <vector>
+#include <string>
+
+// set<T>
+// Append s to bl as an int element count followed by the raw bytes
+// (sizeof(T) each) of every element in iteration order.
+template<class T>
+inline void _encode(std::set<T>& s, bufferlist& bl)
+{
+  const int count = s.size();
+  bl.append((const char*)&count, sizeof(count));
+
+  int remaining = count;
+  typename std::set<T>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    T item = *cur;
+    bl.append((char*)&item, sizeof(item));
+    ++cur;
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+// Replace s with the set stored in bl at off (int count, then that many
+// raw T values); off is advanced past everything read.
+template<class T>
+inline void _decode(std::set<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int k = 0; k < count; ++k) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.insert(item);
+  }
+  // duplicates in the input would make the set smaller than count.
+  assert(s.size() == (unsigned)count);
+}
+
+// vector<T>
+// Append s to bl as an int element count followed by the raw bytes
+// (sizeof(T) each) of every element in order.
+template<class T>
+inline void _encode(std::vector<T>& s, bufferlist& bl)
+{
+  const int count = s.size();
+  bl.append((const char*)&count, sizeof(count));
+
+  int remaining = count;
+  typename std::vector<T>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    T item = *cur;
+    bl.append((char*)&item, sizeof(item));
+    ++cur;
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+// Replace s with the vector stored in bl at off (int count, then that many
+// raw T values); off is advanced past everything read.
+template<class T>
+inline void _decode(std::vector<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  s = std::vector<T>(count);
+  for (int k = 0; k < count; ++k) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s[k] = item;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// list<T>
+// Append s to bl as an int element count followed by the raw bytes
+// (sizeof(T) each) of every element in order.
+template<class T>
+inline void _encode(const std::list<T>& s, bufferlist& bl)
+{
+  const int count = s.size();
+  bl.append((const char*)&count, sizeof(count));
+
+  int remaining = count;
+  typename std::list<T>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T item = *cur;
+    bl.append((char*)&item, sizeof(item));
+    ++cur;
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+// Replace s with the list stored in bl at off (int count, then that many
+// raw T values); off is advanced past everything read.
+template<class T>
+inline void _decode(std::list<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int k = 0; k < count; ++k) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.push_back(item);
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<string,bufferptr>
+// Append s to bl as an int entry count followed by, for each entry in key
+// order, the encoded key string and the encoded bufferptr.
+inline void _encode(std::map<std::string, bufferptr>& s, bufferlist& bl)
+{
+  const int count = s.size();
+  bl.append((const char*)&count, sizeof(count));
+
+  int remaining = count;
+  std::map<std::string, bufferptr>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    _encode(cur->first, bl);
+    _encode(cur->second, bl);
+    ++cur;
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+// Replace s with the map stored in bl at off (int count, then that many
+// encoded string / bufferptr pairs); off is advanced past everything read.
+inline void _decode(std::map<std::string,bufferptr>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int k = 0; k < count; ++k) {
+    std::string key;
+    _decode(key, bl, off);
+    _decode(s[key], bl, off);   // decode straight into the map slot
+  }
+  // duplicate keys in the input would make the map smaller than count.
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<T,bufferlist>
+// Append s to bl as an int entry count followed by, for each entry in key
+// order, the raw bytes of the key and the encoded bufferlist value.
+template<class T>
+inline void _encode(const std::map<T, bufferlist>& s, bufferlist& bl)
+{
+  const int count = s.size();
+  bl.append((const char*)&count, sizeof(count));
+
+  int remaining = count;
+  typename std::map<T, bufferlist>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T key = cur->first;
+    bl.append((char*)&key, sizeof(key));
+    _encode(cur->second, bl);
+    ++cur;
+    --remaining;
+  }
+  assert(remaining == 0);
+}
+// Replace s with the map stored in bl at off (int count, then that many
+// raw-key / encoded-bufferlist pairs); off is advanced past everything read.
+template<class T>
+inline void _decode(std::map<T,bufferlist>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  for (int k = 0; k < count; ++k) {
+    T key;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+
+    bufferlist value;
+    _decode(value, bl, off);
+    s[key] = value;
+  }
+  // duplicate keys in the input would make the map smaller than count.
+  assert(s.size() == (unsigned)count);
+}
+
+// map<T,U>
+// Serialize a map<T,U> into bl: a native int entry count, then for each entry
+// the raw bytes of the key followed by the raw bytes of the value.
+// NOTE(review): both T and U are written byte-for-byte, so they are presumably
+// flat value types -- confirm per instantiation.
+template<class T, class U>
+inline void _encode(const std::map<T, U>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+
+  typename std::map<T, U>::const_iterator cur = s.begin();
+  while (cur != s.end()) {
+    T key = cur->first;
+    U value = cur->second;
+    bl.append((char*)&key, sizeof(key));
+    bl.append((char*)&value, sizeof(value));
+    --remaining;
+    ++cur;
+  }
+  assert(remaining == 0);
+}
+// Rebuild a map<T,U> from bl at byte offset off: a native int count, then that
+// many (raw key bytes, raw value bytes) pairs.  s is emptied first; off is
+// advanced past what was read.
+template<class T, class U>
+inline void _decode(std::map<T,U>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  int done = 0;
+  while (done < count) {
+    T key;
+    U value;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bl.copy(off, sizeof(value), (char*)&value);
+    off += sizeof(value);
+    s[key] = value;
+    ++done;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__)  /* call syserror() tagged with the invoking file and line */
+
+#define ASSERT(c) \
+ ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1))  /* when c is false, calls exiterror() with the file/line; the ", 1" gives the expression a value either way */
+
+/* print usage error message and exit (fmt is presumably a printf-style format for the trailing arguments -- confirm in the implementation) */
+extern void userror(const char *use, const char *fmt, ...);
+
+/* print system error message and exit; used by SYSERROR() */
+extern void syserror(const char *fmt, ...);
+
+/* print error message and exit; used by ASSERT() */
+extern void exiterror(const char *fmt, ...);
+
+/* print error message (no exit, per this description) */
+extern void error(const char *fmt, ...);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FILEPATH_H
+#define __FILEPATH_H
+
+
+/*
+ * BUG: "/a/b/c" and "a/b/c" break down into the same dentries, but their
+ * path strings differ.
+ * -> should the two be treated differently? how? should this[0] be "" for an
+ *    absolute path, giving a depth of 4?
+ *
+ */
+
+
+#include <iostream>
+#include <string>
+#include <vector>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "buffer.h"
+
+
+/*
+ * filepath - a slash-separated path held both as a string (path) and as the
+ * ordered list of its non-empty components (bits).
+ *
+ * set_path() parses a string into components; leading, trailing and repeated
+ * slashes produce no component.  The string is kept exactly as given until a
+ * pop_dentry() rebuilds it from the components.
+ *
+ * Known limitation (see the BUG note at the top of the file): the component
+ * list does not record whether the path was absolute, so prefixpath(),
+ * postfixpath(), _encode()/_decode() and _rope()/_unrope() always yield a
+ * relative path.
+ */
+class filepath {
+  string path;           // textual form
+  vector<string> bits;   // components, in order
+
+  // Regenerate path from bits, keeping a leading '/' if the current string
+  // has one.
+  void rebuild() {
+    if (absolute())
+      path = "/";
+    else
+      path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+
+  // Split path into bits, skipping empty components.
+  void parse() {
+    bits.clear();
+    string::size_type off = 0;
+    while (off < path.length()) {
+      // skip leading/trailing/duplicate slash(es)
+      string::size_type nextslash = path.find('/', off);
+      if (nextslash == off) {
+        off++;
+        continue;
+      }
+      if (nextslash == string::npos)
+        nextslash = path.length();   // no more slashes
+
+      bits.push_back( path.substr(off, nextslash-off) );
+      off = nextslash+1;
+    }
+  }
+
+ public:
+  filepath() {}
+  filepath(const string& s) {
+    set_path(s);
+  }
+  filepath(const char* s) {
+    set_path(s);
+  }
+
+  // True when the path string starts with '/'.  An empty path is relative;
+  // the emptiness check avoids indexing into an empty string.
+  bool absolute() const { return !path.empty() && path[0] == '/'; }
+  bool relative() const { return !absolute(); }
+
+  // Replace the whole path and re-derive the components.
+  void set_path(const string& s) {
+    path = s;
+    parse();
+  }
+  void set_path(const char *s) {
+    path = s;
+    parse();
+  }
+
+  // Textual form.  The non-const overload hands out a mutable reference;
+  // writing through it does NOT update the component list.
+  string& get_path() {
+    return path;
+  }
+  const string& get_path() const {
+    return path;
+  }
+  // Length of the path string in characters (not the number of components).
+  int length() const {
+    return path.length();
+  }
+
+  const char *c_str() const {
+    return path.c_str();
+  }
+
+
+  // Relative path made of the first s components.  s must not exceed depth().
+  filepath prefixpath(int s) const {
+    filepath t;
+    for (int i=0; i<s; i++)
+      t.add_dentry(bits[i]);
+    return t;
+  }
+  // Relative path made of the components from index s to the end.
+  filepath postfixpath(int s) const {
+    filepath t;
+    for (unsigned i=s; i<bits.size(); i++)
+      t.add_dentry(bits[i]);
+    return t;
+  }
+  // Append one component.  A separator is inserted only when the string is
+  // non-empty and does not already end in '/', so appending to "/" gives
+  // "/a" rather than "//a".
+  void add_dentry(const string& s) {
+    bits.push_back(s);
+    if (!path.empty() && path[path.length()-1] != '/')
+      path += "/";
+    path += s;
+  }
+  // Append every component of a.
+  void append(const filepath& a) {
+    for (unsigned i=0; i<a.depth(); i++)
+      add_dentry(a[i]);
+  }
+
+  // Drop the last component and rebuild the string.  Must not be called on
+  // a path with no components.
+  void pop_dentry() {
+    bits.pop_back();
+    rebuild();
+  }
+
+
+
+  void clear() {
+    path = "";
+    bits.clear();
+  }
+
+  // i-th component; i must be in [0, depth()).
+  const string& operator[](int i) const {
+    return bits[i];
+  }
+
+  // Final component; the path must have at least one.
+  const string& last_bit() const {
+    return bits[ bits.size()-1 ];
+  }
+
+  unsigned depth() const {
+    return bits.size();
+  }
+  bool empty() const {
+    return bits.size() == 0;
+  }
+
+
+  // Wire format shared by _rope/_encode: one count byte followed by each
+  // component as a NUL-terminated string.  The count is handled as an
+  // unsigned byte, so up to 255 components round-trip; deeper paths are
+  // still truncated by the single-byte count.
+  void _rope(crope& r) {
+    unsigned char n = bits.size();
+    r.append((char*)&n, sizeof(char));
+    for (vector<string>::iterator it = bits.begin();
+         it != bits.end();
+         ++it) {
+      r.append((*it).c_str(), (*it).length()+1);
+    }
+  }
+
+  // Inverse of _rope(): replaces this path with the one stored at off and
+  // advances off past it.
+  void _unrope(crope& r, int& off) {
+    clear();
+
+    unsigned char n;
+    r.copy(off, sizeof(char), (char*)&n);
+    off += sizeof(char);
+    for (int i=0; i<n; i++) {
+      string s = r.c_str() + off;   // reads up to the terminating NUL
+      off += s.length() + 1;
+      add_dentry(s);
+    }
+  }
+
+  // Same format as _rope(), appended to a bufferlist.
+  void _encode(bufferlist& bl) {
+    unsigned char n = bits.size();
+    bl.append((char*)&n, sizeof(char));
+    for (vector<string>::iterator it = bits.begin();
+         it != bits.end();
+         ++it) {
+      bl.append((*it).c_str(), (*it).length()+1);
+    }
+  }
+
+  // Inverse of _encode(): replaces this path with the one stored at off and
+  // advances off past it.
+  void _decode(bufferlist& bl, int& off) {
+    clear();
+
+    unsigned char n;
+    bl.copy(off, sizeof(char), (char*)&n);
+    off += sizeof(char);
+    for (int i=0; i<n; i++) {
+      string s = bl.c_str() + off;  // reads up to the terminating NUL
+      off += s.length() + 1;
+      add_dentry(s);
+    }
+  }
+
+};
+
+// Stream the textual form of a path.  Taking the path by const reference lets
+// const objects and temporaries (e.g. the result of prefixpath()) be printed;
+// c_str() is used because it is a const member.
+inline ostream& operator<<(ostream& out, const filepath& path)
+{
+  return out << path.c_str();
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __INTERVAL_SET_H
+#define __INTERVAL_SET_H
+
+#include <map>
+#include <ostream>
+#include <cassert>
+using namespace std;
+
+#ifndef MIN  // keep any MIN that is already defined
+# define MIN(a,b) ((a)<=(b) ? (a):(b))  // smaller of a and b; the chosen argument is evaluated twice, so avoid side effects
+#endif
+#ifndef MAX  // keep any MAX that is already defined
+# define MAX(a,b) ((a)>=(b) ? (a):(b))  // larger of a and b; the chosen argument is evaluated twice, so avoid side effects
+#endif
+
+
+/*
+ * interval_set - a set of T values stored as runs.
+ *
+ * m maps the first value of each run to the run's length, so an entry
+ * (s, l) covers the half-open range [s, s+l).  insert() merges a new run with
+ * any run it touches and asserts if the new run overlaps an existing one;
+ * erase() asserts unless the erased range lies inside a single existing run.
+ */
+template<typename T>
+class interval_set {
+ public:
+  map<T,T> m;   // map start -> len
+
+  // helpers
+ private:
+  // Run containing start if there is one, otherwise the first run that
+  // begins after start (or m.end()).
+  typename map<T,T>::const_iterator find_inc(T start) const {
+    typename map<T,T>::const_iterator p = m.lower_bound(start);  // p->first >= start
+    if (p != m.begin() &&
+        (p == m.end() || p->first > start)) {
+      p--;   // might overlap?
+      if (p->first + p->second <= start)
+        p++; // it doesn't.
+    }
+    return p;
+  }
+
+  // Mutable-iterator twin of find_inc().
+  typename map<T,T>::iterator find_inc_m(T start) {
+    typename map<T,T>::iterator p = m.lower_bound(start);
+    if (p != m.begin() &&
+        (p == m.end() || p->first > start)) {
+      p--;   // might overlap?
+      if (p->first + p->second <= start)
+        p++; // it doesn't.
+    }
+    return p;
+  }
+
+  // Like find_inc(), but a preceding run that merely ends exactly at start
+  // (touches it) is also returned.
+  typename map<T,T>::const_iterator find_adj(T start) const {
+    typename map<T,T>::const_iterator p = m.lower_bound(start);
+    if (p != m.begin() &&
+        (p == m.end() || p->first > start)) {
+      p--;   // might touch?
+      if (p->first + p->second < start)
+        p++; // it doesn't.
+    }
+    return p;
+  }
+
+  // Mutable-iterator twin of find_adj().
+  typename map<T,T>::iterator find_adj_m(T start) {
+    typename map<T,T>::iterator p = m.lower_bound(start);
+    if (p != m.begin() &&
+        (p == m.end() || p->first > start)) {
+      p--;   // might touch?
+      if (p->first + p->second < start)
+        p++; // it doesn't.
+    }
+    return p;
+  }
+
+ public:
+  bool operator==(const interval_set& other) const {
+    return m == other.m;
+  }
+
+  void clear() {
+    m.clear();
+  }
+
+  // Is the single value i in the set?
+  bool contains(T i) const {
+    typename map<T,T>::const_iterator p = find_inc(i);
+    if (p == m.end()) return false;
+    if (p->first > i) return false;
+    if (p->first+p->second <= i) return false;
+    assert(p->first <= i && p->first+p->second > i);
+    return true;
+  }
+  // Is all of [start, start+len) inside one run of the set?
+  bool contains(T start, T len) const {
+    typename map<T,T>::const_iterator p = find_inc(start);
+    if (p == m.end()) return false;
+    if (p->first > start) return false;
+    if (p->first+p->second <= start) return false;
+    assert(p->first <= start && p->first+p->second > start);
+    if (p->first+p->second < start+len) return false;
+    return true;
+  }
+  // Does [start, start+len) share at least one value with the set?
+  // len must be positive.  find_inc() yields either the run holding start or
+  // the first run after it; only that run can be the first to intersect.
+  bool intersects(T start, T len) const {
+    assert(len > 0);
+    typename map<T,T>::const_iterator p = find_inc(start);
+    return p != m.end() && p->first < start+len;
+  }
+
+  // outer range of set
+  bool empty() const {
+    return m.empty();
+  }
+  // Lowest value in the set; the set must not be empty.
+  T start() const {
+    assert(!empty());
+    typename map<T,T>::const_iterator p = m.begin();
+    return p->first;
+  }
+  // One past the highest value in the set; the set must not be empty.
+  T end() const {
+    assert(!empty());
+    typename map<T,T>::const_iterator p = m.end();
+    p--;
+    return p->first+p->second;
+  }
+
+  // interval start after i (where i not in set)
+  // Is there any run beginning after i?
+  bool starts_after(T i) const {
+    assert(!contains(i));
+    typename map<T,T>::const_iterator p = find_inc(i);
+    if (p == m.end()) return false;
+    return true;
+  }
+  // Start of the first run after i; only valid when starts_after(i) is true.
+  T start_after(T i) const {
+    assert(!contains(i));
+    typename map<T,T>::const_iterator p = find_inc(i);
+    return p->first;
+  }
+
+  // interval end that contains start
+  T end_after(T start) const {
+    assert(contains(start));
+    typename map<T,T>::const_iterator p = find_inc(start);
+    return p->first+p->second;
+  }
+
+  void insert(T val) {
+    insert(val, 1);
+  }
+
+  // Add [start, start+len).  The range must not overlap anything already in
+  // the set (asserted); runs that touch it are merged with it.
+  void insert(T start, T len) {
+    assert(len > 0);
+    typename map<T,T>::iterator p = find_adj_m(start);
+    if (p == m.end()) {
+      m[start] = len;                      // new interval
+    } else {
+      if (p->first < start) {
+        // p precedes start; it has to end exactly at start, otherwise the
+        // new range overlaps it.
+        assert(p->first + p->second == start);
+        p->second += len;                  // append to end
+
+        typename map<T,T>::iterator n = p;
+        n++;
+        if (n != m.end()) {
+          // The grown run must not run into the next one.  Without this
+          // check an overlapping insert silently leaves two overlapping
+          // runs in the map.
+          assert(n->first >= start+len);
+          if (start+len == n->first) {     // combine with next, too!
+            p->second += n->second;
+            m.erase(n);
+          }
+        }
+      } else {
+        if (start+len == p->first) {
+          m[start] = len + p->second;      // append to front
+          m.erase(p);
+        } else {
+          assert(p->first > start+len);
+          m[start] = len;                  // new interval
+        }
+      }
+    }
+  }
+
+  void erase(T val) {
+    erase(val, 1);
+  }
+
+  // Remove [start, start+len), which must lie entirely inside one run.
+  void erase(T start, T len) {
+    typename map<T,T>::iterator p = find_inc_m(start);
+
+    assert(p != m.end());
+    assert(p->first <= start);
+
+    T before = start - p->first;           // part of the run kept in front
+    assert(p->second >= before+len);
+    T after = p->second - before - len;    // part of the run kept behind
+
+    if (before)
+      p->second = before;                  // shorten bit before
+    else
+      m.erase(p);
+    if (after)
+      m[start+len] = after;
+  }
+
+
+  // Remove every run of a; each must lie inside a run of this set.
+  void subtract(const interval_set &a) {
+    for (typename map<T,T>::const_iterator p = a.m.begin();
+         p != a.m.end();
+         p++)
+      erase(p->first, p->second);
+  }
+
+  // Add every run of a; none may overlap this set.
+  void insert(const interval_set &a) {
+    for (typename map<T,T>::const_iterator p = a.m.begin();
+         p != a.m.end();
+         p++)
+      insert(p->first, p->second);
+  }
+
+
+  // Replace this set with the intersection of a and b (neither may be this).
+  void intersection_of(const interval_set &a, const interval_set &b) {
+    assert(&a != this);
+    assert(&b != this);
+    clear();
+
+    typename map<T,T>::const_iterator pa = a.m.begin();
+    typename map<T,T>::const_iterator pb = b.m.begin();
+
+    while (pa != a.m.end() && pb != b.m.end()) {
+      // passing?
+      if (pa->first + pa->second <= pb->first)
+        { pa++; continue; }
+      if (pb->first + pb->second <= pa->first)
+        { pb++; continue; }
+      T start = MAX(pa->first, pb->first);
+      T end = MIN(pa->first+pa->second, pb->first+pb->second);
+      assert(end > start);
+      insert(start, end-start);
+      // advance whichever run finishes first
+      if (pa->first+pa->second > pb->first+pb->second)
+        pb++;
+      else
+        pa++;
+    }
+  }
+
+  // Replace this set with the union of a and b (neither may be this),
+  // computed as (a - (a*b)) + b.
+  void union_of(const interval_set &a, const interval_set &b) {
+    assert(&a != this);
+    assert(&b != this);
+    clear();
+
+    // a
+    m = a.m;
+
+    // - (a*b)
+    interval_set ab;
+    ab.intersection_of(a, b);
+    subtract(ab);
+
+    // + b
+    insert(b);
+    return;
+  }
+  // In-place union with b.
+  void union_of(const interval_set &b) {
+    interval_set a;
+    a.m.swap(m);
+    union_of(a, b);
+  }
+
+  // True when every run of this set lies inside a single run of big.
+  bool subset_of(const interval_set &big) const {
+    for (typename map<T,T>::const_iterator i = m.begin();
+         i != m.end();
+         i++)
+      if (!big.contains(i->first, i->second)) return false;
+    return true;
+  }
+
+};
+
+// Print a set as "[start~len,start~len,...]" in ascending start order.
+template<class T>
+inline ostream& operator<<(ostream& out, const interval_set<T> &s) {
+  out << "[";
+  bool first = true;
+  typename map<T,T>::const_iterator run = s.m.begin();
+  while (run != s.m.end()) {
+    if (!first) out << ",";
+    first = false;
+    out << run->first << "~" << run->second;
+    ++run;
+  }
+  out << "]";
+  return out;
+}
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __LRU_H
+#define __LRU_H
+
+#include <assert.h>
+#include <iostream>
+using namespace std;
+
+#include "config.h"
+
+
+
+/*
+ * LRUObject - intrusive list node for objects tracked by an LRU.
+ * It carries the list links, a pin flag, and back-pointers to the owning LRU
+ * and to the LRUList the object currently sits on; LRU and LRUList maintain
+ * them as friends.
+ */
+class LRUObject {
+ private:
+  LRUObject *lru_next, *lru_prev;
+  bool lru_pinned;
+  class LRU *lru;
+  class LRUList *lru_list;
+
+ public:
+  // Starts out unlinked, unpinned and owned by no LRU.
+  LRUObject()
+    : lru_next(NULL), lru_prev(NULL),
+      lru_pinned(false),
+      lru(0),
+      lru_list(0) {}
+
+  // pin/unpin item in cache
+  void lru_pin();
+  void lru_unpin();
+  bool lru_is_expireable() { return lru_pinned == false; }
+
+  friend class LRU;
+  friend class LRUList;
+};
+
+
+/*
+ * LRUList - doubly linked list of LRUObjects with a length counter.
+ * Links live inside the objects; each linked object's lru_list points back at
+ * the list holding it.
+ */
+class LRUList {
+ private:
+  LRUObject *head, *tail;
+  __uint32_t len;
+
+ public:
+  LRUList() : head(0), tail(0), len(0) {}
+
+  __uint32_t get_length() { return len; }
+
+  LRUObject *get_head() { return head; }
+  LRUObject *get_tail() { return tail; }
+
+  // Link o in front of the current head.
+  void insert_head(LRUObject *o) {
+    o->lru_prev = NULL;
+    o->lru_next = head;
+    if (head == 0)
+      tail = o;              // list was empty: o is also the tail
+    else
+      head->lru_prev = o;
+    head = o;
+    o->lru_list = this;
+    ++len;
+  }
+  // Link o behind the current tail.
+  void insert_tail(LRUObject *o) {
+    o->lru_prev = tail;
+    o->lru_next = NULL;
+    if (tail == 0)
+      head = o;              // list was empty: o is also the head
+    else
+      tail->lru_next = o;
+    tail = o;
+    o->lru_list = this;
+    ++len;
+  }
+
+  // Unlink o, which must currently be on this list.
+  void remove(LRUObject *o) {
+    assert(o->lru_list == this);
+
+    LRUObject *after = o->lru_next;
+    LRUObject *before = o->lru_prev;
+
+    if (after)
+      after->lru_prev = before;
+    else
+      tail = before;         // o was the tail
+    if (before)
+      before->lru_next = after;
+    else
+      head = after;          // o was the head
+
+    o->lru_next = o->lru_prev = NULL;
+    o->lru_list = 0;
+    assert(len>0);
+    --len;
+  }
+
+};
+
+
+/*
+ * LRU - least-recently-used bookkeeping built from three LRULists:
+ *   lru_top      objects inserted or touched at the top,
+ *   lru_bot      objects below the midpoint; expiry looks here first,
+ *   lru_pintail  pinned objects that an expiry scan moved out of the way.
+ * When lru_max is non-zero, lru_adjust() keeps lru_top at no more than
+ * lru_midpoint * lru_max entries by demoting from its tail to the head of
+ * lru_bot.
+ */
+class LRU {
+ protected:
+  LRUList lru_top, lru_bot, lru_pintail;
+  __uint32_t lru_num, lru_num_pinned;
+  __uint32_t lru_max;   // max items
+  double lru_midpoint;
+
+  friend class LRUObject;
+
+ public:
+  LRU(int max = 0)
+    : lru_num(0), lru_num_pinned(0),
+      lru_max(max),
+      lru_midpoint(.9) {}
+
+  __uint32_t lru_get_size() { return lru_num; }
+  __uint32_t lru_get_top() { return lru_top.get_length(); }
+  __uint32_t lru_get_bot() { return lru_bot.get_length(); }
+  __uint32_t lru_get_pintail() { return lru_pintail.get_length(); }
+  __uint32_t lru_get_max() { return lru_max; }
+  __uint32_t lru_get_num_pinned() { return lru_num_pinned; }
+
+  void lru_set_max(__uint32_t m) { lru_max = m; }
+  void lru_set_midpoint(float f) { lru_midpoint = f; }
+
+
+  // insert at top of lru; o must not already belong to an LRU
+  void lru_insert_top(LRUObject *o) {
+    assert(!o->lru);
+    o->lru = this;
+    lru_top.insert_head( o );
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+    lru_adjust();            // the top list may now exceed its share
+  }
+
+  // insert at mid point in lru (head of the bottom list)
+  void lru_insert_mid(LRUObject *o) {
+    assert(!o->lru);
+    o->lru = this;
+    lru_bot.insert_head(o);
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+  }
+
+  // insert at bottom of lru (tail of the bottom list)
+  void lru_insert_bot(LRUObject *o) {
+    assert(!o->lru);
+    o->lru = this;
+    lru_bot.insert_tail(o);
+    ++lru_num;
+    if (o->lru_pinned) ++lru_num_pinned;
+  }
+
+  // (there is no direct insert into lru_pintail; objects reach it through
+  //  lru_get_next_expire())
+
+
+  // adjust top/bot balance, as necessary
+  void lru_adjust() {
+    if (lru_max == 0) return;
+
+    unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max);
+    // remove from tail of top, stick at head of bot
+    // FIXME: this could be way more efficient by moving a whole chain of items.
+    for (unsigned toplen = lru_top.get_length();
+         toplen > 0 && toplen > topwant;
+         --toplen) {
+      LRUObject *demoted = lru_top.get_tail();
+      lru_top.remove(demoted);
+      lru_bot.insert_head(demoted);
+    }
+  }
+
+
+  // remove an item; an object that belongs to no LRU is returned untouched
+  LRUObject *lru_remove(LRUObject *o) {
+    if (o->lru == 0) return o;
+
+    LRUList *holder = o->lru_list;
+    if (holder == &lru_top)
+      lru_top.remove(o);
+    else if (holder == &lru_bot)
+      lru_bot.remove(o);
+    else if (holder == &lru_pintail)
+      lru_pintail.remove(o);
+    else
+      assert(0);             // claims an LRU but sits on none of our lists
+
+    --lru_num;
+    if (o->lru_pinned) --lru_num_pinned;
+    o->lru = 0;
+    return o;
+  }
+
+  // touch item -- move to head of lru
+  bool lru_touch(LRUObject *o) {
+    lru_remove(o);
+    lru_insert_top(o);
+    return true;
+  }
+
+  // touch item -- move to midpoint (unless already higher)
+  bool lru_midtouch(LRUObject *o) {
+    if (o->lru_list == &lru_top)
+      return false;
+
+    lru_remove(o);
+    lru_insert_mid(o);
+    return true;
+  }
+
+  // touch item -- move to bottom
+  bool lru_bottouch(LRUObject *o) {
+    lru_remove(o);
+    lru_insert_bot(o);
+    return true;
+  }
+
+
+  // Find the next expiry candidate without removing it: the first unpinned
+  // object met when scanning from the tail of lru_bot, then from the tail of
+  // lru_top.  Pinned objects met on the way are moved to the head of
+  // lru_pintail.  Returns NULL when nothing is unpinned.
+  LRUObject *lru_get_next_expire() {
+    LRUList *scan_order[2] = { &lru_bot, &lru_top };
+
+    for (int which = 0; which < 2; ++which) {
+      LRUList *list = scan_order[which];
+      while (list->get_length()) {
+        LRUObject *cand = list->get_tail();
+        if (!cand->lru_pinned) return cand;
+
+        // move to pintail
+        list->remove(cand);
+        lru_pintail.insert_head(cand);
+      }
+    }
+
+    // no luck!
+    return NULL;
+  }
+
+  // expire -- remove and return a single unpinned item, or NULL if none
+  LRUObject *lru_expire() {
+    LRUObject *victim = lru_get_next_expire();
+    if (!victim)
+      return NULL;
+    return lru_remove(victim);
+  }
+
+
+  // Emit the list sizes at debug level 10.
+  void lru_status() {
+    dout(10) << "lru: " << lru_num << " items, "
+             << lru_top.get_length() << " top, "
+             << lru_bot.get_length() << " bot, "
+             << lru_pintail.get_length() << " pintail" << endl;
+  }
+
+};
+
+
+// Mark this object as pinned (not expirable).  When the object belongs to an
+// LRU, that LRU's pinned counter goes up by one -- but only on the transition
+// from unpinned to pinned, so a repeated pin does not count the object twice.
+inline void LRUObject::lru_pin()
+{
+  if (lru_pinned) return;    // already pinned; counter is already correct
+  lru_pinned = true;
+  if (lru) lru->lru_num_pinned++;
+}
+// Clear the pin.  When the object belongs to an LRU, that LRU's pinned counter
+// goes down by one -- only if the object really was pinned, so unpinning an
+// unpinned object cannot wrap the unsigned counter -- and an object parked on
+// the pintail list is moved back to the tail of the bottom list.
+inline void LRUObject::lru_unpin() {
+  const bool was_pinned = lru_pinned;
+  lru_pinned = false;
+  if (lru) {
+    if (was_pinned)
+      lru->lru_num_pinned--;
+
+    // move from pintail -> bot
+    if (lru_list == &lru->lru_pintail) {
+      lru->lru_pintail.remove(this);
+      lru->lru_bot.insert_tail(this);
+    }
+  }
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __OBJECT_H
+#define __OBJECT_H
+
+#include <iostream>
+#include <iomanip>
+using namespace std;
+
+
+typedef __uint32_t objectrev_t;
+
+// Object identifier: (ino, bno, rev).
+struct object_t {
+  static const __uint32_t MAXREV = 0xffffffffU;
+
+  __uint64_t ino;    // "file" identifier
+  __uint32_t bno;    // "block" in that "file"
+  objectrev_t rev;   // revision.  normally ctime (as epoch).
+
+  // all fields zero
+  object_t()
+    : ino(0),
+      bno(0),
+      rev(0) {}
+  // given file and block, revision zero
+  object_t(__uint64_t i, __uint32_t b)
+    : ino(i),
+      bno(b),
+      rev(0) {}
+};
+
+
+// Equal when ino, bno and rev all match.
+inline bool operator==(const object_t l, const object_t r) {
+  if (l.ino != r.ino) return false;
+  if (l.bno != r.bno) return false;
+  return l.rev == r.rev;
+}
+// Different when any of ino, bno or rev differs.
+inline bool operator!=(const object_t l, const object_t r) {
+  if (l.ino != r.ino) return true;
+  if (l.bno != r.bno) return true;
+  return l.rev != r.rev;
+}
+// Lexicographic "greater than" over (ino, bno, rev).
+inline bool operator>(const object_t l, const object_t r) {
+  if (l.ino != r.ino) return l.ino > r.ino;
+  if (l.bno != r.bno) return l.bno > r.bno;
+  return l.rev > r.rev;
+}
+// Lexicographic "less than" over (ino, bno, rev).
+inline bool operator<(const object_t l, const object_t r) {
+  if (l.ino != r.ino) return l.ino < r.ino;
+  if (l.bno != r.bno) return l.bno < r.bno;
+  return l.rev < r.rev;
+}
+// "Greater or equal", defined as the negation of operator<.
+inline bool operator>=(const object_t l, const object_t r) {
+  if (l < r)
+    return false;
+  return true;
+}
+// "Less or equal", defined as the negation of operator>.
+inline bool operator<=(const object_t l, const object_t r) {
+  if (l > r)
+    return false;
+  return true;
+}
+// Print an object id as "<ino hex>.<bno as 8 zero-padded hex digits>", followed
+// by ".<rev decimal>" when rev is non-zero.
+// The text produced is unchanged; the stream's format flags and fill character
+// are now put back afterwards, so printing an object no longer leaves the
+// caller's stream zero-filling and forced to decimal.
+inline ostream& operator<<(ostream& out, const object_t o) {
+  const ios::fmtflags saved_flags = out.flags();
+
+  out << hex << o.ino << '.';
+  out.setf(ios::right);
+  const char saved_fill = out.fill('0');   // fill() returns the previous fill
+  out << setw(8) << o.bno << dec;
+  out.unsetf(ios::right);
+  if (o.rev)
+    out << '.' << o.rev;
+
+  out.fill(saved_fill);
+  out.flags(saved_flags);
+  return out;
+}
+// hash<> specializations so 64-bit integers and object_t can be used as
+// __gnu_cxx hash keys.
+namespace __gnu_cxx {
+  // Fold the two 32-bit halves together and hash the result as 32 bits.
+  template<> struct hash<__uint64_t> {
+    size_t operator()(__uint64_t __x) const {
+      hash<__uint32_t> fold32;
+      const __uint64_t upper = __x >> 32;
+      const __uint64_t lower = __x & 0xffffffff;
+      return fold32(upper ^ lower);
+    }
+  };
+
+  // Combine the hashes of ino and bno; rev does not take part in the hash.
+  template<> struct hash<object_t> {
+    size_t operator()(const object_t &r) const {
+      hash<__uint64_t> hash_ino;
+      hash<__uint32_t> hash_bno;
+      const size_t a = hash_ino(r.ino);
+      const size_t b = hash_bno(r.bno);
+      return a ^ b;
+    }
+  };
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BUFFER_H
+#define __BUFFER_H
+
+#include <cassert>
+#include <string.h>
+
+#include <iostream>
+using namespace std;
+
+// bit masks
+#define BUFFER_MODE_NOCOPY 0 // use the caller's memory directly
+#define BUFFER_MODE_COPY 1 // copy on create, my buffer
+
+#define BUFFER_MODE_NOFREE 0 // ~buffer leaves the memory alone
+#define BUFFER_MODE_FREE 2 // ~buffer delete[]s the memory (unless a custom free function is installed)
+
+#define BUFFER_MODE_CUSTOMFREE 4 // install the free function passed to the buffer constructor; ~buffer calls it
+
+#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE)
+
+
+// debug output helpers
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout // prints to cout only when level x is within g_conf.debug_buffer; expands to a bare `if`, so beware of a following `else`
+
+#include "common/Mutex.h"
+
+// HACK: in config.cc
+/*
+ * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and
+ * buffer ever use buffer._ref, and only bufferptr should call ~buffer().
+ *
+ * So, I only need to protect:
+ * - buffer()'s modification of buffer_total_alloc
+ * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc
+ *
+ * I don't protect
+ * - buffer._get() .. increment is atomic on any sane architecture
+ * - buffer._put() .. only called by ~bufferptr.
+ * - ~buffer .. only called by ~bufferptr *** I HOPE!!
+ */
+extern Mutex bufferlock; // guards buffer_total_alloc updates and the bufferptr reference-count changes
+extern long buffer_total_alloc; // running total of bytes allocated by buffer objects
+
+
+typedef void (buffer_free_func_t)(void*,char*,unsigned); // custom deallocator; ~buffer calls it as (free_func_arg, data pointer, allocated length)
+
+
+/*
+ * buffer - the underlying buffer container. with a reference count.
+ *
+ * the buffer never shrinks.
+ *
+ * some invariants:
+ * _len never shrinks
+ * _len <= _alloc_len
+ */
+/*
+ * buffer - a reference-counted block of bytes.
+ *
+ * _dataptr/_alloc_len describe the storage, _len how much of it is in use,
+ * and _ref how many bufferptrs point here.  Only bufferptr (a friend) touches
+ * the reference count and destroys buffers.
+ *
+ * The storage is released in ~buffer by the custom free function if one was
+ * installed, otherwise by delete[] when _myptr is set, otherwise not at all.
+ */
+class buffer {
+ protected:
+  //wtf
+  //static Mutex bufferlock;
+  //static long buffer_total_alloc;// = 0;
+
+ private:
+  // raw buffer alloc
+  char *_dataptr;
+  bool _myptr;            // true: ~buffer delete[]s _dataptr
+  unsigned _len;
+  unsigned _alloc_len;
+
+  // ref counts
+  unsigned _ref;
+  int _get() {
+    bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl;
+    return ++_ref;
+  }
+  int _put() {
+    bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl;
+    assert(_ref > 0);
+    return --_ref;
+  }
+
+  // custom (de!)allocator
+  buffer_free_func_t *free_func;
+  void *free_func_arg;
+
+  friend class bufferptr;
+
+ public:
+  // constructors
+
+  // Empty buffer with no storage.
+  buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) {
+    bdbout(1) << "buffer.cons " << *this << endl;
+  }
+  // Freshly allocated buffer of a bytes, all counted as in use.
+  buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) {
+    bdbout(1) << "buffer.cons " << *this << endl;
+    _dataptr = new char[a];
+    bufferlock.Lock();
+    buffer_total_alloc += _alloc_len;
+    bufferlock.Unlock();
+    bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+  }
+  ~buffer() {
+    bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl;
+    if (free_func) {
+      bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl;
+      free_func( free_func_arg, _dataptr, _alloc_len );
+    }
+    else if (_dataptr && _myptr) {
+      bdbout(1) << "buffer.free " << (void*)_dataptr << endl;
+      delete[] _dataptr;
+      buffer_total_alloc -= _alloc_len;
+    }
+  }
+
+  // Buffer over existing data.
+  //   p, l       source bytes and their count
+  //   mode       OR of BUFFER_MODE_* flags:
+  //                COPY        allocate our own storage and copy p into it
+  //                FREE        delete[] the storage in the destructor
+  //                CUSTOMFREE  release through free_func instead
+  //   alloc_len  capacity of the storage; 0 means "same as l"
+  //   free_func, free_func_arg  used only with CUSTOMFREE
+  // A copy made under BUFFER_MODE_COPY belongs to this buffer, so it is
+  // released in the destructor even when BUFFER_MODE_FREE was not given
+  // (previously such a copy was never freed).
+  buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0,
+         buffer_free_func_t free_func=0, void* free_func_arg=0) :
+    _dataptr(0),
+    _myptr(false),
+    _len(l),
+    _ref(0),
+    free_func(0), free_func_arg(0) {
+
+    if (alloc_len)
+      _alloc_len = alloc_len;
+    else
+      _alloc_len = l;
+
+    _myptr = (mode & (BUFFER_MODE_FREE|BUFFER_MODE_COPY)) ? true:false;
+    bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl;
+    if (mode & BUFFER_MODE_COPY) {
+      assert(_len <= _alloc_len);   // the copy below must fit in the new storage
+      _dataptr = new char[_alloc_len];
+      bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl;
+      bufferlock.Lock();
+      buffer_total_alloc += _alloc_len;
+      bufferlock.Unlock();
+      memcpy(_dataptr, p, l);
+      bdbout(1) << "buffer.copy " << *this << endl;
+    } else {
+      _dataptr = (char*)p;          // ugly
+      bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl;
+    }
+
+    if (mode & BUFFER_MODE_CUSTOMFREE && free_func) {
+      this->free_func = free_func;
+      this->free_func_arg = free_func_arg;
+    }
+  }
+
+  // operators
+  buffer& operator=(buffer& other) {
+    assert(0);  // not implemented, no reasonable assignment semantics.
+    return *this;
+  }
+
+  char *c_str() {
+    return _dataptr;
+  }
+
+  bool has_free_func() { return free_func != 0; }
+
+  // accessor
+  unsigned alloc_length() {
+    return _alloc_len;
+  }
+  void set_length(unsigned l) {
+    assert(l <= _alloc_len);
+    _len = l;
+  }
+  unsigned length() { return _len; }
+  unsigned unused_tail_length() { return _alloc_len - _len; }
+
+  friend ostream& operator<<(ostream& out, buffer& b);
+};
+
+// Debug dump of a buffer: its address, used length, allocated length, data
+// pointer and reference count.
+inline ostream& operator<<(ostream& out, buffer& b) {
+  out << "buffer(this=" << &b;
+  out << " len=" << b._len;
+  out << ", alloc=" << b._alloc_len;
+  out << ", data=" << (void*)b._dataptr;
+  out << " ref=" << b._ref;
+  out << ")";
+  return out;
+}
+
+
+/*
+ * smart pointer class for buffer
+ *
+ * we reference count the actual buffer.
+ * we also let you refer to a subset of a buffer.
+ * we implement the high-level buffer accessor methods.
+ *
+ * some invariants:
+ * _off < _buffer->_len
+ * _off + _len <= _buffer->_len
+ */
+/*
+ * bufferptr - reference-counting handle onto a buffer, optionally restricted
+ * to the sub-range [_off, _off+_len) of it.
+ *
+ * Copying or assigning takes a reference on the underlying buffer under
+ * bufferlock; dropping the last reference deletes the buffer.  A
+ * default-constructed bufferptr refers to no buffer; most accessors
+ * dereference the buffer without checking, so they must not be called on it.
+ */
+class bufferptr {
+ private:
+  buffer *_buffer;
+  unsigned _len, _off;
+
+ public:
+  // empty cons
+  bufferptr() :
+    _buffer(0),
+    _len(0),
+    _off(0) { }
+  // main cons - the entire buffer
+  bufferptr(buffer *b) :
+    _buffer(b),
+    _len(b->_len),
+    _off(0) {
+    assert(_buffer->_ref == 0);
+    _buffer->_get();     // this is always the first one.
+  }
+  // subset cons - a subset of another bufferptr (subset).
+  // off is relative to bp's own offset; note the (len, off) argument order.
+  bufferptr(const bufferptr& bp, unsigned len, unsigned off) {
+    bufferlock.Lock();
+    _buffer = bp._buffer;
+    _len = len;
+    _off = bp._off + off;
+    _buffer->_get();
+    assert(_off < _buffer->_len);          // sanity checks
+    assert(_off + _len <= _buffer->_len);
+    bufferlock.Unlock();
+  }
+
+  // copy cons
+  bufferptr(const bufferptr &other) {
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();
+    bufferlock.Unlock();
+  }
+
+  // assignment operator
+  bufferptr& operator=(const bufferptr& other) {
+    // Self-assignment must be a no-op: discarding first would drop our
+    // reference (possibly deleting the buffer) and null the very pointer we
+    // are about to copy from.
+    if (this == &other)
+      return *this;
+
+    // discard old
+    discard_buffer();
+
+    // point to other
+    bufferlock.Lock();
+    _buffer = other._buffer;
+    _len = other._len;
+    _off = other._off;
+    if (_buffer) _buffer->_get();
+    bufferlock.Unlock();
+    return *this;
+  }
+
+  ~bufferptr() {
+    discard_buffer();
+  }
+
+  // Drop our reference; the buffer is deleted when it was the last one.
+  void discard_buffer() {
+    if (_buffer) {
+      bufferlock.Lock();
+      if (_buffer->_put() == 0)
+        delete _buffer;
+      _buffer = 0;
+      bufferlock.Unlock();
+    }
+  }
+
+
+  // dereference to get the actual buffer
+  buffer& operator*() {
+    return *_buffer;
+  }
+
+
+  bool at_buffer_head() const {
+    return _off == 0;
+  }
+  bool at_buffer_tail() const {
+    return _off + _len == _buffer->_len;
+  }
+
+  // accessors for my subset
+  char *c_str() {
+    return _buffer->c_str() + _off;
+  }
+  unsigned length() const {
+    return _len;
+  }
+  unsigned offset() const {
+    return _off;
+  }
+  // Free space behind our range; 0 unless we end at the buffer's used tail.
+  unsigned unused_tail_length() {
+    if (!at_buffer_tail()) return 0;
+    return _buffer->unused_tail_length();
+  }
+
+
+
+  // modifiers
+  void set_offset(unsigned off) {
+    assert(off <= _buffer->_alloc_len);
+    _off = off;
+  }
+  void set_length(unsigned len) {
+    assert(_off + len <= _buffer->_alloc_len);
+    if (_buffer->_len < _off + len)
+      _buffer->_len = _off + len;    // set new buffer len (_IF_ i'm expanding it)
+    _len = len;                      // my len too
+  }
+  void zero() {
+    memset((void*)c_str(), 0, _len);
+  }
+
+
+  // crope lookalikes
+
+  // Copy len bytes from p to just past our current range and grow the range.
+  // The bytes must fit in the buffer's allocation (asserted).  The buffer's
+  // used length is raised only as far as needed to cover the new end of our
+  // range, as set_length() does; adding len unconditionally over-counted when
+  // this pointer did not end at the buffer's tail.
+  void append(const char *p, unsigned len) {
+    assert(len + _len + _off <= _buffer->_alloc_len);  // FIXME later for auto-expansion?
+
+    // copy
+    memcpy(c_str() + _len, p, len);
+    _len += len;
+    if (_buffer->_len < _off + _len)
+      _buffer->_len = _off + _len;
+  }
+  // Copy len bytes starting at off (relative to our range) into dest.
+  void copy_out(unsigned off, unsigned len, char *dest) {
+    assert(off <= _len);
+    assert(off + len <= _len);
+    memcpy(dest, c_str() + off, len);
+  }
+  // Overwrite len bytes starting at off (relative to our range) from src.
+  void copy_in(unsigned off, unsigned len, const char *src) {
+    assert(off <= _len);
+    assert(off + len <= _len);
+    memcpy(c_str() + off, src, len);
+  }
+
+  friend ostream& operator<<(ostream& out, bufferptr& bp);
+};
+
+
+// Debug dump of a bufferptr: its length and offset, then the data pointer and
+// the underlying buffer.  An empty bufferptr (no buffer attached) is printed
+// without dereferencing the null buffer.
+inline ostream& operator<<(ostream& out, bufferptr& bp) {
+  out << "bufferptr(len=" << bp._len << " off=" << bp._off;
+  if (bp._buffer)
+    out << " cstr=" << (void*)bp.c_str()
+        << " buf=" << *bp._buffer;
+  else
+    out << " cstr=" << (void*)0
+        << " buf=(none)";
+  return out << ")";
+}
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BUFFERLIST_H
+#define __BUFFERLIST_H
+
+#include "buffer.h"
+
+#include <list>
+#include <map>
+#include <set>
+#include <vector>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+
+// debug crap
+#include "config.h"
+#define bdbout(x) if (x <= g_conf.debug_buffer) cout // same text as the definition in buffer.h: prints to cout only when level x is within g_conf.debug_buffer
+
+
+
+class bufferlist {
+ private:
+ /* local state limited to _buffers, and _len.
+ * we maintain _len ourselves, so we must be careful when fiddling with buffers!
+ */
+ list<bufferptr> _buffers;
+ unsigned _len;
+
+ public:
+ // cons/des
+ // Empty list: no buffers, zero length.
+ bufferlist()
+   : _len(0)
+ {
+   bdbout(1) << "bufferlist.cons " << this << endl;
+ }
+ // Copy: duplicates the list of bufferptrs (one copy per entry) and the
+ // cached total length.
+ bufferlist(const bufferlist& bl)
+   : _len(0)
+ {
+   bdbout(1) << "bufferlist.cons " << this << endl;
+   _len = bl._len;
+   _buffers = bl._buffers;
+ }
+ // Nothing to release by hand; only logs the destruction.
+ ~bufferlist()
+ {
+   bdbout(1) << "bufferlist.des " << this << endl;
+ }
+
+ // Copy-assign: replaces our bufferptrs and cached length with copies of
+ // bl's (O(n) in the number of buffers).  The source is taken by const
+ // reference so that const lists and temporaries can be assigned too; the
+ // source is never modified.
+ bufferlist& operator=(const bufferlist& bl) {
+   bdbout(1) << "bufferlist.= " << this << endl;
+   _buffers = bl._buffers;
+   _len = bl._len;
+   return *this;
+ }
+
+
+ // accessors
+ // Direct, mutable access to the underlying list of bufferptrs.  Changes
+ // made through it are not reflected in the cached _len.
+ list<bufferptr>& buffers()
+ {
+   list<bufferptr>& all = _buffers;
+   return all;
+ }
+ //list<buffer*>::iterator begin() { return _buffers.begin(); }
+ //list<buffer*>::iterator end() { return _buffers.end(); }
+
+ unsigned length() const {
+#if 0
+ { // DEBUG: verify _len
+ int len = 0;
+ for (list<bufferptr>::iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ len += (*it).length();
+ }
+ assert(len == _len);
+ }
+#endif
+ return _len;
+ }
+
+ void _rope(crope& r) {
+ for (list<bufferptr>::iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++)
+ r.append((*it).c_str(), (*it).length());
+ }
+
+ // modifiers
+ void clear() {
+ _buffers.clear();
+ _len = 0;
+ }
+ void push_front(bufferptr& bp) {
+ _buffers.push_front(bp);
+ _len += bp.length();
+ }
+ void push_front(buffer *b) {
+ bufferptr bp(b);
+ _buffers.push_front(bp);
+ _len += bp.length();
+ }
+ void push_back(bufferptr& bp) {
+ _buffers.push_back(bp);
+ _len += bp.length();
+ }
+ void push_back(buffer *b) {
+ bufferptr bp(b);
+
+ _buffers.push_back(bp);
+ _len += bp.length();
+
+ }
+ void zero() {
+ for (list<bufferptr>::iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++)
+ it->zero();
+ }
+
+ // sort-of-like-assignment-op
+ void claim(bufferlist& bl) {
+ // free my buffers
+ clear();
+ claim_append(bl);
+ }
+ void claim_append(bufferlist& bl) {
+ // steal the other guy's buffers
+ _len += bl._len;
+ _buffers.splice( _buffers.end(), bl._buffers );
+ bl._len = 0;
+ }
+
+
+
+
+ // crope lookalikes
+ void copy(unsigned off, unsigned len, char *dest) {
+ assert(off >= 0);
+ assert(off + len <= length());
+ /*assert(off < length());
+ if (off + len > length())
+ len = length() - off;
+ */
+ // advance to off
+ list<bufferptr>::iterator curbuf = _buffers.begin();
+
+ // skip off
+ while (off > 0) {
+ assert(curbuf != _buffers.end());
+ if (off >= (*curbuf).length()) {
+ // skip this buffer
+ off -= (*curbuf).length();
+ curbuf++;
+ } else {
+ // somewhere in this buffer!
+ break;
+ }
+ }
+
+ // copy
+ while (len > 0) {
+ // is the rest ALL in this buffer?
+ if (off + len <= (*curbuf).length()) {
+ (*curbuf).copy_out(off, len, dest); // yup, last bit!
+ break;
+ }
+
+ // get as much as we can from this buffer.
+ unsigned howmuch = (*curbuf).length() - off;
+ (*curbuf).copy_out(off, howmuch, dest);
+
+ dest += howmuch;
+ len -= howmuch;
+ off = 0;
+ curbuf++;
+ assert(curbuf != _buffers.end());
+ }
+ }
+
+ void copy_in(unsigned off, unsigned len, const char *src) {
+ assert(off >= 0);
+ assert(off + len <= length());
+
+ // advance to off
+ list<bufferptr>::iterator curbuf = _buffers.begin();
+
+ // skip off
+ while (off > 0) {
+ assert(curbuf != _buffers.end());
+ if (off >= (*curbuf).length()) {
+ // skip this buffer
+ off -= (*curbuf).length();
+ curbuf++;
+ } else {
+ // somewhere in this buffer!
+ break;
+ }
+ }
+
+ // copy
+ while (len > 0) {
+ // is the rest ALL in this buffer?
+ if (off + len <= (*curbuf).length()) {
+ (*curbuf).copy_in(off, len, src); // yup, last bit!
+ break;
+ }
+
+ // get as much as we can from this buffer.
+ unsigned howmuch = (*curbuf).length() - off;
+ (*curbuf).copy_in(off, howmuch, src);
+
+ src += howmuch;
+ len -= howmuch;
+ off = 0;
+ curbuf++;
+ assert(curbuf != _buffers.end());
+ }
+ }
+ void copy_in(unsigned off, unsigned len, bufferlist& bl) {
+ unsigned left = len;
+ for (list<bufferptr>::iterator i = bl._buffers.begin();
+ i != bl._buffers.end();
+ i++) {
+ unsigned l = (*i).length();
+ if (left < l) l = left;
+ copy_in(off, l, (*i).c_str());
+ left -= l;
+ if (left == 0) break;
+ off += l;
+ }
+ }
+
+
+ void append(const char *data, unsigned len) {
+ if (len == 0) return;
+
+ unsigned alen = 0;
+
+ // copy into the tail buffer?
+ if (!_buffers.empty()) {
+ unsigned avail = _buffers.back().unused_tail_length();
+ if (avail > 0) {
+ //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl;
+ if (avail > len)
+ avail = len;
+ unsigned blen = _buffers.back().length();
+ memcpy(_buffers.back().c_str() + blen, data, avail);
+ blen += avail;
+ _buffers.back().set_length(blen);
+ _len += avail;
+ data += avail;
+ len -= avail;
+ }
+ alen = _buffers.back().length();
+ }
+ if (len == 0) return;
+
+ // just add another buffer.
+ // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter!
+ if (alen < 1024) alen = 1024;
+ push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen));
+ }
+ void append(bufferptr& bp) {
+ push_back(bp);
+ }
+ void append(bufferptr& bp, unsigned len, unsigned off) {
+ bufferptr tempbp(bp, len, off);
+ push_back(tempbp);
+ }
+ void append(const bufferlist& bl) {
+ bufferlist temp = bl; // copy list
+ claim_append(temp); // and append
+ }
+
+
+ /*
+ * return a contiguous ptr to whole bufferlist contents.
+ */
+ char *c_str() {
+ if (_buffers.size() == 1) {
+ return _buffers.front().c_str(); // good, we're already contiguous.
+ }
+ else if (_buffers.size() == 0) {
+ return 0; // no buffers
+ }
+ else {
+ // make one new contiguous buffer.
+ bufferptr newbuf = new buffer(length());
+ unsigned off = 0;
+
+ for (list<bufferptr>::iterator it = _buffers.begin();
+ it != _buffers.end();
+ it++) {
+ //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least!
+ memcpy(newbuf.c_str() + off,
+ (*it).c_str(), (*it).length());
+ off += (*it).length();
+ }
+ assert(off == newbuf.length());
+
+ _buffers.clear();
+ _buffers.push_back( newbuf );
+
+ // now it'll work.
+ return c_str();
+ }
+ }
+
+
+ void substr_of(bufferlist& other, unsigned off, unsigned len) {
+ assert(off + len <= other.length());
+ clear();
+
+ // skip off
+ list<bufferptr>::iterator curbuf = other._buffers.begin();
+ while (off > 0) {
+ assert(curbuf != _buffers.end());
+ if (off >= (*curbuf).length()) {
+ // skip this buffer
+ //cout << "skipping over " << *curbuf << endl;
+ off -= (*curbuf).length();
+ curbuf++;
+ } else {
+ // somewhere in this buffer!
+ //cout << "somewhere in " << *curbuf << endl;
+ break;
+ }
+ }
+
+ while (len > 0) {
+ // partial?
+ if (off + len < (*curbuf).length()) {
+ //cout << "copying partial of " << *curbuf << endl;
+ _buffers.push_back( bufferptr( *curbuf, len, off ) );
+ _len += len;
+ break;
+ }
+
+ // through end
+ //cout << "copying end (all?) of " << *curbuf << endl;
+ unsigned howmuch = (*curbuf).length() - off;
+ _buffers.push_back( bufferptr( *curbuf, howmuch, off ) );
+ _len += howmuch;
+ len -= howmuch;
+ off = 0;
+ curbuf++;
+ }
+ }
+
+ // funky modifer
+ void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme?
+ assert(off < length());
+ assert(len > 0);
+ //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl;
+
+ // skip off
+ list<bufferptr>::iterator curbuf = _buffers.begin();
+ while (off > 0) {
+ assert(curbuf != _buffers.end());
+ if (off >= (*curbuf).length()) {
+ // skip this buffer
+ //cout << "off = " << off << " skipping over " << *curbuf << endl;
+ off -= (*curbuf).length();
+ curbuf++;
+ } else {
+ // somewhere in this buffer!
+ //cout << "off = " << off << " somewhere in " << *curbuf << endl;
+ break;
+ }
+ }
+ assert(off >= 0);
+
+ if (off) {
+ // add a reference to the front bit
+ // insert it before curbuf (which we'll hose)
+ //cout << "keeping front " << off << " of " << *curbuf << endl;
+ _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) );
+ _len += off;
+ }
+
+ while (len > 0) {
+ // partial?
+ if (off + len < (*curbuf).length()) {
+ //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
+ if (claim_by)
+ claim_by->append( *curbuf, len, off );
+ (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big
+ (*curbuf).set_length( (*curbuf).length() - (len+off) );
+ _len -= off+len;
+ //cout << " now " << *curbuf << endl;
+ break;
+ }
+
+ // hose though the end
+ unsigned howmuch = (*curbuf).length() - off;
+ //cout << "discarding " << howmuch << " of " << *curbuf << endl;
+ if (claim_by)
+ claim_by->append( *curbuf, howmuch, off );
+ _len -= (*curbuf).length();
+ _buffers.erase( curbuf++ );
+ len -= howmuch;
+ off = 0;
+ }
+
+ // splice in *replace (implement me later?)
+ }
+
+ friend ostream& operator<<(ostream& out, bufferlist& bl);
+
+};
+
+// Debug-print a bufferlist: a header line with the total length, then one
+// tab-indented line per piece, then a closing paren.
+inline ostream& operator<<(ostream& out, bufferlist& bl) {
+  out << "bufferlist(len=" << bl.length() << endl;
+  list<bufferptr>::iterator piece = bl._buffers.begin();
+  while (piece != bl._buffers.end()) {
+    out << "\t" << *piece << endl;
+    ++piece;
+  }
+  out << ")" << endl;
+  return out;
+}
+
+
+
+// encoder/decode helpers
+
+// string
+// Append s to bl as a NUL-terminated C string.
+inline void _encode(const string& s, bufferlist& bl)
+{
+  const char *bytes = s.c_str();
+  bl.append(bytes, s.length()+1);   // +1 keeps the terminator
+}
+// Read a NUL-terminated string from bl starting at off, and advance off
+// past the string and its terminator.
+inline void _decode(string& s, bufferlist& bl, int& off)
+{
+  const char *start = bl.c_str() + off;
+  s = start;
+  off += s.length() + 1;
+}
+
+// bufferptr (encapsulated)
+// Append bp to bl: a size_t byte count, followed by bp itself as a piece.
+inline void _encode(bufferptr& bp, bufferlist& bl)
+{
+  size_t nbytes = bp.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(bp);
+}
+// Read a length-prefixed byte range from bl at off into bp, advancing off
+// past it.  If the range sits in exactly one piece, bp is assigned that
+// piece's sub-range; otherwise the bytes are copied into a new buffer.
+inline void _decode(bufferptr& bp, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  bufferlist range;
+  range.substr_of(bl, off, nbytes);
+  off += nbytes;
+
+  if (range.buffers().size() != 1) {
+    bp = new buffer(range.c_str(), range.length());
+    return;
+  }
+  bp = range.buffers().front();
+}
+
+// bufferlist (encapsulated)
+// Append s to bl: a size_t byte count, followed by s's pieces.
+inline void _encode(const bufferlist& s, bufferlist& bl)
+{
+  size_t nbytes = s.length();
+  bl.append((char*)&nbytes, sizeof(nbytes));
+  bl.append(s);
+}
+// Read a length-prefixed byte range from bl at off; s becomes a substr_of
+// view of that range and off is advanced past it.
+inline void _decode(bufferlist& s, bufferlist& bl, int& off)
+{
+  size_t nbytes;
+  bl.copy(off, sizeof(nbytes), (char*)&nbytes);
+  off += sizeof(nbytes);
+
+  s.substr_of(bl, off, nbytes);
+  off += nbytes;
+}
+
+
+// set<T>
+// Append s to bl: an int element count, then each element's raw bytes
+// (sizeof(T) each) in set iteration order.
+template<class T>
+inline void _encode(set<T>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename set<T>::iterator p = s.begin();
+  while (p != s.end()) {
+    T item = *p;
+    bl.append((char*)&item, sizeof(item));
+    remaining--;
+    ++p;
+  }
+  assert(remaining==0);
+}
+// Replace s with the set stored in bl at off (int count, then that many
+// raw T values); off is advanced past everything read.
+template<class T>
+inline void _decode(set<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  int done = 0;
+  while (done < count) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.insert(item);
+    done++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// vector<T>
+// Append s to bl: an int element count, then each element's raw bytes
+// (sizeof(T) each) in index order.
+template<class T>
+inline void _encode(vector<T>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename vector<T>::iterator p = s.begin();
+  while (p != s.end()) {
+    T item = *p;
+    bl.append((char*)&item, sizeof(item));
+    remaining--;
+    ++p;
+  }
+  assert(remaining==0);
+}
+// Replace s with the vector stored in bl at off (int count, then that many
+// raw T values); off is advanced past everything read.
+template<class T>
+inline void _decode(vector<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  s = vector<T>(count);           // size it up front, then fill by index
+  int slot = 0;
+  while (slot < count) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s[slot] = item;
+    slot++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// list<T>
+// Append s to bl: an int element count, then each element's raw bytes
+// (sizeof(T) each) in list order.
+template<class T>
+inline void _encode(const list<T>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename list<T>::const_iterator p = s.begin();
+  while (p != s.end()) {
+    T item = *p;
+    bl.append((char*)&item, sizeof(item));
+    remaining--;
+    ++p;
+  }
+  assert(remaining==0);
+}
+// Replace s with the list stored in bl at off (int count, then that many
+// raw T values, appended in order); off is advanced past everything read.
+template<class T>
+inline void _decode(list<T>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  int done = 0;
+  while (done < count) {
+    T item;
+    bl.copy(off, sizeof(item), (char*)&item);
+    off += sizeof(item);
+    s.push_back(item);
+    done++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// map<string,bufferptr>
+// Append s to bl: an int entry count, then for each entry the key via the
+// string encoder and the value via the bufferptr encoder.
+inline void _encode(map<string, bufferptr>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  map<string, bufferptr>::iterator entry = s.begin();
+  while (entry != s.end()) {
+    _encode(entry->first, bl);
+    _encode(entry->second, bl);
+    remaining--;
+    ++entry;
+  }
+  assert(remaining==0);
+}
+// Replace s with the map stored in bl at off (int count, then count
+// string-key / bufferptr-value pairs); off is advanced past everything read.
+inline void _decode(map<string,bufferptr>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  int done = 0;
+  while (done < count) {
+    string key;
+    _decode(key, bl, off);
+    _decode(s[key], bl, off);     // decode straight into the map slot
+    done++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+// map<T,bufferlist>
+// Append s to bl: an int entry count, then for each entry the key's raw
+// bytes (sizeof(T)) followed by the value via the bufferlist encoder.
+template<class T>
+inline void _encode(const map<T, bufferlist>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename map<T, bufferlist>::const_iterator entry = s.begin();
+  while (entry != s.end()) {
+    T key = entry->first;
+    bl.append((char*)&key, sizeof(key));
+    _encode(entry->second, bl);
+    remaining--;
+    ++entry;
+  }
+  assert(remaining==0);
+}
+// Replace s with the map stored in bl at off (int count, then count pairs of
+// raw T key + encoded bufferlist); off is advanced past everything read.
+template<class T>
+inline void _decode(map<T,bufferlist>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  int done = 0;
+  while (done < count) {
+    T key;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bufferlist value;
+    _decode(value, bl, off);
+    s[key] = value;
+    done++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+// map<T,U>
+// Append s to bl: an int entry count, then for each entry the raw bytes of
+// the key (sizeof(T)) and of the value (sizeof(U)).
+template<class T, class U>
+inline void _encode(const map<T, U>& s, bufferlist& bl)
+{
+  int remaining = s.size();
+  bl.append((char*)&remaining, sizeof(remaining));
+  typename map<T, U>::const_iterator entry = s.begin();
+  while (entry != s.end()) {
+    T key = entry->first;
+    U value = entry->second;
+    bl.append((char*)&key, sizeof(key));
+    bl.append((char*)&value, sizeof(value));
+    remaining--;
+    ++entry;
+  }
+  assert(remaining==0);
+}
+// Replace s with the map stored in bl at off (int count, then count pairs of
+// raw T key + raw U value); off is advanced past everything read.
+template<class T, class U>
+inline void _decode(map<T,U>& s, bufferlist& bl, int& off)
+{
+  s.clear();
+  int count;
+  bl.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+  int done = 0;
+  while (done < count) {
+    T key;
+    U value;
+    bl.copy(off, sizeof(key), (char*)&key);
+    off += sizeof(key);
+    bl.copy(off, sizeof(value), (char*)&value);
+    off += sizeof(value);
+    s[key] = value;
+    done++;
+  }
+  assert(s.size() == (unsigned)count);
+}
+
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __RANGESET_H
+#define __RANGESET_H
+
+/*
+ *
+ * my first container with iterator! it's pretty ugly.
+ *
+ */
+
+#include <map>
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+//typedef int T;
+
+/*
+ * Storage for a rangeset: a map from each range's first value to its last
+ * value (both inclusive), plus the lookup used to find the range holding a
+ * given value.
+ */
+template <class T>
+struct _rangeset_base {
+  map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])
+
+  typedef typename map<T,T>::iterator mapit;
+
+  // get iterator for range including val.  or ranges.end().
+  mapit get_range_for(T val) {
+    // first range that starts at or after val
+    mapit it = ranges.lower_bound(val);
+    if (it != ranges.end() && it->first == val)
+      return it;
+
+    // otherwise only the range starting just before val can contain it;
+    // if there is no such range, val is below everything we hold.
+    if (it == ranges.begin())
+      return ranges.end();
+    it--;
+    if (it->second >= val)     // it->first < val already holds here
+      return it;
+    return ranges.end();
+  }
+
+};
+
+
+/*
+ * Iterator that walks every individual value covered by a map of inclusive
+ * [first,last] ranges, in ascending order.
+ *
+ * It keeps a pointer to the caller's map (not a copy), so it can tell when
+ * its map iterator has reached that map's end(); the map must outlive the
+ * iterator.  Once at the end, 'current' is ignored by comparisons.
+ */
+template <class T>
+class rangeset_iterator :
+  public std::iterator<std::input_iterator_tag, T>
+{
+  map<T,T> *ranges;                     // map being walked; 0 if default-constructed
+  typename map<T,T>::iterator it;       // range currently being walked
+  T current;                            // value within *it; meaningless at end
+
+  bool at_end() const { return ranges == 0 || it == ranges->end(); }
+
+public:
+  // cons
+  rangeset_iterator() : ranges(0), it(), current() {}
+
+  // start at the first value of range 'it' within map 'ranges'
+  // (or at the end, if 'it' is ranges.end()).
+  rangeset_iterator(typename map<T,T>::iterator& it, map<T,T>& ranges)
+    : ranges(&ranges), it(it), current() {
+    if (!at_end())
+      current = this->it->first;
+  }
+
+  // equal when both are at the same range and value, or both at the end
+  bool operator==(rangeset_iterator<T> rit) {
+    if (ranges == 0 || rit.ranges == 0)
+      return ranges == rit.ranges;
+    if (it != rit.it)
+      return false;
+    return at_end() || rit.current == current;
+  }
+  bool operator!=(rangeset_iterator<T> rit) {
+    return !(*this == rit);
+  }
+
+  T& operator*() {
+    return current;
+  }
+
+  // advance to the next value: the next one in the current range, or the
+  // first one of the following range.  As before, the value returned is the
+  // iterator AFTER advancing.  Advancing an end iterator is a no-op.
+  rangeset_iterator<T> operator++(int) {
+    if (at_end())
+      return *this;
+    if (current < it->second)
+      current++;
+    else {
+      it++;
+      if (it != ranges->end())
+        current = it->first;
+    }
+
+    return *this;
+  }
+};
+
+
+/*
+ * rangeset - a set of T values stored compactly as inclusive [first,last]
+ * ranges.  Values adjacent to an existing range are merged into it on
+ * insert; erase shrinks or splits ranges as needed.  _size counts the
+ * individual values held.
+ */
+template <class T>
+class rangeset
+{
+  typedef typename map<T,T>::iterator map_iterator;
+
+  _rangeset_base<T> theset;
+  // number of values held.  Updated with += / explicit subtraction rather
+  // than ++ / -- so it works with an inodeno_t that only offers operator+=
+  // and conversion to an integer.
+  inodeno_t _size;
+
+public:
+  rangeset() { _size = 0; }
+  typedef rangeset_iterator<T> iterator;
+
+  // iterate over individual values
+  iterator begin() {
+    map_iterator it = theset.ranges.begin();
+    return iterator(it, theset.ranges);
+  }
+
+  iterator end() {
+    map_iterator it = theset.ranges.end();
+    return iterator(it, theset.ranges);
+  }
+
+  // iterate over the underlying (first,last) ranges
+  map_iterator map_begin() {
+    return theset.ranges.begin();
+  }
+  map_iterator map_end() {
+    return theset.ranges.end();
+  }
+  int map_size() {
+    return theset.ranges.size();
+  }
+
+  // add range [v1,v2] directly; no overlap or adjacency check is done here.
+  void map_insert(T v1, T v2) {
+    theset.ranges.insert(pair<T,T>(v1,v2));
+    _size += v2 - v1+1;
+  }
+
+
+  // ...
+  bool contains(T val) {
+    if (theset.get_range_for(val) == theset.ranges.end()) return false;
+    assert(!empty());
+    return true;
+  }
+
+  // add a single value (asserts it is not already present), merging with
+  // the ranges that end at val-1 and/or start at val+1.
+  void insert(T val) {
+    assert(!contains(val));
+
+    map_iterator left = theset.get_range_for(val-1);
+    map_iterator right = theset.get_range_for(val+1);
+
+    if (left != theset.ranges.end() &&
+        right != theset.ranges.end()) {
+      // join!
+      left->second = right->second;
+      theset.ranges.erase(right);
+      _size += 1;
+      return;
+    }
+
+    if (left != theset.ranges.end()) {
+      // add to left range
+      left->second = val;
+      _size += 1;
+      return;
+    }
+
+    if (right != theset.ranges.end()) {
+      // add to right range: re-key it to start at val
+      theset.ranges.insert(pair<T,T>(val, right->second));
+      theset.ranges.erase(val+1);
+      _size += 1;
+      return;
+    }
+
+    // new range
+    theset.ranges.insert(pair<T,T>(val,val));
+    _size += 1;
+    return;
+  }
+
+  // number of individual values held
+  unsigned size() {
+    return _size;
+  }
+
+  bool empty() {
+    if (theset.ranges.empty()) {
+      assert(_size == 0);
+      return true;
+    }
+    assert(_size>0);
+    return false;
+  }
+
+
+  // smallest value held (asserts the set is not empty)
+  T first() {
+    assert(!empty());
+    map_iterator it = theset.ranges.begin();
+    return it->first;
+  }
+
+  // remove a single value (asserts it is present), shrinking or splitting
+  // the range that holds it.
+  void erase(T val) {
+    assert(contains(val));
+    map_iterator it = theset.get_range_for(val);
+    assert(it != theset.ranges.end());
+
+    // entire range
+    if (val == it->first && val == it->second) {
+      theset.ranges.erase(it);
+      _size = _size - 1;
+      return;
+    }
+
+    // beginning
+    if (val == it->first) {
+      theset.ranges.insert(pair<T,T>(val+1, it->second));
+      theset.ranges.erase(it);
+      _size = _size - 1;
+      return;
+    }
+
+    // end
+    if (val == it->second) {
+      it->second = val-1;
+      _size = _size - 1;
+      return;
+    }
+
+    // middle split
+    theset.ranges.insert(pair<T,T>(it->first, val-1));
+    theset.ranges.insert(pair<T,T>(val+1, it->second));
+    theset.ranges.erase(it);
+    _size = _size - 1;
+    return;
+  }
+
+  // print each range as " first-last", one per line, to cout
+  void dump() {
+    for (typename map<T,T>::iterator it = theset.ranges.begin();
+         it != theset.ranges.end();
+         it++) {
+      cout << " " << it->first << "-" << it->second << endl;
+    }
+  }
+
+};
+
+
+#endif
--- /dev/null
+#ifndef _STATLITE_H
+#define _STATLITE_H
+
+extern "C" {
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+
+struct statlite {            /* stat-like record: fixed fields first, then fields whose
+                              * validity is signalled through st_litemask */
+ dev_t st_dev; /* device */
+ ino_t st_ino; /* inode */
+ mode_t st_mode; /* protection */
+ nlink_t st_nlink; /* number of hard links */
+ uid_t st_uid; /* user ID of owner */
+ gid_t st_gid; /* group ID of owner */
+ dev_t st_rdev; /* device type (if inode device)*/
+ unsigned long st_litemask; /* bit mask for optional fields (presumably the S_STATLITE_* bits defined below -- confirm) */
+ /***************************************************************/
+ /**** Remaining fields are optional according to st_litemask ***/
+ off_t st_size; /* total size, in bytes */
+ blksize_t st_blksize; /* blocksize for filesystem I/O */
+ blkcnt_t st_blocks; /* number of blocks allocated */
+ struct timespec st_atim; /* Time of last access. */
+ struct timespec st_mtim; /* Time of last modification. */
+ struct timespec st_ctim; /* Time of last status change. */
+ /* older time_t forms of the three timestamps, kept for reference only: */
+ //time_t st_atime; /* time of last access */
+ //time_t st_mtime; /* time of last modification */
+ //time_t st_ctime; /* time of last change */
+};
+
+/* One bit per optional statlite field. */
+#define S_STATLITE_SIZE 1
+#define S_STATLITE_BLKSIZE 2
+#define S_STATLITE_BLOCKS 4
+#define S_STATLITE_ATIME 8
+#define S_STATLITE_MTIME 16
+#define S_STATLITE_CTIME 32
+
+/* S_REQUIRE*(m): mask m with the given field's bit added.  The argument is
+ * parenthesized so that expressions such as `a | b` or `c ? x : y` expand
+ * with the intended precedence. */
+#define S_REQUIRESIZE(m) ((m) | S_STATLITE_SIZE)
+#define S_REQUIREBLKSIZE(m) ((m) | S_STATLITE_BLKSIZE)
+#define S_REQUIREBLOCKS(m) ((m) | S_STATLITE_BLOCKS)
+#define S_REQUIREATIME(m) ((m) | S_STATLITE_ATIME)
+#define S_REQUIREMTIME(m) ((m) | S_STATLITE_MTIME)
+#define S_REQUIRECTIME(m) ((m) | S_STATLITE_CTIME)
+
+/* S_ISVALID*(m): nonzero iff the given field's bit is set in mask m. */
+#define S_ISVALIDSIZE(m) ((m) & S_STATLITE_SIZE)
+#define S_ISVALIDBLKSIZE(m) ((m) & S_STATLITE_BLKSIZE)
+#define S_ISVALIDBLOCKS(m) ((m) & S_STATLITE_BLOCKS)
+#define S_ISVALIDATIME(m) ((m) & S_STATLITE_ATIME)
+#define S_ISVALIDMTIME(m) ((m) & S_STATLITE_MTIME)
+#define S_ISVALIDCTIME(m) ((m) & S_STATLITE_CTIME)
+
+
+// readdirplus etc.
+
+struct dirent_plus {        /* a directory entry bundled with its full struct stat */
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct stat d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 (presumably d_stat is only meaningful when 0 -- confirm) */
+};
+struct dirent_lite {        /* a directory entry bundled with a struct statlite */
+ struct dirent d_dirent; /* dirent struct for this entry */
+ struct statlite d_stat; /* attributes for this entry */
+ int d_stat_err;/* errno for d_stat, or 0 (presumably d_stat is only meaningful when 0 -- confirm) */
+};
+
+}
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_TYPES_H
+#define __MDS_TYPES_H
+
+extern "C" {
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include "statlite.h"
+}
+
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+using namespace std;
+
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+
+#include "object.h"
+
+
+#ifndef MIN   /* only defined here if nothing earlier already defined MIN */
+# define MIN(a,b) ((a) < (b) ? (a):(b))   /* smaller of a and b; the selected argument is evaluated twice */
+#endif
+#ifndef MAX   /* only defined here if nothing earlier already defined MAX */
+# define MAX(a,b) ((a) > (b) ? (a):(b))   /* larger of a and b; the selected argument is evaluated twice */
+#endif
+
+
+// md ops -- numeric operation codes; values are grouped by hundreds
+// (presumably the op field of requests sent to the MDS -- confirm against users)
+#define MDS_OP_STATFS 1
+
+#define MDS_OP_STAT 100       // 1xx: per-inode attribute ops
+#define MDS_OP_LSTAT 101
+#define MDS_OP_UTIME 102
+#define MDS_OP_CHMOD 103
+#define MDS_OP_CHOWN 104
+
+
+#define MDS_OP_READDIR 200    // 2xx: directory / namespace ops
+#define MDS_OP_MKNOD 201
+#define MDS_OP_LINK 202
+#define MDS_OP_UNLINK 203
+#define MDS_OP_RENAME 204
+
+#define MDS_OP_MKDIR 220
+#define MDS_OP_RMDIR 221
+#define MDS_OP_SYMLINK 222
+
+#define MDS_OP_OPEN 301       // 3xx: open-file ops
+#define MDS_OP_TRUNCATE 306
+#define MDS_OP_FSYNC 307
+//#define MDS_OP_CLOSE 310
+#define MDS_OP_RELEASE 308
+
+
+
+// -- stl crap --
+
+/*
+- this is to make some of the STL types work with 64 bit values, string hash keys, etc.
+- added when i was using an old STL.. maybe try taking these out and see if things
+ compile now?
+*/
+
+namespace __gnu_cxx {
+  // std::string keys: hash the string's C-string contents.
+  template<> struct hash< std::string >
+  {
+    size_t operator()( const std::string& x ) const
+    {
+      static hash<const char*> H;
+      const char *raw = x.c_str();
+      return H(raw);
+    }
+  };
+
+  // 64-bit keys: fold the upper and lower 32 bits together with xor and
+  // hand the result to the 32-bit hash.
+  template<> struct hash<__int64_t> {
+    size_t operator()(__int64_t __x) const {
+      static hash<__int32_t> H;
+      const __int64_t upper = __x >> 32;
+      const __int64_t lower = __x & 0xffffffff;
+      return H(upper ^ lower);
+    }
+  };
+
+}
+
+
+/*
+ * comparators for stl containers
+ */
+// for hash_map:
+// hash_map<const char*, long, hash<const char*>, eqstr> vals;
+struct eqstr
+{
+  // true when the two C strings have identical contents
+  bool operator()(const char* s1, const char* s2) const
+  {
+    const int order = strcmp(s1, s2);
+    return order == 0;
+  }
+};
+
+// for set, map
+struct ltstr
+{
+  // true when s1 orders strictly before s2 under strcmp
+  bool operator()(const char* s1, const char* s2) const
+  {
+    const int order = strcmp(s1, s2);
+    return order < 0;
+  }
+};
+
+
+
+/** object layout
+ * how objects are mapped into PGs
+ * (values stored in FileLayout::object_layout)
+ */
+#define OBJECT_LAYOUT_DEFAULT 0 // see g_conf
+#define OBJECT_LAYOUT_HASH 1
+#define OBJECT_LAYOUT_LINEAR 2
+#define OBJECT_LAYOUT_HASHINO 3
+#define OBJECT_LAYOUT_STARTOSD 4 // picked by FileLayout's constructor when an osd >= 0 is given
+
+/** pg layout
+ * how PGs are mapped into (sets of) OSDs
+ */
+#define PG_LAYOUT_CRUSH 0
+#define PG_LAYOUT_HASH 1
+#define PG_LAYOUT_LINEAR 2
+#define PG_LAYOUT_HYBRID 3
+
+/** FileLayout
+ * specifies a striping and replication strategy
+ */
+
+//#define FILE_LAYOUT_CRUSH 0 // stripe via crush
+//#define FILE_LAYOUT_LINEAR 1 // stripe linearly across cluster
+
+struct FileLayout {
+  // layout
+  int object_layout;   // one of the OBJECT_LAYOUT_* values
+
+  // FIXME: make this a union?
+  // rushstripe
+  int stripe_size;     // stripe unit, in bytes
+  int stripe_count;    // over this many objects
+  int object_size;     // until objects are this big, then use a new set of objects.
+
+  // period = bytes before i start on a new set of objects.
+  // const so it can be called on const layouts (and const inodes).
+  int period() const { return object_size * stripe_count; }
+
+  int osd;    // osdlocal
+
+  int num_rep;  // replication
+
+  // NOTE: leaves every field uninitialized; fill them in before use.
+  FileLayout() { }
+
+  // ss = stripe size, sc = stripe count, os = object size, nr = replica
+  // count, o = osd.  A negative o selects OBJECT_LAYOUT_DEFAULT, anything
+  // else selects OBJECT_LAYOUT_STARTOSD.
+  FileLayout(int ss, int sc, int os, int nr=2, int o=-1) :
+    object_layout(o < 0 ? OBJECT_LAYOUT_DEFAULT:OBJECT_LAYOUT_STARTOSD),
+    stripe_size(ss), stripe_count(sc), object_size(os),
+    osd(o),
+    num_rep(nr) { }
+
+};
+
+
+
+// -- inode --
+
+//typedef __uint64_t inodeno_t;
+
+/*
+ * Inode number: a 64-bit value wrapped in its own type so that it can have
+ * its own stream output and hash.  Converts implicitly to and from
+ * __uint64_t.
+ */
+struct inodeno_t {
+  __uint64_t val;
+  inodeno_t() : val() {}
+  inodeno_t(__uint64_t v) : val(v) {}
+  // returns *this by reference (like the built-in +=), so a chained
+  // (a += b) += c updates a rather than a temporary copy.
+  inodeno_t& operator+=(inodeno_t o) { val += o.val; return *this; }
+  operator __uint64_t() const { return val; }
+};
+
+// Print an inode number in hex.  The stream is switched back to decimal
+// afterwards, whatever base it was in before.
+inline ostream& operator<<(ostream& out, inodeno_t ino) {
+  out << hex;
+  out << ino.val;
+  out << dec;
+  return out;
+}
+
+namespace __gnu_cxx {
+  // inodeno_t keys: hash the wrapped 64-bit value.
+  template<> struct hash< inodeno_t >
+  {
+    size_t operator()( const inodeno_t& x ) const
+    {
+      static hash<__uint64_t> H;
+      const __uint64_t raw = x.val;
+      return H(raw);
+    }
+  };
+}
+
+typedef __uint64_t version_t;   // 64-bit version counter (type of inode_t::version and eversion_t::version)
+
+
+
+#define INODE_MODE_FILE 0100000 // S_IFREG
+#define INODE_MODE_SYMLINK 0120000 // S_IFLNK
+#define INODE_MODE_DIR 0040000 // S_IFDIR
+#define INODE_TYPE_MASK 0170000 // and'ed with inode_t::mode before comparing against the INODE_MODE_* values
+
+#define FILE_MODE_R 1
+#define FILE_MODE_W 2
+#define FILE_MODE_RW (1|2) // FILE_MODE_R | FILE_MODE_W
+#define FILE_MODE_LAZY 4
+
+#define INODE_MASK_BASE 1 // ino, ctime, nlink
+#define INODE_MASK_PERM 2 // uid, gid, mode
+#define INODE_MASK_SIZE 4 // size, blksize, blocks
+#define INODE_MASK_MTIME 8 // mtime
+#define INODE_MASK_ATIME 16 // atime
+
+#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME) // every group except atime
+//#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME)
+
+/*
+ * In-memory inode record.  Plain data; the field order matters (see the
+ * note on ino).
+ */
+struct inode_t {
+  // base (immutable)
+  inodeno_t ino;   // NOTE: ino _must_ come first for MDStore.cc to behave!!
+  time_t ctime;
+
+  // other
+  FileLayout layout;  // ?immutable?
+  int nlink;   // base,
+
+  // hard/perm (namespace permissions)
+  mode_t mode;
+  uid_t uid;
+  gid_t gid;
+
+  // file (data access)
+  off_t size;
+  time_t atime, mtime;      // maybe atime different?  "lazy"?
+
+  int mask;
+
+  // special stuff
+  version_t version;           // auth only
+  unsigned char hash_seed;     // only defined for dir; 0 if not hashed.
+  bool anchored;               // auth only
+  version_t file_data_version; // auth only
+
+  // file-type tests on the type bits of mode.  const, so they can be used
+  // on const inodes too.
+  bool is_symlink() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; }
+  bool is_dir() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; }
+  bool is_file() const { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; }
+};
+
+
+
+// lame 128-bit value class.
+class lame128_t {
+public:
+  __uint64_t hi, lo;   // upper and lower 64-bit halves
+
+  // both halves default to 0
+  lame128_t(__uint64_t h=0, __uint64_t l=0) {
+    hi = h;
+    lo = l;
+  }
+};
+
+// Print as "hi.lo".  Takes a const reference so const values and
+// temporaries can be printed too.
+inline ostream& operator<<(ostream& out, const lame128_t& oid) {
+  return out << oid.hi << "." << oid.lo;
+}
+
+
+// osd types
+//typedef __uint32_t ps_t; // placement seed
+//typedef __uint32_t pg_t; // placement group
+typedef __uint64_t coll_t; // collection id
+typedef __uint64_t tid_t; // transaction id
+
+typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years)
+
+// pg stuff
+typedef __uint16_t ps_t;        // placement seed (type of pg_t's ps field)
+typedef __uint8_t pruleset_t;   // type of pg_t's ruleset field
+
+// placement group id
+struct pg_t {
+  // The id can be read either as its individual fields or as one raw
+  // 64-bit value.
+  union {
+    struct {
+      int preferred;
+      ps_t ps;
+      __uint8_t nrep;
+      pruleset_t ruleset;
+    } fields;
+    __uint64_t val;
+  } u;
+
+  // all-zero id
+  pg_t() { u.val = 0; }
+  // copy the raw value
+  pg_t(const pg_t& o) { u.val = o.u.val; }
+  // from a raw 64-bit value
+  pg_t(__uint64_t v) { u.val = v; }
+  // from the individual fields: s = ps, p = preferred, n = nrep, r = ruleset
+  pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) {
+    u.fields.preferred = p;
+    u.fields.ps = s;
+    u.fields.nrep = n;
+    u.fields.ruleset = r;
+  }
+  /*
+  pg_t operator=(__uint64_t v) { u.val = v; return *this; }
+  pg_t operator&=(__uint64_t v) { u.val &= v; return *this; }
+  pg_t operator+=(pg_t o) { u.val += o.val; return *this; }
+  pg_t operator-=(pg_t o) { u.val -= o.val; return *this; }
+  pg_t operator++() { ++u.val; return *this; }
+  */
+  // the raw 64-bit value
+  operator __uint64_t() const { return u.val; }
+};
+
+// Print a pg id as dot-separated fields: "[ruleset.]nrep.[preferred.]ps".
+// ruleset and preferred are left out when zero; ps is printed in hex and the
+// stream is switched back to decimal afterwards.
+inline ostream& operator<<(ostream& out, pg_t pg) {
+  const int ruleset = (int)pg.u.fields.ruleset;
+  const int nrep = (int)pg.u.fields.nrep;
+  const int preferred = pg.u.fields.preferred;
+
+  if (pg.u.fields.ruleset)
+    out << ruleset << '.';
+  out << nrep << '.';
+  if (preferred)
+    out << preferred << '.';
+  out << hex << pg.u.fields.ps << dec;
+  return out;
+}
+
+namespace __gnu_cxx {
+  // pg_t keys: hash the id's raw 64-bit value (via pg_t's conversion).
+  template<> struct hash< pg_t >
+  {
+    size_t operator()( const pg_t& x ) const
+    {
+      static hash<__uint64_t> H;
+      const __uint64_t raw = x;
+      return H(raw);
+    }
+  };
+}
+
+
+
+// compound rados version type
+// An (epoch, version) pair; both default to 0.
+class eversion_t {
+public:
+  epoch_t epoch;
+  version_t version;
+  eversion_t(epoch_t e=0, version_t v=0) {
+    epoch = e;
+    version = v;
+  }
+};
+
+// eversion_t comparisons: ordered by epoch first; version only decides
+// between values of the same epoch.
+inline bool operator==(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return false;
+  return l.version == r.version;
+}
+inline bool operator!=(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return true;
+  return l.version != r.version;
+}
+inline bool operator<(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return l.epoch < r.epoch;
+  return l.version < r.version;
+}
+inline bool operator<=(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return l.epoch <= r.epoch;
+  return l.version <= r.version;
+}
+inline bool operator>(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return l.epoch > r.epoch;
+  return l.version > r.version;
+}
+inline bool operator>=(const eversion_t& l, const eversion_t& r) {
+  if (l.epoch != r.epoch) return l.epoch >= r.epoch;
+  return l.version >= r.version;
+}
+// Print as epoch'version.
+inline ostream& operator<<(ostream& out, const eversion_t e) {
+  out << e.epoch;
+  out << "'";
+  out << e.version;
+  return out;
+}
+
+
+
+#define PG_NONE 0xffffffffL   // low 32 bits all set.  NOTE(review): pg_t's raw value is 64 bits wide -- confirm what this is compared against
+
+
+typedef __uint16_t snapv_t; // snapshot version
+
+
+// Superblock record for an OSD.  A new one carries MAGIC, the given fsid
+// and role, and zeroed epochs.
+class OSDSuperblock {
+public:
+  const static __uint64_t MAGIC = 0xeb0f505dULL;
+  __uint64_t magic;
+  __uint64_t fsid;      // unique fs id (random number)
+  int whoami;           // my role in this fs.
+  epoch_t current_epoch;             // most recent epoch
+  epoch_t oldest_map, newest_map;    // oldest/newest maps we have.
+
+  OSDSuperblock(__uint64_t f=0, int w=0) {
+    magic = MAGIC;
+    fsid = f;
+    whoami = w;
+    current_epoch = 0;
+    oldest_map = 0;
+    newest_map = 0;
+  }
+};
+
+// Print as "sb(fsid F osdN eE [oldest,newest])".  Takes a const reference
+// so const superblocks and temporaries can be printed too.
+inline ostream& operator<<(ostream& out, const OSDSuperblock& sb)
+{
+  return out << "sb(fsid " << sb.fsid
+             << " osd" << sb.whoami
+             << " e" << sb.current_epoch
+             << " [" << sb.oldest_map << "," << sb.newest_map
+             << "])";
+}
+
+// Superblock record for a monitor.  A new one carries MAGIC, the given fsid
+// and mon number, and epoch 0.
+class MonSuperblock {
+public:
+  const static __uint64_t MAGIC = 0x00eb0f5000ULL;
+  __uint64_t magic;
+  __uint64_t fsid;
+  int whoami;    // mon #
+  epoch_t current_epoch;
+
+  MonSuperblock(__uint64_t f=0, int w=0) {
+    magic = MAGIC;
+    fsid = f;
+    whoami = w;
+    current_epoch = 0;
+  }
+};
+
+
+// new types
+
+// A byte range within one object, plus the pieces of the caller's buffer
+// that map onto it.
+class ObjectExtent {
+ public:
+  object_t oid;        // object id
+  off_t start;         // in object
+  size_t length;       // in object
+
+  objectrev_t rev;     // which revision?
+  pg_t pgid;           // where to find the object
+
+  map<size_t, size_t> buffer_extents;  // off -> len.  extents in buffer being mapped (may be fragmented bc of striping!)
+
+  // empty extent: zero start/length/rev/pgid, default oid
+  ObjectExtent()
+    : start(0),
+      length(0),
+      rev(0),
+      pgid(0)
+  {}
+
+  // extent [s, s+l) of object o; rev and pgid start at 0
+  ObjectExtent(object_t o, off_t s=0, size_t l=0)
+    : oid(o),
+      start(s),
+      length(l),
+      rev(0),
+      pgid(0)
+  {}
+};
+
+// Print as "extent(OID in PGID START~LENGTH)".
+inline ostream& operator<<(ostream& out, ObjectExtent &ex)
+{
+  out << "extent(";
+  out << ex.oid << " in " << hex << ex.pgid << dec;
+  out << " " << ex.start << "~" << ex.length;
+  out << ")";
+  return out;
+}
+
+
+
+// client types
+typedef int fh_t; // file handle
+
+
+// dentries
+#define MAX_DENTRY_LEN 255   // presumably the longest dentry name allowed, in bytes -- confirm against users
+
+
+
+
+
+
+// -- io helpers --
+
+// Print a vector as "[a,b,c]".
+template<class A>
+inline ostream& operator<<(ostream& out, vector<A>& v) {
+  out << "[";
+  for (typename vector<A>::iterator p = v.begin(); p != v.end(); ++p) {
+    if (p != v.begin()) out << ",";
+    out << *p;
+  }
+  out << "]";
+  return out;
+}
+
+// Print a set as "a,b,c" (no brackets), in iteration order.
+template<class A>
+inline ostream& operator<<(ostream& out, const set<A>& iset) {
+  typename set<A>::const_iterator p = iset.begin();
+  while (p != iset.end()) {
+    if (p != iset.begin()) out << ",";
+    out << *p;
+    ++p;
+  }
+  return out;
+}
+
+// Print a multiset as "a,b,c" (no brackets), in iteration order.
+template<class A>
+inline ostream& operator<<(ostream& out, const multiset<A>& iset) {
+  typename multiset<A>::const_iterator p = iset.begin();
+  while (p != iset.end()) {
+    if (p != iset.begin()) out << ",";
+    out << *p;
+    ++p;
+  }
+  return out;
+}
+
+template<class A,class B>
+inline ostream& operator<<(ostream& out, const map<A,B>& m)
+{
+ out << "{";
+ for (typename map<A,B>::const_iterator it = m.begin();
+ it != m.end();
+ it++) {
+ if (it != m.begin()) out << ",";
+ out << it->first << "=" << it->second;
+ }
+ out << "}";
+ return out;
+}
+
+
+
+
+// -- rope helpers --
+
+// string
+// Appends the string's characters plus its terminating NUL to the rope.
+inline void _rope(string& s, crope& r)
+{
+  const char *chars = s.c_str();
+  r.append(chars, s.length()+1);   // +1 keeps the trailing '\0' as the record terminator
+}
+// Reads a NUL-terminated string from the rope's character data starting at
+// offset `off`, then advances `off` past the string and its terminator.
+inline void _unrope(string& s, crope& r, int& off)
+{
+  const char *flat = r.c_str();
+  s = flat + off;
+  off += s.length() + 1;
+}
+
+// set<int>
+// Appends a set<int> to the rope: first the element count, then each element,
+// all as raw `int` bytes.
+inline void _rope(set<int>& s, crope& r)
+{
+  int remaining = s.size();
+  r.append((char*)&remaining, sizeof(remaining));
+
+  set<int>::iterator cur = s.begin();
+  while (cur != s.end()) {
+    int elem = *cur;
+    r.append((char*)&elem, sizeof(elem));
+    --remaining;
+    ++cur;
+  }
+
+  // sanity check: exactly s.size() elements were written
+  assert(remaining == 0);
+}
+// Rebuilds a set<int> from the rope starting at offset `off`: reads an `int`
+// count followed by that many raw `int` values, advancing `off` past
+// everything consumed.  Any previous contents of `s` are discarded.
+inline void _unrope(set<int>& s, crope& r, int& off)
+{
+  s.clear();
+
+  int count;
+  r.copy(off, sizeof(count), (char*)&count);
+  off += sizeof(count);
+
+  int done = 0;
+  while (done < count) {
+    int elem;
+    r.copy(off, sizeof(elem), (char*)&elem);
+    off += sizeof(elem);
+    s.insert(elem);
+    ++done;
+  }
+
+  // sanity check: the set ends up with as many elements as the count read
+  assert(s.size() == (unsigned)count);
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+/*
+ * uofs.h
+ *
+ * user-level object-based file system
+ */
+
+ #ifndef _UOFS_H_
+ #define _UOFS_H_
+
+ #include <sys/types.h>
+ #include <unistd.h>
+ #include <stdio.h>
+
+
+ /*
+  * Public entry points of the user-level object-based file system.
+  * NOTE(review): only the prototypes are visible here; the per-function notes
+  * below are read off the names and signatures and should be confirmed
+  * against the implementation.
+  */
+
+ /* Device helpers: device_findsizes() reports through the sz and bsz out-pointers. */
+ int device_open(char *path, int xflags);
+ void device_findsizes(int fd, long long *sz, int *bsz);
+
+ /* Format a device; all layout parameters are passed as plain ints. */
+ int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size,
+ int nr_hash_table_buckets, int delay_allocation, int flush_interval);
+
+ /* Mount by block-device id / shut down. */
+ int uofs_mount(int bdev_id);
+ void uofs_shutdown(void);
+
+ /*
+  * Per-object operations; objects are named by a long long oid.
+  * NOTE(review): read/write take a size_t count but return int -- confirm how
+  * byte counts and errors are reported.
+  */
+ int uofs_read(long long oid, void *buf, off_t offset, size_t count);
+ int uofs_write(long long oid, void *buf, off_t offset, size_t count);
+ int uofs_del(long long oid);
+ int uofs_sync(long long oid);
+ int uofs_exist(long long oid);
+
+ int uofs_get_size(long long oid);
+
+ /* Diagnostics / introspection. */
+ void uofs_superblock_printout(void);
+ int get_large_object_pages(void);
+
+ int uofs_buffer_size(void);
+ #endif
--- /dev/null
+#PSUB -s /bin/bash # Sets your shell in batch
+#PSUB -c alc # Where to run the job
+
+#PSUB -eo # Send std error & std out to the same file
+
+#PSUB -ln $NUM # Number of nodes to use
+#PSUB -g $NUM # Total Number of tasks to use
+#PSUB -cpn 1 # cpus per node
+
+####PSUB -c 1024Mb # memory limit
+#PSUB -lc 1500 # Core file size per process
+#PSUB -nr # Do not automatically resubmit job
+#PSUB -tM 20m # Select time limit. The default time limit
+ # is only 30 minutes! Time can be HH:MM:SS or HH:MM
+
+#PSUB -o $CWD/$OUT # filename for output
+
+# Put your commands here. Remember to 'cd' to the appropriate
+# directory, because the job will initially be in your home directory.
+# To run a parallel job, you need to use the srun.
+
+
+
+echo job $PSUB_JOBID nodes $NUM name $NAME
+
+# environment
+cd $CWD
+export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib
+
+# create fakestore dirs
+srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime"
+
+# go
+srun -l -N $NUM -ppbatch $CMD && touch $DONE
+
+# clean up fakestore
+srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*'
+
--- /dev/null
+#!/usr/bin/perl
+
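+# Experiment parameters: makedirs workload on ebofs, with 'nummds' listed from 1 to 128 and 'cper' at 15 and 20.
+# Client, OSD and total node counts are derived from nummds in '_dep'; 'comb' sets x = nummds with vars mds.req.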
+{
+ 'sleep' => 3,
+
+ #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+ 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208],
+
+ 'cper' => [15,20],
+ '_dep' => [ 'cnode' => '$nummds',# / 4 + 1',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds > 1 ? $nummds:2',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
+
+ 'mds_bal_rep' => 10000, # none of that!
+ 'mds_decay_halflife' => 30,
+
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => [2],
+
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 100,
+ 'end' => 300,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 4,
+
+ # --meta_log_layout_scount 32 --meta_log_layout_ssize 256
+ # --osd_pg_layout linear
+ 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
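+# Experiment parameters: makedirs workload on ebofs at nummds = 160 and 200, cper 15 and 20.
+# Here 'cnode' and 'n' are pinned to '40' and '415' instead of being derived from nummds; numosd is $nummds * .8.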
+{
+ 'sleep' => 3,
+
+ #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+ 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208],
+
+ 'cper' => [15,20],
+ '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds * .8',
+ 'n' => '415'],#1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
+
+ 'mds_bal_rep' => 10000, # none of that!
+ 'mds_decay_halflife' => 30,
+
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => 2,
+
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 100,
+ 'end' => 300,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 4,
+
+ # --meta_log_layout_scount 32 --meta_log_layout_ssize 256
+ # --osd_pg_layout linear
+ 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
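+# Experiment parameters: makedirs workload on ebofs, nummds in (4, 16, 64) and cper in (13, 30, 40).
+# 'comb' sets x = cper with vars mds.req and cl.lat.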
+{
+ 'sleep' => 3,
+
+ #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192],
+ 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208],
+
+ #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150],
+ 'cper' => [13, 30, 40], # just for final run...
+ '_dep' => [ 'cnode' => '$nummds',# / 4 + 1',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
+
+ 'mds_bal_rep' => 10000, # none of that!
+ 'mds_decay_halflife' => 30,
+
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => 2,
+
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 100,
+ 'end' => 300,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 4,
+
+ # --meta_log_layout_scount 32 --meta_log_layout_ssize 256
+ # --osd_pg_layout linear
+ 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ 'comb' => {
+ 'x' => 'cper',#nummds',
+ 'vars' => [ 'mds.req', 'cl.lat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
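+# Experiment parameters: '--syn makefiles 100000 1000 0' workload (see 'custom') on ebofs,
+# nummds from 1 to 128 and cper in (25, 50, 100, 150); 'comb' sets x = nummds with vars mds.req and cl.lat.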
+{
+ 'sleep' => 3,
+
+ 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96],
+
+ 'cper' => [25, 50, 100, 150],# 100, 150, 200],
+
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+
+ 'mds_bal_hash_wr' => 1000,
+
+ 'until' => 180, # --syn until $n ... when to stop clients
+ 'kill_after' => 250,
+ 'start' => 30,
+ 'end' => 180,
+
+ 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0',
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req', 'cl.lat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
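+# Experiment parameters: '--syn createshared 10' followed by '--syn openshared 10 10000' (see 'custom') on ebofs,
+# nummds in (1, 4, 16, 64, 128, 192) and cper in (10, 50, 100, 150); '_dep' caps numosd at 30.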
+{
+ 'sleep' => 3,
+
+ 'nummds' => [1, 4, 16, 64, 128, 192 ],
+
+ 'cper' => [10, 50, 100, 150],
+ '_dep' => [ 'cnode' => '$nummds',# > 30 ? 30:$nummds',
+ 'numclient' => '$nummds*$cper',
+ 'numosd' => '$nummds > 30 ? 30:$nummds',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+
+ 'mds_bal_interval' => 10000,
+ 'mds_bal_hash_wr' => 1000,
+
+ 'until' => 120, # --syn until $n ... when to stop clients
+ 'kill_after' => 180,
+ 'start' => 10,
+ 'end' => 120,
+
+ 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000',
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req', 'cl.lat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
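+# Experiment parameters: replays traces/openssh/untar.include and then make.include (see 'custom') on ebofs,
+# nummds from 2 to 128 and cper 15 and 20; 'comb' sets x = nummds with vars mds.req.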
+{
+ 'sleep' => 10,
+
+ #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+ #'nummds' => [1, 2, 4, 6, 7], # googoo
+ 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ],
+
+ #'trace' => ['make.lib', 'make.include'],
+
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => 2,#6, #[ 2,4,6 ],
+ 'mds_decay_halflife' => 30,
+ 'mds_bal_rep' => 1500,
+ 'mds_bal_hash_rd' => 100000,
+
+ 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+ #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ],
+
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000',
+
+ # parameters
+ 'fs' => 'ebofs',
+
+ #'until' => 500,
+ #'kill_after' => 600,
+ #'start' => 200,
+ #'end' => 500,
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 200,
+ 'end' => 300,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
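+# Experiment parameters: the untar.include / make.include trace run at nummds = 160 and 200, cper 25 and 50.
+# 'n' is pinned to '415', numosd is $nummds * .6, and 'custom' adds --tcp_overlay_clients.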
+{
+ 'sleep' => 10,
+
+ #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+ #'nummds' => [1, 2, 4, 6, 7], # googoo
+ #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ],
+ 'nummds' => [160,200],
+
+ #'trace' => ['make.lib', 'make.include'],
+
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => 2,#6, #[ 2,4,6 ],
+ 'mds_decay_halflife' => 30,
+ 'mds_bal_rep' => 1500,
+ 'mds_bal_hash_rd' => 100000,
+
+ 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+ #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ],
+
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds * .6',
+ 'n' => '415'],#1 + $cnode + $nummds + $numosd' ],
+
+ 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000',
+
+ # parameters
+ 'fs' => 'ebofs',
+
+ #'until' => 500,
+ #'kill_after' => 600,
+ #'start' => 200,
+ #'end' => 500,
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 200,
+ 'end' => 300,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
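+# Experiment parameters: replays traces/openssh/untar.lib and then make.lib (see 'custom') on ebofs,
+# nummds from 2 to 128 and cper 10 and 16; 'comb' sets x = nummds with vars mds.req.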
+{
+ 'sleep' => 10,
+
+ #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+ 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+
+ #'nummds' => [1, 2, 4, 6, 7], # googoo
+ #'trace' => ['make.lib', 'make.include'],
+
+ 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90],
+ #'mds_bal_max' => [4, 10],#6,#[2,4,6,8],
+
+ 'mds_decay_halflife' => 30,
+ 'mds_bal_rep' => 1500,
+ 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+
+ 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000',
+
+ # parameters
+ #'fs' => ['fakestore'],
+ 'fs' => 'ebofs',
+
+ #'until' => 500,
+ #'kill_after' => 600,
+ #'start' => 200,
+ #'end' => 500,
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 150,
+ 'end' => 300,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
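+# Experiment parameters: the untar.lib / make.lib trace run at nummds = 160 and 200, cper in (25, 50, 100).
+# 'cnode' is 0, 'n' is pinned to '415' and numosd is $nummds * .6.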
+{
+ 'sleep' => 10,
+
+ #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+ #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128],
+ 'nummds' => [160,200],
+
+ #'nummds' => [1, 2, 4, 6, 7], # googoo
+ #'trace' => ['make.lib', 'make.include'],
+
+ 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90],
+ #'mds_bal_max' => [4, 10],#6,#[2,4,6,8],
+
+ 'mds_decay_halflife' => 30,
+ 'mds_bal_rep' => 1500,
+ 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ],
+
+ '_dep' => [ 'cnode' => 0,#'30',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds * .6',
+ 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ],
+
+
+ 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000',
+
+ # parameters
+ #'fs' => ['fakestore'],
+ 'fs' => 'ebofs',
+
+ #'until' => 500,
+ #'kill_after' => 600,
+ #'start' => 200,
+ #'end' => 500,
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 150,
+ 'end' => 300,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
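+# Experiment parameters: writefile workload on ebofs with 1 MDS, 10 OSDs and 10 client nodes;
+# cper in (10, 25, 50, 100), osd_pg_layout 'crush' and 'linear', file_layout_osize set from writefile_size.
+# 'comb' sets x = cper with vars osd.c_wrb and osd.c_wr.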
+{
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numosd' => 10,
+
+ 'cnode' => 10,
+ 'cper' => [ 10, 25, 50, 100 ],
+
+ '_dep' => [ 'numclient' => '$cper * $cnode',
+ 'n' => '1 + $cnode + $nummds + $numosd',
+ 'file_layout_osize' => '$writefile_size' ],
+
+ # parameters
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
+
+ 'until' => 160, # --syn until $n ... when to stop clients
+ 'kill_after' => 200,
+ 'start' => 100,
+ 'end' => 160,
+
+ 'writefile' => 1,
+ 'writefile_size' => [
+# 4*1024*1024,
+ 1024*1024 ],
+# 256*1024,
+# 64*1024
+ 'writefile_mb' => 100000,
+
+ 'osd_pg_bits' => 10,#16,
+ #'osd_pg_bits' => [ 16, 20 ],
+
+ #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ],
+ 'osd_pg_layout' => [ 'crush',
+# 'hash',
+ 'linear' ],
+
+ 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60',
+
+ 'comb' => {
+ 'x' => 'cper',#writefile_size',
+ 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ],
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
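+# Experiment parameters: makedirs workload with fs 'ebofs' and 'fakestore' and a list of meta_log_ssize values.
+# NOTE(review): 'comb' uses x = nummds although nummds is fixed at 1, and 'end' (550) is later than
+# 'kill_after' (300) -- confirm both are intended.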
+{
+ 'sleep' => 3,
+ 'kill_after' => 300,
+
+ 'nummds' => 1,
+ 'numosd' => 8,
+ 'numclient' => 100,
+ 'n' => 16,
+
+ # parameters
+ 'fs' => ['ebofs','fakestore'],
+ 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ],
+ 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ],
+
+ 'until' => 200, # --syn until $n ... when to stop clients
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 4,
+
+ 'custom' => '--tcp_skip_rank0',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ # for final summation (script/sum.pl)
+ 'start' => 100,
+ 'end' => 550,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
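+# Experiment parameters: makedirs workload on ebofs with a single client;
+# mds_local_osd in (0, 1) and meta_log_layout_num_rep from 0 to 4.
+# 'comb' sets x = meta_log_layout_num_rep with vars mds.log.lat, cl.lat and osd.rlat.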
+{
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numosd' => 8,
+ 'numclient' => [1],#, 40, 80, 160 ],
+ 'n' => 20,
+
+ 'fs' => 'ebofs',
+
+ 'start' => 20,
+ 'end' => 40,
+ 'until' => 40,
+ 'kill_after' => 60,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 5,
+
+ 'mds_local_osd' => [ 0, 1 ],
+ 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4],
+
+ 'custom' => '--tcp_skip_rank0',
+
+ 'comb' => {
+ 'x' => 'meta_log_layout_num_rep',
+ 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
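+# Experiment parameters: makedirs workload on fakestore, nummds from 1 to 64, cper 50, '_psub' => 'jobs/alc.tp'.
+# NOTE(review): 'end' (550) is later than 'kill_after' (400) -- confirm this is intended.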
+{
+ '_psub' => 'jobs/alc.tp',
+
+ 'sleep' => 3,
+
+ 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64],
+
+ 'cper' => 50,
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$cnode * $cper',
+ 'numosd' => '$nummds * 2',
+ 'n' => '1 + $cnode + $nummds + $numosd' ],
+
+ # parameters
+ #'fs' => 'ebofs',
+ 'fs' => 'fakestore',
+
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 3,
+
+ 'custom' => '--tcp_skip_rank0',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ # for final summation (script/sum.pl)
+ 'start' => 100,
+ 'end' => 550,
+
+ 'comb' => {
+ 'x' => 'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
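+# Experiment parameters: untar.lib / make.lib trace (see 'custom') on ebofs with explicit balancer settings,
+# nummds in (1, 2, 4, 7).
+# NOTE(review): 'mds_bal_rep' is given both as 1500 and, in '_dep', as '$mds_decay_halflife * $decay_hl' --
+# confirm which value the runner ends up using.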
+{
+ 'sleep' => 3,
+
+ 'nummds' => [1, 2, 4, 7], # googoo
+ #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc
+
+
+ # parameters
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
+
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 150,
+ 'end' => 300,
+
+ 'mds_bal_interval' => 90,#[60, 90],
+ #'mds_bal_max' => [3,4,5],
+ 'mds_bal_max' => 4,
+ 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60],
+ 'mds_bal_rep' => 1500,#[1000, 1500, 2000],
+
+ 'decay_hl' => 100,#[ 25, 50, 100, 150 ],
+
+ 'cper' => 100, #[50, 75, 100, 125, 150, 200],
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$nummds * $cper',
+ 'numosd' => '$nummds * 2',
+ 'n' => '1 + $cnode + $nummds + $numosd',
+ 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'],
+
+ 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ # for final summation (script/sum.pl)
+
+ 'comb' => {
+ 'x' => 'nummds',#decay_hl',#'nummds',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
+#!/bin/sh
+
+# makedirs for 300 seconds
+# first bit in memory
+# second bit is commiting from journal too
+# then walk fs for 300 seconds
+# this should all be in memory.
+
+JOB="meta1"
+ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000"
+
+#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1
+#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2
+#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4
+#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8
+#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12
+rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16
+
+
--- /dev/null
+#!/bin/sh
+
+for d in 1 2 4 8 12
+do
+ echo $d
+ cd $d
+ ../../../script/sum.pl mds? mds?? > mds.sum
+ ../../../script/sum.pl -avg mds? mds?? > mds.avg
+
+ ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs
+ ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk
+
+ cd ..
+done
--- /dev/null
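+# Experiment parameters: writefile workload on ebofs with 1 MDS, 8 OSDs, 100 clients and 30 MPI nodes;
+# lists of writefile_size, ebofs_idle_commit_ms and ebofs_abp_max_alloc values.
+# 'comb' sets x = writefile_size with vars osd.c_wrb.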
+{
+ # startup
+ 'n' => 30, # mpi nodes
+ 'sleep' => 3, # seconds between runs
+ 'nummds' => 1,
+ 'numosd' => 8,
+ 'numclient' => 100,#[10, 50, 100, 200, 400],
+
+'kill_after' => 200,
+
+ # parameters
+ 'fs' => 'ebofs',#[
+# 'obfs',
+# 'fakestore',
+# 'ebofs'
+# ],
+ 'until' => 100, # --syn until $n ... when to stop clients
+ 'writefile' => 1,
+ 'writefile_size' => [
+# 2560000,
+ 1024000,
+ 262144,
+# 131072,
+# 98304,
+ 65536,
+# 16384,
+# 4096,
+ 256,
+# 16,
+# 1
+ ],
+ 'writefile_mb' => 1000,
+
+ 'ebofs_idle_commit_ms' => [ 100, 500 ],
+ 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
+
+# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0',
+ 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ # for final summation (script/sum.pl)
+ 'start' => 30,
+ 'end' => 90,
+
+'comb' => {
+ 'x' => 'writefile_size',
+ 'vars' => [ 'osd.c_wrb' ],
+# 'maptitle' => { 'osd_object_layout=' => '',
+# ',osd_pg_layout=' => ' + '}
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
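+# Experiment parameters: makedirs workload on fakestore with 1 MDS, numclient from 5 to 400 and numosd 2 and 4.
+# 'comb' sets x = numclient with vars mds.req.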
+{
+ #'_psub' => 'jobs/alc.tp',
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400],
+ #'numclient' => [ 50, 100, 200 ],
+ 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ],
+ 'n' => 12,
+
+ # parameters
+ 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'],
+ #'fs' => 'ebofs',
+ #'ebofs_commit_ms' => [ 1000, 5000 ],
+ #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ],
+
+ 'until' => 100, # --syn until $n ... when to stop clients
+ 'kill_after' => 300,
+ 'start' => 20,
+ 'end' => 90,
+
+ 'makedirs' => 1,
+ 'makedirs_dirs' => 10,
+ 'makedirs_files' => 10,
+ 'makedirs_depth' => 3,
+
+
+ #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400],
+ #'meta_log_layout_scount' => [2, 4, 8],
+ #'meta_log_layout_num_rep' => [1, 2],
+ #'meta_log_layout_num_rep' => 1,
+
+ 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ 'comb' => {
+ 'x' => 'numclient',#'meta_log_layout_ssize',
+ 'vars' => [ 'mds.req' ]
+ }
+};
--- /dev/null
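+# Experiment parameters: writefile workload with fs 'fakestore' and 'ebofs', four writefile_size values
+# and osd_maxthreads in (0, 1, 2, 4, 8).  This file has no 'comb' section.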
+{
+ # startup
+ 'n' => 30, # mpi nodes
+ 'sleep' => 10, # seconds between runs
+ 'nummds' => 1,
+ 'numosd' => 8,
+ 'numclient' => 50,
+
+ # parameters
+ 'fs' => [
+# 'obfs',
+ 'fakestore',
+ 'ebofs'
+ ],
+ 'until' => 100, # --syn until $n ... when to stop clients
+ 'writefile' => 1,
+ 'writefile_size' => [
+ 1024000,
+ 131072,
+ 65536,
+ 16
+ ],
+ 'writefile_mb' => 1000,
+
+ 'osd_maxthreads' => [0, 1, 2, 4, 8],
+
+ 'custom' => '--tcp_skip_rank0',
+
+ # for final summation (script/sum.pl)
+ 'start' => 30,
+ 'end' => 90
+};
--- /dev/null
+#!/usr/bin/perl
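+# Experiment parameters: writefile workload on ebofs, file_layout_num_rep in (1, 2, 3),
+# osd_pg_layout 'crush' and 'linear'; 'comb' sets x = numosd with vars osd.c_wrb and cl.wrlat.
+# NOTE(review): 'numosd' appears twice in the hash below; in a Perl hash literal the later
+# entry ([14]) replaces the earlier list.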
+{
+ # startup
+ #'n' => 28, # mpi nodes
+
+ 'sleep' => 3, # seconds between runs
+ 'nummds' => 1,
+
+ 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16],
+ 'numosd' => [14],
+ #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64],
+ #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64],
+ 'cper' => [30],
+
+ '_dep' => [ 'cnode' => '$numosd',
+ 'numclient' => '$cnode * $cper',
+ 'n' => 38],#'$nummds + $numosd + $cnode'],
+ #'numclient' => [5, 10, 20, 50, 75, 100, 150 ],
+
+ 'start' => 30,
+ 'end' => 90,
+ 'until' => 100, # --syn until $n ... when to stop clients
+ 'kill_after' => 260,
+
+ # parameters
+ 'fs' => 'ebofs',
+ 'writefile' => 1,
+
+ 'writefile_size' => [# 4096,
+ # 16*1024,
+ # 64*1024,
+ # 256*1024,
+ 1024*1024 ],
+# 'writefile_size' => [
+# 2048*1024,
+# 1048576,
+# 512*1024,
+# 262144,
+# 65536,
+# 16384
+# ],
+ 'writefile_mb' => 1000,
+
+ 'file_layout_num_rep'=> [1,2,3],
+
+ 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14],
+
+ 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ],
+ 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ],
+
+ #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0',
+ #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
+
+ # for final summation (script/sum.pl)
+
+ 'comb' => {
+ 'x' => 'numosd',#'writefile_size',
+ 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ],
+# 'maptitle' => { 'osd_object_layout=' => '',
+# ',osd_pg_layout=' => ' + '}
+ }
+};
+
+
+=item some googoo notes
+
+for 1mb 1x writes,
+
+ with numosd=6, min cper=6 to saturate (cper_saturate)
+ googoo saturates at numosd=8. (osd_saturate)
+
+ -> so, numosd=6 or 7 is a safe size!
+
+
+
+
+=cut
--- /dev/null
+#!/usr/bin/perl
+
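+# Experiment parameters: writefile workload on ebofs with a single client and 12 OSDs;
+# writefile_size from 4 KB to 1 MB, file_layout_num_rep in (1, 2, 3), client_oc in (0, 1).
+# 'comb' sets x = writefile_size with vars osd.c_wrb and cl.wrlat.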
+{
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numosd' => [12],
+ 'numclient' => [1],#, 40, 80, 160 ],
+ 'n' => 16,
+
+ 'fs' => 'ebofs',
+
+ 'start' => 10,
+ 'end' => 40,
+ 'until' => 40,
+ 'kill_after' => 90,
+
+ 'writefile' => 1,
+ 'writefile_size' => [4096,
+ 8*1024,
+ 16*1024,
+ 32*1024,
+ 64*1024,
+ 128*1024,
+ 256*1024,
+ 512*1024,
+ 1024*1024],
+ 'writefile_mb' => 10000,
+
+ #'tcp_multi_out' => [0,1],
+
+# 'mds_local_osd' => [ 0, 1 ],
+ 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4],
+
+ 'client_oc' => [0,1],
+
+ 'custom' => '--tcp_skip_rank0',
+
+ 'comb' => {
+ 'x' => 'writefile_size',#'file_layout_num_rep',
+ 'vars' => [ 'osd.c_wrb','cl.wrlat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
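+# Experiment parameters: writefile workload on fakestore with 6 OSDs and 100 clients,
+# writefile_size from 1 MB down to 4 KB; 'comb' sets x = writefile_size with vars osd.c_wrb.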
+{
+ # startup
+ 'n' => 30, # mpi nodes
+ 'sleep' => 3, # seconds between runs
+ 'nummds' => 1,
+ 'numosd' => 6,
+ 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400],
+
+ 'until' => 100, # --syn until $n ... when to stop clients
+ 'kill_after' => 300,
+
+ # parameters
+ 'fs' => [
+# 'obfs',
+ 'fakestore',
+# 'ebofs'
+ ],
+ 'writefile' => 1,
+ 'writefile_size' => [
+# 2048*1024,
+ 1024*1024,
+ 512*1024,
+ 256*1024,
+ 128*1024,
+ 64*1024,
+ 48*1024,
+ 32*1024,
+ 28*1024,
+ 24*1024,
+ 16*1024,
+ 12*1024,
+ 8*1024,
+ 4096,
+# 256,
+# 16,
+# 1
+ ],
+ 'writefile_mb' => 1000,
+
+ 'file_layout_num_rep'=> 1,#[1,2],
+
+
+# 'ebofs_idle_commit_ms' => [ 100, 500 ],
+# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
+
+ 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60',
+
+ # for final summation (script/sum.pl)
+ 'start' => 30,
+ 'end' => 90,
+
+ 'comb' => {
+ 'x' => 'writefile_size',
+ 'vars' => [ 'osd.c_wrb' ],
+# 'maptitle' => { 'osd_object_layout=' => '',
+# ',osd_pg_layout=' => ' + '}
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
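+# Experiment parameters: no clients (numclient 0); osdbits in (6, 7, 8) with numosd = 1 << osdbits
+# and osd_pg_bits = pgperbits + osdbits; fake_osdmap_updates [30].
+# 'comb' sets x = osdbits with vars osd.sum=mapi, osd.sum=mapidup, osd.numpg and osd.pingset.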
+{
+ 'sleep' => 3,
+
+ 'osdbits' => [6,7,8],#,9],10,11],
+ 'pgperbits' => [3],#,4,5],#[4,6,8],
+
+ 'nummds' => 1,
+
+ '_dep' => [ 'numosd' => '1 << $osdbits',
+ 'osd_pg_bits' => '$pgperbits + $osdbits',
+ 'n' => '3 + $numosd / 32'],
+ 'numclient' => 0,
+
+ 'fake_osdmap_updates' => [30],
+
+ 'fs' => 'ebofs',
+
+ 'start' => 30,
+ 'end' => 300,
+ 'kill_after' => 300,
+
+ 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0',
+ #'custom' => '--tcp_skip_rank0',
+
+ 'comb' => {
+ 'x' => 'osdbits',
+ 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
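+# Experiment parameters: writefile workload on ebofs with one client and 8 OSDs;
+# osd_rep in (0, 1, 2) and file_layout_num_rep from 1 to 6.
+# 'comb' sets x = file_layout_num_rep with vars cl.wrlat.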
+{
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numosd' => 8, #[6],
+ 'numclient' => 1,#, 40, 80, 160 ],
+ 'n' => 10,
+
+ 'fs' => 'ebofs',
+
+ 'start' => 10,
+ 'end' => 40,
+ 'until' => 40,
+ 'kill_after' => 45,
+
+ 'writefile' => 1,
+ 'writefile_size' => [4096,
+# 8*1024,
+# 16*1024,
+# 32*1024,
+ 64*1024,
+# 128*1024,
+# 256*1024,
+# 512*1024,
+# 1024*1024
+],
+ 'writefile_mb' => 10000,
+
+ 'osd_rep' => [0,1,2],
+
+ 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4],
+
+ 'osd_pg_bits' => 4,
+ 'custom' => '--osd_max_rep 8',
+
+ 'comb' => {
+ 'x' => 'file_layout_num_rep',
+ 'vars' => [ 'cl.wrlat' ]
+ }
+};
--- /dev/null
+#!/usr/bin/perl
+
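+# Experiment parameters: writefile workload on ebofs with 8 OSDs and 160 clients;
+# writefile_size from 4 KB to 256 KB, file_layout_ssize and file_layout_osize both 4 MB.
+# 'comb' sets x = writefile_size with vars osd.c_wrb and cl.wrlat.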
+{
+ 'sleep' => 3,
+
+ 'nummds' => 1,
+ 'numosd' => [8],#10,14,16],
+ 'numclient' => [10*16],
+ 'n' => 15,
+
+ 'fs' => 'ebofs',
+
+ 'start' => 60,
+ 'end' => 90,
+ 'until' => 90,
+ 'kill_after' => 190,
+
+ 'writefile' => 1,
+ 'writefile_size' => [4096,
+ 8*1024,
+ 16*1024,
+ 32*1024,
+ 64*1024,
+ 128*1024,
+ 256*1024,
+ # 512*1024,
+# 4*1024*1024,
+# 2*1024*1024,
+# 1024*1024
+],
+ 'writefile_mb' => 10000,
+
+ 'file_layout_num_rep' => 1,
+ 'file_layout_ssize' => 4*1024*1024,
+ 'file_layout_osize' => 4*1024*1024,
+
+ 'osd_pg_bits' => 12,
+
+# 'ebofs_freelist' => [0, 1080, 65400],
+
+ 'custom' => '--objecter_buffer_uncommitted 0',
+
+ #'custom' => '--tcp_skip_rank0',
+
+ 'comb' => {
+ 'x' => 'writefile_size',
+ 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ]
+ }
+};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __ANCHOR_H
+#define __ANCHOR_H
+
+#include <string>
+using std::string;
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+// An anchor record: an inode, the directory containing it, the dentry that
+// refers to it, and a reference count.
+class Anchor {
+public:
+  inodeno_t ino;      // my ino
+  inodeno_t dirino;   // containing dir
+  string    ref_dn;   // referring dentry
+  int       nref;     // reference count
+
+  // Value-initialize the scalar members so that a default-constructed Anchor
+  // never holds indeterminate values.
+  Anchor() : ino(), dirino(), nref(0) {}
+
+  // ref_dn is only copied, so it is taken by const reference; callers may
+  // therefore pass const strings and temporaries as well.
+  Anchor(inodeno_t ino, inodeno_t dirino, const string& ref_dn, int nref=0)
+    : ino(ino), dirino(dirino), ref_dn(ref_dn), nref(nref) {}
+
+  // Appends ino, dirino and nref to bl as raw bytes, followed by ref_dn via
+  // the global _encode().
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&ino, sizeof(ino));
+    bl.append((char*)&dirino, sizeof(dirino));
+    bl.append((char*)&nref, sizeof(nref));
+    ::_encode(ref_dn, bl);
+  }
+
+  // Reads the fields back from bl in the order _encode() wrote them, starting
+  // at offset `off`; `off` is advanced past everything consumed.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    bl.copy(off, sizeof(dirino), (char*)&dirino);
+    off += sizeof(dirino);
+    bl.copy(off, sizeof(nref), (char*)&nref);
+    off += sizeof(nref);
+    ::_decode(ref_dn, bl, off);
+  }
+} ;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+using std::cout;
+using std::cerr;
+using std::endl;
+
+#include "Anchor.h"
+#include "AnchorClient.h"
+#include "MDSMap.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include "MDS.h"
+
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "
+#define derr(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "
+
+
+// Entry point for incoming messages: anchor replies are passed on to
+// handle_anchor_reply(); any other message type trips an assertion.
+void AnchorClient::dispatch(Message *m)
+{
+  if (m->get_type() == MSG_MDS_ANCHORREPLY) {
+    handle_anchor_reply((MAnchorReply*)m);
+    return;
+  }
+
+  // unexpected message type
+  assert(0);
+}
+
+// Completes the pending request that this reply answers.
+//
+// LOOKUP replies copy the returned trace into the vector registered by
+// lookup(); UPDATE/CREATE/DESTROY replies only complete the waiting context.
+// In every case the pending entries for the inode are erased before the
+// context (if any) is finished and deleted.
+// NOTE(review): the message `m` itself is not deleted here -- confirm who
+// owns it.
+void AnchorClient::handle_anchor_reply(class MAnchorReply *m)
+{
+  const inodeno_t ino = m->get_ino();
+  Context *onfinish = 0;
+
+  switch (m->get_op()) {
+
+  case ANCHOR_OP_LOOKUP:
+    assert(pending_lookup_trace.count(ino) == 1);
+
+    // hand the trace back through the caller-supplied vector
+    *(pending_lookup_trace[ino]) = m->get_trace();
+    onfinish = pending_lookup_context[ino];
+
+    pending_lookup_trace.erase(ino);
+    pending_lookup_context.erase(ino);
+    break;
+
+  case ANCHOR_OP_UPDATE:
+  case ANCHOR_OP_CREATE:
+  case ANCHOR_OP_DESTROY:
+    assert(pending_op.count(ino) == 1);
+
+    onfinish = pending_op[ino];
+    pending_op.erase(ino);
+    break;
+
+  default:
+    assert(0);
+  }
+
+  // fire the completion callback, if the caller supplied one
+  if (onfinish) {
+    onfinish->finish(0);
+    delete onfinish;
+  }
+}
+
+
+
+/*
+ * public async interface
+ */
+
+// Asynchronously asks the anchor table MDS for the trace of `ino`.
+// Only a pointer to `trace` is stored, so the vector must stay valid until
+// the reply arrives; the reply handler fills it in and then finishes and
+// deletes `onfinish` (which may be null).
+void AnchorClient::lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino);
+
+  // remember where the reply has to be delivered
+  pending_lookup_context[ino] = onfinish;
+  pending_lookup_trace[ino] = &trace;
+
+  // send message to the MDS holding the anchor table
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()),
+                          mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR,
+                          MDS_PORT_ANCHORCLIENT);
+}
+
+// Asynchronously asks the anchor table MDS to create an anchor for `ino`,
+// sending `trace` along with the request.  `onfinish` (which may be null) is
+// finished and deleted by the reply handler.
+void AnchorClient::create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino);
+  req->set_trace(trace);
+
+  // remember who is waiting for the reply
+  pending_op[ino] = onfinish;
+
+  // send message to the MDS holding the anchor table
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()),
+                          mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR,
+                          MDS_PORT_ANCHORCLIENT);
+}
+
+// Asynchronously asks the anchor table MDS to update the anchor for `ino`,
+// sending `trace` along with the request.  `onfinish` (which may be null) is
+// finished and deleted by the reply handler.
+void AnchorClient::update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino);
+  req->set_trace(trace);
+
+  // remember who is waiting for the reply
+  pending_op[ino] = onfinish;
+
+  // send message to the MDS holding the anchor table
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()),
+                          mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR,
+                          MDS_PORT_ANCHORCLIENT);
+}
+
+/*
+ * Asynchronously drop the anchor for ino.
+ * onfinish is run when the reply arrives.
+ */
+void AnchorClient::destroy(inodeno_t ino, Context *onfinish)
+{
+  // whom to wake when the reply arrives
+  pending_op[ino] = onfinish;
+
+  // ask the mds holding the anchor table
+  MAnchorRequest *request = new MAnchorRequest(ANCHOR_OP_DESTROY, ino);
+  messenger->send_message(request,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()),
+                          mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR,
+                          MDS_PORT_ANCHORCLIENT);
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __ANCHORCLIENT_H
+#define __ANCHORCLIENT_H
+
+#include <vector>
+using std::vector;
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+
+#include "include/types.h"
+#include "msg/Dispatcher.h"
+
+#include "Anchor.h"
+
+class Messenger;
+class MDSMap;
+class Context;
+
+// Client side of the anchor table protocol: sends MAnchorRequest messages to
+// the mds that mdsmap names as the anchor table and completes the caller's
+// Context when the matching MAnchorReply is dispatched back.
+class AnchorClient : public Dispatcher {
+ Messenger *messenger;
+ MDSMap *mdsmap;
+
+ // outstanding requests, keyed by inode number.
+ // pending_op holds the completion for create/update/destroy;
+ // the two pending_lookup_* maps hold the completion and the caller's
+ // result vector for lookup.  A second request for the same ino replaces
+ // the earlier entry.
+ hash_map<inodeno_t, Context*> pending_op;
+ hash_map<inodeno_t, Context*> pending_lookup_context;
+ hash_map<inodeno_t, vector<Anchor*>*> pending_lookup_trace;
+
+ // finish the pending request that reply m answers
+ void handle_anchor_reply(class MAnchorReply *m);
+
+
+public:
+ AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {}
+
+ // async user interface
+ // onfinish (may be null) is finished with 0 and deleted when the reply arrives.
+ // for lookup, 'trace' must stay valid until then: its address is stored.
+ void lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+ void create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+ void update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+ void destroy(inodeno_t ino, Context *onfinish);
+
+ // message entry point; only anchor replies are accepted
+ void dispatch(Message *m);
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "AnchorTable.h"
+#include "MDS.h"
+
+#include "osdc/Filer.h"
+
+#include "msg/Messenger.h"
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+
+#include "common/Clock.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable "
+#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable "
+
+// A new table is neither open nor in the middle of opening.
+AnchorTable::AnchorTable(MDS *mds)
+  : mds(mds),
+    opening(false),
+    opened(false)
+{
+}
+
+// Fill in the inode describing the object that stores this mds's table:
+// zero everything, then set the layout and a per-mds inode number.
+void AnchorTable::init_inode()
+{
+  memset(&table_inode, 0, sizeof(table_inode));
+
+  table_inode.layout = g_OSD_FileLayout;
+  table_inode.ino = MDS_INO_ANCHORTABLE + mds->get_nodeid();
+}
+
+// Start over with an empty, open table.
+// The table owns its Anchor records (dec() deletes them when their count
+// drops to zero), so they are freed here before the map is cleared;
+// clearing the map alone would leak them.
+void AnchorTable::reset()
+{
+  init_inode();
+  opened = true;
+
+  for (hash_map<inodeno_t, Anchor*>::iterator it = anchor_map.begin();
+       it != anchor_map.end();
+       ++it)
+    delete it->second;
+  anchor_map.clear();
+}
+
+/*
+ * basic updates
+ */
+
+// Insert a record for ino (child of dirino, named ref_dn) unless one exists.
+// Returns true if a record was created, false if ino was already present.
+// The parent must already be in the table unless dirino is below 1000.
+bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn)
+{
+  dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl;
+
+  // parent should be there
+  assert(dirino < 1000 ||             // system dirino
+         anchor_map.count(dirino));   // have
+
+  // already known?
+  if (anchor_map.count(ino) != 0) {
+    dout(10) << " add: had " << std::hex << ino << std::dec << endl;
+    return false;
+  }
+
+  // new item
+  anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn);
+  dout(10) << " add: added " << std::hex << ino << std::dec << endl;
+  return true;
+}
+
+// Bump the reference count of ino and of every ancestor reachable through
+// dirino links that is present in the table.
+void AnchorTable::inc(inodeno_t ino)
+{
+  dout(7) << "inc " << std::hex << ino << std::dec << endl;
+
+  assert(anchor_map.count(ino) != 0);
+  Anchor *cur = anchor_map[ino];
+  assert(cur);
+
+  for (;;) {
+    cur->nref++;
+    dout(10) << " inc: record " << std::hex << ino << std::dec << " now " << cur->nref << endl;
+
+    // step to the parent; stop at the top or at a parent we do not track
+    ino = cur->dirino;
+    if (ino == 0 || anchor_map.count(ino) == 0)
+      break;
+
+    cur = anchor_map[ino];
+    assert(cur);
+  }
+}
+
+// Drop one reference from ino and from every tracked ancestor; records whose
+// count reaches zero are removed from the table and freed.
+void AnchorTable::dec(inodeno_t ino)
+{
+  dout(7) << "dec " << std::hex << ino << std::dec << endl;
+
+  assert(anchor_map.count(ino) != 0);
+  Anchor *cur = anchor_map[ino];
+  assert(cur);
+
+  for (;;) {
+    cur->nref--;
+
+    // grab the parent before the record can go away
+    const inodeno_t parent = cur->dirino;
+
+    if (cur->nref != 0) {
+      dout(10) << " dec: record " << std::hex << ino << std::dec << " now " << cur->nref << endl;
+    } else {
+      dout(10) << " dec: record " << std::hex << ino << std::dec << " now 0, removing" << endl;
+      anchor_map.erase(ino);
+      delete cur;
+    }
+
+    // step to the parent; stop at the top or at a parent we do not track
+    ino = parent;
+    if (ino == 0 || anchor_map.count(ino) == 0)
+      break;
+
+    cur = anchor_map[ino];
+    assert(cur);
+  }
+}
+
+
+/*
+ * high level
+ */
+
+// Build the anchor trace for ino: the chain of records from the topmost
+// ancestor down to ino itself, placed in front of whatever 'trace' already
+// holds.  The walk stops at the first record whose dirino is below
+// MDS_INO_BASE.
+//
+// Records are gathered leaf-first and spliced in with a single reversed
+// range insert, instead of inserting each one at the front of the vector
+// (which shifted the whole vector on every step).
+void AnchorTable::lookup(inodeno_t ino, vector<Anchor*>& trace)
+{
+  dout(7) << "lookup " << std::hex << ino << std::dec << endl;
+
+  assert(anchor_map.count(ino) == 1);
+  Anchor *anchor = anchor_map[ino];
+  assert(anchor);
+
+  vector<Anchor*> chain;   // leaf first, root last
+  while (true) {
+    dout(10) << " record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl;
+    chain.push_back(anchor);
+
+    if (anchor->dirino < MDS_INO_BASE) break;
+
+    assert(anchor_map.count(anchor->dirino) == 1);
+    anchor = anchor_map[anchor->dirino];
+    assert(anchor);
+  }
+
+  // root first, ahead of any existing entries
+  trace.insert(trace.begin(), chain.rbegin(), chain.rend());
+}
+
+// Anchor ino: make sure every record in the supplied trace exists, then take
+// a reference on ino (and, through inc(), on its tracked ancestors).
+void AnchorTable::create(inodeno_t ino, vector<Anchor*>& trace)
+{
+  dout(7) << "create " << std::hex << ino << std::dec << endl;
+
+  // make sure trace is in table
+  for (vector<Anchor*>::iterator it = trace.begin(); it != trace.end(); ++it) {
+    Anchor *a = *it;
+    add(a->ino, a->dirino, a->ref_dn);
+  }
+
+  inc(ino);  // ok!
+}
+
+// Un-anchor ino: release the reference taken by create().
+void AnchorTable::destroy(inodeno_t ino)
+{
+  dec(ino);
+}
+
+
+
+/*
+ * messages
+ */
+
+// Message entry point for the anchor table.
+// Only MSG_MDS_ANCHORREQUEST is expected; any other type trips the assert.
+void AnchorTable::dispatch(Message *m)
+{
+  if (m->get_type() == MSG_MDS_ANCHORREQUEST) {
+    handle_anchor_request((MAnchorRequest*)m);
+    return;
+  }
+
+  // unexpected message type
+  assert(0);
+}
+
+
+
+// Serve one request against the table and send the reply to the requester.
+// If the table has not been loaded yet, the request is parked on
+// waiting_for_open (to be retried) and a load is started if none is running.
+// The request message is deleted once the reply has been sent.
+void AnchorTable::handle_anchor_request(class MAnchorRequest *m)
+{
+  // make sure i'm open!
+  if (!opened) {
+    dout(7) << "not open yet" << endl;
+
+    waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m));
+
+    // kick off the load only once
+    if (!opening) {
+      opening = true;
+      load(0);
+    }
+    return;
+  }
+
+  // go
+  MAnchorReply *reply = new MAnchorReply(m);
+
+  switch (m->get_op()) {
+
+  case ANCHOR_OP_LOOKUP:
+    // result goes straight into the reply's trace
+    lookup( m->get_ino(), reply->get_trace() );
+    break;
+
+  case ANCHOR_OP_CREATE:
+    create( m->get_ino(), m->get_trace() );
+    break;
+
+  case ANCHOR_OP_DESTROY:
+    destroy( m->get_ino() );
+    break;
+
+  case ANCHOR_OP_UPDATE:
+    // update == drop the old anchor, then create the new one
+    destroy( m->get_ino() );
+    create( m->get_ino(), m->get_trace() );
+    break;
+
+  default:
+    assert(0);
+  }
+
+  // send reply
+  mds->messenger->send_message(reply, m->get_source(), m->get_source_inst(), m->get_source_port());
+  delete m;
+}
+
+
+
+
+// primitive load/save for now!
+
+// load/save entire table for now!
+
+// Write the whole table out through the filer.
+// Layout written: a size_t payload length, then the payload, which is an
+// int anchor count followed by each anchor's encoding.
+// Does nothing (and does not touch onfinish) if the table is not open.
+void AnchorTable::save(Context *onfinish)
+{
+  dout(7) << "save" << endl;
+  if (!opened) return;
+
+  // payload: count, then every anchor
+  bufferlist payload;
+  int num = anchor_map.size();
+  payload.append((char*)&num, sizeof(int));
+
+  hash_map<inodeno_t, Anchor*>::iterator it = anchor_map.begin();
+  while (it != anchor_map.end()) {
+    dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl;
+    Anchor *a = it->second;
+    assert(a);
+    a->_encode(payload);
+    it++;
+  }
+
+  // final image: payload length, then the payload itself
+  size_t size = payload.length();
+  bufferlist image;
+  image.append((char*)&size, sizeof(size));
+  image.claim_append(payload);
+
+  dout(7) << " " << num << " anchors, " << size << " bytes" << endl;
+
+  // write!
+  mds->filer->write(table_inode,
+                    0, image.length(),
+                    image, 0,
+                    NULL, onfinish);
+}
+
+
+
+// Completion for the second read of a table load (the payload).
+// The read fills 'bl'; finish() passes it, with the expected size, to load_2().
+class C_AT_Load : public Context {
+  AnchorTable *at;
+public:
+  size_t size;
+  bufferlist bl;
+
+  C_AT_Load(size_t sz, AnchorTable *table) : at(table), size(sz) {}
+
+  void finish(int result) {
+    assert(result > 0);
+
+    at->load_2(size, bl);
+  }
+};
+
+// Completion for the first read of a table load (the size_t length header).
+// On success it issues the second read for the payload; otherwise it finishes
+// the load with an empty table.
+//
+// The header is only decoded when the read succeeded and returned enough
+// bytes.  Previously the buffer length was asserted before r was examined,
+// so a failed or short read aborted instead of reaching the fail branch.
+class C_AT_LoadSize : public Context {
+  AnchorTable *at;
+  MDS *mds;
+public:
+  bufferlist bl;
+  C_AT_LoadSize(AnchorTable *at, MDS *mds) {
+    this->at = at;
+    this->mds = mds;
+  }
+  void finish(int r) {
+    size_t size = 0;
+    const bool have_header = (r > 0) && (bl.length() >= sizeof(size));
+    if (have_header)
+      bl.copy(0, sizeof(size), (char*)&size);
+    cout << "r is " << r << " size is " << size << endl;
+    if (have_header && size > 0) {
+      // payload follows the header
+      C_AT_Load *c = new C_AT_Load(size, at);
+      mds->filer->read(at->table_inode,
+                       sizeof(size), size,
+                       &c->bl,
+                       c);
+    } else {
+      // fail
+      bufferlist empty;
+      at->load_2(0, empty);
+    }
+  }
+};
+
+// Begin loading the table: read the size_t length header; the rest of the
+// load continues in C_AT_LoadSize and load_2().
+// onfinish, if given, is finished once the table is open.  A null onfinish
+// (as passed by handle_anchor_request) is not queued, so the waiter list
+// never contains a null context.
+void AnchorTable::load(Context *onfinish)
+{
+  dout(7) << "load" << endl;
+  init_inode();
+
+  assert(!opened);
+
+  if (onfinish)
+    waiting_for_open.push_back(onfinish);
+
+  C_AT_LoadSize *c = new C_AT_LoadSize(this, mds);
+  mds->filer->read(table_inode,
+                   0, sizeof(size_t),
+                   &c->bl,
+                   c);
+}
+
+// Final stage of a load: decode the payload (an int count followed by that
+// many encoded anchors) into anchor_map, mark the table open and wake
+// everything parked on waiting_for_open.
+//
+// The failed-load path passes an empty buffer; in that case (or for any
+// buffer too short to hold the count) the table is opened empty instead of
+// reading a count that is not there.
+void AnchorTable::load_2(size_t size, bufferlist& bl)
+{
+  // num
+  int off = 0;
+  int num = 0;
+  if (bl.length() >= sizeof(num)) {
+    bl.copy(0, sizeof(num), (char*)&num);
+    off += sizeof(num);
+  }
+
+  // parse anchors
+  for (int i=0; i<num; i++) {
+    Anchor *a = new Anchor;
+    a->_decode(bl, off);
+    dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl;
+    anchor_map[a->ino] = a;
+  }
+
+  dout(7) << "load_2 got " << num << " anchors" << endl;
+
+  opened = true;
+  opening = false;
+
+  // finish
+  finish_contexts(waiting_for_open);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __ANCHORTABLE_H
+#define __ANCHORTABLE_H
+
+#include "Anchor.h"
+#include "include/Context.h"
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+class MDS;
+
+
+// In-memory anchor table: maps an inode number to an Anchor record
+// (ino, dirino, ref_dn, nref).  Serves MAnchorRequest messages and is
+// loaded/saved as a whole through the mds's filer.
+class AnchorTable {
+ MDS *mds;
+ // records are heap-allocated; dec() deletes one when its nref reaches 0
+ hash_map<inodeno_t, Anchor*> anchor_map;
+
+ // opening: a load is in progress; opened: table is usable.
+ // contexts in waiting_for_open are finished by load_2().
+ bool opening, opened;
+ list<Context*> waiting_for_open;
+
+ public:
+ // inode used for the filer reads/writes of the table; set up by init_inode()
+ inode_t table_inode;
+
+ public:
+ AnchorTable(MDS *mds); 
+
+ protected:
+ void init_inode(); // call this before doing anything.
+
+ // per-inode presence/fetch hooks
+ bool have_ino(inodeno_t ino) { 
+ return true; // always in memory for now.
+ } 
+ void fetch_ino(inodeno_t ino, Context *onfinish) {
+ assert(!opened);
+ load(onfinish);
+ }
+
+ // adjust table
+ // add: insert a record if absent (returns true if inserted)
+ // inc/dec: adjust nref on ino and on its tracked ancestors
+ bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn);
+ void inc(inodeno_t ino);
+ void dec(inodeno_t ino);
+
+
+ // high level interface
+ // lookup: prepend the ancestor chain for ino to trace
+ // create: add the trace's records, then inc(ino)
+ // destroy: dec(ino)
+ void lookup(inodeno_t ino, vector<Anchor*>& trace);
+ void create(inodeno_t ino, vector<Anchor*>& trace);
+ void destroy(inodeno_t ino);
+
+ // messages
+ public:
+ void dispatch(class Message *m);
+ protected:
+ void handle_anchor_request(class MAnchorRequest *m); 
+
+
+ public:
+
+ // load/save entire table for now!
+ // reset: start with an empty, open table
+ // load_2: second half of load, called from the read completions
+ void reset();
+ void save(Context *onfinish);
+ void load(Context *onfinish);
+ void load_2(size_t size, bufferlist& bl);
+
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "CDentry.h"
+#include "CInode.h"
+#include "CDir.h"
+
+#include <cassert>
+
+// debug output for dentry code: printed when level x is at or below g_conf.debug
+#undef dout
+#define dout(x) if ((x) <= g_conf.debug) cout << "mds.dentry "
+
+
+// CDentry
+
+// Print a one-line, bracketed description of a dentry: name, pin count,
+// link type, lock state, dirty flag, version, inode pointer, own address and
+// the containing dir.
+ostream& operator<<(ostream& out, CDentry& dn)
+{
+  const int lockstate = dn.get_lockstate();
+
+  out << "[dentry " << dn.get_name();
+
+  if (dn.is_pinned())
+    out << " " << dn.num_pins() << " pins";
+
+  // link type
+  if (dn.is_null())
+    out << " NULL";
+  if (dn.is_remote())
+    out << " REMOTE";
+
+  // state
+  if (lockstate == DN_LOCK_UNPINNING)
+    out << " unpinning";
+  if (dn.is_dirty())
+    out << " dirty";
+  if (lockstate == DN_LOCK_PREXLOCK)
+    out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set();
+  if (lockstate == DN_LOCK_XLOCK)
+    out << " xlock=" << dn.get_xlockedby();
+
+  out << " dirv=" << dn.get_parent_dir_version();
+
+  out << " inode=" << dn.get_inode()
+      << " " << &dn
+      << " in " << *dn.get_dir()
+      << "]";
+  return out;
+}
+
+// Copying a dentry is not supported; reaching this constructor is a bug.
+CDentry::CDentry(const CDentry& m)
+{
+  assert(0);  // copy constructor is not implemented
+}
+
+
+// Mark this dentry dirty: dirty the containing dir, pin the inode of a
+// primary dentry on the clean->dirty transition, and record the dir version
+// the dentry now belongs to.
+void CDentry::mark_dirty()
+{
+  dout(10) << " mark_dirty " << *this << endl;
+
+  // dir is now dirty (if it wasn't already)
+  dir->mark_dirty();
+
+  // pin inode?  only when we were clean before
+  const bool was_clean = !dirty;
+  if (is_primary() && was_clean && inode) {
+    inode->get(CINODE_PIN_DNDIRTY);
+  }
+
+  // i now live in that (potentially newly dirty) version
+  parent_dir_version = dir->get_version();
+  dirty = true;
+}
+// Mark this dentry clean.  The dentry must be dirty and its recorded dir
+// version must lie between the dir's last committed version and its current
+// version.  Releases the inode pin taken by mark_dirty().
+void CDentry::mark_clean()
+{
+  dout(10) << " mark_clean " << *this << endl;
+
+  assert(dirty);
+  assert(parent_dir_version <= dir->get_version());
+
+  // complain loudly before the assert below fires
+  if (parent_dir_version < dir->get_last_committed_version()) {
+    cerr << " bad mark_clean " << *this << endl;
+  }
+  assert(parent_dir_version >= dir->get_last_committed_version());
+
+  // drop the dirty pin on a primary dentry's inode
+  if (is_primary() && dirty && inode) {
+    inode->put(CINODE_PIN_DNDIRTY);
+  }
+  dirty = false;
+}
+
+
+// Append this dentry's path to s: the path of the containing dir's own
+// dentry (if it has one), then "/" and this dentry's name.
+void CDentry::make_path(string& s)
+{
+  // ancestors first
+  if (dir->inode->get_parent_dn()) {
+    dir->inode->get_parent_dn()->make_path(s);
+  }
+
+  s.append("/");
+  s.append(name);
+}
+
+
+// Attach the in-memory inode to this remote dentry.  The inode's number must
+// match remote_ino; the inode is told about its new remote parent.
+void CDentry::link_remote(CInode *in)
+{
+  assert(is_remote());
+  assert(in->ino() == remote_ino);
+
+  this->inode = in;
+  in->add_remote_parent(this);
+}
+
+// Detach the in-memory inode from this remote dentry.  remote_ino is left
+// untouched; only the inode pointer is cleared.
+void CDentry::unlink_remote()
+{
+  assert(is_remote());
+  assert(inode);
+
+  inode->remove_remote_parent(this);
+  this->inode = 0;
+}
+
+
+
+
+
+// =
+// Assigning a dentry is not supported; reaching this operator is a bug.
+const CDentry& CDentry::operator= (const CDentry& right)
+{
+  assert(0);  // copy assignment is not implemented
+  return *this;
+}
+
+ // comparisons
+ // Dentries compare by name only.
+ // operator!= previously returned (name == right.name), i.e. the same
+ // answer as operator==; it now returns the negation.
+ bool CDentry::operator== (const CDentry& right) const {
+   return name == right.name;
+ }
+ bool CDentry::operator!= (const CDentry& right) const {
+   return name != right.name;
+ }
+ bool CDentry::operator< (const CDentry& right) const {
+   return name < right.name;
+ }
+ bool CDentry::operator> (const CDentry& right) const {
+   return name > right.name;
+ }
+ bool CDentry::operator>= (const CDentry& right) const {
+   return name >= right.name;
+ }
+ bool CDentry::operator<= (const CDentry& right) const {
+   return name <= right.name;
+ }
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CDENTRY_H
+#define __CDENTRY_H
+
+#include <assert.h>
+#include <string>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+
+class CInode;
+class CDir;
+
+#define DN_LOCK_SYNC 0
+#define DN_LOCK_PREXLOCK 1
+#define DN_LOCK_XLOCK 2
+#define DN_LOCK_UNPINNING 3 // waiting for pins to go away
+
+#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer.
+
+class Message;
+
+// dentry
+// A directory entry in the mds cache: a name inside a CDir that is either
+// primary (points at an inode), remote (names an inode by number, with an
+// optional inode pointer) or null (neither).
+class CDentry {
+ protected:
+  string name;
+  CInode *inode;
+  CDir *dir;
+
+  inodeno_t remote_ino;           // if remote dentry
+
+  // state
+  bool dirty;
+  version_t parent_dir_version;   // dir version when last touched.
+
+  // locking
+  int lockstate;
+  Message *xlockedby;
+  set<int> gather_set;
+
+  // pins: npins always equals pinset.size()
+  int npins;
+  multiset<Message*> pinset;
+
+  friend class Migrator;
+  friend class Locker;
+  friend class Renamer;
+  friend class Server;
+  friend class MDCache;
+  friend class MDS;
+  friend class CInode;
+  friend class C_MDC_XlockRequest;
+
+ public:
+  // cons
+  // empty dentry: no name, null link
+  CDentry() :
+    inode(0),
+    dir(0),
+    remote_ino(0),
+    dirty(false),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+  // remote dentry for inode number ino, optionally with the inode attached
+  CDentry(const string& n, inodeno_t ino, CInode *in=0) :
+    name(n),
+    inode(in),
+    dir(0),
+    remote_ino(ino),
+    dirty(false),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+  // primary dentry for in (null dentry if in is 0)
+  CDentry(const string& n, CInode *in) :
+    name(n),
+    inode(in),
+    dir(0),
+    remote_ino(0),
+    dirty(false),
+    parent_dir_version(0),
+    lockstate(DN_LOCK_SYNC),
+    xlockedby(0),
+    npins(0) { }
+
+  // accessors
+  CInode *get_inode() { return inode; }
+  CDir *get_dir() { return dir; }
+  const string& get_name() { return name; }
+  inodeno_t get_remote_ino() { return remote_ino; }
+
+  void set_remote_ino(inodeno_t ino) { remote_ino = ino; }
+
+  // dentry type is primary || remote || null
+  // inode ptr is required for primary, optional for remote, undefined for null
+  bool is_primary() { return inode != 0 && remote_ino == 0; }
+  bool is_remote() { return remote_ino > 0; }
+  bool is_null() { return inode == 0 && remote_ino == 0; }
+
+  // remote links
+  void link_remote(CInode *in);
+  void unlink_remote();
+
+
+  // copy cons
+  CDentry(const CDentry& m);
+  const CDentry& operator= (const CDentry& right);
+
+  // comparisons
+  bool operator== (const CDentry& right) const;
+  bool operator!= (const CDentry& right) const;
+  bool operator< (const CDentry& right) const;
+  bool operator> (const CDentry& right) const;
+  bool operator>= (const CDentry& right) const;
+  bool operator<= (const CDentry& right) const;
+
+  // misc
+  void make_path(string& p);
+
+  // -- state
+  __uint64_t get_parent_dir_version() { return parent_dir_version; }
+  // raise parent_dir_version to ge; never lowers it
+  void float_parent_dir_version(__uint64_t ge) {
+    if (ge > parent_dir_version)
+      parent_dir_version = ge;
+  }
+
+  bool is_dirty() { return dirty; }
+  bool is_clean() { return !dirty; }
+
+  void mark_dirty();
+  void mark_clean();
+
+
+  // -- locking
+  int get_lockstate() { return lockstate; }
+  set<int>& get_gather_set() { return gather_set; }
+
+  bool is_sync() { return lockstate == DN_LOCK_SYNC; }
+  // readable while sync or while waiting for pins to drain
+  bool can_read() {
+    return lockstate == DN_LOCK_SYNC || lockstate == DN_LOCK_UNPINNING;
+  }
+  // the xlock holder may always read
+  bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); }
+  bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; }
+  Message* get_xlockedby() { return xlockedby; }
+  bool is_xlockedbyother(Message *m) {
+    return lockstate == DN_LOCK_XLOCK && xlockedby != m;
+  }
+  bool is_xlockedbyme(Message *m) {
+    return lockstate == DN_LOCK_XLOCK && xlockedby == m;
+  }
+  bool is_prexlockbyother(Message *m) {
+    return lockstate == DN_LOCK_PREXLOCK && xlockedby != m;
+  }
+
+  // pins
+  void pin(Message *m) {
+    npins++;
+    pinset.insert(m);
+    assert(pinset.size() == (unsigned)npins);
+  }
+  void unpin(Message *m) {
+    npins--;
+    assert(npins >= 0);
+    assert(pinset.count(m) > 0);
+    pinset.erase(pinset.find(m));   // drop a single pin held by m
+    assert(pinset.size() == (unsigned)npins);
+  }
+  // pinnable while sync; while unpinning only by a message already pinning
+  bool is_pinnable(Message *m) {
+    if (lockstate == DN_LOCK_SYNC)
+      return true;
+    return lockstate == DN_LOCK_UNPINNING && pinset.count(m) > 0;
+  }
+  bool is_pinned() { return npins > 0; }
+  int num_pins() { return npins; }
+
+  friend class CDir;
+};
+
+ostream& operator<<(ostream& out, CDentry& dn);
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+
+#include "MDS.h"
+#include "MDSMap.h"
+
+#include "include/Context.h"
+#include "common/Clock.h"
+
+#include <cassert>
+
+#include "config.h"
+// Debug output for dir code: printed when level x is at or below either
+// g_conf.debug or g_conf.debug_mds.  The level argument is parenthesized so
+// compound expressions compare as a whole.
+#undef dout
+#define dout(x) if ((x) <= g_conf.debug || (x) <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") "
+
+
+// PINS
+// global tally of outstanding pins per pin type (updated by CDir::get/put); starts at zero
+int cdir_pins[CDIR_NUM_PINS] = { 0 };
+
+// Printable name for each pin type, indexed by pin number.
+// The entries are string literals, so the element type is const char*;
+// binding literals to plain char* is deprecated and invites writes to
+// read-only storage.
+static const char* cdir_pin_names[CDIR_NUM_PINS] = {
+  "child",
+  "opened",
+  "waiter",
+  "import",
+  "export",
+  "freeze",
+  "proxy",
+  "authpin",
+  "imping",
+  "impex",
+  "hashed",
+  "hashing",
+  "dirty",
+  "reqpins"
+};
+
+
+// Print a one-line, bracketed description of a dir: ino and path, state
+// flags, authority, held pins, explicit dir_auth, state bits, sizes,
+// versions and the object's address.
+ostream& operator<<(ostream& out, CDir& dir)
+{
+  string path;
+  dir.get_inode()->make_path(path);
+  out << "[dir " << dir.ino() << " " << path << "/";
+
+  // flags
+  if (dir.is_dirty())  out << " dirty";
+  if (dir.is_import()) out << " import";
+  if (dir.is_export()) out << " export";
+  if (dir.is_rep())    out << " repl";
+  if (dir.is_hashed()) out << " hashed";  //=" << (int)dir.get_inode()->inode.hash_seed;
+
+  // authority
+  if (!dir.is_auth()) {
+    out << " rep@" << dir.authority();
+    if (dir.get_replica_nonce() > 1)
+      out << "." << dir.get_replica_nonce();
+  } else {
+    out << " auth";
+    if (dir.is_open_by_anyone())
+      out << "+" << dir.get_open_by();
+  }
+
+  // pins: by name where we have one, by number otherwise
+  if (dir.is_pinned()) {
+    out << " |";
+    const set<int>& pins = dir.get_ref_set();
+    for (set<int>::const_iterator p = pins.begin(); p != pins.end(); ++p) {
+      if (*p < CDIR_NUM_PINS)
+        out << " " << cdir_pin_names[*p];
+      else
+        out << " " << *p;
+    }
+  }
+
+  if (dir.get_dir_auth() != CDIR_AUTH_PARENT)
+    out << " dir_auth=" << dir.get_dir_auth();
+
+  out << " state=" << dir.get_state();
+  out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull();
+
+  out << " v=" << dir.get_version()
+      << " cv=" << dir.get_committing_version()
+      << " lastcv=" << dir.get_last_committed_version();
+
+  out << " " << &dir << "]";
+  return out;
+}
+
+
+// -------------------------------------------------------------------
+// CDir
+
+// Create the (empty) dir contents object for directory inode 'in'.
+// 'auth' says whether this mds is authoritative for the dir.
+CDir::CDir(CInode *in, MDS *mds, bool auth)
+{
+  assert(in->is_dir());
+
+  this->inode = in;
+  this->mds = mds;
+
+  // contents
+  nitems = 0;
+  nnull = 0;
+
+  // versions
+  version = 0;
+  committing_version = 0;
+  last_committed_version = 0;
+
+  // pins
+  ref = 0;
+  auth_pins = 0;
+  nested_auth_pins = 0;
+  request_pins = 0;
+
+  // state + auth
+  state = CDIR_STATE_INITIAL;
+  if (auth)
+    state |= CDIR_STATE_AUTH;
+  /*
+  if (in->dir_is_hashed()) {
+    assert(0);      // when does this happen?
+    state |= CDIR_STATE_HASHED;
+  }
+  */
+  dir_auth = -1;
+
+  dir_rep = CDIR_REP_NONE;
+}
+
+
+
+
+/***
+ * linking fun
+ */
+
+// Add a remote dentry called dname that refers to inode number ino.
+// The name must not already exist in this dir.  Returns the new dentry.
+CDentry* CDir::add_dentry( const string& dname, inodeno_t ino)
+{
+  // foreign
+  assert(lookup(dname) == 0);
+
+  // create dentry
+  CDentry *dentry = new CDentry(dname, ino);
+  dentry->parent_dir_version = version;
+  dentry->dir = this;
+
+  // add to dir
+  assert(items.count(dentry->name) == 0);
+  assert(null_items.count(dentry->name) == 0);
+
+  items[dentry->name] = dentry;
+  ++nitems;
+
+  dout(12) << "add_dentry " << *dentry << endl;
+
+  // first entry pins the dir
+  if (nnull + nitems == 1) {
+    get(CDIR_PIN_CHILD);
+  }
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());
+  return dentry;
+}
+
+
+// Add a dentry called dname: primary if 'in' is given, null otherwise.
+// The name must not already exist in this dir.  Returns the new dentry.
+CDentry* CDir::add_dentry( const string& dname, CInode *in )
+{
+  // primary
+  assert(lookup(dname) == 0);
+
+  // create dentry
+  CDentry *dentry = new CDentry(dname, in);
+  dentry->parent_dir_version = version;
+  dentry->dir = this;
+
+  // add to dir
+  assert(items.count(dentry->name) == 0);
+  assert(null_items.count(dentry->name) == 0);
+
+  items[dentry->name] = dentry;
+
+  if (!in) {
+    // null dentry: track it on the null list
+    assert(dentry->inode == 0);
+    null_items[dentry->name] = dentry;
+    ++nnull;
+  } else {
+    link_inode_work( dentry, in );
+  }
+
+  dout(12) << "add_dentry " << *dentry << endl;
+
+  // first entry pins the dir
+  if (nnull + nitems == 1) {
+    get(CDIR_PIN_CHILD);
+  }
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());
+  return dentry;
+}
+
+
+
+// Remove dn from this dir and delete it.
+//
+// Whether the dentry is linked is decided with is_null(), not by testing the
+// inode pointer: a remote dentry may have no inode attached yet and is still
+// a linked entry counted in nitems (unlink_inode_work() handles that case).
+// Testing dn->inode sent such dentries down the null path, where the
+// null_items assert fails and nitems is never decremented.
+void CDir::remove_dentry(CDentry *dn)
+{
+  dout(12) << "remove_dentry " << *dn << endl;
+
+  if (!dn->is_null()) {
+    // detach inode and dentry
+    unlink_inode_work(dn);
+  } else {
+    // remove from null list
+    assert(null_items.count(dn->name) == 1);
+    null_items.erase(dn->name);
+    nnull--;
+  }
+
+  // remove from list
+  assert(items.count(dn->name) == 1);
+  items.erase(dn->name);
+
+  delete dn;
+
+  // unpin?
+  if (nnull + nitems == 0) put(CDIR_PIN_CHILD);
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());
+}
+
+// Turn the null dentry dn into a remote dentry referring to inode number ino.
+void CDir::link_inode( CDentry *dn, inodeno_t ino)
+{
+  //dout(12) << "link_inode " << *dn << " remote " << ino << endl;
+
+  assert(dn->is_null());
+  dn->set_remote_ino(ino);
+  ++nitems;
+
+  // no longer null
+  assert(null_items.count(dn->name) == 1);
+  null_items.erase(dn->name);
+  --nnull;
+}
+
+// Turn the null dentry dn into a primary dentry for inode 'in'.
+void CDir::link_inode( CDentry *dn, CInode *in )
+{
+  assert(!dn->is_remote());
+
+  link_inode_work(dn, in);
+  //dout(12) << "link_inode " << *dn << " " << *in << endl;
+
+  // no longer null
+  assert(null_items.count(dn->name) == 1);
+  null_items.erase(dn->name);
+  --nnull;
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());
+}
+
+// Shared part of primary linking: wire dentry and inode together, count the
+// item, stamp the inode with the dir version, clear its dangling flag, carry
+// over the dirty pin and fold the inode's auth pins into this dir.
+void CDir::link_inode_work( CDentry *dn, CInode *in )
+{
+  // wire up both directions
+  dn->inode = in;
+  in->set_primary_parent(dn);
+
+  ++nitems;     // adjust dir size
+
+  // set dir version
+  in->parent_dir_version = get_version();
+
+  // clear dangling
+  in->state_clear(CINODE_STATE_DANGLING);
+
+  // dn dirty?  then the inode carries the dirty pin
+  if (dn->is_dirty()) {
+    in->get(CINODE_PIN_DNDIRTY);
+  }
+
+  // adjust auth pin count
+  if (in->auth_pins + in->nested_auth_pins) {
+    adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins );
+  }
+}
+
+// Unlink dn from its inode (primary) or inode number (remote), leaving a
+// null dentry on the null list.
+//
+// A remote dentry may have no inode attached, so the debug line only
+// dereferences dn->inode when it is set; previously it was dereferenced
+// unconditionally.
+void CDir::unlink_inode( CDentry *dn )
+{
+  if (dn->inode) {
+    dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl;
+  } else {
+    dout(12) << "unlink_inode " << *dn << " (no inode attached)" << endl;
+  }
+
+  unlink_inode_work(dn);
+
+  // add to null list
+  assert(null_items.count(dn->name) == 0);
+  null_items[dn->name] = dn;
+  nnull++;
+
+  assert(nnull + nitems == items.size());
+  assert(nnull == null_items.size());
+}
+
+// Shared part of unlinking.  Remote: detach any attached inode and clear the
+// remote ino.  Primary: pin down the inode's authority, remove its auth pins
+// from this dir, flag it dangling, drop the dirty pin and detach it.
+// Either way the dir has one item less.
+void CDir::unlink_inode_work( CDentry *dn )
+{
+  CInode *in = dn->inode;
+
+  if (!dn->is_remote()) {
+    // primary
+    assert(dn->is_primary());
+
+    // explicitly define auth
+    in->dangling_auth = in->authority();
+    //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl;
+
+    // unlink auth_pin count
+    if (in->auth_pins + in->nested_auth_pins) {
+      adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) );
+    }
+
+    // set dangling flag
+    in->state_set(CINODE_STATE_DANGLING);
+
+    // dn dirty?  then release the pin the inode carried for it
+    if (dn->is_dirty()) {
+      in->put(CINODE_PIN_DNDIRTY);
+    }
+
+    // detach inode
+    in->remove_primary_parent(dn);
+    dn->inode = 0;
+  } else {
+    // remote; the inode is optional
+    if (in) {
+      dn->unlink_remote();
+    }
+    dn->set_remote_ino(0);
+  }
+
+  --nitems;     // adjust dir size
+}
+
+// Remove (and delete) every null dentry in this dir.  Each must be in the
+// sync lock state.
+void CDir::remove_null_dentries() {
+  dout(12) << "remove_null_dentries " << *this << endl;
+
+  // snapshot first: remove_dentry() edits null_items
+  list<CDentry*> doomed;
+  for (CDir_map_t::iterator p = null_items.begin(); p != null_items.end(); ++p)
+    doomed.push_back(p->second);
+
+  while (!doomed.empty()) {
+    CDentry *dn = doomed.front();
+    doomed.pop_front();
+    assert(dn->is_sync());
+    remove_dentry(dn);
+  }
+
+  assert(null_items.empty());
+  assert(nnull == 0);
+  assert(nnull + nitems == items.size());
+}
+
+
+
+/****************************************
+ * WAITING
+ */
+
+// True if at least one dir-level waiter is registered under exactly this tag.
+bool CDir::waiting_for(int tag)
+{
+  return waiting.find(tag) != waiting.end();
+}
+
+// True if at least one waiter on dentry dn is registered under exactly this tag.
+bool CDir::waiting_for(int tag, const string& dn)
+{
+  // check presence first so the lookup below never creates an entry
+  return waiting_on_dentry.count(dn) &&
+         waiting_on_dentry[dn].count(tag) > 0;
+}
+
+// Register c to wait on the named dentry under 'tag'.  The first waiter of
+// any kind pins the dir.
+void CDir::add_waiter(int tag,
+                      const string& dentry,
+                      Context *c) {
+  const bool first_waiter = waiting.empty() && waiting_on_dentry.size() == 0;
+  if (first_waiter) {
+    get(CDIR_PIN_WAITER);
+  }
+
+  waiting_on_dentry[ dentry ].insert(multimap<int,Context*>::value_type(tag, c));
+  dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl;
+}
+
+// Register c to wait on this dir under 'tag'.  An ATFREEZEROOT waiter on a
+// freezing/frozen dir that is not itself the freeze root (or a freezing /
+// frozen dir) is passed up to the parent dir instead.  The first waiter of
+// any kind pins the dir.
+void CDir::add_waiter(int tag, Context *c) {
+  // hierarchical?
+  if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) {
+    const bool its_us =
+      is_freezing_tree_root() || is_frozen_tree_root() ||
+      is_freezing_dir() || is_frozen_dir();
+
+    if (!its_us) {
+      // pin parent!
+      dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl;
+      inode->parent->dir->add_waiter(tag, c);
+      return;
+    }
+    // it's us, pin here.
+  }
+
+  // this dir.
+  const bool first_waiter = waiting.empty() && waiting_on_dentry.size() == 0;
+  if (first_waiter) {
+    get(CDIR_PIN_WAITER);
+  }
+
+  waiting.insert(multimap<int,Context*>::value_type(tag, c));
+  dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
+}
+
+
+// Move waiters on 'dentry' whose tag intersects 'mask' onto ls.
+// If num is non-zero, at most num waiters are taken.  Drops the waiter pin
+// when no waiters of any kind remain.
+//
+// The dentry's entry is located with find() and erased through its iterator.
+// Compared with indexing by key this (a) never creates a throwaway entry for
+// a dentry nobody waits on and (b) stays safe when 'dentry' is a reference
+// to the key stored in the map, as it is when the mask-only take_waiting()
+// calls in here.
+void CDir::take_waiting(int mask,
+                        const string& dentry,
+                        list<Context*>& ls,
+                        int num)
+{
+  if (waiting_on_dentry.empty()) return;
+
+  hash_map<string, multimap<int,Context*> >::iterator p = waiting_on_dentry.find(dentry);
+  if (p == waiting_on_dentry.end()) return;   // nobody waits on this dentry
+
+  multimap<int,Context*>& waiters = p->second;
+  multimap<int,Context*>::iterator it = waiters.begin();
+  while (it != waiters.end()) {
+    if (it->first & mask) {
+      ls.push_back(it->second);
+      dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+      waiters.erase(it++);
+
+      // honor the optional limit
+      if (num) {
+        if (num == 1) break;
+        num--;
+      }
+    } else {
+      dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl;
+      it++;
+    }
+  }
+
+  // did we clear dentry?  (no use of 'dentry' past this point)
+  if (waiters.empty())
+    waiting_on_dentry.erase(p);
+
+  // ...whole map?
+  if (waiting_on_dentry.size() == 0 && waiting.empty())
+    put(CDIR_PIN_WAITER);
+}
+
+/* NOTE: this checks dentry waiters too */
+// Move every waiter whose tag intersects 'mask' onto ls: first the waiters
+// of each dentry, then the dir-level waiters.
+void CDir::take_waiting(int mask,
+                        list<Context*>& ls)
+{
+  // try each dentry.  the iterator is advanced before the call because the
+  // call may erase the entry it was pointing at.
+  hash_map<string, multimap<int,Context*> >::iterator dit = waiting_on_dentry.begin();
+  while (dit != waiting_on_dentry.end()) {
+    const string& dname = dit->first;
+    ++dit;
+    take_waiting(mask, dname, ls);
+  }
+
+  // waiting
+  if (waiting.empty())
+    return;
+
+  multimap<int,Context*>::iterator wit = waiting.begin();
+  while (wit != waiting.end()) {
+    if (!(wit->first & mask)) {
+      dout(10) << "take_waiting mask " << mask << " SKIPPING " << wit->second << " tag " << wit->first << " on " << *this<< endl;
+      wit++;
+      continue;
+    }
+
+    ls.push_back(wit->second);
+    dout(10) << "take_waiting mask " << mask << " took " << wit->second << " tag " << wit->first << " on " << *this << endl;
+    waiting.erase(wit++);
+  }
+
+  // nobody left at all?  release the waiter pin
+  if (waiting_on_dentry.size() == 0 && waiting.empty())
+    put(CDIR_PIN_WAITER);
+}
+
+
+// Take all waiters matching mask (dir-level and per-dentry) and finish them
+// with 'result'.
+void CDir::finish_waiting(int mask, int result)
+{
+  dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl;
+
+  list<Context*> ready;
+  take_waiting(mask, ready);
+  finish_contexts(ready, result);
+}
+
+// Take the waiters on dentry dn that match mask and finish them with 'result'.
+void CDir::finish_waiting(int mask, const string& dn, int result)
+{
+  dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl;
+
+  list<Context*> ready;
+  take_waiting(mask, dn, ready);
+  finish_contexts(ready, result);
+}
+
+
+// dirty/clean
+
+// Mark the dir dirty.  A clean dir gets a new version and the dirty pin.
+// A dir that is already dirty only gets a new version if the current one is
+// the version being committed right now.
+void CDir::mark_dirty()
+{
+  // clean -> dirty
+  if (!state_test(CDIR_STATE_DIRTY)) {
+    version++;
+    state_set(CDIR_STATE_DIRTY);
+    dout(10) << "mark_dirty (was clean) " << *this << " new version " << version << endl;
+    get(CDIR_PIN_DIRTY);
+    return;
+  }
+
+  // already dirty
+  if (state_test(CDIR_STATE_COMMITTING) && committing_version == version) {
+    version++;  // now dirtier than committing version!
+    dout(10) << "mark_dirty (committing) " << *this << " new version " << version << "/" << committing_version << endl;
+  } else {
+    dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl;
+  }
+}
+
+// Clear the dirty flag and release the dirty pin; no-op if already clean.
+void CDir::mark_clean()
+{
+  dout(10) << "mark_clean " << *this << " version " << version << endl;
+
+  if (!state_test(CDIR_STATE_DIRTY))
+    return;
+
+  state_clear(CDIR_STATE_DIRTY);
+  put(CDIR_PIN_DIRTY);
+}
+
+
+
+// ref counts
+
+// Release the pin of type 'by'.  The pin must currently be held.  When the
+// last pin goes away the dir's pin on its inode is released too.
+// NOTE(review): 'by' indexes cdir_pins and cdir_pin_names without a range
+// check -- confirm all pin types are below CDIR_NUM_PINS.
+void CDir::put(int by) {
+  --cdir_pins[by];
+
+  // bad?
+  const bool bad = (ref == 0) || (ref_set.count(by) != 1);
+  if (bad) {
+    dout(7) << *this << " bad put by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+    assert(ref_set.count(by) == 1);
+    assert(ref > 0);
+  }
+
+  ref_set.erase(by);
+  --ref;
+
+  // inode
+  if (ref == 0) {
+    inode->put(CINODE_PIN_DIR);
+  }
+
+  dout(7) << *this << " put by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+}
+
+// Take the pin identified by `by`.
+// The first pin on the dir also pins the inode with CINODE_PIN_DIR.
+// Taking a pin id that is already held trips the assert below.
+void CDir::get(int by) {
+  ++cdir_pins[by];  // global per-pin-type counter
+
+  // inode
+  if (!ref)
+    inode->get(CINODE_PIN_DIR);
+
+  // bad?
+  if (ref_set.count(by) != 0) {
+    dout(7) << *this << " bad get by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+    assert(ref_set.count(by) == 0);
+  }
+
+  ++ref;
+  ref_set.insert(by);
+
+  dout(7) << *this << " get by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+}
+
+
+
+/********************************
+ * AUTHORITY
+ */
+
+/*
+ * simple rule: if dir_auth isn't explicit, auth is the same as the inode.
+ */
+// Authority for this directory: an explicit dir_auth (>= 0) wins,
+// otherwise the answer is whatever the inode reports.
+// (An older variant that walked up through the parent dir was disabled
+// in favour of asking the inode directly.)
+int CDir::authority()
+{
+  const int explicit_auth = get_dir_auth();
+  if (explicit_auth < 0)
+    return inode->authority();
+  return explicit_auth;
+}
+
+// Authority for the dentry named `dn` inside this directory.
+// Checked in order:
+//  1. mid-hashing with a non-empty hashed_subset: the hash target, but
+//     only if that node is in the subset;
+//  2. fully hashed: the hash target;
+//  3. dir_auth == CDIR_AUTH_PARENT: same as the inode;
+//  4. otherwise the explicit dir_auth.
+int CDir::dentry_authority(const string& dn )
+{
+  // hashing -- subset of nodes have hashed the contents
+  const bool partially_hashed = is_hashing() && !hashed_subset.empty();
+  if (partially_hashed) {
+    const int candidate = mds->hash_dentry( inode->ino(), dn );
+    if (hashed_subset.count(candidate) > 0)
+      return candidate;
+  }
+
+  // hashed
+  if (is_hashed())
+    return mds->hash_dentry( inode->ino(), dn );
+
+  const int auth = get_dir_auth();
+  // CDIR_AUTH_PARENT: same as my inode; anything else is explicit for
+  // this whole dir.
+  return (auth == CDIR_AUTH_PARENT) ? inode->authority() : auth;
+}
+
+// Replace dir_auth with `d`, logging the old and new values first.
+void CDir::set_dir_auth(int d)
+{
+  const int previous = dir_auth;
+  dout(10) << "setting dir_auth=" << d << " from " << previous << " on " << *this << endl;
+  dir_auth = d;
+}
+
+
+/*****************************************
+ * AUTH PINS
+ */
+
+// Add one auth pin.  The first pin also takes CDIR_PIN_AUTHPIN.  The
+// inode's nested count is bumped, and the increment is propagated to the
+// dir containing the inode (and onward) via adjust_nested_auth_pins.
+void CDir::auth_pin() {
+  const bool first_pin = (auth_pins == 0);
+  if (first_pin)
+    get(CDIR_PIN_AUTHPIN);
+  ++auth_pins;
+
+  dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  ++inode->nested_auth_pins;
+  if (inode->parent)
+    inode->parent->dir->adjust_nested_auth_pins( 1 );
+}
+
+// Remove one auth pin.  Dropping the last one releases CDIR_PIN_AUTHPIN.
+// If no auth pins and no nested auth pins remain, FREEZEABLE waiters are
+// kicked -- note this happens before the inode/ancestor nested counts
+// are decremented.
+void CDir::auth_unpin() {
+  --auth_pins;
+  if (!auth_pins)
+    put(CDIR_PIN_AUTHPIN);
+
+  dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+  assert(auth_pins >= 0);
+
+  // pending freeze?
+  const int remaining = auth_pins + nested_auth_pins;
+  if (remaining == 0)
+    on_freezeable();
+
+  --inode->nested_auth_pins;
+  if (inode->parent)
+    inode->parent->dir->adjust_nested_auth_pins( -1 );
+}
+
+// Apply `inc` to nested_auth_pins on this dir and on every ancestor dir,
+// and to each of their inodes, walking up through inode->parent until an
+// inode without a parent is reached.  Any dir whose combined pin count
+// drops to zero has its FREEZEABLE waiters kicked along the way.
+void CDir::adjust_nested_auth_pins(int inc)
+{
+  for (CDir *dir = this; ; dir = dir->inode->parent->dir) {
+    // dir
+    dir->nested_auth_pins += inc;
+
+    dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl;
+    assert(dir->nested_auth_pins >= 0);
+
+    // pending freeze?
+    if (dir->auth_pins + dir->nested_auth_pins == 0)
+      dir->on_freezeable();
+
+    // its inode
+    dir->inode->nested_auth_pins += inc;
+
+    if (!dir->inode->parent)
+      break;  // no containing dentry: top of the chain
+  }
+}
+
+
+
+/*****************************************************************************
+ * FREEZING
+ */
+
+// Kick everything waiting for this dir to become freezeable.
+//
+// NOTE: the first waiter will likely freeze the dir and unmark FREEZING;
+// additional ones will re-flag FREEZING.  This isn't particularly
+// graceful, and might cause problems if the first one needs to know
+// about other waiters....  FIXME?
+void CDir::on_freezeable()
+{
+  const int wake_mask = CDIR_WAIT_FREEZEABLE;
+  finish_waiting(wake_mask);
+}
+
+// FREEZE TREE
+
+// Waiter context used by freeze_tree: when finished it re-enters
+// CDir::freeze_tree_finish on `dir`, handing over the caller's context.
+// The result code passed to finish() is ignored.
+class C_MDS_FreezeTree : public Context {
+  CDir *dir;
+  Context *con;
+public:
+  C_MDS_FreezeTree(CDir *dir, Context *c) : dir(dir), con(c) {}
+
+  virtual void finish(int r) {
+    dir->freeze_tree_finish(con);
+  }
+};
+
+// Freeze the subtree rooted at this dir.
+//
+// Must not be called while the dir is already frozen or freezing.
+// If the dir is freezeable right now (no auth pins, no nested auth pins)
+// it is marked FROZENTREE at once, the inode is auth-pinned for the
+// duration of the freeze, and `c` is finished with 0 and deleted.
+// Otherwise the dir is marked FREEZINGTREE and a waiter is queued on
+// CDIR_WAIT_FREEZEABLE that retries through freeze_tree_finish().
+//
+// `c` may be null: freeze_tree_finish() already tolerates a null
+// context, so the immediate path checks for it too instead of
+// dereferencing unconditionally.
+void CDir::freeze_tree(Context *c)
+{
+  assert(!is_frozen());
+  assert(!is_freezing());
+
+  if (is_freezeable()) {
+    dout(10) << "freeze_tree " << *this << endl;
+
+    state_set(CDIR_STATE_FROZENTREE);
+    inode->auth_pin();  // auth_pin for duration of freeze
+
+    // easy, we're frozen
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+
+  } else {
+    state_set(CDIR_STATE_FREEZINGTREE);
+    dout(10) << "freeze_tree + wait " << *this << endl;
+
+    // need to wait for auth pins to expire
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
+  }
+}
+
+// Second half of freeze_tree, run when FREEZEABLE waiters are kicked.
+// If the dir is freezeable now, it becomes FROZENTREE, the inode is
+// auth-pinned for the duration of the freeze, and `c` (if any) is
+// finished with 0 and deleted.  If not, FREEZINGTREE is re-asserted and
+// a new waiter is queued.
+void CDir::freeze_tree_finish(Context *c)
+{
+  if (is_freezeable()) {
+    dout(10) << "freeze_tree_finish " << *this << endl;
+    state_set(CDIR_STATE_FROZENTREE);
+    state_clear(CDIR_STATE_FREEZINGTREE);  // actually, this may get set again by next context?
+
+    inode->auth_pin();  // auth_pin for duration of freeze
+
+    // continue to frozen land
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+    return;
+  }
+
+  // not freezeable yet: wait again!
+  dout(10) << "freeze_tree_finish still waiting " << *this << endl;
+  state_set(CDIR_STATE_FREEZINGTREE);
+  add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c));
+}
+
+// Undo freeze_tree: clear FROZENTREE, release the inode auth pin taken
+// for the freeze, then wake everything waiting on UNFREEZE.
+void CDir::unfreeze_tree()
+{
+  dout(10) << "unfreeze_tree " << *this << endl;
+  state_clear(CDIR_STATE_FROZENTREE);
+
+  // Unpinning may itself make something FREEZEABLE.
+  // FIXME: is this order (unpin first, then wake waiters) good?
+  inode->auth_unpin();
+
+  const int wake_mask = CDIR_WAIT_UNFREEZE;
+  finish_waiting(wake_mask);
+}
+
+// True if this dir or an ancestor dir is the root of a tree that is in
+// the process of freezing.  The upward walk stops (answering false) at
+// an import, at a hashed dir, or when an inode has no parent dentry.
+bool CDir::is_freezing_tree()
+{
+  for (CDir *cur = this; ; cur = cur->inode->parent->dir) {
+    if (cur->is_freezing_tree_root())
+      return true;
+    if (cur->is_import() || cur->is_hashed())
+      return false;
+    if (!cur->inode->parent)
+      return false;  // root on replica
+  }
+}
+
+// True if this dir or an ancestor dir is the root of a frozen tree.
+// The upward walk stops (answering false) at an import, at a hashed dir,
+// or when an inode has no parent dentry.
+bool CDir::is_frozen_tree()
+{
+  for (CDir *cur = this; ; cur = cur->inode->parent->dir) {
+    if (cur->is_frozen_tree_root())
+      return true;
+    if (cur->is_import() || cur->is_hashed())
+      return false;
+    if (!cur->inode->parent)
+      return false;  // root on replica
+  }
+}
+
+
+
+// FREEZE DIR
+
+// Waiter context used by freeze_dir: when finished it re-enters
+// CDir::freeze_dir_finish on `dir`, handing over the caller's context.
+// The result code passed to finish() is ignored.
+class C_MDS_FreezeDir : public Context {
+  CDir *dir;
+  Context *con;
+public:
+  C_MDS_FreezeDir(CDir *dir, Context *c) : dir(dir), con(c) {}
+
+  virtual void finish(int r) {
+    dir->freeze_dir_finish(con);
+  }
+};
+
+// Freeze just this directory (not the subtree).
+//
+// Must not be called while the dir is already frozen or freezing.
+// If the dir has no auth pins of its own it is marked FROZENDIR at once,
+// the inode is auth-pinned for the duration of the freeze, and `c` is
+// finished with 0 and deleted.  Otherwise the dir is marked FREEZINGDIR
+// and a waiter is queued on CDIR_WAIT_FREEZEABLE that retries through
+// freeze_dir_finish().
+//
+// `c` may be null: freeze_dir_finish() already tolerates a null context,
+// so the immediate path checks for it too instead of dereferencing
+// unconditionally.
+void CDir::freeze_dir(Context *c)
+{
+  assert(!is_frozen());
+  assert(!is_freezing());
+
+  if (is_freezeable_dir()) {
+    dout(10) << "freeze_dir " << *this << endl;
+
+    state_set(CDIR_STATE_FROZENDIR);
+    inode->auth_pin();  // auth_pin for duration of freeze
+
+    // easy, we're frozen
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+
+  } else {
+    state_set(CDIR_STATE_FREEZINGDIR);
+    dout(10) << "freeze_dir + wait " << *this << endl;
+
+    // need to wait for auth pins to expire
+    add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+  }
+}
+
+// Second half of freeze_dir, run when FREEZEABLE waiters are kicked.
+// If the dir has no auth pins now, it becomes FROZENDIR, the inode is
+// auth-pinned for the duration of the freeze, and `c` (if any) is
+// finished with 0 and deleted.  If not, FREEZINGDIR is re-asserted and a
+// new waiter is queued.
+void CDir::freeze_dir_finish(Context *c)
+{
+  if (is_freezeable_dir()) {
+    dout(10) << "freeze_dir_finish " << *this << endl;
+    state_set(CDIR_STATE_FROZENDIR);
+    state_clear(CDIR_STATE_FREEZINGDIR);  // actually, this may get set again by next context?
+
+    inode->auth_pin();  // auth_pin for duration of freeze
+
+    // continue to frozen land
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+    return;
+  }
+
+  // not freezeable yet: wait again!
+  dout(10) << "freeze_dir_finish still waiting " << *this << endl;
+  state_set(CDIR_STATE_FREEZINGDIR);
+  add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c));
+}
+
+// Undo freeze_dir: clear FROZENDIR, release the inode auth pin taken for
+// the freeze, then wake everything waiting on UNFREEZE.
+void CDir::unfreeze_dir()
+{
+  dout(10) << "unfreeze_dir " << *this << endl;
+  state_clear(CDIR_STATE_FROZENDIR);
+
+  // Unpinning may itself make something FREEZEABLE.
+  // FIXME: is this order (unpin first, then wake waiters) good?
+  inode->auth_unpin();
+
+  const int wake_mask = CDIR_WAIT_UNFREEZE;
+  finish_waiting(wake_mask);
+}
+
+
+
+
+
+
+
+
+
+// -----------------------------------------------------------------
+// debug shite
+
+
+// Debug helper: log this dir and, recursively, every dentry in it.
+//
+// `depth` is the nesting level; it becomes that many tab characters of
+// indentation in the log lines.  Dentries with an inode recurse into
+// CInode::dump(depth+1); dentries without one are printed as "[null]".
+// Trailing markers note an incomplete ("...") or dirty dir.
+//
+// Changes from the previous version: the never-read local `isdir` is
+// gone, and the bodies of the trailing ifs are braced because dout()
+// expands to an `if` of its own.
+void CDir::dump(int depth) {
+  string ind(depth, '\t');
+
+  dout(10) << "dump:" << ind << *this << endl;
+
+  for (map<string,CDentry*>::iterator iter = items.begin();
+       iter != items.end();
+       ++iter) {
+    CDentry* d = iter->second;
+    if (d->inode) {
+      dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl;
+      d->inode->dump(depth+1);
+    } else {
+      dout(10) << "dump: " << ind << *d << " = [null]" << endl;
+    }
+  }
+
+  if (!(state_test(CDIR_STATE_COMPLETE))) {
+    dout(10) << ind << "..." << endl;
+  }
+  if (state_test(CDIR_STATE_DIRTY)) {
+    dout(10) << ind << "[dirty]" << endl;
+  }
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CDIR_H
+#define __CDIR_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "config.h"
+#include "common/DecayCounter.h"
+
+#include <iostream>
+#include <cassert>
+
+#include <list>
+#include <set>
+#include <map>
+#include <string>
+using namespace std;
+
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+
+
+#include "CInode.h"
+
+class CDentry;
+class MDS;
+class MDCluster;
+class Context;
+
+
+// directory authority types (values stored in CDir::dir_auth)
+// >= 0 is the auth mds
+#define CDIR_AUTH_PARENT -1 // default: no explicit auth; CDir::authority() defers to the inode
+
+
+#define CDIR_NONCE_EXPORT 1 // NOTE(review): presumably the nonce value used across an export -- confirm at call sites
+
+
+// state bits (OR-ed together in CDir::state; see state_set/state_clear/state_test)
+#define CDIR_STATE_AUTH (1<<0) // auth for this dir (hashing doesn't count)
+#define CDIR_STATE_PROXY (1<<1) // proxy auth
+
+#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache
+#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit
+
+#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports)
+#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing
+#define CDIR_STATE_FROZENDIR (1<<6) // this dir alone is frozen (set by freeze_dir)
+#define CDIR_STATE_FREEZINGDIR (1<<7) // freeze_dir is waiting for auth pins to drain
+
+#define CDIR_STATE_COMMITTING (1<<8) // mid-commit
+#define CDIR_STATE_FETCHING (1<<9) // currently fetching
+
+#define CDIR_STATE_DELETED (1<<10)
+
+#define CDIR_STATE_IMPORT (1<<11) // flag set if this is an import.
+#define CDIR_STATE_EXPORT (1<<12)
+#define CDIR_STATE_IMPORTINGEXPORT (1<<13)
+
+#define CDIR_STATE_HASHED (1<<14) // if hashed
+#define CDIR_STATE_HASHING (1<<15) // hashing in progress (see CDir::hashed_subset)
+#define CDIR_STATE_UNHASHING (1<<16)
+
+
+
+
+
+// these state bits are preserved by an import/export
+// ...except if the directory is hashed, in which case none of them are!
+// (CDirExport::update_dir keeps IMPORT_KEPT from the local state and takes EXPORTED from the sender.)
+#define CDIR_MASK_STATE_EXPORTED (CDIR_STATE_COMPLETE\
+ |CDIR_STATE_DIRTY)
+#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\
+ |CDIR_STATE_EXPORT\
+ |CDIR_STATE_IMPORTINGEXPORT)
+#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\
+ |CDIR_STATE_FROZENTREE\
+ |CDIR_STATE_FROZENDIR\
+ |CDIR_STATE_EXPORT\
+ |CDIR_STATE_PROXY)
+
+// common states
+#define CDIR_STATE_CLEAN 0 // no state bits set
+#define CDIR_STATE_INITIAL 0 // no state bits set
+
+// directory replication (values of CDir::dir_rep)
+#define CDIR_REP_ALL 1
+#define CDIR_REP_NONE 0 // CDir::is_rep() is false only for this value
+#define CDIR_REP_LIST 2 // CDir::dir_rep_by applies in this mode
+
+
+
+// pins: ids passed to CDir::get()/put(); each id is held at most once (tracked in ref_set)
+
+#define CDIR_PIN_CHILD 0
+#define CDIR_PIN_OPENED 1 // open by another node
+#define CDIR_PIN_WAITER 2 // waiter(s)
+
+#define CDIR_PIN_IMPORT 3
+#define CDIR_PIN_EXPORT 4
+#define CDIR_PIN_FREEZE 5
+#define CDIR_PIN_PROXY 6 // auth just changed.
+
+#define CDIR_PIN_AUTHPIN 7 // held while auth_pins > 0 (auth_pin/auth_unpin)
+
+#define CDIR_PIN_IMPORTING 8
+#define CDIR_PIN_IMPORTINGEXPORT 9
+
+#define CDIR_PIN_HASHED 10
+#define CDIR_PIN_HASHING 11
+#define CDIR_PIN_DIRTY 12 // held while CDIR_STATE_DIRTY is set (mark_dirty/mark_clean)
+
+#define CDIR_PIN_REQUEST 13 // held while request_pins > 0
+
+#define CDIR_NUM_PINS 14 // number of pin ids; sizes the cdir_pins[] counter array
+
+
+
+// wait reasons (bit tags used with add_waiter/take_waiting/finish_waiting)
+#define CDIR_WAIT_DENTRY 1 // wait for item to be in cache
+ // waiters: path_traverse
+ // trigger: handle_discover, fetch_dir_2
+#define CDIR_WAIT_COMPLETE 2 // wait for complete dir contents
+ // waiters: fetch_dir, commit_dir
+ // trigger: fetch_dir_2
+#define CDIR_WAIT_FREEZEABLE 4 // hard_pins removed
+ // waiters: freeze, freeze_finish
+ // trigger: auth_unpin, adjust_nested_auth_pins
+#define CDIR_WAIT_UNFREEZE 8 // unfreeze
+ // waiters: path_traverse, handle_discover, handle_inode_update,
+ // export_dir_frozen (mdcache)
+ // handle_client_readdir (mds)
+ // trigger: unfreeze
+#define CDIR_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE
+ // waiters: commit_dir (mdstore)
+ // trigger: (see CDIR_WAIT_UNFREEZE)
+#define CDIR_WAIT_COMMITTED 32 // did commit (who uses this?**)
+ // waiters: commit_dir (if already committing)
+ // trigger: commit_dir_2
+#define CDIR_WAIT_IMPORTED 64 // import finish
+ // waiters: import_dir_block
+ // triggers: handle_export_dir_finish
+
+#define CDIR_WAIT_EXPORTWARNING 8192 // on bystander.
+ // waiters: handle_export_dir_notify
+ // triggers: handle_export_dir_warning
+#define CDIR_WAIT_EXPORTPREPACK 16384
+ // waiter: export_dir
+ // trigger: handle_export_dir_prep_ack
+
+#define CDIR_WAIT_HASHED (1<<17) // hash finish
+#define CDIR_WAIT_THISHASHEDREADDIR (1<<18) // current readdir lock
+#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19) // after current readdir lock finishes
+
+#define CDIR_WAIT_DNREAD (1<<20)
+#define CDIR_WAIT_DNLOCK (1<<21)
+#define CDIR_WAIT_DNUNPINNED (1<<22)
+#define CDIR_WAIT_DNPINNABLE (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED)
+
+#define CDIR_WAIT_DNREQXLOCK (1<<23)
+
+#define CDIR_WAIT_ANY (0xffffffff) // every wait bit
+
+#define CDIR_WAIT_ATFREEZEROOT (CDIR_WAIT_AUTHPINNABLE|\
+ CDIR_WAIT_UNFREEZE) // hmm, same same
+
+
+ostream& operator<<(ostream& out, class CDir& dir); // stream printer for a CDir (used by the dout logging)
+
+
+// CDir
+typedef map<string, CDentry*> CDir_map_t; // dentry name -> dentry
+
+
+extern int cdir_pins[CDIR_NUM_PINS]; // counter per pin id, adjusted by every CDir::get()/put()
+
+
+// In-memory state for one directory's contents, attached to a CInode:
+// the dentry map, dirty/commit versions, authority and replica tracking,
+// pin-based reference counting, waiter lists, and freeze state.
+class CDir {
+ public:
+  CInode *inode;  // inode this directory hangs off
+
+ protected:
+  // contents
+  CDir_map_t items;       // non-null AND null
+  CDir_map_t null_items;  // null and foreign
+  size_t nitems;          // non-null
+  size_t nnull;           // null
+  //size_t nauthitems;
+  //size_t namesize;
+
+  // state
+  unsigned state;                   // CDIR_STATE_* bits
+  version_t version;
+  version_t committing_version;
+  version_t last_committed_version;
+
+  // authority, replicas
+  set<int> open_by;            // nodes that have me open
+  map<int,int> open_by_nonce;  // node -> nonce last issued to it
+  int replica_nonce;
+  int dir_auth;                // >= 0: explicit auth mds; CDIR_AUTH_PARENT otherwise
+
+  // reference counting/pins
+  int ref;           // reference count
+  set<int> ref_set;  // pin ids currently held
+
+  // lock nesting, freeze
+  int auth_pins;
+  int nested_auth_pins;
+  int request_pins;
+
+  // hashed dirs
+  set<int> hashed_subset;  // HASHING: subset of mds's that are hashed
+ public:
+  // for class MDS
+  map<int, pair< list<class InodeStat*>, list<string> > > hashed_readdir;
+ protected:
+
+  // context
+  MDS *mds;
+
+
+  // waiters
+  multimap<int, Context*> waiting;  // tag -> context
+  hash_map< string, multimap<int, Context*> >
+    waiting_on_dentry;
+
+  // cache control (defined for authority; hints for replicas)
+  int dir_rep;
+  set<int> dir_rep_by;  // if dir_rep == CDIR_REP_LIST
+
+  // popularity
+  meta_load_t popularity[MDS_NPOP];
+
+  // friends
+  friend class Migrator;
+  friend class CInode;
+  friend class MDCache;
+  friend class MDiscover;
+  friend class MDBalancer;
+
+  friend class CDirDiscover;
+  friend class CDirExport;
+
+ public:
+  CDir(CInode *in, MDS *mds, bool auth);
+
+
+
+  // -- accessors --
+  inodeno_t ino() { return inode->ino(); }
+  CInode *get_inode() { return inode; }
+  CDir *get_parent_dir() { return inode->get_parent_dir(); }
+
+  CDir_map_t::iterator begin() { return items.begin(); }
+  CDir_map_t::iterator end() { return items.end(); }
+  size_t get_size() {
+
+    //if ( is_auth() && !is_hashed()) assert(nauthitems == nitems);
+    //if (!is_auth() && !is_hashed()) assert(nauthitems == 0);
+
+    return nitems;
+  }
+  size_t get_nitems() { return nitems; }
+  size_t get_nnull() { return nnull; }
+  /*
+  size_t get_auth_size() {
+    assert(nauthitems <= nitems);
+    return nauthitems;
+  }
+  */
+
+  /*
+  float get_popularity() {
+    return popularity[0].get();
+  }
+  */
+
+
+  // -- dentries and inodes --
+ public:
+  // Find the dentry named `n`; returns 0 when there is none.
+  CDentry* lookup(const string& n) {
+    map<string,CDentry*>::iterator iter = items.find(n);
+    if (iter == items.end())
+      return 0;
+    else
+      return iter->second;
+  }
+
+  CDentry* add_dentry( const string& dname, CInode *in=0 );
+  CDentry* add_dentry( const string& dname, inodeno_t ino );
+  void remove_dentry( CDentry *dn );  // delete dentry
+  void link_inode( CDentry *dn, inodeno_t ino );
+  void link_inode( CDentry *dn, CInode *in );
+  void unlink_inode( CDentry *dn );
+ private:
+  void link_inode_work( CDentry *dn, CInode *in );
+  void unlink_inode_work( CDentry *dn );
+
+  void remove_null_dentries();  // on empty, clean dir
+
+  // -- authority --
+ public:
+  int authority();
+  int dentry_authority(const string& d);
+  int get_dir_auth() { return dir_auth; }
+  void set_dir_auth(int d);
+
+  bool is_open_by_anyone() { return !open_by.empty(); }
+  bool is_open_by(int mds) { return open_by.count(mds); }
+  // Nonce last issued to `mds`.  The node must have an entry in
+  // open_by_nonce: dereferencing end() would be undefined behaviour,
+  // so a missing entry now trips an assert instead.
+  int get_open_by_nonce(int mds) {
+    map<int,int>::iterator it = open_by_nonce.find(mds);
+    assert(it != open_by_nonce.end());
+    return it->second;
+  }
+  set<int>::iterator open_by_begin() { return open_by.begin(); }
+  set<int>::iterator open_by_end() { return open_by.end(); }
+  set<int>& get_open_by() { return open_by; }
+
+  int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
+
+  // Record that `mds` has this dir open and return the nonce issued to
+  // it: 1 for a node not previously in open_by, previous nonce + 1 for a
+  // node that already was.  The first opener takes CDIR_PIN_OPENED.
+  int open_by_add(int mds) {
+    int nonce = 1;
+
+    if (is_open_by(mds)) {    // already had it?
+      nonce = get_open_by_nonce(mds) + 1;   // new nonce (+1)
+      dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
+      open_by_nonce.erase(mds);
+    } else {
+      if (open_by.empty())
+        get(CDIR_PIN_OPENED);
+      open_by.insert(mds);
+    }
+    open_by_nonce.insert(pair<int,int>(mds,nonce));   // first! serial of 1.
+    return nonce;   // default nonce
+  }
+  // Forget that `mds` has this dir open (it must be in open_by); the
+  // last opener leaving releases CDIR_PIN_OPENED.
+  void open_by_remove(int mds) {
+    //if (!is_open_by(mds)) return;
+    assert(is_open_by(mds));
+
+    open_by.erase(mds);
+    open_by_nonce.erase(mds);
+    if (open_by.empty())
+      put(CDIR_PIN_OPENED);
+  }
+  // Forget all openers, releasing CDIR_PIN_OPENED if there were any.
+  void open_by_clear() {
+    if (!open_by.empty())
+      put(CDIR_PIN_OPENED);
+    open_by.clear();
+    open_by_nonce.clear();
+  }
+
+
+
+  // for giving to clients
+  // When the current-domain inode-read popularity exceeds
+  // g_conf.mds_bal_replicate_threshold, `ls` becomes the open_by set,
+  // plus `auth` if that set is non-empty.  Otherwise `ls` is untouched.
+  void get_dist_spec(set<int>& ls, int auth) {
+    if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) {
+      //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
+      ls = open_by;
+      if (!ls.empty()) ls.insert(auth);
+    }
+  }
+
+
+  // -- state --
+  unsigned get_state() { return state; }
+  void reset_state(unsigned s) {
+    state = s;
+    dout(10) << " cdir:" << *this << " state reset" << endl;
+  }
+  void state_clear(unsigned mask) {
+    state &= ~mask;
+    dout(10) << " cdir:" << *this << " state -" << mask << " = " << state << endl;
+  }
+  void state_set(unsigned mask) {
+    state |= mask;
+    dout(10) << " cdir:" << *this << " state +" << mask << " = " << state << endl;
+  }
+  unsigned state_test(unsigned mask) { return state & mask; }
+
+  bool is_complete() { return state & CDIR_STATE_COMPLETE; }
+  bool is_dirty() { return state_test(CDIR_STATE_DIRTY); }
+
+  bool is_auth() { return state & CDIR_STATE_AUTH; }
+  bool is_proxy() { return state & CDIR_STATE_PROXY; }
+  bool is_import() { return state & CDIR_STATE_IMPORT; }
+  bool is_export() { return state & CDIR_STATE_EXPORT; }
+
+  bool is_hashed() { return state & CDIR_STATE_HASHED; }
+  bool is_hashing() { return state & CDIR_STATE_HASHING; }
+  bool is_unhashing() { return state & CDIR_STATE_UNHASHING; }
+
+  bool is_rep() {
+    if (dir_rep == CDIR_REP_NONE) return false;
+    return true;
+  }
+
+
+
+  // -- dirtyness --
+  version_t get_version() { return version; }
+  // Raise version to at least `ge`; never lowers it.
+  void float_version(version_t ge) {
+    if (version < ge)
+      version = ge;
+  }
+  void set_version(version_t v) { version = v; }
+
+  version_t get_committing_version() { return committing_version; }
+  version_t get_last_committed_version() { return last_committed_version; }
+  // as in, we're committing the current version.
+  void set_committing_version() { committing_version = version; }
+  void set_last_committed_version(version_t v) { last_committed_version = v; }
+  void mark_dirty();
+  void mark_clean();
+  void mark_complete() { state_set(CDIR_STATE_COMPLETE); }
+  bool is_clean() { return !state_test(CDIR_STATE_DIRTY); }
+
+
+
+
+  // -- reference counting --
+  void put(int by);
+  void get(int by);
+  bool is_pinned_by(int by) {
+    return ref_set.count(by);
+  }
+  bool is_pinned() { return ref > 0; }
+  int get_ref() { return ref; }
+  set<int>& get_ref_set() { return ref_set; }
+  // request pins are counted; only the 0 <-> 1 transitions touch the
+  // CDIR_PIN_REQUEST pin.
+  void request_pin_get() {
+    if (request_pins == 0) get(CDIR_PIN_REQUEST);
+    request_pins++;
+  }
+  void request_pin_put() {
+    request_pins--;
+    if (request_pins == 0) put(CDIR_PIN_REQUEST);
+  }
+
+
+  // -- waiters --
+  bool waiting_for(int tag);
+  bool waiting_for(int tag, const string& dn);
+  void add_waiter(int tag, Context *c);
+  void add_waiter(int tag,
+                  const string& dentry,
+                  Context *c);
+  void take_waiting(int mask, list<Context*>& ls);  // includes dentry waiters
+  void take_waiting(int mask,
+                    const string& dentry,
+                    list<Context*>& ls,
+                    int num=0);
+  void finish_waiting(int mask, int result = 0);  // ditto
+  void finish_waiting(int mask, const string& dn, int result = 0);  // ditto
+
+
+  // -- auth pins --
+  bool can_auth_pin() { return !(is_frozen() || is_freezing()); }
+  int is_auth_pinned() { return auth_pins; }
+  void auth_pin();
+  void auth_unpin();
+  void adjust_nested_auth_pins(int inc);
+  void on_freezeable();
+
+  // -- freezing --
+  void freeze_tree(Context *c);
+  void freeze_tree_finish(Context *c);
+  void unfreeze_tree();
+
+  void freeze_dir(Context *c);
+  void freeze_dir_finish(Context *c);
+  void unfreeze_dir();
+
+  bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); }
+  bool is_freezing_tree();
+  bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; }
+  bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; }
+
+  bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); }
+  bool is_frozen_tree();
+  bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; }
+  bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; }
+
+  // A tree freeze needs both this dir's pins and the nested pins gone...
+  bool is_freezeable() {
+    if (auth_pins == 0 && nested_auth_pins == 0) return true;
+    return false;
+  }
+  // ...a dir-only freeze looks at this dir's own pins alone.
+  bool is_freezeable_dir() {
+    if (auth_pins == 0) return true;
+    return false;
+  }
+
+
+
+  // debugging
+  void dump(int d = 0);
+};
+
+
+
+// -- encoded state --
+
+// discover
+
+// Snapshot of the dir fields handed to a replica: inode number, the
+// nonce issued to it, dir_auth, and the replication settings.
+// _encode/_decode copy the scalar fields as raw bytes in declaration
+// order, followed by the rep_by set.
+class CDirDiscover {
+  inodeno_t ino;
+  int nonce;
+  int dir_auth;
+  int dir_rep;
+  set<int> rep_by;
+
+ public:
+  // Blank object, meant to be filled by _decode().  The scalars get
+  // defined values so that an instance which is never decoded does not
+  // carry indeterminate data.
+  CDirDiscover()
+    : ino(), nonce(0), dir_auth(CDIR_AUTH_PARENT), dir_rep(CDIR_REP_NONE) {}
+
+  // Capture the discover-relevant fields of `dir` together with the
+  // nonce being issued.
+  CDirDiscover(CDir *dir, int nonce) {
+    ino = dir->ino();
+    this->nonce = nonce;
+    dir_auth = dir->dir_auth;
+    dir_rep = dir->dir_rep;
+    rep_by = dir->dir_rep_by;
+  }
+
+  // Apply the captured fields to a non-auth `dir` with the same ino.
+  void update_dir(CDir *dir) {
+    assert(dir->ino() == ino);
+    assert(!dir->is_auth());
+
+    dir->replica_nonce = nonce;
+    dir->dir_auth = dir_auth;
+    dir->dir_rep = dir_rep;
+    dir->dir_rep_by = rep_by;
+  }
+
+  inodeno_t get_ino() { return ino; }
+
+
+  // Append ino, nonce, dir_auth, dir_rep (raw bytes) and rep_by to `bl`.
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&ino, sizeof(ino));
+    bl.append((char*)&nonce, sizeof(nonce));
+    bl.append((char*)&dir_auth, sizeof(dir_auth));
+    bl.append((char*)&dir_rep, sizeof(dir_rep));
+    ::_encode(rep_by, bl);
+  }
+
+  // Read the fields back from `bl` starting at `off`; `off` is advanced
+  // past everything consumed.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    bl.copy(off, sizeof(nonce), (char*)&nonce);
+    off += sizeof(nonce);
+    bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+    bl.copy(off, sizeof(dir_rep), (char*)&dir_rep);
+    off += sizeof(dir_rep);
+    ::_decode(rep_by, bl, off);
+  }
+
+};
+
+
+// export
+
+typedef struct { // fixed-size header of an exported dir; CDirExport appends/copies it as raw bytes, so field order and types matter
+ inodeno_t ino;
+ __uint64_t nitems; // actual real entries
+ __uint64_t nden; // num dentries (including null ones)
+ version_t version;
+ unsigned state; // CDIR_STATE_* bits of the exported dir
+ meta_load_t popularity_justme;
+ meta_load_t popularity_curdom;
+ int dir_auth;
+ int dir_rep;
+ int nopen_by; // number of (mds, nonce) int pairs that follow the header
+ int nrep_by; // number of rep_by ints that follow the open_by pairs
+ // ints follow
+} CDirExport_st;
+
+// Exported form of a CDir: the fixed CDirExport_st header plus the
+// open_by / open_by_nonce / rep_by sets.  Built from a dir on the
+// sending side, decoded and applied with update_dir() on the receiver.
+class CDirExport {
+  CDirExport_st st;
+  set<int> open_by;
+  map<int,int> open_by_nonce;
+  set<int> rep_by;
+
+ public:
+  // Blank object, meant to be filled by _decode().  The header is zeroed
+  // the same way the CDir* constructor does, so it never holds
+  // indeterminate bytes.
+  CDirExport() {
+    memset(&st, 0, sizeof(st));
+  }
+
+  // Capture `dir` for export.  Note this also mutates `dir`: its JUSTME
+  // and CURDOM popularity are taken over via take(), and the CURDOM
+  // amount is subtracted from its ANYDOM and NESTED popularity.
+  CDirExport(CDir *dir) {
+    memset(&st, 0, sizeof(st));
+
+    st.ino = dir->ino();
+    st.nitems = dir->nitems;
+    st.nden = dir->items.size();
+    st.version = dir->version;
+    st.state = dir->state;
+    st.dir_auth = dir->dir_auth;
+    st.dir_rep = dir->dir_rep;
+
+    st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] );
+    st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] );
+    dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
+    dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+
+    rep_by = dir->dir_rep_by;
+    open_by = dir->open_by;
+    open_by_nonce = dir->open_by_nonce;
+  }
+
+  inodeno_t get_ino() { return st.ino; }
+  __uint64_t get_nden() { return st.nden; }
+
+  // Apply the exported state to `dir` (same ino required).
+  //  - state: a hashed dir only gains the AUTH bit; otherwise the local
+  //    IMPORT_KEPT bits are combined with the sender's EXPORTED bits.
+  //  - popularity is added back in, replica_nonce is reset to 0.
+  //  - open_by / open_by_nonce / dir_rep_by are overwritten, and the
+  //    OPENED and DIRTY pins are taken when applicable.
+  void update_dir(CDir *dir) {
+    assert(dir->ino() == st.ino);
+
+    //dir->nitems = st.nitems;
+    dir->version = st.version;
+    if (dir->state & CDIR_STATE_HASHED)
+      dir->state |= CDIR_STATE_AUTH;  // just inherit auth flag when hashed
+    else
+      dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) |  // remember import flag, etc.
+        (st.state & CDIR_MASK_STATE_EXPORTED);
+    dir->dir_auth = st.dir_auth;
+    dir->dir_rep = st.dir_rep;
+
+    dir->popularity[MDS_POP_JUSTME] += st.popularity_justme;
+    dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
+    dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
+    dir->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+
+    dir->replica_nonce = 0;  // no longer defined
+
+    if (!dir->open_by.empty())
+      dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl;
+
+    dir->dir_rep_by = rep_by;
+    dir->open_by = open_by;
+    dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl;
+    dir->open_by_nonce = open_by_nonce;
+    if (!open_by.empty())
+      dir->get(CDIR_PIN_OPENED);
+    if (dir->is_dirty())
+      dir->get(CDIR_PIN_DIRTY);
+  }
+
+
+  // Append the header, then nopen_by (mds, nonce) int pairs, then
+  // nrep_by ints.  The two counts in the header are refreshed first.
+  void _encode(bufferlist& bl) {
+    st.nrep_by = rep_by.size();
+    st.nopen_by = open_by_nonce.size();
+    bl.append((char*)&st, sizeof(st));
+
+    // open_by
+    for (map<int,int>::iterator it = open_by_nonce.begin();
+         it != open_by_nonce.end();
+         it++) {
+      int m = it->first;
+      bl.append((char*)&m, sizeof(int));
+      int n = it->second;
+      bl.append((char*)&n, sizeof(int));
+    }
+
+    // rep_by
+    for (set<int>::iterator it = rep_by.begin();
+         it != rep_by.end();
+         it++) {
+      int m = *it;
+      bl.append((char*)&m, sizeof(int));
+    }
+  }
+
+  // Inverse of _encode(), reading from `bl` at `off`.  Returns the
+  // offset just past the consumed bytes.
+  int _decode(bufferlist& bl, int off = 0) {
+    bl.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    // open_by
+    for (int i=0; i<st.nopen_by; i++) {
+      int m,n;
+      bl.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      bl.copy(off, sizeof(int), (char*)&n);
+      off += sizeof(int);
+      open_by.insert(m);
+      open_by_nonce.insert(pair<int,int>(m,n));
+    }
+
+    // rep_by
+    for (int i=0; i<st.nrep_by; i++) {
+      int m;
+      bl.copy(off, sizeof(int), (char*)&m);
+      off += sizeof(int);
+      rep_by.insert(m);
+    }
+
+    return off;
+  }
+
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "AnchorTable.h"
+
+#include "common/Clock.h"
+
+#include <string>
+
+#include "config.h"
+#undef dout // replace any earlier dout with the CInode-specific one below
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.inode(" << inode.ino << ") "
+
+
+int cinode_pins[CINODE_NUM_PINS]; // counts, one slot per CInode pin id
+
+
+// Stream a one-line, bracketed description of an inode: ino and path,
+// auth/replica status with nonces, symlink flag, version, lock states,
+// held pins, clients holding caps, and finally the object's address.
+//
+// The address line previously contained a mis-decoded character where
+// the source read `&in;` (address-of `in`, then the statement
+// terminator); it is restored here.
+ostream& operator<<(ostream& out, CInode& in)
+{
+  string path;
+  in.make_path(path);
+  out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" ");
+  if (in.is_auth()) {
+    out << "auth";
+    if (in.is_cached_by_anyone()) {
+      //out << "+" << in.get_cached_by();
+      for (set<int>::iterator it = in.cached_by_begin();
+           it != in.cached_by_end();
+           it++) {
+        out << "+" << *it << "." << in.get_cached_by_nonce(*it);
+      }
+    }
+  } else {
+    out << "rep@" << in.authority();
+    //if (in.get_replica_nonce() > 1)
+    out << "." << in.get_replica_nonce();
+    assert(in.get_replica_nonce() >= 0);
+  }
+
+  if (in.is_symlink()) out << " symlink";
+
+  out << " v" << in.get_version();
+
+  out << " hard=" << in.hardlock;
+  out << " file=" << in.filelock;
+
+  if (in.is_pinned()) {
+    out << " |";
+    for(set<int>::iterator it = in.get_ref_set().begin();
+        it != in.get_ref_set().end();
+        it++)
+      if (*it < CINODE_NUM_PINS)
+        out << " " << cinode_pin_names[*it];
+      else
+        out << " " << *it;
+  }
+
+  // hack: spit out crap on which clients have caps
+  if (!in.get_client_caps().empty()) {
+    out << " caps={";
+    for (map<int,Capability>::iterator it = in.get_client_caps().begin();
+         it != in.get_client_caps().end();
+         it++) {
+      if (it != in.get_client_caps().begin()) out << ",";
+      out << it->first;
+    }
+    out << "}";
+  }
+  out << " " << &in;  // object address, to tell instances apart in logs
+  out << "]";
+  return out;
+}
+
+
+// ====== CInode =======
+// ====== CInode =======
+// Start out unlinked (no parent dentry), with no open dir, no pins, and
+// an all-zero state; the AUTH bit is set afterwards when `auth` is true.
+CInode::CInode(MDCache *c, bool auth) : LRUObject() {
+  mdcache = c;
+
+  // linkage
+  parent = NULL;
+  dir = NULL;   // CDir opened separately
+
+  // counters
+  ref = 0;
+  auth_pins = 0;
+  nested_auth_pins = 0;
+  num_request_pins = 0;
+
+  // versions
+  committing_version = committed_version = 0;
+
+  // state bits
+  state = 0;
+  if (auth)
+    state_set(CINODE_STATE_AUTH);
+}
+
+// The inode owns its open dir, if any.
+CInode::~CInode() {
+  delete dir;  // deleting a null pointer is a no-op
+  dir = 0;
+}
+
+// Dir holding the dentry that links this inode, or NULL when unlinked.
+CDir *CInode::get_parent_dir()
+{
+  return parent ? parent->dir : NULL;
+}
+// Inode of the dir holding our dentry, or NULL when unlinked.
+CInode *CInode::get_parent_inode()
+{
+  return parent ? parent->dir->inode : NULL;
+}
+
+// Auth status of the dir: asks the open dir when there is one, and falls
+// back to the inode's own auth flag otherwise.
+bool CInode::dir_is_auth() {
+  return dir ? dir->is_auth() : is_auth();
+}
+
+// Return the inode's dir, creating an auth CDir first if none is open.
+// Only valid on directory inodes; creating requires that the inode is
+// auth and not frozen_dir.  A freshly created dir gets
+// dir_auth = CDIR_AUTH_PARENT (-1).
+CDir *CInode::get_or_open_dir(MDS *mds)
+{
+  assert(is_dir());
+
+  if (!dir) {
+    // can't open a dir if we're frozen_dir, bc of hashing stuff.
+    assert(!is_frozen_dir());
+
+    // only auth can open dir alone.
+    assert(is_auth());
+
+    CDir *fresh = new CDir(this, mds, true);
+    set_dir(fresh);
+    dir->dir_auth = CDIR_AUTH_PARENT;
+  }
+  return dir;
+}
+
+// Attach `newdir` as this inode's dir (none may be attached yet) and
+// return it.
+CDir *CInode::set_dir(CDir *newdir)
+{
+  assert(dir == 0);
+  return dir = newdir;
+}
+
+// Set or clear the AUTH state bit according to `a`.
+void CInode::set_auth(bool a)
+{
+  if (!(is_dangling() || is_root() || is_auth() == a)) {
+    // Auth flips on a linked, non-root inode used to adjust the parent
+    // dir's nauthitems counter; that bookkeeping is disabled, so this
+    // branch is intentionally empty.
+  }
+
+  if (a) {
+    state_set(CINODE_STATE_AUTH);
+  } else {
+    state_clear(CINODE_STATE_AUTH);
+  }
+}
+
+
+
+// Build this inode's path into `s`: linked inodes delegate to their
+// parent dentry, the root yields "", anything else "(dangling)".
+void CInode::make_path(string& s)
+{
+  if (parent) {
+    parent->make_path(s);
+    return;
+  }
+
+  s = is_root() ? "" : "(dangling)";
+}
+
+// Append newly allocated Anchor records for this inode to `trace`.
+//  - linked: the parent dir's inode is traced first (recursively), then
+//    an anchor naming (our ino, parent dir's ino, our dentry name);
+//  - dangling: a single anchor whose dir ino is
+//    MDS_INO_INODEFILE_OFFSET + dangling_auth, with an empty name;
+//  - otherwise the inode must be the root and nothing is added.
+void CInode::make_anchor_trace(vector<Anchor*>& trace)
+{
+  if (parent) {
+    CInode *diri = parent->dir->inode;
+    diri->make_anchor_trace(trace);
+
+    dout(7) << "make_anchor_trace adding " << ino() << " dirino " << diri->ino() << " dn " << parent->name << endl;
+    trace.push_back( new Anchor(ino(), diri->ino(), parent->name) );
+    return;
+  }
+
+  if (state_test(CINODE_STATE_DANGLING)) {
+    dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl;
+    string ref_dn;
+    trace.push_back( new Anchor(ino(),
+                                MDS_INO_INODEFILE_OFFSET+dangling_auth,
+                                ref_dn) );
+    return;
+  }
+
+  assert(is_root());
+}
+
+
+
+
+// Mark this inode (and the dir that contains it) dirty.
+//
+// Unlinked inodes are left alone.  Must be called even when the inode is
+// already dirty: every call bumps inode.version, dirties the parent dir
+// (perhaps newly), and refreshes parent_dir_version from it.
+void CInode::mark_dirty() {
+
+  dout(10) << "mark_dirty " << *this << endl;
+
+  if (!parent) {
+    dout(10) << " dangling, not marking dirty!" << endl;
+    return;
+  }
+
+  // only auth can get dirty.  "dirty" async data in replicas is relative
+  // to (say) filelock state, not dirty flag.
+  assert(is_auth());
+
+  // touch my private version
+  ++inode.version;
+  const bool was_dirty = (state & CINODE_STATE_DIRTY);
+  if (!was_dirty) {
+    state |= CINODE_STATE_DIRTY;
+    get(CINODE_PIN_DIRTY);
+  }
+
+  // relative to parent dir:
+  if (parent) {
+    // dir is now dirty (if it wasn't already)
+    parent->dir->mark_dirty();
+
+    // i now live in that (potentially newly dirty) version
+    parent_dir_version = parent->dir->get_version();
+  }
+}
+
+// Drop the DIRTY bit and its pin; only logs when the inode is not dirty.
+void CInode::mark_clean()
+{
+  dout(10) << " mark_clean " << *this << endl;
+  if (!(state & CINODE_STATE_DIRTY))
+    return;
+
+  state &= ~CINODE_STATE_DIRTY;
+  put(CINODE_PIN_DIRTY);
+}
+
+// state
+
+
+
+
+
+// new state encoders
+
+// Append the raw in-memory bytes of inode.size, inode.mtime and inode.atime
+// (in that order) to `bl`.  Must stay in step with decode_file_state().
+void CInode::encode_file_state(bufferlist& bl)
+{
+  char *size_bytes  = (char*)&inode.size;
+  char *mtime_bytes = (char*)&inode.mtime;
+  char *atime_bytes = (char*)&inode.atime;
+
+  bl.append(size_bytes,  sizeof(inode.size));
+  bl.append(mtime_bytes, sizeof(inode.mtime));
+  bl.append(atime_bytes, sizeof(inode.atime));   // ??
+}
+
+// Read inode.size, inode.mtime and inode.atime (in that order) out of `r`
+// starting at byte offset `off`; `off` is advanced past what was consumed.
+void CInode::decode_file_state(bufferlist& r, int& off)
+{
+  const size_t size_len  = sizeof(inode.size);
+  const size_t mtime_len = sizeof(inode.mtime);
+  const size_t atime_len = sizeof(inode.atime);
+
+  r.copy(off, size_len, (char*)&inode.size);
+  off += size_len;
+
+  r.copy(off, mtime_len, (char*)&inode.mtime);
+  off += mtime_len;
+
+  r.copy(off, atime_len, (char*)&inode.atime);
+  off += atime_len;
+}
+
+/* not used currently
+void CInode::decode_merge_file_state(crope& r, int& off)
+{
+ __uint64_t size;
+ r.copy(off, sizeof(size), (char*)&size);
+ off += sizeof(size);
+ if (size > inode.size) inode.size = size;
+
+ time_t t;
+ r.copy(off, sizeof(t), (char*)&t);
+ off += sizeof(t);
+ if (t > inode.mtime) inode.mtime = t;
+
+ r.copy(off, sizeof(t), (char*)&t);
+ off += sizeof(t);
+ if (t > inode.atime) inode.atime = t;
+}
+*/
+
+// Append the raw in-memory bytes of inode.mode, uid, gid and ctime (in that
+// order) to `r`.  Must stay in step with decode_hard_state().
+void CInode::encode_hard_state(bufferlist& r)
+{
+  char *mode_bytes  = (char*)&inode.mode;
+  char *uid_bytes   = (char*)&inode.uid;
+  char *gid_bytes   = (char*)&inode.gid;
+  char *ctime_bytes = (char*)&inode.ctime;
+
+  r.append(mode_bytes,  sizeof(inode.mode));
+  r.append(uid_bytes,   sizeof(inode.uid));
+  r.append(gid_bytes,   sizeof(inode.gid));
+  r.append(ctime_bytes, sizeof(inode.ctime));
+}
+
+// Read inode.mode, uid, gid and ctime (in that order) out of `r` starting at
+// byte offset `off`; `off` is advanced past what was consumed.
+void CInode::decode_hard_state(bufferlist& r, int& off)
+{
+  const size_t mode_len  = sizeof(inode.mode);
+  const size_t uid_len   = sizeof(inode.uid);
+  const size_t gid_len   = sizeof(inode.gid);
+  const size_t ctime_len = sizeof(inode.ctime);
+
+  r.copy(off, mode_len, (char*)&inode.mode);
+  off += mode_len;
+
+  r.copy(off, uid_len, (char*)&inode.uid);
+  off += uid_len;
+
+  r.copy(off, gid_len, (char*)&inode.gid);
+  off += gid_len;
+
+  r.copy(off, ctime_len, (char*)&inode.ctime);
+  off += ctime_len;
+}
+
+
+// old state encoders
+
+/*
+void CInode::encode_basic_state(bufferlist& r)
+{
+ // inode
+ r.append((char*)&inode, sizeof(inode));
+ ::_encode(cached_by, r);
+ ::_encode(cached_by_nonce, r);
+}
+
+void CInode::decode_basic_state(bufferlist& r, int& off)
+{
+ // inode
+ r.copy(0,sizeof(inode_t), (char*)&inode);
+ off += sizeof(inode_t);
+
+ bool empty = cached_by.empty();
+ ::_decode(cached_by, r, off);
+ ::_decode(cached_by_nonce, r, off);
+ if (!empty)
+ get(CINODE_PIN_CACHED);
+}
+*/
+
+
+// waiting
+
+// An inode is frozen iff it has a primary parent dentry whose directory
+// reports is_frozen().
+bool CInode::is_frozen()
+{
+  return parent && parent->dir->is_frozen();
+}
+
+// True iff there is a primary parent dentry and its directory reports
+// is_frozen_dir().
+bool CInode::is_frozen_dir()
+{
+  return parent && parent->dir->is_frozen_dir();
+}
+
+// True iff there is a primary parent dentry and its directory reports
+// is_freezing().
+bool CInode::is_freezing()
+{
+  return parent && parent->dir->is_freezing();
+}
+
+// True if at least one waiter is registered under exactly this tag value
+// (exact key match, not a bitmask test).
+bool CInode::waiting_for(int tag)
+{
+  return waiting.find(tag) != waiting.end();
+}
+
+// Register context `c` to wait for `tag`.
+// If the tag carries CDIR_WAIT_ATFREEZEROOT and we are freezing/frozen, the
+// waiter is handed to the parent directory instead.  Otherwise it is queued
+// on this inode; the first queued waiter takes the WAITER pin.
+void CInode::add_waiter(int tag, Context *c)
+{
+  // waiting on hierarchy?  (freezing/frozen implies parent is non-null)
+  const bool defer_to_dir =
+    (tag & CDIR_WAIT_ATFREEZEROOT) && (is_freezing() || is_frozen());
+  if (defer_to_dir) {
+    parent->dir->add_waiter(tag, c);
+    return;
+  }
+
+  // this inode.
+  if (waiting.empty())
+    get(CINODE_PIN_WAITER);
+  waiting.insert(multimap<int,Context*>::value_type(tag, c));
+  dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
+}
+
+// Move every waiter whose tag shares a bit with `mask` from this inode onto
+// the back of `ls`.  Releases the WAITER pin if that empties the wait list.
+void CInode::take_waiting(int mask, list<Context*>& ls)
+{
+  if (waiting.empty()) return;   // nothing queued, and no pin to drop
+
+  for (multimap<int,Context*>::iterator it = waiting.begin();
+       it != waiting.end(); ) {
+    if ((it->first & mask) == 0) {
+      dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl;
+      it++;
+      continue;
+    }
+
+    ls.push_back(it->second);
+    dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+
+    // post-increment so the iterator has moved on before its node is erased
+    waiting.erase(it++);
+  }
+
+  if (waiting.empty())
+    put(CINODE_PIN_WAITER);
+}
+
+// Detach all waiters matching `mask` (see take_waiting) and complete them
+// via finish_contexts() with `result`.
+void CInode::finish_waiting(int mask, int result)
+{
+  dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl;
+
+  list<Context*> ready;
+  take_waiting(mask, ready);
+  finish_contexts(ready, result);
+}
+
+
+// auth_pins
+// An inode without a primary parent can always be auth-pinned; otherwise the
+// answer comes from the parent directory.
+bool CInode::can_auth_pin()
+{
+  if (!parent)
+    return true;
+  return parent->dir->can_auth_pin();
+}
+
+// Take one auth pin.  The 0->1 transition also takes the AUTHPIN cache pin;
+// every call bumps the parent directory's nested auth pin count by one.
+void CInode::auth_pin()
+{
+  const bool first_pin = (auth_pins == 0);
+  if (first_pin)
+    get(CINODE_PIN_AUTHPIN);
+  ++auth_pins;
+
+  dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  if (!parent)
+    return;
+  parent->dir->adjust_nested_auth_pins( 1 );
+}
+
+// Release one auth pin.  The 1->0 transition also drops the AUTHPIN cache
+// pin; every call lowers the parent directory's nested auth pin count by one.
+void CInode::auth_unpin()
+{
+  --auth_pins;
+  if (!auth_pins)
+    put(CINODE_PIN_AUTHPIN);
+
+  dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  // an unbalanced unpin shows up here as a negative count
+  assert(auth_pins >= 0);
+
+  if (!parent)
+    return;
+  parent->dir->adjust_nested_auth_pins( -1 );
+}
+
+
+
+// authority
+
+// Which mds is authoritative for this inode:
+//  - dangling: the explicitly recorded dangling_auth
+//  - root:     mds 0
+//  - else:     whatever the parent directory reports for our dentry name
+int CInode::authority()
+{
+  if (is_dangling()) {
+    return dangling_auth;   // explicit
+  }
+  if (is_root()) {
+    return 0;               // i am root
+  }
+
+  assert(parent);
+  CDir *pdir = parent->dir;
+  return pdir->dentry_authority( parent->name );
+}
+
+
+// Record mds `rep` as a replica holder and return a newly allocated
+// CInodeDiscover describing this inode for it.  Only valid on the auth copy.
+// If nobody cached the inode before, locks are relaxed first.
+// The returned object is created with `new`; this function does not free it.
+CInodeDiscover* CInode::replicate_to( int rep )
+{
+  assert(is_auth());
+
+  // relax locks?  (only when this is the first replica)
+  const bool had_replicas = is_cached_by_anyone();
+  if (!had_replicas)
+    replicate_relax_locks();
+
+  // register the replica, then return the thinger carrying its nonce
+  const int nonce = cached_by_add( rep );
+  CInodeDiscover *disc = new CInodeDiscover( this, nonce );
+  return disc;
+}
+
+
+// debug crap -----------------------------
+
+// Debug dump.  Currently prints nothing for the inode itself (the print is
+// commented out) and just forwards to the open directory's dump(), if any.
+void CInode::dump(int dep)
+{
+  // indentation prefix for the (disabled) inode line below
+  string ind(dep, '\t');
+  //cout << ind << "[inode " << this << "]" << endl;
+
+  if (!dir)
+    return;
+  dir->dump(dep);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CINODE_H
+#define __CINODE_H
+
+#include "config.h"
+#include "include/types.h"
+#include "include/lru.h"
+
+#include "CDentry.h"
+#include "Lock.h"
+#include "Capability.h"
+
+#include "mdstypes.h"
+
+#include <cassert>
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+#include <iostream>
+using namespace std;
+
+
+
+
+
+// pins for keeping an item in cache (and debugging)
+//
+// Each value is a pin id handed to CInode::get()/put(); it indexes both the
+// cinode_pins[] counters and the cinode_pin_names[] table below, so all ids
+// must stay below CINODE_NUM_PINS.  Id 6 is currently unassigned.
+#define CINODE_PIN_DIR 0
+#define CINODE_PIN_CACHED 1
+#define CINODE_PIN_DIRTY 2 // must flush
+#define CINODE_PIN_PROXY 3 // can't expire yet
+#define CINODE_PIN_WAITER 4 // waiter
+
+#define CINODE_PIN_CAPS 5 // local fh's
+
+#define CINODE_PIN_DNDIRTY 7 // dentry is dirty
+
+#define CINODE_PIN_AUTHPIN 8
+#define CINODE_PIN_IMPORTING 9 // multipurpose, for importing
+#define CINODE_PIN_REQUEST 10 // request is logging, finishing
+#define CINODE_PIN_RENAMESRC 11 // pinned on dest for foreign rename
+#define CINODE_PIN_ANCHORING 12
+
+#define CINODE_PIN_OPENINGDIR 13
+
+#define CINODE_PIN_DENTRYLOCK 14
+
+#define CINODE_NUM_PINS 15
+
+// Short printable name for each pin id, indexed by CINODE_PIN_*.
+// The entries point at string literals, so the element type is
+// `const char *`: binding a literal to a plain `char *` is deprecated in
+// C++98 and ill-formed from C++11 on.
+static const char *cinode_pin_names[CINODE_NUM_PINS] = {
+  "dir",       //  0 CINODE_PIN_DIR
+  "cached",    //  1 CINODE_PIN_CACHED
+  "dirty",     //  2 CINODE_PIN_DIRTY
+  "proxy",     //  3 CINODE_PIN_PROXY
+  "waiter",    //  4 CINODE_PIN_WAITER
+  "caps",      //  5 CINODE_PIN_CAPS
+  "--",        //  6 (unassigned)
+  "dndirty",   //  7 CINODE_PIN_DNDIRTY
+  "authpin",   //  8 CINODE_PIN_AUTHPIN
+  "imping",    //  9 CINODE_PIN_IMPORTING
+  "request",   // 10 CINODE_PIN_REQUEST
+  "rensrc",    // 11 CINODE_PIN_RENAMESRC
+  "anching",   // 12 CINODE_PIN_ANCHORING
+  "opdir",     // 13 CINODE_PIN_OPENINGDIR
+  "dnlock"     // 14 CINODE_PIN_DENTRYLOCK
+};
+
+
+
+
+
+
+// wait reasons
+//
+// Tags used with CInode::add_waiter()/take_waiting()/finish_waiting().
+// take_waiting() matches with a bitwise AND against the tag, so each reason
+// is a distinct bit.  Bits 11 and 13..30 are defined here (bit 12 is not);
+// CINODE_WAIT_AUTHPINNABLE aliases a CDIR_WAIT_* value defined elsewhere.
+#define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE
+ // waiters: write_hard_start, read_file_start, write_file_start (mdcache)
+ // handle_client_chmod, handle_client_touch (mds)
+ // trigger: (see CDIR_WAIT_UNFREEZE)
+#define CINODE_WAIT_GETREPLICA (1<<11) // update/replicate individual inode
+ // waiters: import_dentry_inode
+ // trigger: handle_inode_replicate_ack
+
+#define CINODE_WAIT_DIR (1<<13)
+ // waiters: traverse_path
+ // triggers: handle_discover_reply
+
+#define CINODE_WAIT_LINK (1<<14) // as in remotely nlink++
+#define CINODE_WAIT_ANCHORED (1<<15)
+#define CINODE_WAIT_UNLINK (1<<16) // as in remotely nlink--
+
+// hardlock-related waits; HARDRWB is the union of the R/W/B bits
+#define CINODE_WAIT_HARDR (1<<17) // 131072
+#define CINODE_WAIT_HARDW (1<<18) // 262...
+#define CINODE_WAIT_HARDB (1<<19)
+#define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB)
+#define CINODE_WAIT_HARDSTABLE (1<<20)
+#define CINODE_WAIT_HARDNORD (1<<21)
+// filelock-related waits; FILERWB is the union of the R/W/B bits
+#define CINODE_WAIT_FILER (1<<22)
+#define CINODE_WAIT_FILEW (1<<23)
+#define CINODE_WAIT_FILEB (1<<24)
+#define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB)
+#define CINODE_WAIT_FILESTABLE (1<<25)
+#define CINODE_WAIT_FILENORD (1<<26)
+#define CINODE_WAIT_FILENOWR (1<<27)
+
+#define CINODE_WAIT_RENAMEACK (1<<28)
+#define CINODE_WAIT_RENAMENOTIFYACK (1<<29)
+
+#define CINODE_WAIT_CAPS (1<<30)
+
+
+
+
+// mask with every bit set, i.e. matches any tag in take_waiting().
+// NOTE(review): this is an unsigned literal while the waiter APIs take `int`
+// masks; confirm the implicit conversion is what callers expect.
+#define CINODE_WAIT_ANY 0xffffffff
+
+
+// state
+//
+// Bit flags stored in CInode::state and manipulated through
+// state_set()/state_clear()/state_test().
+#define CINODE_STATE_AUTH (1<<0)
+#define CINODE_STATE_ROOT (1<<1)
+
+#define CINODE_STATE_DIRTY (1<<2)
+#define CINODE_STATE_UNSAFE (1<<3) // not logged yet
+#define CINODE_STATE_DANGLING (1<<4) // delete me when i expire; i have no dentry
+#define CINODE_STATE_UNLINKING (1<<5)
+#define CINODE_STATE_PROXY (1<<6) // can't expire yet
+#define CINODE_STATE_EXPORTING (1<<7) // on nonauth bystander.
+
+#define CINODE_STATE_ANCHORING (1<<8)
+
+#define CINODE_STATE_OPENINGDIR (1<<9)
+
+// disabled flags; note bits 8 and 9 are now taken by ANCHORING/OPENINGDIR,
+// so these cannot be re-enabled with the values shown.
+//#define CINODE_STATE_RENAMING (1<<8) // moving me
+//#define CINODE_STATE_RENAMINGTO (1<<9) // rename target (will be unlinked)
+
+
+// misc
+// fixed replica nonce values (both are 1)
+#define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export
+#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME???
+
+class Context;
+class CDentry;
+class CDir;
+class MDS;
+class Message;
+class CInode;
+class CInodeDiscover;
+class MDCache;
+
+//class MInodeSyncStart;
+
+ostream& operator<<(ostream& out, CInode& in);
+
+
+extern int cinode_pins[CINODE_NUM_PINS]; // counts
+
+
+// cached inode wrapper
+//
+// Wraps an inode_t held in the MDS cache together with its cache
+// bookkeeping: reference pins (each CINODE_PIN_* reason may be held at most
+// once, see get()/put()), state flags (CINODE_STATE_*), the set of mds's
+// replicating it, waiters, client capabilities and auth pins.
+class CInode : public LRUObject {
+ public:
+  MDCache *mdcache;
+
+  inode_t inode;     // the inode itself
+
+  CDir *dir;         // directory, if we have it opened.
+  string symlink;    // symlink dest, if symlink
+
+  // inode metadata locks
+  CLock hardlock;
+  CLock filelock;
+
+ protected:
+  int ref;                         // reference count
+  set<int> ref_set;                // pin ids (CINODE_PIN_*) currently held
+  version_t parent_dir_version;    // parent dir version when i was last touched.
+  version_t committing_version;
+  version_t committed_version;
+
+  unsigned state;                  // CINODE_STATE_* bits
+
+  // parent dentries in cache
+  CDentry *parent;                 // primary link
+  set<CDentry*> remote_parents;    // if hard linked
+
+  // -- distributed caching
+  set<int> cached_by;              // [auth] mds's that cache me.
+  /* NOTE: on replicas, this doubles as replicated_by, but the
+     cached_by_* access methods below should NOT be used in those
+     cases, as the semantics are different! */
+  map<int,int> cached_by_nonce;    // [auth] nonce issued to each replica
+  int replica_nonce;               // [replica] defined on replica
+
+  int dangling_auth;               // explicit auth, when dangling.
+
+  int num_request_pins;            // see request_pin_get()/request_pin_put()
+
+  // waiters
+  multimap<int, Context*> waiting; // tag -> context
+
+  // file capabilities
+  map<int, Capability> client_caps;   // client -> caps
+
+  map<int, int> mds_caps_wanted;      // [auth] mds -> caps wanted
+  int replica_caps_wanted;            // [replica] what i've requested from auth
+  utime_t replica_caps_wanted_keep_until;
+
+
+ private:
+  // lock nesting
+  int auth_pins;
+  int nested_auth_pins;
+
+ public:
+  meta_load_t popularity[MDS_NPOP];
+
+  // friends
+  friend class Server;
+  friend class Locker;
+  friend class Migrator;
+  friend class MDCache;
+  friend class CDir;
+  friend class CInodeExport;
+  friend class CInodeDiscover;
+
+ public:
+  // ---------------------------
+  CInode(MDCache *c, bool auth=true);
+  ~CInode();
+
+
+  // -- accessors --
+  // file type tests, based on the type bits of inode.mode
+  bool is_file() { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE; }
+  bool is_symlink() { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; }
+  bool is_dir() { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; }
+
+  bool is_anchored() { return inode.anchored; }
+
+  bool is_root() { return state & CINODE_STATE_ROOT; }
+  bool is_proxy() { return state & CINODE_STATE_PROXY; }
+
+  bool is_auth() { return state & CINODE_STATE_AUTH; }
+  void set_auth(bool auth);
+  bool is_replica() { return !is_auth(); }
+  // only meaningful on a replica (asserts !is_auth())
+  int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
+
+  inodeno_t ino() { return inode.ino; }
+  inode_t& get_inode() { return inode; }
+  CDentry* get_parent_dn() { return parent; }
+  CDir *get_parent_dir();
+  CInode *get_parent_inode();
+  CInode *get_realm_root();   // import, hash, or root
+
+  CDir *get_or_open_dir(MDS *mds);
+  CDir *set_dir(CDir *newdir);
+
+  bool dir_is_auth();
+
+
+
+  // -- misc --
+  void make_path(string& s);
+  void make_anchor_trace(vector<class Anchor*>& trace);
+
+
+
+  // -- state --
+  unsigned get_state() { return state; }
+  void state_clear(unsigned mask) { state &= ~mask; }
+  void state_set(unsigned mask) { state |= mask; }
+  // returns the masked bits (non-zero if any bit of mask is set)
+  unsigned state_test(unsigned mask) { return state & mask; }
+
+  bool is_unsafe() { return state & CINODE_STATE_UNSAFE; }
+  bool is_dangling() { return state & CINODE_STATE_DANGLING; }
+  bool is_unlinking() { return state & CINODE_STATE_UNLINKING; }
+
+  void mark_unsafe() { state |= CINODE_STATE_UNSAFE; }
+  void mark_safe() { state &= ~CINODE_STATE_UNSAFE; }
+
+  // -- state encoding --
+  //void encode_basic_state(bufferlist& r);
+  //void decode_basic_state(bufferlist& r, int& off);
+
+
+  void encode_file_state(bufferlist& r);
+  void decode_file_state(bufferlist& r, int& off);
+
+  void encode_hard_state(bufferlist& r);
+  void decode_hard_state(bufferlist& r, int& off);
+
+
+  // -- dirtyness --
+  version_t get_version() { return inode.version; }
+  version_t get_parent_dir_version() { return parent_dir_version; }
+  // raise parent_dir_version to at least `ge` (never lowers it)
+  void float_parent_dir_version(version_t ge) {
+    if (parent_dir_version < ge)
+      parent_dir_version = ge;
+  }
+  version_t get_committing_version() { return committing_version; }
+  version_t get_last_committed_version() { return committed_version; }
+  void set_committing_version(version_t v) { committing_version = v; }
+  // the in-flight commit finished: promote committing -> committed
+  void set_committed_version() {
+    committed_version = committing_version;
+    committing_version = 0;
+  }
+
+  bool is_dirty() { return state & CINODE_STATE_DIRTY; }
+  bool is_clean() { return !is_dirty(); }
+
+  void mark_dirty();
+  void mark_clean();
+
+
+
+  // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy
+  bool is_cached_by_anyone() { return !cached_by.empty(); }
+  bool is_cached_by(int mds) { return cached_by.count(mds); }
+  int num_cached_by() { return cached_by.size(); }
+  // cached_by_add returns a nonce:
+  //  - mds not yet known: it is added (first replica takes the CACHED pin)
+  //    and gets nonce 1
+  //  - mds already known: its previous nonce + 1
+  int cached_by_add(int mds) {
+    int nonce = 1;
+    if (is_cached_by(mds)) {    // already had it?
+      nonce = get_cached_by_nonce(mds) + 1;   // new nonce (+1)
+      dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
+      cached_by_nonce.erase(mds);
+    } else {
+      if (cached_by.empty())
+        get(CINODE_PIN_CACHED);
+      cached_by.insert(mds);
+    }
+    cached_by_nonce.insert(pair<int,int>(mds,nonce));   // first! serial of 1.
+    return nonce;   // default nonce
+  }
+  // add a replica with an explicit nonce.
+  // NOTE(review): map::insert leaves an existing nonce for `mds` untouched;
+  // confirm callers never pass an mds that is already present.
+  void cached_by_add(int mds, int nonce) {
+    if (cached_by.empty())
+      get(CINODE_PIN_CACHED);
+    cached_by.insert(mds);
+    cached_by_nonce.insert(pair<int,int>(mds,nonce));
+  }
+  // nonce previously issued to `mds`; the mds must have one (asserted, since
+  // dereferencing the end iterator would be undefined behaviour).
+  int get_cached_by_nonce(int mds) {
+    map<int,int>::iterator it = cached_by_nonce.find(mds);
+    assert(it != cached_by_nonce.end());
+    return it->second;
+  }
+  // forget a replica; the last one to go releases the CACHED pin
+  void cached_by_remove(int mds) {
+    //if (!is_cached_by(mds)) return;
+    assert(is_cached_by(mds));
+
+    cached_by.erase(mds);
+    cached_by_nonce.erase(mds);
+    if (cached_by.empty())
+      put(CINODE_PIN_CACHED);
+  }
+  // forget all replicas, releasing the CACHED pin if one was held
+  void cached_by_clear() {
+    if (cached_by.size())
+      put(CINODE_PIN_CACHED);
+    cached_by.clear();
+    cached_by_nonce.clear();
+  }
+  set<int>::iterator cached_by_begin() { return cached_by.begin(); }
+  set<int>::iterator cached_by_end() { return cached_by.end(); }
+  set<int>& get_cached_by() { return cached_by; }
+
+  CInodeDiscover* replicate_to(int rep);
+
+
+  // -- waiting --
+  bool waiting_for(int tag);
+  void add_waiter(int tag, Context *c);
+  void take_waiting(int tag, list<Context*>& ls);
+  void finish_waiting(int mask, int result = 0);
+
+
+  // -- caps -- (new)
+  // client caps
+  map<int,Capability>& get_client_caps() { return client_caps; }
+  // add a cap for a client that has none yet (asserted); the first cap takes
+  // the CAPS pin
+  void add_client_cap(int client, Capability& cap) {
+    if (client_caps.empty())
+      get(CINODE_PIN_CAPS);
+    assert(client_caps.count(client) == 0);
+    client_caps[client] = cap;
+  }
+  // remove an existing cap (asserted); the last one releases the CAPS pin
+  void remove_client_cap(int client) {
+    assert(client_caps.count(client) == 1);
+    client_caps.erase(client);
+    if (client_caps.empty())
+      put(CINODE_PIN_CAPS);
+  }
+  // pointer into client_caps, or 0 if the client holds no cap
+  Capability* get_client_cap(int client) {
+    if (client_caps.count(client))
+      return &client_caps[client];
+    return 0;
+  }
+  /*
+  void set_client_caps(map<int,Capability>& cl) {
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);
+    client_caps.clear();
+    client_caps = cl;
+  }
+  */
+  // hand all client caps over to `cl` (overwriting it) and leave this inode
+  // with none; releases the CAPS pin if one was held
+  void take_client_caps(map<int,Capability>& cl) {
+    if (!client_caps.empty())
+      put(CINODE_PIN_CAPS);
+    cl = client_caps;
+    client_caps.clear();
+  }
+  // fold the caps in `cl` into ours: merged for clients we already know,
+  // copied for new ones.  Every client id seen in `cl` is added to
+  // `new_client_caps`.  Takes the CAPS pin if we go from none to some.
+  void merge_client_caps(map<int,Capability>& cl, set<int>& new_client_caps) {
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);
+    for (map<int,Capability>::iterator it = cl.begin();
+         it != cl.end();
+         it++) {
+      new_client_caps.insert(it->first);
+      if (client_caps.count(it->first)) {
+        // merge
+        client_caps[it->first].merge(it->second);
+      } else {
+        // new
+        client_caps[it->first] = it->second;
+      }
+    }
+  }
+
+  // caps issued, wanted
+  // union of issued() over all client caps
+  int get_caps_issued() {
+    int c = 0;
+    for (map<int,Capability>::iterator it = client_caps.begin();
+         it != client_caps.end();
+         it++)
+      c |= it->second.issued();
+    return c;
+  }
+  // union of wanted() over all client caps, plus (on the auth copy only)
+  // whatever other mds's have asked for
+  int get_caps_wanted() {
+    int w = 0;
+    for (map<int,Capability>::iterator it = client_caps.begin();
+         it != client_caps.end();
+         it++) {
+      w |= it->second.wanted();
+      //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
+    }
+    if (is_auth())
+      for (map<int,int>::iterator it = mds_caps_wanted.begin();
+           it != mds_caps_wanted.end();
+           it++) {
+        w |= it->second;
+        //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
+      }
+    return w;
+  }
+
+
+  // Called on the auth copy just before the first replica is created
+  // (asserts is_auth() and no replicas): moves hardlock/filelock from
+  // LOCK_LOCK to LOCK_SYNC when they are unused (and, for filelock, no
+  // CAP_FILE_WR is issued).
+  void replicate_relax_locks() {
+    assert(is_auth());
+    assert(!is_cached_by_anyone());
+    dout(10) << " relaxing locks on " << *this << endl;
+
+    if (hardlock.get_state() == LOCK_LOCK &&
+        !hardlock.is_used()) {
+      dout(10) << " hard now sync " << *this << endl;
+      hardlock.set_state(LOCK_SYNC);
+    }
+    if (filelock.get_state() == LOCK_LOCK) {
+      if (!filelock.is_used() &&
+          (get_caps_issued() & CAP_FILE_WR) == 0) {
+        filelock.set_state(LOCK_SYNC);
+        dout(10) << " file now sync " << *this << endl;
+      } else {
+        dout(10) << " can't relax filelock on " << *this << endl;
+      }
+    }
+  }
+
+
+  // -- authority --
+  int authority();
+
+
+  // -- auth pins --
+  // returns the auth pin count (non-zero means pinned)
+  int is_auth_pinned() {
+    return auth_pins;
+  }
+  int adjust_nested_auth_pins(int a);
+  bool can_auth_pin();
+  void auth_pin();
+  void auth_unpin();
+
+
+  // -- freeze --
+  bool is_frozen();
+  bool is_frozen_dir();
+  bool is_freezing();
+
+
+  // -- reference counting --
+
+  /* these can be pinned any # of times, and are
+     linked to an active_request, so they're automatically cleaned
+     up when a request is finished.  pin at will! */
+  void request_pin_get() {
+    if (num_request_pins == 0) get(CINODE_PIN_REQUEST);
+    num_request_pins++;
+  }
+  void request_pin_put() {
+    num_request_pins--;
+    if (num_request_pins == 0) put(CINODE_PIN_REQUEST);
+    assert(num_request_pins >= 0);
+  }
+
+
+  bool is_pinned() { return ref > 0; }
+  set<int>& get_ref_set() { return ref_set; }
+  // Drop the pin held under id `by` (a CINODE_PIN_* value).  The pin must
+  // currently be held (asserted).  Dropping the last pin lru_unpin()s us.
+  void put(int by) {
+    // `by` indexes cinode_pins[] and cinode_pin_names[]
+    assert(by >= 0 && by < CINODE_NUM_PINS);
+    cinode_pins[by]--;
+    if (ref == 0 || ref_set.count(by) != 1) {
+      dout(7) << " bad put " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 1);
+      assert(ref > 0);
+    }
+    ref--;
+    ref_set.erase(by);
+    if (ref == 0)
+      lru_unpin();
+    dout(7) << " put " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  // Take a pin under id `by` (a CINODE_PIN_* value).  Each id may be held at
+  // most once (asserted).  Taking the first pin lru_pin()s us.
+  void get(int by) {
+    // `by` indexes cinode_pins[] and cinode_pin_names[]
+    assert(by >= 0 && by < CINODE_NUM_PINS);
+    cinode_pins[by]++;
+    if (ref == 0)
+      lru_pin();
+    if (ref_set.count(by)) {
+      dout(7) << " bad get " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 0);
+    }
+    ref++;
+    ref_set.insert(by);
+    dout(7) << " get " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  bool is_pinned_by(int by) {
+    return ref_set.count(by);
+  }
+
+  // -- hierarchy stuff --
+  void set_primary_parent(CDentry *p) {
+    parent = p;
+  }
+  // `dn` must be the current primary parent (asserted)
+  void remove_primary_parent(CDentry *dn) {
+    assert(dn == parent);
+    parent = 0;
+  }
+  void add_remote_parent(CDentry *p) {
+    remote_parents.insert(p);
+  }
+  void remove_remote_parent(CDentry *p) {
+    remote_parents.erase(p);
+  }
+  int num_remote_parents() {
+    return remote_parents.size();
+  }
+
+
+  /*
+  // for giving to clients
+  void get_dist_spec(set<int>& ls, int auth, timepair_t& now) {
+    if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) ||
+        (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) {
+      //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
+      ls = cached_by;
+    }
+  }
+  */
+
+  // dbg
+  void dump(int d = 0);
+};
+
+
+
+
+// -- encoded state
+
+// discover
+
+// Snapshot of an inode as sent to an mds that is about to replicate it:
+// the inode_t itself, the nonce issued to that replica, and the replica
+// view of the hard and file lock states.
+class CInodeDiscover {
+
+  inode_t inode;
+  int replica_nonce;
+
+  int hardlock_state;
+  int filelock_state;
+
+ public:
+  // Empty object, to be filled in by _decode().  Scalar members are zeroed
+  // so that nothing indeterminate can be read or encoded before a decode.
+  CInodeDiscover() :
+    replica_nonce(0),
+    hardlock_state(0),
+    filelock_state(0) {}
+
+  // Capture `in` for a replica that was issued `nonce`.
+  CInodeDiscover(CInode *in, int nonce) :
+    inode(in->inode),
+    replica_nonce(nonce),
+    hardlock_state(in->hardlock.get_replica_state()),
+    filelock_state(in->filelock.get_replica_state()) {}
+
+  inodeno_t get_ino() { return inode.ino; }
+  int get_replica_nonce() { return replica_nonce; }
+
+  // Apply the captured state to (replica) inode `in`.
+  void update_inode(CInode *in) {
+    in->inode = inode;
+
+    in->replica_nonce = replica_nonce;
+    in->hardlock.set_state(hardlock_state);
+    in->filelock.set_state(filelock_state);
+  }
+
+  // Append the raw bytes of inode, replica_nonce, hardlock_state and
+  // filelock_state (in that order) to `bl`.
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&replica_nonce, sizeof(replica_nonce));
+    bl.append((char*)&hardlock_state, sizeof(hardlock_state));
+    bl.append((char*)&filelock_state, sizeof(filelock_state));
+  }
+
+  // Inverse of _encode(): read the four fields from `bl` starting at `off`,
+  // advancing `off`.  Sizes are taken from the members themselves so they
+  // cannot drift from what _encode() wrote.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce);
+    off += sizeof(replica_nonce);
+    bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state);
+    off += sizeof(hardlock_state);
+    bl.copy(off, sizeof(filelock_state), (char*)&filelock_state);
+    off += sizeof(filelock_state);
+  }
+
+};
+
+
+// export
+
+// Full state of an inode being handed from one mds to another: the inode,
+// its popularity counters, dirty flag, replica set with nonces, both locks
+// and all client capabilities.  Constructing one from a CInode strips the
+// popularity and client caps from that inode.
+class CInodeExport {
+
+  // fixed-size part; encoded and decoded as one raw byte image
+  struct {
+    inode_t inode;
+    meta_load_t popularity_justme;
+    meta_load_t popularity_curdom;
+    bool is_dirty;   // dirty inode?
+
+    int num_caps;    // filled in by _encode(); number of cap_map entries
+  } st;
+
+  set<int> cached_by;
+  map<int,int> cached_by_nonce;
+  map<int,Capability> cap_map;
+
+  CLock hardlock,filelock;
+  //int remaining_issued;
+
+public:
+  CInodeExport() {}
+
+  // Capture (and partially drain) the state of `in`.
+  CInodeExport(CInode *in) {
+    st.inode = in->inode;
+    st.is_dirty = in->is_dirty();
+
+    cached_by = in->cached_by;
+    cached_by_nonce = in->cached_by_nonce;
+
+    hardlock = in->hardlock;
+    filelock = in->filelock;
+
+    // take JUSTME/CURDOM popularity away from the inode, and remove the
+    // CURDOM share from its ANYDOM/NESTED counters
+    st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] );
+    st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] );
+    in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+
+    // steal WRITER caps from inode
+    in->take_client_caps(cap_map);
+    //remaining_issued = in->get_caps_issued();
+  }
+  ~CInodeExport() {
+  }
+
+  inodeno_t get_ino() { return st.inode.ino; }
+
+  // Install the carried state into `in`.  Client ids whose caps arrived
+  // with the export are added to `new_client_caps`.
+  void update_inode(CInode *in, set<int>& new_client_caps) {
+    in->inode = st.inode;
+
+    // CURDOM popularity is credited to CURDOM, ANYDOM and NESTED alike
+    in->popularity[MDS_POP_JUSTME] += st.popularity_justme;
+    in->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+
+    if (st.is_dirty) {
+      in->mark_dirty();
+    }
+
+    // replace the replica set wholesale; pin if it is non-empty
+    in->cached_by.clear();
+    in->cached_by = cached_by;
+    in->cached_by_nonce = cached_by_nonce;
+    if (!cached_by.empty())
+      in->get(CINODE_PIN_CACHED);
+
+    in->hardlock = hardlock;
+    in->filelock = filelock;
+
+    // caps
+    in->merge_client_caps(cap_map, new_client_caps);
+  }
+
+  // Wire layout: raw `st`, cached_by, cached_by_nonce, hardlock state,
+  // filelock state, then st.num_caps (client id, Capability) pairs.
+  void _encode(bufferlist& bl) {
+    st.num_caps = cap_map.size();
+    bl.append((char*)&st, sizeof(st));
+
+    // cached_by + nonce
+    ::_encode(cached_by, bl);
+    ::_encode(cached_by_nonce, bl);
+
+    hardlock.encode_state(bl);
+    filelock.encode_state(bl);
+
+    // caps
+    map<int,Capability>::iterator p = cap_map.begin();
+    while (p != cap_map.end()) {
+      bl.append((char*)&p->first, sizeof(p->first));
+      p->second._encode(bl);
+      p++;
+    }
+  }
+
+  // Inverse of _encode().  Reads from `bl` starting at `off` and returns the
+  // offset just past the consumed bytes.
+  int _decode(bufferlist& bl, int off = 0) {
+    bl.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    ::_decode(cached_by, bl, off);
+    ::_decode(cached_by_nonce, bl, off);
+
+    hardlock.decode_state(bl, off);
+    filelock.decode_state(bl, off);
+
+    // caps: each entry is a client id followed by its Capability
+    for (int left = st.num_caps; left > 0; --left) {
+      int client;
+      bl.copy(off, sizeof(client), (char*)&client);
+      off += sizeof(client);
+      cap_map[client]._decode(bl, off);
+    }
+
+    return off;
+  }
+};
+
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __CAPABILITY_H
+#define __CAPABILITY_H
+
+#include "include/buffer.h"
+
+#include <map>
+using namespace std;
+
+#include "config.h"
+
+
+// definite caps
+// Each capability is a distinct bit so that sets of caps can be combined
+// with | and tested with & (see cap_string() and Capability).
+#define CAP_FILE_RDCACHE 1 // client can safely cache reads
+#define CAP_FILE_RD 2 // client can read
+#define CAP_FILE_WR 4 // client can write
+#define CAP_FILE_WREXTEND 8 // client can extend file
+#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes
+#define CAP_FILE_LAZYIO 32 // client can perform lazy io
+
+
+// heuristics
+// (disabled; note the value 32 is now used by CAP_FILE_LAZYIO)
+//#define CAP_FILE_DELAYFLUSH 32
+
+// Render a CAP_FILE_* bitmask as a human-readable string such as
+// "[ rd wr ]"; an empty mask yields "[ ]".
+// Each flag is reported only when its own bit is set: " wrextend" is tied to
+// CAP_FILE_WREXTEND (it used to be printed whenever CAP_FILE_WRBUFFER was
+// set, and never for WREXTEND itself).
+inline string cap_string(int cap)
+{
+  string s;
+  s = "[";
+  if (cap & CAP_FILE_RDCACHE) s += " rdcache";
+  if (cap & CAP_FILE_RD) s += " rd";
+  if (cap & CAP_FILE_WR) s += " wr";
+  if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer";
+  if (cap & CAP_FILE_WREXTEND) s += " wrextend";
+  if (cap & CAP_FILE_LAZYIO) s += " lazyio";
+  s += " ]";
+  return s;
+}
+
+
+// Per-client capability state for one inode: what the client wants, and a
+// history (by sequence number) of the cap sets issued to it that have not
+// yet been superseded by a confirmed receipt.
+class Capability {
+  int wanted_caps;              // what the client wants (ideally)
+
+  map<long, int> cap_history;   // seq -> cap
+  long last_sent, last_recv;    // newest seq issued / newest seq confirmed
+
+  bool suppress;
+
+  // caps recorded under `seq`, or 0 if there is no such entry (never inserts)
+  int history_at(long seq) {
+    map<long,int>::iterator p = cap_history.find(seq);
+    if (p == cap_history.end())
+      return 0;
+    return p->second;
+  }
+
+public:
+  Capability(int want=0) :
+    wanted_caps(want),
+    last_sent(0),
+    last_recv(0),
+    suppress(false) {
+    //cap_history[last_sent] = 0;
+  }
+
+
+  bool is_suppress() { return suppress; }
+  void set_suppress(bool b) { suppress = b; }
+
+  // null == no history at all
+  bool is_null() { return cap_history.empty(); }
+
+  // most recently issued caps.
+  int pending() {
+    return history_at(last_sent);
+  }
+
+  // caps client has confirmed receipt of
+  int confirmed() {
+    return history_at(last_recv);
+  }
+
+  // caps potentially issued: union of every history entry with a seq in
+  // [last_recv, last_sent]
+  int issued() {
+    int c = 0;
+    for (long seq = last_recv; seq <= last_sent; seq++) {
+      map<long,int>::iterator p = cap_history.find(seq);
+      if (p == cap_history.end())
+        continue;
+      c |= p->second;
+      dout(10) << " cap issued: seq " << seq << " " << cap_string(p->second) << " -> " << cap_string(c) << endl;
+    }
+    return c;
+  }
+
+  // caps this client wants to hold
+  int wanted() { return wanted_caps; }
+  void set_wanted(int w) {
+    wanted_caps = w;
+  }
+
+  // needed
+  static int needed(int from) {
+    // strip out wrbuffer, rdcache
+    const int keep = CAP_FILE_WR|CAP_FILE_RD;
+    return from & keep;
+  }
+  int needed() { return needed(wanted_caps); }
+
+  // conflicts: the caps that cannot coexist with those in `from`
+  static int conflicts(int from) {
+    int c = 0;
+    if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD;
+    if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE;
+    if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER;
+    if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR;
+    return c;
+  }
+  int wanted_conflicts() { return conflicts(wanted()); }
+  int needed_conflicts() { return conflicts(needed()); }
+  int issued_conflicts() { return conflicts(issued()); }
+
+  // issue caps; return seq number.
+  long issue(int c) {
+    //int was = pending();
+    //no! if (c == was && last_sent) return -1; // repeat of previous?
+
+    const long seq = ++last_sent;
+    cap_history[seq] = c;
+
+    /* no!
+    // not recalling, just adding?
+    if (c & ~was &&
+        cap_history.count(last_sent-1)) {
+      cap_history.erase(last_sent-1);
+    }
+    */
+    return seq;
+  }
+  long get_last_seq() { return last_sent; }
+
+  // fold another Capability for the same client into this one
+  void merge(Capability& other) {
+    // issued + pending
+    int newpending = other.pending() | pending();
+    // if `other` may have issued something beyond the combined pending set,
+    // record that wider set first, then settle on the combined pending set
+    if (other.issued() & ~newpending) {
+      issue(other.issued() | newpending);
+    }
+    issue(newpending);
+
+    // wanted
+    wanted_caps |= other.wanted();
+  }
+
+  // confirm receipt of a previous sent/issued seq.
+  // `caps` is what the client says it now holds for `seq`.  Returns the
+  // union of caps that were dropped: everything recorded under seqs older
+  // than `seq`, plus whatever `seq` itself lost.
+  int confirm_receipt(long seq, int caps) {
+    int r = 0;
+
+    // old seqs
+    for ( ; last_recv < seq; ++last_recv) {
+      dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl;
+      r |= cap_history[last_recv];
+      cap_history.erase(last_recv);
+    }
+
+    // release current?
+    map<long,int>::iterator cur = cap_history.find(seq);
+    if (cur != cap_history.end() && cur->second != caps) {
+      dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cur->second) << " -> " << cap_string(caps) << endl;
+      // note what we're releasing..
+      assert(cur->second & ~caps);
+      r |= cur->second & ~caps;
+
+      cur->second = caps;   // confirmed() now less than before..
+    }
+
+    // null?
+    const bool only_this_seq = cap_history.size() == 1 && cap_history.count(seq);
+    if (caps == 0 && only_this_seq) {
+      cap_history.clear();   // viola, null!
+    }
+
+    return r;
+  }
+
+  // serializers
+  // raw wanted_caps, last_sent, last_recv, then the history map
+  // (`suppress` is not part of the encoding)
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&wanted_caps, sizeof(wanted_caps));
+    bl.append((char*)&last_sent, sizeof(last_sent));
+    bl.append((char*)&last_recv, sizeof(last_recv));
+    ::_encode(cap_history, bl);
+  }
+  // inverse of _encode(); advances `off`
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps);
+    off += sizeof(wanted_caps);
+    bl.copy(off, sizeof(last_sent), (char*)&last_sent);
+    off += sizeof(last_sent);
+    bl.copy(off, sizeof(last_recv), (char*)&last_recv);
+    off += sizeof(last_recv);
+    ::_decode(cap_history, bl, off);
+  }
+
+};
+
+
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __CLIENTMAP_H
+#define __CLIENTMAP_H
+
+#include "msg/Message.h"
+
+#include <set>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+// Tracks which clients this mds knows about.  Every mount and every open
+// holds one reference on the client; the client's entity_inst_t is kept for
+// as long as at least one reference exists.
+class ClientMap {
+  hash_map<int,entity_inst_t> client_inst;   // client -> instance
+  set<int> client_mount;                     // clients that are mounted
+  hash_map<int, int> client_ref;             // client -> reference count
+
+  // take one reference; a new client has its instance recorded, a known one
+  // must present the same instance (asserted)
+  void inc_ref(int client, const entity_inst_t& inst) {
+    hash_map<int,entity_inst_t>::iterator known = client_inst.find(client);
+    if (known == client_inst.end()) {
+      client_inst[client] = inst;
+    } else {
+      assert(known->second == inst);
+      assert(client_ref.count(client));
+    }
+    ++client_ref[client];
+  }
+  // drop one reference; the last one forgets the client entirely
+  void dec_ref(int client) {
+    assert(client_ref.count(client));
+    int& refs = client_ref[client];
+    assert(refs > 0);
+    if (--refs != 0)
+      return;
+    client_ref.erase(client);
+    client_inst.erase(client);
+  }
+
+public:
+  // instance recorded for a known client (asserted)
+  const entity_inst_t& get_inst(int client) {
+    assert(client_inst.count(client));
+    return client_inst[client];
+  }
+  const set<int>& get_mount_set() { return client_mount; }
+
+  // mounts: one reference plus membership in the mount set
+  void add_mount(int client, const entity_inst_t& inst) {
+    inc_ref(client, inst);
+    client_mount.insert(client);
+  }
+  void rem_mount(int client) {
+    dec_ref(client);
+    client_mount.erase(client);
+  }
+
+
+  // opens: one reference each
+  void add_open(int client, const entity_inst_t& inst) {
+    inc_ref(client, inst);
+  }
+  void dec_open(int client) {
+    dec_ref(client);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#define DBLEVEL 20
+
+#include "IdAllocator.h"
+#include "MDS.h"
+#include "MDLog.h"
+#include "events/EAlloc.h"
+
+#include "osdc/Filer.h"
+
+#include "include/types.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: "
+
+
+// (Re)build the in-memory inode that names this allocator's on-disk table:
+// everything zeroed, except the inode number (MDS_INO_IDS_OFFSET plus this
+// MDS's node id) and the file layout (g_OSD_FileLayout).
+void IdAllocator::init_inode()
+{
+  // start from an all-zero inode, then fill in the two fields we set
+  memset(&inode, 0, sizeof(inode));
+
+  inode.layout = g_OSD_FileLayout;
+  inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid();
+}
+
+
+// Hand out one id from the free set.
+//
+// Takes the id reported by free.start(), removes it from the free set and
+// bumps the table version.  Unless replay is set, an EAlloc(EALLOC_EV_ALLOC)
+// entry carrying the new version is submitted to the MDS log.
+// The allocator must be active.  Returns the allocated id.
+idno_t IdAllocator::alloc_id(bool replay)
+{
+  assert(is_active());
+
+  // pick one
+  idno_t id = free.start();
+  free.erase(id);
+  dout(10) << "idalloc " << this << ": alloc id " << id << endl;
+
+  ++version;
+
+  // with replay set, nothing is logged
+  if (replay)
+    return id;
+
+  // log it
+  mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version));
+  return id;
+}
+
+// Return id to the free set.
+//
+// Inserts id into the free set and bumps the table version.  Unless replay
+// is set, an EAlloc(EALLOC_EV_FREE) entry carrying the new version is
+// submitted to the MDS log.  The allocator must be active.
+void IdAllocator::reclaim_id(idno_t id, bool replay)
+{
+  assert(is_active());
+
+  dout(10) << "idalloc " << this << ": reclaim id " << id << endl;
+  free.insert(id);
+
+  ++version;
+
+  // with replay set, nothing is logged
+  if (replay)
+    return;
+
+  mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version));
+}
+
+
+
+// Completion context for an asynchronous table write: when finished, it
+// calls IdAllocator::save_2() with the version it was constructed with.
+// The result code passed to finish() is ignored.
+class C_ID_Save : public Context {
+  IdAllocator *allocator;
+  version_t saved_version;
+public:
+  C_ID_Save(IdAllocator *a, version_t ver)
+    : allocator(a), saved_version(ver) {}
+
+  void finish(int r) {
+    allocator->save_2(saved_version);
+  }
+};
+
+// Write the allocation table out asynchronously.
+//
+//   onfinish  optional completion; may be null.
+//   v         version the caller needs on disk; 0 means "no particular
+//             version, write the current state".
+//
+// If a write covering v is already in flight (0 < v <= committing_version)
+// no new write is started; onfinish is parked under v and completed by
+// save_2().  Otherwise the current version and the free map are encoded and
+// handed to the filer, committing_version is advanced to the current
+// version, and onfinish is parked under that version.
+//
+// A null onfinish is never queued on either path, so save_2() cannot end up
+// passing a null Context to finish_contexts().
+void IdAllocator::save(Context *onfinish, version_t v)
+{
+  if (v > 0 && v <= committing_version) {
+    dout(10) << "save v " << version << " - already saving "
+	     << committing_version << " >= needed " << v << endl;
+    if (onfinish)
+      waitfor_save[v].push_back(onfinish);
+    return;
+  }
+
+  dout(10) << "save v " << version << endl;
+  assert(is_active());
+
+  // payload: raw version, followed by the encoded free map
+  bufferlist bl;
+
+  bl.append((char*)&version, sizeof(version));
+  ::_encode(free.m, bl);
+
+  committing_version = version;
+
+  if (onfinish)
+    waitfor_save[version].push_back(onfinish);
+
+  // write (async); C_ID_Save calls save_2(version) on completion
+  mds->filer->write(inode,
+		    0, bl.length(), bl,
+		    0,
+		    0, new C_ID_Save(this, version));
+}
+
+// Completion half of save(): version v has been written.
+//
+// Records v as the committed version and completes (with result 0) every
+// context parked in waitfor_save under a version <= v.
+void IdAllocator::save_2(version_t v)
+{
+  dout(10) << "save_2 v " << v << endl;
+
+  committed_version = v;
+
+  // waitfor_save is ordered by version: peel entries off the front for as
+  // long as they are covered by v
+  list<Context*> finished;
+  while (!waitfor_save.empty() &&
+	 waitfor_save.begin()->first <= v) {
+    finished.splice(finished.end(), waitfor_save.begin()->second);
+    waitfor_save.erase(waitfor_save.begin());
+  }
+
+  finish_contexts(finished, 0);
+}
+
+
+// Reinitialize the table to a fresh state: rebuild the table inode, empty
+// the free set, seed it with a range derived from this MDS's node id, and
+// mark the allocator active.
+void IdAllocator::reset()
+{
+  init_inode();
+
+  free.clear();
+
+  // use generic range FIXME THIS IS CRAP
+  // each node id n gets the arguments  unit*(n+1)  and  unit*(n+2) - 1
+  const long long unit = 0x1000000;
+  free.insert(unit * (long long)(mds->get_nodeid()+1),
+	      unit * (long long)(mds->get_nodeid()+2) - 1LL);
+
+  state = STATE_ACTIVE;
+}
+
+
+
+// -----------------------
+
+// Completion context for the asynchronous table read started by
+// IdAllocator::load().  The read fills bl; finish() forwards the result
+// code, the buffer and the caller's completion to IdAllocator::load_2().
+class C_ID_Load : public Context {
+public:
+  IdAllocator *ida;
+  Context *onfinish;   // caller's completion, passed through to load_2()
+  bufferlist bl;       // destination buffer for the read
+
+  C_ID_Load(IdAllocator *allocator, Context *fin)
+    : ida(allocator), onfinish(fin) {}
+
+  void finish(int r) {
+    ida->load_2(r, bl, onfinish);
+  }
+};
+
+// Start loading the table from disk.
+//
+// Rebuilds the table inode, moves the allocator from "undef" to "opening",
+// and issues an asynchronous read of inode.layout.stripe_size bytes at
+// offset 0.  load_2() runs when the read completes and is handed onfinish.
+void IdAllocator::load(Context *onfinish)
+{
+  dout(10) << "load" << endl;
+
+  init_inode();
+
+  assert(is_undef());
+  state = STATE_OPENING;
+
+  // the completion object owns the buffer the read lands in
+  C_ID_Load *fin = new C_ID_Load(this, onfinish);
+  bufferlist *dest = &fin->bl;
+  mds->filer->read(inode,
+		   0, inode.layout.stripe_size,
+		   dest,
+		   fin);
+}
+
+// Completion half of load().
+//
+//   r         result of the read; > 0 means data was returned.
+//   bl        the bytes read: raw version followed by the encoded free map.
+//   onfinish  optional completion; finished with 0 and deleted here.
+//
+// Marks the allocator active.  With data, decodes version and free map and
+// records the version as committed.  Without data, asserts (and, if
+// asserts are compiled out, falls back to reset()).
+void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish)
+{
+  assert(is_opening());
+  state = STATE_ACTIVE;
+
+  if (r <= 0) {
+    dout(10) << "load_2 found no alloc file" << endl;
+    assert(0); // this shouldn't happen if mkfs finished.
+    reset();
+  } else {
+    dout(10) << "load_2 got " << bl.length() << " bytes" << endl;
+    int pos = 0;
+    bl.copy(pos, sizeof(version), (char*)&version);
+    pos += sizeof(version);
+    ::_decode(free.m, bl, pos);
+    committed_version = version;
+  }
+
+  if (!onfinish)
+    return;
+
+  onfinish->finish(0);
+  delete onfinish;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __IDALLOCATOR_H
+#define __IDALLOCATOR_H
+
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+
+class MDS;
+
+#define IDTYPE_INO 1
+typedef inodeno_t idno_t;
+
+// Per-MDS allocator of ids (idno_t), backed by a table object on disk.
+//
+// The set of unused ids is kept in `free`.  `version` is the in-memory
+// table version; `committing_version` and `committed_version` track the
+// version being written and the last version known written by save() /
+// save_2().
+class IdAllocator {
+  MDS *mds;
+  inode_t inode;   // inode of the on-disk table, filled in by init_inode()
+
+  // life-cycle states
+  static const int STATE_UNDEF   = 0;
+  static const int STATE_OPENING = 1;
+  static const int STATE_ACTIVE  = 2;
+  //static const int STATE_COMMITTING = 3;
+  int state;
+
+  version_t version, committing_version, committed_version;
+
+  interval_set<idno_t> free;   // unused ids
+
+  // completions parked by save(), keyed by the version they wait for
+  map<version_t, list<Context*> > waitfor_save;
+
+ public:
+  IdAllocator(MDS *owner)
+    : mds(owner),
+      state(STATE_UNDEF),
+      version(0), committing_version(0), committed_version(0)
+  {
+  }
+
+  void init_inode();
+
+  // alloc or reclaim ids
+  idno_t alloc_id(bool replay=false);
+  void reclaim_id(idno_t id, bool replay=false);
+
+  version_t get_version() { return version; }
+  version_t get_committed_version() { return committed_version; }
+
+  // load/save from disk (hack)
+  bool is_undef()   { return STATE_UNDEF == state; }
+  bool is_active()  { return STATE_ACTIVE == state; }
+  bool is_opening() { return STATE_OPENING == state; }
+
+  void reset();
+  void save(Context *onfinish=0, version_t need=0);
+  void save_2(version_t v);
+
+  // write the table out once more, but only if it was ever activated
+  void shutdown() {
+    if (!is_active())
+      return;
+    save(0);
+  }
+
+  void load(Context *onfinish);
+  void load_2(int, bufferlist&, Context *onfinish);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __LOCK_H
+#define __LOCK_H
+
+#include <assert.h>
+#include <set>
+using namespace std;
+
+#include "include/buffer.h"
+
+#include "Capability.h"
+
+// states and such.
+// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio
+
+// basic lock -----auth-------- ---replica-------
+#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . L stat()
+#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate()
+#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . .
+
+// file lock states
+#define LOCK_GLOCKL 3 // A . . / . . . . . . loner -> lock
+#define LOCK_GLOCKM 4 // A . . / . . . . . .
+#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L
+#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L
+#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed
+
+#define LOCK_LONER 8 // A . . / C R W A B L (lock)
+#define LOCK_GLONERR 9 // A . . / . R . . . L
+#define LOCK_GLONERM 10 // A . . / . R W A . L
+
+#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow...
+#define LOCK_GSYNCM 12 // A . . / . R . . . L
+
+// 4 stable
+// +9 transition
+// 13 total
+
+/* no append scenarios:
+
+loner + truncate():
+ - loner needs to lose A (?unless it's the loner doing the truncate?)
+loner + statlite(size):
+ - loner needs to lose A
+
+any + statlite(size)
+ - all lose A
+
+any + statlite(mtime)
+ - all lose W
+
+-> we need to add lonerfixed and mixedfixed states (and associated transitions)
+ in order to efficiently support statlite(size) and truncate(). until then,
+ we have to LOCK.
+
+ */
+
+// -- lock... hard or file
+
+// State of one metadata lock (used for both the "hard" and the "file" lock
+// of an inode): the current LOCK_* state, the set of nodes still being
+// gathered from, and read/write reference counts.
+class CLock {
+ protected:
+  // lock state
+  char     state;        // one of the LOCK_* values
+  set<int> gather_set;   // auth
+  int      nread;        // outstanding read references
+  int      nwrite;       // outstanding write references
+
+
+ public:
+  // A new lock starts in LOCK_LOCK with no references.
+  CLock() : state(LOCK_LOCK), nread(0), nwrite(0) {}
+
+  // encode/decode
+  // Append state, nread, nwrite (raw bytes) and then gather_set to bl.
+  void encode_state(bufferlist& bl) {
+    bl.append(reinterpret_cast<char*>(&state), sizeof(state));
+    bl.append(reinterpret_cast<char*>(&nread), sizeof(nread));
+    bl.append(reinterpret_cast<char*>(&nwrite), sizeof(nwrite));
+
+    _encode(gather_set, bl);
+  }
+
+  // Inverse of encode_state(): read the same fields from bl at offset off,
+  // advancing off past them.
+  void decode_state(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(state), reinterpret_cast<char*>(&state));
+    off += sizeof(state);
+
+    bl.copy(off, sizeof(nread), reinterpret_cast<char*>(&nread));
+    off += sizeof(nread);
+
+    bl.copy(off, sizeof(nwrite), reinterpret_cast<char*>(&nwrite));
+    off += sizeof(nwrite);
+
+    _decode(gather_set, bl, off);
+  }
+
+  char get_state() { return state; }
+
+  // Switch to state s and return it.
+  char set_state(char s) {
+    state = s;
+    // gather should be empty in stable states.
+    assert(!is_stable() || gather_set.empty());
+    return s;
+  }
+
+  // State a replica should be in, given the auth's current state.
+  char get_replica_state() {
+    switch (state) {
+    case LOCK_SYNC:
+      return LOCK_SYNC;
+
+    case LOCK_MIXED:
+    case LOCK_GMIXEDR:
+    case LOCK_GSYNCM:      // after gather auth will be LOCK_AC_MIXED or whatever
+      return LOCK_MIXED;
+
+    case LOCK_LOCK:
+    case LOCK_GLOCKM:
+    case LOCK_GLOCKL:
+    case LOCK_GLOCKR:
+    case LOCK_LONER:
+    case LOCK_GLONERR:
+    case LOCK_GLONERM:
+    case LOCK_GSYNCL:
+    case LOCK_GMIXEDL:     // ** LOCK isn't exact right state, but works.
+      return LOCK_LOCK;
+
+    default:
+      assert(0);
+    }
+    return 0;
+  }
+
+  // gather set
+  set<int>& get_gather_set() { return gather_set; }
+  void init_gather(set<int>& i) {
+    gather_set = i;
+  }
+  bool is_gathering(int i) {
+    return gather_set.count(i) != 0;
+  }
+  void clear_gather() {
+    gather_set.clear();
+  }
+
+  // ref counting: get_* takes a reference and returns the new count,
+  // put_* drops one (asserting there is one to drop) and returns the rest.
+  int get_read() {
+    nread++;
+    return nread;
+  }
+  int put_read() {
+    assert(nread>0);
+    nread--;
+    return nread;
+  }
+  int get_nread() { return nread; }
+
+  int get_write() {
+    nwrite++;
+    return nwrite;
+  }
+  int put_write() {
+    assert(nwrite>0);
+    nwrite--;
+    return nwrite;
+  }
+  int get_nwrite() { return nwrite; }
+
+  // true while any read or write reference is held
+  bool is_used() {
+    return (nwrite+nread) > 0;
+  }
+
+
+  // stable: SYNC, LOCK, MIXED and LONER; everything else is a transition
+  bool is_stable() {
+    switch (state) {
+    case LOCK_SYNC:
+    case LOCK_LOCK:
+    case LOCK_MIXED:
+    case LOCK_LONER:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  // read/write access
+  bool can_read(bool auth) {
+    if (!auth)
+      return state == LOCK_SYNC;
+    return state == LOCK_SYNC ||
+           state == LOCK_GMIXEDR ||
+           state == LOCK_GLOCKR ||
+           state == LOCK_LOCK;
+  }
+  bool can_read_soon(bool auth) {
+    return auth && state == LOCK_GLOCKL;
+  }
+
+  bool can_write(bool auth) {
+    return auth && state == LOCK_LOCK;
+  }
+  bool can_write_soon(bool auth) {
+    if (!auth)
+      return false;
+    return state == LOCK_GLOCKR ||
+           state == LOCK_GLOCKL ||
+           state == LOCK_GLOCKM;
+  }
+
+  // client caps allowed
+  // Upper bound on the caps this node could ever issue: replicas are
+  // limited to the read-side caps, the auth may also issue the write ones.
+  int caps_allowed_ever(bool auth) {
+    int caps = CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+    if (auth)
+      caps |= CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER;
+    return caps;
+  }
+
+  // Caps that may be issued to clients in the current state.  Asserts on
+  // a state that has no entry for the given role.
+  int caps_allowed(bool auth) {
+    if (auth) {
+      switch (state) {
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+
+      case LOCK_GLOCKL:
+      case LOCK_GLOCKM:
+      case LOCK_GMIXEDL:
+        return 0;
+
+      case LOCK_MIXED:
+      case LOCK_GLONERM:
+        return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO;
+
+      case LOCK_GMIXEDR:
+      case LOCK_GLONERR:
+      case LOCK_GSYNCM:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+
+      case LOCK_LONER:  // single client writer, of course.
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO;
+
+      case LOCK_GSYNCL:
+        return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO;
+      }
+    } else {
+      switch (state) {
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+
+      case LOCK_GMIXEDR:
+      case LOCK_MIXED:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      }
+    }
+    assert(0);
+    return 0;
+  }
+
+  friend class MDCache;
+  friend class Locker;
+  friend class Migrator;
+};
+
+//ostream& operator<<(ostream& out, CLock& l);
+// Print a lock as "(<state>[ g=<gather set>][ <n>r][ <n>w])".
+//
+// The state name comes from a table indexed by the LOCK_* value.  A state
+// outside that table is printed as "unknown:<value>" instead of reading
+// past the end of the array.
+inline ostream& operator<<(ostream& out, CLock& l)
+{
+  // names indexed by LOCK_* value; const-correct so the string literals
+  // are not bound to a writable char*
+  static const char* const lock_state_names[] = {
+    "sync",
+    "lock",
+    "glockr",
+    "glockl",
+    "glockm",
+    "mixed",
+    "gmixedr",
+    "gmixedl",
+    "loner",
+    "glonerr",
+    "glonerm",
+    "gsyncl",
+    "gsyncm"
+  };
+  static const int num_lock_states =
+    sizeof(lock_state_names) / sizeof(lock_state_names[0]);
+
+  const int state = (int)l.get_state();
+
+  out << "(";
+  if (state >= 0 && state < num_lock_states)
+    out << lock_state_names[state];
+  else
+    out << "unknown:" << state;
+
+  if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
+
+  if (l.get_nread())
+    out << " " << l.get_nread() << "r";
+  if (l.get_nwrite())
+    out << " " << l.get_nwrite() << "w";
+
+  out << ")";
+  return out;
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "Server.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MDirUpdate.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".locker "
+
+
+
+// Route an incoming message to its handler by message type.
+// Any type other than the three handled here is a programming error.
+void Locker::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  case MSG_MDS_LOCK:
+    // locking
+    handle_lock((MLock*)m);
+    return;
+
+  case MSG_MDS_INODEFILECAPS:
+    // caps wanted by another mds
+    handle_inode_file_caps((MInodeFileCaps*)m);
+    return;
+
+  case MSG_CLIENT_FILECAPS:
+    // caps confirmed or released by a client
+    handle_client_file_caps((MClientFileCaps*)m);
+    return;
+
+  default:
+    assert(0);
+  }
+}
+
+
+
+
+// file i/o -----------------------------------------
+
+// Log the request and return the inode's current file_data_version.
+__uint64_t Locker::issue_file_data_version(CInode *in)
+{
+  dout(7) << "issue_file_data_version on " << *in << endl;
+
+  __uint64_t fdv = in->inode.file_data_version;
+  return fdv;
+}
+
+
+// Register (or widen) the requesting client's capability on `in` for an
+// open in the given file mode, and issue whatever the file lock allows.
+//
+//   in    inode being opened.
+//   mode  FILE_MODE_R / FILE_MODE_W bits; R wants RDCACHE|RD, W wants
+//         WRBUFFER|WR.
+//   req   the client request; supplies the client id and instance.
+//
+// Returns the client's Capability on `in`.  Cap messages to this client
+// are suppressed for the duration of the call.
+//
+// If the client already holds a capability, its wanted set is widened
+// whenever this open needs bits that are not yet wanted.
+Capability* Locker::issue_new_caps(CInode *in,
+                                   int mode,
+                                   MClientRequest *req)
+{
+  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl;
+
+  // my needs
+  int my_client = req->get_client();
+  int my_want = 0;
+  if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD;
+  if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR;
+
+  // register a capability
+  Capability *cap = in->get_client_cap(my_client);
+  if (!cap) {
+    // new cap
+    Capability c(my_want);
+    in->add_client_cap(my_client, c);
+    cap = in->get_client_cap(my_client);
+
+    // note client addr
+    mds->clientmap.add_open(my_client, req->get_client_inst());
+
+  } else if (my_want & ~cap->wanted()) {
+    // existing cap is missing something this open needs:
+    // augment wanted caps for this client
+    cap->set_wanted( cap->wanted() | my_want );
+  }
+
+  // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply)
+  cap->set_suppress(true);
+  int before = cap->pending();
+
+  if (in->is_auth()) {
+    // [auth] twiddle mode?
+    inode_file_eval(in);
+  } else {
+    // [replica] tell auth about any new caps wanted
+    request_inode_file_caps(in);
+  }
+
+  // issue caps (pot. incl new one)
+  issue_caps(in);  // note: _eval above may have done this already...
+
+  // re-issue whatever we can
+  cap->issue(cap->pending());
+
+  // ok, stop suppressing.
+  cap->set_suppress(false);
+
+  int now = cap->pending();
+
+  // FIXME FIXME FIXME: nothing is done yet for the case where the client
+  // did not have CAP_FILE_WR pending before and has it now.
+
+  // twiddle file_data_version?
+  if ((before & CAP_FILE_WRBUFFER) == 0 &&
+      (now & CAP_FILE_WRBUFFER)) {
+    in->inode.file_data_version++;
+    dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl;
+  }
+
+  return cap;
+}
+
+
+
+// Bring every client capability on `in` in line with what the file lock
+// currently allows.
+//
+// For each client whose issued caps differ from (wanted & allowed), the
+// new set is issued; file_data_version is bumped when a client newly gets
+// CAP_FILE_WRBUFFER pending; and, unless the capability is suppressed or
+// issue() returned a non-positive seq, an MClientFileCaps is sent.
+//
+// Returns true if nothing had to be (re)issued.
+bool Locker::issue_caps(CInode *in)
+{
+  // allowed caps are determined by the lock mode.
+  int allowed = in->filelock.caps_allowed(in->is_auth());
+  dout(7) << "issue_caps filelock allows=" << cap_string(allowed)
+          << " on " << *in << endl;
+
+  // number of capabilities that needed a change
+  int nissued = 0;
+
+  // client caps
+  map<int, Capability>::iterator it;
+  for (it = in->client_caps.begin(); it != in->client_caps.end(); ++it) {
+    Capability& cap = it->second;
+    const int client = it->first;
+
+    const int target = cap.wanted() & allowed;
+    if (cap.issued() == target)
+      continue;   // already as it should be
+
+    // issue
+    nissued++;
+
+    int before = cap.pending();
+    long seq = cap.issue(target);
+    int after = cap.pending();
+
+    // twiddle file_data_version?
+    if ((after & CAP_FILE_WRBUFFER) &&
+        !(before & CAP_FILE_WRBUFFER)) {
+      dout(7) << "   incrementing file_data_version for " << *in << endl;
+      in->inode.file_data_version++;
+    }
+
+    if (seq <= 0 || cap.is_suppress())
+      continue;   // nothing to tell the client right now
+
+    dout(7) << "   sending MClientFileCaps to client" << client << " seq " << cap.get_last_seq() << " new pending " << cap_string(cap.pending()) << " was " << cap_string(before) << endl;
+    mds->messenger->send_message(new MClientFileCaps(in->inode,
+                                                     cap.get_last_seq(),
+                                                     cap.pending(),
+                                                     cap.wanted()),
+                                 MSG_ADDR_CLIENT(client), mds->clientmap.get_inst(client),
+                                 0, MDS_PORT_LOCKER);
+  }
+
+  return (nissued == 0);  // true if no re-issued, no callbacks
+}
+
+
+
+// [replica] Tell the auth MDS which file caps our clients want on `in`,
+// if that differs from what we last told it.
+//
+// Dropping to "nothing wanted" is delayed by a grace period:
+//   - the first time wanted becomes 0, a deadline 2 seconds ahead is
+//     recorded in replica_caps_wanted_keep_until and nothing is sent;
+//   - while that deadline is still in the future, nothing is sent;
+//   - once it has passed, the deadline is cleared and the release is sent.
+// Any other outcome (wanted unchanged, or wanted non-zero) clears the
+// deadline.
+//
+// The deadline test used to be the wrong way round: it released while the
+// deadline was still ahead and waited indefinitely once it had passed.
+void Locker::request_inode_file_caps(CInode *in)
+{
+  int wanted = in->get_caps_wanted();
+
+  if (wanted == in->replica_caps_wanted) {
+    // nothing to report; forget any pending delayed release
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;
+    return;
+  }
+
+  if (wanted == 0) {
+    if (in->replica_caps_wanted_keep_until.sec() == 0) {
+      // no deadline yet: start the grace period
+      in->replica_caps_wanted_keep_until = g_clock.recent_now();
+      in->replica_caps_wanted_keep_until.sec_ref() += 2;
+
+      dout(7) << "request_inode_file_caps " << cap_string(wanted)
+              << " was " << cap_string(in->replica_caps_wanted)
+              << " keeping until " << in->replica_caps_wanted_keep_until
+              << " on " << *in
+              << endl;
+      return;
+    }
+
+    if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) {
+      // deadline not reached: wait longer
+      return;
+    }
+
+    // ok, release them finally!
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;
+    dout(7) << "request_inode_file_caps " << cap_string(wanted)
+            << " was " << cap_string(in->replica_caps_wanted)
+            << " no keeping anymore "
+            << " on " << *in
+            << endl;
+  } else {
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;
+  }
+
+  assert(!in->is_auth());
+
+  int auth = in->authority();
+  dout(7) << "request_inode_file_caps " << cap_string(wanted)
+          << " was " << cap_string(in->replica_caps_wanted)
+          << " on " << *in << " to mds" << auth << endl;
+
+  // remember what we told the auth, then tell it
+  in->replica_caps_wanted = wanted;
+  mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
+                                           in->replica_caps_wanted),
+                        auth, MDS_PORT_LOCKER);
+}
+
+// [auth] A replica MDS reports which file caps its clients want on an inode.
+//
+// If we are only a proxy for the inode, the message is forwarded to the
+// inode's authority (and not deleted here).  Otherwise the replica's entry
+// in mds_caps_wanted is set, or removed when it wants nothing, the file
+// lock is re-evaluated, and the message is deleted.
+void Locker::handle_inode_file_caps(MInodeFileCaps *m)
+{
+  CInode *in = mdcache->get_inode(m->get_ino());
+  assert(in);
+  assert(in->is_auth() || in->is_proxy());
+
+  dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
+
+  if (in->is_proxy()) {
+    dout(7) << "proxy, fw" << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER);
+    return;
+  }
+
+  // record (or forget) what this replica wants
+  if (!m->get_caps()) {
+    in->mds_caps_wanted.erase(m->get_from());
+  } else {
+    in->mds_caps_wanted[m->get_from()] = m->get_caps();
+  }
+
+  inode_file_eval(in);
+  delete m;
+}
+
+
+/*
+ * note: we only get these from the client if
+ * - we are calling back previously issued caps (fewer than the client previously had)
+ * - or if the client releases (any of) its caps on its own
+ */
+// A client confirms or releases file caps on an inode.
+//
+// Messages for an unknown inode, or from a client that holds no capability
+// on it, are logged and dropped.  Otherwise:
+//   - the client's wanted set is updated (limited to what this node could
+//     ever issue);
+//   - receipt is confirmed; if the capability is then null it is removed,
+//     the client's open reference is dropped, and a FILECAP_RELEASE is
+//     sent back;
+//   - a newer atime is merged in; if the client had or has CAP_FILE_WR, a
+//     newer mtime and a larger size are merged in and journaled;
+//   - the file lock is re-evaluated and CINODE_WAIT_CAPS waiters are run.
+// The message is deleted in every case.
+void Locker::handle_client_file_caps(MClientFileCaps *m)
+{
+  int client = m->get_source().num();
+  CInode *in = mdcache->get_inode(m->get_ino());
+
+  // inode not in cache
+  if (!in) {
+    dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl;
+    delete m;
+    return;
+  }
+
+  // inode known, but no capability for this client
+  Capability *cap = in->get_client_cap(client);
+  if (!cap) {
+    dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl;
+    delete m;
+    return;
+  }
+
+  // filter wanted based on what we could ever give out (given auth/replica status)
+  int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth());
+
+  dout(7) << "handle_client_file_caps seq " << m->get_seq()
+          << " confirms caps " << cap_string(m->get_caps())
+          << " wants " << cap_string(wanted)
+          << " from client" << client
+          << " on " << *in
+          << endl;
+
+  // update wanted
+  if (cap->wanted() != wanted) {
+    cap->set_wanted(wanted);
+  }
+
+  // confirm caps; had = value returned by confirm_receipt, has = confirmed now
+  int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
+  int has = cap->confirmed();
+
+  if (cap->is_null()) {
+    dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl;
+    in->remove_client_cap(client);   // cap must not be used past this point
+    if (!in->is_auth()) {
+      request_inode_file_caps(in);
+    }
+
+    // dec client addr counter
+    mds->clientmap.dec_open(client);
+
+    // tell client.
+    MClientFileCaps *release = new MClientFileCaps(in->inode,
+                                                   0, 0, 0,
+                                                   MClientFileCaps::FILECAP_RELEASE);
+    mds->messenger->send_message(release, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER);
+  }
+
+  // merge in atime?
+  if (m->get_inode().atime > in->inode.atime) {
+    dout(7) << "  taking atime " << m->get_inode().atime << " > "
+            << in->inode.atime << " for " << *in << endl;
+    in->inode.atime = m->get_inode().atime;
+  }
+
+  // mtime and size are only taken from a client that could write
+  if ((has|had) & CAP_FILE_WR) {
+    bool dirty = false;
+
+    // mtime
+    if (m->get_inode().mtime > in->inode.mtime) {
+      dout(7) << "  taking mtime " << m->get_inode().mtime << " > "
+              << in->inode.mtime << " for " << *in << endl;
+      in->inode.mtime = m->get_inode().mtime;
+      dirty = true;
+    }
+
+    // size
+    if (m->get_inode().size > in->inode.size) {
+      dout(7) << "  taking size " << m->get_inode().size << " > "
+              << in->inode.size << " for " << *in << endl;
+      in->inode.size = m->get_inode().size;
+      dirty = true;
+    }
+
+    if (dirty) {
+      mds->mdlog->submit_entry(new EInodeUpdate(in));
+    }
+  }
+
+  // reevaluate, waiters
+  inode_file_eval(in);
+  in->finish_waiting(CINODE_WAIT_CAPS, 0);
+
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+// locks ----------------------------------------------------------------
+
+/*
+
+
+INODES:
+
+= two types of inode metadata:
+ hard - uid/gid, mode
+ file - mtime, size
+ ? atime - atime (*) <-- we want a lazy update strategy?
+
+= correspondingly, two types of inode locks:
+ hardlock - hard metadata
+ filelock - file metadata
+
+ -> These locks are completely orthogonal!
+
+= metadata ops and how they affect inode metadata:
+ sma=size mtime atime
+ HARD FILE OP
+ files:
+ R RRR stat
+ RW chmod/chown
+ R W touch ?ctime
+ R openr
+ W read atime
+ R openw
+ Wc openwc ?ctime
+ WW write size mtime
+ close
+
+ dirs:
+ R W readdir atime
+ RRR ( + implied stats on files)
+ Rc WW mkdir (ctime on new dir, size+mtime on parent dir)
+ R WW link/unlink/rename/rmdir (size+mtime on dir)
+
+
+
+= relationship to client (writers):
+
+ - ops in question are
+ - stat ... need reasonable value for mtime (+ atime?)
+ - maybe we want a "quicksync" type operation instead of full lock
+ - truncate ... need to stop writers for the atomic truncate operation
+ - need a full lock
+
+
+
+
+= modes
+ - SYNC
+ Rauth Rreplica Wauth Wreplica
+ sync
+
+
+
+
+
+ALSO:
+
+ dirlock - no dir changes (prior to unhashing)
+ denlock - dentry lock (prior to unlink, rename)
+
+
+*/
+
+
+// Route an MLock message to the handler for the kind of object it locks.
+// An unrecognized object type is logged and asserted on.
+void Locker::handle_lock(MLock *m)
+{
+  switch (m->get_otype()) {
+  case LOCK_OTYPE_IHARD:
+    // inode, hard metadata
+    handle_lock_inode_hard(m);
+    return;
+
+  case LOCK_OTYPE_IFILE:
+    // inode, file metadata
+    handle_lock_inode_file(m);
+    return;
+
+  case LOCK_OTYPE_DIR:
+    handle_lock_dir(m);
+    return;
+
+  case LOCK_OTYPE_DN:
+    handle_lock_dn(m);
+    return;
+
+  default:
+    dout(7) << "handle_lock got otype " << m->get_otype() << endl;
+    assert(0);
+    return;
+  }
+}
+
+
+
+// ===============================
+// hard inode metadata
+
+// Check whether the hard lock on `in` is readable right now.
+//
+// Returns true if it is; no read reference is taken.  Otherwise (which is
+// only expected on a replica) `con` is queued as a CINODE_WAIT_HARDR
+// waiter and false is returned.
+bool Locker::inode_hard_read_try(CInode *in, Context *con)
+{
+  dout(7) << "inode_hard_read_try on " << *in << endl;
+
+  const bool readable = in->hardlock.can_read(in->is_auth());
+  if (!readable) {
+    assert(!in->is_auth());
+
+    // wait!
+    dout(7) << "inode_hard_read_try waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, con);
+    return false;
+  }
+
+  return true;
+}
+
+// Begin a read of hard metadata on behalf of client request `m`.
+//
+// If the hard lock is readable, a read reference is taken and true is
+// returned; pair with inode_hard_read_finish().  Otherwise (which is only
+// expected on a replica) the request is queued for retry under
+// CINODE_WAIT_HARDR and false is returned.
+bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m)
+{
+  dout(7) << "inode_hard_read_start  on " << *in << endl;
+
+  const bool readable = in->hardlock.can_read(in->is_auth());
+  if (!readable) {
+    // can't read, and replicated.
+    assert(!in->is_auth());
+
+    // wait!
+    dout(7) << "inode_hard_read_start waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // readable: grab ref.
+  in->hardlock.get_read();
+  return true;
+}
+
+
+// Drop the read reference taken by inode_hard_read_start().
+// The lock is expected to still be readable at this point.
+void Locker::inode_hard_read_finish(CInode *in)
+{
+  assert(in->hardlock.can_read(in->is_auth()));
+
+  // drop ref
+  in->hardlock.put_read();
+
+  dout(7) << "inode_hard_read_finish on " << *in << endl;
+
+  // note: CINODE_WAIT_HARDNORD waiters are not woken here
+  //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD);
+}
+
+
+// Begin a write of hard metadata on behalf of client request `m`.
+//
+// Returns true when the caller may proceed: the inode has been auth-pinned
+// and a write reference taken (pair with inode_hard_write_finish()).
+// Returns false when the request has been parked or forwarded instead:
+//   - writable but not auth-pinnable: retry on CINODE_WAIT_AUTHPINNABLE;
+//   - auth, not yet writable: start locking unless a lock is already on
+//     its way, then retry on CINODE_WAIT_HARDW;
+//   - replica: forward the request to the inode's authority.
+bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m)
+{
+  dout(7) << "inode_hard_write_start  on " << *in << endl;
+
+  // if not replicated, i can twiddle lock at will
+  if (in->is_auth() &&
+      !in->is_cached_by_anyone() &&
+      in->hardlock.get_state() != LOCK_LOCK) {
+    in->hardlock.set_state(LOCK_LOCK);
+  }
+
+  // can write?  grab ref.
+  if (in->hardlock.can_write(in->is_auth())) {
+    assert(in->is_auth());
+
+    if (in->can_auth_pin()) {
+      in->auth_pin();  // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_*
+      in->hardlock.get_write();
+      return true;
+    }
+
+    dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // can't write, replicated.
+  if (!in->is_auth()) {
+    // replica
+    // fw to auth
+    int auth = in->authority();
+    dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl;
+    assert(auth != mds->get_nodeid());
+    mdcache->request_forward(m, auth);
+    return false;
+  }
+
+  // auth: unless the lock is already on its way, initiate it
+  if (!in->hardlock.can_write_soon(in->is_auth())) {
+    inode_hard_lock(in);
+  }
+
+  dout(7) << "inode_hard_write_start waiting on " << *in << endl;
+  in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in));
+
+  return false;
+}
+
+
+// Finish a hard-metadata write started by inode_hard_write_start():
+// drop the write reference and the auth pin.  When that was the last
+// writer, an unreplicated auth inode goes straight to LOCK_SYNC, and the
+// lock is re-evaluated.
+void Locker::inode_hard_write_finish(CInode *in)
+{
+  // drop ref
+  assert(in->hardlock.can_write(in->is_auth()));
+  in->hardlock.put_write();
+  in->auth_unpin();
+  dout(7) << "inode_hard_write_finish on " << *in << endl;
+
+  // other writers remain: keep the lock as it is
+  if (in->hardlock.get_nwrite() != 0)
+    return;
+
+  // auto-sync if alone.
+  if (in->is_auth() &&
+      !in->is_cached_by_anyone() &&
+      in->hardlock.get_state() != LOCK_SYNC) {
+    in->hardlock.set_state(LOCK_SYNC);
+  }
+
+  inode_hard_eval(in);
+}
+
+
+// Re-evaluate the hard lock on `in`.
+//
+// On the auth, a transitional lock whose gather set has drained is moved
+// on: GLOCKR becomes LOCK and the HARDRWB/HARDSTABLE waiters are run while
+// a temporary write reference is held.  Then, if the lock is stable, the
+// auth syncs it back out when the inode is replicated, has no writers and
+// is not already in SYNC.  Replicas do nothing here.
+void Locker::inode_hard_eval(CInode *in)
+{
+  // finished gather?
+  if (in->is_auth() &&
+      !in->hardlock.is_stable() &&
+      in->hardlock.gather_set.empty()) {
+    dout(7) << "inode_hard_eval finished gather on " << *in << endl;
+
+    if (in->hardlock.get_state() == LOCK_GLOCKR) {
+      in->hardlock.set_state(LOCK_LOCK);
+
+      // waiters
+      in->hardlock.get_write();
+      in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE);
+      in->hardlock.put_write();
+    } else {
+      // GLOCKR is the only transitional state handled here
+      assert(0);
+    }
+  }
+
+  if (!in->hardlock.is_stable())
+    return;
+
+  if (!in->is_auth())
+    return;   // replica
+
+  // sync?
+  if (in->is_cached_by_anyone() &&
+      in->hardlock.get_nwrite() == 0 &&
+      in->hardlock.get_state() != LOCK_SYNC) {
+    dout(7) << "inode_hard_eval stable, syncing " << *in << endl;
+    inode_hard_sync(in);
+  }
+}
+
+
+// mid
+
+// [auth] Move the hard lock from LOCK to SYNC.
+//
+// No-op if the lock is already in SYNC; any state other than LOCK is
+// asserted on.  The encoded hard state is sent in a LOCK_AC_SYNC message
+// to every MDS in cached_by, the lock is switched to SYNC, and
+// CINODE_WAIT_HARDSTABLE waiters are run.
+void Locker::inode_hard_sync(CInode *in)
+{
+  dout(7) << "inode_hard_sync on " << *in << endl;
+  assert(in->is_auth());
+
+  // check state
+  if (in->hardlock.get_state() == LOCK_SYNC)
+    return; // already sync
+  if (in->hardlock.get_state() == LOCK_GLOCKR)
+    assert(0); // um... hmm!
+  assert(in->hardlock.get_state() == LOCK_LOCK);
+
+  // hard data
+  bufferlist harddata;
+  in->encode_hard_state(harddata);
+
+  // bcast to replicas
+  set<int>::iterator replica = in->cached_by_begin();
+  while (replica != in->cached_by_end()) {
+    MLock *msg = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+    msg->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    msg->set_data(harddata);
+    mds->send_message_mds(msg, *replica, MDS_PORT_LOCKER);
+    replica++;
+  }
+
+  // change lock
+  in->hardlock.set_state(LOCK_SYNC);
+
+  // waiters?
+  in->finish_waiting(CINODE_WAIT_HARDSTABLE);
+}
+
+// [auth] Start moving the hard lock from SYNC towards LOCK.
+//
+// No-op if the lock is already LOCK or GLOCKR; any other state than SYNC
+// is asserted on.  A LOCK_AC_LOCK message is sent to every MDS in
+// cached_by, the lock enters GLOCKR, and the gather set is initialized
+// from get_cached_by().  The transition to LOCK itself is completed in
+// inode_hard_eval() once the gather set is empty.
+void Locker::inode_hard_lock(CInode *in)
+{
+  dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl;
+  assert(in->is_auth());
+
+  // check state
+  if (in->hardlock.get_state() == LOCK_LOCK)
+    return;  // already lock
+  if (in->hardlock.get_state() == LOCK_GLOCKR)
+    return;  // already locking
+  assert(in->hardlock.get_state() == LOCK_SYNC);
+
+  // bcast to replicas
+  set<int>::iterator replica = in->cached_by_begin();
+  while (replica != in->cached_by_end()) {
+    MLock *msg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    msg->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    mds->send_message_mds(msg, *replica, MDS_PORT_LOCKER);
+    replica++;
+  }
+
+  // change lock
+  in->hardlock.set_state(LOCK_GLOCKR);
+  in->hardlock.init_gather(in->get_cached_by());
+}
+
+
+
+
+
+// messenger
+
+// Handle an MLock message concerning an inode's hard lock.
+//
+// Routing:
+//   - action addressed to the auth: the inode must be known and we must be
+//     auth or proxy for it.  A proxy forwards the message to the new
+//     authority, or drops it if it came from that authority.
+//   - action addressed to a replica: if the inode is no longer cached the
+//     message is silently dropped (no nak).
+//
+// Actions:
+//   LOCK_AC_SYNC    (replica) take the hard state from the message, go to
+//                   SYNC, run HARDR/HARDSTABLE waiters.
+//   LOCK_AC_LOCK    (replica) go to LOCK and ack.  With readers present the
+//                   message is parked for retry instead; that path ends in
+//                   assert(0).
+//   LOCK_AC_LOCKACK (auth) remove the sender from the gather set and
+//                   re-evaluate the lock once the set is empty.
+//
+// The message is deleted unless it was forwarded or parked for retry.
+void Locker::handle_lock_inode_hard(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_IHARD);
+
+  if (mds->logger) mds->logger->inc("lih");
+
+  int from = m->get_asker();
+  CInode *in = mdcache->get_inode(m->get_ino());
+
+  if (!LOCK_AC_FOR_AUTH(m->get_action())) {
+    // replica
+    if (!in) {
+      dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl;
+      /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness
+         to keep gather_set a proper/correct subset of cached_by.  better to use the existing
+         cacheexpire mechanism instead!
+      */
+      delete m;
+      return;
+    }
+
+    assert(!in->is_auth());
+  } else {
+    // auth
+    assert(in);
+    assert(in->is_auth() || in->is_proxy());
+    dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl;
+
+    if (in->is_proxy()) {
+      // fw
+      int newauth = in->authority();
+      assert(newauth >= 0);
+      if (from != newauth) {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+        mds->send_message_mds(m, newauth, MDS_PORT_LOCKER);
+      } else {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+        delete m;
+      }
+      return;
+    }
+  }
+
+  dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl;
+
+  CLock& lock = in->hardlock;
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:
+    {
+      assert(lock.get_state() == LOCK_LOCK);
+
+      // assim data
+      int off = 0;
+      in->decode_hard_state(m->get_data(), off);
+
+      // update lock
+      lock.set_state(LOCK_SYNC);
+
+      // no need to reply
+
+      // waiters
+      in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE);
+    }
+    break;
+
+  case LOCK_AC_LOCK:
+    {
+      assert(lock.get_state() == LOCK_SYNC);
+      //||           lock.get_state() == LOCK_GLOCKR);
+
+      // wait for readers to finish?
+      if (lock.get_nread() > 0) {
+        dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl;
+        lock.set_state(LOCK_GLOCKR);
+        in->add_waiter(CINODE_WAIT_HARDNORD,
+                       new C_MDS_RetryMessage(mds,m));
+        assert(0);  // does this ever happen?  (if so, fix hard_read_finish, and CInodeExport.update_inode!)
+        return;     // m now belongs to the retry context
+      }
+
+      // update lock and reply
+      lock.set_state(LOCK_LOCK);
+
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+    {
+      assert(lock.state == LOCK_GLOCKR);
+      assert(lock.gather_set.count(from));
+      lock.gather_set.erase(from);
+
+      if (lock.gather_set.empty()) {
+        dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl;
+        inode_hard_eval(in);
+      } else {
+        dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock.gather_set << endl;
+      }
+    }
+    break;
+  }
+
+  delete m;
+}
+
+
+
+
+// =====================
+// soft inode metadata
+
+
+ // Try to take a read reference on the inode's filelock on behalf of
+ // client request m.
+ //
+ // Returns true if a read ref was taken (the caller is expected to drop it
+ // with inode_file_read_finish).  Returns false if the request was instead
+ // queued on the inode as a retry waiter or forwarded to the inode's
+ // authority.
+ bool Locker::inode_file_read_start(CInode *in, MClientRequest *m)
+ {
+ dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl;
+
+ // can read? grab ref.
+ if (in->filelock.can_read(in->is_auth())) {
+ in->filelock.get_read();
+ return true;
+ }
+
+ // can't read, and replicated.
+ if (in->filelock.can_read_soon(in->is_auth())) {
+ // wait
+ // (falls through to the CINODE_WAIT_FILER waiter at the bottom)
+ dout(7) << "inode_file_read_start can_read_soon " << *in << endl;
+ } else {
+ if (in->is_auth()) {
+ // auth
+
+ // FIXME or qsync?
+
+ if (in->filelock.is_stable()) {
+ inode_file_lock(in); // lock, bc easiest to back off
+
+ // if the lock attempt made the lock readable right away, take our ref now
+ if (in->filelock.can_read(in->is_auth())) {
+ in->filelock.get_read();
+
+ // hold a write ref while the waiters run, then drop it again
+ in->filelock.get_write();
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_write();
+ return true;
+ }
+ // otherwise fall through to the CINODE_WAIT_FILER waiter at the bottom
+ } else {
+ dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+ } else {
+ // replica
+ if (in->filelock.is_stable()) {
+
+ // fw to auth
+ int auth = in->authority();
+ dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mdcache->request_forward(m, auth);
+ return false;
+
+ } else {
+ // wait until stable
+ dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+ }
+ }
+
+ // wait
+ dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in));
+
+ return false;
+ }
+
+
+ // Drop one read reference on the inode's filelock.
+ //
+ // When the last reader goes away, wake anything waiting on
+ // CINODE_WAIT_FILENORD and re-evaluate the file lock state.
+ void Locker::inode_file_read_finish(CInode *in)
+ {
+   CLock &fl = in->filelock;
+
+   // the lock must still be readable while we are holding a ref on it
+   assert(fl.can_read(in->is_auth()));
+   fl.put_read();
+
+   dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+   // other readers remain: nothing else to do
+   if (fl.get_nread() != 0)
+     return;
+
+   in->finish_waiting(CINODE_WAIT_FILENORD);
+   inode_file_eval(in);
+ }
+
+
+ // Try to take a write reference on the inode's filelock on behalf of
+ // client request m.
+ //
+ // Returns true if a write ref was taken (the caller is expected to drop it
+ // with inode_file_write_finish).  Returns false if the request was queued
+ // on the inode as a retry waiter or, on a replica, forwarded to the
+ // inode's authority.
+ bool Locker::inode_file_write_start(CInode *in, MClientRequest *m)
+ {
+ // can write? grab ref.
+ if (in->filelock.can_write(in->is_auth())) {
+ in->filelock.get_write();
+ return true;
+ }
+
+ // can't write, replicated.
+ if (in->is_auth()) {
+ // auth
+ if (in->filelock.can_write_soon(in->is_auth())) {
+ // just wait
+ // (falls through to the CINODE_WAIT_FILEW waiter below)
+ } else {
+ if (!in->filelock.is_stable()) {
+ dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+
+ // initiate lock
+ inode_file_lock(in);
+
+ // if the lock attempt made the lock writable right away, take our ref now
+ if (in->filelock.can_write(in->is_auth())) {
+ in->filelock.get_write();
+
+ // hold a read ref while the waiters run, then drop it again
+ in->filelock.get_read();
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ return true;
+ }
+ }
+
+ dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ } else {
+ // replica
+ // fw to auth
+ int auth = in->authority();
+ dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mdcache->request_forward(m, auth);
+ return false;
+ }
+ }
+
+
+ // Drop one write reference on the inode's filelock.
+ //
+ // When the last writer goes away, wake anything waiting on
+ // CINODE_WAIT_FILENOWR and re-evaluate the file lock state.
+ void Locker::inode_file_write_finish(CInode *in)
+ {
+   CLock &fl = in->filelock;
+
+   // the lock must still be writable while we are holding a ref on it
+   assert(fl.can_write(in->is_auth()));
+   fl.put_write();
+   dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+   // other writers remain: keep the lock as it is
+   if (fl.get_nwrite() != 0)
+     return;
+
+   in->finish_waiting(CINODE_WAIT_FILENOWR);
+   inode_file_eval(in);
+ }
+
+
+/*
+ * ...
+ *
+ * also called after client caps are acked to us
+ * - checks if we're in unstable soft state and can now move on to next state
+ * - checks if soft state should change (eg bc last writer closed)
+ */
+
+ // Re-evaluate the file lock of an inode.  Three phases, in order:
+ //
+ //  1. [auth] if the lock is unstable and the mds gather set is empty, try
+ //     to complete the pending transition; each target state has its own
+ //     condition on the client caps still issued.  Caps are reissued
+ //     afterwards whether or not the transition completed.
+ //  2. [replica] if the lock is unstable, try to complete the pending
+ //     transition (GMIXEDR or GLOCKR only) and ack it to the authority.
+ //  3. if the lock is stable at this point and we are auth, pick a new
+ //     target state (loner, mixed, sync or lock) from the wanted caps, the
+ //     reader/writer counts and whether the inode is replicated.
+ void Locker::inode_file_eval(CInode *in)
+ {
+ // caps currently issued to clients; sampled once and used by phases 1 and 2
+ int issued = in->get_caps_issued();
+
+ // [auth] finished gather?
+ if (in->is_auth() &&
+ !in->filelock.is_stable() &&
+ in->filelock.gather_set.size() == 0) {
+ dout(7) << "inode_file_eval finished mds gather on " << *in << endl;
+
+ switch (in->filelock.get_state()) {
+ // to lock
+ case LOCK_GLOCKR:
+ case LOCK_GLOCKM:
+ case LOCK_GLOCKL:
+ // need every client cap back before we can lock
+ if (issued == 0) {
+ in->filelock.set_state(LOCK_LOCK);
+
+ // waiters
+ // (read and write refs are held while the waiters run, then dropped)
+ in->filelock.get_read();
+ in->filelock.get_write();
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ in->filelock.put_write();
+ }
+ break;
+
+ // to mixed
+ case LOCK_GMIXEDR:
+ // nothing but CAP_FILE_RD may remain issued
+ if ((issued & ~(CAP_FILE_RD)) == 0) {
+ in->filelock.set_state(LOCK_MIXED);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ case LOCK_GMIXEDL:
+ // nothing but CAP_FILE_WR may remain issued
+ if ((issued & ~(CAP_FILE_WR)) == 0) {
+ in->filelock.set_state(LOCK_MIXED);
+
+ if (in->is_cached_by_anyone()) {
+ // data
+ bufferlist softdata;
+ in->encode_file_state(softdata);
+
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ m->set_data(softdata);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ // to loner
+ case LOCK_GLONERR:
+ if (issued == 0) {
+ in->filelock.set_state(LOCK_LONER);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ case LOCK_GLONERM:
+ if ((issued & ~CAP_FILE_WR) == 0) {
+ in->filelock.set_state(LOCK_LONER);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ // to sync
+ case LOCK_GSYNCL:
+ case LOCK_GSYNCM:
+ // nothing but CAP_FILE_RD may remain issued
+ if ((issued & ~(CAP_FILE_RD)) == 0) {
+ in->filelock.set_state(LOCK_SYNC);
+
+ { // bcast data to replicas
+ bufferlist softdata;
+ in->encode_file_state(softdata);
+
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ reply->set_data(softdata);
+ mds->send_message_mds(reply, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ // waiters
+ in->filelock.get_read();
+ in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+
+ // reissue caps whether or not the transition completed
+ issue_caps(in);
+ }
+
+ // [replica] finished caps gather?
+ if (!in->is_auth() &&
+ !in->filelock.is_stable()) {
+ switch (in->filelock.get_state()) {
+ case LOCK_GMIXEDR:
+ if ((issued & ~(CAP_FILE_RD)) == 0) {
+ in->filelock.set_state(LOCK_MIXED);
+
+ // ack
+ MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ }
+ break;
+
+ case LOCK_GLOCKR:
+ if (issued == 0) {
+ in->filelock.set_state(LOCK_LOCK);
+
+ // ack
+ MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+ }
+
+ // !stable -> do nothing.
+ if (!in->filelock.is_stable()) return;
+
+
+ // stable.
+ assert(in->filelock.is_stable());
+
+ if (in->is_auth()) {
+ // [auth]
+ int wanted = in->get_caps_wanted();
+ // "loner": exactly one client holds caps and no other mds wants caps
+ bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty();
+ dout(7) << "inode_file_eval wanted=" << cap_string(wanted)
+ << " filelock=" << in->filelock
+ << " loner=" << loner
+ << endl;
+
+ // the checks below form one if/else-if chain: at most one transition
+ // is started per call.
+
+ // * -> loner?
+ if (in->filelock.get_nread() == 0 &&
+ in->filelock.get_nwrite() == 0 &&
+ (wanted & CAP_FILE_WR) &&
+ loner &&
+ in->filelock.get_state() != LOCK_LONER) {
+ dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_loner(in);
+ }
+
+ // * -> mixed?
+ else if (in->filelock.get_nread() == 0 &&
+ in->filelock.get_nwrite() == 0 &&
+ (wanted & CAP_FILE_RD) &&
+ (wanted & CAP_FILE_WR) &&
+ !(loner && in->filelock.get_state() == LOCK_LONER) &&
+ in->filelock.get_state() != LOCK_MIXED) {
+ dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_mixed(in);
+ }
+
+ // * -> sync?
+ else if (in->filelock.get_nwrite() == 0 &&
+ !(wanted & CAP_FILE_WR) &&
+ ((wanted & CAP_FILE_RD) ||
+ in->is_cached_by_anyone() ||
+ (!loner && in->filelock.get_state() == LOCK_LONER)) &&
+ in->filelock.get_state() != LOCK_SYNC) {
+ dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_sync(in);
+ }
+
+ // * -> lock? (if not replicated or open)
+ else if (!in->is_cached_by_anyone() &&
+ wanted == 0 &&
+ in->filelock.get_state() != LOCK_LOCK) {
+ inode_file_lock(in);
+ }
+
+ } else {
+ // replica
+ // recall? check waiters? XXX
+ }
+ }
+
+
+// mid
+
+ // Move the file lock of an auth inode toward LOCK_SYNC.
+ //
+ // Preconditions (asserted): the inode is auth here, the lock is stable
+ // (unless it is already SYNC or on its way there), and no client wants
+ // CAP_FILE_WR.
+ //
+ //  - SYNC / GSYNCL / GSYNCM: nothing to do, returns true.
+ //  - LOCK: replicas are sent LOCK_AC_SYNC with the encoded file state,
+ //    the lock becomes SYNC, caps are reissued; returns true.
+ //  - MIXED / LONER: if CAP_FILE_WR is still issued to clients, the lock
+ //    goes to GSYNCM / GSYNCL and caps are reissued to call them back.
+ //    Otherwise replicas are sent LOCK_AC_SYNC with the encoded file
+ //    state and the lock becomes SYNC.  Returns false in both cases.
+ //
+ // Every LOCK_AC_SYNC sent from here carries the encoded file state,
+ // because the receiving side of LOCK_AC_SYNC always decodes the message
+ // payload.
+ bool Locker::inode_file_sync(CInode *in)
+ {
+   dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl;
+
+   assert(in->is_auth());
+
+   // check state
+   if (in->filelock.get_state() == LOCK_SYNC ||
+       in->filelock.get_state() == LOCK_GSYNCL ||
+       in->filelock.get_state() == LOCK_GSYNCM)
+     return true;
+
+   assert(in->filelock.is_stable());
+
+   int issued = in->get_caps_issued();
+
+   assert((in->get_caps_wanted() & CAP_FILE_WR) == 0);
+
+   if (in->filelock.get_state() == LOCK_LOCK) {
+     if (in->is_cached_by_anyone()) {
+       // soft data
+       bufferlist softdata;
+       in->encode_file_state(softdata);
+
+       // bcast to replicas
+       for (set<int>::iterator it = in->cached_by_begin();
+            it != in->cached_by_end();
+            ++it) {
+         MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+         m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+         m->set_data(softdata);
+         mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+       }
+     }
+
+     // change lock
+     in->filelock.set_state(LOCK_SYNC);
+
+     // reissue caps
+     issue_caps(in);
+     return true;
+   }
+
+   else if (in->filelock.get_state() == LOCK_MIXED) {
+     // writers?
+     if (issued & CAP_FILE_WR) {
+       // gather client write caps
+       in->filelock.set_state(LOCK_GSYNCM);
+       issue_caps(in);
+     } else {
+       // no writers, go straight to sync
+
+       if (in->is_cached_by_anyone()) {
+         // soft data: the receiver of LOCK_AC_SYNC always decodes it
+         bufferlist softdata;
+         in->encode_file_state(softdata);
+
+         // bcast to replicas
+         for (set<int>::iterator it = in->cached_by_begin();
+              it != in->cached_by_end();
+              ++it) {
+           MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+           m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+           m->set_data(softdata);
+           mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+         }
+       }
+
+       // change lock
+       in->filelock.set_state(LOCK_SYNC);
+     }
+     return false;
+   }
+
+   else if (in->filelock.get_state() == LOCK_LONER) {
+     // writers?
+     if (issued & CAP_FILE_WR) {
+       // gather client write caps
+       in->filelock.set_state(LOCK_GSYNCL);
+       issue_caps(in);
+     } else {
+       // no writers, go straight to sync
+       if (in->is_cached_by_anyone()) {
+         // soft data: the receiver of LOCK_AC_SYNC always decodes it
+         bufferlist softdata;
+         in->encode_file_state(softdata);
+
+         // bcast to replicas
+         for (set<int>::iterator it = in->cached_by_begin();
+              it != in->cached_by_end();
+              ++it) {
+           MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+           m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+           m->set_data(softdata);
+           mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+         }
+       }
+
+       // change lock
+       in->filelock.set_state(LOCK_SYNC);
+     }
+     return false;
+   }
+   else
+     assert(0); // wtf.
+
+   return false;
+ }
+
+
+ // Move the file lock of an auth inode toward LOCK_LOCK.
+ //
+ // Does nothing if the lock is already LOCK or already on its way there
+ // (GLOCKR / GLOCKM / GLOCKL).  Otherwise the lock must be stable:
+ //
+ //  - SYNC or MIXED with replicas: LOCK_AC_LOCK is sent to every replica,
+ //    the gather set is initialized from cached_by, and the lock goes to
+ //    GLOCKR / GLOCKM.
+ //  - SYNC or MIXED without replicas: goes to GLOCKR / GLOCKM if any client
+ //    caps are issued (and calls them back), else directly to LOCK.
+ //  - LONER: goes to GLOCKL if CAP_FILE_WR is issued, else directly to
+ //    LOCK.  No messages are sent to replicas on this path.
+ void Locker::inode_file_lock(CInode *in)
+ {
+ dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl;
+
+ assert(in->is_auth());
+
+ // check state
+ if (in->filelock.get_state() == LOCK_LOCK ||
+ in->filelock.get_state() == LOCK_GLOCKR ||
+ in->filelock.get_state() == LOCK_GLOCKM ||
+ in->filelock.get_state() == LOCK_GLOCKL)
+ return; // lock or locking
+
+ assert(in->filelock.is_stable());
+
+ int issued = in->get_caps_issued();
+
+ if (in->filelock.get_state() == LOCK_SYNC) {
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ in->filelock.init_gather(in->get_cached_by());
+
+ // change lock
+ in->filelock.set_state(LOCK_GLOCKR);
+
+ // call back caps
+ if (issued)
+ issue_caps(in);
+ } else {
+ if (issued) {
+ // call back caps
+ in->filelock.set_state(LOCK_GLOCKR);
+ issue_caps(in);
+ } else {
+ // no replicas and no caps out: lock immediately
+ in->filelock.set_state(LOCK_LOCK);
+ }
+ }
+ }
+
+ else if (in->filelock.get_state() == LOCK_MIXED) {
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ in->filelock.init_gather(in->get_cached_by());
+
+ // change lock
+ in->filelock.set_state(LOCK_GLOCKM);
+
+ // call back caps
+ // (unlike the SYNC case above, this is done even if nothing is issued)
+ issue_caps(in);
+ } else {
+ //assert(issued); // ??? -sage 2/19/06
+ if (issued) {
+ // change lock
+ in->filelock.set_state(LOCK_GLOCKM);
+
+ // call back caps
+ issue_caps(in);
+ } else {
+ in->filelock.set_state(LOCK_LOCK);
+ }
+ }
+
+ }
+ else if (in->filelock.get_state() == LOCK_LONER) {
+ if (issued & CAP_FILE_WR) {
+ // change lock
+ in->filelock.set_state(LOCK_GLOCKL);
+
+ // call back caps
+ issue_caps(in);
+ } else {
+ in->filelock.set_state(LOCK_LOCK);
+ }
+ }
+ else
+ assert(0); // wtf.
+ }
+
+
+ // Move the file lock of an auth inode toward LOCK_MIXED.
+ //
+ // Does nothing if the lock is already MIXED or already on its way there
+ // (GMIXEDR / GMIXEDL).  Otherwise the lock must be stable:
+ //
+ //  - SYNC with replicas: LOCK_AC_MIXED is sent to every replica, the
+ //    gather set is initialized from cached_by, the lock goes to GMIXEDR
+ //    and caps are reissued.
+ //  - SYNC without replicas: goes to GMIXEDR (reissuing caps) if any client
+ //    caps are issued, else directly to MIXED.
+ //  - LOCK: replicas are sent LOCK_AC_MIXED carrying the encoded file
+ //    state; the lock becomes MIXED and caps are reissued.
+ //  - LONER: if CAP_FILE_WRBUFFER is issued the lock goes to GMIXEDL and
+ //    caps are reissued to call it back; otherwise replicas (if any) are
+ //    sent LOCK_AC_MIXED, the lock becomes MIXED and caps are reissued.
+ void Locker::inode_file_mixed(CInode *in)
+ {
+   dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl;
+
+   assert(in->is_auth());
+
+   // check state
+   if (in->filelock.get_state() == LOCK_MIXED ||
+       in->filelock.get_state() == LOCK_GMIXEDR ||
+       in->filelock.get_state() == LOCK_GMIXEDL)
+     return; // mixed or mixing
+
+   assert(in->filelock.is_stable());
+
+   int issued = in->get_caps_issued();
+
+   if (in->filelock.get_state() == LOCK_SYNC) {
+     if (in->is_cached_by_anyone()) {
+       // bcast to replicas
+       for (set<int>::iterator it = in->cached_by_begin();
+            it != in->cached_by_end();
+            ++it) {
+         MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+         m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+         mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+       }
+       in->filelock.init_gather(in->get_cached_by());
+
+       in->filelock.set_state(LOCK_GMIXEDR);
+       issue_caps(in);
+     } else {
+       if (issued) {
+         // call back client caps first
+         in->filelock.set_state(LOCK_GMIXEDR);
+         issue_caps(in);
+       } else {
+         // no replicas and no caps out: go straight to mixed
+         in->filelock.set_state(LOCK_MIXED);
+       }
+     }
+   }
+
+   else if (in->filelock.get_state() == LOCK_LOCK) {
+     if (in->is_cached_by_anyone()) {
+       // data
+       bufferlist softdata;
+       in->encode_file_state(softdata);
+
+       // bcast to replicas
+       for (set<int>::iterator it = in->cached_by_begin();
+            it != in->cached_by_end();
+            ++it) {
+         MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+         m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+         m->set_data(softdata);
+         mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+       }
+     }
+
+     // change lock
+     in->filelock.set_state(LOCK_MIXED);
+     issue_caps(in);
+   }
+
+   else if (in->filelock.get_state() == LOCK_LONER) {
+     if (issued & CAP_FILE_WRBUFFER) {
+       // gather up WRBUFFER caps
+       in->filelock.set_state(LOCK_GMIXEDL);
+       issue_caps(in);
+     }
+     else if (in->is_cached_by_anyone()) {
+       // bcast to replicas
+       for (set<int>::iterator it = in->cached_by_begin();
+            it != in->cached_by_end();
+            ++it) {
+         MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+         m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+         mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+       }
+       in->filelock.set_state(LOCK_MIXED);
+       issue_caps(in);
+     } else {
+       in->filelock.set_state(LOCK_MIXED);
+       issue_caps(in);
+     }
+   }
+
+   else
+     assert(0); // wtf.
+ }
+
+
+ // Move the file lock of an auth inode toward LOCK_LONER.
+ //
+ // Does nothing if the lock is already LONER or on its way there
+ // (GLONERR / GLONERM).  Otherwise the lock must be stable, exactly one
+ // client must hold caps and no other mds may want caps (asserted).
+ //
+ //  - SYNC or MIXED with replicas: LOCK_AC_LOCK is sent to every replica,
+ //    the gather set is initialized from cached_by, and the lock goes to
+ //    GLONERR / GLONERM.  Caps are not reissued on this path.
+ //  - SYNC or MIXED without replicas, or LOCK: the lock becomes LONER
+ //    immediately and caps are reissued.
+ void Locker::inode_file_loner(CInode *in)
+ {
+ dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl;
+
+ assert(in->is_auth());
+
+ // check state
+ if (in->filelock.get_state() == LOCK_LONER ||
+ in->filelock.get_state() == LOCK_GLONERR ||
+ in->filelock.get_state() == LOCK_GLONERM)
+ return;
+
+ assert(in->filelock.is_stable());
+ // same "loner" condition that inode_file_eval computes
+ assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty());
+
+ if (in->filelock.get_state() == LOCK_SYNC) {
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ in->filelock.init_gather(in->get_cached_by());
+
+ // change lock
+ in->filelock.set_state(LOCK_GLONERR);
+ } else {
+ // only one guy with file open, who gets it all, so
+ in->filelock.set_state(LOCK_LONER);
+ issue_caps(in);
+ }
+ }
+
+ else if (in->filelock.get_state() == LOCK_LOCK) {
+ // change lock. ignore replicas; they don't know about LONER.
+ in->filelock.set_state(LOCK_LONER);
+ issue_caps(in);
+ }
+
+ else if (in->filelock.get_state() == LOCK_MIXED) {
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ in->filelock.init_gather(in->get_cached_by());
+
+ // change lock
+ in->filelock.set_state(LOCK_GLONERM);
+ } else {
+ in->filelock.set_state(LOCK_LONER);
+ issue_caps(in);
+ }
+ }
+
+ else
+ assert(0);
+ }
+
+// messenger
+
+ // Handle an MLock message whose object type is LOCK_OTYPE_IFILE (the
+ // inode filelock).
+ //
+ // For actions where LOCK_AC_FOR_AUTH() is true, the inode must be in cache
+ // and be auth or proxy here; a proxy forwards the message to the inode's
+ // authority (or drops it if it came from that authority).  For all other
+ // actions a missing inode means the message is dropped without a reply.
+ //
+ // Replica-side actions (SYNC, LOCK, MIXED) update the local lock state,
+ // call back client caps where needed, and ack to the sender when the
+ // transition completes immediately.  Auth-side actions (LOCKACK, SYNCACK,
+ // MIXEDACK) remove the sender from the gather set and re-evaluate the lock
+ // once the set is empty.
+ //
+ // The message is deleted on every path except when it is forwarded or
+ // queued for retry.
+ void Locker::handle_lock_inode_file(MLock *m)
+ {
+   assert(m->get_otype() == LOCK_OTYPE_IFILE);
+
+   if (mds->logger) mds->logger->inc("lif");
+
+   CInode *in = mdcache->get_inode(m->get_ino());
+   int from = m->get_asker();
+
+   if (LOCK_AC_FOR_AUTH(m->get_action())) {
+     // auth
+     assert(in);
+     assert(in->is_auth() || in->is_proxy());
+     // report the file lock: that is the lock this handler operates on
+     dout(7) << "handle_lock_inode_file " << *in << " filelock=" << in->filelock << endl;
+
+     if (in->is_proxy()) {
+       // fw
+       int newauth = in->authority();
+       assert(newauth >= 0);
+       if (from == newauth) {
+         dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+         delete m;
+       } else {
+         dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+         // the message itself is handed to the messenger; do not delete it here
+         mds->send_message_mds(m, newauth, MDS_PORT_LOCKER);
+       }
+       return;
+     }
+   } else {
+     // replica
+     if (!in) {
+       // drop it. don't nak.
+       dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl;
+       delete m;
+       return;
+     }
+
+     assert(!in->is_auth());
+   }
+
+   dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl;
+
+   CLock *lock = &in->filelock;
+   int issued = in->get_caps_issued();
+
+   switch (m->get_action()) {
+     // -- replica --
+   case LOCK_AC_SYNC:
+     assert(lock->get_state() == LOCK_LOCK ||
+            lock->get_state() == LOCK_MIXED);
+
+     { // assim data
+       int off = 0;
+       in->decode_file_state(m->get_data(), off);
+     }
+
+     // update lock
+     lock->set_state(LOCK_SYNC);
+
+     // no need to reply.
+
+     // waiters
+     // (a read ref is held while the waiters run, then dropped)
+     in->filelock.get_read();
+     in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+     in->filelock.put_read();
+     inode_file_eval(in);
+     break;
+
+   case LOCK_AC_LOCK:
+     assert(lock->get_state() == LOCK_SYNC ||
+            lock->get_state() == LOCK_MIXED);
+
+     // call back caps?
+     if (issued & CAP_FILE_RD) {
+       dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl;
+       issue_caps(in);
+     }
+     if (lock->get_nread() > 0) {
+       dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl;
+       // the message is kept alive and re-dispatched by the waiter
+       in->add_waiter(CINODE_WAIT_FILENORD,
+                      new C_MDS_RetryMessage(mds,m));
+       lock->set_state(LOCK_GLOCKR);
+       assert(0);// i am broken.. why retry message when state captures all the info i need?
+       return;
+     }
+     if (issued & CAP_FILE_RD) {
+       // the ack is sent later, from inode_file_eval, once the caps are back
+       lock->set_state(LOCK_GLOCKR);
+       break;
+     }
+
+     // nothing to wait for, lock and ack.
+     {
+       lock->set_state(LOCK_LOCK);
+
+       MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+       reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+       mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+     }
+     break;
+
+   case LOCK_AC_MIXED:
+     assert(lock->get_state() == LOCK_SYNC ||
+            lock->get_state() == LOCK_LOCK);
+
+     if (lock->get_state() == LOCK_SYNC) {
+       // MIXED
+       if (issued & CAP_FILE_RD) {
+         // call back client caps
+         // (the ack is sent later, from inode_file_eval)
+         lock->set_state(LOCK_GMIXEDR);
+         issue_caps(in);
+         break;
+       } else {
+         // no clients, go straight to mixed
+         lock->set_state(LOCK_MIXED);
+
+         // ack
+         MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+         reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+         mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+       }
+     } else {
+       // LOCK
+       lock->set_state(LOCK_MIXED);
+
+       // no ack needed.
+     }
+
+     issue_caps(in);
+
+     // waiters
+     // (a write ref is held while the waiters run, then dropped)
+     in->filelock.get_write();
+     in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE);
+     in->filelock.put_write();
+     inode_file_eval(in);
+     break;
+
+
+
+
+     // -- auth --
+   case LOCK_AC_LOCKACK:
+     assert(lock->state == LOCK_GLOCKR ||
+            lock->state == LOCK_GLOCKM ||
+            lock->state == LOCK_GLONERM ||
+            lock->state == LOCK_GLONERR);
+     assert(lock->gather_set.count(from));
+     lock->gather_set.erase(from);
+
+     if (lock->gather_set.size()) {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+     } else {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+       inode_file_eval(in);
+     }
+     break;
+
+   case LOCK_AC_SYNCACK:
+     assert(lock->state == LOCK_GSYNCM);
+     assert(lock->gather_set.count(from));
+     lock->gather_set.erase(from);
+
+     /* not used currently
+     {
+       // merge data (keep largest size, mtime, etc.)
+       int off = 0;
+       in->decode_merge_file_state(m->get_data(), off);
+     }
+     */
+
+     if (lock->gather_set.size()) {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+     } else {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+       inode_file_eval(in);
+     }
+     break;
+
+   case LOCK_AC_MIXEDACK:
+     assert(lock->state == LOCK_GMIXEDR);
+     assert(lock->gather_set.count(from));
+     lock->gather_set.erase(from);
+
+     if (lock->gather_set.size()) {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+     } else {
+       dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+       inode_file_eval(in);
+     }
+     break;
+
+
+   default:
+     assert(0);
+   }
+
+   delete m;
+ }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ // Handler for directory lock messages.  Currently an empty stub: the
+ // message is neither examined nor deleted here.
+ void Locker::handle_lock_dir(MLock *m)
+ {
+
+ }
+
+
+
+// DENTRY
+
+ // Try to take an exclusive lock (xlock) on dentry dn on behalf of message m.
+ //
+ // Returns true if dn is xlocked by m on return (either it already was, or
+ // it was locked immediately because no other mds has the dir open).
+ // Returns false if the request had to be queued as a retry waiter; ref is
+ // passed through to the C_MDS_RetryRequest waiter.
+ //
+ // Steps, in order: check for an existing xlock / prexlock, make sure the
+ // dir can be auth pinned, wait for the dentry to become unpinned, path-pin
+ // the parent dentry, auth pin the dir, then either lock immediately or
+ // send LOCK_AC_LOCK to every mds that has the dir open and wait.
+ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref)
+ {
+ dout(7) << "dentry_xlock_start on " << *dn << endl;
+
+ // locked?
+ if (dn->lockstate == DN_LOCK_XLOCK) {
+ if (dn->xlockedby == m) return true; // locked by me!
+
+ // not by me, wait
+ dout(7) << "dentry " << *dn << " xlock by someone else" << endl;
+ dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+ new C_MDS_RetryRequest(mds,m,ref));
+ return false;
+ }
+
+ // prelock?
+ if (dn->lockstate == DN_LOCK_PREXLOCK) {
+ if (dn->xlockedby == m) {
+ dout(7) << "dentry " << *dn << " prexlock by me" << endl;
+ dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+ new C_MDS_RetryRequest(mds,m,ref));
+ } else {
+ dout(7) << "dentry " << *dn << " prexlock by someone else" << endl;
+ dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+ new C_MDS_RetryRequest(mds,m,ref));
+ }
+ return false;
+ }
+
+
+ // lockable!
+ assert(dn->lockstate == DN_LOCK_SYNC ||
+ dn->lockstate == DN_LOCK_UNPINNING);
+
+ // dir auth pinnable?
+ if (!dn->dir->can_auth_pin()) {
+ dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl;
+ dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE,
+ new C_MDS_RetryRequest(mds,m,ref));
+ return false;
+ }
+
+ // is dentry path pinned?
+ if (dn->is_pinned()) {
+ dout(7) << "dentry " << *dn << " pinned, waiting" << endl;
+ dn->lockstate = DN_LOCK_UNPINNING;
+ dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
+ dn->name,
+ new C_MDS_RetryRequest(mds,m,ref));
+ return false;
+ }
+
+ // pin path up to dentry! (if success, point of no return)
+ CDentry *pdn = dn->dir->inode->get_parent_dn();
+ if (pdn) {
+ if (mdcache->active_requests[m].traces.count(pdn)) {
+ dout(7) << "already path pinned parent dentry " << *pdn << endl;
+ } else {
+ dout(7) << "pinning parent dentry " << *pdn << endl;
+ vector<CDentry*> trace;
+ mdcache->make_trace(trace, pdn->inode);
+ assert(trace.size());
+
+ if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false;
+
+ // remember the pinned trace in the request record, keyed by its last dentry
+ mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace;
+ }
+ }
+
+ // pin dir!
+ dn->dir->auth_pin();
+
+ // mine!
+ dn->xlockedby = m;
+
+ if (dn->dir->is_open_by_anyone()) {
+ dn->lockstate = DN_LOCK_PREXLOCK;
+
+ // xlock with whom?
+ set<int> who = dn->dir->get_open_by();
+ dn->gather_set = who;
+
+ // make path
+ string path;
+ dn->make_path(path);
+ dout(10) << "path is " << path << " for " << *dn << endl;
+
+ for (set<int>::iterator it = who.begin();
+ it != who.end();
+ it++) {
+ // NOTE: this local m shadows the Message *m parameter inside the loop body
+ MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+ m->set_dn(dn->dir->ino(), dn->name);
+ m->set_path(path);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+
+ // wait
+ // (m here is the request parameter again)
+ dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl;
+ dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+ new C_MDS_RetryRequest(mds, m, ref));
+ return false;
+ } else {
+ // nobody else has the dir open: lock right away and record it on the request
+ dn->lockstate = DN_LOCK_XLOCK;
+ mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+ return true;
+ }
+ }
+
+ // Release the xlock held on dentry dn.
+ //
+ // For a locally held xlock the dentry is removed from the owning
+ // request's xlocks set; a foreign xlock (DN_XLOCK_FOREIGN) has no such
+ // record.  The dentry returns to DN_LOCK_SYNC with no owner.  Unless quiet
+ // is set, LOCK_AC_SYNC is sent to every mds that has the dir open.
+ // Finally the dir's auth pin is dropped.
+ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet)
+ {
+   dout(7) << "dentry_xlock_finish on " << *dn << endl;
+
+   assert(dn->xlockedby);
+   if (dn->xlockedby != DN_XLOCK_FOREIGN) {
+     // locally held: drop it from the request record
+     assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1);
+     mdcache->active_requests[dn->xlockedby].xlocks.erase(dn);
+   } else {
+     dout(7) << "this was a foreign xlock" << endl;
+   }
+
+   // back to unlocked
+   dn->xlockedby = 0;
+   dn->lockstate = DN_LOCK_SYNC;
+
+   // the parent dir is not unpinned here, because we might have xlocked
+   // two things in this dir; request_finish cleans that up instead.
+
+   // tell replicas (even if dn is null), unless asked to stay quiet
+   if (!quiet && dn->dir->is_open_by_anyone()) {
+     set<int>::iterator peer = dn->dir->open_by_begin();
+     while (peer != dn->dir->open_by_end()) {
+       MLock *syncmsg = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+       syncmsg->set_dn(dn->dir->ino(), dn->name);
+       mds->send_message_mds(syncmsg, *peer, MDS_PORT_LOCKER);
+       peer++;
+     }
+   }
+
+   // unpin dir
+   dn->dir->auth_unpin();
+ }
+
+/*
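+ * onfinish->finish() is always called with 0 once the xlock request
+ * has been answered, whether or not the xlock was granted.  on a
+ * successful xlock the dentry's xlockedby is set to the request and
+ * the dentry is added to the request's foreign_xlocks.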
+ */
+
+ // Context queued while a remote dentry xlock request is outstanding.
+ // When finished, it hands the result code and everything captured at
+ // construction to Locker::dentry_xlock_request_finish().
+ class C_MDC_XlockRequest : public Context {
+   Locker *mdc;        // locker to call back into
+   CDir *dir;          // directory containing the dentry
+   string dname;       // dentry name (stored by value)
+   Message *req;       // request the xlock is being taken for
+   Context *finisher;  // passed on to dentry_xlock_request_finish
+ public:
+   C_MDC_XlockRequest(Locker *mdc,
+                      CDir *dir, string& dname,
+                      Message *req,
+                      Context *finisher)
+     : mdc(mdc),
+       dir(dir),
+       dname(dname),
+       req(req),
+       finisher(finisher)
+   { }
+
+   // r is the result code of the xlock request
+   void finish(int r) {
+     mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher);
+   }
+ };
+
+ // Completion of a remote dentry xlock request.
+ //
+ // r == 1 means the request succeeded.  In that case, if the dentry is
+ // found in dir and is not xlocked by anyone, it is marked as xlocked by
+ // req and recorded in the request's foreign_xlocks.
+ //
+ // In every case finisher is then run with 0 and deleted.
+ void Locker::dentry_xlock_request_finish(int r,
+                                          CDir *dir, string& dname,
+                                          Message *req,
+                                          Context *finisher)
+ {
+   dout(10) << "dentry_xlock_request_finish r = " << r << endl;
+
+   // only look the dentry up when the xlock request succeeded (r == 1)
+   CDentry *dn = (r == 1) ? dir->lookup(dname) : 0;
+
+   if (dn && dn->xlockedby == 0) {
+     // success: our request was the winner
+     dn->xlockedby = req;
+     dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl;
+
+     // remember it so the request can release it later
+     mdcache->active_requests[req].foreign_xlocks.insert(dn);
+   }
+
+   // retry request (or whatever)
+   finisher->finish(0);
+   delete finisher;
+ }
+
+ // Ask the authority for dentry dname in dir to xlock it for us.
+ //
+ // Sends LOCK_AC_REQXLOCKC if create is set, LOCK_AC_REQXLOCK otherwise,
+ // and queues a C_MDC_XlockRequest on the dir (CDIR_WAIT_DNREQXLOCK for
+ // dname) that will eventually run onfinish.
+ void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create,
+                                   Message *req, Context *onfinish)
+ {
+   dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl;
+
+   // who is auth for this dentry?
+   int dauth = dir->dentry_authority(dname);
+
+   // build and send the request
+   MLock *xreq;
+   if (create)
+     xreq = new MLock(LOCK_AC_REQXLOCKC, mds->get_nodeid());
+   else
+     xreq = new MLock(LOCK_AC_REQXLOCK, mds->get_nodeid());
+   xreq->set_dn(dir->ino(), dname);
+   mds->send_message_mds(xreq, dauth, MDS_PORT_LOCKER);
+
+   // wait for the answer
+   Context *waiter = new C_MDC_XlockRequest(this, dir, dname, req, onfinish);
+   dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, waiter);
+ }
+
+
+
+
+/*
+ * handle_lock_dn - process an MLock message addressed to a dentry lock.
+ *
+ * The dentry is identified by the directory inode number and the name carried
+ * in the message.  Actions for which LOCK_AC_FOR_AUTH() is true are handled
+ * on the authority side (and forwarded to the dentry authority if our dir is
+ * only a proxy); all other actions are handled as a replica.
+ *
+ * Message ownership: every path that is finished with the message either
+ * deletes it or, if it is registered in mdcache->active_requests, hands it to
+ * mdcache->request_finish().  Paths that wait (retry contexts), forward the
+ * message, or keep the request around to hold an xlock retain it.
+ * NOTE(review): this relies on request_finish() disposing of the message,
+ * which is how the "request_finish else delete" branches below already treat
+ * it -- confirm against MDCache.
+ */
+void Locker::handle_lock_dn(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_DN);
+
+  CInode *diri = mdcache->get_inode(m->get_ino());  // may be null
+  CDir *dir = 0;
+  if (diri) dir = diri->dir;  // may be null
+  string dname = m->get_dn();
+  int from = m->get_asker();
+  CDentry *dn = 0;
+
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+
+    // normally we have it always
+    if (diri && dir) {
+      int dauth = dir->dentry_authority(dname);
+      assert(dauth == mds->get_nodeid() || dir->is_proxy() ||  // mine or proxy,
+             m->get_action() == LOCK_AC_REQXLOCKACK ||         // or we did a REQXLOCK and this is our ack/nak
+             m->get_action() == LOCK_AC_REQXLOCKNAK);
+
+      if (dir->is_proxy()) {
+
+        assert(dauth >= 0);
+
+        if (dauth == m->get_asker() &&
+            (m->get_action() == LOCK_AC_REQXLOCK ||
+             m->get_action() == LOCK_AC_REQXLOCKC)) {
+          dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl;
+          if (mdcache->active_requests.count(m))
+            mdcache->request_finish(m);
+          else
+            delete m;
+          return;
+        }
+
+        dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl;
+
+        // forward
+        if (mdcache->active_requests.count(m)) {
+          // xlock requests are requests, use request_* functions!
+          assert(m->get_action() == LOCK_AC_REQXLOCK ||
+                 m->get_action() == LOCK_AC_REQXLOCKC);
+          // forward as a request
+          mdcache->request_forward(m, dauth, MDS_PORT_LOCKER);
+        } else {
+          // not an xlock req, or it is and we just didn't register the request yet
+          // forward normally
+          mds->send_message_mds(m, dauth, MDS_PORT_LOCKER);
+        }
+        return;
+      }
+
+      dn = dir->lookup(dname);
+    }
+
+    // except with.. an xlock request?
+    if (!dn) {
+      assert(dir);  // we should still have the dir, though!  the requester has the dir open.
+      switch (m->get_action()) {
+
+      case LOCK_AC_LOCK:
+        dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl;
+        dn = dir->add_dentry(dname);
+        break;
+
+      case LOCK_AC_REQXLOCK:
+        // send nak
+        if (dir->state_test(CDIR_STATE_DELETED)) {
+          dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl;
+        } else {
+          dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl;
+        }
+        {
+          MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+          reply->set_dn(dir->ino(), dname);
+          reply->set_path(m->get_path());
+          mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+        }
+
+        // finish request (if we got that far), otherwise just free the
+        // message.  These are alternatives, as at the other nak/drop sites in
+        // this function; doing both would release the message twice.
+        if (mdcache->active_requests.count(m))
+          mdcache->request_finish(m);
+        else
+          delete m;
+        return;
+
+      case LOCK_AC_REQXLOCKC:
+        dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl;
+        break;
+
+      default:
+        assert(0);
+      }
+    }
+  } else {
+    // replica
+    if (dir) dn = dir->lookup(dname);
+    if (!dn) {
+      dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl;
+
+      if (m->get_action() == LOCK_AC_REQXLOCKACK ||
+          m->get_action() == LOCK_AC_REQXLOCKNAK) {
+        dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl;
+        //assert(0);  // how can this happen?  tell me now!
+
+        // the retry context keeps the message; it is re-dispatched later
+        vector<CDentry*> trace;
+        filepath path = m->get_path();
+        int r = mdcache->path_traverse(path, trace, true,
+                                       m, new C_MDS_RetryMessage(mds,m),
+                                       MDS_TRAVERSE_DISCOVER);
+        assert(r>0);
+        return;
+      }
+
+      if (m->get_action() == LOCK_AC_LOCK) {
+        // we do not discover the dentry for a LOCK any more; just NAK it.
+        MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid());
+        reply->set_dn(m->get_ino(), dname);
+        mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+      } else {
+        dout(7) << "safely ignoring." << endl;
+      }
+
+      // in both cases we are done with the message (the NAK path used to
+      // return without freeing it).
+      delete m;
+      return;
+    }
+
+    assert(dn);
+  }
+
+  if (dn) {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl;
+  } else {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl;
+  }
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_LOCK:
+    assert(dn->lockstate == DN_LOCK_SYNC ||
+           dn->lockstate == DN_LOCK_UNPINNING ||
+           dn->lockstate == DN_LOCK_XLOCK);  // <-- bc the handle_lock_dn did the discover!
+
+    if (dn->is_pinned()) {
+      dn->lockstate = DN_LOCK_UNPINNING;
+
+      // wait (the retry context keeps the message)
+      dout(7) << "dn pinned, waiting " << *dn << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
+                          dn->name,
+                          new C_MDS_RetryMessage(mds, m));
+      return;
+    } else {
+      dn->lockstate = DN_LOCK_XLOCK;
+      dn->xlockedby = 0;
+
+      // ack now
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_dn(diri->ino(), dname);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);  // ? will this happen on replica ?
+    break;
+
+  case LOCK_AC_SYNC:
+    assert(dn->lockstate == DN_LOCK_XLOCK);
+    dn->lockstate = DN_LOCK_SYNC;
+    dn->xlockedby = 0;
+
+    // null?  hose it.
+    if (dn->is_null()) {
+      dout(7) << "hosing null (and now sync) dentry " << *dn << endl;
+      dir->remove_dentry(dn);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNREAD, dname);  // will this happen either?  YES: if a rename lock backs out
+    break;
+
+  case LOCK_AC_REQXLOCKACK:
+  case LOCK_AC_REQXLOCKNAK:
+    {
+      dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl;
+      list<Context*> finished;
+      dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1);  // TAKE ONE ONLY!
+      // waiter is completed with 1 for an ack, -1 for a nak
+      finish_contexts(finished,
+                      (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1);
+    }
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+  case LOCK_AC_LOCKNAK:
+    assert(dn->gather_set.count(from) == 1);
+    dn->gather_set.erase(from);
+    if (dn->gather_set.size() == 0) {
+      dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl;
+      dn->lockstate = DN_LOCK_XLOCK;
+      mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+      dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);
+    }
+    break;
+
+
+  case LOCK_AC_REQXLOCKC:
+    // make sure it's a _file_, if it exists.
+    if (dn && dn->inode && dn->inode->is_dir()) {
+      dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl;
+
+      // nak
+      string path;
+      dn->make_path(path);
+
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+
+      // done
+      if (mdcache->active_requests.count(m))
+        mdcache->request_finish(m);
+      else
+        delete m;
+      return;
+    }
+    // otherwise fall through: REQXLOCKC is handled like REQXLOCK
+
+  case LOCK_AC_REQXLOCK:
+    if (dn) {
+      dout(7) << "handle_lock_dn reqxlock on " << *dn << endl;
+    } else {
+      dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl;
+    }
+
+
+    // start request?
+    if (!mdcache->active_requests.count(m)) {
+      vector<CDentry*> trace;
+      if (!mdcache->request_start(m, dir->inode, trace))
+        return;  // waiting for pin
+    }
+
+    // try to xlock!
+    if (!dn) {
+      assert(m->get_action() == LOCK_AC_REQXLOCKC);
+      dn = dir->add_dentry(dname);
+    }
+
+    if (dn->xlockedby != m) {
+      if (!dentry_xlock_start(dn, m, dir->inode)) {
+        // hose null dn if we're waiting on something
+        if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);
+        return;  // waiting for xlock
+      }
+      // NOTE(review): if dentry_xlock_start() succeeds right away we leave
+      // the switch and the message is deleted below without an ack being
+      // sent, although request_start() presumably registered it -- confirm
+      // whether this path is reachable/intended.
+    } else {
+      // successfully xlocked!  on behalf of requestor.
+      string path;
+      dn->make_path(path);
+
+      dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl;
+
+      // ACK xlock request
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+
+      // note: keep request around in memory (to hold the xlock/pins on behalf of requester)
+      return;
+    }
+    break;
+
+  case LOCK_AC_UNXLOCK:
+    dout(7) << "handle_lock_dn unxlock on " << *dn << endl;
+    {
+      // the request that currently holds the xlock (not the UNXLOCK message itself)
+      Message *xlock_req = dn->xlockedby;
+
+      // finish request
+      mdcache->request_finish(xlock_req);  // this will drop the locks (and unpin paths!)
+    }
+    // fall out of the switch so the UNXLOCK message itself is freed below
+    // (it used to be leaked by an early return).
+    break;
+
+  default:
+    assert(0);
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_LOCKER_H
+#define __MDS_LOCKER_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::list;
+using std::set;
+
+// Forward declarations for the types that appear only as pointers or
+// references in Locker's interface, so this header does not depend on what
+// the including file happened to include first.
+class MDS;
+class MDCache;   // Locker stores an MDCache*
+class CDir;
+class CInode;
+class CDentry;
+
+class Message;
+class Context;   // completion callbacks passed to the lock helpers
+
+class MDiscover;
+class MDiscoverReply;
+class MCacheExpire;
+class MDirUpdate;
+class MDentryUnlink;
+class MLock;
+
+class MClientRequest;
+
+
+class Anchor;
+class Capability;
+
+
+class Locker { // Lock handling for one MDS: inode hard/file locks, file capabilities and dentry xlocks.
+private:
+ MDS *mds; // the MDS this locker belongs to (used for node id and for sending messages)
+ MDCache *mdcache; // metadata cache used to look up inodes and active requests
+
+ public:
+ Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} // stores both pointers; does not take ownership as far as this header shows
+
+ void dispatch(Message *m); // message entry point -- presumably routes to the handle_* methods below; see Locker.cc
+
+ // -- locks --
+ // high level interface: the bool-returning *_try/*_start calls pair with a *_finish call
+ public:
+ bool inode_hard_read_try(CInode *in, Context *con);
+ bool inode_hard_read_start(CInode *in, MClientRequest *m);
+ void inode_hard_read_finish(CInode *in);
+ bool inode_hard_write_start(CInode *in, MClientRequest *m);
+ void inode_hard_write_finish(CInode *in);
+ bool inode_file_read_start(CInode *in, MClientRequest *m);
+ void inode_file_read_finish(CInode *in);
+ bool inode_file_write_start(CInode *in, MClientRequest *m);
+ void inode_file_write_finish(CInode *in);
+
+ void inode_hard_eval(CInode *in);
+ void inode_file_eval(CInode *in);
+
+ protected:
+ void inode_hard_mode(CInode *in, int mode);
+ void inode_file_mode(CInode *in, int mode);
+
+ // low level triggers
+ void inode_hard_sync(CInode *in);
+ void inode_hard_lock(CInode *in);
+ bool inode_file_sync(CInode *in);
+ void inode_file_lock(CInode *in);
+ void inode_file_mixed(CInode *in);
+ void inode_file_loner(CInode *in);
+
+ // messengers: handlers for incoming MLock messages
+ void handle_lock(MLock *m);
+ void handle_lock_inode_hard(MLock *m);
+ void handle_lock_inode_file(MLock *m);
+
+ // -- file i/o --
+ public:
+ version_t issue_file_data_version(CInode *in);
+ Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req);
+ bool issue_caps(CInode *in);
+
+ protected:
+ void handle_client_file_caps(class MClientFileCaps *m);
+
+ void request_inode_file_caps(CInode *in);
+ void handle_inode_file_caps(class MInodeFileCaps *m);
+
+
+ // dirs
+ void handle_lock_dir(MLock *m);
+
+ // dentry locks
+ public:
+ bool dentry_xlock_start(CDentry *dn, // handle_lock_dn treats a false return as "waiting for the xlock"
+ Message *m, CInode *ref);
+ void dentry_xlock_finish(CDentry *dn, bool quiet=false);
+ void handle_lock_dn(MLock *m); // handles MLock messages whose object type is a dentry
+ void dentry_xlock_request(CDir *dir, string& dname, bool create, // asks for an xlock; onfinish is queued as a CDIR_WAIT_DNREQXLOCK waiter
+ Message *req, Context *onfinish);
+ void dentry_xlock_request_finish(int r, // r: handle_lock_dn completes the waiter with 1 on ack, -1 on nak
+ CDir *dir, string& dname,
+ Message *req,
+ Context *finisher);
+
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "LogEvent.h"
+
+#include "MDS.h"
+
+// events i know of
+#include "events/EString.h"
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+#include "events/EAlloc.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+#include "events/EPurgeFinish.h"
+
+/*
+ * decode - rebuild a LogEvent from its serialized form.
+ *
+ * The buffer starts with an int type tag (one of the EVENT_* constants).  An
+ * empty event of the matching subclass is allocated and the rest of the
+ * buffer is handed to its decode_payload().
+ *
+ * Returns the newly allocated event.  An unknown type trips an assert; if
+ * asserts are compiled out, 0 is returned instead of dereferencing an
+ * uninitialised pointer.
+ */
+LogEvent *LogEvent::decode(bufferlist& bl)
+{
+  // parse type, length
+  int off = 0;
+  int type;
+  bl.copy(off, sizeof(type), (char*)&type);
+  off += sizeof(type);
+
+  int length = bl.length() - off;
+  dout(15) << "decode_log_event type " << type << ", size " << length << endl;
+
+  assert(type > 0);
+
+  // create event
+  LogEvent *le = 0;
+  switch (type) {
+  case EVENT_STRING:  // string
+    le = new EString();
+    break;
+
+  case EVENT_INODEUPDATE:
+    le = new EInodeUpdate();
+    break;
+
+  case EVENT_DIRUPDATE:
+    le = new EDirUpdate();
+    break;
+
+  case EVENT_UNLINK:
+    le = new EUnlink();
+    break;
+
+  case EVENT_PURGEFINISH:
+    le = new EPurgeFinish();
+    break;
+
+  case EVENT_ALLOC:
+    le = new EAlloc();
+    break;
+
+  case EVENT_MKNOD:
+    le = new EMknod();
+    break;
+
+  case EVENT_MKDIR:
+    le = new EMkdir();
+    break;
+
+  default:
+    dout(1) << "uh oh, unknown event type " << type << endl;
+    assert(0);
+    return 0;  // only reached when asserts are disabled
+  }
+
+  // decode
+  le->decode_payload(bl, off);
+
+  return le;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __LOGEVENT_H
+#define __LOGEVENT_H
+
+#define EVENT_STRING 1 // event type tags: LogEvent::decode() reads one from the start of the buffer; this one maps to EString
+
+#define EVENT_INODEUPDATE 2 // decoded as EInodeUpdate
+#define EVENT_DIRUPDATE 3 // decoded as EDirUpdate
+
+#define EVENT_ALLOC 10 // decoded as EAlloc
+#define EVENT_MKNOD 11 // decoded as EMknod
+#define EVENT_MKDIR 12 // decoded as EMkdir
+#define EVENT_LINK 13 // NOTE(review): LogEvent::decode() has no case for this tag -- confirm whether it is used yet
+
+#define EVENT_UNLINK 20 // decoded as EUnlink
+#define EVENT_RMDIR 21 // NOTE(review): LogEvent::decode() has no case for this tag -- confirm whether it is used yet
+#define EVENT_PURGEFINISH 22 // decoded as EPurgeFinish
+
+
+#include <string>
+using namespace std;
+
+#include "include/buffer.h"
+#include "include/Context.h"
+
+class MDS;
+
+// generic log event
+//
+// Base class for journal entries.  Subclasses pass their EVENT_* tag to the
+// constructor and implement encode_payload()/decode_payload(); the static
+// decode() picks the subclass from the serialized tag.
+class LogEvent {
+ private:
+  int _type;        // EVENT_* tag given to the constructor
+  off_t _end_off;   // starts at 0; maintained by MDLog (friend) -- meaning not visible in this header
+  friend class MDLog;
+
+ public:
+  LogEvent(int t) : _type(t), _end_off(0) { }
+  virtual ~LogEvent() { }
+
+  // encoding
+  virtual void encode_payload(bufferlist& bl) = 0;
+  virtual void decode_payload(bufferlist& bl, int& off) = 0;
+  static LogEvent *decode(bufferlist &bl);
+
+
+  // default printer: just the numeric type, e.g. "event(3)"
+  virtual void print(ostream& out) {
+    out << "event(" << _type << ")";
+  }
+
+
+  /*** live journal ***/
+
+  /* can_expire() - is this entry committed to primary store, such that
+   *   we can expire it from the journal?  The default says yes.
+   */
+  virtual bool can_expire(MDS *m) {
+    return true;
+  }
+
+  /* retire() - prod the MDS into committing the relevant state so that this
+   *   entry can be expired from the journal.  The default has nothing to
+   *   commit: it completes the context with 0 right away and deletes it.
+   *   A null context is tolerated.
+   */
+  virtual void retire(MDS *m, Context *c) {
+    if (!c) return;
+    c->finish(0);
+    delete c;
+  }
+
+
+  /*** recovery ***/
+
+  /* has_happened() - true if this event has already been applied.
+   *   The default says yes, so replay() is not expected to be called.
+   */
+  virtual bool has_happened(MDS *m) { return true; }
+
+  /* replay() - replay given event.  Subclasses that can report
+   *   has_happened() == false must override this; the default asserts.
+   */
+  virtual void replay(MDS *m) { assert(0); }
+
+};
+
+// Stream a LogEvent by delegating to its (virtual) print().
+inline ostream& operator<<(ostream& out, LogEvent& le)
+{
+  le.print(out);
+  return out;
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "mdstypes.h"
+
+#include "MDBalancer.h"
+#include "MDS.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+#include "messages/MHeartbeat.h"
+
+#include <vector>
+#include <map>
+using namespace std;
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal "
+
+#define MIN_LOAD 50 // ??
+#define MIN_REEXPORT 5 // will automatically reexport
+#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
+
+
+/*
+ * proc_message - entry point for messages addressed to the balancer.
+ * Only heartbeats are understood; anything else trips an assert.
+ * Always returns 0.
+ */
+int MDBalancer::proc_message(Message *m)
+{
+  if (m->get_type() == MSG_MDS_HEARTBEAT) {
+    handle_heartbeat((MHeartbeat*)m);
+    return 0;
+  }
+
+  dout(1) << " balancer unknown message " << m->get_type() << endl;
+  assert(0);
+  return 0;
+}
+
+
+// Completion context that (re)tries MDBalancer::send_heartbeat() on the
+// given MDS; the finish code is ignored.
+class C_Bal_SendHeartbeat : public Context {
+public:
+  MDS *mds;
+
+  C_Bal_SendHeartbeat(MDS *m) : mds(m) { }
+
+  virtual void finish(int f) {
+    mds->balancer->send_heartbeat();
+  }
+};
+
+/*
+ * get_load - snapshot this MDS's current load.
+ *
+ * Fills in the root inode's MDS_POP_ANYDOM popularity (when a root is
+ * present), the request rate and the messenger's dispatch queue length.
+ */
+mds_load_t MDBalancer::get_load()
+{
+  mds_load_t result;
+
+  // root popularity is left at its default when we have no root yet
+  if (mds->mdcache->get_root()) {
+    result.root = mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM];
+    // (MDS_POP_NESTED is deliberately not added in)
+  }
+
+  result.req_rate = mds->get_req_rate();
+  result.queue_len = mds->messenger->get_dispatch_queue_len();
+
+  return result;
+}
+
+/*
+ * send_heartbeat - send our load and import map to every other MDS.
+ *
+ * If the root inode is not open yet, it is opened and the heartbeat is
+ * retried from the completion context.  Otherwise the per-epoch load table
+ * is reset, mds0 advances the beat epoch, and one MHeartbeat per peer is
+ * sent on the balancer port.
+ */
+void MDBalancer::send_heartbeat()
+{
+  if (!mds->mdcache->get_root()) {
+    dout(5) << "no root on send_heartbeat" << endl;
+    mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds));
+    return;
+  }
+
+  // start a fresh table; only mds0 bumps the epoch
+  mds_load.clear();
+  if (mds->get_nodeid() == 0)
+    beat_epoch++;
+
+  // our own load goes into the table first
+  mds_load_t cur_load = get_load();
+  mds_load[ mds->get_nodeid() ] = cur_load;
+
+  // sum the meta load of every non-root import, keyed by the mds it came from
+  map<int, float> load_by_source;
+  set<CDir*>::iterator imp = mds->mdcache->imports.begin();
+  while (imp != mds->mdcache->imports.end()) {
+    CDir *im = *imp;
+    imp++;
+    if (im->inode->is_root()) continue;
+    int source = im->inode->authority();
+    load_by_source[source] += im->popularity[MDS_POP_CURDOM].meta_load();
+  }
+  mds_import_map[ mds->get_nodeid() ] = load_by_source;
+
+
+  dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << cur_load << endl;
+  for (map<int, float>::iterator p = load_by_source.begin();
+       p != load_by_source.end();
+       p++) {
+    dout(5) << " import_map from " << p->first << " -> " << p->second << endl;
+  }
+
+
+  // one heartbeat per peer, skipping ourselves
+  int num_mds = mds->get_mds_map()->get_num_mds();
+  for (int peer = 0; peer < num_mds; peer++) {
+    if (peer == mds->get_nodeid()) continue;
+    MHeartbeat *hb = new MHeartbeat(cur_load, beat_epoch);
+    hb->get_import_map() = load_by_source;
+    mds->messenger->send_message(hb,
+                                 MSG_ADDR_MDS(peer), mds->mdsmap->get_inst(peer),
+                                 MDS_PORT_BALANCER,
+                                 MDS_PORT_BALANCER);
+  }
+}
+
+/*
+ * handle_heartbeat - record a peer's load report.
+ *
+ * A heartbeat from mds0 starts a new epoch: we adopt its beat number and send
+ * our own heartbeat.  Once a load entry exists for every MDS in the map, a
+ * rebalance is run for this beat.  The message is deleted here unless it is
+ * parked while the root inode is being opened.
+ */
+void MDBalancer::handle_heartbeat(MHeartbeat *m)
+{
+  dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl;
+
+  // no root yet: open it and have the message re-delivered afterwards
+  if (!mds->mdcache->get_root()) {
+    dout(10) << "no root on handle" << endl;
+    mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  int sender = m->get_source().num();
+
+  if (sender == 0) {
+    dout(20) << " from mds0, new epoch" << endl;
+    beat_epoch = m->get_beat();
+    send_heartbeat();
+
+    show_imports();
+  }
+
+  // remember what the sender told us
+  mds_load[ sender ] = m->get_load();
+  mds_import_map[ sender ] = m->get_import_map();
+
+  //cout << " load is " << load << " have " << mds_load.size() << endl;
+
+  // heard from everybody?  then balance.
+  unsigned num_mds = mds->get_mds_map()->get_num_mds();
+  if (mds_load.size() == num_mds) {
+    //export_empties();  // no!
+    do_rebalance(m->get_beat());
+  }
+
+  delete m;
+}
+
+
+/*
+ * export_empties - hand every empty, non-root import to the migrator's
+ * export_empty_import().
+ */
+void MDBalancer::export_empties()
+{
+  dout(5) << "export_empties checking for empty imports" << endl;
+
+  set<CDir*>::iterator p = mds->mdcache->imports.begin();
+  while (p != mds->mdcache->imports.end()) {
+    CDir *import_dir = *p;
+
+    bool is_root = import_dir->inode->is_root();
+    if (!is_root && import_dir->get_size() == 0)
+      mds->mdcache->migrator->export_empty_import(import_dir);
+
+    p++;
+  }
+}
+
+
+
+/*
+ * try_match - plan a load transfer from exporter 'ex' to importer 'im'.
+ *
+ * Moves as much as both sides allow (the smaller of maxex and maxim),
+ * records it in the exported/imported tables (and in my_targets when we are
+ * the exporter), and reduces both budgets by that amount.
+ *
+ * Returns the amount planned, or 0.0 if either budget is exhausted.
+ */
+double MDBalancer::try_match(int ex, double& maxex,
+                             int im, double& maxim)
+{
+  if (maxex <= 0 || maxim <= 0) return 0.0;
+
+  double moved = MIN(maxex, maxim);
+  if (moved <= 0) return 0.0;
+
+  dout(5) << " - mds" << ex << " exports " << moved << " to mds" << im << endl;
+
+  // only our own exports become actual targets
+  if (ex == mds->get_nodeid())
+    my_targets[im] += moved;
+
+  exported[ex] += moved;
+  imported[im] += moved;
+
+  maxex -= moved;
+  maxim -= moved;
+
+  return moved;
+}
+
+
+
+/*
+ * do_hashing - hash every directory queued in hash_queue that is still in
+ * cache, open, and ours (auth), then empty the queue.
+ */
+void MDBalancer::do_hashing()
+{
+  if (hash_queue.empty()) {
+    dout(20) << "do_hashing has nothing to do" << endl;
+    return;
+  }
+
+  dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl;
+
+  set<inodeno_t>::iterator q = hash_queue.begin();
+  for ( ; q != hash_queue.end(); q++) {
+    // skip entries whose inode or dir is gone, or that we are not auth for
+    CInode *in = mds->mdcache->get_inode(*q);
+    CDir *dir = in ? in->dir : 0;
+    if (!dir || !dir->is_auth()) continue;
+
+    dout(0) << "do_hashing hashing " << *dir << endl;
+    mds->mdcache->migrator->hash_dir(dir);
+  }
+
+  hash_queue.clear();
+}
+
+
+
+/*
+ * do_rebalance - decide what to export, and start the exports.
+ *
+ * Called once a load report from every MDS is in mds_load.  Steps:
+ *  1. rescale every reported load into meta_load units and compute the
+ *     cluster-wide target (the mean);
+ *  2. if we are below target, do nothing;
+ *  3. split the cluster into importers (below target) and exporters, and
+ *     plan transfers with try_match() -- first back towards the nodes each
+ *     exporter imported from, then by pairing exporters with importers (the
+ *     pairing order alternates with the parity of 'beat');
+ *  4. for each of our own targets, first re-export imports that came from
+ *     that target, then look for fragments of our workload via find_exports().
+ *
+ * NOTE(review): the idle-import loop iterates mds->mdcache->imports directly
+ * while calling migrator->export_dir(); the later search copies the set
+ * first.  Confirm export_dir() cannot modify 'imports' synchronously.
+ */
+void MDBalancer::do_rebalance(int beat)
+{
+  int cluster_size = mds->get_mds_map()->get_num_mds();
+  int whoami = mds->get_nodeid();
+
+  // reset
+  my_targets.clear();
+  imported.clear();
+  exported.clear();
+
+  dout(5) << " do_rebalance: cluster loads are" << endl;
+
+  // rescale!  turn my mds_load back into meta_load units
+  double load_fac = 1.0;
+  if (mds_load[whoami].mds_load() > 0) {
+    load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load();
+    dout(7) << " load_fac is " << load_fac
+            << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load()
+            << endl;
+  }
+
+  double total_load = 0;
+  multimap<double,int> load_map;   // scaled load -> mds, sorted ascending
+  for (int i=0; i<cluster_size; i++) {
+    double l = mds_load[i].mds_load() * load_fac;
+    mds_meta_load[i] = l;
+
+    if (whoami == 0)
+      dout(-5) << " mds" << i
+               << " meta load " << mds_load[i]
+               << " = " << mds_load[i].mds_load()
+               << " --> " << l << endl;
+
+    if (whoami == i) my_load = l;
+    total_load += l;
+
+    load_map.insert(pair<double,int>( l, i ));
+  }
+
+  // target load
+  target_load = total_load / (double)cluster_size;
+  dout(5) << "do_rebalance: my load " << my_load
+          << " target " << target_load
+          << " total " << total_load
+          << endl;
+
+  // under or over?
+  if (my_load < target_load) {
+    dout(5) << " i am underloaded, doing nothing." << endl;
+    show_imports();
+    return;
+  }
+
+  dout(5) << " i am overloaded" << endl;
+
+
+  // first separate exporters and importers
+  multimap<double,int> importers;
+  multimap<double,int> exporters;
+  set<int> importer_set;
+  set<int> exporter_set;
+
+  for (multimap<double,int>::iterator it = load_map.begin();
+       it != load_map.end();
+       it++) {
+    if (it->first < target_load) {
+      dout(15) << " mds" << it->second << " is importer" << endl;
+      importers.insert(pair<double,int>(it->first,it->second));
+      importer_set.insert(it->second);
+    } else {
+      dout(15) << " mds" << it->second << " is exporter" << endl;
+      exporters.insert(pair<double,int>(it->first,it->second));
+      exporter_set.insert(it->second);
+    }
+  }
+
+
+  // determine load transfer mapping
+
+  {
+    // analyze import_map; do any matches i can
+
+    dout(5) << " matching exporters to import sources" << endl;
+
+    // big -> small exporters
+    for (multimap<double,int>::reverse_iterator ex = exporters.rbegin();
+         ex != exporters.rend();
+         ex++) {
+      double maxex = get_maxex(ex->second);
+      if (maxex <= .001) continue;
+
+      // check importers.  for now, just in arbitrary order (no intelligent matching).
+      for (map<int, float>::iterator im = mds_import_map[ex->second].begin();
+           im != mds_import_map[ex->second].end();
+           im++) {
+        double maxim = get_maxim(im->first);
+        if (maxim <= .001) continue;
+        try_match(ex->second, maxex,
+                  im->first, maxim);
+        if (maxex <= .001) break;
+      }
+    }
+  }
+
+
+  {
+    if (beat % 2 == 1) {
+      // old way
+      dout(5) << " matching big exporters to big importers" << endl;
+      // big exporters to big importers
+      multimap<double,int>::reverse_iterator ex = exporters.rbegin();
+      multimap<double,int>::iterator im = importers.begin();
+      while (ex != exporters.rend() &&
+             im != importers.end()) {
+        double maxex = get_maxex(ex->second);
+        double maxim = get_maxim(im->second);
+        if (maxex < .001 || maxim < .001) break;
+        try_match(ex->second, maxex,
+                  im->second, maxim);
+        // try_match drained at least one side; move past whichever is spent
+        if (maxex <= .001) ex++;
+        if (maxim <= .001) im++;
+      }
+    } else {
+      // new way
+      dout(5) << " matching small exporters to big importers" << endl;
+      // small exporters to big importers
+      multimap<double,int>::iterator ex = exporters.begin();
+      multimap<double,int>::iterator im = importers.begin();
+      while (ex != exporters.end() &&
+             im != importers.end()) {
+        double maxex = get_maxex(ex->second);
+        double maxim = get_maxim(im->second);
+        if (maxex < .001 || maxim < .001) break;
+        try_match(ex->second, maxex,
+                  im->second, maxim);
+        if (maxex <= .001) ex++;
+        if (maxim <= .001) im++;
+      }
+    }
+  }
+
+
+
+  // make a sorted list of my imports
+  map<double,CDir*> import_pop_map;      // pop -> import (equal pops overwrite each other)
+  multimap<int,CDir*> import_from_map;   // source mds -> import
+  for (set<CDir*>::iterator it = mds->mdcache->imports.begin();
+       it != mds->mdcache->imports.end();
+       it++) {
+    if ((*it)->is_hashed()) continue;
+    double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load();
+    if (pop < g_conf.mds_bal_idle_threshold &&
+        (*it)->inode != mds->mdcache->get_root()) {
+      dout(-5) << " exporting idle import " << **it
+               << " back to mds" << (*it)->inode->authority()
+               << endl;
+      mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority());
+      continue;
+    }
+    import_pop_map[ pop ] = *it;
+    int from = (*it)->inode->authority();
+    dout(15) << " map: i imported " << **it << " from " << from << endl;
+    import_from_map.insert(pair<int,CDir*>(from, *it));
+  }
+
+
+
+  // do my exports!
+  set<CDir*> already_exporting;
+  double total_sent = 0;
+  double total_goal = 0;
+
+  for (map<int,double>::iterator it = my_targets.begin();
+       it != my_targets.end();
+       it++) {
+
+    /*
+    double fac = 1.0;
+    if (false && total_goal > 0 && total_sent > 0) {
+      fac = total_goal / total_sent;
+      dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl;
+      if (fac > 1.0) fac = 1.0;
+    }
+    fac = .9 - .4 * ((float)g_conf.num_mds / 128.0);  // hack magic fixme
+    */
+
+    int target = (*it).first;
+    double amount = (*it).second;// * load_fac;
+    total_goal += amount;
+
+    // too little to bother with
+    if (amount < MIN_OFFLOAD) continue;
+
+    dout(-5) << " sending " << amount << " to mds" << target
+      //<< " .. " << (*it).second << " * " << load_fac
+             << " -> " << amount
+             << endl;//" .. fudge is " << fudge << endl;
+    double have = 0;   // load already lined up for this target
+
+    show_imports();
+
+    // search imports from target
+    if (import_from_map.count(target)) {
+      dout(5) << " aha, looking through imports from target mds" << target << endl;
+      pair<multimap<int,CDir*>::iterator, multimap<int,CDir*>::iterator> p =
+        import_from_map.equal_range(target);
+      while (p.first != p.second) {
+        CDir *dir = (*p.first).second;
+        dout(5) << "considering " << *dir << " from " << (*p.first).first << endl;
+        // advance now, so 'continue' and erase(plast) below are both safe
+        multimap<int,CDir*>::iterator plast = p.first++;
+
+        if (dir->inode->is_root()) continue;
+        if (dir->is_hashed()) continue;
+        if (dir->is_freezing() || dir->is_frozen()) continue;  // export pbly already in progress
+        double pop = dir->popularity[MDS_POP_CURDOM].meta_load();
+        assert(dir->inode->authority() == target);  // cuz that's how i put it in the map, dummy
+
+        if (pop <= amount-have) {
+          dout(-5) << "reexporting " << *dir
+                   << " pop " << pop
+                   << " back to mds" << target << endl;
+          mds->mdcache->migrator->export_dir(dir, target);
+          have += pop;
+          import_from_map.erase(plast);
+          import_pop_map.erase(pop);
+        } else {
+          dout(5) << "can't reexport " << *dir << ", too big " << pop << endl;
+        }
+        if (amount-have < MIN_OFFLOAD) break;
+      }
+    }
+    if (amount-have < MIN_OFFLOAD) {
+      total_sent += have;
+      continue;
+    }
+
+    // any other imports (currently disabled)
+    if (false)
+      for (map<double,CDir*>::iterator import = import_pop_map.begin();
+           import != import_pop_map.end();
+           import++) {
+        CDir *imp = (*import).second;
+        if (imp->inode->is_root()) continue;
+
+        double pop = (*import).first;
+        if (pop < amount-have || pop < MIN_REEXPORT) {
+          dout(-5) << "reexporting " << *imp
+                   << " pop " << pop
+                   << " back to mds" << imp->inode->authority()
+                   << endl;
+          have += pop;
+          mds->mdcache->migrator->export_dir(imp, imp->inode->authority());
+        }
+        if (amount-have < MIN_OFFLOAD) break;
+      }
+    if (amount-have < MIN_OFFLOAD) {
+      //fudge = amount-have;
+      total_sent += have;
+      continue;
+    }
+
+    // okay, search for fragments of my workload
+    set<CDir*> candidates = mds->mdcache->imports;   // copy: we search a snapshot
+
+    list<CDir*> exports;
+
+    for (set<CDir*>::iterator pot = candidates.begin();
+         pot != candidates.end();
+         pot++) {
+      find_exports(*pot, amount, exports, have, already_exporting);
+      if (have > amount-MIN_OFFLOAD) {
+        break;
+      }
+    }
+    //fudge = amount - have;
+    total_sent += have;
+
+    // (separate iterator name: 'it' is the my_targets iterator of the outer loop)
+    for (list<CDir*>::iterator ex = exports.begin(); ex != exports.end(); ex++) {
+      dout(-5) << " exporting to mds" << target
+               << " fragment " << **ex
+               << " pop " << (*ex)->popularity[MDS_POP_CURDOM].meta_load()
+               << endl;
+      mds->mdcache->migrator->export_dir(*ex, target);
+    }
+  }
+
+  dout(5) << "rebalance done" << endl;
+  show_imports();
+
+}
+
+
+
+/*
+ * find_exports - pick subdirectories of 'dir' to export, aiming for 'amount'.
+ *
+ * dir               directory whose children are examined
+ * amount            total load we want to send to the current target
+ * exports           (out) chosen directories are appended here
+ * have              (in/out) load already selected; increased for each pick
+ * already_exporting (in/out) dirs chosen so far across all targets; such
+ *                   dirs are skipped, and every pick is recorded here
+ *
+ * Strategy: a child whose load falls inside (needmin, needmax) is taken
+ * immediately.  Otherwise children are sorted into "bigger than needed" and
+ * "smaller"; we take the larger small ones, then recurse into the
+ * non-replicated big ones, then take the remaining small ones, and finally
+ * recurse into replicated big ones.  Returns as soon as 'have' exceeds the
+ * needmin computed on entry.
+ */
+void MDBalancer::find_exports(CDir *dir,
+                              double amount,
+                              list<CDir*>& exports,
+                              double& have,
+                              set<CDir*>& already_exporting)
+{
+  double need = amount - have;
+  if (need < amount * g_conf.mds_bal_min_start)
+    return;   // good enough!
+  double needmax = need * g_conf.mds_bal_need_max;
+  double needmin = need * g_conf.mds_bal_need_min;
+  double midchunk = need * g_conf.mds_bal_midchunk;
+  double minchunk = need * g_conf.mds_bal_minchunk;
+
+  list<CDir*> bigger;
+  multimap<double, CDir*> smaller;
+
+  double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load();
+  double dir_sum = 0;
+  dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl;
+
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->get_inode();
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;  // clearly not popular
+
+    if (in->dir->is_export()) continue;
+    if (in->dir->is_hashed()) continue;
+    if (already_exporting.count(in->dir)) continue;
+
+    if (in->dir->is_frozen()) continue;  // can't export this right now!
+    //if (in->dir->get_size() == 0) continue;  // don't export empty dirs, even if they're not complete.  for now!
+
+    // how popular?
+    double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load();
+    dir_sum += pop;
+    dout(20) << " pop " << pop << " " << *in->dir << endl;
+
+    if (pop < minchunk) continue;
+
+    // lucky find?
+    if (pop > needmin && pop < needmax) {
+      exports.push_back(in->dir);
+      // record the pick like every other selection path does, so the same
+      // dir is not chosen again for another target
+      already_exporting.insert(in->dir);
+      have += pop;
+      return;
+    }
+
+    if (pop > need)
+      bigger.push_back(in->dir);
+    else
+      smaller.insert(pair<double,CDir*>(pop, in->dir));
+  }
+  dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl;
+
+  // grab some sufficiently big small items
+  // ('it' is kept after this loop: the "smaller bits" pass below resumes from it)
+  multimap<double,CDir*>::reverse_iterator it;
+  for (it = smaller.rbegin();
+       it != smaller.rend();
+       it++) {
+
+    if ((*it).first < midchunk)
+      break;  // try later
+
+    dout(7) << " taking smaller " << *(*it).second << endl;
+
+    exports.push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+
+  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
+  for (list<CDir*>::iterator big = bigger.begin();
+       big != bigger.end();
+       big++) {
+    if ((*big)->is_rep()) continue;
+    dout(7) << " descending into " << **big << endl;
+    find_exports(*big, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, use smaller bits
+  for (;
+       it != smaller.rend();
+       it++) {
+
+    dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl;
+
+    exports.push_back((*it).second);
+    already_exporting.insert((*it).second);
+    have += (*it).first;
+    if (have > needmin)
+      return;
+  }
+
+  // ok fine, drill into replicated dirs
+  for (list<CDir*>::iterator big = bigger.begin();
+       big != bigger.end();
+       big++) {
+    if (!(*big)->is_rep()) continue;
+    dout(7) << " descending into replicated " << **big << endl;
+    find_exports(*big, amount, exports, have, already_exporting);
+    if (have > needmin)
+      return;
+  }
+
+}
+
+
+
+
+/*
+ * hit_inode - count one access of kind 'type' on an inode, then pass the hit
+ * on to its parent directory (if it has one).
+ */
+void MDBalancer::hit_inode(CInode *in, int type)
+{
+  // these counters are bumped regardless of authority
+  in->popularity[MDS_POP_JUSTME].pop[type].hit();
+  in->popularity[MDS_POP_NESTED].pop[type].hit();
+
+  // the domain counters only count hits on inodes we are auth for
+  if (in->is_auth()) {
+    in->popularity[MDS_POP_CURDOM].pop[type].hit();
+    in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+  }
+
+  // hit auth up to import
+  CDir *parent = in->get_parent_dir();
+  if (!parent) return;
+  hit_dir(parent, type);
+}
+
+
+/*
+ * hit_dir - count one access of kind 'type' on a directory.
+ *
+ * Besides bumping the dir's own counter, this queues the directory for
+ * hashing when its read or dentry-write popularity exceeds the configured
+ * threshold, and then propagates the hit up the hierarchy via
+ * hit_recursive().
+ */
+void MDBalancer::hit_dir(CDir *dir, int type)
+{
+  // hit me
+  float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit();
+
+  // hashing is only considered on clusters of more than 2 (FIXME >2 thing),
+  // for non-root dirs (for now at least) that we are auth for
+  bool consider_hashing =
+    g_conf.num_mds > 2 &&
+    !dir->inode->is_root() &&
+    dir->is_auth();
+
+  if (consider_hashing) {
+    //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl;
+
+    // hash this dir?  (later?)
+    bool hot_read = (v > g_conf.mds_bal_hash_rd && type == META_POP_IRD);
+    //bool hot_iwr = (v > g_conf.mds_bal_hash_wr && type == META_POP_IWR);
+    bool hot_write = (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR);
+
+    if ((hot_read || hot_write) &&
+        !dir->is_hashed() && !dir->is_hashing() &&
+        hash_queue.count(dir->ino()) == 0) {
+      dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl;
+      hash_queue.insert(dir->ino());
+    }
+  }
+
+  hit_recursive(dir, type);
+}
+
+
+
+/*
+ * hit_recursive - propagate a hit of kind 'type' from 'dir' up to the root.
+ *
+ * Before walking up, an auth directory may change its replication state:
+ * it is replicated everywhere once its popularity reaches
+ * mds_bal_replicate_threshold, and un-replicated again when it drops below
+ * mds_bal_unreplicate_threshold.  The directory with ino 1 is never
+ * un-replicated (presumably the root -- TODO confirm).
+ *
+ * On the way up every dir/inode pair gets its NESTED counter hit; ANYDOM is
+ * hit if the starting dir is auth, and CURDOM only until the first import
+ * boundary is crossed.
+ */
+void MDBalancer::hit_recursive(CDir *dir, int type)
+{
+  bool anydom = dir->is_auth();
+  bool curdom = dir->is_auth();
+
+  float rd_adj = 0.0;
+
+  // replicate?
+  float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get();  // hmm??
+
+  if (dir->is_auth()) {
+    if (!dir->is_rep() &&
+        dir_pop >= g_conf.mds_bal_replicate_threshold) {
+      // replicate
+      // reads are expected to spread over all nodes, so scale our read
+      // popularity down accordingly (tempered by half)
+      float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get();
+      rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp;
+      rd_adj /= 2.0;  // temper somewhat
+
+      dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
+
+      dir->dir_rep = CDIR_REP_ALL;
+      mds->mdcache->send_dir_updates(dir, true);
+
+      dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj);
+      dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj);
+    }
+
+    // was "!dir->ino() != 1", which parses as "(!dir->ino()) != 1" and is
+    // true for every non-zero ino, so the exclusion never applied.
+    if (dir->ino() != 1 &&
+        dir->is_rep() &&
+        dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+      // unreplicate
+      dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl;
+
+      dir->dir_rep = CDIR_REP_NONE;
+      mds->mdcache->send_dir_updates(dir);
+    }
+  }
+
+
+  while (dir) {
+    CInode *in = dir->inode;
+
+    dir->popularity[MDS_POP_NESTED].pop[type].hit();
+    in->popularity[MDS_POP_NESTED].pop[type].hit();
+
+    if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj);
+
+    if (anydom) {
+      dir->popularity[MDS_POP_ANYDOM].pop[type].hit();
+      in->popularity[MDS_POP_ANYDOM].pop[type].hit();
+    }
+
+    if (curdom) {
+      dir->popularity[MDS_POP_CURDOM].pop[type].hit();
+      in->popularity[MDS_POP_CURDOM].pop[type].hit();
+    }
+
+    if (dir->is_import())
+      curdom = false;  // end of auth domain, stop hitting auth counters.
+    dir = dir->inode->get_parent_dir();
+  }
+}
+
+
+/*
+ * subtract_export: remove the load of an exported dir from its ancestors.
+ *
+ * The dir's CURDOM load is taken out of the ANYDOM counters of its inode and
+ * of every dir/inode above it.  CURDOM counters are reduced as well, but only
+ * while no import dir has been seen (the exported dir itself counts).
+ */
+void MDBalancer::subtract_export(CDir *dir)
+{
+  // snapshot: the same amount is subtracted at every level
+  meta_load_t load = dir->popularity[MDS_POP_CURDOM];
+
+  bool within = !dir->is_import();
+
+  for (CInode *diri = dir->inode; ; diri = dir->inode) {
+    // the inode of the current dir
+    diri->popularity[MDS_POP_ANYDOM] -= load;
+    if (within)
+      diri->popularity[MDS_POP_CURDOM] -= load;
+
+    // step up to the dir containing that inode; stop at the top
+    dir = diri->get_parent_dir();
+    if (!dir)
+      break;
+
+    // from an import dir upwards CURDOM is left alone
+    if (dir->is_import())
+      within = false;
+
+    dir->popularity[MDS_POP_ANYDOM] -= load;
+    if (within)
+      dir->popularity[MDS_POP_CURDOM] -= load;
+  }
+}
+
+
+/*
+ * add_import: add the load of an imported dir to its ancestors.
+ *
+ * Mirror image of subtract_export(): the dir's CURDOM load is added to the
+ * ANYDOM counters of its inode and of every dir/inode above it, and to the
+ * CURDOM counters only while no import dir has been seen (the imported dir
+ * itself counts).
+ */
+void MDBalancer::add_import(CDir *dir)
+{
+  // snapshot: the same amount is added at every level
+  meta_load_t load = dir->popularity[MDS_POP_CURDOM];
+
+  bool within = !dir->is_import();
+
+  for (CInode *diri = dir->inode; ; diri = dir->inode) {
+    // the inode of the current dir
+    diri->popularity[MDS_POP_ANYDOM] += load;
+    if (within)
+      diri->popularity[MDS_POP_CURDOM] += load;
+
+    // step up to the dir containing that inode; stop at the top
+    dir = diri->get_parent_dir();
+    if (!dir)
+      break;
+
+    // from an import dir upwards CURDOM is left alone
+    if (dir->is_import())
+      within = false;
+
+    dir->popularity[MDS_POP_ANYDOM] += load;
+    if (within)
+      dir->popularity[MDS_POP_CURDOM] += load;
+  }
+}
+
+
+
+
+
+/*
+ * show_imports: debug dump of the cache's hashdirs/imports and, for each of
+ * them, its nested exports, with assertions that the bookkeeping in
+ * mds->mdcache (imports, exports, hashdirs, nested_exports) is consistent.
+ *
+ * NOTE: the function is currently disabled -- it returns unconditionally
+ * right after `db` is set, so nothing below that line executes.  The
+ * `external` argument is not used anywhere in the body.
+ */
+void MDBalancer::show_imports(bool external)
+{
+ int db = 20; //debug level
+ return; // disabled: everything below this line is unreachable
+
+ if (mds->mdcache->imports.empty() &&
+ mds->mdcache->hashdirs.empty()) {
+ dout(db) << "no imports/exports/hashdirs" << endl;
+ return;
+ }
+ dout(db) << "imports/exports/hashdirs:" << endl;
+
+ set<CDir*> ecopy = mds->mdcache->exports; // working copy; every export listed below is erased from it
+
+ set<CDir*>::iterator it = mds->mdcache->hashdirs.begin(); // walk hashdirs first, then imports
+ while (1) {
+ if (it == mds->mdcache->hashdirs.end()) it = mds->mdcache->imports.begin(); // NOTE(review): once in imports this compares iterators of two different sets
+ if (it == mds->mdcache->imports.end() ) break;
+
+ CDir *im = *it;
+
+ if (im->is_import()) {
+ dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
+ assert( im->is_auth() );
+ }
+ else if (im->is_hashed()) {
+ if (im->is_import()) continue; // if import AND hash, list as import.
+ dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl;
+ }
+
+ for (set<CDir*>::iterator p = mds->mdcache->nested_exports[im].begin();
+ p != mds->mdcache->nested_exports[im].end();
+ p++) {
+ CDir *exp = *p;
+ if (exp->is_hashed()) {
+ //assert(0); // we don't do it this way actually
+ dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
+ assert( !exp->is_auth() );
+ } else {
+ dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl;
+ assert( exp->is_export() );
+ assert( !exp->is_auth() );
+ }
+
+ if ( mds->mdcache->get_auth_container(exp) != im ) { // a nested export must have `im` as its auth container
+ dout(1) << "uh oh, auth container is " << mds->mdcache->get_auth_container(exp) << endl;
+ dout(1) << "uh oh, auth container is " << *mds->mdcache->get_auth_container(exp) << endl;
+ assert( mds->mdcache->get_auth_container(exp) == im );
+ }
+
+ if (ecopy.count(exp) != 1) { // every nested export must also be in the exports set
+ dout(1) << "***** nested_export " << *exp << " not in exports" << endl;
+ assert(0);
+ }
+ ecopy.erase(exp);
+ }
+
+ it++;
+ }
+
+ if (ecopy.size()) { // anything left was in exports but under no import/hashdir
+ for (set<CDir*>::iterator it = ecopy.begin();
+ it != ecopy.end();
+ it++)
+ dout(1) << "***** stray item in exports: " << **it << endl;
+ assert(ecopy.size() == 0);
+ }
+}
+
+
+
+/* replicate?
+
+ float dir_pop = dir->get_popularity();
+
+ if (dir->is_auth()) {
+ if (!dir->is_rep() &&
+ dir_pop >= g_conf.mds_bal_replicate_threshold) {
+ // replicate
+ dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl;
+
+ dir->dir_rep = CDIR_REP_ALL;
+ mds->mdcache->send_dir_updates(dir);
+ }
+
+ if (dir->is_rep() &&
+ dir_pop < g_conf.mds_bal_unreplicate_threshold) {
+ // unreplicate
+ dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl;
+
+ dir->dir_rep = CDIR_REP_NONE;
+ mds->mdcache->send_dir_updates(dir);
+ }
+ }
+
+*/
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MDBALANCER_H
+#define __MDBALANCER_H
+
+#include <ostream>
+#include <list>
+using namespace std;
+
+#include <map>
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "CInode.h"
+
+
+class MDS;
+class Message;
+class MHeartbeat;
+class CInode;
+class Context;
+class CDir;
+
+/*
+ * MDBalancer: per-mds load balancer.
+ *
+ * Keeps the popularity counters of inodes and dirs up to date (hit_*),
+ * exchanges load information with the other mds's through heartbeats, and
+ * decides which dirs to export or hash.
+ */
+class MDBalancer {
+ protected:
+  MDS *mds;
+
+  int beat_epoch;
+
+  // todo
+  set<inodeno_t> hash_queue;    // inos of dirs queued for hashing (filled by hit_dir)
+
+  // per-epoch scatter/gathered info
+  hash_map<int, mds_load_t> mds_load;
+  hash_map<int, float> mds_meta_load;
+  map<int, map<int, float> > mds_import_map;
+
+  // per-epoch state
+  double my_load, target_load;
+  map<int,double> my_targets;
+  map<int,double> imported;
+  map<int,double> exported;
+
+  double try_match(int ex, double& maxex,
+                   int im, double& maxim);
+
+  // how much more mds `im` can take before reaching target_load.
+  // note: operator[] creates a zero entry for an mds not seen before.
+  double get_maxim(int im) {
+    return target_load - mds_meta_load[im] - imported[im];
+  }
+  // how much mds `ex` still has to shed to come down to target_load.
+  // note: operator[] creates a zero entry for an mds not seen before.
+  double get_maxex(int ex) {
+    return mds_meta_load[ex] - target_load - exported[ex];
+  }
+
+ public:
+  // my_load and target_load are zeroed here so that get_maxim()/get_maxex()
+  // never read an indeterminate value before the first rebalance sets them.
+  MDBalancer(MDS *m) :
+    mds(m),
+    beat_epoch(0),
+    my_load(0),
+    target_load(0) {
+  }
+
+  mds_load_t get_load();
+
+  int proc_message(Message *m);
+
+  void send_heartbeat();
+  void handle_heartbeat(MHeartbeat *m);
+
+  void do_hashing();
+
+  void export_empties();
+  void do_rebalance(int beat);
+  void find_exports(CDir *dir,
+                    double amount,
+                    list<CDir*>& exports,
+                    double& have,
+                    set<CDir*>& already_exporting);
+
+
+  // adjust ancestor popularity when a subtree leaves / joins this mds
+  void subtract_export(class CDir *ex);
+  void add_import(class CDir *im);
+
+  // popularity accounting; `type` indexes the per-counter pop[] array
+  void hit_inode(class CInode *in, int type=0);
+  void hit_dir(class CDir *dir, int type=0);
+  void hit_recursive(class CDir *dir, int type=0);
+
+
+  void show_imports(bool external=false);
+
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "MDCache.h"
+#include "MDStore.h"
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "AnchorClient.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "MDSMap.h"
+
+#include "CInode.h"
+#include "CDir.h"
+
+#include "include/filepath.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "common/Logger.h"
+
+#include "osdc/Filer.h"
+
+#include "events/EUnlink.h"
+#include "events/EPurgeFinish.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MDirUpdate.h"
+#include "messages/MCacheExpire.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+#include "IdAllocator.h"
+
+#include "common/Timer.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <iostream>
+#include <string>
+#include <map>
+using namespace std;
+
+#include "config.h"
+#undef dout // redefine dout for this file: prints when level l <= g_conf.debug or g_conf.debug_mds, prefixed with the time and "mds<nodeid>.cache "; expands to a bare `if`, so brace it when an `else` follows
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache "
+
+
+
+
+// Build an empty cache for mds `m`: create the migrator and renamer helpers,
+// size the LRU from g_conf and reset the shutdown bookkeeping.
+MDCache::MDCache(MDS *m)
+{
+  mds = m;
+
+  // helpers get the mds plus a back-pointer to this cache
+  migrator = new Migrator(mds, this);
+  renamer = new Renamer(mds, this);
+
+  // no root inode yet
+  root = NULL;
+
+  // shutdown state
+  did_shutdown_exports = false;
+  shutdown_commits = 0;
+
+  // lru limits come from the global config
+  lru.lru_set_max(g_conf.mds_cache_size);
+  lru.lru_set_midpoint(g_conf.mds_cache_mid);
+}
+
+// Release the two helper objects allocated by the constructor.
+MDCache::~MDCache()
+{
+  delete migrator;   // created in MDCache(MDS*)
+  delete renamer;    // created in MDCache(MDS*)
+}
+
+
+// Publish cache statistics to `logger`: the root's ANYDOM/NESTED meta load
+// (only when a root is present) and the LRU size counters.
+void MDCache::log_stat(Logger *logger)
+{
+  CInode *r = get_root();
+  if (r) {
+    logger->set("popanyd", (int)r->popularity[MDS_POP_ANYDOM].meta_load());
+    logger->set("popnest", (int)r->popularity[MDS_POP_NESTED].meta_load());
+  }
+
+  // lru counters
+  logger->set("c",      lru.lru_get_size());
+  logger->set("cpin",   lru.lru_get_num_pinned());
+  logger->set("ctop",   lru.lru_get_top());
+  logger->set("cbot",   lru.lru_get_bot());
+  logger->set("cptail", lru.lru_get_pintail());
+}
+
+
+//
+
+// Final shutdown hook.  Always returns true; a cache that still holds
+// entries only produces a warning plus a show_imports() call.
+bool MDCache::shutdown()
+{
+  if (!(lru.lru_get_size() > 0))
+    return true;   // cache is empty, nothing to report
+
+  dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl;
+  show_imports();
+  return true;
+}
+
+
+// MDCache
+
+// Allocate a fresh inode: zeroed inode_t, a newly allocated ino, nlink 1 and
+// the default file layout.  The inode is added to the cache before it is
+// returned.
+CInode *MDCache::create_inode()
+{
+  CInode *fresh = new CInode(this);
+
+  // start from an all-zero inode_t
+  memset(&fresh->inode, 0, sizeof(inode_t));
+
+  // take the next id from the allocator
+  fresh->inode.ino = mds->idalloc->alloc_id();
+
+  fresh->inode.layout = g_OSD_FileLayout;
+  fresh->inode.nlink = 1;   // FIXME
+
+  // register in the lru and the inode map
+  add_inode(fresh);
+  return fresh;
+}
+
+// Hand the inode's number back to the id allocator, then drop the inode
+// from the cache.  The CInode object itself is not deleted here.
+void MDCache::destroy_inode(CInode *in)
+{
+  // give the ino back first ...
+  mds->idalloc->reclaim_id(in->ino());
+
+  // ... then unlink it from the dentry, inode map and lru
+  remove_inode(in);
+}
+
+
+// Register `in` with the cache: insert it at the LRU midpoint and index it
+// by ino.  Asserts that the ino is not present yet and that the inode map
+// and the LRU stay the same size.
+void MDCache::add_inode(CInode *in)
+{
+  // the two structures must agree before we touch them
+  assert(inode_map.size() == lru.lru_get_size());
+
+  lru.lru_insert_mid(in);
+
+  // should be no dup inos!
+  assert(inode_map.count(in->ino()) == 0);
+  inode_map[in->ino()] = in;
+
+  // ... and must still agree afterwards
+  assert(inode_map.size() == lru.lru_get_size());
+}
+
+// Drop `o` from the cache: detach it from its parent dentry (if any), then
+// remove it from the inode map and the LRU.  The object is not deleted.
+void MDCache::remove_inode(CInode *o)
+{
+  dout(14) << "remove_inode " << *o << endl;
+
+  // FIXME: multiple parents?
+  CDentry *dn = o->get_parent_dn();
+  if (dn) {
+    assert(!dn->is_dirty());
+    if (!dn->is_sync()) {
+      // leave dentry
+      dn->dir->unlink_inode(dn);
+    } else {
+      // unlink inode AND hose dentry
+      dn->dir->remove_dentry(dn);
+    }
+  }
+
+  inode_map.erase(o->ino());   // remove from map
+  lru.lru_remove(o);           // remove from lru
+}
+
+
+
+
+// Move the inode linked at `srcdn` over to `destdn`.  Whatever inode destdn
+// pointed at before is unlinked from it first.
+void MDCache::rename_file(CDentry *srcdn,
+                          CDentry *destdn)
+{
+  // remember the inode before the source dentry lets go of it
+  CInode *moving = srcdn->inode;
+
+  // detach it from the source
+  srcdn->dir->unlink_inode(srcdn);
+
+  // clear the destination if it is occupied
+  if (destdn->inode) {
+    destdn->dir->unlink_inode(destdn);
+  }
+
+  // and attach the inode to the destination dentry
+  destdn->dir->link_inode(destdn, moving);
+}
+
+
+
+// Install `in` as the cache's root inode and flag it CINODE_STATE_ROOT.
+// Asserts that no root was set before.
+void MDCache::set_root(CInode *in)
+{
+  assert(!root);
+
+  root = in;
+  in->state_set(CINODE_STATE_ROOT);
+}
+
+// Record `dir` as an import: add it to the imports set, set
+// CDIR_STATE_IMPORT and take the CDIR_PIN_IMPORT reference.
+void MDCache::add_import(CDir *dir)
+{
+  imports.insert(dir);
+
+  // flag + pin; undone together when the import is removed
+  dir->state_set(CDIR_STATE_IMPORT);
+  dir->get(CDIR_PIN_IMPORT);
+}
+
+
+
+
+
+// **************
+// Inode purging -- reliably removing deleted file's objects
+
+// Completion for the object removal started by purge_inode(): calls
+// MDCache::purge_inode_finish() for the stored ino.  The result code is
+// ignored.
+class C_MDC_PurgeFinish : public Context {
+  MDCache *mdc;
+  inodeno_t ino;
+
+public:
+  C_MDC_PurgeFinish(MDCache *cache, inodeno_t which)
+    : mdc(cache),
+      ino(which)
+  {}
+
+  void finish(int) {
+    mdc->purge_inode_finish(ino);
+  }
+};
+// Completion for the EPurgeFinish log entry submitted by
+// purge_inode_finish(): calls MDCache::purge_inode_finish_2() for the stored
+// ino.  The result code is ignored.
+class C_MDC_PurgeFinish2 : public Context {
+  MDCache *mdc;
+  inodeno_t ino;
+
+public:
+  C_MDC_PurgeFinish2(MDCache *cache, inodeno_t which)
+    : mdc(cache),
+      ino(which)
+  {}
+
+  void finish(int) {
+    mdc->purge_inode_finish_2(ino);
+  }
+};
+
+/* purge_inode
+ *  called on unlink or rmdir.
+ *  the caller is responsible for journaling an appropriate EUnlink or ERmdir.
+ *
+ *  notes the inode in `purging` (it must not be there already) and asks the
+ *  filer to remove bytes [0, inode.size); purge_inode_finish() runs when
+ *  that completes.
+ */
+void MDCache::purge_inode(inode_t &inode)
+{
+  dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl;
+
+  // remember that this inode is being purged
+  assert(purging.count(inode.ino) == 0);
+  purging[inode.ino] = inode;
+
+  // kick off removal of the file's objects
+  Context *done = new C_MDC_PurgeFinish(this, inode.ino);
+  mds->filer->remove(inode, 0, inode.size, 0, done);
+}
+
+// Second purge step: the file's objects are gone, so journal an
+// EPurgeFinish; purge_inode_finish_2() runs once that entry completes.
+void MDCache::purge_inode_finish(inodeno_t ino)
+{
+  dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl;
+
+  // log completion
+  EPurgeFinish *entry = new EPurgeFinish(ino);
+  Context *logged = new C_MDC_PurgeFinish2(this, ino);
+  mds->mdlog->submit_entry(entry, logged);
+}
+
+// Last purge step: forget the inode in `purging` and finish (with result 0)
+// every context that was waiting for this purge.
+void MDCache::purge_inode_finish_2(inodeno_t ino)
+{
+  dout(10) << "purge_inode_finish_2 " << ino << endl;
+
+  // no longer being purged
+  purging.erase(ino);
+
+  // tell anyone who cares (log flusher?): take the waiter list out of the
+  // map before running it
+  list<Context*> waiters;
+  waiters.swap(waiting_for_purge[ino]);
+  waiting_for_purge.erase(ino);
+  finish_contexts(waiters, 0);
+
+  // reclaim ino?
+}
+
+// Issue the object removal again for every inode listed in `purging`, each
+// with its own C_MDC_PurgeFinish completion.
+void MDCache::start_recovered_purges()
+{
+  map<inodeno_t,inode_t>::iterator cur = purging.begin();
+  while (cur != purging.end()) {
+    inode_t &inode = cur->second;
+    dout(10) << "start_recovered_purges " << cur->first << " size " << inode.size << endl;
+
+    Context *done = new C_MDC_PurgeFinish(this, cur->first);
+    mds->filer->remove(inode, 0, inode.size, 0, done);
+
+    ++cur;
+  }
+}
+
+
+
+/*
+ * trim: expire inodes from the LRU until at most `max` of them remain.
+ *
+ * Returns true right away when the LRU is empty.  max < 0 means "use the
+ * LRU's configured maximum"; if that maximum is 0 the function returns false
+ * without trimming.  In every other case it returns true, also when the loop
+ * stops early because lru_expire() had nothing to hand back.
+ *
+ * For each expired inode (and its open dir) whose authority is another mds,
+ * an entry is added to a per-mds MCacheExpire; those messages are sent after
+ * the loop.  The statement order inside the loop matters: everything that
+ * reads `in` happens before remove_inode()/delete.
+ */
+bool MDCache::trim(int max)
+{
+ // empty? short cut.
+ if (lru.lru_get_size() == 0) return true;
+
+ if (max < 0) {
+ max = lru.lru_get_max();
+ if (!max) return false;
+ }
+
+ map<int, MCacheExpire*> expiremap; // one pending expire message per remote authority
+
+ dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl;
+ assert(expiremap.empty());
+
+ while (lru.lru_get_size() > (unsigned)max) {
+ CInode *in = (CInode*)lru.lru_expire();
+ if (!in) break; //return false;
+
+ if (in->dir) {
+ // notify dir authority?
+ int auth = in->dir->authority();
+ if (auth != mds->get_nodeid()) {
+ dout(17) << "sending expire to mds" << auth << " on " << *in->dir << endl;
+ if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+ expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce);
+ }
+ }
+
+ // notify inode authority?
+ {
+ int auth = in->authority();
+ if (auth != mds->get_nodeid()) {
+ assert(!in->is_auth());
+ dout(17) << "sending expire to mds" << auth << " on " << *in << endl;
+ if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
+ expiremap[auth]->add_inode(in->ino(), in->replica_nonce);
+ } else {
+ assert(in->is_auth());
+ }
+ }
+ CInode *diri = NULL; // inode of the containing dir; grabbed now because `in` is deleted below
+ if (in->parent)
+ diri = in->parent->dir->inode;
+
+ if (in->is_root()) {
+ dout(7) << "just trimmed root, cache now empty." << endl;
+ root = NULL;
+ }
+
+
+ // last link?
+ if (in->inode.nlink == 0) {
+ dout(17) << "last link, removing file content " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS!
+ mds->filer->zero(in->inode,
+ 0, in->inode.size,
+ NULL, NULL); // FIXME
+ }
+
+ // remove it
+ dout(15) << "trim removing " << *in << " " << in << endl;
+ remove_inode(in);
+ delete in; // `in` must not be used past this point
+
+ if (diri) {
+ // dir incomplete!
+ diri->dir->state_clear(CDIR_STATE_COMPLETE);
+
+ // reexport?
+ if (diri->dir->is_import() && // import
+ diri->dir->get_size() == 0 && // no children
+ !diri->is_root()) // not root
+ migrator->export_empty_import(diri->dir);
+
+ }
+
+ if (mds->logger) mds->logger->inc("cex");
+ }
+
+
+ /* hack
+ if (lru.lru_get_size() == max) {
+ int i;
+ dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl;
+ CInode *cur = (CInode*)lru.lru_tophead;
+ i = 1;
+ while (cur) {
+ dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << " " << *cur << endl;
+ cur = (CInode*)cur->lru_next;
+ }
+
+ dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl;
+ cur = (CInode*)lru.lru_bothead;
+ i = 1;
+ while (cur) {
+ dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << " " << *cur << endl;
+ cur = (CInode*)cur->lru_next;
+ }
+
+ }
+ */
+
+ // send expires
+ for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
+ it != expiremap.end();
+ it++) {
+ dout(7) << "sending cache_expire to " << it->first << endl;
+ mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
+ }
+
+
+ return true;
+}
+
+// Completion for a dir commit issued during shutdown: decrements the
+// cache's count of outstanding shutdown commits.  The result code is ignored.
+class C_MDC_ShutdownCommit : public Context {
+  MDCache *mdc;
+
+public:
+  C_MDC_ShutdownCommit(MDCache *cache) : mdc(cache) {}
+
+  void finish(int) {
+    --mdc->shutdown_commits;
+  }
+};
+
+// Timer event that runs MDCache::shutdown_check() with the given mutex held.
+class C_MDC_ShutdownCheck : public Context {
+  MDCache *mdc;
+  Mutex *lock;
+
+public:
+  C_MDC_ShutdownCheck(MDCache *cache, Mutex *mutex)
+    : mdc(cache),
+      lock(mutex)
+  {}
+
+  void finish(int) {
+    // hold the lock for the duration of the check
+    lock->Lock();
+    mdc->shutdown_check();
+    lock->Unlock();
+  }
+};
+
+// Periodic shutdown diagnostic: dump the cache with debug_mds temporarily
+// set to 10, queue the next check, and report what is still outstanding
+// (lru size, log events, exports, filer activity).
+void MDCache::shutdown_check()
+{
+  dout(0) << "shutdown_check at " << g_clock.now() << endl;
+
+  // cache: raise the mds debug level just for the dump, then restore it
+  const int saved_debug = g_conf.debug_mds;
+  g_conf.debug_mds = 10;
+  show_cache();
+  g_conf.debug_mds = saved_debug;
+
+  // schedule the next check
+  Context *again = new C_MDC_ShutdownCheck(this, &mds->mds_lock);
+  g_timer.add_event_after(g_conf.mds_shutdown_check, again);
+
+  // this
+  dout(0) << "lru size now " << lru.lru_get_size() << endl;
+  dout(0) << "log len " << mds->mdlog->get_num_events() << endl;
+
+  if (exports.size()) {
+    dout(0) << "still have " << exports.size() << " exports" << endl;
+  }
+
+  if (mds->filer->is_active()) {
+    dout(0) << "filer still active" << endl;
+  }
+}
+
+// Begin shutdown.  The only action is to arm the periodic shutdown_check()
+// timer, and only when g_conf.mds_shutdown_check is non-zero.
+void MDCache::shutdown_start()
+{
+  dout(1) << "shutdown_start" << endl;
+
+  if (!g_conf.mds_shutdown_check)
+    return;   // periodic checks are disabled
+
+  Context *check = new C_MDC_ShutdownCheck(this, &mds->mds_lock);
+  g_timer.add_event_after(g_conf.mds_shutdown_check, check);
+}
+
+
+/*
+ * shutdown_pass: make one attempt at draining this mds for shutdown.
+ *
+ * Returns true once nothing is left (or the mds is already stopped) and
+ * false whenever some step is still pending.  The steps, in order: unhash
+ * our hashed dirs, optionally commit dirty dirs, trim the cache, optionally
+ * wait for the log to empty, export every non-root import to mds0 (only on
+ * a non-zero mds, and only once), wait for exports and for the filer, drop
+ * the root import on mds0, then check that no imports and no cached inodes
+ * remain.
+ */
+bool MDCache::shutdown_pass()
+{
+ dout(7) << "shutdown_pass" << endl;
+ //assert(mds->is_shutting_down());
+ if (mds->is_stopped()) {
+ dout(7) << " already shut down" << endl;
+ show_cache();
+ show_imports();
+ return true;
+ }
+
+ // unhash dirs?
+ if (!hashdirs.empty()) {
+ // unhash any of my dirs?
+ for (set<CDir*>::iterator it = hashdirs.begin();
+ it != hashdirs.end();
+ it++) {
+ CDir *dir = *it;
+ if (!dir->is_auth()) continue;
+ if (dir->is_unhashing()) continue;
+ migrator->unhash_dir(dir);
+ }
+
+ dout(7) << "waiting for dirs to unhash" << endl;
+ return false;
+ }
+
+ // commit dirs?
+ if (g_conf.mds_commit_on_shutdown) {
+
+ if (shutdown_commits < 0) { // NOTE(review): the constructor sets shutdown_commits to 0; confirm something sets it negative, otherwise this branch never runs
+ dout(1) << "shutdown_pass committing all dirty dirs" << endl;
+ shutdown_commits = 0;
+
+ for (hash_map<inodeno_t, CInode*>::iterator it = inode_map.begin();
+ it != inode_map.end();
+ it++) {
+ CInode *in = it->second;
+
+ // commit any dirty dir that's ours
+ if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) {
+ mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this));
+ shutdown_commits++; // decremented again by C_MDC_ShutdownCommit::finish
+ }
+ }
+ }
+
+ // commits?
+ if (shutdown_commits > 0) {
+ dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl;
+ return false;
+ }
+ }
+
+ // flush anything we can from the cache
+ trim(0);
+ dout(5) << "cache size now " << lru.lru_get_size() << endl;
+
+
+ // (wait for) flush log?
+ if (g_conf.mds_log_flush_on_shutdown &&
+ mds->mdlog->get_num_events()) {
+ dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl;
+ return false;
+ }
+
+ // send all imports back to 0.
+ if (mds->get_nodeid() != 0 && !did_shutdown_exports) {
+ // flush what i can from the cache first..
+ trim(0);
+
+ // export to root
+ for (set<CDir*>::iterator it = imports.begin();
+ it != imports.end();
+ ) {
+ CDir *im = *it;
+ it++; // advance before export_dir() is called on the current entry
+ if (im->inode->is_root()) continue;
+ if (im->is_frozen() || im->is_freezing()) continue;
+
+ dout(7) << "sending " << *im << " back to mds0" << endl;
+ migrator->export_dir(im,0);
+ }
+ did_shutdown_exports = true;
+ }
+
+
+ // waiting for imports? (e.g. root?)
+ if (exports.size()) {
+ dout(7) << "still have " << exports.size() << " exports" << endl;
+ //show_cache();
+ return false;
+ }
+
+ // filer active?
+ if (mds->filer->is_active()) {
+ dout(7) << "filer still active" << endl;
+ return false;
+ }
+
+ // close root?
+ if (mds->get_nodeid() == 0 &&
+ lru.lru_get_size() == 1 &&
+ root &&
+ root->dir &&
+ root->dir->is_import() &&
+ root->dir->get_ref() == 1) { // 1 is the import!
+ // un-import
+ dout(7) << "removing root import" << endl;
+ imports.erase(root->dir);
+ root->dir->state_clear(CDIR_STATE_IMPORT);
+ root->dir->put(CDIR_PIN_IMPORT);
+
+ if (root->is_pinned_by(CINODE_PIN_DIRTY)) {
+ dout(7) << "clearing root dirty flag" << endl;
+ root->put(CINODE_PIN_DIRTY);
+ }
+
+ trim(0); // trim() sets root to NULL if it expires the root inode
+ assert(inode_map.size() == lru.lru_get_size());
+ }
+
+ // imports?
+ if (!imports.empty()) {
+ dout(7) << "still have " << imports.size() << " imports" << endl;
+ show_cache();
+ return false;
+ }
+
+ // done?
+ if (lru.lru_get_size() > 0) {
+ dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl;
+ show_cache();
+ //dump();
+ return false;
+ }
+
+ // done!
+ dout(1) << "shutdown done." << endl;
+ return true;
+}
+
+
+
+
+
+
+
+/*
+ * open_root: make the root inode available, then run `c`.
+ *
+ * On mds0 the root inode and its dir are made up on the spot, the dir is
+ * registered as an import, and `c` (if given) is finished with 0 and deleted
+ * before returning.  On any other mds `c` is queued on waiting_for_root, and
+ * a discover is sent to mds0 unless somebody is already waiting.
+ *
+ * Always returns 0.
+ */
+int MDCache::open_root(Context *c)
+{
+  const int whoami = mds->get_nodeid();
+
+  if (whoami != 0) {
+    // request inode from root mds
+    if (!waiting_for_root.empty()) {
+      dout(7) << "waiting for root" << endl;
+    } else {
+      dout(7) << "discovering root" << endl;
+
+      filepath want;
+      MDiscover *req = new MDiscover(whoami,
+                                     0,
+                                     want,
+                                     false);  // there _is_ no base dir for the root inode
+      mds->send_message_mds(req, 0, MDS_PORT_CACHE);
+    }
+
+    // wait
+    waiting_for_root.push_back(c);
+    return 0;
+  }
+
+  // i am root inode
+  CInode *rooti = new CInode(this);
+  memset(&rooti->inode, 0, sizeof(inode_t));
+  rooti->inode.ino = 1;
+  rooti->inode.hash_seed = 0;   // not hashed!
+
+  // make it up (FIXME)
+  rooti->inode.mode = 0755 | INODE_MODE_DIR;
+  rooti->inode.size = 0;
+  rooti->inode.ctime = 0;
+  rooti->inode.mtime = g_clock.gettime();
+
+  rooti->inode.nlink = 1;
+  rooti->inode.layout = g_OSD_MDDirLayout;
+
+  set_root(rooti);
+  add_inode(rooti);
+
+  // root directory too
+  assert(rooti->dir == NULL);
+  rooti->set_dir(new CDir(rooti, mds, true));
+  rooti->dir->set_dir_auth(0);          // me!
+  rooti->dir->dir_rep = CDIR_REP_ALL;   //NONE;
+
+  // root is sort of technically an import (from a vacuum)
+  add_import(rooti->dir);
+
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+  return 0;
+}
+
+
+
+
+
+
+
+
+// ========= messaging ==============
+
+
+// Route an incoming cache message to its handler by message type.  A type
+// with no handler is logged and trips an assert.
+void MDCache::dispatch(Message *m)
+{
+  const int type = m->get_type();
+
+  switch (type) {
+
+    // discovery
+  case MSG_MDS_DISCOVER:
+    handle_discover((MDiscover*)m);
+    break;
+  case MSG_MDS_DISCOVERREPLY:
+    handle_discover_reply((MDiscoverReply*)m);
+    break;
+
+    // (MSG_MDS_INODEUPDATE / handle_inode_update is disabled)
+
+    // hard links
+  case MSG_MDS_INODELINK:
+    handle_inode_link((MInodeLink*)m);
+    break;
+  case MSG_MDS_INODELINKACK:
+    handle_inode_link_ack((MInodeLinkAck*)m);
+    break;
+
+    // replica maintenance
+  case MSG_MDS_DIRUPDATE:
+    handle_dir_update((MDirUpdate*)m);
+    break;
+  case MSG_MDS_CACHEEXPIRE:
+    handle_cache_expire((MCacheExpire*)m);
+    break;
+
+    // unlink
+  case MSG_MDS_DENTRYUNLINK:
+    handle_dentry_unlink((MDentryUnlink*)m);
+    break;
+
+  default:
+    dout(7) << "cache unknown message " << m->get_type() << endl;
+    assert(0);
+    break;
+  }
+}
+
+
+/* path_traverse
+ *
+ * return values:
+ * <0 : traverse error (ENOTDIR, ENOENT)
+ * 0 : success
+ * >0 : delayed or forwarded
+ *
+ * Notes:
+ * onfinish context is only needed if you specify MDS_TRAVERSE_DISCOVER _and_
+ * you aren't absolutely certain that the path actually exists. If it doesn't,
+ * the context is needed to pass a (failure) result code.
+ */
+
+// Waiter used by path_traverse() while a discover is outstanding.  A
+// negative result goes to `onfinish` when there is one; in every other case
+// `ondelay` is run (retry as usual).  Both contexts are deleted afterwards.
+class C_MDC_TraverseDiscover : public Context {
+  Context *onfinish, *ondelay;
+
+ public:
+  C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay)
+    : onfinish(onfinish),
+      ondelay(ondelay)
+  {}
+
+  void finish(int r) {
+    const bool report_failure = (r < 0) && onfinish;
+    if (!report_failure) {
+      ondelay->finish(r);     // retry as usual
+    } else {
+      onfinish->finish(r);    // ENOENT on discover, pass back to caller.
+    }
+    delete onfinish;
+    delete ondelay;
+  }
+};
+
+/*
+ * path_traverse: walk `origpath` down from the root, appending every dentry
+ * that is crossed to `trace`.
+ *
+ * Return value:
+ *   <0 : traverse error (-ENOTDIR, -ENOENT, -ELOOP)
+ *    0 : success
+ *    1 : delayed (ondelay was handed to a waiter / fetch / lock / discover)
+ *    2 : req was forwarded to another mds
+ *
+ * onfail selects what happens on a miss for a dentry we are not auth for:
+ *   MDS_TRAVERSE_DISCOVER / MDS_TRAVERSE_DISCOVERXLOCK : send a discover and
+ *       wait (these two modes also skip the hard-read and foreign-xlock
+ *       checks),
+ *   MDS_TRAVERSE_FORWARD : forward req to the dentry's authority,
+ *   MDS_TRAVERSE_FAIL    : fail with -ENOENT.
+ *
+ * Context ownership: both contexts are consumed on every path.  ondelay is
+ * either passed on (delayed) or deleted.  onfinish is finished with the
+ * result and deleted on success and on every error, deleted unfinished on a
+ * delay or forward, and handed to C_MDC_TraverseDiscover while a dentry
+ * discover is pending.
+ *
+ * is_client_req must be true iff req is an MClientRequest.
+ */
+int MDCache::path_traverse(filepath& origpath,
+                           vector<CDentry*>& trace,
+                           bool follow_trailing_symlink,
+                           Message *req,
+                           Context *ondelay,
+                           int onfail,
+                           Context *onfinish,
+                           bool is_client_req)  // true if req is MClientRequest .. gross, FIXME
+{
+  int whoami = mds->get_nodeid();
+  set< pair<CInode*, string> > symlinks_resolved; // keep a list of symlinks we touch to avoid loops
+
+  bool noperm = false;
+  if (onfail == MDS_TRAVERSE_DISCOVER ||
+      onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true;
+
+  // root
+  CInode *cur = get_root();
+  if (cur == NULL) {
+    dout(7) << "traverse: i don't have root" << endl;
+    open_root(ondelay);
+    if (onfinish) delete onfinish;
+    return 1;
+  }
+
+  // start trace
+  trace.clear();
+
+  // make our own copy, since we'll modify when we hit symlinks
+  filepath path = origpath;
+
+  unsigned depth = 0;
+  while (depth < path.depth()) {
+    dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl;
+
+    // ENOTDIR?
+    if (!cur->is_dir()) {
+      dout(7) << "traverse: " << *cur << " not a dir " << endl;
+      delete ondelay;
+      if (onfinish) {
+        onfinish->finish(-ENOTDIR);
+        delete onfinish;
+      }
+      return -ENOTDIR;
+    }
+
+    // open dir
+    if (!cur->dir) {
+      if (cur->dir_is_auth()) {
+        // parent dir frozen_dir?
+        if (cur->is_frozen_dir()) {
+          dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl;
+          cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
+          if (onfinish) delete onfinish;
+          return 1;
+        }
+
+        cur->get_or_open_dir(mds);
+        assert(cur->dir);
+      } else {
+        // discover dir from/via inode auth
+        assert(!cur->is_auth());
+        if (cur->waiting_for(CINODE_WAIT_DIR)) {
+          dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl;
+        } else {
+          filepath want = path.postfixpath(depth);
+          dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl;
+          mds->send_message_mds(new MDiscover(mds->get_nodeid(),
+                                              cur->ino(),
+                                              want,
+                                              true),  // need this dir too
+                                cur->authority(), MDS_PORT_CACHE);
+        }
+        cur->add_waiter(CINODE_WAIT_DIR, ondelay);
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+    }
+
+    // frozen?
+    /*
+    if (cur->dir->is_frozen()) {
+      // doh!
+      // FIXME: traverse is allowed?
+      dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl;
+      cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
+      if (onfinish) delete onfinish;
+      return 1;
+    }
+    */
+
+    // must read directory hard data (permissions, x bit) to traverse
+    if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) {
+      if (onfinish) delete onfinish;
+      return 1;
+    }
+
+    // check permissions?
+    // XXX
+
+    // ..?
+    if (path[depth] == "..") {
+      depth++;
+      if (trace.empty()) {
+        // Nothing has been descended into, so there is no dentry to pop and
+        // no parent to step back to: ".." resolves to where we already are.
+        // (Popping the empty trace here used to be undefined behaviour.)
+        dout(10) << "traverse: .. at top of trace, staying at " << *cur << endl;
+        continue;
+      }
+      trace.pop_back();
+      cur = cur->get_parent_inode();
+      dout(10) << "traverse: following .. back to " << *cur << endl;
+      continue;
+    }
+
+
+    // dentry
+    CDentry *dn = cur->dir->lookup(path[depth]);
+
+    // null and last_bit and xlocked by me?
+    if (dn && dn->is_null() &&
+        dn->is_xlockedbyme(req) &&
+        depth == path.depth()-1) {
+      dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl;
+      trace.push_back(dn);
+      break; // done!
+    }
+
+    if (dn && !dn->is_null()) {
+      // dentry exists.  xlocked?
+      if (!noperm && dn->is_xlockedbyother(req)) {
+        dout(10) << "traverse: xlocked dentry at " << *dn << endl;
+        cur->dir->add_waiter(CDIR_WAIT_DNREAD,
+                             path[depth],
+                             ondelay);
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+
+      // do we have inode?
+      if (!dn->inode) {
+        assert(dn->is_remote());
+        // do i have it?
+        CInode *in = get_inode(dn->get_remote_ino());
+        if (in) {
+          dout(7) << "linking in remote in " << *in << endl;
+          dn->link_remote(in);
+        } else {
+          dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl;
+          open_remote_ino(dn->get_remote_ino(), req,
+                          ondelay);
+          // delayed like every other wait above: onfinish is not carried
+          // along, so release it here instead of leaking it
+          if (onfinish) delete onfinish;
+          return 1;
+        }
+      }
+
+      // symlink?
+      if (dn->inode->is_symlink() &&
+          (follow_trailing_symlink || depth < path.depth()-1)) {
+        // symlink, resolve!
+        filepath sym = dn->inode->symlink;
+        dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl;
+
+        // break up path components
+        // /head/symlink/tail
+        filepath head = path.prefixpath(depth);
+        filepath tail = path.postfixpath(depth+1);
+        dout(10) << "traverse: path head = " << head << endl;
+        dout(10) << "traverse: path tail = " << tail << endl;
+
+        if (symlinks_resolved.count(pair<CInode*,string>(dn->inode, tail.get_path()))) {
+          dout(10) << "already hit this symlink, bailing to avoid the loop" << endl;
+          // report and clean up like the other error exits; this path used
+          // to return without touching either context
+          delete ondelay;
+          if (onfinish) {
+            onfinish->finish(-ELOOP);
+            delete onfinish;
+          }
+          return -ELOOP;
+        }
+        symlinks_resolved.insert(pair<CInode*,string>(dn->inode, tail.get_path()));
+
+        // start at root?
+        if (dn->inode->symlink[0] == '/') {
+          // absolute: restart at the root and walk <target>/<tail>.
+          // (Previously the target was dropped and the walk continued from
+          // the directory holding the symlink.)
+          // NOTE(review): relies on filepath ignoring the leading '/' of
+          // the target when splitting it into components -- confirm.
+          trace.clear();
+          depth = 0;
+          cur = get_root();
+          path = sym;
+          path.append(tail);
+          dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl;
+        } else {
+          // relative
+          path = head;
+          path.append(sym);
+          path.append(tail);
+          dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl;
+        }
+        continue;
+      } else {
+        // keep going.
+
+        // forwarder wants replicas?
+        if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) {
+          dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl;
+
+          if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() &&
+              cur->dir->is_auth() &&
+              cur->dir->is_rep() &&
+              cur->dir->is_open_by(req->get_source().num()) &&
+              dn->get_inode()->is_auth()
+              ) {
+            assert(req->get_source().is_mds());
+            int from = req->get_source().num();
+
+            if (dn->get_inode()->is_cached_by(from)) {
+              dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by "
+                       << req->get_source() << " dn " << *dn << endl;
+            } else {
+              dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl;
+              MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino());
+              reply->add_dentry( dn->get_name(), !dn->can_read());
+              reply->add_inode( dn->inode->replicate_to( from ) );
+              mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE);
+            }
+          }
+        }
+
+        trace.push_back(dn);
+        cur = dn->inode;
+        touch_inode(cur);
+        depth++;
+        continue;
+      }
+    }
+
+    // MISS.  don't have it.
+
+    int dauth = cur->dir->dentry_authority( path[depth] );
+    dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl;
+
+
+    if (dauth == whoami) {
+      // dentry is mine.
+      if (cur->dir->is_complete()) {
+        // file not found
+        delete ondelay;
+        if (onfinish) {
+          onfinish->finish(-ENOENT);
+          delete onfinish;
+        }
+        return -ENOENT;
+      } else {
+
+        //wrong?
+        //if (onfail == MDS_TRAVERSE_DISCOVER)
+        //  return -1;
+
+        // directory isn't complete; reload
+        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl;
+        touch_inode(cur);
+        mds->mdstore->fetch_dir(cur->dir, ondelay);
+
+        if (mds->logger) mds->logger->inc("cmiss");
+
+        if (onfinish) delete onfinish;
+        return 1;
+      }
+    } else {
+      // dentry is not mine.
+
+      /* no, let's let auth handle the discovery/replication ..
+      if (onfail == MDS_TRAVERSE_FORWARD &&
+          onfinish == 0 &&   // no funnyness
+          cur->dir->is_rep()) {
+        dout(5) << "trying to discover in popular dir " << *cur->dir << endl;
+        onfail = MDS_TRAVERSE_DISCOVER;
+      }
+      */
+
+      if ((onfail == MDS_TRAVERSE_DISCOVER ||
+           onfail == MDS_TRAVERSE_DISCOVERXLOCK)) {
+        // discover
+
+        filepath want = path.postfixpath(depth);
+        if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) {
+          dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
+        } else {
+          dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
+
+          touch_inode(cur);
+
+          mds->send_message_mds(new MDiscover(mds->get_nodeid(),
+                                              cur->ino(),
+                                              want,
+                                              false),
+                                dauth, MDS_PORT_CACHE);
+          if (mds->logger) mds->logger->inc("dis");
+        }
+
+        // delay processing of current request.
+        //  delay finish vs ondelay until result of traverse, so that ENOENT can be
+        //  passed to onfinish if necessary
+        cur->dir->add_waiter(CDIR_WAIT_DENTRY,
+                             path[depth],
+                             new C_MDC_TraverseDiscover(onfinish, ondelay));
+
+        if (mds->logger) mds->logger->inc("cmiss");
+        return 1;
+      }
+      if (onfail == MDS_TRAVERSE_FORWARD) {
+        // forward
+        dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl;
+
+        if (is_client_req && cur->dir->is_rep()) {
+          dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl;
+          ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino());
+          req->clear_payload();  // reencode!
+        }
+
+        mds->send_message_mds(req, dauth, req->get_dest_port());
+        //show_imports();
+
+        if (mds->logger) mds->logger->inc("cfw");
+        if (onfinish) delete onfinish;
+        delete ondelay;
+        return 2;
+      }
+      if (onfail == MDS_TRAVERSE_FAIL) {
+        delete ondelay;
+        if (onfinish) {
+          onfinish->finish(-ENOENT);  // -ENOENT, but only because i'm not the authority!
+          delete onfinish;
+        }
+        return -ENOENT;  // not necessarily exactly true....
+      }
+    }
+
+    assert(0);  // i shouldn't get here
+  }
+
+  // success.
+  delete ondelay;
+  if (onfinish) {
+    onfinish->finish(0);
+    delete onfinish;
+  }
+  return 0;
+}
+
+
+
+ // Ask the authority of `diri` to open its directory for us.
+ // Sends an MDiscover carrying an empty want-path with the "need the dir
+ // open" flag set, then queues `fin` as a CINODE_WAIT_DIR waiter on `diri`.
+ void MDCache::open_remote_dir(CInode *diri,
+                               Context *fin)
+ {
+   dout(10) << "open_remote_dir on " << *diri << endl;
+
+   // preconditions: a directory inode we are not auth for (inode or dir)
+   // whose dir is not open locally yet
+   assert(diri->is_dir());
+   assert(!diri->dir_is_auth());
+   assert(!diri->is_auth());
+   assert(diri->dir == 0);
+
+   // no dentries, i just want the dir open
+   filepath nothing_wanted;
+   MDiscover *dis = new MDiscover(mds->get_nodeid(),
+                                  diri->ino(),
+                                  nothing_wanted,
+                                  true);   // need the dir open
+   mds->send_message_mds(dis, diri->authority(), MDS_PORT_CACHE);
+
+   diri->add_waiter(CINODE_WAIT_DIR, fin);
+ }
+
+
+
+ // Completion context for the anchor lookup started by
+ // MDCache::open_remote_ino(). It carries the anchortrace vector given to
+ // the lookup and, on finish, passes it to MDCache::open_remote_ino_2().
+ class C_MDC_OpenRemoteInoLookup : public Context {
+   MDCache *mdc;
+   inodeno_t ino;
+   Message *req;
+   Context *onfinish;
+ public:
+   vector<Anchor*> anchortrace;
+   C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish)
+     : mdc(mdc), ino(ino), req(req), onfinish(onfinish) { }
+   void finish(int r) {
+     assert(r == 0);
+     if (r != 0) {
+       // only reachable when asserts are compiled out: pass the error on
+       onfinish->finish(r);
+       delete onfinish;
+       return;
+     }
+     mdc->open_remote_ino_2(ino, req, anchortrace, onfinish);
+   }
+ };
+
+ // Start opening an inode known only by number: ask the anchor client to
+ // look up `ino`. The C_MDC_OpenRemoteInoLookup context holds the trace
+ // vector passed to the lookup and continues in open_remote_ino_2().
+ void MDCache::open_remote_ino(inodeno_t ino,
+                               Message *req,
+                               Context *onfinish)
+ {
+   dout(7) << "open_remote_ino on " << ino << endl;
+
+   C_MDC_OpenRemoteInoLookup *lookup_done =
+     new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish);
+   vector<Anchor*>& trace_out = lookup_done->anchortrace;
+   mds->anchorclient->lookup(ino, trace_out, lookup_done);
+ }
+
+ // Forwards a completion to `fin`, taking ownership of it only when fired.
+ // path_traverse() deletes its delay context on the immediate-return paths
+ // (success and -ENOENT both do `delete ondelay`). Handing it this wrapper
+ // instead of the caller's context keeps the caller's context alive so
+ // open_remote_ino_2() can still complete it directly.
+ class C_MDC_OpenRemoteInoDelay : public Context {
+   Context *fin;
+ public:
+   C_MDC_OpenRemoteInoDelay(Context *f) : fin(f) { }
+   void finish(int r) {
+     fin->finish(r);
+     delete fin;
+   }
+ };
+
+ // Second half of open_remote_ino(): build a path from the anchor trace
+ // (one dentry per anchor, via Anchor::ref_dn) and traverse it in
+ // MDS_TRAVERSE_DISCOVER mode.
+ //
+ //  - r > 0: the traverse was delayed; `onfinish` is completed later through
+ //    the wrapper context registered with path_traverse().
+ //  - r <= 0: the traverse ended immediately; `onfinish` is finished with r
+ //    and deleted here.
+ void MDCache::open_remote_ino_2(inodeno_t ino,
+                                 Message *req,
+                                 vector<Anchor*>& anchortrace,
+                                 Context *onfinish)
+ {
+   dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl;
+
+   // construct path
+   filepath path;
+   for (unsigned i=0; i<anchortrace.size(); i++)
+     path.add_dentry(anchortrace[i]->ref_dn);
+
+   dout(7) << " path is " << path << endl;
+
+   vector<CDentry*> trace;
+   int r = path_traverse(path, trace, false,
+                         req,
+                         new C_MDC_OpenRemoteInoDelay(onfinish),  // delay actually
+                         MDS_TRAVERSE_DISCOVER);
+   if (r > 0) return;   // delayed; the wrapper completes onfinish later
+
+   // path_traverse has already disposed of the wrapper; onfinish itself is
+   // still valid and is completed exactly once, here.
+   onfinish->finish(r);
+   delete onfinish;
+ }
+
+
+
+
+// path pins
+
+ // Pin every dentry in `trace` on behalf of message `m`, all or nothing.
+ // Returns true once all are pinned (and deletes `c`). Returns false as soon
+ // as one dentry is not pinnable; in that case `c`, if non-null, is queued on
+ // that dentry's dir as a CDIR_WAIT_DNPINNABLE waiter and nothing is pinned.
+ bool MDCache::path_pin(vector<CDentry*>& trace,
+                        Message *m,
+                        Context *c)
+ {
+   const unsigned n = trace.size();
+
+   // pass 1: verify everything is pinnable
+   for (unsigned i = 0; i < n; i++) {
+     CDentry *dn = trace[i];
+     if (dn->is_pinnable(m)) continue;
+
+     if (!c) {
+       dout(10) << "path_pin can't pin, no waiter, failing." << endl;
+     } else {
+       dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl;
+       dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE,
+                           dn->name,
+                           c);
+     }
+     return false;
+   }
+
+   // pass 2: pin!
+   for (unsigned i = 0; i < n; i++) {
+     CDentry *dn = trace[i];
+     dn->pin(m);
+     dout(11) << "path_pinned " << *dn << endl;
+   }
+
+   delete c;
+   return true;
+ }
+
+
+ // Drop message `m`'s pin on every dentry in `trace`. When a dentry in the
+ // DN_LOCK_UNPINNING state loses its last pin, it is put back to
+ // DN_LOCK_SYNC and its CDIR_WAIT_DNUNPINNED waiters are run immediately.
+ void MDCache::path_unpin(vector<CDentry*>& trace,
+                          Message *m)
+ {
+   for (unsigned i = 0; i < trace.size(); i++) {
+     CDentry *dn = trace[i];
+     dn->unpin(m);
+     dout(11) << "path_unpinned " << *dn << endl;
+
+     // did we completely unpin a waiter?
+     if (dn->lockstate != DN_LOCK_UNPINNING) continue;
+     if (dn->is_pinned()) continue;
+
+     // return state to sync, in case the unpinner flails
+     dn->lockstate = DN_LOCK_SYNC;
+
+     // run finisher right now to give them a fair shot.
+     dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name);
+   }
+ }
+
+
+ // Append to `trace` the parent dentry of each inode on the way to `in`,
+ // outermost ancestor first. An inode without a parent inode adds nothing.
+ void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
+ {
+   CInode *parent = in->get_parent_inode();
+   if (!parent) return;
+
+   // ancestors first, then our own dentry
+   make_trace(trace, parent);
+
+   CDentry *parent_dn = in->get_parent_dn();
+   dout(15) << "make_trace adding " << *parent_dn << endl;
+   trace.push_back(parent_dn);
+ }
+
+
+ // Register `req` as an active request referencing inode `ref`.
+ // A non-empty `trace` is path-pinned first; if that fails a
+ // C_MDS_RetryMessage is left waiting and false is returned without
+ // registering anything. On success the trace is stored keyed by its last
+ // dentry, `ref` gets a request pin, and true is returned.
+ bool MDCache::request_start(Message *req,
+                             CInode *ref,
+                             vector<CDentry*>& trace)
+ {
+   assert(active_requests.count(req) == 0);
+
+   const bool have_trace = trace.size();
+
+   // pin path
+   if (have_trace &&
+       !path_pin(trace, req, new C_MDS_RetryMessage(mds,req)))
+     return false;
+
+   dout(7) << "request_start " << *req << endl;
+
+   // add to map
+   active_requests[req].ref = ref;
+   if (have_trace)
+     active_requests[req].traces[trace.back()] = trace;
+
+   // request pins
+   request_pin_inode(req, ref);
+
+   if (mds->logger)
+     mds->logger->inc("req");
+
+   return true;
+ }
+
+
+ // Take a request pin on `in` for `req`, at most once per (req, inode) pair.
+ void MDCache::request_pin_inode(Message *req, CInode *in)
+ {
+   // already pinned by this request?
+   if (active_requests[req].request_pins.count(in) != 0)
+     return;
+
+   in->request_pin_get();
+   active_requests[req].request_pins.insert(in);
+ }
+
+ // Take a request pin on `dir` for `req`, at most once per (req, dir) pair.
+ void MDCache::request_pin_dir(Message *req, CDir *dir)
+ {
+   // already pinned by this request?
+   if (active_requests[req].request_dir_pins.count(dir) != 0)
+     return;
+
+   dir->request_pin_get();
+   active_requests[req].request_dir_pins.insert(dir);
+ }
+
+
+ void MDCache::request_cleanup(Message *req) // Undo the per-request state recorded in active_requests[req], then erase the entry.
+ { // Order: leftover xlocks, foreign xlocks, path pins, inode/dir request pins, map erase, stats logging.
+ assert(active_requests.count(req) == 1);
+
+ // leftover xlocks?
+ if (active_requests[req].xlocks.size()) {
+ set<CDentry*> dns = active_requests[req].xlocks; // work on a copy; the assert after the loop expects the live set to have been emptied
+
+ for (set<CDentry*>::iterator it = dns.begin();
+ it != dns.end();
+ it++) {
+ CDentry *dn = *it;
+
+ dout(7) << "request_cleanup leftover xlock " << *dn << endl;
+
+ mds->locker->dentry_xlock_finish(dn); // presumably also drops dn from active_requests[req].xlocks (see assert below) — confirm in Locker
+
+ // queue finishers
+ dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue);
+
+ // remove clean, null dentry? (from a failed rename or whatever)
+ if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) {
+ dn->dir->remove_dentry(dn);
+ }
+ }
+
+ assert(active_requests[req].xlocks.empty()); // we just finished them
+ }
+
+ // foreign xlocks? send a LOCK_AC_UNXLOCK to each dentry's authority
+ if (active_requests[req].foreign_xlocks.size()) {
+ set<CDentry*> dns = active_requests[req].foreign_xlocks;
+ active_requests[req].foreign_xlocks.clear();
+
+ for (set<CDentry*>::iterator it = dns.begin();
+ it != dns.end();
+ it++) {
+ CDentry *dn = *it;
+
+ dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl;
+ assert(dn->is_xlocked());
+ int dauth = dn->dir->dentry_authority(dn->name);
+ MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid());
+ m->set_dn(dn->dir->ino(), dn->name);
+ mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
+ }
+ }
+
+ // unpin paths (every trace stored for this request)
+ for (map< CDentry*, vector<CDentry*> >::iterator it = active_requests[req].traces.begin();
+ it != active_requests[req].traces.end();
+ it++) {
+ path_unpin(it->second, req);
+ }
+
+ // request pins: inodes first, then dirs
+ for (set<CInode*>::iterator it = active_requests[req].request_pins.begin();
+ it != active_requests[req].request_pins.end();
+ it++) {
+ (*it)->request_pin_put();
+ }
+ for (set<CDir*>::iterator it = active_requests[req].request_dir_pins.begin();
+ it != active_requests[req].request_dir_pins.end();
+ it++) {
+ (*it)->request_pin_put();
+ }
+
+ // remove from map
+ active_requests.erase(req);
+
+
+ // log some stats ***** (cache LRU counters)
+ if (mds->logger) {
+ mds->logger->set("c", lru.lru_get_size());
+ mds->logger->set("cpin", lru.lru_get_num_pinned());
+ mds->logger->set("ctop", lru.lru_get_top());
+ mds->logger->set("cbot", lru.lru_get_bot());
+ mds->logger->set("cptail", lru.lru_get_pintail());
+ //mds->logger->set("buf",buffer_total_alloc);
+ }
+
+ if (g_conf.log_pins) {
+ // pin: one counter per inode pin type, written to logger2
+ for (int i=0; i<CINODE_NUM_PINS; i++) {
+ if (mds->logger2) mds->logger2->set(cinode_pin_names[i],
+ cinode_pins[i]);
+ }
+ /*
+ for (map<int,int>::iterator it = cdir_pins.begin();
+ it != cdir_pins.end();
+ it++) {
+ //string s = "D";
+ //s += cdir_pin_names[it->first];
+ if (mds->logger2) mds->logger2->set(//s,
+ cdir_pin_names[it->first],
+ it->second);
+ }
+ */
+ }
+
+ }
+
+ // Finish an active request: release its state via request_cleanup(),
+ // free the request message, and count a "reply" in the logger if present.
+ void MDCache::request_finish(Message *req)
+ {
+   dout(7) << "request_finish " << *req << endl;
+
+   request_cleanup(req);
+   delete req;   // the message is freed here
+
+   if (mds->logger)
+     mds->logger->inc("reply");
+ }
+
+
+ // Hand an active request over to mds `who`: release its local state via
+ // request_cleanup(), then send the message on. A `port` of 0 selects
+ // MDS_PORT_SERVER.
+ void MDCache::request_forward(Message *req, int who, int port)
+ {
+   const int dest_port = port ? port : MDS_PORT_SERVER;
+
+   dout(7) << "request_forward to " << who << " req " << *req << endl;
+
+   request_cleanup(req);
+   mds->send_message_mds(req, who, dest_port);
+
+   if (mds->logger)
+     mds->logger->inc("fw");
+ }
+
+
+
+// ANCHORS
+
+ // Completion context for the anchor create issued by
+ // MDCache::anchor_inode(). On success (r == 0) the inode is flagged
+ // anchored, leaves the ANCHORING state, drops the ANCHORING pin and is
+ // marked dirty. In every case the CINODE_WAIT_ANCHORED waiters are
+ // finished with r.
+ class C_MDC_AnchorInode : public Context {
+   CInode *in;
+
+ public:
+   C_MDC_AnchorInode(CInode *in) : in(in) { }
+
+   void finish(int r) {
+     const bool created = (r == 0);
+     if (created) {
+       assert(in->inode.anchored == false);
+       in->inode.anchored = true;
+
+       in->state_clear(CINODE_STATE_ANCHORING);
+       in->put(CINODE_PIN_ANCHORING);
+
+       in->mark_dirty();
+     }
+
+     // trigger
+     in->finish_waiting(CINODE_WAIT_ANCHORED, r);
+   }
+ };
+
+ // Create an anchor for `in` (we must be its authority) and queue
+ // `onfinish` as a CINODE_WAIT_ANCHORED waiter. If an anchor create is
+ // already in flight for this inode, only the waiter is added.
+ void MDCache::anchor_inode(CInode *in, Context *onfinish)
+ {
+   assert(in->is_auth());
+
+   // already anchoring? then just wait with the others
+   if (in->state_test(CINODE_STATE_ANCHORING)) {
+     dout(7) << "anchor_inode already anchoring " << *in << endl;
+     in->add_waiter(CINODE_WAIT_ANCHORED,
+                    onfinish);
+     return;
+   }
+
+   dout(7) << "anchor_inode anchoring " << *in << endl;
+
+   // auth: flag and pin the inode while the create is outstanding
+   in->state_set(CINODE_STATE_ANCHORING);
+   in->get(CINODE_PIN_ANCHORING);
+
+   // wait
+   in->add_waiter(CINODE_WAIT_ANCHORED,
+                  onfinish);
+
+   // make trace
+   vector<Anchor*> anchor_path;
+   in->make_anchor_trace(anchor_path);
+
+   // do it
+   C_MDC_AnchorInode *created = new C_MDC_AnchorInode( in );
+   mds->anchorclient->create(in->ino(), anchor_path, created);
+ }
+
+
+ // Handle a request (MInodeLink) to add a hard link to an inode.
+ //  - Not the inode's authority: forward the message to the authority.
+ //  - Inode not anchored yet: start anchoring it and retry this message
+ //    once that completes.
+ //  - Otherwise: bump nlink once, mark the inode dirty, send an
+ //    MInodeLinkAck(success) to m->get_from(), and free the message.
+ void MDCache::handle_inode_link(MInodeLink *m)
+ {
+   CInode *in = get_inode(m->get_ino());
+   assert(in);
+
+   if (!in->is_auth()) {
+     assert(in->is_proxy());
+     dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl;
+     mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+     return;
+   }
+
+   dout(7) << "handle_inode_link on " << *in << endl;
+
+   if (!in->is_anchored()) {
+     assert(in->inode.nlink == 1);
+     dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl;
+
+     anchor_inode(in,
+                  new C_MDS_RetryMessage(mds, m));
+     return;
+   }
+
+   in->inode.nlink++;
+   in->mark_dirty();
+
+   // reply
+   // The log line must only read nlink: incrementing it here too would bump
+   // the link count a second time whenever this debug level is active.
+   dout(7) << " nlink++, now " << in->inode.nlink << endl;
+
+   mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE);
+   delete m;
+ }
+
+
+ // Handle the acknowledgement of an inode link request: finish the inode's
+ // CINODE_WAIT_LINK waiters with 1 on success or -1 on failure, then free
+ // the message (it is not handed to anyone else).
+ void MDCache::handle_inode_link_ack(MInodeLinkAck *m)
+ {
+   CInode *in = get_inode(m->get_ino());
+   assert(in);
+
+   dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl;
+   in->finish_waiting(CINODE_WAIT_LINK,
+                      m->is_success() ? 1:-1);
+
+   // done
+   delete m;
+ }
+
+
+
+// REPLICAS
+
+
+ void MDCache::handle_discover(MDiscover *dis) // Answer a discover request from another mds by building an MDiscoverReply of dirs, dentries and inodes along the wanted path.
+ { // Exits: drop (bounced to self), forward to another mds, wait and retry (frozen dir / dir fetch), or send a reply to the asker.
+ int whoami = mds->get_nodeid();
+
+ // from me to me?
+ if (dis->get_asker() == whoami) {
+ dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." << endl;
+ delete dis;
+ return;
+ }
+
+ CInode *cur = 0;
+ MDiscoverReply *reply = 0;
+ //filepath fullpath;
+
+ // get started.
+ if (dis->get_base_ino() == 0) {
+ // wants root
+ dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl;
+
+ assert(mds->get_nodeid() == 0);
+ assert(root->is_auth());
+
+ //fullpath = dis->get_want();
+
+
+ // add root
+ reply = new MDiscoverReply(0);
+ reply->add_inode( root->replicate_to( dis->get_asker() ) );
+ dout(10) << "added root " << *root << endl;
+
+ cur = root;
+
+ } else {
+ // there's a base inode
+ cur = get_inode(dis->get_base_ino());
+ assert(cur);
+
+ if (dis->wants_base_dir()) {
+ dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl;
+ } else {
+ dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl; // NOTE(review): *cur->dir is dereferenced before the !cur->dir checks below — confirm the dir is always open on this path
+ }
+
+ assert(cur->is_dir());
+
+ // craziness? no dir open here and we are not the inode auth: pass the discover on
+ if (!cur->dir && !cur->is_auth()) {
+ int iauth = cur->authority();
+ dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl;
+ mds->send_message_mds( dis, iauth, MDS_PORT_CACHE);
+ return;
+ }
+
+ // frozen_dir? then retry this message once the parent dir unfreezes
+ if (!cur->dir && cur->is_frozen_dir()) {
+ dout(7) << "is frozen_dir, waiting" << endl;
+ cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryMessage(mds, dis));
+ return;
+ }
+
+ if (!cur->dir)
+ cur->get_or_open_dir(mds);
+ assert(cur->dir);
+
+ dout(10) << "dir is " << *cur->dir << endl;
+
+ // create reply
+ reply = new MDiscoverReply(cur->ino());
+ }
+
+ assert(reply);
+ assert(cur);
+
+ /*
+ // first traverse and make sure we won't have to do any waiting
+ dout(10) << "traversing full discover path = " << fullpath << endl;
+ vector<CInode*> trav;
+ int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL);
+ if (r > 0)
+ return; // fw or delay
+ dout(10) << "traverse finish w/o blocking, continuing" << endl;
+ // ok, now we know we won't block on dentry locks or readdir.
+ */
+
+
+ // add content
+ // do some fidgeting to include a dir if they asked for the base dir, or just root.
+ for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { // a zero-depth want still runs the body once; see the explicit break below
+ // add dir
+ if (reply->is_empty() && !dis->wants_base_dir()) {
+ dout(7) << "they don't want the base dir" << endl;
+ } else {
+ // is it actually a dir at all?
+ if (!cur->is_dir()) {
+ dout(7) << "not a dir " << *cur << endl;
+ reply->set_flag_error_dir();
+ break;
+ }
+
+ // add dir
+ if (!cur->dir_is_auth()) {
+ dout(7) << *cur << " dir auth is someone else, i'm done" << endl;
+ break;
+ }
+
+ // did we hit a frozen_dir?
+ if (!cur->dir && cur->is_frozen_dir()) {
+ dout(7) << *cur << " is frozen_dir, stopping" << endl;
+ break;
+ }
+
+ if (!cur->dir) cur->get_or_open_dir(mds);
+
+ reply->add_dir( new CDirDiscover( cur->dir,
+ cur->dir->open_by_add( dis->get_asker() ) ) );
+ dout(7) << "added dir " << *cur->dir << endl;
+ }
+ if (dis->get_want().depth() == 0) break;
+
+ // lookup dentry
+ int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) );
+ if (dentry_auth != mds->get_nodeid()) {
+ dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl;
+ break; // that's it for us!
+ }
+
+ // get inode
+ CDentry *dn = cur->dir->lookup( dis->get_dentry(i) );
+
+ /*
+ if (dn && !dn->can_read()) { // xlocked?
+ dout(7) << "waiting on " << *dn << endl;
+ cur->dir->add_waiter(CDIR_WAIT_DNREAD,
+ dn->name,
+ new C_MDS_RetryMessage(mds, dis));
+ return;
+ }
+ */
+
+ if (dn) {
+ if (!dn->inode && dn->is_sync()) {
+ dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl;
+ reply->set_flag_error_dn( dis->get_dentry(i) );
+ break; // don't replicate null but non-locked dentries.
+ }
+
+ reply->add_dentry( dis->get_dentry(i), !dn->can_read() ); // second argument flags a dentry that cannot currently be read
+ dout(7) << "added dentry " << *dn << endl;
+
+ if (!dn->inode) break; // we're done.
+ }
+
+ if (dn && dn->inode) {
+ CInode *next = dn->inode;
+ assert(next->is_auth());
+
+ // add inode
+ //int nonce = next->cached_by_add(dis->get_asker());
+ reply->add_inode( next->replicate_to( dis->get_asker() ) );
+ dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl;
+
+ // descend
+ cur = next;
+ } else {
+ // don't have inode?
+ if (cur->dir->is_complete()) {
+ // set error flag in reply
+ dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl;
+ reply->set_flag_error_dn( dis->get_dentry(i) );
+ break;
+ } else {
+ // readdir
+ dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl;
+
+ //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis));
+ //break; // send what we have so far
+
+ mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis));
+ return; // NOTE(review): `reply` built so far is not deleted on this path — confirm whether that is intended
+ }
+ }
+ }
+
+ // how did we do.
+ if (reply->is_empty()) {
+
+ // discard empty reply
+ delete reply;
+
+ if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) &&
+ !cur->dir->is_auth()) {
+ // fwd to dir auth
+ int dirauth = cur->dir->authority();
+ if (dirauth == dis->get_asker()) {
+ dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right?
+ //assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it.
+ delete dis;
+ } else {
+ dout(7) << "fwd to dir auth " << dirauth << endl;
+ mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE );
+ }
+ return;
+ }
+
+ dout(7) << "i'm not auth or proxy, dropping (this empty reply). i bet i just exported." << endl;
+ //assert(0);
+
+ } else {
+ // send back to asker
+ dout(7) << "sending result back to asker mds" << dis->get_asker() << endl;
+ mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
+ }
+
+ // done.
+ delete dis;
+ }
+
+
+ void MDCache::handle_discover_reply(MDiscoverReply *m) // Merge the dirs, dentries and inodes carried by a discover reply into the local cache as replicas, then wake the matching waiters.
+ { // Waiters gathered in `finished` are queued on the mds; those in `error` are finished directly with -ENOENT.
+ // starting point
+ CInode *cur;
+ list<Context*> finished, error;
+
+ if (m->has_root()) {
+ // nowhere!
+ dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl;
+ assert(!root);
+ assert(m->get_base_ino() == 0);
+ assert(!m->has_base_dentry());
+ assert(!m->has_base_dir());
+
+ // add in root
+ cur = new CInode(this, false);
+
+ m->get_inode(0).update_inode(cur);
+
+ // root
+ set_root( cur );
+ add_inode( cur );
+ dout(7) << " got root: " << *cur << endl;
+
+ // take waiters
+ finished.swap(waiting_for_root);
+ } else {
+ // grab inode
+ cur = get_inode(m->get_base_ino());
+
+ if (!cur) {
+ dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl;
+ delete m;
+ return;
+ }
+
+ dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl;
+ }
+
+ // fyi
+ if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl;
+ if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl;
+ dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl;
+
+ // loop over discover results.
+ // indexes follow each ([[dir] dentry] inode)
+ // can start, end with any type.
+ // (when the reply carries root, index 0 was consumed above, so the loop starts at 1)
+ for (int i=m->has_root(); i<m->get_depth(); i++) {
+ dout(10) << "discover_reply i=" << i << " cur " << *cur << endl;
+
+ // dir
+ if ((i > 0) ||
+ (i == 0 && m->has_base_dir())) {
+ if (cur->dir) {
+ // had it
+ /* this is strange, but it happens when:
+ we discover multiple dentries under a dir.
+ bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one).
+ this is actually good, since (dir aside) they're asking for different information.
+ */
+ dout(7) << "had " << *cur->dir;
+ m->get_dir(i).update_dir(cur->dir);
+ dout2(7) << ", now " << *cur->dir << endl;
+ } else {
+ // add it (_replica_)
+ cur->set_dir( new CDir(cur, mds, false) );
+ m->get_dir(i).update_dir(cur->dir);
+ dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl;
+
+ // get waiters
+ cur->take_waiting(CINODE_WAIT_DIR, finished);
+ }
+ }
+
+ // dentry error? (only checked on the last index of the reply)
+ if (i == m->get_depth()-1 &&
+ m->is_flag_error_dn()) {
+ // error!
+ assert(cur->is_dir());
+ if (cur->dir) {
+ dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl;
+ cur->dir->take_waiting(CDIR_WAIT_DENTRY,
+ m->get_error_dentry(),
+ error);
+ } else {
+ dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl;
+ cur->take_waiting(CINODE_WAIT_DIR, error);
+ }
+ break;
+ }
+
+ if (i >= m->get_num_dentries()) break;
+
+ // dentry
+ dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl;
+
+ CDentry *dn = 0;
+ if (i > 0 ||
+ m->has_base_dentry()) {
+ dn = cur->dir->lookup( m->get_dentry(i) );
+
+ if (dn) {
+ dout(7) << "had " << *dn << endl;
+ } else {
+ dn = cur->dir->add_dentry( m->get_dentry(i) );
+ if (m->get_dentry_xlock(i)) {
+ dout(7) << " new dentry is xlock " << *dn << endl;
+ dn->lockstate = DN_LOCK_XLOCK;
+ dn->xlockedby = 0;
+ }
+ dout(7) << "added " << *dn << endl;
+ }
+
+ cur->dir->take_waiting(CDIR_WAIT_DENTRY,
+ m->get_dentry(i),
+ finished);
+ }
+
+ if (i >= m->get_num_inodes()) break;
+
+ // inode
+ dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl;
+ CInode *in = get_inode( m->get_inode(i).get_ino() );
+ assert(dn); // an inode in the reply must come with a dentry to link it under
+
+ if (in) {
+ dout(7) << "had " << *in << endl;
+
+ // fix nonce
+ dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl;
+ in->replica_nonce = m->get_inode(i).get_replica_nonce();
+
+ if (dn && in != dn->inode) {
+ dout(7) << " but it's not linked via dentry " << *dn << endl;
+ // link
+ if (dn->inode) {
+ dout(7) << "dentry WAS linked to " << *dn->inode << endl;
+ assert(0); // WTF.
+ }
+ dn->dir->link_inode(dn, in);
+ }
+ }
+ else {
+ assert(dn->inode == 0); // better not be something else linked to this dentry...
+
+ // didn't have it.
+ in = new CInode(this, false);
+
+ m->get_inode(i).update_inode(in);
+
+ // link in
+ add_inode( in );
+ dn->dir->link_inode(dn, in);
+
+ dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl;
+ }
+
+ // onward!
+ cur = in;
+ }
+
+ // dir error at the end there?
+ if (m->is_flag_error_dir()) {
+ dout(7) << " flag_error on dir " << *cur << endl;
+ assert(!cur->is_dir());
+ cur->take_waiting(CINODE_WAIT_DIR, error);
+ }
+
+ // finish errors directly
+ finish_contexts(error, -ENOENT);
+
+ mds->queue_finished(finished);
+
+ // done
+ delete m;
+ }
+
+
+
+
+
+
+
+
+/*
+int MDCache::send_inode_updates(CInode *in)
+{
+ assert(in->is_auth());
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ dout(7) << "sending inode_update on " << *in << " to " << *it << endl;
+ assert(*it != mds->get_nodeid());
+ mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE);
+ }
+
+ return 0;
+}
+
+
+void MDCache::handle_inode_update(MInodeUpdate *m)
+{
+ inodeno_t ino = m->get_ino();
+ CInode *in = get_inode(m->get_ino());
+ if (!in) {
+ //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl;
+ dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl;
+ MCacheExpire *expire = new MCacheExpire(mds->get_nodeid());
+ expire->add_inode(m->get_ino(), m->get_nonce());
+ mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE);
+ goto out;
+ }
+
+ if (in->is_auth()) {
+ dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl;
+ assert(0); // this should never happen
+ }
+
+ dout(7) << "inode_update on " << *in << endl;
+
+ // update! NOTE dir_auth is unaffected by this.
+ in->decode_basic_state(m->get_payload());
+
+ out:
+ // done
+ delete m;
+}
+*/
+
+
+
+ // Process a cache-expire notice: mds `from` no longer caches the listed
+ // inodes and no longer has the listed dirs open.
+ //
+ // For every listed inode / dir:
+ //  - if we are not its authority, the entry is collected per current
+ //    authority and forwarded there in a fresh MCacheExpire (we are expected
+ //    to be a proxy for it);
+ //  - if the notice originated from ourselves it is ignored;
+ //  - if the nonce matches the one we recorded for `from`, the replica is
+ //    removed (for inodes this also erases its wanted caps, removes it from
+ //    any lock gather set, and re-evaluates the locks);
+ //  - otherwise the notice carries an older nonce and is dropped.
+ //
+ // The message is deleted before returning.
+ void MDCache::handle_cache_expire(MCacheExpire *m)
+ {
+   int from = m->get_from();
+   int source = m->get_source().num();
+   map<int, MCacheExpire*> proxymap;   // authority -> expire message to forward there
+
+   if (m->get_from() == source) {
+     dout(7) << "cache_expire from " << from << endl;
+   } else {
+     dout(7) << "cache_expire from " << from << " via " << source << endl;
+   }
+
+   // inodes
+   for (map<inodeno_t,int>::iterator it = m->get_inodes().begin();
+        it != m->get_inodes().end();
+        it++) {
+     CInode *in = get_inode(it->first);
+     int nonce = it->second;
+
+     if (!in) {
+       dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl;
+       assert(in);  // i should be authority, or proxy .. and pinned
+     }
+     if (!in->is_auth()) {
+       int newauth = in->authority();
+       dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl;
+       assert(newauth >= 0);
+       if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
+       assert(in->state_test(CINODE_STATE_PROXY));
+       if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+       proxymap[newauth]->add_inode(it->first, it->second);
+       continue;
+     }
+
+     // check nonce
+     if (from == mds->get_nodeid()) {
+       // my cache_expire, and the export_dir giving auth back to me crossed paths!
+       // we can ignore this. no danger of confusion since the two parties are both me.
+       dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl;
+     }
+     else if (nonce == in->get_cached_by_nonce(from)) {
+       // remove from our cached_by
+       dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl;
+       in->cached_by_remove(from);
+       in->mds_caps_wanted.erase(from);
+
+       // note: this code calls _eval more often than it needs to!
+       // fix lock
+       if (in->hardlock.is_gathering(from)) {
+         in->hardlock.gather_set.erase(from);
+         if (in->hardlock.gather_set.size() == 0)
+           mds->locker->inode_hard_eval(in);
+       }
+       if (in->filelock.is_gathering(from)) {
+         in->filelock.gather_set.erase(from);
+         if (in->filelock.gather_set.size() == 0)
+           mds->locker->inode_file_eval(in);
+       }
+
+       // alone now?
+       if (!in->is_cached_by_anyone()) {
+         mds->locker->inode_hard_eval(in);
+         mds->locker->inode_file_eval(in);
+       }
+
+     }
+     else {
+       // this is an old nonce, ignore expire.
+       dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl;
+       assert(in->get_cached_by_nonce(from) > nonce);
+     }
+   }
+
+   // dirs
+   for (map<inodeno_t,int>::iterator it = m->get_dirs().begin();
+        it != m->get_dirs().end();
+        it++) {
+     CInode *diri = get_inode(it->first);
+     // a missing inode must reach the diagnostic + assert below instead of
+     // being dereferenced here
+     CDir *dir = diri ? diri->dir : 0;
+     int nonce = it->second;
+
+     if (!dir) {
+       dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
+       assert(dir);  // i should be authority, or proxy ... and pinned
+     }
+     if (!dir->is_auth()) {
+       int newauth = dir->authority();
+       dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl;
+       if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl;
+       assert(dir->is_proxy());
+       assert(newauth >= 0);
+       assert(dir->state_test(CDIR_STATE_PROXY));
+       if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+       proxymap[newauth]->add_dir(it->first, it->second);
+       continue;
+     }
+
+     // check nonce
+     if (from == mds->get_nodeid()) {
+       dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME! ignoring" << endl;
+     }
+     else if (nonce == dir->get_open_by_nonce(from)) {
+       // remove from our open_by
+       dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl;
+       dir->open_by_remove(from);
+     }
+     else {
+       // this is an old nonce, ignore expire.
+       dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl;
+       assert(dir->get_open_by_nonce(from) > nonce);
+     }
+   }
+
+   // send proxy forwards
+   for (map<int, MCacheExpire*>::iterator it = proxymap.begin();
+        it != proxymap.end();
+        it++) {
+     dout(7) << "sending proxy forward to " << it->first << endl;
+     mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
+   }
+
+   // done
+   delete m;
+ }
+
+
+
+ // Send an MDirUpdate (dir_rep, dir_rep_by and the dir's path) about `dir`
+ // to the mds nodes in dir->open_by or, when `bcast` is set, to every mds in
+ // the mds map. We never send to ourselves. Always returns 0.
+ int MDCache::send_dir_updates(CDir *dir, bool bcast)
+ {
+   // this is an FYI, re: replication
+
+   set<int> targets = dir->open_by;
+   if (bcast)
+     targets = mds->get_mds_map()->get_mds();
+
+   dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << targets << endl;
+
+   string path;
+   dir->inode->make_path(path);
+
+   const int whoami = mds->get_nodeid();
+   set<int>::iterator p = targets.begin();
+   while (p != targets.end()) {
+     const int dest = *p;
+     p++;
+     if (dest == whoami) continue;   // skip myself
+
+     dout(7) << "sending dir_update on " << *dir << " to " << dest << endl;
+
+     MDirUpdate *update = new MDirUpdate(dir->ino(),
+                                         dir->dir_rep,
+                                         dir->dir_rep_by,
+                                         path,
+                                         bcast);
+     mds->send_message_mds(update, dest, MDS_PORT_CACHE);
+   }
+
+   return 0;
+ }
+
+
+ // Apply a dir-update notice (dir_rep / dir_rep_by) to a dir we have open.
+ // If we do not have the dir and the message asks for it, try once to
+ // discover the dir's path and retry the message; otherwise drop it.
+ void MDCache::handle_dir_update(MDirUpdate *m)
+ {
+   CInode *in = get_inode(m->get_ino());
+
+   if (in && in->dir) {
+     // update
+     dout(5) << "dir_update on " << *in->dir << endl;
+     in->dir->dir_rep = m->get_dir_rep();
+     in->dir->dir_rep_by = m->get_dir_rep_by();
+
+     // done
+     delete m;
+     return;
+   }
+
+   dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl;
+
+   // discover it?
+   if (m->should_discover()) {
+     m->tried_discover();  // only once!
+     vector<CDentry*> trace;
+     filepath path = m->get_path();
+
+     dout(5) << "trying discover on dir_update for " << path << endl;
+
+     int r = path_traverse(path, trace, true,
+                           m, new C_MDS_RetryMessage(mds, m),
+                           MDS_TRAVERSE_DISCOVER);
+     if (r > 0)
+       return;   // delayed; the message is retried later
+     if (r == 0) {
+       // whole path is cached; ask for the dir itself and retry
+       assert(in);
+       open_remote_dir(in, new C_MDS_RetryMessage(mds, m));
+       return;
+     }
+     assert(0);
+   }
+
+   // done
+   delete m;
+ }
+
+
+
+
+
+ // Completion context that resumes an unlink: once finished (r must be 0)
+ // it calls MDCache::dentry_unlink_finish(dn, dir, c).
+ class C_MDC_DentryUnlink : public Context {
+ public:
+   MDCache *mdc;
+   CDentry *dn;
+   CDir *dir;
+   Context *c;
+   C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c)
+     : mdc(mdc), dn(dn), dir(dir), c(c) { }
+   void finish(int r) {
+     assert(r == 0);
+     mdc->dentry_unlink_finish(dn, dir, c);
+   }
+ };
+
+
+// NAMESPACE FUN
+
+ void MDCache::dentry_unlink(CDentry *dn, Context *c) // Unlink xlocked dentry `dn` from its inode: log it, notify replicas of the dir, adjust nlink, then finish via dentry_unlink_finish().
+ { // `c` is completed by dentry_unlink_finish(), either directly or after an anchor update, or queued as a CINODE_WAIT_UNLINK waiter when the primary inode is remote.
+ CDir *dir = dn->dir;
+ string dname = dn->name;
+
+ assert(dn->lockstate == DN_LOCK_XLOCK);
+
+ // i need the inode to do any of this properly
+ assert(dn->inode);
+
+ // log it
+ if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME
+ mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode),
+ NULL); // FIXME FIXME FIXME
+
+ // tell replicas
+ if (dir->is_open_by_anyone()) {
+ for (set<int>::iterator it = dir->open_by_begin();
+ it != dir->open_by_end();
+ it++) {
+ dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl;
+
+ mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE);
+ }
+
+ // don't need ack.
+ }
+
+
+ // inode deleted?
+ if (dn->is_primary()) {
+ assert(dn->inode->is_auth());
+ dn->inode->inode.nlink--;
+
+ if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs
+
+ // last link?
+ if (dn->inode->inode.nlink == 0) {
+ // truly dangling
+ if (dn->inode->dir) {
+ // mark dir clean too, since it now dne!
+ assert(dn->inode->dir->is_auth());
+ dn->inode->dir->state_set(CDIR_STATE_DELETED);
+ dn->inode->dir->remove_null_dentries();
+ dn->inode->dir->mark_clean();
+ }
+
+ // mark it clean, it's dead
+ if (dn->inode->is_dirty())
+ dn->inode->mark_clean();
+
+ } else {
+ // migrate to inode file
+ dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl;
+
+ // dangling but still linked.
+ assert(dn->inode->is_anchored());
+
+ // unlink locally (keep a pointer to the inode; dn no longer reaches it afterwards)
+ CInode *in = dn->inode;
+ dn->dir->unlink_inode( dn );
+ dn->mark_dirty();
+
+ // mark it dirty!
+ in->mark_dirty();
+
+ // update anchor to point to inode file+mds
+ vector<Anchor*> atrace;
+ in->make_anchor_trace(atrace);
+ assert(atrace.size() == 1); // it's dangling
+ mds->anchorclient->update(in->ino(), atrace,
+ new C_MDC_DentryUnlink(this, dn, dir, c));
+ return; // dentry_unlink_finish() runs from C_MDC_DentryUnlink once the anchor update completes
+ }
+ }
+ else if (dn->is_remote()) {
+ // need to dec nlink on primary
+ if (dn->inode->is_auth()) {
+ // awesome, i can do it
+ dout(7) << "remote target is local, nlink--" << endl;
+ dn->inode->inode.nlink--;
+ dn->inode->mark_dirty();
+
+ if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) ||
+ (!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) {
+ dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl;
+
+ // remove anchor (async)
+ mds->anchorclient->destroy(dn->inode->ino(), NULL);
+ }
+ } else {
+ int auth = dn->inode->authority();
+ dout(7) << "remote target is remote, sending unlink request to " << auth << endl;
+
+ mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()),
+ auth, MDS_PORT_CACHE);
+
+ // unlink locally
+ CInode *in = dn->inode;
+ dn->dir->unlink_inode( dn );
+ dn->mark_dirty();
+
+ // add waiter
+ in->add_waiter(CINODE_WAIT_UNLINK, c);
+ return;
+ }
+ }
+ else
+ assert(0); // unlink on null dentry??
+
+ // unlink locally (reached by: primary with last link gone, and remote dentry whose inode we are auth for)
+ dn->dir->unlink_inode( dn );
+ dn->mark_dirty();
+
+ // finish!
+ dentry_unlink_finish(dn, dir, c);
+ }
+
+
+/**
+ * Final stage of an unlink: release the dentry xlock, export the directory
+ * if it is a now-empty non-root import, move the directory's waiters for
+ * this name onto mds->finished_queue, and complete c.
+ *
+ * c is finished AND deleted here, matching the finish-then-delete handling
+ * of completion contexts elsewhere in this code base (e.g. MDLog).
+ * NOTE(review): assumes callers hand over ownership of c -- confirm.
+ */
+void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c)
+{
+  dout(7) << "dentry_unlink_finish on " << *dn << endl;
+  string dname = dn->name;   // copied before the xlock is released below
+
+  // unpin dir / unxlock
+  mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking
+
+  // did i empty out an imported dir?
+  if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0)
+    migrator->export_empty_import(dir);
+
+  // wake up any waiters
+  dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue);
+
+  // complete and release the caller's context
+  c->finish(0);
+  delete c;
+}
+
+
+
+
+/**
+ * Handle an MDentryUnlink for the dentry named m->get_dn() in directory
+ * m->get_dirino().  If the directory or dentry is not in our cache the
+ * message is only logged.  Otherwise any open dir under the dentry's inode
+ * is flagged CDIR_STATE_DELETED, the dentry is removed, and DNREAD waiters
+ * on that name are moved to mds->finished_queue.  m is always deleted.
+ */
+void MDCache::handle_dentry_unlink(MDentryUnlink *m)
+{
+  CInode *diri = get_inode(m->get_dirino());
+  CDir *dir = diri ? diri->dir : 0;
+
+  if (!dir) {
+    // either the dir inode or its CDir is missing
+    dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl;
+    delete m;
+    return;
+  }
+
+  CDentry *dn = dir->lookup(m->get_dn());
+  if (!dn) {
+    dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_dentry_unlink on " << *dn << endl;
+
+  // dir?
+  CInode *target = dn->inode;
+  if (target && target->dir) {
+    target->dir->state_set(CDIR_STATE_DELETED);
+    target->dir->remove_null_dentries();
+  }
+
+  // remember the name; dn is handed to remove_dentry() next
+  string dname = dn->name;
+
+  // unlink
+  dn->dir->remove_dentry(dn);
+
+  // wake up
+  dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue);
+
+  delete m;
+}
+
+
+/**
+ * Handle an MInodeUnlink: a remote link to inode m->get_ino() was removed
+ * (see the remote/non-auth branch of dentry_unlink()).
+ *
+ * If we are only a proxy for the inode, the message is forwarded as-is to
+ * the inode's authority.  Otherwise nlink is decremented, the anchor is
+ * destroyed when the inode is dangling with no links left or has only its
+ * primary link left, and an MInodeUnlinkAck is sent to m->get_from().
+ *
+ * The message is deleted once it has been handled locally; on the proxy
+ * path it is handed to send_message_mds() instead.
+ */
+void MDCache::handle_inode_unlink(MInodeUnlink *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  // proxy?
+  if (in->is_proxy()) {
+    dout(7) << "handle_inode_unlink proxy on " << *in << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+    return;
+  }
+  assert(in->is_auth());
+
+  // do it.
+  dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl;
+  assert(in->inode.nlink > 0);
+  in->inode.nlink--;
+
+  if (in->state_test(CINODE_STATE_DANGLING)) {
+    // already dangling.
+    // last link?
+    if (in->inode.nlink == 0) {
+      dout(7) << "last link, marking clean and removing anchor" << endl;
+
+      in->mark_clean();       // mark it clean.
+
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+    else {
+      in->mark_dirty();
+    }
+  } else {
+    // has primary link still.
+    assert(in->inode.nlink >= 1);
+    in->mark_dirty();
+
+    if (in->inode.nlink == 1) {
+      dout(7) << "nlink=1, removing anchor" << endl;
+
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+  }
+
+  // ack
+  mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE);
+
+  // done with the request (previously leaked on this path)
+  delete m;
+}
+
+/**
+ * Handle an MInodeUnlinkAck: finish the CINODE_WAIT_UNLINK waiters queued
+ * on the inode by dentry_unlink(), then free the message.
+ */
+void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  dout(7) << "handle_inode_unlink_ack on " << *in << endl;
+  in->finish_waiting(CINODE_WAIT_UNLINK, 0);
+
+  // message fully consumed (previously leaked)
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+/*
+ * some import/export helpers
+ */
+
+/** con = get_auth_container(dir)
+ * Walk from dir towards the root and return the first directory that
+ * delegates authority for dir: dir itself (or an ancestor) if it is an
+ * import, or the first *ancestor* that is hashed.
+ * Because a hashed ancestor can be returned, callers must not assume
+ * con->is_auth(); the result is_auth() || is_hashed().
+ */
+CDir *MDCache::get_auth_container(CDir *dir)
+{
+  CDir *con = dir;   // may end up being dir itself
+
+  // climb until we hit an import, or step onto a hashed parent
+  while (!con->is_import()) {
+    con = con->get_parent_dir();
+    assert(con);
+    if (con->is_hashed())
+      break;               // hash
+  }
+
+  return con;
+}
+
+
+/**
+ * Collect into s the exports nested beneath dir, searching within the
+ * auth container (see get_auth_container) that dir belongs to.
+ */
+void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s)
+{
+  find_nested_exports_under(get_auth_container(dir), dir, s);
+}
+
+/**
+ * Insert into s every export recorded in nested_exports[import] that lies
+ * beneath dir.  When dir is the import itself that is the whole set;
+ * otherwise each candidate's parent chain is followed upward until it
+ * reaches dir (match) or an import (no match).
+ */
+void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s)
+{
+  dout(10) << "find_nested_exports for " << *dir << endl;
+  dout(10) << "find_nested_exports_under import " << *import << endl;
+
+  set<CDir*>& candidates = nested_exports[import];
+
+  if (import == dir) {
+    // yay, my job is easy!
+    for (set<CDir*>::iterator p = candidates.begin(); p != candidates.end(); ++p) {
+      CDir *nested = *p;
+      s.insert(nested);
+      dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+    }
+    return;
+  }
+
+  // ok, my job is annoying.
+  for (set<CDir*>::iterator p = candidates.begin(); p != candidates.end(); ++p) {
+    CDir *nested = *p;
+
+    dout(12) << "find_nested_exports checking " << *nested << endl;
+
+    // trace back to import, or dir
+    for (CDir *cur = nested->get_parent_dir(); ; cur = cur->get_parent_dir()) {
+      if (cur == dir) {
+        s.insert(nested);
+        dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
+        break;
+      }
+      if (cur->is_import())
+        break;   // reached an import without passing through dir
+    }
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// ==============================================================
+// debug crap
+
+
+// Debug helper: delegates to the balancer's show_imports().
+void MDCache::show_imports()
+{
+  MDBalancer *bal = mds->balancer;
+  bal->show_imports();
+}
+
+
+// Debug helper: log (at level 7) every inode in inode_map, followed by its
+// parent dentry and its open dir when present.
+void MDCache::show_cache()
+{
+  dout(7) << "show_cache" << endl;
+
+  hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+  while (p != inode_map.end()) {
+    CInode *in = p->second;
+    dout(7) << *in << endl;
+
+    CDentry *parent_dn = in->get_parent_dn();
+    if (parent_dn) {
+      dout(7) << " dn " << *parent_dn << endl;
+    }
+    if (in->dir) {
+      dout(7) << " subdir " << *in->dir << endl;
+    }
+    ++p;
+  }
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MDCACHE_H
+#define __MDCACHE_H
+
+#include <string>
+#include <vector>
+#include <map>
+#include <set>
+#include <ext/hash_map>
+
+#include "include/types.h"
+#include "include/filepath.h"
+
+#include "CInode.h"
+#include "CDentry.h"
+#include "CDir.h"
+#include "Lock.h"
+
+
+class MDS;
+class Migrator;
+class Renamer;
+
+class Logger;
+
+class Message;
+
+class MDiscover;
+class MDiscoverReply;
+class MCacheExpire;
+class MDirUpdate;
+class MDentryUnlink;
+class MLock;
+
+
+class MClientRequest;
+
+
+// MDCache
+
+//typedef const char* pchar;
+
+
+
+/** active_request_t
+ * Per-request state tracked while a request is being processed.
+ * Mostly pins and locks held on behalf of the request, so that they can
+ * all be dropped when the request is finished or forwarded.
+ * See request_*().
+ */
+struct active_request_t {
+  CInode *ref;                                // reference inode
+  set< CInode* > request_pins;
+  set< CDir* > request_dir_pins;
+  map< CDentry*, vector<CDentry*> > traces;   // path pins held
+  set< CDentry* > xlocks;                     // xlocks (local)
+  set< CDentry* > foreign_xlocks;             // xlocks on foreign hosts
+};
+
+namespace __gnu_cxx {
+  // Lets Message* be used as a hash_map key: the pointer value is
+  // converted to unsigned long and hashed with hash<unsigned long>.
+  template<>
+  struct hash<Message*> {
+    size_t operator()(const Message *msg) const {
+      static hash<unsigned long> addr_hash;
+      const unsigned long addr = reinterpret_cast<unsigned long>(msg);
+      return addr_hash(addr);
+    }
+  };
+}
+
+/**
+ * MDCache
+ * The metadata cache of one MDS: the inode map with its LRU, the sets of
+ * imported / exported / hashed directories, per-request pin state, and the
+ * message handlers for replication (discover), unlink and directory updates.
+ */
+class MDCache {
+ protected:
+  // my master
+  MDS *mds;
+
+  // the cache
+  CInode *root;                                // root inode
+  LRU lru;                                     // lru for expiring items
+  hash_map<inodeno_t,CInode*> inode_map;       // map of inodes by ino
+
+  // root
+  list<Context*> waiting_for_root;
+
+  // imports, exports, and hashes.
+  set<CDir*> imports;                          // includes root (on mds0)
+  set<CDir*> exports;
+  set<CDir*> hashdirs;
+  map<CDir*,set<CDir*> > nested_exports;       // exports nested under imports _or_ hashdirs
+
+  // active MDS requests
+  hash_map<Message*, active_request_t> active_requests;
+
+  // inode purging
+  map<inodeno_t, inode_t> purging;
+  map<inodeno_t, list<Context*> > waiting_for_purge;
+
+  // shutdown crap
+  int shutdown_commits;
+  bool did_shutdown_exports;
+  friend class C_MDC_ShutdownCommit;
+
+  friend class CInode;
+  friend class Locker;
+  friend class Migrator;
+  friend class Renamer;
+  friend class MDBalancer;
+
+ public:
+  // subsystems
+  Migrator *migrator;
+  Renamer *renamer;
+
+ public:
+  MDCache(MDS *m);
+  ~MDCache();
+
+  // debug
+  void log_stat(Logger *logger);
+
+  // root inode
+  CInode *get_root() { return root; }
+  void set_root(CInode *r);
+
+  void add_import(CDir *dir);
+  void remove_import(CDir *dir);
+
+  // cache
+  void set_cache_size(size_t max) { lru.lru_set_max(max); }
+  size_t get_cache_size() { return lru.lru_get_size(); }
+  bool trim(int max = -1);   // trim cache
+
+  // shutdown
+  void shutdown_start();
+  void shutdown_check();
+  bool shutdown_pass();
+  bool shutdown();           // clear cache (ie at shutdown)
+
+  // inode_map
+  // true if an inode with this number is in the cache.
+  bool have_inode( inodeno_t ino ) { return inode_map.count(ino) != 0; }
+  // cached inode for ino, or NULL if it is not in the cache.
+  // (single find() instead of count() followed by operator[])
+  CInode* get_inode( inodeno_t ino ) {
+    hash_map<inodeno_t,CInode*>::iterator p = inode_map.find(ino);
+    if (p != inode_map.end())
+      return p->second;
+    return NULL;
+  }
+
+ public:
+  CInode *create_inode();
+  void add_inode(CInode *in);
+
+ protected:
+  void remove_inode(CInode *in);
+  void destroy_inode(CInode *in);
+  // move in (and, recursively, the inodes of its ancestor dirs) up in the LRU.
+  void touch_inode(CInode *in) {
+    // touch parent(s) too
+    if (in->get_parent_dir()) touch_inode(in->get_parent_dir()->inode);
+
+    // top or mid, depending on whether i'm auth
+    if (in->is_auth())
+      lru.lru_touch(in);
+    else
+      lru.lru_midtouch(in);
+  }
+  void rename_file(CDentry *srcdn, CDentry *destdn);
+
+ public:
+  // inode purging
+  void purge_inode(inode_t& inode);
+  void purge_inode_finish(inodeno_t ino);
+  void purge_inode_finish_2(inodeno_t ino);
+  void waitfor_purge(inodeno_t ino, Context *c);
+  void start_recovered_purges();
+
+
+ protected:
+  // private methods
+  CDir *get_auth_container(CDir *in);
+  void find_nested_exports(CDir *dir, set<CDir*>& s);
+  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
+
+
+ public:
+  int open_root(Context *c);
+  int path_traverse(filepath& path, vector<CDentry*>& trace, bool follow_trailing_sym,
+                    Message *req, Context *ondelay,
+                    int onfail,
+                    Context *onfinish=0,
+                    bool is_client_req = false);
+  void open_remote_dir(CInode *diri, Context *fin);
+  void open_remote_ino(inodeno_t ino, Message *req, Context *fin);
+  void open_remote_ino_2(inodeno_t ino, Message *req,
+                         vector<Anchor*>& anchortrace,
+                         Context *onfinish);
+
+  bool path_pin(vector<CDentry*>& trace, Message *m, Context *c);
+  void path_unpin(vector<CDentry*>& trace, Message *m);
+  void make_trace(vector<CDentry*>& trace, CInode *in);
+
+  bool request_start(Message *req,
+                     CInode *ref,
+                     vector<CDentry*>& trace);
+  void request_cleanup(Message *req);
+  void request_finish(Message *req);
+  void request_forward(Message *req, int mds, int port=0);
+  void request_pin_inode(Message *req, CInode *in);
+  void request_pin_dir(Message *req, CDir *dir);
+
+  // anchors
+  void anchor_inode(CInode *in, Context *onfinish);
+  //void unanchor_inode(CInode *in, Context *c);
+
+  void handle_inode_link(class MInodeLink *m);
+  void handle_inode_link_ack(class MInodeLinkAck *m);
+
+  // == messages ==
+ public:
+  void dispatch(Message *m);
+
+ protected:
+  // -- replicas --
+  void handle_discover(MDiscover *dis);
+  void handle_discover_reply(MDiscoverReply *m);
+
+
+  // -- namespace --
+  // these handle logging, cache sync themselves.
+  // UNLINK
+ public:
+  void dentry_unlink(CDentry *in, Context *c);
+ protected:
+  void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c);
+  void handle_dentry_unlink(MDentryUnlink *m);
+  void handle_inode_unlink(class MInodeUnlink *m);
+  void handle_inode_unlink_ack(class MInodeUnlinkAck *m);
+  friend class C_MDC_DentryUnlink;
+
+
+
+  // -- misc auth --
+  int ino_proxy_auth(inodeno_t ino,
+                     int frommds,
+                     map<CDir*, set<inodeno_t> >& inomap);
+  void do_ino_proxy(CInode *in, Message *m);
+  void do_dir_proxy(CDir *dir, Message *m);
+
+
+
+
+  // -- updates --
+  //int send_inode_updates(CInode *in);
+  //void handle_inode_update(MInodeUpdate *m);
+
+  int send_dir_updates(CDir *in, bool bcast=false);
+  void handle_dir_update(MDirUpdate *m);
+
+  void handle_cache_expire(MCacheExpire *m);
+
+
+
+  // == crap fns ==
+ public:
+  void dump() {
+    if (root) root->dump();
+  }
+
+  void show_imports();
+  void show_cache();
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDLog.h"
+#include "MDS.h"
+#include "LogEvent.h"
+
+#include "osdc/Journaler.h"
+
+#include "common/LogType.h"
+#include "common/Logger.h"
+
+#include "config.h"
+#undef dout // drop any previous dout definition (if one exists) so this file can define its own
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log "
+#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " // same expansion as dout: also writes to cout
+
+// cons/des
+
+LogType mdlog_logtype; // passed to the Logger created in MDLog's constructor, which also registers its keys
+
+/**
+ * Set up the journal for MDS m: counters, a Logger named "mds<N>.log",
+ * the one-time registration of the mdlog_logtype keys, the log inode
+ * (ino = MDS_INO_LOG_OFFSET + node id) and the Journaler that writes to it.
+ */
+MDLog::MDLog(MDS *m)
+{
+  mds = m;
+  num_events = 0;
+  waiting_for_read = false;
+  trim_reading = false;      // previously left uninitialized
+
+  max_events = g_conf.mds_log_max_len;
+
+  unflushed = 0;
+
+  // logger
+  char name[80];
+  snprintf(name, sizeof(name), "mds%d.log", mds->get_nodeid());
+  logger = new Logger(name, &mdlog_logtype);
+
+  // register the log keys once per process; the flag was never set before,
+  // so the guard did not actually guard anything.
+  static bool didit = false;
+  if (!didit) {
+    didit = true;
+    mdlog_logtype.add_inc("add");
+    mdlog_logtype.add_inc("retire");
+    mdlog_logtype.add_inc("obs");
+    mdlog_logtype.add_inc("trim");
+    mdlog_logtype.add_set("size");
+    mdlog_logtype.add_set("read");
+    mdlog_logtype.add_set("append");
+    mdlog_logtype.add_inc("lsum");
+    mdlog_logtype.add_inc("lnum");
+  }
+
+  // inode
+  memset(&log_inode, 0, sizeof(log_inode));
+  log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
+  log_inode.layout = g_OSD_MDLogLayout;
+
+  if (g_conf.mds_local_osd) {
+    log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD;
+    log_inode.layout.osd = mds->get_nodeid() + 10000;   // hack
+  }
+
+  // log streamer
+  journaler = new Journaler(log_inode, mds->objecter, logger);
+}
+
+
+// Release the journaler and the logger created by the constructor.
+MDLog::~MDLog()
+{
+  // deleting a null pointer is a no-op, so no explicit checks are needed
+  delete journaler;
+  journaler = 0;
+
+  delete logger;
+  logger = 0;
+}
+
+
+// Start over with a fresh journal: forwards to Journaler::reset().
+void MDLog::reset()
+{
+  Journaler *j = journaler;
+  j->reset();
+}
+
+// Open an existing journal by asking the journaler to recover it;
+// c is passed through to Journaler::recover().
+void MDLog::open(Context *c)
+{
+  dout(5) << "open discovering log bounds" << endl;
+
+  Journaler *j = journaler;
+  j->recover(c);
+}
+
+// Write the journal head; c is passed through to Journaler::write_head().
+void MDLog::write_head(Context *c)
+{
+  Journaler *j = journaler;
+  j->write_head(c);
+}
+
+
+/**
+ * Append event le to the journal.
+ *
+ * le is always consumed (deleted) by this call.  If c is given, the journal
+ * is flushed with c as the completion context; otherwise the entry is only
+ * counted as unflushed.  When journaling is disabled (g_conf.mds_log is
+ * false) nothing is written and c, if any, is finished and deleted at once.
+ */
+void MDLog::submit_entry( LogEvent *le,
+                          Context *c )
+{
+  dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl;
+
+  if (!g_conf.mds_log) {
+    // hack: log is disabled.
+    // still take ownership of the event, as the enabled path does
+    // (it used to be leaked here).
+    delete le;
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+    return;
+  }
+
+  // encode it, with event type
+  bufferlist bl;
+  bl.append((char*)&le->_type, sizeof(le->_type));
+  le->encode_payload(bl);
+
+  // journal it.
+  journaler->append_entry(bl);
+
+  delete le;
+  num_events++;
+
+  logger->inc("add");
+  logger->set("size", num_events);
+  logger->set("append", journaler->get_write_pos());
+
+  if (c) {
+    unflushed = 0;
+    journaler->flush(c);
+  }
+  else
+    unflushed++;
+}
+
+// Complete c once the journal has been flushed.  With journaling disabled
+// there is nothing to wait for, so c is finished and deleted immediately.
+void MDLog::wait_for_sync( Context *c )
+{
+  if (!g_conf.mds_log) {
+    // hack: bypass.
+    c->finish(0);
+    delete c;
+    return;
+  }
+
+  // wait
+  journaler->flush(c);
+}
+
+// Flush the journal if entries were submitted without a completion context
+// since the last flush, then give trimming a chance to run.
+void MDLog::flush()
+{
+  if (unflushed != 0) {
+    journaler->flush();
+    unflushed = 0;
+  }
+
+  // trim
+  trim(0);
+}
+
+
+
+
+// trim
+
+// Completion context passed to LogEvent::retire() by MDLog::trim();
+// reports the event back to the log via MDLog::_trimmed().
+class C_MDL_Trimmed : public Context {
+public:
+  MDLog *mdl;
+  LogEvent *le;
+
+  C_MDL_Trimmed(MDLog *mdl, LogEvent *le) : mdl(mdl), le(le) {}
+
+  void finish(int res) {
+    mdl->_trimmed(le);
+  }
+};
+
+// Completion context passed to Journaler::wait_for_readable() by
+// MDLog::trim(); calls MDLog::_did_read() when it fires.
+class C_MDL_Reading : public Context {
+public:
+  MDLog *mdl;
+
+  C_MDL_Reading(MDLog *m) : mdl(m) {}
+
+  void finish(int res) {
+    mdl->_did_read();
+  }
+};
+
+
+// Called (via C_MDL_Reading) when the journaler has more data to read:
+// clear the waiting flag and resume trimming.
+void MDLog::_did_read()
+{
+  dout(5) << "_did_read()" << endl;
+
+  waiting_for_read = false;
+  trim(NULL);
+}
+
+// Called (via C_MDL_Trimmed) when event le has been retired.  If le is the
+// oldest entry still in `trimming`, the journal's expire position is moved
+// up to le's end offset.  le is then dropped from `trimming`, deleted, and
+// trimming continues.
+void MDLog::_trimmed(LogEvent *le)
+{
+  dout(7) << " trimmed " << *le << endl;
+
+  assert(le->can_expire(mds));
+
+  const bool is_front = (trimming.begin()->first == le->_end_off);
+  if (is_front) {
+    // front!  we can expire the log a bit
+    journaler->set_expire_pos(le->_end_off);
+  }
+
+  trimming.erase(le->_end_off);
+  delete le;
+
+  logger->set("trim", trimming.size());
+  logger->set("read", journaler->get_read_pos());
+
+  trim(NULL);
+}
+
+
+
+/**
+ * Trim the journal down to max_events.
+ *
+ * c (optional) is queued on trim_waiters and completed once num_events is
+ * no longer above max_events.  Each pass reads one event from the journal:
+ * an event that can already expire is dropped; any other is put in
+ * `trimming` and retired with a C_MDL_Trimmed callback.  The function
+ * returns early (leaving waiters queued) when g_conf.mds_log_max_trimming
+ * events are already in flight, or when the journaler has nothing readable
+ * yet, in which case a single C_MDL_Reading wakeup is registered.
+ */
+void MDLog::trim(Context *c)
+{
+  // add waiter
+  if (c)
+    trim_waiters.push_back(c);
+
+  // trim!
+  while (num_events > max_events) {
+
+    off_t gap = journaler->get_write_pos() - journaler->get_read_pos();
+    dout(5) << "trim num_events " << num_events << " > max " << max_events
+            << ", trimming " << trimming.size()
+            << ", byte gap " << gap
+            << endl;
+
+    if ((int)trimming.size() >= g_conf.mds_log_max_trimming) {
+      dout(7) << "trim already trimming max, waiting" << endl;
+      return;
+    }
+
+    bufferlist bl;
+    if (!journaler->try_read_entry(bl)) {
+      // need to read!
+      if (waiting_for_read) {
+        dout(7) << "trim already waiting for read" << endl;
+      } else {
+        waiting_for_read = true;
+        dout(7) << "trim waiting for read" << endl;
+        journaler->wait_for_readable(new C_MDL_Reading(this));
+      }
+      return;
+    }
+
+    // decode logevent
+    LogEvent *le = LogEvent::decode(bl);
+    le->_end_off = journaler->get_read_pos();
+    num_events--;
+
+    // we just read an event.
+    if (le->can_expire(mds) == true) {
+      // obsolete
+      dout(7) << "trim obsolete: " << *le << endl;
+      delete le;
+      logger->inc("obs");
+    } else {
+      assert ((int)trimming.size() < g_conf.mds_log_max_trimming);
+
+      // trim!
+      dout(7) << "trim trimming: " << *le << endl;
+      trimming[le->_end_off] = le;
+      le->retire(mds, new C_MDL_Trimmed(this, le));
+      logger->inc("retire");
+      logger->set("trim", trimming.size());
+    }
+    logger->set("read", journaler->get_read_pos());
+    logger->set("size", num_events);
+  }
+
+  dout(5) << "trim num_events " << num_events << " <= max " << max_events
+          << ", trimming " << trimming.size()
+          << ", done for now."
+          << endl;
+
+  // trimmed!  take the waiter list first, then complete the waiters.
+  std::list<Context*> finished;
+  finished.swap(trim_waiters);
+  finish_contexts(finished, 0);
+}
+
+
+/**
+ * Replay the journal from its expire position to its write position.
+ * c (optional) is finished and deleted immediately if the journal is
+ * empty; otherwise it is queued on waitfor_replay and _replay() is started.
+ */
+void MDLog::replay(Context *c)
+{
+  assert(journaler->is_active());
+
+  // start reading at the last known expire point.
+  journaler->set_read_pos( journaler->get_expire_pos() );
+
+  // empty?
+  const bool empty = (journaler->get_read_pos() == journaler->get_write_pos());
+  if (empty) {
+    dout(10) << "replay - journal empty, done." << endl;
+    if (c) {
+      c->finish(0);
+      delete c;
+    }
+    return;
+  }
+
+  // add waiter
+  if (c)
+    waitfor_replay.push_back(c);
+
+  // go!
+  dout(10) << "replay start, from " << journaler->get_read_pos()
+           << " to " << journaler->get_write_pos() << endl;
+
+  assert(num_events == 0);
+
+  _replay();
+}
+
+// Completion context used by MDLog::_replay() to resume itself once the
+// journaler has more data to read.
+class C_MDL_Replay : public Context {
+  MDLog *mdlog;
+public:
+  C_MDL_Replay(MDLog *l) {
+    mdlog = l;
+  }
+  void finish(int r) {
+    mdlog->_replay();
+  }
+};
+
+/**
+ * Replay worker.  Decodes every journal entry that is currently readable,
+ * applying those that have not already happened.  If the write position has
+ * not been reached it re-arms itself through C_MDL_Replay; otherwise it
+ * moves the read pointer back to the expire position and completes the
+ * contexts queued on waitfor_replay.
+ */
+void MDLog::_replay()
+{
+  // read what's buffered
+  while (journaler->is_readable() &&
+         journaler->get_read_pos() < journaler->get_write_pos()) {
+    // read it
+    off_t pos = journaler->get_read_pos();
+    bufferlist bl;
+    bool r = journaler->try_read_entry(bl);
+    assert(r);
+
+    // unpack event
+    LogEvent *le = LogEvent::decode(bl);
+    num_events++;
+
+    const bool already = le->has_happened(mds);
+    dout(10) << "_replay " << pos << " / " << journaler->get_write_pos()
+             << " : " << *le << (already ? " : already happened" : " : applying") << endl;
+    if (!already)
+      le->replay(mds);
+
+    delete le;
+  }
+
+  // wait for read?
+  if (journaler->get_read_pos() < journaler->get_write_pos()) {
+    journaler->wait_for_readable(new C_MDL_Replay(this));
+    return;
+  }
+
+  // done!
+  assert(journaler->get_read_pos() == journaler->get_write_pos());
+  dout(10) << "_replay - complete" << endl;
+
+  // move read pointer _back_ to expire pos, for eventual trimming
+  journaler->set_read_pos(journaler->get_expire_pos());
+
+  // kick waiter(s)
+  list<Context*> waiters;
+  waiters.swap(waitfor_replay);
+  finish_contexts(waiters, 0);
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDLOG_H
+#define __MDLOG_H
+
+#include "include/types.h"
+#include "include/Context.h"
+
+#include <list>
+
+//#include <ext/hash_map>
+//using __gnu_cxx::hash_mapset;
+
+class Journaler;
+class LogEvent;
+class MDS;
+
+class Logger;
+
+/*
+namespace __gnu_cxx {
+ template<> struct hash<LogEvent*> {
+ size_t operator()(const LogEvent *p) const {
+ static hash<unsigned long> H;
+ return H((unsigned long)p);
+ }
+ };
+}
+*/
+
+/**
+ * MDLog
+ * The metadata journal of one MDS.  Events are appended through a
+ * Journaler; old events are read back and retired ("trimmed") once the
+ * number of events exceeds max_events, and the journal can be replayed
+ * on recovery.
+ */
+class MDLog {
+ protected:
+  MDS *mds;
+  size_t num_events;     // in events
+  size_t max_events;
+
+  int unflushed;         // entries appended since the last flush
+
+  inode_t log_inode;
+  Journaler *journaler;
+
+
+  //hash_map<LogEvent*> trimming;       // events currently being trimmed
+  map<off_t, LogEvent*> trimming;       // events being retired, keyed by their end offset
+  std::list<Context*> trim_waiters;     // contexts waiting for trim
+  bool trim_reading;
+
+  bool waiting_for_read;                // a C_MDL_Reading wakeup is pending
+  friend class C_MDL_Reading;
+
+  Logger *logger;
+
+  list<Context*> waitfor_replay;        // contexts completed when replay finishes
+
+ public:
+  MDLog(MDS *m);
+  ~MDLog();
+
+  void set_max_events(size_t max) {
+    max_events = max;
+  }
+  size_t get_max_events() {
+    return max_events;
+  }
+  // events not yet read for trimming plus those currently being trimmed
+  size_t get_num_events() {
+    return num_events + trimming.size();
+  }
+
+  void submit_entry( LogEvent *e, Context *c = 0 );
+  void wait_for_sync( Context *c );
+  void flush();
+
+  void trim(Context *c);
+  void _did_read();
+  void _trimmed(LogEvent *le);
+
+  void reset();          // fresh, empty log!
+  void open(Context *onopen);
+  void write_head(Context *onfinish);
+
+  void replay(Context *onfinish);
+  void _replay();
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "include/types.h"
+#include "common/Clock.h"
+
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+#include "osdc/Objecter.h"
+#include "osdc/Filer.h"
+
+#include "MDSMap.h"
+
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDStore.h"
+#include "MDLog.h"
+#include "MDBalancer.h"
+#include "IdAllocator.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "AnchorTable.h"
+#include "AnchorClient.h"
+
+#include "common/Logger.h"
+#include "common/LogType.h"
+
+#include "common/Timer.h"
+
+#include "messages/MMDSMap.h"
+#include "messages/MMDSBoot.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MGenericMessage.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+
+
+LogType mds_logtype, mds_cache_logtype; // log types for the two Loggers created in MDS::reopen_log()
+
+#include "config.h"
+#undef dout // drop any previous dout definition (if one exists) so this file can define its own
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " "
+#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " // same expansion as dout: also writes to cout
+
+
+
+
+
+// cons/des
+/**
+ * Build an MDS with id `whoami` on top of messenger m and monitor map mm.
+ * Creates the (empty) MDS and OSD maps, the object/file clients, the cache,
+ * store, journal, balancer, anchor client/table, id allocator, server and
+ * locker, enters STATE_BOOTING, and finally registers itself as the
+ * messenger's dispatcher.
+ */
+MDS::MDS(int whoami, Messenger *m, MonMap *mm) {
+  this->whoami = whoami;
+
+  monmap = mm;
+  messenger = m;
+
+  // maps
+  mdsmap = new MDSMap;
+  osdmap = new OSDMap;
+
+  // object storage clients
+  objecter = new Objecter(messenger, monmap, osdmap);
+  filer = new Filer(objecter);
+
+  // metadata subsystems
+  mdcache = new MDCache(this);
+  mdstore = new MDStore(this);
+  mdlog = new MDLog(this);
+  balancer = new MDBalancer(this);
+
+  anchorclient = new AnchorClient(messenger, mdsmap);
+  idalloc = new IdAllocator(this);
+
+  anchormgr = new AnchorTable(this);
+
+  server = new Server(this);
+  locker = new Locker(this, mdcache);
+
+  req_rate = 0;
+
+  state = STATE_BOOTING;
+
+  last_balancer_heartbeat = g_clock.recent_now();
+  last_balancer_hash = last_balancer_heartbeat;
+
+  // loggers are created later, in reopen_log()
+  logger2 = 0;
+  logger = logger2;
+
+  // i'm ready!
+  messenger->set_dispatcher(this);
+}
+
+/**
+ * Tear down everything the constructor allocated.
+ * server, locker and mdsmap were allocated in the constructor but never
+ * released; they are now freed as well.  server and locker go first, while
+ * the cache they were constructed with is still alive; mdsmap goes after
+ * anchorclient, which was constructed with a pointer to it.
+ */
+MDS::~MDS() {
+  if (server) { delete server; server = NULL; }
+  if (locker) { delete locker; locker = NULL; }
+
+  if (mdcache) { delete mdcache; mdcache = NULL; }
+  if (mdstore) { delete mdstore; mdstore = NULL; }
+  if (mdlog) { delete mdlog; mdlog = NULL; }
+  if (balancer) { delete balancer; balancer = NULL; }
+  if (idalloc) { delete idalloc; idalloc = NULL; }
+  if (anchormgr) { delete anchormgr; anchormgr = NULL; }
+  if (anchorclient) { delete anchorclient; anchorclient = NULL; }
+  if (mdsmap) { delete mdsmap; mdsmap = NULL; }
+  if (osdmap) { delete osdmap; osdmap = 0; }
+
+  if (filer) { delete filer; filer = 0; }
+  if (objecter) { delete objecter; objecter = 0; }
+  if (messenger) { delete messenger; messenger = NULL; }
+
+  if (logger) { delete logger; logger = 0; }
+  if (logger2) { delete logger2; logger2 = 0; }
+
+}
+
+
+/**
+ * (Re)create the two loggers for this MDS: "mds<N>" using mds_logtype and
+ * "mds<N>.cache" using mds_cache_logtype, where N is whoami.  Existing
+ * loggers are flushed and deleted first.  Also registers the mds_logtype keys.
+ *
+ * The main logger name used to be assembled digit by digit, which dropped
+ * digits for ids >= 10000 and produced a '/' for an id of -1; it is now
+ * formatted with "%d", the same way the cache logger name is.  For ids
+ * 0..9999 the resulting name is unchanged.
+ */
+void MDS::reopen_log()
+{
+  // flush+close old log
+  if (logger) {
+    logger->flush(true);
+    delete logger;
+  }
+  if (logger2) {
+    logger2->flush(true);
+    delete logger2;
+  }
+
+
+  // log
+  char name[80];
+  snprintf(name, sizeof(name), "mds%d", whoami);
+
+  logger = new Logger(name, &mds_logtype);
+
+  mds_logtype.add_inc("req");
+  mds_logtype.add_inc("reply");
+  mds_logtype.add_inc("fw");
+  mds_logtype.add_inc("cfw");
+
+  mds_logtype.add_set("l");
+  mds_logtype.add_set("q");
+  mds_logtype.add_set("popanyd");
+  mds_logtype.add_set("popnest");
+
+  mds_logtype.add_inc("lih");
+  mds_logtype.add_inc("lif");
+
+  mds_logtype.add_set("c");
+  mds_logtype.add_set("ctop");
+  mds_logtype.add_set("cbot");
+  mds_logtype.add_set("cptail");
+  mds_logtype.add_set("cpin");
+  mds_logtype.add_inc("cex");
+  mds_logtype.add_inc("dis");
+  mds_logtype.add_inc("cmiss");
+
+  mds_logtype.add_set("buf");
+  mds_logtype.add_inc("cdir");
+  mds_logtype.add_inc("fdir");
+
+  mds_logtype.add_inc("iex");
+  mds_logtype.add_inc("iim");
+  mds_logtype.add_inc("ex");
+  mds_logtype.add_inc("im");
+  mds_logtype.add_inc("imex");
+  mds_logtype.add_set("nex");
+  mds_logtype.add_set("nim");
+
+
+  char n[80];
+  snprintf(n, sizeof(n), "mds%d.cache", whoami);
+  logger2 = new Logger(n, &mds_cache_logtype);
+}
+
+// Send m to MDS `mds` on `port`.  When a destination port is given but no
+// source port, the destination port is used as the source port too.
+void MDS::send_message_mds(Message *m, int mds, int port, int fromport)
+{
+  if (fromport == 0 && port != 0) {
+    fromport = port;
+  }
+
+  messenger->send_message(m,
+                          MSG_ADDR_MDS(mds), mdsmap->get_inst(mds),
+                          port, fromport);
+}
+
+
+// Announce ourselves to a monitor by sending an MMDSBoot.  Always returns 0.
+int MDS::init()
+{
+  // request osd map
+  dout(5) << "requesting mds and osd maps from mon" << endl;
+
+  const int mon = monmap->pick_mon();
+  messenger->send_message(new MMDSBoot,
+                          MSG_ADDR_MON(mon), monmap->get_inst(mon));
+
+  return 0;
+}
+
+
+/**
+ * Handle an MMDSMap: decode the map stored under the highest epoch in
+ * m->maps, free the message, and adopt the rank the map assigns to our
+ * messenger instance (resetting our address and reopening the loggers if
+ * it changed).  While still booting, an OSD map is requested next.
+ */
+void MDS::handle_mds_map(MMDSMap *m)
+{
+  // last entry in the map == highest epoch
+  map<epoch_t, bufferlist>::reverse_iterator newest = m->maps.rbegin();
+
+  dout(1) << "handle_mds_map epoch " << newest->first << endl;
+  mdsmap->decode(newest->second);
+
+  delete m;
+
+  // see who i am
+  int w = mdsmap->get_inst_rank(messenger->get_myinst());
+  if (w != whoami) {
+    whoami = w;
+    messenger->reset_myaddr(MSG_ADDR_MDS(w));
+    reopen_log();
+  }
+  dout(1) << "map says i am " << w << endl;
+
+  if (!is_booting())
+    return;
+
+  // we need an osdmap too.
+  int mon = monmap->pick_mon();
+  messenger->send_message(new MOSDGetMap(0),
+                          MSG_ADDR_MON(mon), monmap->get_inst(mon));
+}
+
+/**
+ * Handle an MOSDMap: let the objecter process it, continue booting (mkfs
+ * or recovery) if we were waiting for our maps, and send a copy of the
+ * maps to every client in clientmap's mount set.
+ *
+ * NOTE(review): m is read after being handed to objecter->handle_osd_map()
+ * and is never deleted here -- confirm who owns the message.
+ */
+void MDS::handle_osd_map(MOSDMap *m)
+{
+  // process locally
+  objecter->handle_osd_map(m);
+
+  if (is_booting()) {
+    // we got our maps.  mkfs for recovery?
+    if (!g_conf.mkfs)
+      boot_recover();
+    else
+      boot_mkfs();
+  }
+
+  // pass on to clients
+  for (set<int>::iterator client = clientmap.get_mount_set().begin();
+       client != clientmap.get_mount_set().end();
+       ++client) {
+    MOSDMap *copy = new MOSDMap;
+    copy->maps = m->maps;
+    copy->incremental_maps = m->incremental_maps;
+    messenger->send_message(copy, MSG_ADDR_CLIENT(*client), clientmap.get_inst(*client));
+  }
+}
+
+
+// Completion context wrapped in boot_mkfs()'s C_Gather; calls
+// MDS::boot_mkfs_finish() when it fires.
+class C_MDS_MkfsFinish : public Context {
+  MDS *mds;
+public:
+  C_MDS_MkfsFinish(MDS *m) {
+    mds = m;
+  }
+  void finish(int r) {
+    mds->boot_mkfs_finish();
+  }
+};
+
+/**
+ * Boot by creating fresh on-disk state: an empty root directory (mds0
+ * only), an empty journal, a fresh id allocator table and, on the MDS
+ * that hosts it, a fresh anchor table.  Each write registers a sub-context
+ * on one C_Gather that wraps a C_MDS_MkfsFinish.
+ */
+void MDS::boot_mkfs()
+{
+  dout(3) << "boot_mkfs" << endl;
+
+  C_Gather *gather = new C_Gather(new C_MDS_MkfsFinish(this));
+
+  if (whoami == 0) {
+    dout(3) << "boot_mkfs - creating root inode and dir" << endl;
+
+    // create root inode.
+    mdcache->open_root(0);
+    CInode *root = mdcache->get_root();
+    assert(root);
+
+    // force empty root dir
+    CDir *rootdir = root->dir;
+    rootdir->mark_complete();
+    rootdir->mark_dirty();
+
+    // save it
+    mdstore->commit_dir(rootdir, gather->new_sub());
+  }
+
+  // start with a fresh journal
+  dout(10) << "boot_mkfs creating fresh journal" << endl;
+  mdlog->reset();
+  mdlog->write_head(gather->new_sub());
+
+  // fixme: fake out idalloc (reset, pretend loaded)
+  dout(10) << "boot_mkfs creating fresh idalloc table" << endl;
+  idalloc->reset();
+  idalloc->save(gather->new_sub());
+
+  // fixme: fake out anchortable
+  const bool have_anchortable = (mdsmap->get_anchortable() == whoami);
+  if (have_anchortable) {
+    dout(10) << "boot_mkfs creating fresh anchortable" << endl;
+    anchormgr->reset();
+    anchormgr->save(gather->new_sub());
+  }
+}
+
+// Called via C_MDS_MkfsFinish once boot_mkfs()'s writes are done: go active.
+void MDS::boot_mkfs_finish()
+{
+  dout(3) << "boot_mkfs_finish" << endl;
+
+  this->mark_active();
+}
+
+
+// Completion context that re-enters MDS::boot_recover() at a given step.
+class C_MDS_BootRecover : public Context {
+  MDS *mds;
+  int nextstep;
+public:
+  C_MDS_BootRecover(MDS *m, int n) {
+    mds = m;
+    nextstep = n;
+  }
+  void finish(int r) {
+    mds->boot_recover(nextstep);
+  }
+};
+
+void MDS::boot_recover(int step) // run recovery stage `step`; stages that start a load pass a C_MDS_BootRecover carrying the next step number
+{
+ if (is_booting())
+ state = STATE_RECOVERING; // leave the booting state on entry
+
+ switch (step) {
+ case 0: // root inode (only implemented for whoami == 0)
+ if (whoami == 0) {
+ dout(2) << "boot_recover " << step << ": creating root inode" << endl;
+ mdcache->open_root(0);
+ step = 1;
+ // fall-thru into case 1 (there is no break after this if/else)
+ } else {
+ // FIXME: this stage is not implemented for whoami != 0
+ assert(0);
+ }
+
+ case 1: // id allocator; the load callback re-enters at step 2
+ dout(2) << "boot_recover " << step << ": opening idalloc" << endl;
+ idalloc->load(new C_MDS_BootRecover(this, 2));
+ break;
+
+ case 2: // anchor table, only on the MDS that mdsmap names as its host
+ if (mdsmap->get_anchortable() == whoami) {
+ dout(2) << "boot_recover " << step << ": opening anchor table" << endl;
+ anchormgr->load(new C_MDS_BootRecover(this, 3));
+ break;
+ } else {
+ dout(2) << "boot_recover " << step << ": i have no anchor table" << endl;
+ step++;
+ }
+ // fall-thru into case 3 when there is no anchor table to load
+
+ case 3: // open the journal; the callback re-enters at step 4
+ dout(2) << "boot_recover " << step << ": opening mds log" << endl;
+ mdlog->open(new C_MDS_BootRecover(this, 4));
+ break;
+
+ case 4: // replay the journal; the callback re-enters at step 5
+ dout(2) << "boot_recover " << step << ": replaying mds log" << endl;
+ mdlog->replay(new C_MDS_BootRecover(this, 5));
+ break;
+
+ case 5: // restart purges, then continue straight into the final stage
+ dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl;
+ mdcache->start_recovered_purges();
+ step++;
+ // fall-thru into case 6
+
+ case 6: // recovery complete
+ dout(2) << "boot_recover " << step << ": done." << endl;
+ mark_active();
+ }
+}
+
+
+
+// Enter STATE_ACTIVE and complete the contexts queued on waitfor_active.
+void MDS::mark_active()
+{
+  dout(3) << "mark_active" << endl;
+
+  state = STATE_ACTIVE;
+
+  // kick waiters
+  finish_contexts(waitfor_active);
+}
+
+
+
+
+
+/**
+ * Begin a cluster-wide shutdown: send MSG_MDS_SHUTDOWNSTART to every MDS
+ * listed in the mds map, shut down the id allocator, then run our own
+ * handle_shutdown_start().  Always returns 0.
+ */
+int MDS::shutdown_start()
+{
+  dout(1) << "shutdown_start" << endl;
+  derr(0) << "mds shutdown start" << endl;
+
+  for (set<int>::iterator who = mdsmap->get_mds().begin();
+       who != mdsmap->get_mds().end();
+       ++who) {
+    dout(1) << "sending MShutdownStart to mds" << *who << endl;
+    send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART),
+                     *who, MDS_PORT_MAIN);
+  }
+
+  if (idalloc) {
+    idalloc->shutdown();
+  }
+
+  handle_shutdown_start(NULL);
+  return 0;
+}
+
+
+/**
+ * handle_shutdown_start
+ * enter STATE_STOPPING and start winding down: begin cache shutdown,
+ * save the anchor table if this mds holds it, and trim the log down
+ * to zero events.
+ *
+ * m is the triggering message, or NULL when called locally from
+ * shutdown_start(); it is deleted here.
+ */
+void MDS::handle_shutdown_start(Message *m)
+{
+  dout(1) << " handle_shutdown_start" << endl;
+
+  // set flag
+  state = STATE_STOPPING;
+
+  mdcache->shutdown_start();
+
+  // save anchor table
+  if (mdsmap->get_anchortable() == whoami) {
+    anchormgr->save(0);  // FIXME FIXME
+  }
+
+  // flush log
+  mdlog->set_max_events(0);
+  mdlog->trim(NULL);
+
+  // deleting a null pointer is a no-op, so no check is needed
+  delete m;
+
+  //g_conf.debug_mds = 10;
+}
+
+
+
+/**
+ * shutdown_final
+ * finish shutting down: enter STATE_STOPPED, shut down the cache,
+ * send MSG_SHUTDOWN to monitor 0, and shut down the messenger.
+ * always returns 0.
+ */
+int MDS::shutdown_final()
+{
+  dout(1) << "shutdown" << endl;
+
+  state = STATE_STOPPED;
+
+  // shut down cache
+  mdcache->shutdown();
+
+  // tell monitor
+  Message *bye = new MGenericMessage(MSG_SHUTDOWN);
+  messenger->send_message(bye, MSG_ADDR_MON(0), monmap->get_inst(0));
+
+  // shut down messenger
+  messenger->shutdown();
+
+  return 0;
+}
+
+
+
+
+/**
+ * dispatch
+ * messenger entry point: run my_dispatch() on the message while
+ * holding mds_lock.
+ */
+void MDS::dispatch(Message *m)
+{
+  // make sure we advance the clock (the returned time is not used)
+  g_clock.now();
+
+  // process, under the mds lock
+  this->mds_lock.Lock();
+  this->my_dispatch(m);
+  this->mds_lock.Unlock();
+}
+
+
+
+void MDS::my_dispatch(Message *m)
+{
+ /* Hand m to the subsystem selected by its destination port, then run
+  * the per-message housekeeping below: flush the log and trim the cache
+  * (when active), run queued finished contexts, do the once-per-second
+  * logging/balancing work, and make a shutdown pass when stopping.
+  * Called by dispatch() with mds_lock held, and by
+  * C_MDS_RetryMessage::finish().
+  */
+ switch (m->get_dest_port()) {
+
+ case MDS_PORT_ANCHORMGR:
+ anchormgr->dispatch(m);
+ break;
+ case MDS_PORT_ANCHORCLIENT:
+ anchorclient->dispatch(m);
+ break;
+
+ case MDS_PORT_CACHE:
+ mdcache->dispatch(m);
+ break;
+ case MDS_PORT_LOCKER:
+ locker->dispatch(m);
+ break;
+
+ case MDS_PORT_MIGRATOR:
+ mdcache->migrator->dispatch(m);
+ break;
+ case MDS_PORT_RENAMER:
+ mdcache->renamer->dispatch(m);
+ break;
+
+ case MDS_PORT_BALANCER:
+ balancer->proc_message(m);
+ break;
+
+ case MDS_PORT_MAIN:
+ proc_message(m);
+ break;
+
+ case MDS_PORT_SERVER:
+ server->dispatch(m);
+ break;
+
+ default: // no case above matched the port (MDS_PORT_STORE has none): log and abort
+ dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl;
+ assert(0);
+ }
+
+
+ // HACK FOR NOW
+ /*
+ static bool did_heartbeat_hack = false;
+ if (!shutting_down && !shut_down &&
+ false &&
+ !did_heartbeat_hack) {
+ osdmonitor->initiate_heartbeat();
+ did_heartbeat_hack = true;
+ }
+ */
+
+
+ if (is_active()) {
+ // flush log to disk after every op. for now.
+ mdlog->flush();
+
+ // trim cache
+ mdcache->trim();
+ }
+
+ // finish any triggered contexts
+ if (finished_queue.size()) {
+ dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl;
+ list<Context*> ls;
+ ls.splice(ls.begin(), finished_queue); // move the whole queue onto a local list; finished_queue is left empty
+ assert(finished_queue.empty());
+ finish_contexts(ls); // presumably the contexts may queue more work; that lands on the now-empty finished_queue
+ }
+
+
+
+ // hash root?
+ if (false && // disabled: the leading false short-circuits the whole condition
+ mdcache->get_root() &&
+ mdcache->get_root()->dir &&
+ !(mdcache->get_root()->dir->is_hashed() ||
+ mdcache->get_root()->dir->is_hashing())) {
+ dout(0) << "hashing root" << endl;
+ mdcache->migrator->hash_dir(mdcache->get_root()->dir);
+ }
+
+
+ // periodic crap (1-second resolution)
+ static utime_t last_log = g_clock.recent_now(); // function-local static: set on the first call, updated below
+ utime_t now = g_clock.recent_now();
+ if (is_active() &&
+ last_log.sec() != now.sec()) { // only when the seconds value has changed since the last pass
+
+ // log
+ last_log = now;
+ mds_load_t load = balancer->get_load();
+
+ if (logger) {
+ req_rate = logger->get("req"); // cached for get_req_rate()
+
+ logger->set("l", (int)load.mds_load());
+ logger->set("q", messenger->get_dispatch_queue_len());
+ logger->set("buf", buffer_total_alloc);
+
+ mdcache->log_stat(logger);
+ }
+
+ /* A heartbeat is sent only by mds0, only while not stopping/stopped,
+  * only if num_bal_times is nonzero or mds_bal_max_until is >= 0 and
+  * more than that many seconds have elapsed since 'first', and only
+  * if at least mds_bal_interval seconds have passed since the last one.
+  */
+ // balance?
+ static int num_bal_times = g_conf.mds_bal_max; // initialized the first time this branch runs
+ static utime_t first = g_clock.recent_now(); // likewise; reference point for 'elapsed'
+ utime_t elapsed = now;
+ elapsed -= first;
+ if (true &&
+ whoami == 0 &&
+ (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) &&
+ !is_stopping() && !is_stopped() &&
+ now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) {
+ last_balancer_heartbeat = now;
+ balancer->send_heartbeat();
+ num_bal_times--; // NOTE(review): never clamped, so it can go negative, which tests as true above - confirm intended
+ }
+
+ // hash?
+ if (true &&
+ g_conf.num_mds > 1 &&
+ now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) {
+ last_balancer_hash = now;
+ balancer->do_hashing();
+ }
+
+
+
+ // HACK to test hashing stuff
+ if (false) { // disabled test code
+ static map<int,int> didhash;
+ if (elapsed.sec() > 15 && !didhash[whoami]) {
+ CInode *in = mdcache->get_inode(100000010);
+ if (in && in->dir) {
+ if (in->dir->is_auth())
+ mdcache->migrator->hash_dir(in->dir);
+ didhash[whoami] = 1;
+ }
+ }
+ if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) {
+ CInode *in = mdcache->get_inode(100000010);
+ if (in && in->dir) {
+ if (in->dir->is_auth() && in->dir->is_hashed())
+ mdcache->migrator->unhash_dir(in->dir);
+ didhash[whoami] = 2;
+ }
+ }
+ }
+
+
+
+ }
+
+ // HACK to force export to test foreign renames
+ if (false && whoami == 0) { // disabled test code
+ static bool didit = false;
+
+ // 7 to 1
+ CInode *in = mdcache->get_inode(1001);
+ if (in && in->is_dir() && !didit) {
+ CDir *dir = in->get_or_open_dir(this);
+ if (dir->is_auth()) {
+ dout(1) << "FORCING EXPORT" << endl;
+ mdcache->migrator->export_dir(dir,1);
+ didit = true;
+ }
+ }
+ }
+
+
+
+ // shut down?
+ if (is_stopping()) {
+ if (mdcache->shutdown_pass()) { // a true return is treated as "cache shutdown finished"
+ dout(7) << "shutdown_pass=true, finished w/ shutdown" << endl;
+ shutdown_final();
+ }
+ }
+
+}
+
+
+/**
+ * proc_message
+ * handle a message addressed to MDS_PORT_MAIN by dispatching on its
+ * type.  types with no case here are ignored without comment.
+ */
+void MDS::proc_message(Message *m)
+{
+  switch (m->get_type()) {
+
+    // -- OSD --
+    /*
+  case MSG_OSD_MKFS_ACK:
+    handle_osd_mkfs_ack(m);
+    return;
+    */
+  case MSG_OSD_OPREPLY:
+    objecter->handle_osd_op_reply((class MOSDOpReply*)m);
+    break;
+
+  case MSG_OSD_MAP:
+    handle_osd_map((MOSDMap*)m);
+    break;
+
+    // -- MDS --
+  case MSG_MDS_MAP:
+    handle_mds_map((MMDSMap*)m);
+    break;
+
+  case MSG_MDS_SHUTDOWNSTART:    // mds0 -> mds1+
+    handle_shutdown_start(m);
+    break;
+
+    // -- misc --
+  case MSG_PING:
+    handle_ping((MPing*)m);
+    break;
+  }
+}
+
+
+
+
+
+
+/**
+ * handle_ping
+ * answer a ping with an MPingAck sent back to its source, then delete
+ * the ping.
+ */
+void MDS::handle_ping(MPing *m)
+{
+  dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl;
+
+  // reply to whoever sent it
+  MPingAck *ack = new MPingAck(m);
+  messenger->send_message(ack, m->get_source(), m->get_source_inst());
+
+  delete m;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MDS_H
+#define __MDS_H
+
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+#include <ostream>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include "msg/Dispatcher.h"
+#include "include/types.h"
+#include "include/Context.h"
+#include "common/DecayCounter.h"
+#include "common/Logger.h"
+#include "common/Mutex.h"
+
+#include "mon/MonMap.h"
+
+#include "ClientMap.h"
+
+// message destination ports; MDS::my_dispatch() switches on these to pick the receiver
+#define MDS_PORT_MAIN 0 // MDS::proc_message()
+#define MDS_PORT_SERVER 1 // server->dispatch()
+#define MDS_PORT_CACHE 2 // mdcache->dispatch()
+#define MDS_PORT_LOCKER 3 // locker->dispatch()
+#define MDS_PORT_STORE 4 // NOTE(review): no case for this port in MDS::my_dispatch() - confirm it is unused
+#define MDS_PORT_BALANCER 5 // balancer->proc_message()
+#define MDS_PORT_MIGRATOR 6 // mdcache->migrator->dispatch()
+#define MDS_PORT_RENAMER 7 // mdcache->renamer->dispatch()
+
+#define MDS_PORT_ANCHORCLIENT 10 // anchorclient->dispatch()
+#define MDS_PORT_ANCHORMGR 11 // anchormgr->dispatch()
+
+// fixed inode numbers / offsets.  NOTE(review): their users are not visible here; the names suggest reserved inos - confirm
+#define MDS_INO_ROOT 1
+#define MDS_INO_PGTABLE 2
+#define MDS_INO_LOG_OFFSET 0x100
+#define MDS_INO_IDS_OFFSET 0x200
+#define MDS_INO_INODEFILE_OFFSET 0x300
+#define MDS_INO_ANCHORTABLE 0x400
+#define MDS_INO_BASE 0x1000
+// path traversal modes.  NOTE(review): the traversal code is not visible here; see the per-value notes
+#define MDS_TRAVERSE_FORWARD 1
+#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc.
+#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries.
+#define MDS_TRAVERSE_FAIL 4
+
+
+class filepath;
+
+class MDSMap;
+class OSDMap;
+class Objecter;
+class Filer;
+
+class Server;
+class Locker;
+class AnchorTable;
+class AnchorClient;
+class MDCache;
+class MDStore;
+class MDLog;
+class MDBalancer;
+class IdAllocator;
+
+class CInode;
+class CDir;
+class CDentry;
+
+class Messenger;
+class Message;
+
+class MClientRequest;
+class MClientReply;
+class MHashReaddir;
+class MHashReaddirReply;
+
+
+
+/* MDS: the metadata server daemon object.  It is the Dispatcher for
+ * incoming messages, holds pointers to the subsystems (cache, log,
+ * store, balancer, ...) and tracks the boot/active/shutdown state.
+ */
+class MDS : public Dispatcher {
+ public:
+ Mutex mds_lock; // held by dispatch() around my_dispatch()
+
+ protected:
+ int whoami; // this daemon's mds number, as returned by get_nodeid()
+
+ public:
+ Messenger *messenger;
+ MDSMap *mdsmap;
+ MonMap *monmap;
+ OSDMap *osdmap;
+ Objecter *objecter;
+ Filer *filer; // for reading/writing to/from osds
+
+ ClientMap clientmap;
+
+ // sub systems
+ Server *server;
+ MDCache *mdcache;
+ Locker *locker;
+ MDStore *mdstore;
+ MDLog *mdlog;
+ MDBalancer *balancer;
+
+ IdAllocator *idalloc;
+
+ AnchorTable *anchormgr;
+ AnchorClient *anchorclient;
+
+ Logger *logger, *logger2; // logger may be NULL; users test it before logging
+
+
+
+ protected:
+ // -- MDS state --
+ static const int STATE_BOOTING = 1; // fetching mds and osd maps
+ static const int STATE_MKFS = 2; // creating a file system
+ static const int STATE_RECOVERING = 3; // recovering mds log
+ static const int STATE_ACTIVE = 4; // up and active!
+ static const int STATE_STOPPING = 5; // set by handle_shutdown_start()
+ static const int STATE_STOPPED = 6; // set by shutdown_final()
+
+ int state; // one of the STATE_* values above
+ list<Context*> waitfor_active; // finished by mark_active()
+
+public:
+ void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); }
+
+ bool is_booting() { return state == STATE_BOOTING; }
+ bool is_recovering() { return state == STATE_RECOVERING; }
+ bool is_active() { return state == STATE_ACTIVE; }
+ bool is_stopping() { return state == STATE_STOPPING; }
+ bool is_stopped() { return state == STATE_STOPPED; }
+
+ void mark_active(); // enter STATE_ACTIVE and finish waitfor_active
+
+
+ // -- waiters --
+ list<Context*> finished_queue; // contexts to run; drained at the end of each my_dispatch()
+
+ void queue_finished(Context *c) {
+ finished_queue.push_back(c);
+ }
+ void queue_finished(list<Context*>& ls) {
+ finished_queue.splice( finished_queue.end(), ls ); // moves every entry; ls is left empty
+ }
+
+
+
+ // shutdown crap
+ int req_rate; // last value of logger->get("req"), refreshed by my_dispatch()
+
+ // ino's and fh's
+ public:
+
+ int get_req_rate() { return req_rate; }
+
+ protected:
+
+ friend class MDStore;
+
+
+ public:
+
+ protected:
+ utime_t last_balancer_heartbeat, last_balancer_hash; // times of the last balancer heartbeat / hashing pass in my_dispatch()
+
+ public:
+ MDS(int whoami, Messenger *m, MonMap *mm);
+ ~MDS();
+
+ // who am i etc
+ int get_nodeid() { return whoami; }
+ MDSMap *get_mds_map() { return mdsmap; }
+ OSDMap *get_osd_map() { return osdmap; }
+
+ void send_message_mds(Message *m, int mds, int port=0, int fromport=0);
+
+ // start up, shutdown
+ int init();
+ void reopen_log();
+
+ void boot_mkfs();
+ void boot_mkfs_finish();
+ void boot_recover(int step=0); // step 0 starts recovery; later steps are re-entered via C_MDS_BootRecover
+
+ int shutdown_start();
+ int shutdown_final();
+
+ int hash_dentry(inodeno_t ino, const string& s) {
+ return 0; // fixme
+ }
+
+
+ // messages
+ void proc_message(Message *m); // messages for MDS_PORT_MAIN
+ virtual void dispatch(Message *m); // takes mds_lock, then calls my_dispatch()
+ void my_dispatch(Message *m); // routes by destination port; caller holds mds_lock
+
+ // special message types
+ void handle_ping(class MPing *m);
+
+ void handle_mds_map(class MMDSMap *m);
+
+ void handle_shutdown_start(Message *m); // m may be NULL (local call from shutdown_start())
+
+ // osds
+ void handle_osd_getmap(Message *m);
+ void handle_osd_map(class MOSDMap *m);
+
+};
+
+
+
+/**
+ * C_MDS_RetryMessage
+ * context that feeds a saved message back through MDS::my_dispatch()
+ * when it is finished.  the message must not be NULL; the result code
+ * is ignored.
+ */
+class C_MDS_RetryMessage : public Context {
+  Message *m;   // message to redispatch
+  MDS *mds;     // daemon that will redispatch it
+
+public:
+  C_MDS_RetryMessage(MDS *mds, Message *m)
+    : m(m),
+      mds(mds)
+  {
+    assert(m);
+  }
+
+  virtual void finish(int r) {
+    mds->my_dispatch(m);
+  }
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDSMAP_H
+#define __MDSMAP_H
+
+#include "common/Clock.h"
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+#include <set>
+#include <map>
+#include <string>
+using namespace std;
+
+/**
+ * MDSMap
+ * describes the mds cluster at a given epoch: which mds numbers exist,
+ * which are down, the messenger instance of each, and which mds holds
+ * the anchor table.
+ */
+class MDSMap {
+ protected:
+  epoch_t epoch;      // map version; bumped by inc_epoch()
+  utime_t ctime;
+
+  int anchortable;    // number of the mds that holds the anchor table
+
+  set<int> all_mds;
+  set<int> down_mds;
+  map<int,entity_inst_t> mds_inst;
+
+  friend class MDSMonitor;
+
+ public:
+  MDSMap() : epoch(0), anchortable(0) {}
+
+  epoch_t get_epoch() const { return epoch; }
+  void inc_epoch() { epoch++; }
+
+  const utime_t& get_ctime() const { return ctime; }
+
+  int get_anchortable() const { return anchortable; }
+
+  int get_num_mds() const { return all_mds.size(); }
+  int get_num_up_mds() const { return all_mds.size() - down_mds.size(); }
+
+  const set<int>& get_mds() const { return all_mds; }
+  const set<int>& get_down_mds() const { return down_mds; }
+
+  // an mds counts as up whenever it is not listed in down_mds
+  bool is_down(int m) const { return down_mds.count(m); }
+  bool is_up(int m) const { return !is_down(m); }
+
+  /** get_inst
+   * return the instance recorded for mds m, which must be present.
+   * uses a single find() so the lookup never inserts into the map and
+   * can be made on a const MDSMap.
+   */
+  const entity_inst_t& get_inst(int m) const {
+    map<int,entity_inst_t>::const_iterator p = mds_inst.find(m);
+    assert(p != mds_inst.end());
+    return p->second;
+  }
+
+  /** get_inst
+   * copy the instance recorded for mds m into inst.
+   * returns false, leaving inst untouched, if m has no entry.
+   */
+  bool get_inst(int m, entity_inst_t& inst) const {
+    map<int,entity_inst_t>::const_iterator p = mds_inst.find(m);
+    if (p == mds_inst.end())
+      return false;
+    inst = p->second;
+    return true;
+  }
+
+  /** get_inst_rank
+   * reverse lookup: the mds number whose instance equals inst,
+   * or -1 if there is none.
+   */
+  int get_inst_rank(const entity_inst_t& inst) {
+    for (map<int,entity_inst_t>::iterator p = mds_inst.begin();
+         p != mds_inst.end();
+         ++p) {
+      if (p->second == inst) return p->first;
+    }
+    return -1;
+  }
+
+
+  // serialize, unserialize
+  //
+  // epoch, ctime and anchortable are copied as raw bytes, followed by
+  // the encoded all_mds, down_mds and mds_inst.  decode() reads the
+  // fields back in the same order.
+  void encode(bufferlist& blist) {
+    blist.append((char*)&epoch, sizeof(epoch));
+    blist.append((char*)&ctime, sizeof(ctime));
+    blist.append((char*)&anchortable, sizeof(anchortable));
+
+    _encode(all_mds, blist);
+    _encode(down_mds, blist);
+    _encode(mds_inst, blist);
+  }
+
+  void decode(bufferlist& blist) {
+    int off = 0;
+    blist.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    blist.copy(off, sizeof(ctime), (char*)&ctime);
+    off += sizeof(ctime);
+    blist.copy(off, sizeof(anchortable), (char*)&anchortable);
+    off += sizeof(anchortable);
+
+    _decode(all_mds, blist, off);
+    _decode(down_mds, blist, off);
+    _decode(mds_inst, blist, off);
+  }
+
+
+  /*** mapping functions ***/
+
+  int hash_dentry( inodeno_t dirino, const string& dn );
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "MDStore.h"
+#include "MDS.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "MDSMap.h"
+
+#include "osd/OSDMap.h"
+#include "osdc/Filer.h"
+
+#include "msg/Message.h"
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store "
+
+
+/*
+ * hashed dir slices live in separate "regions" of the dir object.
+ * a negative hashcode means "not hashed" and maps to offset 0; slice
+ * h >= 0 starts at (1<<30) * (1+h).
+ */
+size_t get_hash_offset(int hashcode) {
+  if (hashcode >= 0) {
+    const size_t region = (size_t)(1<<30);
+    return region * (size_t)(1+hashcode);
+  }
+  return 0;   // not hashed
+}
+
+
+
+
+// ==========================================================================
+// FETCH
+
+
+/**
+ * C_MDS_Fetch
+ * completion for a directory fetch: passes the result and the dir's
+ * ino to MDStore::fetch_dir_2().
+ */
+class C_MDS_Fetch : public Context {
+ protected:
+  MDStore *ms;
+  inodeno_t ino;   // dir inode number; looked up again on completion
+
+ public:
+  C_MDS_Fetch(MDStore *ms, inodeno_t ino)
+    : Context(),
+      ms(ms),
+      ino(ino)
+  {}
+
+  void finish(int result) {
+    ms->fetch_dir_2( result, ino );
+  }
+};
+
+/** fetch_dir(dir, context)
+ * public call to fetch a dir.
+ * c (may be NULL) waits on CDIR_WAIT_COMPLETE.  if a fetch is already
+ * in progress for this dir, no second fetch is started.
+ */
+void MDStore::fetch_dir( CDir *dir,
+                         Context *c )
+{
+  dout(7) << "fetch_dir " << *dir << " context is " << c << endl;
+  assert(dir->is_auth() || dir->is_hashed());
+
+  // the caller waits for the dir to become complete
+  if (c) {
+    dir->add_waiter(CDIR_WAIT_COMPLETE, c);
+  }
+
+  // only one fetch at a time
+  if (dir->state_test(CDIR_STATE_FETCHING)) {
+    dout(7) << "already fetching " << *dir << "; waiting" << endl;
+    return;
+  }
+  dir->state_set(CDIR_STATE_FETCHING);
+
+  // stats
+  if (mds->logger) {
+    mds->logger->inc("fdir");
+  }
+
+  // start the read; a hashed dir fetches the slice numbered by our mds id
+  Context *onfetch = new C_MDS_Fetch( this, dir->ino() );
+  if (!dir->is_hashed()) {
+    fetch_dir_hash( dir, onfetch );                      // normal
+  } else {
+    fetch_dir_hash( dir, onfetch, mds->get_nodeid() );   // hashed
+  }
+}
+
+/*
+ * called by low level fn when it's fetched.
+ * fix up dir state and wake the waiters.  does nothing if the fetch
+ * failed (result < 0) or the inode is no longer in the cache.
+ */
+void MDStore::fetch_dir_2( int result,
+                           inodeno_t ino)
+{
+  CInode *diri = mds->mdcache->get_inode(ino);
+  if (result < 0 || !diri)
+    return;   // hmm! nevermind i guess.
+
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  // dir is now complete, and no longer being fetched
+  dir->state_set(CDIR_STATE_COMPLETE);
+  dir->state_clear(CDIR_STATE_FETCHING);
+
+  // wake everyone waiting on completeness or on a dentry
+  list<Context*> waiters;
+  dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, waiters);
+  finish_contexts(waiters, result);
+}
+
+
+/** low level methods **/
+
+/**
+ * C_MDS_FetchHash
+ * completion for a raw read of a dir object (or hash slice).  the
+ * data starts with a size_t payload length; if the bytes read so far
+ * do not cover it, another read is issued for the remainder,
+ * otherwise the buffer goes to MDStore::fetch_dir_hash_2().
+ */
+class C_MDS_FetchHash : public Context {
+protected:
+  MDS *mds;
+  inode_t inode;      // copy of the dir's inode, used to address the reads
+  int hashcode;       // slice being fetched (negative: whole dir)
+  Context *context;   // passed through to fetch_dir_hash_2()
+
+public:
+  bufferlist bl;      // data accumulated so far
+  bufferlist bl2;     // target of a follow-up read; appended to bl
+
+  C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode)
+    : Context(),
+      mds(mds),
+      inode(inode),
+      hashcode(hashcode),
+      context(c)
+  {}
+
+  void finish(int result) {
+    assert(result>0);
+
+    // combine bufferlists bl + bl2 -> bl
+    bl.claim_append(bl2);
+
+    // the payload length is stored at the front
+    size_t size;
+    bl.copy(0, sizeof(size_t), (char*)&size);
+    size_t got = bl.length() - sizeof(size);
+
+    if (got >= size) {
+      // done.
+      mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode );
+      return;
+    }
+
+    // read the rest!  continue right after what we have, inside
+    // this slice's region of the object
+    size_t left = size - got;
+    size_t from = bl.length() + get_hash_offset(hashcode);
+    dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl;
+
+    C_MDS_FetchHash *more = new C_MDS_FetchHash( mds, inode, context, hashcode );
+    more->bl.claim( bl );
+    mds->filer->read(inode,
+                     from, left,
+                     &more->bl2,
+                     more );
+  }
+};
+
+/** fetch_dir_hash
+ * low level method.
+ * fetch part of a dir.  either the whole thing if hashcode is -1, or a specific
+ * hash segment.  only the first stripe_size bytes are requested here;
+ * C_MDS_FetchHash reads the rest if the stored length calls for it.
+ */
+void MDStore::fetch_dir_hash( CDir *dir,
+                              Context *c,
+                              int hashcode)
+{
+  dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl;
+
+  // the first stripe bit had better be more than 16 bytes!
+  assert(dir->get_inode()->inode.layout.stripe_size >= 16);
+
+  // create return context; it also owns the buffer being read into
+  C_MDS_FetchHash *onread =
+    new C_MDS_FetchHash( mds, dir->get_inode()->inode, c, hashcode );
+
+  // grab the first stripe bit of this slice's region
+  mds->filer->read(dir->get_inode()->inode,
+                   get_hash_offset(hashcode),
+                   dir->get_inode()->inode.layout.stripe_size,
+                   &onread->bl,
+                   onread );
+}
+
+/** fetch_dir_hash_2
+ * low level method.
+ * parse a fetched dir object (or hash slice) and add its contents to
+ * the cache.
+ *
+ * the buffer is expected in the form written by commit_dir_slice():
+ *   size_t size, __uint32_t num, version_t version, int hashcode,
+ *   then num entries of
+ *     name, 'L', inodeno_t               (remote link)
+ *     name, 'I', inode_t [, symlink]     (primary link)
+ *
+ * dentries that already exist in the cache are left alone, except
+ * that a primary dentry whose inode's parent dir version is not newer
+ * than the version read is marked clean together with its inode.
+ *
+ * c may be NULL.  otherwise it is finished with -1 if the dir can no
+ * longer be filled in (inode gone from cache, not auth, or dir not
+ * open), or with 0 after parsing, and then deleted.
+ */
+void MDStore::fetch_dir_hash_2( bufferlist& bl,
+                                inode_t& inode,
+                                Context *c,
+                                int hashcode)
+{
+  CInode *idir = mds->mdcache->get_inode(inode.ino);
+  if (!idir) {
+    dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl;
+    if (c) {   // c is optional here, as at the end of this function
+      c->finish(-1);
+      delete c;
+    }
+    return;
+  }
+
+  if (!idir->dir_is_auth() ||
+      !idir->dir) {
+    dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl;
+    if (c) {
+      c->finish(-1);
+      delete c;
+    }
+    return;
+  }
+
+  // make sure we have a CDir
+  CDir *dir = idir->get_or_open_dir(mds);
+
+  // do it
+  dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl;
+
+  // parse buffer contents into cache
+  dout(15) << "bl is " << bl << endl;
+
+  // header
+  int off = 0;
+  size_t size;
+  __uint32_t num;
+  version_t got_version;
+  int got_hashcode;
+  bl.copy(off, sizeof(size), (char*)&size);
+  off += sizeof(size);
+  assert(bl.length() >= size + sizeof(size));
+  bl.copy(off, sizeof(num), (char*)&num);
+  off += sizeof(num);
+  bl.copy(off, sizeof(got_version), (char*)&got_version);
+  off += sizeof(got_version);
+  bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode);
+  off += sizeof(got_hashcode);
+
+  assert(got_hashcode == hashcode);
+
+  int buflen = bl.length();
+
+  dout(10) << " " << num << " items in " << size << " bytes" << endl;
+
+  // entries
+  unsigned parsed = 0;
+  while (parsed < num) {
+    assert(off < buflen && num > 0);
+    parsed++;
+
+    dout(24) << " " << parsed << "/" << num << " pos " << off << endl;
+
+    // dentry
+    string dname;
+    ::_decode(dname, bl, off);
+    dout(24) << "parse filename '" << dname << "'" << endl;
+
+    CDentry *dn = dir->lookup(dname);  // existing dentry?
+
+    char type = bl[off];
+    ++off;
+    if (type == 'L') {
+      // hard link
+      inodeno_t ino;
+      bl.copy(off, sizeof(ino), (char*)&ino);
+      off += sizeof(ino);
+
+      // what to do?
+      if (hashcode >= 0) {
+        int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+        assert(dentryhashcode == hashcode);
+      }
+
+      if (dn) {
+        if (dn->get_inode() == 0) {
+          // negative dentry?
+          dout(12) << "readdir had NEG dentry " << dname << endl;
+        } else {
+          // had dentry
+          dout(12) << "readdir had dentry " << dname << endl;
+        }
+        continue;
+      }
+
+      // (remote) link.  reuse dn rather than shadowing it with a new local.
+      dn = dir->add_dentry( dname, ino );
+
+      // link to inode?
+      CInode *in = mds->mdcache->get_inode(ino);   // we may or may not have it.
+      if (in) {
+        dn->link_remote(in);
+        dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl;
+      } else {
+        dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl;
+      }
+    }
+    else if (type == 'I') {
+      // inode
+
+      // parse out inode (named so it does not shadow the 'inode' parameter)
+      inode_t stored;
+      bl.copy(off, sizeof(stored), (char*)&stored);
+      off += sizeof(stored);
+
+      string symlink;
+      if (stored.is_symlink())
+        ::_decode(symlink, bl, off);
+
+      // what to do?
+      if (hashcode >= 0) {
+        int dentryhashcode = mds->hash_dentry( dir->ino(), dname );
+        assert(dentryhashcode == hashcode);
+      }
+
+      if (dn) {
+        if (dn->get_inode() == 0) {
+          // negative dentry?
+          dout(12) << "readdir had NEG dentry " << dname << endl;
+        } else {
+          // had dentry
+          dout(12) << "readdir had dentry " << dname << endl;
+
+          // under water?
+          if (dn->get_inode()->get_parent_dir_version() <= got_version) {
+            dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl;
+            dn->get_inode()->mark_clean();
+            dn->mark_clean();
+          }
+        }
+        continue;
+      }
+
+      // add inode
+      CInode *in = 0;
+      if (mds->mdcache->have_inode(stored.ino)) {
+        in = mds->mdcache->get_inode(stored.ino);
+        dout(12) << "readdir got (but i already had) " << *in
+                 << " mode " << in->inode.mode
+                 << " mtime " << in->inode.mtime << endl;
+      } else {
+        // inode
+        in = new CInode(mds->mdcache);
+        in->inode = stored;
+
+        // symlink?
+        if (in->is_symlink()) {
+          in->symlink = symlink;
+        }
+
+        // add
+        mds->mdcache->add_inode( in );
+      }
+
+      // link
+      dir->add_dentry( dname, in );
+      dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl;
+    }
+    else {
+      dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type)
+              << " at pos " << off << endl;
+      assert(0);
+    }
+  }
+  dout(15) << "parsed " << parsed << endl;
+
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+
+
+
+// ==================================================================
+// COMMIT
+
+/**
+ * C_MDS_CommitDirVerify
+ * re-check, after waiting on something (an in-flight commit, auth
+ * pinnability, a fetch), whether a requested dir version has been
+ * committed.
+ *
+ * on finish(r), with r >= 0 and the dir still ours:
+ *  - if last_committed_version >= version, c is finished with 0;
+ *  - otherwise commit_dir() is called again and takes over c.
+ * in every other case (r < 0, or we are no longer auth) c is finished
+ * with -1.  c may be NULL and is deleted once finished.
+ */
+class C_MDS_CommitDirVerify : public Context {
+public:
+  MDS *mds;
+  inodeno_t ino;       // dir inode; looked up again on completion
+  version_t version;   // version the caller needs committed
+  Context *c;          // caller's completion, may be NULL
+
+  C_MDS_CommitDirVerify( MDS *mds,
+                         inodeno_t ino,
+                         version_t version,
+                         Context *c) {
+    this->mds = mds;
+    this->c = c;
+    this->version = version;
+    this->ino = ino;
+  }
+
+  virtual void finish(int r) {
+
+    if (r >= 0) {
+      CInode *in = mds->mdcache->get_inode(ino);
+      assert(in && in->dir);
+      if (in && in->dir && in->dir->is_auth()) {
+        dout(7) << "CommitDirVerify: current version = " << in->dir->get_version() << endl;
+        dout(7) << "CommitDirVerify: last committed = " << in->dir->get_last_committed_version() << endl;
+        dout(7) << "CommitDirVerify: required = " << version << endl;
+
+        if (in->dir->get_last_committed_version() >= version) {
+          dout(7) << "my required version is safe, done." << endl;
+
+          // report success here; falling out of this block would reach
+          // the failure path below and finish c with -1.
+          if (c) {
+            c->finish(0);
+            delete c;
+          }
+        } else {
+          dout(7) << "my required version is still not safe, committing again." << endl;
+
+          // what was requested isn't committed yet.
+          mds->mdstore->commit_dir(in->dir,
+                                   version,
+                                   c);
+        }
+        return;
+      }
+    }
+
+    // must have exported or something!
+    dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl;
+    if (c) {
+      c->finish(-1);
+      delete c;
+    }
+  }
+};
+
+/**
+ * C_MDS_CommitDirFinish
+ * completion for a dir commit: passes the result, the dir, and the
+ * dir version sampled at construction to MDStore::commit_dir_2().
+ */
+class C_MDS_CommitDirFinish : public Context {
+ protected:
+  MDStore *ms;
+  CDir *dir;
+  version_t version;   // just for sanity check later
+
+ public:
+  C_MDS_CommitDirFinish(MDStore *ms, CDir *dir)
+    : Context(),
+      ms(ms),
+      dir(dir),
+      version(dir->get_version())
+  {}
+
+  void finish(int result) {
+    ms->commit_dir_2( result, dir, version );
+  }
+};
+
+
+/** commit_dir(dir, context)
+ * commit a dirty dir through its current version.
+ */
+void MDStore::commit_dir( CDir *dir,
+                          Context *c )
+{
+  assert(dir->is_dirty());
+
+  version_t current = dir->get_version();
+  commit_dir(dir, current, c);
+}
+
+/** commit_dir(dir, version, context)
+ * make sure at least the given version of dir gets committed.
+ *
+ * if a commit is already in flight, the dir cannot be auth pinned, or
+ * the dir is incomplete (a fetch is started first), the request is
+ * parked behind a C_MDS_CommitDirVerify that re-checks later.
+ * otherwise the dir's current contents are written out and c
+ * (may be NULL) waits on CDIR_WAIT_COMMITTED.
+ */
+void MDStore::commit_dir( CDir *dir,
+                          version_t version,
+                          Context *c )
+{
+  assert(dir->is_auth() || dir->is_hashed());
+
+  // already mid-commit?  wait for it, then see if it covered us.
+  if (dir->state_test(CDIR_STATE_COMMITTING)) {
+    dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl;
+    dout(7) << " current version = " << dir->get_version() << endl;
+    dout(7) << "requested version = " << version << endl;
+
+    // why would we request an _old_ one?
+    assert(version >= dir->get_last_committed_version());
+
+    Context *recheck = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    dir->add_waiter(CDIR_WAIT_COMMITTED, recheck);
+    return;
+  }
+
+  // something must be frozen up the hierarchy!
+  if (!dir->can_auth_pin()) {
+    dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl;
+    Context *recheck = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, recheck);
+    return;
+  }
+
+  // the dir has to be complete; fetch it first if it isn't
+  if (!dir->is_complete()) {
+    dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl;
+    Context *recheck = new C_MDS_CommitDirVerify(mds, dir->ino(), version, c);
+    fetch_dir(dir, recheck);
+    return;
+  }
+
+  // ok go
+  dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl;
+
+  // the caller waits for the commit
+  if (c) {
+    dir->add_waiter(CDIR_WAIT_COMMITTED, c);
+  }
+
+  // get continuation ready
+  Context *oncommit = new C_MDS_CommitDirFinish(this, dir);
+
+  // state
+  dir->state_set(CDIR_STATE_COMMITTING);
+  dir->set_committing_version();
+
+  // stats
+  if (mds->logger) {
+    mds->logger->inc("cdir");
+  }
+
+  // a hashed dir writes only the slice numbered by our mds id
+  if (!dir->is_hashed()) {
+    commit_dir_slice( dir, oncommit );                      // non-hashed
+  } else {
+    commit_dir_slice( dir, oncommit, mds->get_nodeid() );   // hashed
+  }
+}
+
+/** commit_dir_2
+ * a dir commit has finished: record the committed version, mark the
+ * dir clean if nothing changed in the meantime, leave the committing
+ * state, and wake the CDIR_WAIT_COMMITTED waiters.
+ * result is not examined.
+ */
+void MDStore::commit_dir_2( int result,
+                            CDir *dir,
+                            version_t committed_version)
+{
+  dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl;
+  assert(committed_version == dir->get_committing_version());
+
+  // remember which version is now safe
+  dir->set_last_committed_version(committed_version);
+
+  // the dir is clean only if it was not changed during the commit
+  const bool unchanged = (committed_version == dir->get_version());
+  if (unchanged) {
+    dir->mark_clean();
+  }
+
+  dir->state_clear(CDIR_STATE_COMMITTING);
+
+  // finish
+  dir->finish_waiting(CDIR_WAIT_COMMITTED);
+}
+
+
+
+
+// low-level committer (hashed or normal)
+
+/**
+ * C_MDS_CommitSlice
+ * completion for the write of one dir slice.  holds the buffer being
+ * written, and passes the dir version sampled at construction to
+ * MDStore::commit_dir_slice_2().
+ */
+class C_MDS_CommitSlice : public Context {
+ protected:
+  MDStore *ms;
+  CDir *dir;
+  Context *c;          // passed through to commit_dir_slice_2()
+  int hashcode;        // slice written (negative: whole dir)
+  version_t version;   // dir version when the commit was started
+
+public:
+  bufferlist bl;       // encoded slice, filled in by commit_dir_slice()
+
+  C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w)
+    : Context(),
+      ms(ms),
+      dir(dir),
+      c(c),
+      hashcode(w),
+      version(dir->get_version())
+  {}
+
+  void finish(int result) {
+    ms->commit_dir_slice_2( result, dir, c, version, hashcode );
+  }
+};
+
+
+/** commit_dir_slice
+ * low level committer (hashed or normal).
+ * encode the dir's dentries (only those hashing to hashcode, if
+ * hashcode >= 0) and write them to the dir object.  c is passed on to
+ * commit_dir_slice_2() once the write is safe.
+ *
+ * what gets written:
+ *   size_t size            bytes that follow this field
+ *   __uint32_t num         number of entries
+ *   version_t version      dir version being committed
+ *   int hashcode
+ *   num entries of
+ *     name\0 'L' inodeno_t                (remote link)
+ *     name\0 'I' inode_t [symlink\0]      (primary link)
+ * null dentries are not written.
+ *
+ * the slice goes to get_hash_offset(hashcode), which is where
+ * fetch_dir_hash() reads it from; that is offset 0 for an unhashed
+ * dir.
+ */
+void MDStore::commit_dir_slice( CDir *dir,
+                                Context *c,
+                                int hashcode)
+{
+  if (hashcode >= 0) {
+    assert(dir->is_hashed());
+    dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl;
+  } else {
+    assert(dir->is_auth());
+    dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl;
+  }
+
+  // get continuation ready
+  C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode);
+
+  // fill buffer
+  __uint32_t num = 0;
+
+  bufferlist dirdata;
+
+  version_t v = dir->get_version();
+  dirdata.append((char*)&v, sizeof(v));
+  dirdata.append((char*)&hashcode, sizeof(hashcode));
+
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       ++it) {
+    CDentry *dn = it->second;
+
+    // only dentries belonging to this slice
+    if (hashcode >= 0) {
+      int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+      if (dentryhashcode != hashcode) continue;
+    }
+
+    // put dentry in this version
+    if (dn->is_dirty()) {
+      dn->float_parent_dir_version( dir->get_version() );
+      dout(12) << " dirty dn " << *dn << " now " << dn->get_parent_dir_version() << endl;
+    }
+
+    if (dn->is_null()) continue;  // skipping negative entry
+
+    // primary or remote?
+    if (dn->is_remote()) {
+
+      inodeno_t ino = dn->get_remote_ino();
+      dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl;
+
+      // name, marker, ino
+      dirdata.append( it->first.c_str(), it->first.length() + 1);
+      dirdata.append( "L", 1 );         // remote link
+      dirdata.append((char*)&ino, sizeof(ino));
+
+    } else {
+      // primary link
+      CInode *in = dn->get_inode();
+      assert(in);
+
+      dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl;
+
+      // name, marker, inode, [symlink string]
+      dirdata.append( it->first.c_str(), it->first.length() + 1);
+      dirdata.append( "I", 1 );         // inode
+      dirdata.append( (char*) &in->inode, sizeof(inode_t));
+
+      if (in->is_symlink()) {
+        // include symlink destination!
+        dout(18) << " inlcuding symlink ptr " << in->symlink << endl;
+        dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1);
+      }
+
+      // put inode in this dir version
+      if (in->is_dirty()) {
+        in->float_parent_dir_version( dir->get_version() );
+        dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl;
+
+        in->set_committing_version( in->get_version() );
+        assert(in->get_last_committed_version() < in->get_committing_version());
+      } else {
+        assert(in->get_committing_version() == in->get_version());
+      }
+
+    }
+
+    num++;
+  }
+  dout(14) << "num " << num << endl;
+
+  // put size and count in front of the entries
+  size_t size = sizeof(num) + dirdata.length();
+  fin->bl.append((char*)&size, sizeof(size));
+  fin->bl.append((char*)&num, sizeof(num));
+  fin->bl.claim_append(dirdata);
+  assert(fin->bl.length() == size + sizeof(size));
+
+  // pin the dir; unpinned in commit_dir_slice_2()
+  dir->auth_pin();
+
+  // submit to osd, at the start of this slice's region
+  mds->filer->write( dir->get_inode()->inode,
+                     get_hash_offset(hashcode), fin->bl.length(),
+                     fin->bl,
+                     0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write
+                     NULL, fin ); // on safe
+}
+
+
+/** commit_dir_slice_2
+ * the write started by commit_dir_slice() is safe.  walk the dentries
+ * of the slice and compare each dentry's (and each primary inode's)
+ * parent dir version with the committed one:
+ *   equal  -> it went out in this commit, so it is now clean
+ *   older  -> it must already have been clean
+ *   newer  -> it changed since, and stays dirty
+ * null dentries that became clean and are sync are removed from the
+ * dir.  then the dir is unpinned and c (may be NULL) is finished with
+ * 0 and deleted.  result is not examined.
+ */
+void MDStore::commit_dir_slice_2( int result,
+                                  CDir *dir,
+                                  Context *c,
+                                  version_t committed_version,
+                                  int hashcode )
+{
+  dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl;
+
+  // mark inodes and dentries clean too (if we committed them!)
+  list<CDentry*> doomed;   // clean null dentries, removed after the walk
+
+  CDir_map_t::iterator p = dir->begin();
+  while (p != dir->end()) {
+    CDentry *dn = p->second;
+    ++p;   // step now, so the continues below cannot skip it
+
+    // not part of this slice?
+    if (hashcode >= 0 &&
+        mds->hash_dentry( dir->ino(), dn->get_name() ) != hashcode) {
+      continue;
+    }
+
+    // dentry
+    if (dn->get_parent_dir_version() == committed_version) {
+      dout(15) << " dir " << committed_version << " == dn " << dn->get_parent_dir_version() << " now clean " << *dn << endl;
+      if (dn->is_dirty()) {
+        dn->mark_clean();   // might not but could be dirty
+      }
+
+      // remove, if it's null and unlocked
+      if (dn->is_null() && dn->is_sync()) {
+        dout(15) << " removing clean and null " << *dn << endl;
+        doomed.push_back(dn);
+        continue;
+      }
+    }
+    else if (committed_version > dn->get_parent_dir_version()) {
+      dout(15) << " dir " << committed_version << " > dn " << dn->get_parent_dir_version() << " still clean " << *dn << endl;
+      assert(!dn->is_dirty());
+    }
+    else {
+      dout(15) << " dir " << committed_version << " < dn " << dn->get_parent_dir_version() << " still dirty " << *dn << endl;
+      assert(committed_version < dn->get_parent_dir_version());
+      //assert(dn->is_dirty() || !dn->is_sync()); // -OR- we did a fetch_dir in order to do a newer commit...
+    }
+
+    // only do primary...
+    if (!dn->is_primary()) {
+      continue;
+    }
+
+    CInode *in = dn->get_inode();
+    assert(in);
+    assert(in->is_auth());
+
+    if (in->get_committing_version()) {
+      in->set_committed_version();
+    }
+
+    // inode
+    if (in->get_parent_dir_version() == committed_version) {
+      dout(15) << " dir " << committed_version << " == inode " << in->get_parent_dir_version() << " now clean " << *(in) << endl;
+      in->mark_clean();   // might not but could be dirty
+    }
+    else if (committed_version > in->get_parent_dir_version()) {
+      dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl;
+      assert(!in->is_dirty());
+    }
+    else {
+      dout(15) << " dir " << committed_version << " < inode " << in->get_parent_dir_version() << " still dirty " << *(in) << endl;
+      assert(committed_version < in->get_parent_dir_version());
+      //assert(in->is_dirty()); // -OR- we did a fetch_dir in order to do a newer commit...
+    }
+  }
+
+  // remove null clean dentries
+  for (list<CDentry*>::iterator q = doomed.begin(); q != doomed.end(); ++q) {
+    dir->remove_dentry(*q);
+  }
+
+  // unpin
+  dir->auth_unpin();
+
+  // finish
+  if (c) {
+    c->finish(0);
+    delete c;
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MDSTORE_H
+#define __MDSTORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+class MDS;
+class CDir;
+class Context;
+
+ /**
+  * MDStore
+  * Entry points for fetching directory contents and committing them to disk
+  * on behalf of an MDS.  The *_2 methods and the slice/hash helpers are
+  * protected; the friend context classes listed below are granted access to them.
+  */
+ class MDStore {
+  protected:
+   MDS *mds;   // the MDS this store works for (not owned here)
+
+  public:
+   MDStore(MDS *m) : mds(m) {}
+
+
+   // ---- fetch ----
+  public:
+   void fetch_dir(CDir *dir, Context *c);
+  protected:
+   void fetch_dir_2(int result, inodeno_t ino);
+
+   // hashcode defaults to -1
+   void fetch_dir_hash(CDir *dir, Context *c, int hashcode = -1);
+   void fetch_dir_hash_2(bufferlist &bl, inode_t& inode, Context *c, int which);
+
+   friend class C_MDS_Fetch;
+   friend class C_MDS_FetchHash;
+
+
+   // ---- commit ----
+  public:
+   // commit the dir's current version to disk
+   void commit_dir(CDir *dir, Context *c);
+   // commit a specific version to disk
+   void commit_dir(CDir *dir, __uint64_t version, Context *c);
+  protected:
+   void commit_dir_2(int result, CDir *dir, __uint64_t committed_version);
+
+   // low level committers (hashcode defaults to -1)
+   void commit_dir_slice(CDir *dir, Context *c, int hashcode = -1);
+   void commit_dir_slice_2(int result, CDir *dir, Context *c,
+                           __uint64_t version, int hashcode);
+
+   friend class C_MDS_CommitDirFinish;
+   friend class C_MDS_CommitSlice;
+ };
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+#include "Locker.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDirWarning.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MHashDirDiscover.h"
+#include "messages/MHashDirDiscoverAck.h"
+#include "messages/MHashDirPrep.h"
+#include "messages/MHashDirPrepAck.h"
+#include "messages/MHashDir.h"
+#include "messages/MHashDirNotify.h"
+#include "messages/MHashDirAck.h"
+
+#include "messages/MUnhashDirPrep.h"
+#include "messages/MUnhashDirPrepAck.h"
+#include "messages/MUnhashDir.h"
+#include "messages/MUnhashDirAck.h"
+#include "messages/MUnhashDirNotify.h"
+#include "messages/MUnhashDirNotifyAck.h"
+
+
+
+ /**
+  * dispatch
+  * Hand an incoming message to the Migrator handler that matches its type.
+  * The message is downcast to the concrete class the handler expects.
+  * A type with no handler here trips assert(0).
+  */
+ void Migrator::dispatch(Message *m)
+ {
+   switch (m->get_type()) {
+
+     // ---- import ----
+   case MSG_MDS_EXPORTDIRDISCOVER:
+     handle_export_dir_discover(static_cast<MExportDirDiscover*>(m));
+     break;
+   case MSG_MDS_EXPORTDIRPREP:
+     handle_export_dir_prep(static_cast<MExportDirPrep*>(m));
+     break;
+   case MSG_MDS_EXPORTDIR:
+     handle_export_dir(static_cast<MExportDir*>(m));
+     break;
+   case MSG_MDS_EXPORTDIRFINISH:
+     handle_export_dir_finish(static_cast<MExportDirFinish*>(m));
+     break;
+
+     // ---- export ----
+   case MSG_MDS_EXPORTDIRDISCOVERACK:
+     handle_export_dir_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
+     break;
+   case MSG_MDS_EXPORTDIRPREPACK:
+     handle_export_dir_prep_ack(static_cast<MExportDirPrepAck*>(m));
+     break;
+   case MSG_MDS_EXPORTDIRNOTIFYACK:
+     handle_export_dir_notify_ack(static_cast<MExportDirNotifyAck*>(m));
+     break;
+
+     // ---- export, as seen by a third party (inode authority) ----
+   case MSG_MDS_EXPORTDIRWARNING:
+     handle_export_dir_warning(static_cast<MExportDirWarning*>(m));
+     break;
+   case MSG_MDS_EXPORTDIRNOTIFY:
+     handle_export_dir_notify(static_cast<MExportDirNotify*>(m));
+     break;
+
+     // ---- hashing ----
+   case MSG_MDS_HASHDIRDISCOVER:
+     handle_hash_dir_discover(static_cast<MHashDirDiscover*>(m));
+     break;
+   case MSG_MDS_HASHDIRDISCOVERACK:
+     handle_hash_dir_discover_ack(static_cast<MHashDirDiscoverAck*>(m));
+     break;
+   case MSG_MDS_HASHDIRPREP:
+     handle_hash_dir_prep(static_cast<MHashDirPrep*>(m));
+     break;
+   case MSG_MDS_HASHDIRPREPACK:
+     handle_hash_dir_prep_ack(static_cast<MHashDirPrepAck*>(m));
+     break;
+   case MSG_MDS_HASHDIR:
+     handle_hash_dir(static_cast<MHashDir*>(m));
+     break;
+   case MSG_MDS_HASHDIRACK:
+     handle_hash_dir_ack(static_cast<MHashDirAck*>(m));
+     break;
+   case MSG_MDS_HASHDIRNOTIFY:
+     handle_hash_dir_notify(static_cast<MHashDirNotify*>(m));
+     break;
+
+     // ---- unhashing ----
+   case MSG_MDS_UNHASHDIRPREP:
+     handle_unhash_dir_prep(static_cast<MUnhashDirPrep*>(m));
+     break;
+   case MSG_MDS_UNHASHDIRPREPACK:
+     handle_unhash_dir_prep_ack(static_cast<MUnhashDirPrepAck*>(m));
+     break;
+   case MSG_MDS_UNHASHDIR:
+     handle_unhash_dir(static_cast<MUnhashDir*>(m));
+     break;
+   case MSG_MDS_UNHASHDIRACK:
+     handle_unhash_dir_ack(static_cast<MUnhashDirAck*>(m));
+     break;
+   case MSG_MDS_UNHASHDIRNOTIFY:
+     handle_unhash_dir_notify(static_cast<MUnhashDirNotify*>(m));
+     break;
+   case MSG_MDS_UNHASHDIRNOTIFYACK:
+     handle_unhash_dir_notify_ack(static_cast<MUnhashDirNotifyAck*>(m));
+     break;
+
+     // anything else is not ours to handle
+   default:
+     assert(0);
+   }
+ }
+
+
+ /**
+  * C_MDC_EmptyImport
+  * Completion context that calls Migrator::export_empty_import() on the
+  * remembered directory.  The completion code is ignored.
+  */
+ class C_MDC_EmptyImport : public Context {
+   Migrator *migrator;   // who to call back
+   CDir *import_dir;     // directory to re-examine
+ public:
+   C_MDC_EmptyImport(Migrator *m, CDir *d)
+     : migrator(m),
+       import_dir(d)
+   {}
+
+   void finish(int r) {
+     migrator->export_empty_import(import_dir);
+   }
+ };
+
+
+ /**
+  * export_empty_import
+  * Intended to hand an empty imported directory back to the authority of
+  * its inode.  NOTE: the function currently returns right after logging
+  * ("hack fixme"), so none of the logic below the early return executes.
+  */
+ void Migrator::export_empty_import(CDir *dir)
+ {
+   dout(7) << "export_empty_import " << *dir << endl;
+
+   return;  // hack fixme
+
+   // --- everything below is currently unreachable ---
+
+   // only imports qualify
+   if (!dir->is_import()) {
+     dout(7) << "not import (anymore?)" << endl;
+     return;
+   }
+
+   // never the root
+   if (dir->inode->is_root()) {
+     dout(7) << "root" << endl;
+     return;
+   }
+
+   // must hold no items
+   if (dir->get_size() > 0) {
+     dout(7) << "not actually empty" << endl;
+     return;
+   }
+
+   // an incomplete dir may only look empty: fetch it, then come back here
+   if (!dir->is_complete()) {
+     dout(7) << "not complete, fetching." << endl;
+     mds->mdstore->fetch_dir(dir,
+                             new C_MDC_EmptyImport(this, dir));
+     return;
+   }
+
+   // send it to whoever is authoritative for the inode
+   int target = dir->inode->authority();
+
+   // comment this out to wreak havoc?
+   //if (mds->is_shutting_down()) target = 0;  // this is more efficient.
+
+   dout(7) << "really empty, exporting to " << target << endl;
+   assert (target != mds->get_nodeid());
+
+   dout(-7) << "exporting to mds" << target
+            << " empty import " << *dir << endl;
+   export_dir( dir, target );
+ }
+
+
+// ==========================================================
+// IMPORT/EXPORT
+
+
+ /**
+  * C_MDC_ExportFreeze
+  * Completion context that calls Migrator::export_dir_frozen() for the
+  * remembered directory and destination.  The completion code is ignored.
+  */
+ class C_MDC_ExportFreeze : public Context {
+   Migrator *migrator;
+   CDir *exporting;   // dir being exported
+   int target;        // destination passed on to export_dir_frozen()
+
+ public:
+   C_MDC_ExportFreeze(Migrator *m, CDir *e, int d)
+     : migrator(m),
+       exporting(e),
+       target(d)
+   {}
+
+   virtual void finish(int r) {
+     migrator->export_dir_frozen(exporting, target);
+   }
+ };
+
+
+
+ /** export_dir(dir, dest)
+  * Public entry point that starts exporting the subtree rooted at dir
+  * to the MDS numbered dest.
+  *
+  * Returns without doing anything if the dir is frozen or freezing, is
+  * hashed, or its path cannot be pinned.  Exporting the root, exporting to
+  * ourselves, and exporting a hashed dir are asserted against.
+  */
+ void Migrator::export_dir(CDir *dir,
+                           int dest)
+ {
+   dout(7) << "export_dir " << *dir << " to " << dest << endl;
+   assert(dest != mds->get_nodeid());
+   assert(!dir->is_hashed());
+
+   // root stays put
+   if (dir->inode->is_root()) {
+     dout(7) << "i won't export root" << endl;
+     assert(0);
+     return;
+   }
+
+   // an export already holding (or acquiring) a freeze has to finish first
+   const bool freeze_in_progress = dir->is_frozen() || dir->is_freezing();
+   if (freeze_in_progress) {
+     dout(7) << " can't export, freezing|frozen.  wait for other exports to finish first." << endl;
+     return;
+   }
+
+   // hashed dirs are not supported yet (only reached when asserts are compiled out)
+   if (dir->is_hashed()) {
+     dout(7) << "can't export hashed dir right now.  implement me carefully later." << endl;
+     return;
+   }
+
+   // pin the path leading to the dir's inode
+   vector<CDentry*> path_to_dir;
+   cache->make_trace(path_to_dir, dir->inode);
+   if (!cache->path_pin(path_to_dir, 0, 0)) {
+     dout(7) << "export_dir couldn't pin path, failing." << endl;
+     return;
+   }
+
+   // -- committed to trying from here on --
+
+   // ask the target to discover the dir inode; remember we await its ack
+   export_gather[dir].insert(dest);
+   mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR);
+
+   // hold an auth pin so the freeze cannot complete before the ack arrives
+   dir->auth_pin();
+
+   // give up the popularity we are about to send away.  FIXME: do this later?
+   mds->balancer->subtract_export(dir);
+
+   // start freezing the subtree; export_dir_frozen() runs once it is frozen
+   dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest));
+ }
+
+
+/*
+ * called on receipt of MExportDirDiscoverAck
+ * the importer now has the directory's _inode_ in memory, and pinned.
+ */
+void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ int from = m->get_source().num();
+ assert(export_gather[dir].count(from));
+ export_gather[dir].erase(from);
+
+ if (export_gather[dir].empty()) {
+ dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+ dir->auth_unpin(); // unpin to allow freeze to complete
+ } else {
+ dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl;
+ }
+
+ delete m; // done
+}
+
+
+ /*
+  * export_dir_frozen
+  * Called (via C_MDC_ExportFreeze, which export_dir() hands to
+  * dir->freeze_tree()) to build an MExportDirPrep message for dir and send
+  * it to dest.
+  *
+  * The message carries:
+  *   - a CDirDiscover for the base dir,
+  *   - the ino of every nested export found by cache->find_nested_exports(),
+  *   - for each nested export, the inodes (and any dirs we are auth for) on
+  *     the path from the export up to, but not including, the base dir.
+  */
+ void Migrator::export_dir_frozen(CDir *dir,
+ int dest)
+ {
+ // subtree is now frozen!
+ dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl;
+
+ show_imports();
+
+ MExportDirPrep *prep = new MExportDirPrep(dir->inode);
+
+ // include spanning tree for all nested exports.
+ // these need to be on the destination _before_ the final export so that
+ // dir_auth updates on any nested exports are properly absorbed.
+
+ // inos of dirs already walked, so shared path prefixes are added only once
+ set<inodeno_t> inodes_added;
+
+ // include base dir
+ prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) );
+
+ // also include traces to all nested exports.
+ set<CDir*> my_nested;
+ cache->find_nested_exports(dir, my_nested);
+ for (set<CDir*>::iterator it = my_nested.begin();
+ it != my_nested.end();
+ it++) {
+ CDir *exp = *it;
+
+ dout(7) << " including nested export " << *exp << " in prep" << endl;
+
+ prep->add_export( exp->ino() );
+
+ /* first assemble each trace, in trace order, and put in message */
+ // push_front while walking upward leaves the list ordered top-down
+ list<CInode*> inode_trace;
+
+ // trace to dir
+ CDir *cur = exp;
+ while (cur != dir) {
+ // don't repeat ourselves
+ if (inodes_added.count(cur->ino())) break; // did already!
+ inodes_added.insert(cur->ino());
+
+ CDir *parent_dir = cur->get_parent_dir();
+
+ // inode?
+ assert(cur->inode->is_auth());
+ inode_trace.push_front(cur->inode);
+ dout(7) << " will add " << *cur->inode << endl;
+
+ // include dir? note: this'll include everything except the nested exports themselves,
+ // since someone else is obviously auth.
+ if (cur->is_auth()) {
+ prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) ); // yay!
+ dout(7) << " added " << *cur << endl;
+ }
+
+ cur = parent_dir;
+ }
+
+ // add the collected inodes; each is keyed by its containing dir ino and dentry name
+ // (this inner `it` is a separate iterator that hides the outer loop's `it`)
+ for (list<CInode*>::iterator it = inode_trace.begin();
+ it != inode_trace.end();
+ it++) {
+ CInode *in = *it;
+ dout(7) << " added " << *in << endl;
+ prep->add_inode( in->parent->dir->ino(),
+ in->parent->name,
+ in->replicate_to(dest) );
+ }
+
+ }
+
+ // send it!
+ mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR);
+ }
+
+ /*
+  * handle_export_dir_prep_ack
+  * Runs when an MExportDirPrepAck arrives: looks up the directory named in
+  * the message and starts the actual export to the MDS that sent the ack.
+  */
+ void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m)
+ {
+   // find the dir the ack refers to
+   CInode *diri = cache->get_inode(m->get_ino());
+   assert(diri);
+   CDir *dir = diri->dir;
+   assert(dir);
+
+   dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl;
+
+   // the ack's sender is the export destination
+   int importer = m->get_source().num();
+   export_dir_go(dir, importer);
+
+   // message fully consumed
+   delete m;
+ }
+
+
+ /*
+  * export_dir_go
+  * Performs the export of dir to dest (called from handle_export_dir_prep_ack).
+  * In order, it:
+  *   1. creates the MExportDir message,
+  *   2. updates cache->imports / exports / nested_exports and the dir's
+  *      import/export state bits and pins,
+  *   3. records the new dir_auth locally,
+  *   4. builds the set of MDSs whose notify ack is expected and warns every
+  *      one of them except dest,
+  *   5. encodes the subtree into the message via export_dir_walk(),
+  *   6. sends the message and queues the collected waiters on CDIR_WAIT_UNFREEZE.
+  */
+ void Migrator::export_dir_go(CDir *dir,
+ int dest)
+ {
+ dout(7) << "export_dir_go " << *dir << " to " << dest << endl;
+
+ show_imports();
+
+
+ // build export message
+ MExportDir *req = new MExportDir(dir->inode); // include pop
+
+
+ // update imports/exports
+ CDir *containing_import = cache->get_auth_container(dir);
+
+ if (containing_import == dir) {
+ // case 1: dir is itself an import; it stops being one
+ dout(7) << " i'm rexporting a previous import" << endl;
+ assert(dir->is_import());
+ cache->imports.erase(dir);
+ dir->state_clear(CDIR_STATE_IMPORT);
+ dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import
+
+ // discard nested exports (that we're handing off
+ // (iterator is advanced before the erase so it stays valid)
+ for (set<CDir*>::iterator p = cache->nested_exports[dir].begin();
+ p != cache->nested_exports[dir].end(); ) {
+ CDir *nested = *p;
+ p++;
+
+ // add to export message
+ req->add_export(nested);
+
+ // nested beneath our new export *in; remove!
+ dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl;
+ assert(cache->exports.count(nested) == 1);
+ cache->nested_exports[dir].erase(nested);
+ }
+
+ } else {
+ // case 2: dir lies inside a larger import; it becomes a new export nested under it
+ dout(7) << " i'm a subdir nested under import " << *containing_import << endl;
+ cache->exports.insert(dir);
+ cache->nested_exports[containing_import].insert(dir);
+
+ dir->state_set(CDIR_STATE_EXPORT);
+ dir->get(CDIR_PIN_EXPORT); // i must keep it pinned
+
+ // discard nested exports (that we're handing off)
+ for (set<CDir*>::iterator p = cache->nested_exports[containing_import].begin();
+ p != cache->nested_exports[containing_import].end(); ) {
+ CDir *nested = *p;
+ p++;
+ if (nested == dir) continue; // ignore myself
+
+ // container of parent; otherwise we get ourselves.
+ // walk upward until a dir flagged as an export is found (or we run out of parents)
+ CDir *containing_export = nested->get_parent_dir();
+ while (containing_export && !containing_export->is_export())
+ containing_export = containing_export->get_parent_dir();
+ if (!containing_export) continue;
+
+ if (containing_export == dir) {
+ // nested beneath our new export *in; remove!
+ dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl;
+ cache->nested_exports[containing_import].erase(nested);
+ // exports.erase(nested); _walk does this
+
+ // add to msg
+ req->add_export(nested);
+ } else {
+ dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl;
+ assert(cache->get_auth_container(containing_export) != containing_import);
+ }
+ }
+ }
+
+ // note new authority (locally)
+ // if dest is also the inode's authority, the dir just follows its parent
+ if (dir->inode->authority() == dest)
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ dir->set_dir_auth( dest );
+
+ // make list of nodes i expect an export_dir_notify_ack from
+ // (everyone w/ this dir open, but me!)
+ assert(export_notify_ack_waiting[dir].empty());
+ for (set<int>::iterator it = dir->open_by.begin();
+ it != dir->open_by.end();
+ it++) {
+ if (*it == mds->get_nodeid()) continue;
+ export_notify_ack_waiting[dir].insert( *it );
+
+ // send warning to all but dest
+ if (*it != dest) {
+ dout(10) << " sending export_dir_warning to mds" << *it << endl;
+ mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR);
+ }
+ }
+ // dest must be among the nodes that have the dir open
+ assert(export_notify_ack_waiting[dir].count( dest ));
+
+ // fill export message with cache data
+ // fin accumulates the waiters that export_dir_walk() takes from dirs and inodes
+ C_Contexts *fin = new C_Contexts;
+ int num_exported_inodes = export_dir_walk( req,
+ fin,
+ dir, // base
+ dir, // recur start point
+ dest );
+
+ // send the export data!
+ mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR);
+
+ // queue up the finisher
+ dir->add_waiter( CDIR_WAIT_UNFREEZE, fin );
+
+
+ // stats
+ if (mds->logger) mds->logger->inc("ex");
+ if (mds->logger) mds->logger->inc("iex", num_exported_inodes);
+
+ show_imports();
+ }
+
+
+ /** encode_export_inode
+  * update our local state for this inode to export.
+  * encode relevant state to be sent over the wire.
+  * used by: export_dir_walk, file_rename (if foreign)
+  *
+  *  in        - inode being exported; must currently be auth (asserted below)
+  *  enc_state - bufferlist the CInodeExport encoding is written into
+  *  new_auth  - not referenced anywhere in this function body
+  *
+  * Note the ordering: the inode is encoded *before* its dirty flag,
+  * cached_by set, lock states and auth bit are changed locally.
+  */
+ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth)
+ {
+ in->inode.version++; // so local log entries are ignored, etc. (FIXME ??)
+
+ // tell (all) clients about migrating caps.. mark STALE
+ for (map<int, Capability>::iterator it = in->client_caps.begin();
+ it != in->client_caps.end();
+ it++) {
+ dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl;
+ MClientFileCaps *m = new MClientFileCaps(in->inode,
+ it->second.get_last_seq(),
+ it->second.pending(),
+ it->second.wanted(),
+ MClientFileCaps::FILECAP_STALE);
+ mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+ 0, MDS_PORT_CACHE);
+ }
+
+ // relax locks?
+ if (!in->is_cached_by_anyone())
+ in->replicate_relax_locks();
+
+ // add inode
+ assert(in->cached_by.count(mds->get_nodeid()) == 0);
+ CInodeExport istate( in );
+ istate._encode( enc_state );
+
+ // we're export this inode; fix inode state
+ dout(7) << "encode_export_inode " << *in << endl;
+
+ if (in->is_dirty()) in->mark_clean();
+
+ // clear/unpin cached_by (we're no longer the authority)
+ in->cached_by_clear();
+
+ // twiddle lock states for auth -> replica transition
+ // hard
+ in->hardlock.clear_gather();
+ if (in->hardlock.get_state() == LOCK_GLOCKR)
+ in->hardlock.set_state(LOCK_LOCK);
+
+ // file : we lost all our caps, so move to stable state!
+ // the checks below run one after another, each re-reading the current state
+ in->filelock.clear_gather();
+ if (in->filelock.get_state() == LOCK_GLOCKR ||
+ in->filelock.get_state() == LOCK_GLOCKM ||
+ in->filelock.get_state() == LOCK_GLOCKL ||
+ in->filelock.get_state() == LOCK_GLONERR ||
+ in->filelock.get_state() == LOCK_GLONERM ||
+ in->filelock.get_state() == LOCK_LONER)
+ in->filelock.set_state(LOCK_LOCK);
+ if (in->filelock.get_state() == LOCK_GMIXEDR)
+ in->filelock.set_state(LOCK_MIXED);
+ // this looks like a step backwards, but it's what we want!
+ if (in->filelock.get_state() == LOCK_GSYNCM)
+ in->filelock.set_state(LOCK_MIXED);
+ if (in->filelock.get_state() == LOCK_GSYNCL)
+ in->filelock.set_state(LOCK_LOCK);
+ if (in->filelock.get_state() == LOCK_GMIXEDL)
+ in->filelock.set_state(LOCK_LOCK);
+ //in->filelock.set_state(LOCK_MIXED);
+
+ // mark auth
+ // from here on this node holds the inode as a non-auth copy
+ assert(in->is_auth());
+ in->set_auth(false);
+ in->replica_nonce = CINODE_EXPORT_NONCE;
+
+ // *** other state too?
+
+ // move to end of LRU so we drop out of cache quickly!
+ cache->lru.lru_bottouch(in);
+ }
+
+
+ /*
+  * export_dir_walk
+  * Encodes one directory into the export message and flips its local state
+  * from auth to proxy, then recurses into the subdirectories we are auth for.
+  *
+  *  req     - export message; one encoded buffer per visited dir is added to it
+  *  fin     - collects the waiters taken from each dir and inode
+  *  basedir - root of the export; proxy bookkeeping lists are keyed on it
+  *  dir     - directory to encode in this call
+  *  newauth - MDS receiving the export
+  *
+  * Returns the number of dentries visited in this dir and all recursed
+  * subdirs (null and remote dentries are counted too).
+  *
+  * Per-dentry encoding appended after the CDirExport header: the dentry
+  * name, then "D" (dirty) or "C" (clean), then one of
+  *   "N"                      null dentry
+  *   "L" + inodeno_t          remote link
+  *   "I" + exported inode     primary link (see encode_export_inode)
+  * For a hashed dir no dentries are encoded.
+  */
+ int Migrator::export_dir_walk(MExportDir *req,
+ C_Contexts *fin,
+ CDir *basedir,
+ CDir *dir,
+ int newauth)
+ {
+ int num_exported = 0;
+
+ dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl;
+
+ // dir
+ bufferlist enc_dir;
+
+ // the dir header is encoded before any of the state changes below
+ CDirExport dstate(dir);
+ dstate._encode( enc_dir );
+
+ // release open_by
+ dir->open_by_clear();
+
+ // mark
+ assert(dir->is_auth());
+ dir->state_clear(CDIR_STATE_AUTH);
+ dir->replica_nonce = CDIR_NONCE_EXPORT;
+
+ // proxy
+ // remembered per basedir so handle_export_dir_notify_ack can unpin later
+ dir->state_set(CDIR_STATE_PROXY);
+ dir->get(CDIR_PIN_PROXY);
+ export_proxy_dirinos[basedir].push_back(dir->ino());
+
+ // subdirs we are auth for; recursed into after this dir's buffer is added to req
+ list<CDir*> subdirs;
+
+ if (dir->is_hashed()) {
+ // fix state
+ dir->state_clear( CDIR_STATE_AUTH );
+
+ } else {
+
+ if (dir->is_dirty())
+ dir->mark_clean();
+
+ // discard most dir state
+ dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things.
+
+ // suck up all waiters
+ list<Context*> waiting;
+ dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
+ fin->take(waiting);
+
+ // inodes
+
+ CDir_map_t::iterator it;
+ for (it = dir->begin(); it != dir->end(); it++) {
+ CDentry *dn = it->second;
+ CInode *in = dn->inode;
+
+ // counted before the null/remote checks, so every dentry contributes
+ num_exported++;
+
+ // -- dentry
+ dout(7) << "export_dir_walk exporting " << *dn << endl;
+ _encode(it->first, enc_dir);
+
+ if (dn->is_dirty())
+ enc_dir.append("D", 1); // dirty
+ else
+ enc_dir.append("C", 1); // clean
+
+ // null dentry?
+ if (dn->is_null()) {
+ enc_dir.append("N", 1); // null dentry
+ assert(dn->is_sync());
+ continue;
+ }
+
+ if (dn->is_remote()) {
+ // remote link
+ enc_dir.append("L", 1); // remote link
+
+ // the ino is appended as raw bytes
+ inodeno_t ino = dn->get_remote_ino();
+ enc_dir.append((char*)&ino, sizeof(ino));
+ continue;
+ }
+
+ // primary link
+ // -- inode
+ enc_dir.append("I", 1); // inode dentry
+
+ encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export
+
+ // directory?
+ if (in->is_dir() && in->dir) {
+ if (in->dir->is_auth()) {
+ // nested subdir
+ assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+ subdirs.push_back(in->dir); // it's ours, recurse (later)
+
+ } else {
+ // nested export
+ assert(in->dir->get_dir_auth() >= 0);
+ dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl;
+ assert(cache->exports.count(in->dir) == 1);
+ cache->exports.erase(in->dir); // discard nested export (nested_exports updated above)
+
+ in->dir->state_clear(CDIR_STATE_EXPORT);
+ in->dir->put(CDIR_PIN_EXPORT);
+
+ // simplify dir_auth?
+ if (in->dir->get_dir_auth() == newauth)
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ }
+
+ // add to proxy
+ export_proxy_inos[basedir].push_back(in->ino());
+ in->state_set(CINODE_STATE_PROXY);
+ in->get(CINODE_PIN_PROXY);
+
+ // waiters
+ list<Context*> waiters;
+ in->take_waiting(CINODE_WAIT_ANY, waiters);
+ fin->take(waiters);
+ }
+ }
+
+ // this dir's buffer goes into the message before those of its subdirs
+ req->add_dir( enc_dir );
+
+ // subdirs
+ for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); it++)
+ num_exported += export_dir_walk(req, fin, basedir, *it, newauth);
+
+ return num_exported;
+ }
+
+
+/*
+ * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack)
+ */
+void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m)
+{
+ CInode *diri = cache->get_inode(m->get_ino());
+ CDir *dir = diri->dir;
+ assert(dir);
+ assert(dir->is_frozen_tree_root()); // i'm exporting!
+
+ // remove from waiting list
+ int from = m->get_source().num();
+ assert(export_notify_ack_waiting[dir].count(from));
+ export_notify_ack_waiting[dir].erase(from);
+
+ // done?
+ if (!export_notify_ack_waiting[dir].empty()) {
+ dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
+ << ", still waiting for " << export_notify_ack_waiting[dir] << endl;
+
+ } else {
+ dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from
+ << ", last one!" << endl;
+
+ // ok, we're finished!
+ export_notify_ack_waiting.erase(dir);
+
+ // finish export (unfreeze, trigger finish context, etc.)
+ export_dir_finish(dir);
+
+ // unpin proxies
+ // inodes
+ for (list<inodeno_t>::iterator it = export_proxy_inos[dir].begin();
+ it != export_proxy_inos[dir].end();
+ it++) {
+ CInode *in = cache->get_inode(*it);
+ in->put(CINODE_PIN_PROXY);
+ assert(in->state_test(CINODE_STATE_PROXY));
+ in->state_clear(CINODE_STATE_PROXY);
+ }
+ export_proxy_inos.erase(dir);
+
+ // dirs
+ for (list<inodeno_t>::iterator it = export_proxy_dirinos[dir].begin();
+ it != export_proxy_dirinos[dir].end();
+ it++) {
+ CDir *dir = cache->get_inode(*it)->dir;
+ dir->put(CDIR_PIN_PROXY);
+ assert(dir->state_test(CDIR_STATE_PROXY));
+ dir->state_clear(CDIR_STATE_PROXY);
+
+ // hose neg dentries, too, since we're no longer auth
+ CDir_map_t::iterator it;
+ for (it = dir->begin(); it != dir->end(); ) {
+ CDentry *dn = it->second;
+ it++;
+ if (dn->is_null()) {
+ assert(dn->is_sync());
+ dir->remove_dentry(dn);
+ } else {
+ //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl;
+ if (dn->is_dirty())
+ dn->mark_clean();
+ }
+ }
+ }
+ export_proxy_dirinos.erase(dir);
+
+ }
+
+ delete m;
+}
+
+
+/*
+ * once i get all teh notify_acks i can finish
+ */
+void Migrator::export_dir_finish(CDir *dir)
+{
+ // exported!
+
+
+ // FIXME log it
+
+ // send finish to new auth
+ mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+
+ // unfreeze
+ dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl;
+ dir->unfreeze_tree();
+
+ // unpin path
+ dout(7) << "export_dir_finish unpinning path" << endl;
+ vector<CDentry*> trace;
+ cache->make_trace(trace, dir->inode);
+ cache->path_unpin(trace, 0);
+
+
+ // stats
+ if (mds->logger) mds->logger->set("nex", cache->exports.size());
+
+ show_imports();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+// IMPORTS
+
+ /**
+  * C_MDC_ExportDirDiscover
+  * Completion context for the path traversal started by
+  * Migrator::handle_export_dir_discover().  The public `trace` vector is
+  * handed to the traversal; on completion the inode of its last dentry
+  * (or null when r < 0) is passed on to handle_export_dir_discover_2().
+  */
+ class C_MDC_ExportDirDiscover : public Context {
+   Migrator *migrator;
+   MExportDirDiscover *msg;
+ public:
+   vector<CDentry*> trace;   // filled in by the traversal
+
+   C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_)
+     : migrator(mig_),
+       msg(m_)
+   {}
+
+   void finish(int r) {
+     CInode *found = 0;
+     if (r >= 0)
+       found = trace.back()->get_inode();   // last dentry on the path
+     migrator->handle_export_dir_discover_2(msg, found, r);
+   }
+ };
+
+ /*
+  * handle_export_dir_discover
+  * Runs when an MExportDirDiscover arrives: starts a discovering path
+  * traversal for the path named in the message.  The result is delivered to
+  * handle_export_dir_discover_2() through a C_MDC_ExportDirDiscover context.
+  */
+ void Migrator::handle_export_dir_discover(MExportDirDiscover *m)
+ {
+   // we never get this from ourselves
+   assert(m->get_source().num() != mds->get_nodeid());
+
+   dout(7) << "handle_export_dir_discover on " << m->get_path() << endl;
+
+   // the context owns the trace vector that the traversal fills in
+   C_MDC_ExportDirDiscover *fin = new C_MDC_ExportDirDiscover(this, m);
+   filepath target(m->get_path());
+
+   cache->path_traverse(target,
+                        fin->trace,
+                        true,
+                        m,
+                        new C_MDS_RetryMessage(mds,m),  // on delay/retry
+                        MDS_TRAVERSE_DISCOVER,
+                        fin);                           // on completion|error
+ }
+
+ /*
+  * handle_export_dir_discover_2
+  * Second half of discover handling, called from C_MDC_ExportDirDiscover
+  * with the outcome of the path traversal.
+  *
+  *  m  - the original discover request; deleted here unless it is queued
+  *       for retry
+  *  in - the discovered inode, or null if the traversal failed
+  *  r  - traversal result; negative means failure
+  *
+  * On success the inode gets an IMPORTING pin and an auth pin, and an
+  * MExportDirDiscoverAck is sent back to the requester.  If the inode is
+  * frozen, the message is queued to be retried once it becomes auth-pinnable.
+  * A failed discover or a non-directory inode is treated as a bug
+  * (assert(0)); with asserts compiled out a negative ack is sent instead.
+  */
+ void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r)
+ {
+   // yay!
+   if (in) {
+     dout(7) << "handle_export_dir_discover_2 has " << *in << endl;
+   }
+
+   // `in` may be null, so check it explicitly before asking whether it is a dir
+   if (r < 0 || !in || !in->is_dir()) {
+     dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+
+     assert(0);    // this shouldn't happen if the auth pins his path properly!!!!
+
+     mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false),
+                           m->get_source().num(), MDS_PORT_MIGRATOR);
+     delete m;
+     return;
+   }
+
+   assert(in->is_dir());
+
+   // cannot pin a frozen inode; try again when it is auth-pinnable
+   if (in->is_frozen()) {
+     dout(7) << "frozen, waiting." << endl;
+     in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                    new C_MDS_RetryMessage(mds,m));
+     return;
+   }
+
+   // pin inode in the cache (for now)
+   in->get(CINODE_PIN_IMPORTING);
+
+   // pin auth too, until the import completes.
+   in->auth_pin();
+
+   // reply
+   dout(7) << " sending export_dir_discover_ack on " << *in << endl;
+   mds->send_message_mds(new MExportDirDiscoverAck(in->ino()),
+                         m->get_source().num(), MDS_PORT_MIGRATOR);
+   delete m;
+ }
+
+
+
+ /*
+  * handle_export_dir_prep
+  * Runs when an MExportDirPrep arrives: prepares this node to receive the export.
+  *
+  * On the first pass (m->did_assim() false) it opens/updates the base dir,
+  * moves the IMPORTING pin and the auth pin from the dir inode to the dir,
+  * assimilates the inode/dir traces carried in the message, and starts
+  * opening any nested export dirs that are not open yet.
+  *
+  * On every pass it then checks all nested exports: those with an open dir
+  * are pinned (IMPORTINGEXPORT); the others are counted as still outstanding.
+  * Only when none are outstanding is the MExportDirPrepAck sent and the
+  * message deleted.  Otherwise the message stays alive; it was handed to a
+  * C_MDS_RetryMessage when the dir open was started (presumably so this
+  * handler runs again once the open completes - confirm).
+  */
+ void Migrator::handle_export_dir_prep(MExportDirPrep *m)
+ {
+ assert(m->get_source().num() != mds->get_nodeid());
+
+ CInode *diri = cache->get_inode(m->get_ino());
+ assert(diri);
+
+ // waiters collected along the way; they are completed at the very end
+ list<Context*> finished;
+
+ // assimilate root dir.
+ CDir *dir = diri->dir;
+ if (dir) {
+ dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl;
+
+ if (!m->did_assim())
+ m->get_dir(diri->ino())->update_dir(dir);
+ } else {
+ // a missing dir can only happen on the first pass
+ assert(!m->did_assim());
+
+ // open dir i'm importing.
+ diri->set_dir( new CDir(diri, mds, false) );
+ dir = diri->dir;
+ m->get_dir(diri->ino())->update_dir(dir);
+
+ dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl;
+
+ diri->take_waiting(CINODE_WAIT_DIR, finished);
+ }
+ assert(dir->is_auth() == false);
+
+ show_imports();
+
+ // assimilate contents?
+ if (!m->did_assim()) {
+ dout(7) << "doing assim on " << *dir << endl;
+ m->mark_assim(); // only do this the first time!
+
+ // move pin to dir
+ diri->put(CINODE_PIN_IMPORTING);
+ dir->get(CDIR_PIN_IMPORTING);
+
+ // auth pin too
+ dir->auth_pin();
+ diri->auth_unpin();
+
+ // assimilate traces to exports
+ for (list<CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ // inode
+ CInode *in = cache->get_inode( (*it)->get_ino() );
+ if (in) {
+ (*it)->update_inode(in);
+ dout(7) << " updated " << *in << endl;
+ } else {
+ // not cached yet: create it and link it under its containing dir
+ in = new CInode(mds->mdcache, false);
+ (*it)->update_inode(in);
+
+ // link to the containing dir
+ CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) );
+ assert(condiri && condiri->dir);
+ cache->add_inode( in );
+ condiri->dir->add_dentry( m->get_dentry(in->ino()), in );
+
+ dout(7) << " added " << *in << endl;
+ }
+
+ assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) );
+
+ // dir
+ if (m->have_dir(in->ino())) {
+ if (in->dir) {
+ m->get_dir(in->ino())->update_dir(in->dir);
+ dout(7) << " updated " << *in->dir << endl;
+ } else {
+ in->set_dir( new CDir(in, mds, false) );
+ m->get_dir(in->ino())->update_dir(in->dir);
+ dout(7) << " added " << *in->dir << endl;
+ in->take_waiting(CINODE_WAIT_DIR, finished);
+ }
+ }
+ }
+
+ // open export dirs?
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ dout(7) << " checking dir " << hex << *it << dec << endl;
+ CInode *in = cache->get_inode(*it);
+ assert(in);
+
+ if (!in->dir) {
+ dout(7) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+
+ // pin it!
+ in->get(CINODE_PIN_OPENINGDIR);
+ in->state_set(CINODE_STATE_OPENINGDIR);
+ }
+ }
+ } else {
+ dout(7) << " not doing assim on " << *dir << endl;
+ }
+
+
+ // verify we have all exports
+ int waiting_for = 0;
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ inodeno_t ino = *it;
+ CInode *in = cache->get_inode(ino);
+ if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl;
+ assert(in);
+ if (in->dir) {
+ if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+ dout(7) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+
+ // the dir is open now, so the temporary OPENINGDIR pin can go
+ if (in->state_test(CINODE_STATE_OPENINGDIR)) {
+ in->put(CINODE_PIN_OPENINGDIR);
+ in->state_clear(CINODE_STATE_OPENINGDIR);
+ }
+ } else {
+ dout(7) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(7) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+ if (waiting_for) {
+ // m is intentionally not deleted on this path
+ dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl;
+ } else {
+ // ok!
+ dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl;
+ mds->send_message_mds(new MExportDirPrepAck(dir->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ // done
+ delete m;
+ }
+
+ // finish waiters
+ finish_contexts(finished, 0);
+ }
+
+
+
+
+/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish.
+ * if it's the last one on the dir, it reprocesses the import.
+ */
+/*
+class C_MDS_ImportPrediscover : public Context {
+public:
+ MDS *mds;
+ MExportDir *m;
+ inodeno_t dir_ino;
+ string dentry;
+ C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) {
+ this->mds = mds;
+ this->m = m;
+ this->dir_ino = dir_ino;
+ this->dentry = dentry;
+ }
+ virtual void finish(int r) {
+ assert(r == 0); // should never fail!
+
+ m->remove_prediscover(dir_ino, dentry);
+
+ if (!m->any_prediscovers())
+ mds->mdcache->handle_export_dir(m);
+ }
+};
+*/
+
+
+
+// Handle an MExportDir message: import the directory named by m->get_ino()
+// from the sending mds (oldauth).  Updates dir_auth and the cache's
+// imports/exports/nested_exports bookkeeping, decodes the shipped directory
+// contents into the local cache, acks the old auth and notifies the other
+// nodes in dir->open_by.  Deletes m.
+void Migrator::handle_export_dir(MExportDir *m)
+{
+ CInode *diri = cache->get_inode(m->get_ino());
+ assert(diri);
+ CDir *dir = diri->dir;
+ assert(dir);
+
+ int oldauth = m->get_source().num();
+ dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl;
+ assert(dir->is_auth() == false);
+
+
+
+ show_imports();
+
+ // note new authority (locally)
+ // if we are also auth for the inode, the dir simply follows its parent;
+ // otherwise dir_auth names us explicitly.
+ if (dir->inode->is_auth())
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ dir->set_dir_auth( mds->get_nodeid() );
+ dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl;
+
+ // update imports/exports
+ // containing_import is the import under which nested exports get filed below.
+ CDir *containing_import;
+ if (cache->exports.count(dir)) {
+ // reimporting
+ dout(7) << " i'm reimporting " << *dir << endl;
+ cache->exports.erase(dir);
+
+ dir->state_clear(CDIR_STATE_EXPORT);
+ dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export
+
+ containing_import = cache->get_auth_container(dir);
+ dout(7) << " it is nested under import " << *containing_import << endl;
+ cache->nested_exports[containing_import].erase(dir);
+ } else {
+ // new import
+ cache->imports.insert(dir);
+ dir->state_set(CDIR_STATE_IMPORT);
+ dir->get(CDIR_PIN_IMPORT); // must keep it pinned
+
+ containing_import = dir; // imported exports nested under *in
+
+ dout(7) << " new import at " << *dir << endl;
+ }
+
+
+ // take out my temp pin
+ dir->put(CDIR_PIN_IMPORTING);
+
+ // add any inherited exports
+ // (the message lists, by inode number, the exports nested in the imported subtree)
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ CInode *exi = cache->get_inode(*it);
+ assert(exi && exi->dir);
+ CDir *ex = exi->dir;
+
+ dout(15) << " nested export " << *ex << endl;
+
+ // remove our pin
+ ex->put(CDIR_PIN_IMPORTINGEXPORT);
+ ex->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+
+ // add...
+ if (ex->is_import()) {
+ // the nested export is one of our own imports: it stops being a separate
+ // import and its nested exports are re-filed under containing_import.
+ dout(7) << " importing my import " << *ex << endl;
+ cache->imports.erase(ex);
+ ex->state_clear(CDIR_STATE_IMPORT);
+
+ if (mds->logger) mds->logger->inc("imex");
+
+ // move nested exports under containing_import
+ // NOTE: this inner 'it' shadows the outer loop iterator.
+ for (set<CDir*>::iterator it = cache->nested_exports[ex].begin();
+ it != cache->nested_exports[ex].end();
+ it++) {
+ dout(7) << " moving nested export " << **it << " under " << *containing_import << endl;
+ cache->nested_exports[containing_import].insert(*it);
+ }
+ cache->nested_exports.erase(ex); // de-list under old import
+
+ ex->set_dir_auth( CDIR_AUTH_PARENT );
+ ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import
+
+ } else {
+ dout(7) << " importing export " << *ex << endl;
+
+ // add it
+ ex->state_set(CDIR_STATE_EXPORT);
+ ex->get(CDIR_PIN_EXPORT); // all exports are pinned
+ cache->exports.insert(ex);
+ cache->nested_exports[containing_import].insert(ex);
+ if (mds->logger) mds->logger->inc("imex");
+ }
+
+ }
+
+
+ // add this crap to my cache
+ // the message carries get_ndirs() encoded dir blocks back to back in one
+ // buffer; 'off' is advanced by each import_dir_block() call.
+ list<inodeno_t> imported_subdirs;
+ bufferlist dir_state;
+ dir_state.claim( m->get_state() );
+ int off = 0;
+ int num_imported_inodes = 0;
+
+ for (int i = 0; i < m->get_ndirs(); i++) {
+ num_imported_inodes +=
+ import_dir_block(dir_state,
+ off,
+ oldauth,
+ dir, // import root
+ imported_subdirs);
+ }
+ dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl;
+ dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl;
+
+
+ // adjust popularity
+ mds->balancer->add_import(dir);
+
+ // send notify's etc.
+ dout(7) << "sending notifyack for " << *dir << " to old auth " << m->get_source().num() << endl;
+ mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ dout(7) << "sending notify to others" << endl;
+ for (set<int>::iterator it = dir->open_by.begin();
+ it != dir->open_by.end();
+ it++) {
+ assert( *it != mds->get_nodeid() );
+ if ( *it == m->get_source().num() ) continue; // not to old auth.
+
+ MExportDirNotify *notify = new MExportDirNotify(dir->ino(), m->get_source().num(), mds->get_nodeid());
+ notify->copy_exports(m->get_exports());
+
+ if (g_conf.mds_verify_export_dirauth)
+ notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG)
+
+ mds->send_message_mds(notify, *it, MDS_PORT_MIGRATOR);
+ }
+
+ // done
+ // m must not be touched past this point.
+ delete m;
+
+ show_imports();
+
+
+ // is it empty?
+ if (dir->get_size() == 0 &&
+ !dir->inode->is_auth()) {
+ // reexport!
+ export_empty_import(dir);
+ }
+
+
+ // some stats
+ if (mds->logger) {
+ mds->logger->inc("im");
+ mds->logger->inc("iim", num_imported_inodes);
+ mds->logger->set("nim", cache->imports.size());
+ }
+
+
+ // FIXME LOG IT
+
+ /*
+ stupid hashing crap, FIXME
+
+ // wait for replicas in hashed dirs?
+ if (import_hashed_replicate_waiting.count(m->get_ino())) {
+ // it'll happen later!, when i get my inodegetreplicaack's back
+ } else {
+ // finish now
+ //not anymoreimport_dir_finish(dir);
+ }
+ */
+
+}
+
+
+
+/*
+ * handle an MExportDirFinish for a dir we have imported:
+ * refresh the import/export counters, drop the auth pin, and wake
+ * anyone waiting for the imported data.  deletes m.
+ */
+void Migrator::handle_export_dir_finish(MExportDirFinish *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);   // fail loudly (like the other handlers) instead of dereferencing null
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_export_dir_finish on " << *dir << endl;
+  assert(dir->is_auth());
+
+  dout(5) << "done with import of " << *dir << endl;
+  show_imports();
+  if (mds->logger) {
+    mds->logger->set("nex", cache->exports.size());
+    mds->logger->set("nim", cache->imports.size());
+  }
+
+  // un auth pin (other exports can now proceed)
+  dir->auth_unpin();
+
+  // ok now finish contexts
+  dout(5) << "finishing any waiters on imported data" << endl;
+  dir->finish_waiting(CDIR_WAIT_IMPORTED);
+
+  delete m;
+}
+
+
+/*
+ * decode one exported inode from bl (starting at off, which is advanced),
+ * link it under dn, and fix up replica, lock and client cap state now that
+ * we are taking it over from oldauth.
+ */
+void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth)
+{
+  CInodeExport istate;
+  off = istate._decode(bl, off);
+  dout(15) << "got a cinodeexport " << endl;
+
+  // reuse the cached inode if we have one, otherwise make a fresh one
+  CInode *in = cache->get_inode(istate.get_ino());
+  const bool is_new = !in;
+  if (is_new)
+    in = new CInode(mds->mdcache);
+  else
+    in->set_auth(true);
+
+  // the inode must be linked before its state is applied
+  if (dn->inode != in) {
+    assert(!dn->inode);
+    dn->dir->link_inode(dn, in);
+  }
+
+  // ...and only then take on the exported state
+  set<int> merged_client_caps;
+  istate.update_inode(in, merged_client_caps);
+
+  if (is_new) {
+    cache->add_inode(in);
+    dout(10) << "added " << *in << endl;
+  } else {
+    dout(10) << " had " << *in << endl;
+  }
+
+  // replica set: the old auth becomes a replica holder, and we stop being one
+  assert(!in->is_cached_by(oldauth));
+  in->cached_by_add( oldauth, CINODE_EXPORT_NONCE );
+  if (in->is_cached_by(mds->get_nodeid()))
+    in->cached_by_remove(mds->get_nodeid());
+
+  // hard lock: neither we nor the old auth belong in the gather set any more
+  if (in->hardlock.get_state() == LOCK_GLOCKR) {
+    in->hardlock.gather_set.erase(mds->get_nodeid());
+    in->hardlock.gather_set.erase(oldauth);
+    if (in->hardlock.gather_set.empty())
+      mds->locker->inode_hard_eval(in);
+  }
+
+  // tell each client whose caps were merged to reap them from the old auth
+  for (set<int>::iterator client = merged_client_caps.begin();
+       client != merged_client_caps.end();
+       ++client) {
+    MClientFileCaps *reap = new MClientFileCaps(in->inode,
+                                                in->client_caps[*client].get_last_seq(),
+                                                in->client_caps[*client].pending(),
+                                                in->client_caps[*client].wanted(),
+                                                MClientFileCaps::FILECAP_REAP);
+    reap->set_mds( oldauth );  // reap from whom?
+    mds->messenger->send_message(reap,
+                                 MSG_ADDR_CLIENT(*client), mds->clientmap.get_inst(*client),
+                                 0, MDS_PORT_CACHE);
+  }
+
+  // file lock: same gather set cleanup as for the hard lock
+  if (!in->filelock.is_stable()) {
+    in->filelock.gather_set.erase(mds->get_nodeid());
+    in->filelock.gather_set.erase(oldauth);
+    if (in->filelock.gather_set.empty())  // necessary but not sufficient...
+      mds->locker->inode_file_eval(in);
+  }
+
+  // a dirty import gets journaled
+  if (in->is_dirty()) {
+    dout(10) << "logging dirty import " << *in << endl;
+    mds->mdlog->submit_entry(new EInodeUpdate(in));
+  }
+}
+
+
+/*
+ * decode one exported directory (header plus dentries) from bl at off,
+ * advancing off past it.  any dir other than import_root is recorded in
+ * imported_subdirs.  returns the number of dentries imported (0 for a
+ * hashed dir, whose contents are not processed here).
+ */
+int Migrator::import_dir_block(bufferlist& bl,
+                               int& off,
+                               int oldauth,
+                               CDir *import_root,
+                               list<inodeno_t>& imported_subdirs)
+{
+  // dir header
+  CDirExport dstate;
+  off = dstate._decode(bl, off);
+
+  CInode *diri = cache->get_inode(dstate.get_ino());
+  assert(diri);
+  CDir *dir = diri->get_or_open_dir(mds);
+  assert(dir);
+
+  dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl;
+
+  // remember every subdir we pull in
+  if (dir != import_root)
+    imported_subdirs.push_back(dir->ino());
+
+  // take on the exported dir state
+  dstate.update_dir( dir );
+  if (diri->is_auth())
+    dir->set_dir_auth( CDIR_AUTH_PARENT );  // update_dir may hose dir_auth
+
+  // make sure it is flagged auth (get_or_open_dir() above may have done so already)
+  if (!dir->is_auth())
+    dir->state_set(CDIR_STATE_AUTH);
+
+  // open_by: the old auth now holds a replica, we no longer do
+  assert(!dir->is_open_by(oldauth));
+  dir->open_by_add(oldauth);
+  if (dir->is_open_by(mds->get_nodeid()))
+    dir->open_by_remove(mds->get_nodeid());
+
+  // nothing more to do for a hashed dir
+  if (dir->is_hashed())
+    return 0;
+
+  // take all waiters on this dir and park them on the import root.
+  // NOTE: a pass of imported data is guaranteed to get all of my waiters because
+  // a replica's presence in my cache implies/forces its presence in authority's.
+  list<Context*> parked;
+  dir->take_waiting(CDIR_WAIT_ANY, parked);
+  for (list<Context*>::iterator w = parked.begin(); w != parked.end(); ++w)
+    import_root->add_waiter(CDIR_WAIT_IMPORTED, *w);
+
+  dout(15) << "doing contents" << endl;
+
+  // contents
+  int num_imported = 0;
+  for (long remaining = dstate.get_nden(); remaining > 0; --remaining) {
+    ++num_imported;
+
+    // each entry is: name, dirty flag byte, type code byte, [payload]
+    string dname;
+    _decode(dname, bl, off);
+    dout(15) << "dname is " << dname << endl;
+
+    char dirty;
+    bl.copy(off, 1, &dirty);
+    off++;
+
+    char icode;
+    bl.copy(off, 1, &icode);
+    off++;
+
+    CDentry *dn = dir->lookup(dname);
+    if (!dn)
+      dn = dir->add_dentry(dname);  // null
+
+    switch (icode) {
+    case 'N':
+      // null dentry
+      assert(dn->is_null());
+      break;
+
+    case 'L':
+      {
+        // remote link: payload is the raw inode number
+        inodeno_t ino;
+        bl.copy(off, sizeof(ino), (char*)&ino);
+        off += sizeof(ino);
+        dir->link_inode(dn, ino);
+      }
+      break;
+
+    case 'I':
+      // primary link: payload is a full exported inode
+      decode_import_inode(dn, bl, off, oldauth);
+      break;
+
+    default:
+      break;
+    }
+
+    // the dentry may only be dirtied _after_ the inode is linked
+    if (dirty == 'D') dn->mark_dirty();
+  }
+
+  if (dir->is_dirty())
+    mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  return num_imported;
+}
+
+
+
+
+
+// authority bystander
+
+/*
+ * record an export warning for m->get_ino().  if the matching notify
+ * already arrived and was stashed, process it now.  deletes m.
+ */
+void Migrator::handle_export_dir_warning(MExportDirWarning *m)
+{
+  // add to warning list
+  stray_export_warnings.insert( m->get_ino() );
+
+  // did the notify beat the warning here?
+  map<inodeno_t, MExportDirNotify*>::iterator stashed = stray_export_notifies.find(m->get_ino());
+  if (stashed == stray_export_notifies.end()) {
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl;
+  } else {
+    // it did, we're good.
+    dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl;
+
+    // process the stashed notify, then forget it
+    handle_export_dir_notify(stashed->second);
+    stray_export_notifies.erase(stashed);
+  }
+
+  // done
+  delete m;
+}
+
+
+// Handle an MExportDirNotify as a bystander: a directory has moved from
+// m->get_old_auth() to m->get_new_auth().  If the matching warning has not
+// been seen yet, the message is stashed in stray_export_notifies (and NOT
+// deleted) until the warning arrives.  Otherwise local dir_auth values are
+// updated (if the dir is open here), an ack is sent to the old auth, and m
+// is deleted.
+void Migrator::handle_export_dir_notify(MExportDirNotify *m)
+{
+ CDir *dir = 0;
+ CInode *in = cache->get_inode(m->get_ino());
+ if (in) dir = in->dir;
+
+ // did i see the warning yet?
+ if (!stray_export_warnings.count(m->get_ino())) {
+ // wait for it.
+ dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl;
+ stray_export_notifies.insert(pair<inodeno_t, MExportDirNotify*>( m->get_ino(), m ));
+ return;
+ }
+
+ // i did, we're all good.
+ dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl;
+
+ // update dir_auth!
+ // (only if we actually have the dir open; the ack below is sent either way)
+ if (dir) {
+ dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl;
+
+ // update bounds first
+ // a bound whose dir_auth was implicit (PARENT) is made explicit when its
+ // authority differs from the new auth; an explicit one that matches the new
+ // auth is reset to PARENT.
+ for (list<inodeno_t>::iterator it = m->get_exports().begin();
+ it != m->get_exports().end();
+ it++) {
+ CInode *n = cache->get_inode(*it);
+ if (!n) continue;
+ CDir *ndir = n->dir;
+ if (!ndir) continue;
+
+ int boundauth = ndir->authority();
+ dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl;
+ if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) {
+ if (boundauth != m->get_new_auth())
+ ndir->set_dir_auth( boundauth );
+ else assert(dir->authority() == m->get_new_auth()); // apparently we already knew!
+ } else {
+ if (boundauth == m->get_new_auth())
+ ndir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ }
+
+ // update dir_auth
+ if (in->authority() == m->get_new_auth()) {
+ dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl;
+ dir->set_dir_auth( CDIR_AUTH_PARENT );
+ assert(!in->is_auth());
+ assert(!dir->is_auth());
+ } else {
+ dir->set_dir_auth( m->get_new_auth() );
+ }
+ assert(dir->authority() != mds->get_nodeid());
+ assert(!dir->is_auth());
+
+ // DEBUG: verify subdirs
+ // every listed subdir we have open must have an implicit (PARENT) dir_auth
+ // and resolve to the new auth.
+ if (g_conf.mds_verify_export_dirauth) {
+
+ dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl;
+ for (list<inodeno_t>::iterator it = m->subdirs_begin();
+ it != m->subdirs_end();
+ it++) {
+ CInode *diri = cache->get_inode(*it);
+ if (!diri) continue; // don't have it, don't care
+ if (!diri->dir) continue;
+ dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl;
+ assert(diri->dir != dir); // base shouldn't be in subdir list
+ if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) {
+ dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl;
+ assert(0); // bad news!
+ //dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+ assert(diri->dir->authority() == m->get_new_auth());
+ }
+ }
+ }
+
+ // send notify ack to old auth
+ dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl;
+ mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()),
+ m->get_old_auth(), MDS_PORT_MIGRATOR);
+
+
+ // done
+ // forget the warning so a later export of the same dir starts clean
+ stray_export_warnings.erase( m->get_ino() );
+ delete m;
+}
+
+
+
+
+
+// =======================================================================
+// HASHING
+
+
+/*
+ * decode nden dentries of a hashed dir from bl into dir.  each entry is a
+ * name, a one-byte type code ('N' null, 'L' remote link, 'I' inode) and
+ * the matching payload.  for imported inodes that have an open subdir, the
+ * subdir is turned back into a plain (non-import) dir or into an export.
+ * every decoded dentry is marked dirty.
+ */
+void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth)
+{
+  int off = 0;
+
+  for (int left = nden; left > 0; --left) {
+    // dentry
+    string dname;
+    _decode(dname, bl, off);
+    dout(15) << "dname is " << dname << endl;
+
+    char icode;
+    bl.copy(off, 1, &icode);
+    off++;
+
+    CDentry *dn = dir->lookup(dname);
+    if (!dn)
+      dn = dir->add_dentry(dname);  // null
+
+    switch (icode) {
+    case 'N':
+      // null dentry
+      assert(dn->is_null());
+      break;
+
+    case 'L':
+      {
+        // remote link
+        inodeno_t ino;
+        bl.copy(off, sizeof(ino), (char*)&ino);
+        off += sizeof(ino);
+        dir->link_inode(dn, ino);
+      }
+      break;
+
+    case 'I':
+      {
+        // inode
+        decode_import_inode(dn, bl, off, oldauth);
+
+        // fix up subdir export?
+        CDir *subdir = dn->inode->dir;
+        if (!subdir)
+          break;
+
+        assert(subdir->state_test(CDIR_STATE_IMPORTINGEXPORT));
+        subdir->put(CDIR_PIN_IMPORTINGEXPORT);
+        subdir->state_clear(CDIR_STATE_IMPORTINGEXPORT);
+
+        if (!subdir->is_auth()) {
+          // not mine. make it an export.
+          dout(7) << "making subdir into export " << *subdir << endl;
+          subdir->get(CDIR_PIN_EXPORT);
+          subdir->state_set(CDIR_STATE_EXPORT);
+          cache->exports.insert(subdir);
+          cache->nested_exports[dir].insert(subdir);
+
+          if (subdir->get_dir_auth() == CDIR_AUTH_PARENT)
+            subdir->set_dir_auth( oldauth );  // no longer matches inode
+          assert(subdir->get_dir_auth() >= 0);
+        } else {
+          // mine. must have been an import.
+          assert(subdir->is_import());
+          dout(7) << "unimporting subdir now that inode is mine " << *subdir << endl;
+          subdir->set_dir_auth( CDIR_AUTH_PARENT );
+          cache->imports.erase(subdir);
+          subdir->put(CDIR_PIN_IMPORT);
+          subdir->state_clear(CDIR_STATE_IMPORT);
+
+          // move nested under hashdir
+          for (set<CDir*>::iterator nested = cache->nested_exports[subdir].begin();
+               nested != cache->nested_exports[subdir].end();
+               ++nested)
+            cache->nested_exports[dir].insert(*nested);
+          cache->nested_exports.erase(subdir);
+
+          // now it matches the inode
+          subdir->set_dir_auth( CDIR_AUTH_PARENT );
+        }
+      }
+      break;
+
+    default:
+      break;
+    }
+
+    // dirty the dentry (only _after_ the inode is linked!)
+    dn->mark_dirty();
+  }
+}
+
+/*
+
+ notes on interaction of hashing and export/import:
+
+ - dir->is_auth() is completely independent of hashing. for a hashed dir,
+ - all nodes are partially authoritative
+ - all nodes dir->is_hashed() == true
+ - all nodes dir->inode->dir_is_hashed() == true
+ - one node dir->is_auth() == true, the rest == false
+ - dir_auth for all subdirs in a hashed dir will (likely?) be explicit.
+
+ - remember simple rule: dir auth follows inode, unless dir_auth is explicit.
+
+ - export_dir_walk and import_dir_block take care with dir_auth: (for import/export)
+ - on export, -1 is changed to mds->get_nodeid()
+ - on import, nothing special, actually.
+
+ - hashed dir files aren't included in export; subdirs are converted to imports
+ or exports as necessary.
+ - hashed dir subdirs are discovered on export. this is important
+ because dirs are needed to tie together auth hierarchy, for auth to know about
+ imports/exports, etc.
+
+ - dir state is maintained on auth.
+ - COMPLETE and HASHED are transferred to importers.
+ - DIRTY is set everywhere.
+
+ - hashed dir is like an import: hashed dir used for nested_exports map.
+ - nested_exports is updated appropriately on auth and replicas.
+ - a subtree terminates as a hashed dir, since the hashing explicitly
+ redelegates all inodes. thus export_dir_walk includes hashed dirs, but
+ not their inodes.
+*/
+
+// HASH on auth
+
+// completion context: once fired, continues hashing of 'dir' via
+// Migrator::hash_dir_frozen().  the result code is ignored.
+class C_MDC_HashFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashFreeze(Migrator *m, CDir *d) {
+    mig = m;
+    dir = d;
+  }
+  virtual void finish(int r) {
+    mig->hash_dir_frozen(dir);
+  }
+};
+
+// completion context: once fired, continues hashing of 'dir' via
+// Migrator::hash_dir_complete().  the result code is ignored.
+class C_MDC_HashComplete : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+  C_MDC_HashComplete(Migrator *mig, CDir *dir) : mig(mig), dir(dir) {}
+  virtual void finish(int r) {
+    mig->hash_dir_complete(dir);
+  }
+};
+
+
+/** hash_dir(dir)
+ * start hashing a directory we are auth for.
+ * gives up quietly if the dir is frozen/freezing or its path can't be pinned.
+ * otherwise: flag + pin the dir, send a discover to every other mds,
+ * start the freeze, and make sure the dir contents are complete.
+ */
+void Migrator::hash_dir(CDir *dir)
+{
+  dout(-7) << "hash_dir " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  const bool busy = dir->is_frozen() || dir->is_freezing();
+  if (busy) {
+    dout(7) << " can't hash, freezing|frozen." << endl;
+    return;
+  }
+
+  // pin path?
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  const bool pinned = cache->path_pin(path, 0, 0);
+  if (!pinned) {
+    dout(7) << "hash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // ok, go
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // discover on all mds (except me), remembering whose ack we await
+  assert(hash_gather.count(dir) == 0);
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); ++peer) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MHashDirDiscover(dir->inode), peer, MDS_PORT_MIGRATOR);
+    }
+  }
+  dir->auth_pin();  // pin until discovers are all acked.
+
+  // start freeze
+  dir->freeze_dir(new C_MDC_HashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    hash_dir_complete(dir);
+  } else {
+    dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_HashComplete(this, dir));
+  }
+}
+
+
+/*
+ * a peer has acked our discover for the hashing dir.
+ * once every peer has acked, drop the auth_pin taken in hash_dir()
+ * so that the freeze can complete.  deletes m.
+ */
+void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // cross this peer off the list
+  const int peer = m->get_source().num();
+  assert(hash_gather[dir].count(peer));
+  hash_gather[dir].erase(peer);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl;
+  } else {
+    hash_gather.erase(dir);
+    dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl;
+    dir->auth_unpin();  // unpin to allow freeze to complete
+  }
+
+  delete m;  // done
+}
+
+
+
+/*
+ * once the dir is completely in memory,
+ * mark all migrating inodes dirty (to pin in cache).
+ * if the dir is already frozen too, go ahead and hash it.
+ */
+void Migrator::hash_dir_complete(CDir *dir)
+{
+  dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->inode;
+    // a dentry with no linked inode (e.g. a null dentry, which hash_dir_go
+    // explicitly expects to find in this dir) has nothing to dirty.
+    if (!in) continue;
+    in->mark_dirty();
+  }
+
+  if (dir->is_frozen_dir())
+    hash_dir_go(dir);
+}
+
+
+/*
+ * once the dir is frozen,
+ * make sure it's complete (if not, bail out here),
+ * then send the prep messages: each peer is told about the open subdirs
+ * whose dentries hash to it.  if no peer needs a prep, hash right away.
+ */
+void Migrator::hash_dir_frozen(CDir *dir)
+{
+  dout(7) << "hash_dir_frozen " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (!dir->is_complete()) {
+    dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl;
+    return;
+  }
+
+  // send prep messages w/ export directories to open
+  // (one lazily-created message per destination mds, indexed by mds number)
+  vector<MHashDirPrep*> msgs(mds->get_mds_map()->get_num_mds());
+
+  // check for subdirs
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    // a dentry with no linked inode (e.g. a null dentry, which hash_dir_go
+    // explicitly expects to find in this dir) cannot be an open subdir.
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) continue;   // stays with me
+
+    // msg?
+    if (msgs[dentryhashcode] == 0) {
+      msgs[dentryhashcode] = new MHashDirPrep(dir->ino());
+    }
+    msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode));
+  }
+
+  // send them!
+  assert(hash_gather[dir].empty());
+  for (unsigned i=0; i<msgs.size(); i++) {
+    if (msgs[i]) {
+      mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+      hash_gather[dir].insert(i);   // expect a prep ack from this peer
+    }
+  }
+
+  if (hash_gather[dir].empty()) {
+    // no subdirs! continue!
+    hash_gather.erase(dir);
+    hash_dir_go(dir);
+  } else {
+    // wait!
+  }
+}
+
+/*
+ * a peer has acked our prep message.
+ * when the last outstanding ack comes in, do the hashing.  deletes m.
+ */
+void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // cross this peer off the list
+  const int peer = m->get_source().num();
+  assert(hash_gather[dir].count(peer) == 1);
+  hash_gather[dir].erase(peer);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    hash_gather.erase(dir);
+    dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl;
+    hash_dir_go(dir);
+  }
+
+  delete m;
+}
+
+
+/*
+ * the dir is frozen (and asserted so); do the hashing!
+ *
+ *  - build one MHashDir per peer and encode into it every dentry whose
+ *    hash lands on that peer (name, type code 'N'/'L'/'I', payload)
+ *  - exported inodes are kept locally as pinned proxies until
+ *    hash_dir_finish(); their open subdirs become imports (mine) or stop
+ *    being exports (not mine)
+ *  - mark the dir hashed, journal it, re-file nested exports
+ *  - send the messages and set up the ack / notify gather sets
+ */
+void Migrator::hash_dir_go(CDir *dir)
+{
+  dout(7) << "hash_dir_go " << *dir << endl;
+
+  assert(!dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  // get messages to other nodes ready
+  vector<MHashDir*> msgs(mds->get_mds_map()->get_num_mds());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    msgs[i] = new MHashDir(dir->ino());
+  }
+
+  // pick a hash seed.
+  dir->inode->inode.hash_seed = 1;//dir->ino();
+
+  // suck up all waiters
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting);    // all dir waiters
+  fin->take(waiting);
+
+  // get containing import.  might be me.
+  CDir *containing_import = cache->get_auth_container(dir);
+  assert(containing_import != dir || dir->is_import());
+
+  // divvy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode == mds->get_nodeid()) {
+      continue;      // still mine!
+    }
+
+    bufferlist *bl = msgs[dentryhashcode]->get_state_ptr();
+    assert(bl);
+
+    // -- dentry
+    dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, *bl);
+
+    // count EVERY encoded dentry, whatever its type: import_hashed_content()
+    // decodes exactly nden entries and treats 'N', 'L' and 'I' alike, so
+    // counting only inode dentries would leave trailing entries undecoded.
+    msgs[dentryhashcode]->inc_nden();
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl->append("N", 1);  // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl->append("L", 1);  // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl->append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl->append("I", 1);    // inode dentry
+
+    encode_export_inode(in, *bl, dentryhashcode);  // encode, and (update state for) export
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // add to proxy
+    hash_proxy_inos[dir].push_back(in);
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+
+    // fix up subdirs
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine.  make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+
+        // fix nested bits
+        // (iterator is advanced before the erase so it stays valid)
+        for (set<CDir*>::iterator ex_it = cache->nested_exports[containing_import].begin();
+             ex_it != cache->nested_exports[containing_import].end(); ) {
+          CDir *ex = *ex_it;
+          ex_it++;
+          if (cache->get_auth_container(ex) == in->dir) {
+            dout(10) << "moving nested export " << *ex << endl;
+            cache->nested_exports[containing_import].erase(ex);
+            cache->nested_exports[in->dir].insert(ex);
+          }
+        }
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[containing_import].erase(in->dir);
+        if (in->dir->authority() == dentryhashcode)
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+        else
+          in->dir->set_dir_auth( in->dir->authority() );
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // dir state
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+
+  // inode state
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // fix up nested_exports?
+  if (containing_import != dir) {
+    dout(7) << "moving nested exports under hashed dir" << endl;
+    for (set<CDir*>::iterator ex_it = cache->nested_exports[containing_import].begin();
+         ex_it != cache->nested_exports[containing_import].end(); ) {
+      CDir *ex = *ex_it;
+      ex_it++;
+      if (cache->get_auth_container(ex) == dir) {
+        dout(7) << " moving nested export under hashed dir: " << *ex << endl;
+        cache->nested_exports[containing_import].erase(ex);
+        cache->nested_exports[dir].insert(ex);
+      } else {
+        dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl;
+      }
+    }
+  }
+
+  // send hash messages
+  assert(hash_gather[dir].empty());
+  assert(hash_notify_gather[dir].empty());
+  assert(dir->hashed_subset.empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    // all nodes hashed locally..
+    dir->hashed_subset.insert(i);
+
+    if (i == mds->get_nodeid()) continue;
+
+    // init hash_gather and hash_notify_gather sets
+    hash_gather[dir].insert(i);
+
+    // check with count(), not operator[]: operator[] would insert an empty
+    // per-peer entry.  with only two nodes that entry would never receive a
+    // notify, so hash_notify_gather[dir] would never become empty and
+    // handle_hash_dir_ack() would wait on notifies forever.
+    assert(hash_notify_gather[dir].count(i) == 0);
+    for (int j=0; j<mds->get_mds_map()->get_num_mds(); j++) {
+      if (j == mds->get_nodeid()) continue;
+      if (j == i) continue;
+      hash_notify_gather[dir][i].insert(j);
+    }
+
+    mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR);
+  }
+
+  // wait for all the acks.
+}
+
+
+/*
+ * a peer has acked our MHashDir.  when the last ack is in and no notifies
+ * are outstanding either, finish the hash.  deletes m.
+ */
+void Migrator::handle_hash_dir_ack(MHashDirAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // cross this peer off the list
+  const int peer = m->get_source().num();
+  assert(hash_gather[dir].count(peer) == 1);
+  hash_gather[dir].erase(peer);
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+  } else {
+    dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl;
+
+    if (!hash_notify_gather[dir].empty()) {
+      dout(7) << "waiting on notifies " << endl;
+    } else {
+      dout(7) << "got notifies too, all done" << endl;
+      hash_dir_finish(dir);
+    }
+  }
+
+  delete m;
+}
+
+
+/*
+ * all acks and notifies are in: wrap up hashing of dir on the auth.
+ * clears the HASHING state and pin, releases the proxy inodes,
+ * unpins the path and unfreezes the dir.
+ */
+void Migrator::hash_dir_finish(CDir *dir)
+{
+  dout(7) << "hash_dir_finish finishing " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_hashing());
+
+  // dir state
+  hash_gather.erase(dir);
+  // handle_hash_dir_ack() probes hash_notify_gather[dir] with operator[],
+  // which (re)creates an empty entry; drop it so no stale CDir* key lingers.
+  hash_notify_gather.erase(dir);
+  dir->state_clear(CDIR_STATE_HASHING);
+  dir->put(CDIR_PIN_HASHING);
+  dir->hashed_subset.clear();
+
+  // unproxy inodes
+  //  this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds.
+  for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
+       it != hash_proxy_inos[dir].end();
+       it++) {
+    CInode *in = *it;
+    assert(in->state_test(CINODE_STATE_PROXY));
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+  hash_proxy_inos.erase(dir);
+
+  // unpin path
+  vector<CDentry*> trace;
+  cache->make_trace(trace, dir->inode);
+  cache->path_unpin(trace, 0);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  show_imports();
+  assert(hash_gather.count(dir) == 0);
+
+  // stats
+  //if (mds->logger) mds->logger->inc("nh", 1);
+
+}
+
+
+
+
+// HASH on auth and non-auth
+
+// Handle an MHashDirNotify for a dir that is currently being hashed.
+//  - on the auth: cross (from, source) off hash_notify_gather; when the last
+//    notify is in and no acks are outstanding, finish the hash.  deletes m.
+//  - on a non-auth: record 'from' in the dir's hashed_subset, fix dir_auth on
+//    open subdirs whose dentries hash to 'from', update our own gather set,
+//    then forward the SAME message object on to the auth (m is not deleted).
+void Migrator::handle_hash_dir_notify(MHashDirNotify *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+ assert(dir->is_hashing());
+
+ dout(5) << "handle_hash_dir_notify " << *dir << endl;
+ // 'from' is carried in the message body; 'source' is whoever sent us this
+ // message.  they differ on the auth, which receives notifies forwarded by
+ // the non-auth branch below.
+ int from = m->get_from();
+
+ int source = m->get_source().num();
+ if (dir->is_auth()) {
+ // gather notifies
+ assert(dir->is_hashed());
+
+ assert( hash_notify_gather[dir][from].count(source) );
+ hash_notify_gather[dir][from].erase(source);
+
+ if (hash_notify_gather[dir][from].empty()) {
+ dout(7) << "last notify from " << from << endl;
+ hash_notify_gather[dir].erase(from);
+
+ if (hash_notify_gather[dir].empty()) {
+ dout(7) << "last notify!" << endl;
+ hash_notify_gather.erase(dir);
+
+ // NOTE: operator[] creates an empty hash_gather entry if none exists;
+ // hash_dir_finish() erases it again.
+ if (hash_gather[dir].empty()) {
+ dout(7) << "got acks too, all done" << endl;
+ hash_dir_finish(dir);
+ } else {
+ dout(7) << "still waiting on acks from " << hash_gather[dir] << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl;
+ }
+ } else {
+ dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl;
+ }
+
+ // delete msg
+ delete m;
+ } else {
+ // update dir hashed_subset
+ assert(dir->hashed_subset.count(from) == 0);
+ dir->hashed_subset.insert(from);
+
+ // update open subdirs
+ for (CDir_map_t::iterator it = dir->begin();
+ it != dir->end();
+ it++) {
+ CDentry *dn = it->second;
+ // NOTE: this 'in' shadows the outer dir inode for the rest of the loop body.
+ CInode *in = dn->get_inode();
+ if (!in) continue;
+ if (!in->dir) continue;
+
+ int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+ if (dentryhashcode != from) continue; // we'll import these in a minute
+
+ // make dir_auth explicit unless the subdir's authority is the node the
+ // dentry hashes to, in which case it is set to PARENT.
+ if (in->dir->authority() != dentryhashcode)
+ in->dir->set_dir_auth( in->dir->authority() );
+ else
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ }
+
+ // remove from notify gather set
+ assert(hash_gather[dir].count(from));
+ hash_gather[dir].erase(from);
+
+ // last notify?
+ if (hash_gather[dir].empty()) {
+ dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl;
+ hash_gather.erase(dir);
+
+ dir->state_clear(CDIR_STATE_HASHING);
+ dir->put(CDIR_PIN_HASHING);
+ dir->hashed_subset.clear();
+ } else {
+ dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+ }
+
+ // fw notify to auth
+ mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR);
+ }
+}
+
+
+
+
+// HASH on non-auth
+
+/*
+ * discover step:
+ *  each peer needs to open up the directory and pin it before we start.
+ *
+ * this context carries the path trace that the traversal fills in; on
+ * completion it hands the inode at the end of the trace (or the root, for
+ * an empty trace) to handle_hash_dir_discover_2().  on error (r < 0) the
+ * inode passed along is null.
+ */
+class C_MDC_HashDirDiscover : public Context {
+  Migrator *mig;
+  MHashDirDiscover *m;
+public:
+  vector<CDentry*> trace;
+  C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) : mig(mig), m(m) {}
+  void finish(int r) {
+    CInode *target = 0;
+    if (r >= 0) {
+      if (trace.empty())
+        target = mig->cache->get_root();
+      else
+        target = trace.back()->get_inode();
+    }
+    mig->handle_hash_dir_discover_2(m, target, r);
+  }
+};
+
+/*
+ * another mds asks us to discover the dir it is about to hash:
+ * traverse (discovering as needed) to m->get_path(); the completion
+ * context continues in handle_hash_dir_discover_2().
+ */
+void Migrator::handle_hash_dir_discover(MHashDirDiscover *m)
+{
+  assert(m->get_source().num() != mds->get_nodeid());
+
+  dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl;
+
+  // must discover it!
+  C_MDC_HashDirDiscover *fin = new C_MDC_HashDirDiscover(this, m);
+  filepath target(m->get_path());
+  Context *retry = new C_MDS_RetryMessage(mds,m);  // on delay/retry
+  cache->path_traverse(target, fin->trace, true,
+                       m, retry,
+                       MDS_TRAVERSE_DISCOVER,
+                       fin);  // on completion|error
+}
+
+/*
+ * discover step, part 2 (non-auth peer):
+ * runs once the path traversal for an MHashDirDiscover has finished.
+ *
+ *  m  - the discover request; deleted here after the ack is sent, kept
+ *       alive when we have to retry after opening the dir
+ *  in - inode the traversal ended on, or 0 if there is none
+ *  r  - traversal result; negative on error
+ */
+void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r)
+{
+  // yay!
+  if (in) {
+    dout(7) << "handle_hash_dir_discover_2 has " << *in << endl;
+  }
+
+  // Test 'in' before dereferencing it: the completion context passes
+  // whatever get_inode() returned for the last dentry, which may be 0
+  // even when r >= 0.
+  if (r < 0 || !in || !in->is_dir()) {
+    dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl;
+    assert(0); // this shouldn't happen if the auth pins his path properly!!!!
+  }
+  assert(in->is_dir());
+
+  // is dir open?
+  if (!in->dir) {
+    // not yet: open it and re-dispatch the same message afterwards
+    dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl;
+    cache->open_remote_dir(in,
+                           new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+  CDir *dir = in->dir;
+
+  // pin dir, set hashing flag
+  dir->state_set(CDIR_STATE_HASHING);
+  dir->get(CDIR_PIN_HASHING);
+  assert(dir->hashed_subset.empty());
+
+  // inode state
+  dir->inode->inode.hash_seed = 1;// dir->ino();
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // get gather set ready for notifies: every mds except us and the
+  // dir's authority
+  assert(hash_gather[dir].empty());
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i == mds->get_nodeid()) continue;
+    if (i == dir->authority()) continue;
+    hash_gather[dir].insert(i);
+  }
+
+  // reply
+  dout(7) << " sending hash_dir_discover_ack on " << *dir << endl;
+  mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()),
+                        m->get_source().num(), MDS_PORT_MIGRATOR);
+  delete m;
+}
+
+/*
+ * prep step:
+ * peers need to open up all subdirs of the hashed dir
+ *
+ * Handles MHashDirPrep on a non-auth peer.  The message may be dispatched
+ * more than once: the first pass assimilates the inodes it carries and
+ * starts opening any dirs that are not open yet; every pass then checks
+ * whether all of those dirs are open, and only when they are does it
+ * send MHashDirPrepAck back to the sender and delete the message.
+ */
+
+void Migrator::handle_hash_dir_prep(MHashDirPrep *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ dout(7) << "handle_hash_dir_prep " << *dir << endl;
+
+ // did_assim()/mark_assim() flag the message itself, so the retried
+ // dispatches below skip this section.
+ if (!m->did_assim()) {
+ m->mark_assim(); // only do this the first time!
+
+ // assimilate dentry+inodes for exports
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ // note: this 'in' shadows the dir inode declared above
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ if (in) {
+ it->second->update_inode(in);
+ dout(5) << " updated " << *in << endl;
+ } else {
+ // not in cache yet: create it and link it under it->first
+ in = new CInode(mds->mdcache, false);
+ it->second->update_inode(in);
+ cache->add_inode(in);
+
+ // link
+ dir->add_dentry( it->first, in );
+ dout(5) << " added " << *in << endl;
+ }
+
+ // open!
+ // NOTE(review): one retry context per unopened dir is registered, all
+ // pointing at the same m, which is deleted at the end of the final
+ // pass.  Confirm that no retry can fire after that delete.
+ if (!in->dir) {
+ dout(5) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+ }
+ }
+ }
+
+ // verify!
+ // count dirs that are still not open; pin the ones that are
+ int waiting_for = 0;
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ assert(in);
+
+ if (in->dir) {
+ // the state bit guards against taking the pin twice across passes
+ if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+ dout(5) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+ } else {
+ dout(5) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(5) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+
+ // not everything is open yet: keep m alive for the pending retries
+ if (waiting_for) {
+ dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+ return;
+ }
+
+ // ack!
+ mds->send_message_mds(new MHashDirPrepAck(dir->ino()),
+ m->get_source().num(), MDS_PORT_MIGRATOR);
+
+ // done.
+ delete m;
+}
+
+
+/*
+ * hash step (non-auth peer):
+ * the old auth sent us our share of the dir in an MHashDir.  Import it,
+ * mark our copy of the dir hashed, log + commit it, notify the
+ * bystanders, and ack to the sender.
+ */
+
+void Migrator::handle_hash_dir(MHashDir *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+  assert(!dir->is_auth());
+  assert(!dir->is_hashed());
+  assert(dir->is_hashing());
+
+  dout(5) << "handle_hash_dir " << *dir << endl;
+
+  // the sender is the node we are importing from
+  int oldauth = m->get_source().num();
+
+  // content
+  import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth);
+
+  // dir state: now hashed, and we hold one of the subsets
+  dir->state_set(CDIR_STATE_HASHED);
+  dir->get(CDIR_PIN_HASHED);
+  cache->hashdirs.insert(dir);
+  dir->hashed_subset.insert(mds->get_nodeid());
+
+  // dir is complete; journal and commit it
+  dir->mark_complete();
+  dir->mark_dirty();
+  mds->mdlog->submit_entry(new EDirUpdate(dir));
+  mds->mdstore->commit_dir(dir, 0);
+
+  // send notifies to everyone but ourselves and the sender
+  dout(7) << "sending notifies" << endl;
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer == mds->get_nodeid() || peer == oldauth)
+      continue;
+    mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()),
+                          peer, MDS_PORT_MIGRATOR);
+  }
+
+  // ack
+  dout(7) << "acking" << endl;
+  mds->send_message_mds(new MHashDirAck(dir->ino()),
+                        oldauth, MDS_PORT_MIGRATOR);
+
+  // done.
+  delete m;
+
+  show_imports();
+}
+
+
+
+
+
+// UNHASH on auth
+
+// Callback passed to CDir::freeze_dir() by Migrator::unhash_dir();
+// when it fires, the unhash continues in Migrator::unhash_dir_frozen().
+class C_MDC_UnhashFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+
+  C_MDC_UnhashFreeze(Migrator *migrator, CDir *target)
+    : mig(migrator),
+      dir(target)
+  {}
+
+  virtual void finish(int r) {
+    mig->unhash_dir_frozen(dir);
+  }
+};
+
+// Callback passed to fetch_dir() when a dir being unhashed is not yet
+// complete; when it fires, Migrator::unhash_dir_complete() takes over.
+class C_MDC_UnhashComplete : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+
+  C_MDC_UnhashComplete(Migrator *migrator, CDir *target)
+    : mig(migrator),
+      dir(target)
+  {}
+
+  virtual void finish(int r) {
+    mig->unhash_dir_complete(dir);
+  }
+};
+
+
+/*
+ * auth: begin unhashing a hashed dir.
+ * Pins the path to the dir, marks it UNHASHING, then starts freezing it
+ * and (if needed) fetching it.  Whichever of "frozen" and "complete"
+ * happens last moves on to unhash_dir_prep().
+ * Returns without doing anything if the path cannot be pinned.
+ */
+void Migrator::unhash_dir(CDir *dir)
+{
+  dout(-7) << "unhash_dir " << *dir << endl;
+
+  // preconditions
+  assert(dir->is_hashed());
+  assert(!dir->is_unhashing());
+  assert(dir->is_auth());
+  assert(hash_gather.count(dir)==0);
+
+  // pin path?
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  if (!cache->path_pin(path, 0, 0)) {
+    dout(7) << "unhash_dir couldn't pin path, failing." << endl;
+    return;
+  }
+
+  // twiddle state
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // first, freeze the dir.
+  dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+}
+
+/*
+ * auth: the hashed dir is now frozen.
+ * If it is also complete, go on to the prep step; otherwise the
+ * completion path will get there later.
+ */
+void Migrator::unhash_dir_frozen(CDir *dir)
+{
+  dout(7) << "unhash_dir_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep(dir);
+    return;
+  }
+
+  dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl;
+}
+
+
+/*
+ * auth: ask peers to freeze and complete the hashed dir.
+ * Sends MUnhashDirPrep to every other mds and records each one in
+ * hash_gather[dir].  A non-empty gather set means the preps already went
+ * out, so a second call is a no-op.
+ */
+void Migrator::unhash_dir_prep(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // already been here..freeze must have been instantaneous
+  if (hash_gather[dir].size() > 0)
+    return;
+
+  // send unhash prep to all peers
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MUnhashDirPrep(dir->ino()),
+                            peer, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * wait for peers to freeze and complete hashed dirs
+ *
+ * auth: handles MUnhashDirPrepAck.  The message may be dispatched more
+ * than once: the first pass assimilates the inodes it carries and starts
+ * opening dirs that are not open yet; every pass re-checks whether they
+ * are all open.  Once they are, the sender is removed from
+ * hash_gather[dir]; the last ack triggers unhash_dir_go().
+ */
+void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ int from = m->get_source().num();
+ dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl;
+
+ // did_assim()/mark_assim() flag the message itself, so retried
+ // dispatches skip this section.
+ if (!m->did_assim()) {
+ m->mark_assim(); // only do this the first time!
+
+ // assimilate dentry+inodes for exports
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ // note: this 'in' shadows the dir inode declared above
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ if (in) {
+ it->second->update_inode(in);
+ dout(5) << " updated " << *in << endl;
+ } else {
+ // not in cache yet: create it and link it under it->first
+ in = new CInode(mds->mdcache, false);
+ it->second->update_inode(in);
+ cache->add_inode(in);
+
+ // link
+ dir->add_dentry( it->first, in );
+ dout(5) << " added " << *in << endl;
+ }
+
+ // open!
+ // NOTE(review): one retry context per unopened dir is registered, all
+ // pointing at the same m, which is deleted at the end of the final
+ // pass.  Confirm that no retry can fire after that delete.
+ if (!in->dir) {
+ dout(5) << " opening nested export on " << *in << endl;
+ cache->open_remote_dir(in,
+ new C_MDS_RetryMessage(mds, m));
+ }
+ }
+ }
+
+ // verify!
+ // count dirs that are still not open; pin the ones that are
+ int waiting_for = 0;
+ for (map<string,CInodeDiscover*>::iterator it = m->get_inodes().begin();
+ it != m->get_inodes().end();
+ it++) {
+ CInode *in = cache->get_inode( it->second->get_ino() );
+ assert(in);
+
+ if (in->dir) {
+ // the state bit guards against taking the pin twice across passes
+ if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) {
+ dout(5) << " pinning nested export " << *in->dir << endl;
+ in->dir->get(CDIR_PIN_IMPORTINGEXPORT);
+ in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT);
+ } else {
+ dout(5) << " already pinned nested export " << *in << endl;
+ }
+ } else {
+ dout(5) << " waiting for nested export dir on " << *in << endl;
+ waiting_for++;
+ }
+ }
+
+ // not everything is open yet: keep m alive for the pending retries
+ if (waiting_for) {
+ dout(5) << "waiting for " << waiting_for << " dirs to open" << endl;
+ return;
+ }
+
+ // ok, done with this PrepAck
+ assert(hash_gather[dir].count(from) == 1);
+ hash_gather[dir].erase(from);
+
+ if (hash_gather[dir].empty()) {
+ // drop the map entry first; unhash_dir_go() asserts that
+ // hash_gather[dir] is empty and then refills it
+ hash_gather.erase(dir);
+ dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl;
+ unhash_dir_go(dir);
+ } else {
+ dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl;
+ }
+
+ delete m;
+}
+
+
+/*
+ * auth:
+ * every peer has acked the prep; send out MUnhashDir's to peers and
+ * gather on them in hash_gather[dir].
+ */
+void Migrator::unhash_dir_go(CDir *dir)
+{
+  dout(7) << "unhash_dir_go " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(dir->is_frozen_dir());
+  assert(dir->is_complete());
+
+  // send the unhash request to all peers
+  assert(hash_gather[dir].empty());
+  for (int peer = 0; peer < mds->get_mds_map()->get_num_mds(); peer++) {
+    if (peer != mds->get_nodeid()) {
+      hash_gather[dir].insert(peer);
+      mds->send_message_mds(new MUnhashDir(dir->ino()),
+                            peer, MDS_PORT_MIGRATOR);
+    }
+  }
+}
+
+/*
+ * auth:
+ * assimilate unhashing content
+ *
+ * Handles MUnhashDirAck: imports the content a peer sent back and removes
+ * that peer from hash_gather[dir].  After the last ack it clears the
+ * dir's hashed state, commits it, resets the inode's hash_seed, and sends
+ * MUnhashDirNotify to every other mds, gathering on the replies.
+ */
+void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m)
+{
+ CInode *in = cache->get_inode(m->get_ino());
+ assert(in);
+ CDir *dir = in->dir;
+ assert(dir);
+
+ dout(7) << "handle_unhash_dir_ack " << *dir << endl;
+ assert(dir->is_hashed());
+
+ // assimilate content
+ int from = m->get_source().num();
+ import_hashed_content(dir, m->get_state(), m->get_nden(), from);
+ // m is not touched again below
+ delete m;
+
+ // done?
+ assert(hash_gather[dir].count(from));
+ hash_gather[dir].erase(from);
+
+ if (!hash_gather[dir].empty()) {
+ dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl;
+ return;
+ }
+
+ // done!
+
+ // fix up nested_exports
+ // if the dir is not its own auth container, move its nested exports
+ // over to the container
+ CDir *containing_import = cache->get_auth_container(dir);
+ if (containing_import != dir) {
+ for (set<CDir*>::iterator it = cache->nested_exports[dir].begin();
+ it != cache->nested_exports[dir].end();
+ it++) {
+ dout(7) << "moving nested export out from under hashed dir : " << **it << endl;
+ cache->nested_exports[containing_import].insert(*it);
+ }
+ cache->nested_exports.erase(dir);
+ }
+
+ // dir state
+ // UNHASHING stays set until unhash_dir_finish() clears it
+ //dir->state_clear(CDIR_STATE_UNHASHING); //later
+ dir->state_clear(CDIR_STATE_HASHED);
+ dir->put(CDIR_PIN_HASHED);
+ cache->hashdirs.erase(dir);
+
+ // commit!
+ assert(dir->is_complete());
+ //dir->mark_complete();
+ dir->mark_dirty();
+ mds->mdstore->commit_dir(dir, 0);
+
+ // inode state
+ dir->inode->inode.hash_seed = 0;
+ if (dir->inode->is_auth()) {
+ dir->inode->mark_dirty();
+ mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+ }
+
+ // notify
+ // hash_gather[dir] is reused for the notify acks; the entry still
+ // exists here but is empty
+ assert(hash_gather[dir].empty());
+ for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+ if (i == mds->get_nodeid()) continue;
+
+ hash_gather[dir].insert(i);
+
+ mds->send_message_mds(new MUnhashDirNotify(dir->ino()),
+ i, MDS_PORT_MIGRATOR);
+ }
+}
+
+
+/*
+ * auth: sent by a peer once its mds links are flushed.
+ * Removes the sender from hash_gather[dir]; when all peers have acked,
+ * finishes the unhash (which unfreezes the dir).
+ */
+void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // log under this handler's own name (it used to print
+  // "handle_unhash_dir_ack", which is a different handler)
+  dout(7) << "handle_unhash_dir_notify_ack " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(dir->is_frozen_dir());
+
+  // done?
+  int from = m->get_source().num();
+  assert(hash_gather[dir].count(from));
+  hash_gather[dir].erase(from);
+  delete m;
+
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl;
+  } else {
+    unhash_dir_finish(dir);
+  }
+}
+
+
+/*
+ * auth: all mds links are flushed.
+ * Drop the gather entry, unpin the path pinned by unhash_dir(), clear
+ * UNHASHING, and unfreeze the dir.
+ */
+void Migrator::unhash_dir_finish(CDir *dir)
+{
+  dout(7) << "unhash_dir_finish " << *dir << endl;
+  hash_gather.erase(dir);
+
+  // unpin path
+  vector<CDentry*> path;
+  cache->make_trace(path, dir->inode);
+  cache->path_unpin(path, 0);
+
+  // state, then unfreeze
+  dir->state_clear(CDIR_STATE_UNHASHING);
+  dir->unfreeze_dir();
+}
+
+
+
+// UNHASH on all
+
+/*
+ * hashed dir is complete (runs on auth and non-auth alike).
+ * mark all migrating inodes dirty (to pin in cache)
+ * if frozen too, then go to next step (depending on auth)
+ */
+void Migrator::unhash_dir_complete(CDir *dir)
+{
+  dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_complete());
+
+  // mark dirty to pin in cache
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CInode *in = it->second->inode;
+    // a dentry need not have an inode linked (e.g. a null dentry);
+    // there is nothing to dirty in that case
+    if (!in) continue;
+    if (in->is_auth()) {
+      in->mark_dirty();
+      mds->mdlog->submit_entry(new EInodeUpdate(in));
+    }
+  }
+
+  if (!dir->is_frozen_dir()) {
+    // the freeze callback will pick things up from here
+    dout(7) << "dir complete but !frozen, waiting " << *dir << endl;
+  } else {
+    if (dir->is_auth())
+      unhash_dir_prep(dir);          // auth
+    else
+      unhash_dir_prep_finish(dir);   // nonauth
+  }
+}
+
+
+// UNHASH on non-auth
+
+// Callback passed to CDir::freeze_dir() by
+// Migrator::handle_unhash_dir_prep() on a non-auth peer; when it fires,
+// Migrator::unhash_dir_prep_frozen() takes over.
+class C_MDC_UnhashPrepFreeze : public Context {
+public:
+  Migrator *mig;
+  CDir *dir;
+
+  C_MDC_UnhashPrepFreeze(Migrator *migrator, CDir *target)
+    : mig(migrator),
+      dir(target)
+  {}
+
+  virtual void finish(int r) {
+    mig->unhash_dir_prep_frozen(dir);
+  }
+};
+
+
+/*
+ * non-auth peer: the auth wants to unhash.
+ * Start freezing our copy of the dir and make it complete, fetching it
+ * if necessary.  The message is consumed here.
+ */
+void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m)
+{
+  CInode *diri = cache->get_inode(m->get_ino());
+  assert(diri);
+  CDir *dir = diri->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir_prep " << *dir << endl;
+  assert(dir->is_hashed());
+
+  // freeze
+  dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir));
+
+  // make complete
+  if (dir->is_complete()) {
+    unhash_dir_complete(dir);
+  } else {
+    dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl;
+    mds->mdstore->fetch_dir(dir,
+                            new C_MDC_UnhashComplete(this, dir));
+  }
+
+  delete m;
+}
+
+/*
+ * non-auth peer: our copy of the hashed dir is frozen.
+ * If it is complete as well, finish the prep; otherwise the completion
+ * path will do so later.
+ */
+void Migrator::unhash_dir_prep_frozen(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep_frozen " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_frozen_dir());
+  assert(!dir->is_auth());
+
+  if (dir->is_complete()) {
+    unhash_dir_prep_finish(dir);
+    return;
+  }
+
+  dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl;
+}
+
+/*
+ * non-auth peer: hashed dir is complete and frozen. ack.
+ * Marks the dir UNHASHING and sends an MUnhashDirPrepAck to the auth,
+ * carrying a replica of every open subdir inode whose dentry hashes to
+ * this mds.  A second call (dir already UNHASHING) does nothing.
+ */
+void Migrator::unhash_dir_prep_finish(CDir *dir)
+{
+  dout(7) << "unhash_dir_prep_finish " << *dir << endl;
+  assert(dir->is_hashed());
+  assert(!dir->is_auth());
+  assert(dir->is_frozen());
+  assert(dir->is_complete());
+
+  // twiddle state
+  if (dir->is_unhashing())
+    return; // already replied.
+  dir->state_set(CDIR_STATE_UNHASHING);
+
+  // send subdirs back to auth
+  MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino());
+  int auth = dir->authority();
+
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;
+
+    // a dentry need not have an inode linked (e.g. a null dentry)
+    if (!in) continue;
+    if (!in->is_dir()) continue;
+    if (!in->dir) continue;          // dir not open here
+
+    // only report dentries that hash to us
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) continue;
+
+    // msg?
+    ack->add_inode(it->first, in->replicate_to(auth));
+  }
+
+  // ack
+  mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR);
+}
+
+
+
+/*
+ * peer needs to send hashed dir content back to auth.
+ * unhash dir.
+ *
+ * non-auth peer: handles MUnhashDir.  Encodes every dentry that hashes
+ * to this mds (with its inode, for primary links) into a buffer, marks
+ * the exported inodes as proxies, clears the hashed state on our copy of
+ * the dir, primes hash_gather[dir] for the notify round, and sends the
+ * buffer to the auth in an MUnhashDirAck.  The request message is
+ * consumed here.
+ */
+void Migrator::handle_unhash_dir(MUnhashDir *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  dout(7) << "handle_unhash_dir " << *dir << endl;//" .. hash_seed is " << dir->inode->inode.hash_seed << endl;
+  assert(dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  // get message ready
+  bufferlist bl;
+  int nden = 0;
+
+  // suck up all waiters
+  // NOTE(review): fin only collects waiters in this function; nothing
+  // visible here finishes, queues or deletes it.  Confirm whether it is
+  // meant to be handed off somewhere -- as written it looks leaked.
+  C_Contexts *fin = new C_Contexts;
+  list<Context*> waiting;
+  dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters
+  fin->take(waiting);
+
+  // divy up contents
+  for (CDir_map_t::iterator it = dir->begin();
+       it != dir->end();
+       it++) {
+    CDentry *dn = it->second;
+    CInode *in = dn->inode;   // shadows the dir inode above; may be 0
+
+    int dentryhashcode = mds->hash_dentry( dir->ino(), it->first );
+    if (dentryhashcode != mds->get_nodeid()) {
+      // not mine!
+      // twiddle dir_auth?
+      // (guard on 'in': a dentry need not have an inode linked)
+      if (in && in->dir) {
+        if (in->dir->authority() != dir->authority())
+          in->dir->set_dir_auth( in->dir->authority() );
+        else
+          in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+      }
+      continue;
+    }
+
+    // -- dentry
+    dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl;
+    _encode(it->first, bl);
+
+    // null dentry?
+    if (dn->is_null()) {
+      bl.append("N", 1); // null dentry
+      assert(dn->is_sync());
+      continue;
+    }
+
+    if (dn->is_remote()) {
+      // remote link
+      bl.append("L", 1); // remote link
+
+      inodeno_t ino = dn->get_remote_ino();
+      bl.append((char*)&ino, sizeof(ino));
+      continue;
+    }
+
+    // primary link
+    // -- inode
+    bl.append("I", 1); // inode dentry
+
+    encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export
+    nden++;   // only primary links are counted
+
+    if (dn->is_dirty())
+      dn->mark_clean();
+
+    // proxy
+    // pinned until handle_unhash_dir_notify() has heard from everyone
+    in->state_set(CINODE_STATE_PROXY);
+    in->get(CINODE_PIN_PROXY);
+    hash_proxy_inos[dir].push_back(in);
+
+    if (in->dir) {
+      if (in->dir->is_auth()) {
+        // mine. make it into an import.
+        dout(7) << "making subdir into import " << *in->dir << endl;
+        in->dir->set_dir_auth( mds->get_nodeid() );
+        cache->imports.insert(in->dir);
+        in->dir->get(CDIR_PIN_IMPORT);
+        in->dir->state_set(CDIR_STATE_IMPORT);
+      }
+      else {
+        // not mine.
+        dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl;
+        assert(in->dir->is_export());
+        in->dir->put(CDIR_PIN_EXPORT);
+        in->dir->state_clear(CDIR_STATE_EXPORT);
+        cache->exports.erase(in->dir);
+        cache->nested_exports[dir].erase(in->dir);
+      }
+    }
+
+    // waiters
+    list<Context*> waiters;
+    in->take_waiting(CINODE_WAIT_ANY, waiters);
+    fin->take(waiters);
+  }
+
+  // we should have no nested exports; we're not auth for the dir!
+  assert(cache->nested_exports[dir].empty());
+  cache->nested_exports.erase(dir);
+
+  // dir state
+  //dir->state_clear(CDIR_STATE_UNHASHING); // later
+  dir->state_clear(CDIR_STATE_HASHED);
+  dir->put(CDIR_PIN_HASHED);
+  cache->hashdirs.erase(dir);
+  dir->mark_clean();
+
+  // inode state
+  dir->inode->inode.hash_seed = 0;
+  if (dir->inode->is_auth()) {
+    dir->inode->mark_dirty();
+    mds->mdlog->submit_entry(new EInodeUpdate(dir->inode));
+  }
+
+  // init gather set: everyone but us
+  hash_gather[dir] = mds->get_mds_map()->get_mds();
+  hash_gather[dir].erase(mds->get_nodeid());
+
+  // send unhash message
+  mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden),
+                        dir->authority(), MDS_PORT_MIGRATOR);
+
+  // done.  Like the other handlers in this file, we own the incoming
+  // message and must free it.
+  delete m;
+}
+
+
+/*
+ * first notify comes from auth.
+ * send notifies to all other peers, with peer = self
+ * if we get notify from peer=other, remove from our gather list.
+ * when we've gotten notifies from everyone,
+ * unpin proxies,
+ * send notify_ack to auth.
+ * this ensures that all mds links are flushed of cache_expire type messages.
+ *
+ * The notify message is consumed here.
+ */
+void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  CDir *dir = in->dir;
+  assert(dir);
+
+  // log under this handler's own name (it used to print
+  // "handle_unhash_dir_finish", which is not a function in this file)
+  dout(7) << "handle_unhash_dir_notify " << *dir << endl;
+  assert(!dir->is_hashed());
+  assert(dir->is_unhashing());
+  assert(!dir->is_auth());
+
+  int from = m->get_source().num();
+  assert(hash_gather[dir].count(from) == 1);
+  hash_gather[dir].erase(from);
+  delete m;
+
+  // did we send our shout out?
+  if (from == dir->authority()) {
+    // send notify to everyone else in weird chatter storm
+    for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+      if (i == from) continue;
+      if (i == mds->get_nodeid()) continue;
+      mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR);
+    }
+  }
+
+  // are we done?
+  if (!hash_gather[dir].empty()) {
+    dout(7) << "still waiting for notify from " << hash_gather[dir] << endl;
+    return;
+  }
+  hash_gather.erase(dir);
+
+  // all done!
+  dout(7) << "all mds links flushed, unpinning unhash proxies" << endl;
+
+  // unpin proxies
+  for (list<CInode*>::iterator it = hash_proxy_inos[dir].begin();
+       it != hash_proxy_inos[dir].end();
+       it++) {
+    CInode *in = *it;
+    assert(in->state_test(CINODE_STATE_PROXY));
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+  // Forget the list now that every entry is unpinned.  Leaving it behind
+  // would make a later pass over the same dir walk these stale pointers
+  // again and trip the PROXY assert / drop the pin twice.
+  hash_proxy_inos.erase(dir);
+
+  // unfreeze
+  dir->unfreeze_dir();
+
+  // ack
+  dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl;
+  mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR);
+}
+
+
+
+
+// Convenience wrapper: forwards to the balancer's show_imports().
+void Migrator::show_imports()
+{
+ mds->balancer->show_imports();
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_MIGRATOR_H
+#define __MDS_MIGRATOR_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::multimap;   // used by the Migrator members below
+using std::list;
+using std::set;
+
+
+// Forward declarations: this header only deals in pointers and references
+// to these types.  MDCache and Message are declared here as well so the
+// header does not depend on being included after their own headers.
+class MDS;
+class MDCache;
+class CDir;
+class CInode;
+class CDentry;
+class Message;
+
+// export/import messages
+class MExportDirDiscover;
+class MExportDirDiscoverAck;
+class MExportDirPrep;
+class MExportDirPrepAck;
+class MExportDirWarning;
+class MExportDir;
+class MExportDirNotify;
+class MExportDirNotifyAck;
+class MExportDirFinish;
+
+// hash messages
+class MHashDirDiscover;
+class MHashDirDiscoverAck;
+class MHashDirPrep;
+class MHashDirPrepAck;
+class MHashDir;
+class MHashDirAck;
+class MHashDirNotify;
+
+// unhash messages
+class MUnhashDirPrep;
+class MUnhashDirPrepAck;
+class MUnhashDir;
+class MUnhashDirAck;
+class MUnhashDirNotify;
+class MUnhashDirNotifyAck;
+
+// Migrator: the part of the MDS that moves directory authority between
+// mds nodes.  It declares three protocols, each split into the role a
+// node plays in it:
+//   - export/import of a directory (exporter, importer, bystander)
+//   - hashing a directory across nodes (auth, non-auth)
+//   - unhashing it again (auth, non-auth)
+// Incoming messages enter through dispatch(); the handle_* methods are
+// the per-message handlers.
+class Migrator {
+private:
+ MDS *mds;
+ MDCache *cache;
+
+ // export fun
+ map<CDir*, set<int> > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from
+ map<CDir*, list<inodeno_t> > export_proxy_inos;
+ map<CDir*, list<inodeno_t> > export_proxy_dirinos;
+
+ set<inodeno_t> stray_export_warnings; // notifies i haven't seen
+ map<inodeno_t, MExportDirNotify*> stray_export_notifies;
+
+ // hashing madness
+ // NOTE(review): confirm these three are still used by the
+ // implementation; the hash/unhash code reviewed with this header tracks
+ // its progress in hash_gather instead.
+ multimap<CDir*, int> unhash_waiting; // nodes i am waiting for UnhashDirAck's from
+ multimap<inodeno_t, inodeno_t> import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir
+ // maps frozen_dir_ino's to waiting-for-discover ino's.
+ multimap<inodeno_t, inodeno_t> import_hashed_frozen_waiting; // dirs i froze (for the above)
+
+public:
+ // -- cons --
+ Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {}
+
+ // entry point for incoming messages
+ void dispatch(Message*);
+
+ // -- import/export --
+ // exporter
+ public:
+ void export_dir(CDir *dir,
+ int mds);
+ void export_empty_import(CDir *dir);
+
+ // inode (de)serialization; also used by the unhash code
+ void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth);
+ void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth);
+
+ protected:
+ map< CDir*, set<int> > export_gather;
+ void handle_export_dir_discover_ack(MExportDirDiscoverAck *m);
+ void export_dir_frozen(CDir *dir, int dest);
+ void handle_export_dir_prep_ack(MExportDirPrepAck *m);
+ void export_dir_go(CDir *dir,
+ int dest);
+ int export_dir_walk(MExportDir *req,
+ class C_Contexts *fin,
+ CDir *basedir,
+ CDir *dir,
+ int newauth);
+ void export_dir_finish(CDir *dir);
+ void handle_export_dir_notify_ack(MExportDirNotifyAck *m);
+
+
+ friend class C_MDC_ExportFreeze;
+
+ // importer
+ void handle_export_dir_discover(MExportDirDiscover *m);
+ void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r);
+ void handle_export_dir_prep(MExportDirPrep *m);
+ void handle_export_dir(MExportDir *m);
+ void import_dir_finish(CDir *dir);
+ void handle_export_dir_finish(MExportDirFinish *m);
+ int import_dir_block(bufferlist& bl,
+ int& off,
+ int oldauth,
+ CDir *import_root,
+ list<inodeno_t>& imported_subdirs);
+ void got_hashed_replica(CDir *import,
+ inodeno_t dir_ino,
+ inodeno_t replica_ino);
+
+
+ friend class C_MDC_ExportDirDiscover;
+
+ // bystander
+ void handle_export_dir_warning(MExportDirWarning *m);
+ void handle_export_dir_notify(MExportDirNotify *m);
+
+ // forwards to the balancer's show_imports()
+ void show_imports();
+
+ // -- hashed directories --
+
+ // HASH
+ public:
+ void hash_dir(CDir *dir); // on auth
+ protected:
+ // per-dir set of mds nodes we are still waiting on; reused by each
+ // gather phase of the hash and unhash protocols
+ map< CDir*, set<int> > hash_gather;
+ map< CDir*, map< int, set<int> > > hash_notify_gather;
+ // inodes marked PROXY for a dir, kept pinned until the notify round ends
+ map< CDir*, list<CInode*> > hash_proxy_inos;
+
+ // hash on auth
+ void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m);
+ void hash_dir_complete(CDir *dir);
+ void hash_dir_frozen(CDir *dir);
+ void handle_hash_dir_prep_ack(MHashDirPrepAck *m);
+ void hash_dir_go(CDir *dir);
+ void handle_hash_dir_ack(MHashDirAck *m);
+ void hash_dir_finish(CDir *dir);
+ friend class C_MDC_HashFreeze;
+ friend class C_MDC_HashComplete;
+
+ // auth and non-auth
+ void handle_hash_dir_notify(MHashDirNotify *m);
+
+ // hash on non-auth
+ void handle_hash_dir_discover(MHashDirDiscover *m);
+ void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r);
+ void handle_hash_dir_prep(MHashDirPrep *m);
+ void handle_hash_dir(MHashDir *m);
+ friend class C_MDC_HashDirDiscover;
+
+ // UNHASH
+ public:
+ void unhash_dir(CDir *dir); // on auth
+ protected:
+ map< CDir*, list<MUnhashDirAck*> > unhash_content;
+ // decodes content sent by oldauth into dir; called from both
+ // handle_hash_dir() and handle_unhash_dir_ack()
+ void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth);
+
+ // unhash on auth
+ void unhash_dir_frozen(CDir *dir);
+ void unhash_dir_prep(CDir *dir);
+ void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m);
+ void unhash_dir_go(CDir *dir);
+ void handle_unhash_dir_ack(MUnhashDirAck *m);
+ void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m);
+ void unhash_dir_finish(CDir *dir);
+ friend class C_MDC_UnhashFreeze;
+ friend class C_MDC_UnhashComplete;
+
+ // unhash on all
+ void unhash_dir_complete(CDir *dir);
+
+ // unhash on non-auth
+ void handle_unhash_dir_prep(MUnhashDirPrep *m);
+ void unhash_dir_prep_frozen(CDir *dir);
+ void unhash_dir_prep_finish(CDir *dir);
+ void handle_unhash_dir(MUnhashDir *m);
+ void handle_unhash_dir_notify(MUnhashDirNotify *m);
+ friend class C_MDC_UnhashPrepFreeze;
+
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "OSDMonitor.h"
+
+#include "osd/OSDMap.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#include "config.h"
+// Monitor-local debug output.  dout(l)/derr(l) write to cout/cerr when l
+// is <= g_conf.debug or <= g_conf.debug_mon, prefixed with
+// "mon<whoami> e<epoch> " (epoch 0 while osdmap is unset).
+// Each macro expands to a bare `if (...) stream`, so a dout()/derr() used
+// as the body of an if that has an else must be wrapped in braces.
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " "
+
+
+// Timer event scheduled by OSDMonitor::init(): sends an MPing to mon0
+// through the given messenger.
+class C_OM_PingTick : public Context {
+public:
+  Messenger *msgr;
+
+  C_OM_PingTick(Messenger *messenger)
+    : msgr(messenger)
+  {}
+
+  void finish(int r) {
+    MPing *ping = new MPing;
+    msgr->send_message(ping, MSG_ADDR_MON(0));
+  }
+};
+
+// Timer event that triggers OSDMonitor::fake_reorg().
+class C_OM_Faker : public Context {
+public:
+  OSDMonitor *om;
+
+  C_OM_Faker(OSDMonitor *m) : om(m) {}
+
+  void finish(int r) {
+    om->fake_reorg();
+  }
+};
+
+// Timer event that injects a fake failure for one osd via
+// OSDMonitor::fake_osd_failure(): DOWN when 'down' is true, OUT otherwise.
+class C_OM_FakeOSDFailure : public Context {
+  OSDMonitor *mon;
+  int osd;
+  bool down;
+public:
+  C_OM_FakeOSDFailure(OSDMonitor *m, int o, bool d)
+    : mon(m),
+      osd(o),
+      down(d)
+  {}
+
+  void finish(int r) {
+    mon->fake_osd_failure(osd, down);
+  }
+};
+
+
+
+// Testing aid: accept the pending map change and send the resulting
+// incremental to one randomly chosen osd.
+void OSDMonitor::fake_osdmap_update()
+{
+  dout(1) << "fake_osdmap_update" << endl;
+  accept_pending();
+
+  // tell a random osd
+  int target = rand() % g_conf.num_osd;
+  send_incremental_map(osdmap->get_epoch()-1,   // ick! FIXME
+                       MSG_ADDR_OSD(target));
+}
+
+
+// Testing aid: pick a random osd and flip it -- mark it in if the map has
+// it out, out otherwise -- then accept the change and send that osd the
+// incremental map.
+void OSDMonitor::fake_reorg()
+{
+  int victim = rand() % g_conf.num_osd;
+
+  if (!osdmap->is_out(victim)) {
+    dout(1) << "fake_reorg marking osd" << victim << " out" << endl;
+    pending.new_out.push_back(victim);
+  } else {
+    dout(1) << "fake_reorg marking osd" << victim << " in" << endl;
+    pending.new_in.push_back(victim);
+  }
+
+  accept_pending();
+
+  // tell him!
+  send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(victim));
+}
+
+
+// Set up the monitor: build the initial OSDMap from g_conf, schedule the
+// optional fake reorg / fake failure events (mon0 only), register this
+// object as the messenger's dispatcher, and start the ping ticker.
+void OSDMonitor::init()
+{
+ dout(1) << "init" << endl;
+
+
+ // <HACK set up OSDMap from g_conf>
+ osdmap = new OSDMap();
+ osdmap->set_pg_bits(g_conf.osd_pg_bits);
+
+ // start at epoch 0 until all osds boot
+ //osdmap->inc_epoch(); // = 1
+ //assert(osdmap->get_epoch() == 1);
+
+
+ //if (g_conf.mkfs) osdmap->set_mkfs();
+
+ // one uniform bucket acts as the crush root and holds osds
+ // 0..num_osd-1
+ Bucket *b = new UniformBucket(1, 0);
+ int root = osdmap->crush.add_bucket(b);
+ for (int i=0; i<g_conf.num_osd; i++) {
+ osdmap->osds.insert(i);
+ b->add_item(i, 1);
+ }
+
+ // rules 1..4: TAKE root, CHOOSE with argument i, EMIT
+ // (presumably rule i selects i osds -- confirm against the crush code)
+ for (int i=1; i<5; i++) {
+ osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+ osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
+ osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+ }
+
+ if (g_conf.mds_local_osd) {
+ // add mds osds, but don't put them in the crush mapping func
+ // (they are numbered from 10000 up)
+ for (int i=0; i<g_conf.num_mds; i++)
+ osdmap->osds.insert(i+10000);
+ }
+
+ // </HACK>
+
+
+
+ // optional test hook: one fake reorg after fake_osdmap_expand; only on
+ // mon0 and only with more than 4 osds
+ if (whoami == 0 &&
+ g_conf.num_osd > 4 &&
+ g_conf.fake_osdmap_expand) {
+ dout(1) << "scheduling OSD map reorg at " << g_conf.fake_osdmap_expand << endl;
+ g_timer.add_event_after(g_conf.fake_osdmap_expand,
+ new C_OM_Faker(this));
+ }
+
+ if (whoami == 0) {
+ // fake osd failures
+ // both maps go osd id -> delay; the last constructor argument selects
+ // DOWN (1) or OUT (0)
+ for (map<int,float>::iterator i = g_fake_osd_down.begin();
+ i != g_fake_osd_down.end();
+ i++) {
+ dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
+ g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 1));
+ }
+ for (map<int,float>::iterator i = g_fake_osd_out.begin();
+ i != g_fake_osd_out.end();
+ i++) {
+ dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
+ g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 0));
+ }
+ }
+
+
+ // i'm ready!
+ messenger->set_dispatcher(this);
+
+ // start ticker
+ g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
+}
+
+
+// Route an incoming message to its handler by type.  The handlers own
+// the message from here on; MSG_PING is handled inline by running tick()
+// and deleting the message.  An unknown type is logged and asserts.
+void OSDMonitor::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // osd state reports
+  case MSG_OSD_FAILURE:
+    handle_osd_failure((MOSDFailure*)m);
+    return;
+  case MSG_OSD_BOOT:
+    handle_osd_boot((MOSDBoot*)m);
+    return;
+  case MSG_OSD_IN:
+    handle_osd_in((MOSDIn*)m);
+    return;
+  case MSG_OSD_OUT:
+    handle_osd_out((MOSDOut*)m);
+    return;
+
+  // map requests
+  case MSG_OSD_GETMAP:
+    handle_osd_getmap((MOSDGetMap*)m);
+    return;
+
+  // pings
+  case MSG_PING_ACK:
+    handle_ping_ack((MPingAck*)m);
+    return;
+  case MSG_PING:
+    tick();
+    delete m;
+    return;
+
+  case MSG_SHUTDOWN:
+    handle_shutdown(m);
+    return;
+
+  default:
+    dout(0) << "unknown message " << *m << endl;
+    assert(0);
+  }
+}
+
+
+// Handle a shutdown request: stop and destroy the messenger, then free the
+// request message.
+void OSDMonitor::handle_shutdown(Message *m)
+{
+  dout(1) << "shutdown from " << m->get_source() << endl;
+  messenger->shutdown();
+  delete messenger;
+  messenger = 0;   // the member must not keep pointing at the freed messenger
+  delete m;
+}
+
+// Handle a ping acknowledgement.  Nothing is done with the ack yet (see the
+// placeholder below); the message is simply freed.
+void OSDMonitor::handle_ping_ack(MPingAck *m)
+{
+ // ...
+
+ delete m;
+}
+
+// An osd was reported as failed.  The report is believed when the osd is
+// currently up and either no instance is recorded for it or the recorded
+// instance equals the one named in the report.  The reporter always gets an
+// incremental map back, and the message is freed here.
+void OSDMonitor::handle_osd_failure(MOSDFailure *m)
+{
+  dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl;
+
+  // FIXME?
+
+  const int failed = m->get_failed().num();
+
+  // take their word for it?  (osd_inst[] is only touched once count() has
+  // shown the entry exists, so no entry is created by the lookup)
+  bool believe = false;
+  if (osdmap->is_up(failed)) {
+    if (osdmap->osd_inst.count(failed) == 0) {
+      believe = true;
+    } else if (osdmap->osd_inst[failed] == m->get_inst()) {
+      believe = true;
+    }
+  }
+
+  if (believe) {
+    pending.new_down[failed] = m->get_inst();
+
+    // start the down -> out clock for osds that are still "in"
+    if (osdmap->is_in(failed)) {
+      pending_out[failed] = g_clock.now();
+    }
+
+    //awaiting_maps[pending.epoch][m->get_source()] =
+
+    accept_pending();
+    bcast_latest_osd_map_mds();
+    //bcast_latest_osd_map_osd(); // FIXME: which osds can i tell?
+  }
+
+  send_incremental_map(m->get_epoch(), m->get_source());
+
+  delete m;
+}
+
+
+
+// Testing hack: queue `osd` as OUT (down == false) or DOWN (down == true) in
+// the pending incremental, accept it, and broadcast the new map to osds and
+// mds nodes.
+void OSDMonitor::fake_osd_failure(int osd, bool down)
+{
+  if (!down) {
+    dout(1) << "fake_osd_failure OUT osd" << osd << endl;
+    pending.new_out.push_back(osd);
+  } else {
+    dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
+    pending.new_down[osd] = osdmap->osd_inst[osd];
+  }
+
+  accept_pending();
+
+  bcast_latest_osd_map_osd();
+  bcast_latest_osd_map_mds();
+}
+
+
+// An osd announces that it has booted.
+//
+// While the map is still at epoch 0, the osd's instance is recorded; once an
+// instance is known for every osd in osdmap->osds, the first epoch is
+// published and sent to everyone.  After that, a booting osd is marked up
+// (after first being marked down if the map still shows it up), marked in if
+// it was out, and sent the incrementals since its superblock epoch.
+//
+// The message is freed on every path.
+void OSDMonitor::handle_osd_boot(MOSDBoot *m)
+{
+  dout(7) << "osd_boot from " << m->get_source() << endl;
+  assert(m->get_source().is_osd());
+  int from = m->get_source().num();
+
+  if (osdmap->get_epoch() == 0) {
+    // waiting for boot!
+    osdmap->osd_inst[from] = m->get_source_inst();
+
+    if (osdmap->osd_inst.size() == osdmap->osds.size()) {
+      dout(-7) << "osd_boot all osds booted." << endl;
+      osdmap->inc_epoch();
+      osdmap->encode(maps[osdmap->get_epoch()]);  // 1
+      pending.epoch = osdmap->get_epoch()+1;      // 2
+
+      send_map();
+      bcast_latest_osd_map_osd();
+      bcast_latest_osd_map_mds();
+    } else {
+      dout(7) << "osd_boot waiting for "
+              << (osdmap->osds.size() - osdmap->osd_inst.size())
+              << " osds to boot" << endl;
+    }
+
+    // nothing keeps a reference to m on this path, so free it before
+    // returning (the original returned without deleting it)
+    delete m;
+    return;
+  }
+
+  // already up?  mark down first?
+  if (osdmap->is_up(from)) {
+    assert(m->get_source_inst() > osdmap->osd_inst[from]);  // this better be newer!
+    pending.new_down[from] = osdmap->osd_inst[from];
+    accept_pending();
+  }
+
+  // mark up.
+  pending_out.erase(from);
+  assert(osdmap->is_down(from));
+  pending.new_up[from] = m->get_source_inst();
+
+  // mark in?
+  if (osdmap->out_osds.count(from))
+    pending.new_in.push_back(from);
+
+  accept_pending();
+
+  // the booting osd will spread word
+  send_incremental_map(m->sb.current_epoch, m->get_source());
+  delete m;
+
+  // tell mds
+  bcast_latest_osd_map_mds();
+}
+
+// The sending osd asks to be marked "in".  If the map currently has it out,
+// the change is accepted into a new map and the sender receives the
+// incrementals since the epoch it reported.  The message is always freed.
+void OSDMonitor::handle_osd_in(MOSDIn *m)
+{
+  dout(7) << "osd_in from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap->is_out(from)) {
+    pending.new_in.push_back(from);
+    accept_pending();
+    send_incremental_map(m->map_epoch, m->get_source());
+  }
+
+  // nobody else holds m; the original never freed it
+  delete m;
+}
+
+// The sending osd asks to be marked "out".  If the map currently has it in,
+// the change is accepted into a new map and the sender receives the
+// incrementals since the epoch it reported.  The message is always freed.
+void OSDMonitor::handle_osd_out(MOSDOut *m)
+{
+  dout(7) << "osd_out from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap->is_in(from)) {
+    pending.new_out.push_back(from);
+    accept_pending();
+    send_incremental_map(m->map_epoch, m->get_source());
+  }
+
+  // nobody else holds m; the original never freed it
+  delete m;
+}
+
+
+// Somebody asks for the osd map.  Before the first epoch exists the request
+// is parked under epoch 1; otherwise the requester gets incrementals newer
+// than its `since` epoch, or a full map when `since` is 0.
+void OSDMonitor::handle_osd_getmap(MOSDGetMap *m)
+{
+  dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl;
+
+  const bool have_map = (osdmap->get_epoch() != 0);
+
+  if (!have_map) {
+    // no map yet: remember the requester; send_map() serves it later
+    awaiting_map[1][m->get_source()] = m->get_since();
+  } else if (m->get_since()) {
+    send_incremental_map(m->get_since(), m->get_source());
+  } else {
+    send_full_map(m->get_source());
+  }
+
+  delete m;
+}
+
+
+
+// Turn the pending incremental into the next map epoch: store its encoding,
+// apply it to the in-memory map, log each state change (and tell the
+// messenger about up/down transitions), then start a fresh pending
+// incremental for the following epoch.
+void OSDMonitor::accept_pending()
+{
+  dout(-10) << "accept_pending " << osdmap->get_epoch() << " -> " << pending.epoch << endl;
+
+  // keep the encoded incremental under its epoch, then advance the map
+  pending.encode(inc_maps[pending.epoch]);
+  osdmap->apply_incremental(pending);
+
+  typedef map<int,entity_inst_t>::iterator inst_iter;
+  typedef list<int>::iterator osd_iter;
+
+  // tell me about it
+  for (inst_iter up = pending.new_up.begin(); up != pending.new_up.end(); ++up) {
+    dout(0) << "osd" << up->first << " UP " << up->second << endl;
+    derr(0) << "osd" << up->first << " UP " << up->second << endl;
+    messenger->mark_up(MSG_ADDR_OSD(up->first), up->second);
+  }
+
+  for (inst_iter dn = pending.new_down.begin(); dn != pending.new_down.end(); ++dn) {
+    dout(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    derr(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    messenger->mark_down(MSG_ADDR_OSD(dn->first), dn->second);
+  }
+
+  for (osd_iter o = pending.new_in.begin(); o != pending.new_in.end(); ++o) {
+    dout(0) << "osd" << *o << " IN" << endl;
+    derr(0) << "osd" << *o << " IN" << endl;
+  }
+
+  for (osd_iter o = pending.new_out.begin(); o != pending.new_out.end(); ++o) {
+    dout(0) << "osd" << *o << " OUT" << endl;
+    derr(0) << "osd" << *o << " OUT" << endl;
+  }
+
+  // clear new pending
+  pending = OSDMap::Incremental(osdmap->get_epoch() + 1);
+}
+
+// Serve everyone parked in awaiting_map for the current epoch: the waiter
+// list is detached from awaiting_map first, then each waiter is sent the
+// incrementals newer than the epoch it asked about.
+void OSDMonitor::send_map()
+{
+  const epoch_t cur = osdmap->get_epoch();
+  dout(10) << "send_map " << cur << endl;
+
+  // take the waiters out of the table before sending anything
+  map<msg_addr_t,epoch_t> waiters;
+  waiters.swap(awaiting_map[cur]);
+  awaiting_map.erase(cur);
+
+  map<msg_addr_t,epoch_t>::iterator w = waiters.begin();
+  while (w != waiters.end()) {
+    send_incremental_map(w->second, w->first);
+    ++w;
+  }
+}
+
+
+// Send `who` a message built from the complete current osd map.
+void OSDMonitor::send_full_map(msg_addr_t who)
+{
+  MOSDMap *full = new MOSDMap(osdmap);
+  messenger->send_message(full, who);
+}
+
+// Send `dest` every map epoch in (since, current].  For each epoch the
+// stored incremental is preferred, falling back to the stored full map; an
+// epoch with neither is a bug and trips the assert.  If `since` is not below
+// the current epoch the message goes out empty.
+void OSDMonitor::send_incremental_map(epoch_t since, msg_addr_t dest)
+{
+  dout(-10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
+            << " to " << dest << endl;
+
+  MOSDMap *m = new MOSDMap;
+
+  // walk from the newest epoch back down to (but excluding) `since`
+  for (epoch_t e = osdmap->get_epoch();
+       e > since;
+       e--) {
+    if (inc_maps.count(e)) {
+      dout(-10) << "send_incremental_map inc " << e << endl;
+      m->incremental_maps[e] = inc_maps[e];
+    } else if (maps.count(e)) {
+      dout(-10) << "send_incremental_map full " << e << endl;
+      m->maps[e] = maps[e];
+      //if (!full) break;
+    }
+    else {
+      assert(0);  // we should have all maps.
+    }
+  }
+
+  messenger->send_message(m, dest);
+}
+
+
+
+// Send the newest incremental (everything after the previous epoch) to each
+// of the g_conf.num_mds mds nodes.
+void OSDMonitor::bcast_latest_osd_map_mds()
+{
+  const epoch_t e = osdmap->get_epoch();
+  dout(1) << "bcast_latest_osd_map_mds epoch " << e << endl;
+
+  // tell mds
+  int rank = 0;
+  while (rank < g_conf.num_mds) {
+    //send_full_map(MSG_ADDR_MDS(rank));
+    send_incremental_map(e - 1, MSG_ADDR_MDS(rank));
+    ++rank;
+  }
+}
+
+// Send the newest incremental (everything after the previous epoch) to every
+// osd in the map that is not marked down.
+void OSDMonitor::bcast_latest_osd_map_osd()
+{
+  const epoch_t e = osdmap->get_epoch();
+  dout(1) << "bcast_latest_osd_map_osd epoch " << e << endl;
+
+  // tell osds
+  set<int> all;
+  osdmap->get_all_osds(all);
+
+  for (set<int>::iterator o = all.begin(); o != all.end(); ++o) {
+    if (!osdmap->is_down(*o)) {
+      send_incremental_map(e - 1, MSG_ADDR_OSD(*o));
+    }
+  }
+}
+
+
+
+// Periodic housekeeping: any osd that has sat in pending_out for at least
+// g_conf.mon_osd_down_out_interval seconds is marked out (one accepted map
+// per osd), then the next tick is scheduled.
+void OSDMonitor::tick()
+{
+  dout(10) << "tick" << endl;
+
+  // mark down osds out?
+  const utime_t now = g_clock.now();
+
+  // pass 1: collect the expired entries without modifying pending_out
+  list<int> expired;
+  map<int,utime_t>::iterator p = pending_out.begin();
+  while (p != pending_out.end()) {
+    utime_t down = now;
+    down -= p->second;
+
+    if (down.sec() >= g_conf.mon_osd_down_out_interval) {
+      dout(10) << "tick marking osd" << p->first << " OUT after " << down << " sec" << endl;
+      expired.push_back(p->first);
+    }
+    ++p;
+  }
+
+  // pass 2: act on them
+  for (list<int>::iterator o = expired.begin(); o != expired.end(); ++o) {
+    pending_out.erase(*o);
+    pending.new_out.push_back(*o);
+    accept_pending();
+  }
+
+  // next!
+  g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger));
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __OSDMONITOR_H
+#define __OSDMONITOR_H
+
+#include <time.h>
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+
+// Dispatcher that owns the osd map on the monitor side: incoming messages are
+// routed by dispatch() to the handle_* methods, map changes are collected in
+// `pending`, and maps are sent out through the Messenger.
+class OSDMonitor : public Dispatcher {
+ // me
+ int whoami;
+ Messenger *messenger;
+
+ // maps
+ OSDMap *osdmap; // current map; set to 0 by the constructor
+ map<epoch_t, bufferlist> maps; // encoded full maps, by epoch
+ map<epoch_t, bufferlist> inc_maps; // encoded incrementals, by epoch
+
+ OSDMap::Incremental pending; // changes queued for the next epoch
+
+ // epoch -> (requester -> the epoch the requester asked "since")
+ map<epoch_t, map<msg_addr_t, epoch_t> > awaiting_map;
+
+ // osd down -> out
+ map<int,utime_t> pending_out; // osd -> time it was reported down
+
+
+ void tick(); // check state, take actions
+
+ // maps
+ void accept_pending(); // accept pending, new map.
+ void send_map(); // send current map to waiters.
+ void send_full_map(msg_addr_t dest);
+ void send_incremental_map(epoch_t since, msg_addr_t dest);
+ void bcast_latest_osd_map_mds();
+ void bcast_latest_osd_map_osd();
+
+
+ public:
+ // w: stored as whoami; m: messenger used for all sends
+ OSDMonitor(int w, Messenger *m) :
+ whoami(w),
+ messenger(m),
+ osdmap(0) {
+ }
+
+ void init();
+
+ void dispatch(Message *m);
+ void handle_shutdown(Message *m);
+
+ void handle_osd_boot(class MOSDBoot *m);
+ void handle_osd_in(class MOSDIn *m);
+ void handle_osd_out(class MOSDOut *m);
+ void handle_osd_failure(class MOSDFailure *m);
+ void handle_osd_getmap(class MOSDGetMap *m);
+
+ void handle_ping_ack(class MPingAck *m);
+
+ // hack
+ void fake_osd_failure(int osd, bool down);
+ void fake_osdmap_update();
+ void fake_reorg();
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDCache.h"
+#include "MDStore.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDS.h"
+#include "MDSMap.h"
+#include "MDLog.h"
+#include "AnchorClient.h"
+#include "Migrator.h"
+#include "Renamer.h"
+
+#include "include/filepath.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "messages/MRenameWarning.h"
+#include "messages/MRenameNotify.h"
+#include "messages/MRenameNotifyAck.h"
+#include "messages/MRename.h"
+#include "messages/MRenameAck.h"
+#include "messages/MRenameReq.h"
+#include "messages/MRenamePrep.h"
+
+
+
+// Route a rename-protocol message to its handler.  Nothing runs after the
+// switch, so each recognised case returns directly; an unknown type trips
+// the assert.
+void Renamer::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+    // bystander notifications
+  case MSG_MDS_RENAMEWARNING:
+    handle_rename_warning((MRenameWarning*)m);
+    return;
+  case MSG_MDS_RENAMENOTIFY:
+    handle_rename_notify((MRenameNotify*)m);
+    return;
+  case MSG_MDS_RENAMENOTIFYACK:
+    handle_rename_notify_ack((MRenameNotifyAck*)m);
+    return;
+
+    // the rename itself
+  case MSG_MDS_RENAME:
+    handle_rename((MRename*)m);
+    return;
+  case MSG_MDS_RENAMEREQ:
+    handle_rename_req((MRenameReq*)m);
+    return;
+  case MSG_MDS_RENAMEPREP:
+    handle_rename_prep((MRenamePrep*)m);
+    return;
+  case MSG_MDS_RENAMEACK:
+    handle_rename_ack((MRenameAck*)m);
+    return;
+
+  default:
+    assert(0);
+  }
+}
+
+
+// renaming!
+
+
+/*
+ fix_renamed_dir():
+
+ caller has already:
+ - relinked inode in new location
+ - fixed in->is_auth()
+ - set dir_auth, if appropriate
+
+ caller has not:
+ - touched in->dir
+ - updated import/export tables
+*/
+// Repair the import/export bookkeeping for in->dir after the directory inode
+// `in` was renamed from srcdir to destdir.
+//
+// The work is a four-way case split on (in->dir->is_auth(), in->is_auth()),
+// each further split on `authchanged`:
+//   srcdir      - directory the dentry was renamed out of; only used to find
+//                 the old auth container
+//   in          - the renamed directory inode; in->dir is dereferenced
+//                 unconditionally, so callers must only call this with it open
+//   destdir     - destination directory (not read in this function)
+//   authchanged - whether the _inode's_ authority changed with the rename
+//   dir_auth    - dir auth to apply in the cases that need one; asserted
+//                 >= 0 where the dir becomes an export
+void Renamer::fix_renamed_dir(CDir *srcdir,
+ CInode *in,
+ CDir *destdir,
+ bool authchanged, // _inode_ auth
+ int dir_auth) // dir auth (for certain cases)
+{
+ dout(7) << "fix_renamed_dir on " << *in << endl;
+ dout(7) << "fix_renamed_dir on " << *in->dir << endl;
+
+ if (in->dir->is_auth()) {
+ // dir ours
+ dout(7) << "dir is auth" << endl;
+ assert(!in->dir->is_export());
+
+ if (in->is_auth()) {
+ // inode now ours
+
+ if (authchanged) {
+ // inode _was_ replica, now ours
+ dout(7) << "inode was replica, now ours. removing from import list." << endl;
+ assert(in->dir->is_import());
+
+ // not import anymore!
+ cache->imports.erase(in->dir);
+ in->dir->state_clear(CDIR_STATE_IMPORT);
+ in->dir->put(CDIR_PIN_IMPORT);
+
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+ // move the exports nested under in->dir to in's containing import
+ CDir *con = cache->get_auth_container(in->dir);
+ assert(con);
+ for (set<CDir*>::iterator p = cache->nested_exports[in->dir].begin();
+ p != cache->nested_exports[in->dir].end();
+ p++) {
+ dout(7) << "moving nested export under new container " << *con << endl;
+ cache->nested_exports[con].insert(*p);
+ }
+ cache->nested_exports.erase(in->dir);
+
+ } else {
+ // inode was ours, still ours.
+ dout(7) << "inode was ours, still ours." << endl;
+ assert(!in->dir->is_import());
+ assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT);
+
+ // move any exports nested beneath me?
+ CDir *newcon = cache->get_auth_container(in->dir);
+ assert(newcon);
+ CDir *oldcon = cache->get_auth_container(srcdir);
+ assert(oldcon);
+ if (newcon != oldcon) {
+ dout(7) << "moving nested exports under new container" << endl;
+ // collect first, then move, so nested_exports[oldcon] is not modified
+ // while find_nested_exports_under is working on it
+ set<CDir*> nested;
+ cache->find_nested_exports_under(oldcon, in->dir, nested);
+ for (set<CDir*>::iterator it = nested.begin();
+ it != nested.end();
+ it++) {
+ dout(7) << "moving nested export " << *it << " under new container" << endl;
+ cache->nested_exports[oldcon].erase(*it);
+ cache->nested_exports[newcon].insert(*it);
+ }
+ }
+ }
+
+ } else {
+ // inode now replica
+
+ if (authchanged) {
+ // inode was ours, but now replica
+ dout(7) << "inode was ours, now replica. adding to import list." << endl;
+
+ // i am now an import
+ cache->imports.insert(in->dir);
+ in->dir->state_set(CDIR_STATE_IMPORT);
+ in->dir->get(CDIR_PIN_IMPORT);
+
+ in->dir->set_dir_auth( mds->get_nodeid() );
+ dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+ // find old import
+ CDir *oldcon = cache->get_auth_container(srcdir);
+ assert(oldcon);
+ dout(7) << " oldcon is " << *oldcon << endl;
+
+ // move nested exports under me
+ set<CDir*> nested;
+ cache->find_nested_exports_under(oldcon, in->dir, nested);
+ for (set<CDir*>::iterator it = nested.begin();
+ it != nested.end();
+ it++) {
+ dout(7) << "moving nested export " << *it << " under me" << endl;
+ cache->nested_exports[oldcon].erase(*it);
+ cache->nested_exports[in->dir].insert(*it);
+ }
+
+ } else {
+ // inode was replica, still replica
+ dout(7) << "inode was replica, still replica. doing nothing." << endl;
+ assert(in->dir->is_import());
+
+ // verify dir_auth
+ assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir.
+ assert(in->authority() != in->dir->get_dir_auth()); // inode not me.
+ }
+
+ // either way: dir auth here with a replica inode means the dir is an import
+ assert(in->dir->is_import());
+ }
+
+ } else {
+ // dir is not ours
+ dout(7) << "dir is not auth" << endl;
+
+ if (in->is_auth()) {
+ // inode now ours
+
+ if (authchanged) {
+ // inode was replica, now ours
+ dout(7) << "inode was replica, now ours. now an export." << endl;
+ assert(!in->dir->is_export());
+
+ // now export
+ cache->exports.insert(in->dir);
+ in->dir->state_set(CDIR_STATE_EXPORT);
+ in->dir->get(CDIR_PIN_EXPORT);
+
+ assert(dir_auth >= 0); // better be defined
+ in->dir->set_dir_auth( dir_auth );
+ dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl;
+
+ CDir *newcon = cache->get_auth_container(in->dir);
+ assert(newcon);
+ cache->nested_exports[newcon].insert(in->dir);
+
+ } else {
+ // inode was ours, still ours
+ dout(7) << "inode was ours, still ours. did my import change?" << endl;
+
+ // sanity
+ assert(in->dir->is_export());
+ assert(in->dir->get_dir_auth() >= 0);
+ assert(in->dir->get_dir_auth() != in->authority());
+
+ // moved under new import?
+ // NOTE(review): unlike the other branches, oldcon/newcon are not
+ // asserted non-null here although newcon is dereferenced in the log line.
+ CDir *oldcon = cache->get_auth_container(srcdir);
+ CDir *newcon = cache->get_auth_container(in->dir);
+ if (oldcon != newcon) {
+ dout(7) << "moving myself under new import " << *newcon << endl;
+ cache->nested_exports[oldcon].erase(in->dir);
+ cache->nested_exports[newcon].insert(in->dir);
+ }
+ }
+
+ // either way: inode auth here with the dir elsewhere means the dir is an export
+ assert(in->dir->is_export());
+ } else {
+ // inode now replica
+
+ if (authchanged) {
+ // inode was ours, now replica
+ dout(7) << "inode was ours, now replica. removing from export list." << endl;
+ assert(in->dir->is_export());
+
+ // remove from export list
+ cache->exports.erase(in->dir);
+ in->dir->state_clear(CDIR_STATE_EXPORT);
+ in->dir->put(CDIR_PIN_EXPORT);
+
+ CDir *oldcon = cache->get_auth_container(srcdir);
+ assert(oldcon);
+ assert(cache->nested_exports[oldcon].count(in->dir) == 1);
+ cache->nested_exports[oldcon].erase(in->dir);
+
+ // simplify dir_auth
+ if (in->authority() == in->dir->authority()) {
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl;
+ } else {
+ assert(in->dir->get_dir_auth() >= 0); // someone else's export,
+ }
+
+ } else {
+ // inode was replica, still replica
+ // (despite the log text below, dir_auth IS rewritten in this branch)
+ dout(7) << "inode was replica, still replica. do nothing." << endl;
+
+ // fix dir_auth?
+ if (in->authority() == dir_auth)
+ in->dir->set_dir_auth( CDIR_AUTH_PARENT );
+ else
+ in->dir->set_dir_auth( dir_auth );
+ // note: this logs the dir_auth argument, not the value just stored
+ // (which is CDIR_AUTH_PARENT when the inode authority equals dir_auth)
+ dout(7) << " fixing dir_auth to be " << dir_auth << endl;
+
+ // do nothing.
+ }
+
+ assert(!in->dir->is_export());
+ }
+ }
+
+ cache->show_imports();
+}
+
+/*
+ * when initiator gets an ack back for a foreign rename
+ */
+
+// Completion context: when finished, calls Renamer::file_rename_ack() for
+// the remembered inode and initiator.  The finish() result code is ignored.
+class C_MDC_RenameNotifyAck : public Context {
+  Renamer *renamer;
+  CInode *inode;
+  int who_started;
+
+public:
+  C_MDC_RenameNotifyAck(Renamer *r,
+                        CInode *i, int init) {
+    renamer = r;
+    inode = i;
+    who_started = init;
+  }
+  void finish(int r) {
+    renamer->file_rename_ack(inode, who_started);
+  }
+};
+
+
+
+/************** initiator ****************/
+
+/*
+ * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.)
+ */
+// Completion context: when finished, calls Renamer::file_rename_finish()
+// with the remembered source dir, inode and caller context.  The finish()
+// result code is ignored.
+class C_MDC_RenameAck : public Context {
+  Renamer *renamer;
+  CDir *from_dir;
+  CInode *inode;
+  Context *onfinish;
+public:
+  C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) :
+    renamer(mdc),
+    from_dir(srcdir),
+    inode(in),
+    onfinish(c) {
+  }
+  void finish(int r) {
+    renamer->file_rename_finish(from_dir, inode, onfinish);
+  }
+};
+
+
+// Rename srcdn to destdn on behalf of the initiator (this mds).
+//
+// Both dentries must already be xlocked.  If this mds is not the authority
+// for both dentries the rename is "foreign": a MRenamePrep (to the dest auth)
+// or MRenameReq (to the src auth) is sent and `onfinish` is queued on the
+// inode's CINODE_WAIT_RENAMEACK waiters.  Otherwise the rename is applied to
+// the local cache, mds nodes that have either dir open are warned and
+// notified, and `onfinish` runs once file_rename_finish() is reached
+// (immediately when nobody needs notifying).
+void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish)
+{
+ assert(srcdn->is_xlocked()); // by me
+ assert(destdn->is_xlocked()); // by me
+
+ CDir *srcdir = srcdn->dir;
+ string srcname = srcdn->name;
+
+ CDir *destdir = destdn->dir;
+ string destname = destdn->name;
+
+ CInode *in = srcdn->inode;
+ //Message *req = srcdn->xlockedby;
+
+
+ // determine the players
+ int srcauth = srcdir->dentry_authority(srcdn->name);
+ int destauth = destdir->dentry_authority(destname);
+
+
+ // FOREIGN rename?
+ if (srcauth != mds->get_nodeid() ||
+ destauth != mds->get_nodeid()) {
+ dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl;
+
+ string destpath;
+ destdn->make_path(destpath);
+
+ if (destauth != mds->get_nodeid()) {
+ // make sure dest has dir open.
+ dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl;
+
+ // prep dest first, they must have the dir open! rest will follow.
+ string srcpath;
+ srcdn->make_path(srcpath);
+
+ MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator
+ srcdir->ino(), srcname, srcpath,
+ destdir->ino(), destname, destpath,
+ srcauth); // tell dest who src is (maybe even me)
+ mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
+
+ cache->show_imports();
+
+ }
+
+ else if (srcauth != mds->get_nodeid()) {
+ // The preceding `if` took every case where destauth is another mds, so
+ // destauth == mds->get_nodeid() always holds here and the "neither"
+ // log line below cannot be reached.
+ if (destauth == mds->get_nodeid()) {
+ dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl;
+ } else {
+ dout(7) << "file_rename neither src auth nor dest auth. sending MRenameReq" << endl;
+ }
+
+ // srcdn not important on destauth, just request
+ MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator
+ srcdir->ino(), srcname,
+ destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know)
+ mds->send_message_mds(m, srcauth, MDS_PORT_CACHE);
+ }
+
+ else
+ assert(0); // unreachable: the enclosing `if` guarantees one auth is foreign
+
+ // set waiter on the inode (is this the best place?)
+ in->add_waiter(CINODE_WAIT_RENAMEACK,
+ new C_MDC_RenameAck(this,
+ srcdir, in, onfinish));
+ return;
+ }
+
+ // LOCAL rename!
+ assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid());
+ dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl;
+
+ // update our cache
+ if (destdn->inode && destdn->inode->is_dirty())
+ destdn->inode->mark_clean();
+
+ cache->rename_file(srcdn, destdn);
+
+ // update imports/exports?
+ if (in->is_dir() && in->dir)
+ fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change
+
+ // mark dentries dirty
+ srcdn->mark_dirty();
+ destdn->mark_dirty();
+ in->mark_dirty();
+
+
+ // local, restrict notify to ppl with open dirs
+ // (union of srcdir's and destdir's open_by sets; the count() test is
+ // redundant for a set but harmless)
+ set<int> notify = srcdir->get_open_by();
+ for (set<int>::iterator it = destdir->open_by_begin();
+ it != destdir->open_by_end();
+ it++)
+ if (notify.count(*it) == 0) notify.insert(*it);
+
+ if (notify.size()) {
+ // warn + notify
+ file_rename_warn(in, notify);
+ file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid());
+
+ // wait for MRenameNotifyAck's
+ in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK,
+ new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator
+
+ // wait for finish
+ in->add_waiter(CINODE_WAIT_RENAMEACK,
+ new C_MDC_RenameAck(this, srcdir, in, onfinish));
+ } else {
+ // sweet, no notify necessary, we're done!
+ file_rename_finish(srcdir, in, onfinish);
+ }
+}
+
+// MRenameAck arrived: the rename of the named inode is complete, so wake
+// whoever waits on CINODE_WAIT_RENAMEACK and free the message.
+void Renamer::handle_rename_ack(MRenameAck *m)
+{
+  CInode *renamed = cache->get_inode(m->get_ino());
+  assert(renamed);
+
+  dout(7) << "handle_rename_ack on " << *renamed << endl;
+
+  // all done!
+  renamed->finish_waiting(CINODE_WAIT_RENAMEACK);
+
+  delete m;
+}
+
+// Final step of a rename on the initiator: export srcdir back if it is a
+// non-root import that is now empty, then complete and free the caller's
+// context (if one was given).
+void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c)
+{
+  dout(10) << "file_rename_finish on " << *in << endl;
+
+  // did i empty out an imported dir?  FIXME this check should go somewhere else???
+  const bool emptied_import =
+    srcdir->is_import() &&
+    !srcdir->inode->is_root() &&
+    srcdir->get_size() == 0;
+  if (emptied_import) {
+    cache->migrator->export_empty_import(srcdir);
+  }
+
+  // finish our caller
+  if (!c)
+    return;
+  c->finish(0);
+  delete c;
+}
+
+
+/************* src **************/
+
+
+/** handle_rename_req
+ * received by auth of src dentry (from init, or destauth if dir).
+ * src may not have dest dir open.
+ * src will export inode, unlink|rename, and send MRename to dest.
+ */
+// Received by the authority of the source dentry.  Looks the dentry up
+// locally and hands it to file_rename_foreign_src(), then frees the message.
+// Each lookup step is asserted before it is dereferenced, so a violated
+// "i am auth, i will have it" assumption fails at the assert instead of on a
+// null pointer.
+void Renamer::handle_rename_req(MRenameReq *m)
+{
+  // i am auth, i will have it.
+  CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+  assert(srcdiri);
+  CDir *srcdir = srcdiri->dir;
+  assert(srcdir);
+  CDentry *srcdn = srcdir->lookup(m->get_srcname());
+  assert(srcdn);
+
+  // do it
+  file_rename_foreign_src(srcdn,
+                          m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(),
+                          m->get_initiator());
+  delete m;
+}
+
+
+// Source-side half of a foreign rename: encode the inode's state, send it to
+// destauth in a MRename, apply the rename to the local cache, mark the inode
+// as a proxy, and warn every mds other than this one and destauth.
+//
+//   srcdn                   - source dentry; its inode must exist and be auth here
+//   destdirino / destname   - destination dentry, by dir ino and name
+//   destpath                - destination path, used to discover the dentry
+//                             when it is not in the local cache
+//   destauth                - mds that receives the MRename
+//   initiator               - mds that started the rename; passed on in the
+//                             MRename and to the notify-ack waiter
+void Renamer::file_rename_foreign_src(CDentry *srcdn,
+ inodeno_t destdirino, string& destname, string& destpath, int destauth,
+ int initiator)
+{
+ dout(7) << "file_rename_foreign_src " << *srcdn << endl;
+
+ CDir *srcdir = srcdn->dir;
+ string srcname = srcdn->name;
+
+ // (we're basically exporting this inode)
+ CInode *in = srcdn->inode;
+ assert(in);
+ assert(in->is_auth());
+
+ if (in->is_dir()) cache->show_imports();
+
+ // encode and export inode state
+ bufferlist inode_state;
+ cache->migrator->encode_export_inode(in, inode_state, destauth);
+
+ // send
+ MRename *m = new MRename(initiator,
+ srcdir->ino(), srcdn->name, destdirino, destname,
+ inode_state);
+ mds->send_message_mds(m, destauth, MDS_PORT_CACHE);
+
+ // have dest?
+ // NOTE(review): m is read here after it was handed to send_message_mds();
+ // confirm the messenger cannot have freed it yet.  The values presumably
+ // equal the destdirino/destname arguments m was built from.
+ CInode *destdiri = cache->get_inode(m->get_destdirino());
+ CDir *destdir = 0;
+ if (destdiri) destdir = destdiri->dir;
+ CDentry *destdn = 0;
+ if (destdir) destdn = destdir->lookup(m->get_destname());
+
+ // discover src
+ if (!destdn) {
+ dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl;
+
+ // NOTE(review): the message given to path_traverse / C_MDS_RetryMessage is
+ // the MRename that was already sent to destauth above, not the request
+ // that triggered this call - confirm this is intended.  Note also that
+ // this path returns after the inode state was encoded and sent, without
+ // updating the local cache.
+ filepath destfilepath = destpath;
+ vector<CDentry*> trace;
+ int r = cache->path_traverse(destfilepath, trace, true,
+ m, new C_MDS_RetryMessage(mds, m),
+ MDS_TRAVERSE_DISCOVER);
+ assert(r>0);
+ return;
+ }
+
+ assert(destdn);
+
+ // update our cache
+ cache->rename_file(srcdn, destdn);
+
+ // update imports/exports?
+ if (in->is_dir() && in->dir)
+ fix_renamed_dir(srcdir, in, destdir, true); // auth changed
+
+ srcdn->mark_dirty();
+
+ // proxy!
+ // (the proxy state and pin are released again in file_rename_ack())
+ in->state_set(CINODE_STATE_PROXY);
+ in->get(CINODE_PIN_PROXY);
+
+ // generate notify list (everybody but src|dst) and send warnings
+ set<int> notify;
+ for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+ if (i != mds->get_nodeid() && // except the source
+ i != destauth) // and the dest
+ notify.insert(i);
+ }
+ file_rename_warn(in, notify);
+
+
+ // wait for MRenameNotifyAck's
+ in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK,
+ new C_MDC_RenameNotifyAck(this, in, initiator));
+}
+
+// Record `notify` as the set of mds nodes whose MRenameNotifyAck is still
+// outstanding for this inode, and send each of them a MRenameWarning.
+void Renamer::file_rename_warn(CInode *in,
+                               set<int>& notify)
+{
+  // note gather list (handle_rename_notify_ack() shrinks it as acks arrive)
+  rename_waiting_for_ack[in->ino()] = notify;
+
+  // send
+  set<int>::iterator who = notify.begin();
+  while (who != notify.end()) {
+    dout(10) << "file_rename_warn to " << *who << " for " << *in << endl;
+    mds->send_message_mds(new MRenameWarning(in->ino()), *who, MDS_PORT_CACHE);
+    ++who;
+  }
+}
+
+
+// One bystander acknowledged the rename notify.  The sender is removed from
+// the inode's gather set; when the set becomes empty the entry is dropped
+// and the CINODE_WAIT_RENAMENOTIFYACK waiters run.  The message is freed.
+void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m)
+{
+  CInode *in = cache->get_inode(m->get_ino());
+  assert(in);
+  dout(7) << "handle_rename_notify_ack on " << *in << endl;
+
+  int source = m->get_source().num();
+  rename_waiting_for_ack[in->ino()].erase(source);
+  if (rename_waiting_for_ack[in->ino()].empty()) {
+    // last one!
+    rename_waiting_for_ack.erase(in->ino());
+    in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0);
+  } else {
+    dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl;
+  }
+
+  // nothing retains m (the original leaked it); the other handlers in this
+  // file free their message the same way
+  delete m;
+}
+
+
+// Runs once every MRenameNotifyAck has arrived.  Drops the proxy state and
+// pin if the inode carried them, then either finishes the rename locally
+// (this mds is the initiator) or reports back to the initiator.
+void Renamer::file_rename_ack(CInode *in, int initiator)
+{
+  // we got all our MNotifyAck's.
+
+  // was i proxy (if not, it's cuz this was a local rename)
+  const bool was_proxy = in->state_test(CINODE_STATE_PROXY);
+  if (was_proxy) {
+    dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl;
+    in->state_clear(CINODE_STATE_PROXY);
+    in->put(CINODE_PIN_PROXY);
+  }
+
+  // done!
+  if (initiator != mds->get_nodeid()) {
+    // somebody else started this; send ack
+    dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl;
+    mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE);
+    return;
+  }
+
+  // it's me, finish
+  dout(7) << "file_rename_ack i am initiator, finishing" << endl;
+  in->finish_waiting(CINODE_WAIT_RENAMEACK);
+}
+
+
+
+
+/************ dest *************/
+
+/** handle_rename_prep
+ * received by auth of dest dentry to make sure they have src + dir open.
+ * this is so that when they get the inode and dir, they can update exports etc properly.
+ * will send MRenameReq to src.
+ */
+// Received by the dest auth before a foreign rename.  Makes sure the source
+// inode (and, for a directory, its dir) is open here, pins the source inode,
+// and then forwards a MRenameReq to the source auth.  While something still
+// has to be fetched the function returns early with m handed to a
+// C_MDS_RetryMessage; m is only freed once the request has gone out.
+void Renamer::handle_rename_prep(MRenamePrep *m)
+{
+  // open src
+  filepath srcpath = m->get_srcpath();
+  vector<CDentry*> trace;
+  const int r = cache->path_traverse(srcpath, trace, false,
+                                     m, new C_MDS_RetryMessage(mds, m),
+                                     MDS_TRAVERSE_DISCOVER);
+  if (r > 0) {
+    return;   // traverse not finished; m stays with the retry context
+  }
+
+  // ok!  the last dentry of the trace is the source
+  CInode *srcin = trace.back()->inode;
+  assert(srcin);
+
+  dout(7) << "handle_rename_prep have srcin " << *srcin << endl;
+
+  const bool isdir = srcin->is_dir();
+  if (isdir && !srcin->dir) {
+    dout(7) << "handle_rename_prep need to open dir" << endl;
+    cache->open_remote_dir(srcin,
+                           new C_MDS_RetryMessage(mds,m));
+    return;
+  }
+  if (isdir) {
+    dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl;
+  }
+
+  // pin (handle_rename() drops this pin again)
+  srcin->get(CINODE_PIN_RENAMESRC);
+
+  // send rename request
+  MRenameReq *req = new MRenameReq(m->get_initiator(),   // pass the original initiator along
+                                   m->get_srcdirino(), m->get_srcname(),
+                                   m->get_destdirino(), m->get_destname(), m->get_destpath(),
+                                   mds->get_nodeid());    // i am dest
+  mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE);
+
+  delete m;
+}
+
+
+
+/** handle_rename
+ * received by auth of dest dentry. includes exported inode info.
+ * dest may not have srcdir open.
+ */
+// Received by the authority of the dest dentry; the message carries the
+// exported inode state.  The replica is renamed into place, the inode state
+// is imported, import/export bookkeeping is fixed for directories, the
+// rename-source pin is dropped, and every mds other than the source and this
+// one is notified.  The message is freed at the end.
+//
+// Every pointer obtained from the cache is asserted before it is used; the
+// original copied srcdn->name and destdn->name before checking those
+// pointers.
+void Renamer::handle_rename(MRename *m)
+{
+  // srcdn (required)
+  CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+  assert(srcdiri);
+  CDir *srcdir = srcdiri->dir;
+  assert(srcdir);
+  CDentry *srcdn = srcdir->lookup(m->get_srcname());
+  assert(srcdn && srcdn->inode);
+  string srcname = srcdn->name;
+
+  dout(7) << "handle_rename srcdn " << *srcdn << endl;
+
+  // destdn (required).  i am auth, so i will have it.
+  CInode *destdiri = cache->get_inode(m->get_destdirino());
+  assert(destdiri);
+  CDir *destdir = destdiri->dir;
+  assert(destdir);
+  CDentry *destdn = destdir->lookup(m->get_destname());
+  assert(destdn);
+  string destname = destdn->name;
+
+  dout(7) << "handle_rename destdn " << *destdn << endl;
+
+  // note old dir auth (must be read before the rename below)
+  int old_dir_auth = -1;
+  if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority();
+
+  // rename replica into position
+  if (destdn->inode && destdn->inode->is_dirty())
+    destdn->inode->mark_clean();
+
+  cache->rename_file(srcdn, destdn);
+
+  // decode + import inode (into new location start)
+  int off = 0;
+  // HACK
+  bufferlist bufstate;
+  bufstate.claim_append(m->get_inode_state());
+  cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num());
+
+  CInode *in = destdn->inode;
+  assert(in);
+
+  // update imports/exports?
+  if (in->is_dir()) {
+    assert(in->dir);  // i had better already have it open.. see MRenamePrep
+    fix_renamed_dir(srcdir, in, destdir, true,  // auth changed
+                    old_dir_auth);              // src is possibly new dir auth.
+  }
+
+  // mark dirty
+  destdn->mark_dirty();
+  in->mark_dirty();
+
+  // unpin (pin taken in handle_rename_prep)
+  in->put(CINODE_PIN_RENAMESRC);
+
+  // ok, send notifies.
+  set<int> notify;
+  for (int i=0; i<mds->get_mds_map()->get_num_mds(); i++) {
+    if (i != m->get_source().num() &&  // except the source
+        i != mds->get_nodeid())        // and the dest
+      notify.insert(i);
+  }
+  file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num());
+
+  delete m;
+}
+
+
+// Send a MRenameNotify describing the rename (source dir/name, destination
+// dir, its path and name, and the source auth) to every mds in `notify`.
+void Renamer::file_rename_notify(CInode *in,
+                                 CDir *srcdir, string& srcname, CDir *destdir, string& destname,
+                                 set<int>& notify,
+                                 int srcauth)
+{
+  /* NOTE: notify list might include myself */
+
+  // the destination dir's path is the same for every recipient
+  string destdirpath;
+  destdir->inode->make_path(destdirpath);
+
+  // tell
+  set<int>::iterator who = notify.begin();
+  while (who != notify.end()) {
+    dout(10) << "file_rename_notify to " << *who << " for " << *in << endl;
+
+    MRenameNotify *note = new MRenameNotify(in->ino(),
+                                            srcdir->ino(),
+                                            srcname,
+                                            destdir->ino(),
+                                            destdirpath,
+                                            destname,
+                                            srcauth);
+    mds->send_message_mds(note, *who, MDS_PORT_CACHE);
+    ++who;
+  }
+}
+
+
+
+/************** bystanders ****************/
+
+// A rename warning for an inode arrived.  The warning is recorded; if the
+// matching notify was parked earlier (it arrived first), that notify is
+// processed now and removed from the parked set.  The warning is freed.
+void Renamer::handle_rename_warning(MRenameWarning *m)
+{
+  // add to warning list
+  stray_rename_warnings.insert( m->get_ino() );
+
+  // did i already see the notify?
+  const bool notify_parked = (stray_rename_notifies.count(m->get_ino()) != 0);
+
+  if (!notify_parked) {
+    dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl;
+  } else {
+    // i did, we're good.
+    dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl;
+
+    handle_rename_notify(stray_rename_notifies[m->get_ino()]);
+    stray_rename_notifies.erase(m->get_ino());
+  }
+
+  // done
+  delete m;
+}
+
+
+void Renamer::handle_rename_notify(MRenameNotify *m)
+{ /* Apply a rename described by an MRenameNotify to the local cache and ack it.
+   * If the matching warning has not been seen, the message is parked in
+   * stray_rename_notifies and nothing else happens.  If only the source dentry
+   * is cached, the destination dir is opened/discovered and the message is
+   * retried later (no ack yet).  Otherwise an MRenameNotifyAck is sent to
+   * m->get_srcauth(), the warning entry is erased and m is deleted. */
+ // FIXME: when we do hard links, i think we need to
+ // have srcdn and destdn both, or neither, always!
+
+ // did i see the warning yet?
+ if (!stray_rename_warnings.count(m->get_ino())) {
+ // no: park the notify (keyed by ino) until handle_rename_warning replays it.
+ dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." << endl;
+ stray_rename_notifies[m->get_ino()] = m;
+ return;
+ }
+
+ dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl;
+
+ // src: look up dir inode -> open dir -> dentry; each stays 0 if not cached
+ CInode *srcdiri = cache->get_inode(m->get_srcdirino());
+ CDir *srcdir = 0;
+ if (srcdiri) srcdir = srcdiri->dir;
+ CDentry *srcdn = 0;
+ if (srcdir) srcdn = srcdir->lookup(m->get_srcname());
+
+ // dest: same lookup chain for the destination
+ CInode *destdiri = cache->get_inode(m->get_destdirino());
+ CDir *destdir = 0;
+ if (destdiri) destdir = destdiri->dir;
+ CDentry *destdn = 0;
+ if (destdir) destdn = destdir->lookup(m->get_destname());
+
+ // have both?
+ list<Context*> finished; // never filled in this function; queued (empty) below
+ if (srcdn && destdir) {
+ CInode *in = srcdn->inode;
+
+ int old_dir_auth = -1; // dir authority before the rename, -1 if the inode has no open dir
+ if (in && in->dir) old_dir_auth = in->dir->authority();
+
+ if (!destdn) {
+ destdn = destdir->add_dentry(m->get_destname()); // create null dentry
+ destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked!
+ }
+
+ dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl;
+
+ if (in) {
+ cache->rename_file(srcdn, destdn);
+
+ // update imports/exports?
+ if (in && in->is_dir() && in->dir) {
+ fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change
+ }
+ } else {
+ dout(7) << " i don't have the inode (just null dentries)" << endl;
+ }
+
+ }
+
+ else if (srcdn) { // source dentry cached, destination dir not open
+ dout(7) << "handle_rename_notify no dest, but have src" << endl;
+ dout(7) << "srcdn is " << *srcdn << endl;
+
+ if (destdiri) {
+ dout(7) << "have destdiri, opening dir " << *destdiri << endl;
+ cache->open_remote_dir(destdiri,
+ new C_MDS_RetryMessage(mds,m));
+ } else {
+ filepath destdirpath = m->get_destdirpath();
+ dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl;
+
+ vector<CDentry*> trace;
+ int r = cache->path_traverse(destdirpath, trace, true,
+ m, new C_MDS_RetryMessage(mds, m),
+ MDS_TRAVERSE_DISCOVER);
+ assert(r>0); // the code relies on the traverse being delayed, since m is retried via the context
+ }
+ return; // m stays alive for the retry; no ack is sent on this pass
+ }
+
+ else if (destdn) { // only the destination dentry is cached
+ dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl;
+ if (destdn->inode) {
+ destdir->unlink_inode(destdn);
+ }
+ }
+
+ else {
+ dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl;
+ assert(srcdn == 0 && destdn == 0);
+ }
+
+ mds->queue_finished(finished);
+
+
+ // ack
+ dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl;
+ MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino());
+ mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE);
+
+
+ stray_rename_warnings.erase( m->get_ino() ); // warning/notify pair fully consumed
+ delete m;
+}
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_RENAMER_H
+#define __MDS_RENAMER_H
+
+#include "include/types.h"
+
+#include <map>
+#include <set>
+using std::map;
+using std::set;
+
+class MDS;
+class MDCache;
+class CDentry;
+class CInode;
+class CDir;
+
+class Message;
+class MRenameWarning;
+class MRenameNotify;
+class MRenameNotifyAck;
+class MRename;
+class MRenamePrep;
+class MRenameReq;
+class MRenameAck;
+
+class Renamer { /* Rename protocol between MDS nodes.  Methods are grouped by
+   * the role this node plays: initiator, src, dest, or bystander.  The
+   * direction comments on handlers give sender -> receiver. */
+ MDS *mds;
+ MDCache *cache;
+
+ // rename bookkeeping
+ set<inodeno_t> stray_rename_warnings; // inos with a warning received whose notify has not been fully processed yet
+ map<inodeno_t, MRenameNotify*> stray_rename_notifies; // notifies that arrived before their warning, keyed by ino
+
+ map<inodeno_t, set<int> > rename_waiting_for_ack; // NOTE(review): presumably ino -> MDSs still owing a notify ack; confirm in Renamer.cc
+
+
+
+ void fix_renamed_dir(CDir *srcdir,
+ CInode *in,
+ CDir *destdir,
+ bool authchanged, // _inode_ auth changed
+ int dirauth=-1); // dirauth (for certain cases)
+
+
+public:
+ Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {}
+
+ void dispatch(Message *m);
+
+ // RENAME
+ // initiator
+ public:
+ void file_rename(CDentry *srcdn, CDentry *destdn, Context *c);
+ protected:
+ void handle_rename_ack(MRenameAck *m); // dest -> init (almost always)
+ void file_rename_finish(CDir *srcdir, CInode *in, Context *c);
+ friend class C_MDC_RenameAck;
+
+ // src
+ void handle_rename_req(MRenameReq *m); // dest -> src
+ void file_rename_foreign_src(CDentry *srcdn,
+ inodeno_t destdirino, string& destname, string& destpath, int destauth,
+ int initiator);
+ void file_rename_warn(CInode *in, set<int>& notify);
+ void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src
+ void file_rename_ack(CInode *in, int initiator);
+ friend class C_MDC_RenameNotifyAck;
+
+ // dest
+ void handle_rename_prep(MRenamePrep *m); // init -> dest
+ void handle_rename(MRename *m); // src -> dest
+ void file_rename_notify(CInode *in,
+ CDir *srcdir, string& srcname, CDir *destdir, string& destname,
+ set<int>& notify, int srcauth); // sends MRenameNotify to each MDS in notify
+
+ // bystander
+ void handle_rename_warning(MRenameWarning *m); // src -> bystanders
+ void handle_rename_notify(MRenameNotify *m); // dest -> bystanders
+
+
+};
+
+#endif
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "Renamer.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+#include "messages/MHashReaddir.h"
+#include "messages/MHashReaddirReply.h"
+
+#include "messages/MLock.h"
+
+#include "messages/MInodeLink.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+
+#include "include/filepath.h"
+#include "common/Timer.h"
+#include "common/Logger.h"
+#include "common/LogType.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
+#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
+
+
+void Server::dispatch(Message *m)
+{
+  // Entry point for messages addressed to the server: park them until the
+  // MDS is active, then route by message type.
+  if (!mds->is_active()) {
+    dout(3) << "not active yet, waiting" << endl;
+    mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  switch (m->get_type()) {
+    // client sessions
+  case MSG_CLIENT_MOUNT:
+    handle_client_mount((MClientMount*)m);
+    return;
+  case MSG_CLIENT_UNMOUNT:
+    handle_client_unmount(m);
+    return;
+
+    // client metadata operations
+  case MSG_CLIENT_REQUEST:
+    handle_client_request((MClientRequest*)m);
+    return;
+
+    // hashed-directory readdir traffic between MDSs
+  case MSG_MDS_HASHREADDIR:
+    handle_hash_readdir((MHashReaddir*)m);
+    return;
+  case MSG_MDS_HASHREADDIRREPLY:
+    handle_hash_readdir_reply((MHashReaddirReply*)m);
+    return;
+
+  default:
+    break;
+  }
+
+  // nothing above claimed the message
+  dout(1) << " main unknown message " << m->get_type() << endl;
+  assert(0);
+}
+
+
+
+
+
+void Server::handle_client_mount(MClientMount *m)
+{
+  // Register the mounting client, then acknowledge with the current
+  // MDS map and OSD map.
+  const int client = m->get_source().num();
+  dout(3) << "mount by client" << client << endl;
+  mds->clientmap.add_mount(client, m->get_source_inst());
+
+  // mds0 mounts/unmounts
+  assert(mds->get_nodeid() == 0);
+
+  // ack, then release the request
+  MClientMountAck *ack = new MClientMountAck(m, mds->mdsmap, mds->osdmap);
+  messenger->send_message(ack, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+void Server::handle_client_unmount(Message *m)
+{
+  // Drop the client's mount record; when the last client leaves, begin MDS
+  // shutdown.  The incoming message itself is sent back as the ack.
+  const int client = m->get_source().num();
+  dout(3) << "unmount by client" << client << endl;
+
+  // mds0 mounts/unmounts
+  assert(mds->get_nodeid() == 0);
+
+  mds->clientmap.rem_mount(client);
+
+  const bool nobody_left = mds->clientmap.get_mount_set().empty();
+  if (nobody_left) {
+    dout(3) << "all clients done, initiating shutdown" << endl;
+    mds->shutdown_start();
+  }
+
+  // m is handed to the messenger, so take a copy of the source inst first
+  entity_inst_t reply_to = m->get_source_inst();
+  messenger->send_message(m, m->get_source(), reply_to);
+}
+
+
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+
+/** C_MDS_CommitRequest
+ */
+
+class C_MDS_CommitRequest : public Context {
+  // Completion that finishes a client request: with an event it calls
+  // Server::commit_request, without one it replies directly.
+  Server *server;
+  MClientRequest *req;
+  MClientReply *reply;
+  CInode *tracei;   // inode to include a trace for
+  LogEvent *event;
+
+public:
+  C_MDS_CommitRequest(Server *srv,
+                      MClientRequest *request, MClientReply *rep, CInode *trace_inode,
+                      LogEvent *ev=0)
+    : server(srv), req(request), reply(rep), tracei(trace_inode), event(ev) {}
+
+  void finish(int r) {
+    // a nonzero result is a failure: stamp it on the reply before sending
+    if (r != 0)
+      reply->set_result(r);
+
+    if (!event) {
+      // nothing to journal, just reply.
+      server->reply_request(req, reply, tracei);
+      return;
+    }
+    server->commit_request(req, reply, tracei, event);
+  }
+};
+
+
+/*
+ * send generic response (just an error code)
+ */
+void Server::reply_request(MClientRequest *req, int r, CInode *tracei)
+{
+  // wrap the bare result code in a fresh reply and let the full variant send it
+  MClientReply *reply = new MClientReply(req, r);
+  reply_request(req, reply, tracei);
+}
+
+
+/*
+ * send given reply
+ * include a trace to tracei
+ */
+void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei)
+{
+  // Send `reply` to the requesting client (optionally carrying a trace to
+  // tracei), then retire the request.
+  dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl;
+
+  // attach the trace when one was asked for
+  if (tracei)
+    reply->set_trace_dist( tracei, mds->get_nodeid() );
+
+  // ship it to the client that issued the request
+  messenger->send_message(reply,
+                          MSG_ADDR_CLIENT(req->get_client()),
+                          req->get_client_inst());
+
+  // the request is finished as far as the cache is concerned
+  mdcache->request_finish(req);
+
+  // stupid stats crap (FIXME)
+  stat_ops++;
+}
+
+
+/*
+ * commit event(s) to the metadata journal, then reply.
+ * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply)
+ */
+void Server::commit_request(MClientRequest *req,
+                            MClientReply *reply,
+                            CInode *tracei,
+                            LogEvent *event,
+                            LogEvent *event2)
+{
+  // Journal the given event(s), then reply, either after the log has synced
+  // (safe mode) or right away.
+
+  // log
+  if (event) mdlog->submit_entry(event);
+  if (event2) mdlog->submit_entry(event2);
+
+  const bool safe_mode = g_conf.mds_log_before_reply && g_conf.mds_log && event;
+  if (!safe_mode) {
+    // just reply
+    reply_request(req, reply, tracei);
+    return;
+  }
+
+  // SAFE mode: pin the inode so it doesn't go away while we wait...
+  if (tracei)
+    mdcache->request_pin_inode(req, tracei);
+
+  // ...and reply only once the log has synced
+  mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei));
+}
+
+
+
+/***
+ * process a client request
+ */
+
+void Server::handle_client_request(MClientRequest *req)
+{ /* Resolve the inode a client request refers to (by ino for fh-style ops,
+   * otherwise by path traversal), register the request with the cache, and
+   * hand it to dispatch_request.  May instead forward, delay/retry, reply
+   * with a traverse error, or discard the request. */
+ dout(4) << "req " << *req << endl;
+
+ // note original client addr (only when the message came straight from a client)
+ if (req->get_source().is_client()) {
+ req->set_client_inst( req->get_source_inst() );
+ req->clear_payload();
+ }
+
+ if (!mds->is_active()) {
+ dout(5) << " not active, discarding client request." << endl;
+ delete req;
+ return;
+ }
+
+ if (!mdcache->get_root()) {
+ dout(5) << "need to open root" << endl;
+ mdcache->open_root(new C_MDS_RetryMessage(mds, req)); // req is retried once root is open
+ return;
+ }
+
+ // okay, i want
+ CInode *ref = 0; // inode the op will be dispatched against
+ vector<CDentry*> trace; // might be blank, for fh guys
+
+ bool follow_trailing_symlink = false; // NOTE(review): never set to true in this function
+
+ // operations on fh's or other non-files
+ switch (req->get_op()) {
+ /*
+ case MDS_OP_FSTAT:
+ reply = handle_client_fstat(req, cur);
+ break; ****** fiX ME ***
+ */
+
+ case MDS_OP_TRUNCATE:
+ if (!req->get_ino()) break; // can be called w/ either fh OR path
+ // (with an ino: falls through to the by-ino lookup below)
+ case MDS_OP_RELEASE:
+ case MDS_OP_FSYNC:
+ ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed?
+
+ if (!ref) {
+ int next = mds->get_nodeid() + 1; // try the next MDS, wrapping around to 0
+ if (next >= mds->mdsmap->get_num_mds()) next = 0;
+ dout(10) << "got request on ino we don't have, passing buck to " << next << endl;
+ mds->send_message_mds(req, next, MDS_PORT_SERVER);
+ return;
+ }
+ }
+
+ if (!ref) {
+ // we need to traverse a path
+ filepath refpath = req->get_filepath();
+
+ // ops on non-existing files --> directory paths
+ switch (req->get_op()) {
+ case MDS_OP_OPEN:
+ if (!(req->get_iarg() & O_CREAT)) break; // plain open: traverse the full path
+ // (open with O_CREAT: falls through and is treated like the creating ops)
+ case MDS_OP_MKNOD:
+ case MDS_OP_MKDIR:
+ case MDS_OP_SYMLINK:
+ case MDS_OP_LINK:
+ case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!!
+ case MDS_OP_RMDIR:
+ case MDS_OP_RENAME:
+ // remove last bit of path
+ refpath = refpath.prefixpath(refpath.depth()-1);
+ break;
+ }
+ dout(10) << "refpath = " << refpath << endl;
+
+ Context *ondelay = new C_MDS_RetryMessage(mds, req);
+
+ if (req->get_op() == MDS_OP_LSTAT) {
+ follow_trailing_symlink = false; // no-op as written: the flag is already false
+ }
+
+ // do trace
+ int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink,
+ req, ondelay,
+ MDS_TRAVERSE_FORWARD,
+ 0,
+ true); // is MClientRequest
+
+ if (r > 0) return; // delayed
+ if (r == -ENOENT ||
+ r == -ENOTDIR ||
+ r == -EISDIR) { // only these three codes are handled here; any other r falls through
+ // error!
+ dout(10) << " path traverse error " << r << ", replying" << endl;
+
+ // send error
+ messenger->send_message(new MClientReply(req, r),
+ MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+
+ // <HACK>
+ // is this a special debug command? (last path bit starts with ".ceph." and only that bit failed)
+ if (refpath.depth() - 1 == trace.size() &&
+ refpath.last_bit().find(".ceph.") == 0) {
+ CDir *dir = 0;
+ if (trace.empty())
+ dir = mdcache->get_root()->dir;
+ else
+ dir = trace[trace.size()-1]->get_inode()->dir; // NOTE(review): looks like this can be null if the dir isn't open; confirm
+
+ dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl;
+
+ if (refpath.last_bit() == ".ceph.hash" &&
+ refpath.depth() > 1) {
+ dout(1) << "got explicit hash command " << refpath << endl;
+ CDir *dir = trace[trace.size()-1]->get_inode()->dir; // shadows the outer dir
+ if (!dir->is_hashed() &&
+ !dir->is_hashing() &&
+ dir->is_auth())
+ mdcache->migrator->hash_dir(dir);
+ }
+ else if (refpath.last_bit() == ".ceph.commit") {
+ dout(1) << "got explicit commit command on " << *dir << endl;
+ mds->mdstore->commit_dir(dir, 0);
+ }
+ }
+ // </HACK>
+
+
+ delete req; // error reply already sent above
+ return;
+ }
+
+ if (trace.size())
+ ref = trace[trace.size()-1]->inode; // last dentry on the traversed path
+ else
+ ref = mdcache->get_root(); // empty trace means the root itself
+ }
+
+ dout(10) << "ref is " << *ref << endl;
+
+ // rename doesn't pin src path (initially)
+ if (req->get_op() == MDS_OP_RENAME) trace.clear();
+
+ // register
+ if (!mdcache->request_start(req, ref, trace))
+ return;
+
+ // process
+ dispatch_request(req, ref);
+}
+
+
+
+void Server::dispatch_request(Message *m, CInode *ref)
+{
+  /* Route a registered request to its handler.
+   *
+   * Both client requests and MLock messages arrive here, because both take
+   * initial dentry xlocks, path pins, etc. and so share the retry context
+   * C_MDS_RetryRequest.  MLocks go to the locker; client requests are
+   * dispatched on their op code against the already-resolved inode `ref`.
+   */
+  MClientRequest *req = 0;
+
+  switch (m->get_type()) {
+  case MSG_MDS_LOCK:
+    mds->locker->handle_lock_dn((MLock*)m);
+    return;   // done
+
+  case MSG_CLIENT_REQUEST:
+    req = (MClientRequest*)m;
+    break;    // continue below!
+
+  default:
+    assert(0);  // shouldn't get here
+  }
+
+  // MClientRequest.
+  // (MDS_OP_FSYNC and MDS_OP_RELEASE have no handler here and hit the default.)
+  switch (req->get_op()) {
+
+    // files
+  case MDS_OP_OPEN:
+    if (req->get_iarg() & O_CREAT) {
+      handle_client_openc(req, ref);
+    } else {
+      handle_client_open(req, ref);
+    }
+    break;
+  case MDS_OP_TRUNCATE:
+    handle_client_truncate(req, ref);
+    break;
+
+    // inodes
+  case MDS_OP_STAT:
+  case MDS_OP_LSTAT:
+    handle_client_stat(req, ref);
+    break;
+  case MDS_OP_UTIME:
+    handle_client_utime(req, ref);
+    break;
+  case MDS_OP_CHMOD:
+    handle_client_chmod(req, ref);
+    break;
+  case MDS_OP_CHOWN:
+    handle_client_chown(req, ref);
+    break;
+
+    // namespace
+  case MDS_OP_READDIR:
+    handle_client_readdir(req, ref);
+    break;
+  case MDS_OP_MKNOD:
+    handle_client_mknod(req, ref);
+    break;
+  case MDS_OP_MKDIR:
+    handle_client_mkdir(req, ref);
+    break;
+  case MDS_OP_SYMLINK:
+    handle_client_symlink(req, ref);
+    break;
+  case MDS_OP_LINK:
+    handle_client_link(req, ref);
+    break;
+  case MDS_OP_UNLINK:
+  case MDS_OP_RMDIR:
+    // rmdir shares the unlink handler
+    handle_client_unlink(req, ref);
+    break;
+  case MDS_OP_RENAME:
+    handle_client_rename(req, ref);
+    break;
+
+  default:
+    dout(1) << " unknown client op " << req->get_op() << endl;
+    assert(0);
+  }
+}
+
+
+
+
+// STAT
+
+void Server::handle_client_stat(MClientRequest *req,
+                                CInode *ref)
+{
+  // Answer a stat/lstat on `ref`.  When the caller asked for size or mtime
+  // the file read lock is taken (and immediately released) first.
+  const int mask = req->get_iarg();
+  const bool wants_file_info = (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) != 0;
+
+  if (wants_file_info) {
+    // full stat.
+    if (!mds->locker->inode_file_read_start(ref, req))
+      return;   // syncing
+    mds->locker->inode_file_read_finish(ref);
+  }
+  // otherwise nothing extra is needed
+
+  mds->balancer->hit_inode(ref, META_POP_IRD);
+
+  // reply
+  dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
+  reply_request(req, new MClientReply(req), ref);
+}
+
+
+
+// INODE UPDATES
+
+// utime
+
+void Server::handle_client_utime(MClientRequest *req,
+                                 CInode *cur)
+{
+  // Set mtime/atime on `cur` from the request's two time arguments, under
+  // the file write lock, then journal the update and reply.
+  const bool got_lock = mds->locker->inode_file_write_start(cur, req);
+  if (!got_lock)
+    return;   // fw or (wait for) sync
+
+  // apply the new times
+  cur->inode.mtime = req->get_targ();
+  cur->inode.atime = req->get_targ2();
+  if (cur->is_auth()) {
+    cur->mark_dirty();
+  }
+
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // success reply
+  MClientReply *reply = new MClientReply(req, 0);
+  reply->set_result(0);
+
+  // journal + reply
+  EInodeUpdate *update = new EInodeUpdate(cur);
+  commit_request(req, reply, cur, update);
+}
+
+
+
+// HARD
+
+// chmod
+
+void Server::handle_client_chmod(MClientRequest *req,
+                                 CInode *cur)
+{
+  // Replace the permission bits of `cur` with those in the request's integer
+  // argument, under the hard write lock, then journal the update and reply.
+  // File-type bits of the stored mode are left untouched.
+
+  // write
+  if (!mds->locker->inode_hard_write_start(cur, req))
+    return;  // fw or (wait for) lock
+
+  // check permissions
+  // (not implemented yet)
+
+  // do update
+  // All twelve permission bits: setuid (04000), setgid (02000), sticky
+  // (01000) and rwx for user/group/other (0777).  The previous mask, 04777,
+  // left out setgid and sticky, so chmod could neither set nor clear them.
+  const int perm_mask = 07777;
+  int mode = req->get_iarg();
+  cur->inode.mode &= ~perm_mask;
+  cur->inode.mode |= (mode & perm_mask);
+  cur->mark_dirty();
+
+  mds->locker->inode_hard_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // start reply
+  MClientReply *reply = new MClientReply(req, 0);
+
+  // commit
+  commit_request(req, reply, cur,
+                 new EInodeUpdate(cur));
+}
+
+// chown
+
+void Server::handle_client_chown(MClientRequest *req,
+                                 CInode *cur)
+{
+  // Set the owner and group of `cur` from the request's two integer
+  // arguments, under the hard write lock, then journal the update and reply.
+  const bool got_lock = mds->locker->inode_hard_write_start(cur, req);
+  if (!got_lock)
+    return;   // fw or (wait for) lock
+
+  // check permissions
+  // (not implemented yet)
+
+  // apply new ownership
+  int new_uid = req->get_iarg();
+  int new_gid = req->get_iarg2();
+  cur->inode.uid = new_uid;
+  cur->inode.gid = new_gid;
+  cur->mark_dirty();
+
+  mds->locker->inode_hard_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // journal + reply
+  MClientReply *reply = new MClientReply(req, 0);
+  EInodeUpdate *update = new EInodeUpdate(cur);
+  commit_request(req, reply, cur, update);
+}
+
+
+
+bool Server::try_open_dir(CInode *in, MClientRequest *req)
+{
+  // Make sure in->dir is open.  Returns false (after queueing a retry of req
+  // on the parent dir's unfreeze) when the dir is not open and the inode's
+  // dir is frozen; returns true otherwise.
+  const bool frozen_and_closed = !in->dir && in->is_frozen_dir();
+
+  if (frozen_and_closed) {
+    // doh!
+    dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl;
+    assert(in->get_parent_dir());
+
+    Context *retry = new C_MDS_RetryRequest(mds, req, in);
+    in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, retry);
+    return false;
+  }
+
+  in->get_or_open_dir(mds);
+  return true;
+}
+
+
+// DIRECTORY and NAMESPACE OPS
+
+// READDIR
+
+int Server::encode_dir_contents(CDir *dir,
+                                list<InodeStat*>& inls,
+                                list<string>& dnls)
+{
+  // Append one (name, InodeStat) pair to dnls/inls for every non-null dentry
+  // in `dir`.  For a hashed dir only the dentries that hash to this MDS are
+  // included.  Returns the number of entries appended.
+  int added = 0;
+
+  CDir_map_t::iterator cursor = dir->begin();
+  for ( ; cursor != dir->end(); ++cursor) {
+    CDentry *dn = cursor->second;
+
+    // hashed?  skip dentries that belong to another MDS.
+    const bool foreign = dir->is_hashed() &&
+      mds->get_nodeid() != mds->hash_dentry( dir->ino(), cursor->first );
+    if (foreign)
+      continue;
+
+    // is dentry readable?
+    if (dn->is_xlocked()) {
+      // ***** FIXME *****
+      // ?
+      dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl;
+    }
+
+    CInode *in = dn->inode;
+    if (in == 0)
+      continue;   // null dentry?
+
+    dout(12) << "including inode " << *in << endl;
+
+    // add this item
+    // note: InodeStat makes note of whether inode data is readable.
+    dnls.push_back( cursor->first );
+    inls.push_back( new InodeStat(in, mds->get_nodeid()) );
+    ++added;
+  }
+
+  return added;
+}
+
+
+/*
+ * note: this is pretty sloppy, but should work just fine i think...
+ */
+void Server::handle_hash_readdir(MHashReaddir *m)
+{
+  // Peer side of a hashed readdir: another MDS asks for our share of a hashed
+  // directory.  Encode the local dentries and send them back in an
+  // MHashReaddirReply.  If the dir contents are incomplete they are fetched
+  // first and the message is retried.
+  CInode *cur = mdcache->get_inode(m->get_ino());
+  assert(cur);
+
+  if (!cur->dir ||
+      !cur->dir->is_hashed()) {
+    assert(0);
+    dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl;
+    delete m;
+    return;
+  }
+  CDir *dir = cur->dir;
+  assert(dir);
+  assert(dir->is_hashed());
+
+  // complete?
+  if (!dir->is_complete()) {
+    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl;
+    // m is owned by the retry context from here on
+    mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  // get content
+  list<InodeStat*> inls;
+  list<string> dnls;
+  int num = encode_dir_contents(dir, inls, dnls);
+
+  // send it back!
+  // NOTE(review): the request was sent on MDS_PORT_SERVER but the reply goes
+  // to MDS_PORT_CACHE; confirm the reply is routed to Server::dispatch.
+  messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num),
+                          m->get_source(), m->get_source_inst(), MDS_PORT_CACHE);
+
+  // done with the request; it was previously leaked on this path
+  delete m;
+}
+
+
+void Server::handle_hash_readdir_reply(MHashReaddirReply *m)
+{
+  // Collect one peer's share of a hashed readdir.  Once every MDS has
+  // contributed, run the waiters of the current gather (they copy the
+  // results), free the gathered items, unpin the dir and wake any requests
+  // waiting to start the next gather.
+  CInode *cur = mdcache->get_inode(m->get_ino());
+  assert(cur);
+
+  if (!cur->dir ||
+      !cur->dir->is_hashed()) {
+    assert(0);
+    dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl;
+    delete m;
+    return;
+  }
+  CDir *dir = cur->dir;
+  assert(dir);
+  assert(dir->is_hashed());
+
+  // move this peer's items into its slot of the gather
+  int from = m->get_source().num();
+  assert(dir->hashed_readdir.count(from) == 0);
+  pair< list<InodeStat*>, list<string> >& slot = dir->hashed_readdir[from];
+  slot.first.splice(slot.first.begin(), m->get_in());
+  slot.second.splice(slot.second.begin(), m->get_dn());
+  delete m;
+
+  // gather finished?
+  const bool all_in = !(dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds());
+  if (!all_in) {
+    dout(7) << "still waiting for more hashed readdir bits" << endl;
+    return;
+  }
+
+  dout(7) << "got last bit! finishing waiters" << endl;
+
+  // do these finishers.  they'll copy the results.
+  list<Context*> waiters;
+  dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, waiters);
+  finish_contexts(waiters);
+
+  // now discard these results
+  typedef map<int, pair< list<InodeStat*>, list<string> > > gather_t;
+  for (gather_t::iterator part = dir->hashed_readdir.begin();
+       part != dir->hashed_readdir.end();
+       ++part) {
+    list<InodeStat*>& stats = part->second.first;
+    for (list<InodeStat*>::iterator st = stats.begin(); st != stats.end(); ++st)
+      delete *st;
+  }
+  dir->hashed_readdir.clear();
+
+  // unpin dir (we're done!)
+  dir->auth_unpin();
+
+  // trigger any waiters for next hashed readdir cycle
+  dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue);
+}
+
+
+class C_MDS_HashReaddir : public Context {
+  // Completion that calls Server::finish_hash_readdir for the stored
+  // request and dir.
+  Server *server;
+  MClientRequest *req;
+  CDir *dir;
+public:
+  C_MDS_HashReaddir(Server *srv, MClientRequest *request, CDir *d)
+    : server(srv), req(request), dir(d) {}
+
+  // the result code is not used
+  void finish(int r) {
+    server->finish_hash_readdir(req, dir);
+  }
+};
+
+void Server::finish_hash_readdir(MClientRequest *req, CDir *dir)
+{
+  // All shares of a hashed readdir have arrived: copy every MDS's items into
+  // a single reply and send it to the client.
+  dout(7) << "finish_hash_readdir on " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds());
+
+  // reply!
+  MClientReply *reply = new MClientReply(req);
+  reply->set_result(0);
+
+  // copy (not move) each share; the caller frees the gathered items afterwards
+  int rank = 0;
+  while (rank < mds->mdsmap->get_num_mds()) {
+    reply->copy_dir_items(dir->hashed_readdir[rank].first,
+                          dir->hashed_readdir[rank].second);
+    rank++;
+  }
+
+  // ok!
+  reply_request(req, reply, dir->inode);
+}
+
+
+void Server::handle_client_readdir(MClientRequest *req,
+ CInode *cur)
+{ /* List the directory `cur` for a client.  Replies -ENOTDIR for non-dirs,
+   * forwards to the dir authority when this MDS is not it, and waits/retries
+   * while the dir is unhashing, unreadable, incomplete or not auth-pinnable.
+   * A hashed dir starts a gather across all MDSs (the reply is sent later by
+   * finish_hash_readdir); a non-hashed dir is answered right here. */
+ // it's a directory, right?
+ if (!cur->is_dir()) {
+ // not a dir
+ dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl;
+ reply_request(req, -ENOTDIR);
+ return;
+ }
+
+ // auth?
+ if (!cur->dir_is_auth()) {
+ int dirauth = cur->authority(); // fall back to the inode's authority...
+ if (cur->dir)
+ dirauth = cur->dir->authority(); // ...unless the dir is open and can tell us itself
+ assert(dirauth >= 0);
+ assert(dirauth != mds->get_nodeid());
+
+ // forward to authority
+ dout(10) << " forwarding readdir to authority " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ if (!try_open_dir(cur, req))
+ return;
+ assert(cur->dir->is_auth());
+
+ // unhashing? wait!
+ if (cur->dir->is_hashed() &&
+ cur->dir->is_unhashing()) {
+ dout(10) << "unhashing, waiting" << endl;
+ cur->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ // check perm (hard read lock is taken and released immediately)
+ if (!mds->locker->inode_hard_read_start(cur,req))
+ return;
+ mds->locker->inode_hard_read_finish(cur);
+
+ CDir *dir = cur->dir;
+ assert(dir);
+
+ if (!dir->is_complete()) {
+ // fetch the dir contents, then retry this request
+ dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl;
+ mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ if (dir->is_hashed()) {
+ // HASHED
+ dout(7) << "hashed dir" << endl;
+ if (!dir->can_auth_pin()) {
+ dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl;
+ dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ if (!dir->hashed_readdir.empty()) { // a non-empty gather map means one is already running
+ dout(7) << "another readdir gather in progres, waiting" << endl;
+ dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ // start new readdir gather
+ dout(7) << "staring new hashed readdir gather" << endl;
+
+ // pin auth for process! (released in handle_hash_readdir_reply)
+ dir->auth_pin();
+
+ // get local bits
+ encode_dir_contents(cur->dir,
+ dir->hashed_readdir[mds->get_nodeid()].first,
+ dir->hashed_readdir[mds->get_nodeid()].second);
+
+ // request other bits from every other MDS
+ for (int i=0; i<mds->mdsmap->get_num_mds(); i++) {
+ if (i == mds->get_nodeid()) continue;
+ mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER);
+ }
+
+ // wait
+ dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR,
+ new C_MDS_HashReaddir(this, req, dir));
+ } else {
+ // NON-HASHED
+ // build dir contents
+ list<InodeStat*> inls;
+ list<string> dnls;
+ int numfiles = encode_dir_contents(cur->dir, inls, dnls);
+
+ // . too (only this non-hashed branch adds the "." entry)
+ dnls.push_back(".");
+ inls.push_back(new InodeStat(cur, mds->get_nodeid()));
+ ++numfiles;
+
+ // yay, reply
+ MClientReply *reply = new MClientReply(req);
+ reply->take_dir_items(inls, dnls, numfiles);
+
+ dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl;
+ reply->set_result(0);
+
+ //balancer->hit_dir(cur->dir);
+
+ // reply
+ reply_request(req, reply, cur);
+ }
+}
+
+
+// MKNOD
+
+void Server::handle_client_mknod(MClientRequest *req, CInode *ref)
+{
+  // Create a regular file named by the request's path inside directory `ref`,
+  // then journal the creation and reply.
+
+  // make dentry and inode, link.  (mknod() returns 0 when it did not create
+  // an inode: it replied, forwarded or queued a retry.)
+  CInode *created = mknod(req, ref);
+  if (created == 0)
+    return;
+
+  // it's a file!  take the mode from the request, but force the type bits.
+  created->inode.mode = req->get_iarg();
+  created->inode.mode &= ~INODE_TYPE_MASK;
+  created->inode.mode |= INODE_MODE_FILE;
+
+  mds->balancer->hit_inode(created, META_POP_IWR);
+
+  // commit
+  MClientReply *reply = new MClientReply(req, 0);
+  EMknod *journal_entry = new EMknod(created);
+  commit_request(req, reply, ref, journal_entry);
+}
+
+// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical.
+
+CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist)
+{ /* Create a new inode and link it under the last component of the request's
+   * path inside directory inode `diri`.
+   * Returns the new inode (marked dirty, not journaled here), or the existing
+   * inode when the name already exists and okexist is set.
+   * Returns 0 when nothing was created: an error reply was sent, the request
+   * was forwarded to another MDS, or a retry was queued. */
+ dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl;
+
+ // get containing directory (without last bit)
+ filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1);
+ string name = req->get_filepath().last_bit(); // the new entry's name
+
+ // did we get to parent?
+ dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl;
+
+ // make sure parent is a dir?
+ if (!diri->is_dir()) {
+ dout(7) << "not a dir" << endl;
+ reply_request(req, -ENOTDIR);
+ return 0;
+ }
+
+ // am i not open, not auth?
+ if (!diri->dir && !diri->is_auth()) {
+ int dirauth = diri->authority();
+ dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return 0;
+ }
+
+ if (!try_open_dir(diri, req)) return 0; // a retry was queued by try_open_dir
+ CDir *dir = diri->dir;
+
+ // make sure it's my dentry
+ int dnauth = dir->dentry_authority(name);
+ if (dnauth != mds->get_nodeid()) {
+ // fw
+
+ dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl;
+ mdcache->request_forward(req, dnauth);
+ return 0;
+ }
+ // ok, done passing buck.
+
+
+ // frozen?
+ if (dir->is_frozen()) {
+ dout(7) << "dir is frozen " << *dir << endl;
+ dir->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ // make sure name doesn't already exist
+ CDentry *dn = dir->lookup(name);
+ if (dn) {
+ if (!dn->can_read(req)) {
+ dout(10) << "waiting on (existing!) dentry " << *dn << endl;
+ dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ if (!dn->is_null()) {
+ // name already exists
+ if (okexist) {
+ dout(10) << "dentry " << name << " exists in " << *dir << endl;
+ return dn->inode; // caller accepts the existing inode
+ } else {
+ dout(10) << "dentry " << name << " exists in " << *dir << endl;
+ reply_request(req, -EEXIST);
+ return 0;
+ }
+ }
+ } // from here on dn is either 0 or a readable null dentry
+
+ // make sure dir is complete
+ if (!dir->is_complete()) {
+ dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl;
+ mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ // create!
+ CInode *newi = mdcache->create_inode();
+ newi->inode.uid = req->get_caller_uid();
+ newi->inode.gid = req->get_caller_gid();
+ newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime(); // now
+
+ // link: reuse the null dentry if there is one, otherwise add a new dentry
+ if (!dn)
+ dn = dir->add_dentry(name, newi);
+ else
+ dir->link_inode(dn, newi);
+
+ // bump modify pop
+ mds->balancer->hit_dir(dir, META_POP_DWR);
+
+ // mark dirty
+ dn->mark_dirty();
+ newi->mark_dirty();
+
+ // journal it (left to the caller)
+ //mdlog->submit_entry(new EMknod(newi));
+
+ // ok!
+ return newi;
+}
+
+
+// LINK
+
+class C_MDS_LinkTraverse : public Context {
+  // Completion for the link-target traversal: hands the traverse result and
+  // the filled-in trace to Server::handle_client_link_2.
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+public:
+  // filled in by the path traversal this context is attached to
+  vector<CDentry*> trace;
+
+  C_MDS_LinkTraverse(Server *srv, MClientRequest *request, CInode *dir_inode)
+    : server(srv), req(request), ref(dir_inode) {}
+
+  void finish(int r) {
+    server->handle_client_link_2(r, req, ref, trace);
+  }
+};
+
+void Server::handle_client_link(MClientRequest *req, CInode *ref)
+{ /* First half of a hard link: validate the new name inside directory `ref`
+   * (must be a dir, dentry must be ours, name must not exist), pin the dir,
+   * then start a discover traversal of the link target given in the request's
+   * string argument.  handle_client_link_2 continues via C_MDS_LinkTraverse. */
+ // figure out name
+ string dname = req->get_filepath().last_bit();
+ dout(7) << "dname is " << dname << endl;
+
+ // make sure parent is a dir?
+ if (!ref->is_dir()) {
+ dout(7) << "not a dir " << *ref << endl;
+ reply_request(req, -EINVAL);
+ return;
+ }
+
+ // am i not open, not auth?
+ if (!ref->dir && !ref->is_auth()) {
+ int dirauth = ref->authority();
+ dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ if (!try_open_dir(ref, req)) return; // a retry was queued by try_open_dir
+ CDir *dir = ref->dir;
+ dout(7) << "handle_client_link dir is " << *dir << endl;
+
+
+
+ // make sure it's my dentry
+ int dauth = dir->dentry_authority(dname);
+ if (dauth != mds->get_nodeid()) {
+ // fw
+ dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl;
+ mdcache->request_forward(req, dauth);
+ return;
+ }
+ // ok, done passing buck.
+
+
+ // exists? (a non-null dentry, or one xlocked by another request, counts as existing)
+ CDentry *dn = dir->lookup(dname);
+ if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
+ dout(7) << "handle_client_link dn exists " << *dn << endl;
+ reply_request(req, -EEXIST);
+ return;
+ }
+
+ // keep src dir in memory
+ mdcache->request_pin_dir(req, dir);
+
+ // discover link target
+ filepath target = req->get_sarg();
+
+ dout(7) << "handle_client_link discovering target " << target << endl;
+
+ C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); // its trace vector receives the traversal result
+ Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+
+ mdcache->path_traverse(target, onfinish->trace, false,
+ req, ondelay,
+ MDS_TRAVERSE_DISCOVER, //XLOCK,
+ onfinish); // return value is not checked here
+}
+
+
+class C_MDS_RemoteLink : public Context {
+  // Completion for a link operation, keyed on the result code:
+  //   r > 0   success, finish the link
+  //   r == 0  not expected (asserts), would retry the request
+  //   r < 0   failure, reply to the client with r
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+  CDentry *dn;
+  CInode *targeti;
+public:
+  C_MDS_RemoteLink(Server *srv, MClientRequest *request, CInode *dir_inode, CDentry *dentry, CInode *target)
+    : server(srv), req(request), ref(dir_inode), dn(dentry), targeti(target) {}
+
+  void finish(int r) {
+    if (r < 0) {
+      // link failed
+      server->reply_request(req, r);
+      return;
+    }
+    if (r == 0) {
+      // huh?  retry!
+      assert(0);
+      server->dispatch_request(req, ref);
+      return;
+    }
+    // yay
+    server->handle_client_link_finish(req, ref, dn, targeti);
+  }
+};
+
+void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace)
+{ /*
+ * Second stage of a client LINK request, entered with the result of the
+ * traverse to the link target.  Validates the target, xlocks the new
+ * dentry, increments the target's nlink (directly when this MDS is the
+ * target's authority, otherwise by sending MInodeLink to the authority and
+ * waiting), then calls handle_client_link_finish.
+ *
+ *   r     - traverse result; a negative value is replied to the client as-is,
+ *           anything else must be 0
+ *   req   - the client request; its filepath's last component is the new name
+ *   ref   - inode of the directory receiving the new dentry (ref->dir must be open)
+ *   trace - dentries from the target traverse; the last one names the target,
+ *           an empty trace means the target is the root inode
+ */
+ // target dne? (a negative traverse result goes straight back to the client)
+ if (r < 0) {
+ dout(7) << "target " << req->get_sarg() << " dne" << endl;
+ reply_request(req, r);
+ return;
+ }
+ assert(r == 0);
+
+ CInode *targeti = mdcache->get_root();
+ if (trace.size()) targeti = trace[trace.size()-1]->inode;
+ assert(targeti);
+
+ // dir? (a directory target is refused with -EINVAL)
+ dout(7) << "target is " << *targeti << endl;
+ if (targeti->is_dir()) {
+ dout(7) << "target is a dir, failing" << endl;
+ reply_request(req, -EINVAL);
+ return;
+ }
+
+ // keep target inode in memory
+ mdcache->request_pin_inode(req, targeti);
+
+ dout(7) << "dir is " << *ref << endl;
+
+ // xlock the dentry (after re-checking that this MDS is still the dentry's authority)
+ CDir *dir = ref->dir;
+ assert(dir);
+
+ string dname = req->get_filepath().last_bit();
+ int dauth = dir->dentry_authority(dname);
+ if (mds->get_nodeid() != dauth) {
+ // ugh, exported out from under us
+ dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl;
+ mdcache->request_forward(req, dauth);
+ return;
+ }
+
+ CDentry *dn = dir->lookup(dname);
+ if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { // same existence test as in handle_client_link
+ dout(7) << "handle_client_link dn exists " << *dn << endl;
+ reply_request(req, -EEXIST);
+ return;
+ }
+
+ if (!dn) dn = dir->add_dentry(dname);
+
+ if (!dn->is_xlockedbyme(req)) {
+ if (!mds->locker->dentry_xlock_start(dn, req, ref)) {
+ if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); // xlock not granted: drop the dentry again if it is still clean, null and sync
+ return; // presumably dentry_xlock_start has arranged for the request to be retried -- confirm
+ }
+ }
+
+
+ // ok xlocked!
+ if (targeti->is_auth()) {
+ // mine
+ if (targeti->is_anchored()) {
+ dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl;
+ } else {
+ assert(targeti->inode.nlink == 1);
+ dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl;
+
+ mdcache->anchor_inode(targeti,
+ new C_MDS_RetryRequest(mds, req, ref));
+ return; // the retry context re-dispatches the whole request
+ }
+
+ // ok, inc link!
+ targeti->inode.nlink++;
+ dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl;
+ targeti->mark_dirty();
+
+ } else {
+ // remote: send nlink++ request, wait
+ dout(7) << "target is remote, sending InodeLink" << endl;
+ mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority(), MDS_PORT_CACHE);
+
+ // wait: C_MDS_RemoteLink is queued on the target inode under CINODE_WAIT_LINK
+ targeti->add_waiter(CINODE_WAIT_LINK,
+ new C_MDS_RemoteLink(this, req, ref, dn, targeti));
+ return;
+ }
+
+ handle_client_link_finish(req, ref, dn, targeti);
+}
+
+// Final step of a LINK request: attaches the target to the (already
+// xlocked) dentry and commits the request with a success reply.
+//
+//   req     - client request being completed
+//   ref     - inode passed through to commit_request as the trace inode
+//   dn      - the new dentry
+//   targeti - inode the dentry will point at
+void Server::handle_client_link_finish(MClientRequest *req, CInode *ref,
+                                       CDentry *dn, CInode *targeti)
+{
+  // link by inode number in the directory first, then hook up the in-memory
+  // inode too, since we have it
+  dn->dir->link_inode(dn, targeti->ino());
+  dn->link_remote(targeti);
+  dn->mark_dirty();
+
+  // report the hit (META_POP_DWR) on the containing dir to the balancer
+  mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+
+  // done!  no log event is attached yet.
+  MClientReply *ok = new MClientReply(req, 0);
+  LogEvent *noevent = 0;   // FIXME i should log something
+  commit_request(req, ok, ref, noevent);
+}
+
+
+// UNLINK
+
+// Serves both MDS_OP_UNLINK and MDS_OP_RMDIR client requests.
+//
+//   req  - the client request; the last component of its filepath names the
+//          dentry to remove
+//   diri - inode of the directory expected to contain that dentry
+//
+// Every path either replies to the client with an error, forwards the
+// request to another MDS, parks it on a waiter / retry context, or, once the
+// dentry is xlocked, calls mdcache->dentry_unlink() with a
+// C_MDS_CommitRequest carrying the success reply.
+void Server::handle_client_unlink(MClientRequest *req,
+                                  CInode *diri)
+{
+  // rmdir or unlink
+  bool rmdir = false;
+  if (req->get_op() == MDS_OP_RMDIR) rmdir = true;
+
+  // find it
+  if (req->get_filepath().depth() == 0) {
+    dout(7) << "can't rmdir root" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  string name = req->get_filepath().last_bit();
+
+  // make sure parent is a dir?
+  if (!diri->is_dir()) {
+    dout(7) << "not a dir" << endl;
+    reply_request(req, -ENOTDIR);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!diri->dir && !diri->is_auth()) {
+    int dirauth = diri->authority();
+    dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+
+  if (!try_open_dir(diri, req)) return;
+  CDir *dir = diri->dir;
+  int dnauth = dir->dentry_authority(name);
+
+  // does it exist?
+  CDentry *dn = dir->lookup(name);
+  if (!dn) {
+    if (dnauth == mds->get_nodeid()) {
+      dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl;
+      reply_request(req, -ENOENT);
+    } else {
+      // send to authority!
+      dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl;
+      mdcache->request_forward(req, dnauth);
+    }
+    return;
+  }
+
+  // have it.  locked?
+  if (!dn->can_read(req)) {
+    dout(10) << " waiting on " << *dn << endl;
+    dir->add_waiter(CDIR_WAIT_DNREAD,
+                    name,
+                    new C_MDS_RetryRequest(mds, req, diri));
+    return;
+  }
+
+  // null?
+  if (dn->is_null()) {
+    dout(10) << "unlink on null dn " << *dn << endl;
+    reply_request(req, -ENOENT);
+    return;
+  }
+
+  // ok!
+  // NOTE(review): if a remote dentry can be non-null while dn->inode is
+  // still unset, this assert fires before the "is this a remote link?" code
+  // further down gets a chance to open the inode -- confirm.
+  CInode *in = dn->inode;
+  assert(in);
+  if (rmdir) {
+    dout(7) << "handle_client_rmdir on dir " << *in << endl;
+  } else {
+    dout(7) << "handle_client_unlink on non-dir " << *in << endl;
+  }
+
+  // dir stuff
+  if (in->is_dir()) {
+    if (rmdir) {
+      // rmdir
+
+      // open dir?
+      if (in->is_auth() && !in->dir) {
+        if (!try_open_dir(in, req)) return;
+      }
+
+      // not dir auth?  (or not open, which implies the same!)
+      if (!in->dir) {
+        dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl;
+        mdcache->request_forward(req, dnauth);
+        return;
+      }
+      if (!in->dir->is_auth()) {
+        int dirauth = in->dir->authority();
+        // log the node we actually forward to (this used to print dnauth)
+        dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dirauth << endl;
+        mdcache->request_forward(req, dirauth);
+        return;
+      }
+
+      assert(in->dir);
+      assert(in->dir->is_auth());
+
+      // dir size check on dir auth (but not necessarily dentry auth)?
+
+      // should be empty.  a size of 0 only proves emptiness once the dir is
+      // complete, so fetch it first if it is not.
+      if (in->dir->get_size() == 0 && !in->dir->is_complete()) {
+        dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl;
+        mds->mdstore->fetch_dir(in->dir,
+                                new C_MDS_RetryRequest(mds, req, diri));
+        return;
+      }
+      if (in->dir->get_size() > 0) {
+        dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl;
+        reply_request(req, -ENOTEMPTY);
+        return;
+      }
+
+      dout(7) << "handle_client_rmdir dir is empty!" << endl;
+
+      // export sanity check
+      if (!in->is_auth()) {
+        // i should be exporting this now/soon, since the dir is empty.
+        dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl;
+        if (!in->dir->is_freezing() && in->dir->is_frozen()) {
+          assert(in->dir->is_import());
+          mdcache->migrator->export_empty_import(in->dir);
+        } else {
+          dout(7) << "apparently already exporting" << endl;
+        }
+        in->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+                            new C_MDS_RetryRequest(mds, req, diri));
+        return;
+      }
+
+    } else {
+      // unlink
+      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl;
+      reply_request(req, -EISDIR);
+      return;
+    }
+  } else {
+    if (rmdir) {
+      // rmdir on something that is not a directory
+      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl;
+      reply_request(req, -ENOTDIR);
+      return;
+    }
+  }
+
+  // am i dentry auth?
+  if (dnauth != mds->get_nodeid()) {
+    // not auth; forward!
+    dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl;
+    mdcache->request_forward(req, dnauth);
+    return;
+  }
+
+  dout(7) << "handle_client_unlink/rmdir on " << *in << endl;
+
+  // xlock dentry
+  if (!mds->locker->dentry_xlock_start(dn, req, diri))
+    return;
+
+  // is this a remote link?
+  if (dn->is_remote() && !dn->inode) {
+    // (named remotei so it no longer shadows the outer 'in')
+    CInode *remotei = mdcache->get_inode(dn->get_remote_ino());
+    if (remotei) {
+      dn->link_remote(remotei);
+    } else {
+      // open inode
+      dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl;
+      mdcache->open_remote_ino(dn->get_remote_ino(), req,
+                               new C_MDS_RetryRequest(mds, req, diri));
+      return;
+    }
+  }
+
+
+  mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+
+  // it's locked, unlink!
+  MClientReply *reply = new MClientReply(req,0);
+  mdcache->dentry_unlink(dn,
+                         new C_MDS_CommitRequest(this, req, reply, diri,
+                                                 new EInodeUpdate(diri)));  // FIXME WRONG EVENT
+  return;
+}
+
+
+
+
+
+
+// RENAME
+
+// Completion context for the destination-path traverse started by
+// Server::handle_client_rename.  The traverse fills in the public 'trace'
+// vector; finish() then passes everything captured here, plus the traverse
+// result, to Server::handle_client_rename_2.
+class C_MDS_RenameTraverseDst : public Context {
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+  CInode *srcdiri;
+  CDir *srcdir;
+  CDentry *srcdn;
+  filepath destpath;   // private copy of the destination path
+public:
+  vector<CDentry*> trace;
+
+  C_MDS_RenameTraverseDst(Server *server,
+                          MClientRequest *req,
+                          CInode *ref,
+                          CInode *srcdiri,
+                          CDir *srcdir,
+                          CDentry *srcdn,
+                          filepath& destpath)
+    : server(server), req(req), ref(ref),
+      srcdiri(srcdiri), srcdir(srcdir), srcdn(srcdn) {
+    this->destpath = destpath;
+  }
+
+  // r is the traverse result; it is forwarded untouched.
+  void finish(int r) {
+    server->handle_client_rename_2(req, ref,
+                                   srcdiri, srcdir, srcdn, destpath,
+                                   trace, r);
+  }
+};
+
+
+/*
+
+ weirdness with rename:
+ - ref inode is what was originally srcdiri, but that may change by the time
+ the rename actually happens. for all practical purposes, ref is useless except
+ for C_MDS_RetryRequest
+
+ */
+// First stage of a client RENAME request.
+//
+//   req - the client request; get_filepath()/get_path() is the source,
+//         get_sarg() is the destination
+//   ref - inode the request was dispatched with; only handed to
+//         C_MDS_RetryRequest and to the destination traverse context
+//
+// Rejects trivially invalid renames, re-traverses to the source directory,
+// makes sure this MDS is the authority for the source dentry, pins the
+// source inode and then starts a traverse of the destination path whose
+// completion calls handle_client_rename_2.
+void Server::handle_client_rename(MClientRequest *req,
+                                  CInode *ref)
+{
+  dout(7) << "handle_client_rename on " << *req << endl;
+
+  // sanity checks
+  if (req->get_filepath().depth() == 0) {
+    dout(7) << "can't rename root" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  // mv a/b a/b/c -- meaningless
+  // (destination starts with the source path followed by a '/')
+  if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 &&
+      req->get_sarg().c_str()[ req->get_path().length() ] == '/') {
+    dout(7) << "can't rename to underneath myself" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // mv blah blah -- also meaningless
+  if (req->get_sarg() == req->get_path()) {
+    dout(7) << "can't rename something to itself (or into itself)" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // traverse to source
+  /*
+    this is abnormal, just for rename.  since we don't pin source path
+    (because we don't want to screw up the lock ordering) the ref inode
+    (normally/initially srcdiri) may move, and this may fail.
+    -> so, re-traverse path.  and make sure we request_finish in the case of a forward!
+  */
+  filepath refpath = req->get_filepath();
+  string srcname = refpath.last_bit();
+  refpath = refpath.prefixpath(refpath.depth()-1);
+
+  dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl;
+  vector<CDentry*> trace;
+  int r = mdcache->path_traverse(refpath, trace, true,
+                                 req, new C_MDS_RetryRequest(mds, req, ref),
+                                 MDS_TRAVERSE_FORWARD);
+  if (r == 2) {
+    dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl;
+    dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl;
+    mdcache->request_cleanup(req);  // not _finish (deletes) or _forward (path_traverse did that)
+    return;
+  }
+  if (r > 0) return;   // traverse is pending; nothing more to do in this call
+  if (r < 0) {   // dne or something.  got renamed out from under us, probably!
+    dout(7) << "traverse r=" << r << endl;
+    reply_request(req, r);
+    return;
+  }
+
+  // an empty trace means the source directory is the root
+  CInode *srcdiri;
+  if (trace.size())
+    srcdiri = trace[trace.size()-1]->inode;
+  else
+    srcdiri = mdcache->get_root();
+
+  dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl;
+
+  dout(7) << "handle_client_rename srcname is " << srcname << endl;
+
+  // make sure parent is a dir?
+  if (!srcdiri->is_dir()) {
+    dout(7) << "srcdiri not a dir " << *srcdiri << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!srcdiri->dir && !srcdiri->is_auth()) {
+    int dirauth = srcdiri->authority();
+    dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+
+  if (!try_open_dir(srcdiri, req)) return;
+  CDir *srcdir = srcdiri->dir;
+  dout(7) << "handle_client_rename srcdir is " << *srcdir << endl;
+
+  // make sure it's my dentry
+  int srcauth = srcdir->dentry_authority(srcname);
+  if (srcauth != mds->get_nodeid()) {
+    // fw
+    dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl;
+    mdcache->request_forward(req, srcauth);
+    return;
+  }
+  // ok, done passing buck.
+
+  // src dentry
+  CDentry *srcdn = srcdir->lookup(srcname);
+
+  // xlocked?
+  if (srcdn && !srcdn->can_read(req)) {
+    dout(10) << " waiting on " << *srcdn << endl;
+    srcdir->add_waiter(CDIR_WAIT_DNREAD,
+                       srcname,
+                       new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+
+  // source does not exist: either a dentry with no inode, or no dentry at
+  // all in a directory whose contents are completely loaded.  that is
+  // ENOENT (this used to reply -EEXIST).
+  if ((srcdn && !srcdn->inode) ||
+      (!srcdn && srcdir->is_complete())) {
+    dout(10) << "handle_client_rename src dne " << endl;
+    reply_request(req, -ENOENT);
+    return;
+  }
+
+  // no dentry but the directory is incomplete: load it and retry
+  if (!srcdn && !srcdir->is_complete()) {
+    dout(10) << "readding incomplete dir" << endl;
+    mds->mdstore->fetch_dir(srcdir,
+                            new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+  assert(srcdn && srcdn->inode);
+
+
+  dout(10) << "handle_client_rename srcdn is " << *srcdn << endl;
+  dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl;
+
+  // pin src in cache (so it won't expire)
+  mdcache->request_pin_inode(req, srcdn->inode);
+
+  // find the destination, normalize
+  // discover, etc. on the way... just get it on the local node.
+  filepath destpath = req->get_sarg();
+
+  C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath);
+  Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+
+  /*
+   * use DISCOVERXLOCK mode:
+   *   the dest may not exist, and may be xlocked from a remote host
+   *   we want to succeed if we find the xlocked dentry
+   * ??
+   *
+   * NOTE(review): the call below actually passes MDS_TRAVERSE_DISCOVER; the
+   * XLOCK variant is commented out.
+   */
+  mdcache->path_traverse(destpath, onfinish->trace, false,
+                         req, ondelay,
+                         MDS_TRAVERSE_DISCOVER, //XLOCK,
+                         onfinish);
+}
+
+// Second stage of a client RENAME, entered when the destination traverse
+// started by handle_client_rename completes.
+//
+//   req      - the client request
+//   ref      - inode used for retry contexts
+//   srcdiri  - source directory inode
+//   srcdir   - source directory
+//   srcdn    - source dentry (its inode must be linked)
+//   destpath - destination path; may get the source's name appended when the
+//              destination turns out to be an existing directory
+//   trace    - dentries found by the destination traverse; a trailing null
+//              dentry is dropped here
+//   r        - result of the destination traverse
+//
+// Works out the destination directory and dentry name, rejects invalid
+// combinations, and hands off to handle_client_rename_local.
+void Server::handle_client_rename_2(MClientRequest *req,
+                                    CInode *ref,
+                                    CInode *srcdiri,
+                                    CDir *srcdir,
+                                    CDentry *srcdn,
+                                    filepath& destpath,
+                                    vector<CDentry*>& trace,
+                                    int r)
+{
+  dout(7) << "handle_client_rename_2 on " << *req << endl;
+  dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl;
+
+  CInode *srci = srcdn->inode;
+  assert(srci);
+  CDir* destdir = 0;
+  string destname;
+
+  // what is the dest?  (dir or file or complete filename)
+  // note: trace includes root, destpath doesn't (include leading /)
+  if (trace.size() && trace[trace.size()-1]->inode == 0) {
+    dout(10) << "dropping null dentry from tail of trace" << endl;
+    trace.pop_back();    // drop it!
+  }
+
+  // deepest inode the traverse reached (root if the trace is empty)
+  CInode *d;
+  if (trace.size())
+    d = trace[trace.size()-1]->inode;
+  else
+    d = mdcache->get_root();
+  assert(d);
+  dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl;
+
+  // make sure i can open the dir?
+  if (d->is_dir() && !d->dir_is_auth() && !d->dir) {
+    // discover it
+    mdcache->open_remote_dir(d,
+                             new C_MDS_RetryRequest(mds, req, ref));
+    return;
+  }
+
+  if (trace.size() == destpath.depth()) {
+    // the whole destination path exists
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/dir
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                           // /to/some/dir
+      destname = req->get_filepath().last_bit();  // thing
+      destpath.add_dentry(destname);
+    } else {
+      // mv /some/thing /to/some/existing_filename
+      destdir = trace[trace.size()-1]->dir;       // /to/some
+      destname = destpath.last_bit();             // existing_filename
+    }
+  }
+  else if (trace.size() == destpath.depth()-1) {
+    // everything but the last component exists
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/place_that_maybe_dne     (we might be replica)
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                  // /to/some
+      destname = destpath.last_bit();    // place_that_MAYBE_dne
+    } else {
+      dout(7) << "dest dne" << endl;
+      reply_request(req, -EINVAL);
+      return;
+    }
+  }
+  else {
+    assert(trace.size() < destpath.depth()-1);
+    // check traverse return value
+    if (r > 0) {
+      return;  // discover, readdir, etc.
+    }
+
+    // ??
+    assert(r < 0 || trace.size() == 0);  // musta been an error
+
+    // error out
+    dout(7) << " rename dest " << destpath << " dne" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  string srcpath = req->get_path();
+  dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl;
+  dout(10) << "handle_client_rename_2 destpath " << destpath << endl;
+
+  // src == dest?
+  if (srcdn->get_dir() == destdir && srcdn->name == destname) {
+    dout(7) << "rename src=dest, same file " << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // does destination exist?  (is this an overwrite?)
+  CDentry *destdn = destdir->lookup(destname);
+  CInode  *oldin = 0;
+  if (destdn) {
+    oldin = destdn->get_inode();
+
+    if (oldin) {
+      // make sure it's also a file!
+      // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir.
+      if (oldin->is_dir()) {
+        // fail!
+        dout(7) << "dest exists and is dir" << endl;
+        reply_request(req, -EISDIR);
+        return;
+      }
+
+      // directory onto an existing non-directory: ENOTDIR
+      // (this used to reply -EISDIR, which is the errno for the opposite case)
+      if (srcdn->inode->is_dir() &&
+          !oldin->is_dir()) {
+        dout(7) << "cannot overwrite non-directory with directory" << endl;
+        reply_request(req, -ENOTDIR);
+        return;
+      }
+    }
+
+    dout(7) << "dest exists " << *destdn << endl;
+    if (destdn->get_inode()) {
+      dout(7) << "destino is " << *destdn->get_inode() << endl;
+    } else {
+      dout(7) << "dest dn is a NULL stub" << endl;
+    }
+  } else {
+    dout(7) << "dest dn dne (yet)" << endl;
+  }
+
+
+  // local or remote?
+  int srcauth = srcdir->dentry_authority(srcdn->name);
+  int destauth = destdir->dentry_authority(destname);
+  dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl;
+
+  // a rename is "foreign" when either dentry's authority is another MDS.
+  // this is only logged; both cases continue into handle_client_rename_local.
+  if (srcauth != mds->get_nodeid() ||
+      destauth != mds->get_nodeid()) {
+    dout(7) << "rename has remote dest " << destauth << endl;
+    dout(7) << "FOREIGN RENAME" << endl;
+
+    // punt?  (deliberately disabled by the 'false &&')
+    if (false && srcdn->inode->is_dir()) {
+      reply_request(req, -EINVAL);
+      return;
+    }
+
+  } else {
+    dout(7) << "rename is local" << endl;
+  }
+
+  handle_client_rename_local(req, ref,
+                             srcpath, srcdiri, srcdn,
+                             destpath.get_path(), destdir, destdn, destname);
+  return;
+}
+
+
+
+
+void Server::handle_client_rename_local(MClientRequest *req,
+ CInode *ref,
+ string& srcpath,
+ CInode *srcdiri,
+ CDentry *srcdn,
+ string& destpath,
+ CDir *destdir,
+ CDentry *destdn,
+ string& destname)
+{ /*
+ * Final stage of a client RENAME.  xlocks the source and destination
+ * dentries in two passes (order chosen by comparing srcpath and destpath),
+ * re-checks the file/directory constraints, then calls
+ * mdcache->renamer->file_rename() with a C_MDS_CommitRequest.
+ * Despite the name this also runs when a dentry's authority is another
+ * MDS: the xlock is then requested with dentry_xlock_request and the
+ * request is retried through C_MDS_RetryRequest.
+ *
+ *   req      - the client request
+ *   ref      - inode handed to the locker and to retry contexts
+ *   srcpath  - source path; only used for the lock-order comparison
+ *   srcdiri  - source directory inode (not referenced in this function)
+ *   srcdn    - source dentry; dereferenced unconditionally
+ *   destpath - destination path; only used for the lock-order comparison
+ *   destdir  - directory that holds (or will hold) the destination dentry
+ *   destdn   - existing destination dentry, or null (added here when the
+ *              destination is local)
+ *   destname - destination dentry name inside destdir
+ */
+ //bool everybody = false;
+ //if (true || srcdn->inode->is_dir()) {
+ /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap!
+ i could limit this to cases where something beneath me is exported.
+ could possibly limit the list. (maybe.)
+ Underlying constraint is that, regardless of the order i do the xlocks, and whatever
+ imports/exports might happen in the process, the destdir _must_ exist on any node
+ importing something beneath me when rename finishes, or else mayhem ensues when
+ their import is dangling in the cache.
+ */
+ /*
+ having made a proper mess of this on the first pass, here is my plan:
+
+ - xlocks of src, dest are done in lex order
+ - xlock is optional.. if you have the dentry, lock it, if not, don't.
+ - if you discover an xlocked dentry, you get the xlock.
+
+ possible trouble:
+ - you have an import beneath the source, and don't have the dest dir.
+ - when the actual rename happens, you discover the dest
+ - actually, do this on any open dir, so we don't detach whole swaths
+ of our cache.
+
+ notes:
+ - xlocks are initiated from authority, as are discover_replies, so replicas are
+ guaranteed to either not have dentry, or to have it xlocked.
+ -
+ - foreign xlocks are eventually unraveled by the initiator on success or failure.
+
+ todo to make this work:
+ - hose bool everybody param crap
+ /- make handle_lock_dn not discover, clean up cases
+ /- put dest path in MRenameNotify
+ /- make rename_notify discover if its a dir
+ / - this will catch nested imports too, obviously
+ /- notify goes to merged list on local rename
+ /- notify goes to everybody on a foreign rename
+ /- handle_notify needs to gracefully ignore spurious notifies
+ */
+ //dout(7) << "handle_client_rename_local: overkill? doing xlocks with _all_ nodes" << endl;
+ //everybody = true;
+ //}
+
+ bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == mds->get_nodeid(); // true when this MDS is the source dentry's authority
+ bool destlocal = destdir->dentry_authority(destname) == mds->get_nodeid(); // likewise for the destination dentry
+
+ dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl;
+ if (destdn) {
+ dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl;
+ } else {
+ dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl;
+ }
+
+ /* lock source and dest dentries, in lexicographic order.
+ * the loop below runs exactly twice: one pass locks whichever path compares
+ * lower, the other pass locks the remaining one.  any pass that cannot get
+ * its xlock right away returns; already-held xlocks are skipped on retry.
+ */
+ bool dosrc = srcpath < destpath;
+ for (int i=0; i<2; i++) {
+ if (dosrc) {
+
+ // src
+ if (srclocal) {
+ if (!srcdn->is_xlockedbyme(req) &&
+ !mds->locker->dentry_xlock_start(srcdn, req, ref))
+ return;
+ } else {
+ if (!srcdn || srcdn->xlockedby != req) { // NOTE(review): srcdn was already dereferenced above, so the null test protects nothing here
+ mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref));
+ return;
+ }
+ }
+ dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl;
+
+ } else {
+
+ if (destlocal) {
+ // dest
+ if (!destdn) destdn = destdir->add_dentry(destname);
+ if (!destdn->is_xlockedbyme(req) &&
+ !mds->locker->dentry_xlock_start(destdn, req, ref)) {
+ if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn); // xlock not granted: drop the dentry again if it is still clean, null and sync
+ return;
+ }
+ } else {
+ if (!destdn || destdn->xlockedby != req) {
+ /* NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case
+ * my traverse and determination of dest vs dest/srcfilename was out of date.
+ */
+ mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref));
+ return;
+ }
+ }
+ dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl;
+
+ }
+
+ dosrc = !dosrc; // the second pass handles the other dentry
+ }
+
+
+ // final check: verify if dest exists that src is a file
+ // (destdn is non-null here: both branches of the dest pass above either set it or returned)
+
+ // FIXME: is this necessary?
+
+ if (destdn->inode) {
+ if (destdn->inode->is_dir()) {
+ dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl;
+ assert(0); // NOTE(review): with asserts enabled the reply below is never reached
+ reply_request(req, -EINVAL);
+ return;
+ }
+ if (srcdn->inode->is_dir()) {
+ dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl;
+ assert(0); // NOTE(review): same as above
+ reply_request(req, -EINVAL);
+ return;
+ }
+ } else {
+ // if destdn->inode is null, then we know it's a non-existent dest,
+ // why? because if it's local, it dne. and if it's remote, we xlocked with
+ // REQXLOCKC, which will only allow you to lock a file.
+ // so we know dest is a file, or non-existent
+ if (!destlocal) {
+ if (srcdn->inode->is_dir()) {
+ // help: maybe the dest exists and is a file? ..... FIXME
+ } else {
+ // we're fine, src is file, dest is file|dne
+ }
+ }
+ }
+
+ mds->balancer->hit_dir(srcdn->dir, META_POP_DWR);
+ mds->balancer->hit_dir(destdn->dir, META_POP_DWR);
+
+ // we're golden.
+ // everything is xlocked by us, we rule, etc.
+ MClientReply *reply = new MClientReply(req, 0);
+ mdcache->renamer->file_rename( srcdn, destdn,
+ new C_MDS_CommitRequest(this, req, reply, srcdn->inode,
+ new EInodeUpdate(srcdn->inode)) ); // FIXME WRONG EVENT
+}
+
+
+
+
+
+
+
+// MKDIR
+
+// MKDIR: creates a new dentry + inode through mknod(), turns the inode into
+// an empty, complete directory, possibly exports the new dir to another MDS
+// when the parent dir is replicated, and commits with an EMkdir event.
+//
+//   req  - the client request (get_iarg() carries the requested mode)
+//   diri - inode of the parent directory
+void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
+{
+  // make dentry and inode, link.  a null result means there is nothing
+  // further to do in this call.
+  CInode *dirino = mknod(req, diri);
+  if (dirino == 0) return;
+
+  // requested mode, with the type field forced to "directory"
+  dirino->inode.mode = (req->get_iarg() & ~INODE_TYPE_MASK) | INODE_MODE_DIR;
+
+  // use dir layout
+  dirino->inode.layout = g_OSD_MDDirLayout;
+
+  // init dir to be empty
+  assert(!dirino->is_frozen_dir());   // bc mknod worked
+  CDir *newdir = dirino->get_or_open_dir(mds);
+  newdir->mark_complete();
+  newdir->mark_dirty();
+
+  mds->balancer->hit_dir(newdir, META_POP_DWR);
+
+  // when the parent dir is auth + replicated and the new dir is auth and not
+  // hashing, pick a random MDS and export the new dir there unless the pick
+  // is this node
+  if (diri->dir->is_auth()) {
+    if (diri->dir->is_rep()) {
+      if (newdir->is_auth()) {
+        if (!newdir->is_hashing()) {
+          int target = rand() % mds->mdsmap->get_num_mds();
+          if (target != mds->get_nodeid()) {
+            dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
+            mdcache->migrator->export_dir(newdir, target);
+          }
+        }
+      }
+    }
+  }
+
+  // commit to log
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EMkdir(newdir));
+                 //new EInodeUpdate(newi),//);
+                 //new EDirUpdate(newdir));  // FIXME: weird performance regression here w/ double log; somewhat of a mystery!
+}
+
+
+
+
+
+// SYMLINK
+
+// SYMLINK: creates a new dentry + inode through mknod(), marks the inode as
+// a symlink whose target is the request's string argument, and commits.
+//
+//   req  - the client request (get_sarg() is the link target)
+//   diri - inode of the directory receiving the symlink
+void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
+{
+  // make dentry and inode, link.
+  CInode *linki = mknod(req, diri);
+  if (linki == 0) return;
+
+  // replace the type bits of the mode with "symlink"
+  linki->inode.mode = (linki->inode.mode & ~INODE_TYPE_MASK) | INODE_MODE_SYMLINK;
+
+  // set target
+  linki->symlink = req->get_sarg();
+
+  mds->balancer->hit_dir(diri->dir, META_POP_DWR);
+
+  // commit
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EInodeUpdate(linki));   // FIXME should be differnet log entry
+}
+
+
+
+
+
+
+
+// ===================================
+// TRUNCATE, FSYNC
+
+/*
+ * FIXME: this truncate implemention is WRONG WRONG WRONG
+ */
+
+// TRUNCATE: sets the inode's size to the request's size argument while
+// holding the inode file write lock, then commits with an EInodeUpdate.
+// (Per the FIXME preceding this function, the implementation is known to be wrong.)
+//
+//   req - the client request (get_sizearg() is the new size)
+//   cur - inode being truncated
+void Server::handle_client_truncate(MClientRequest *req, CInode *cur)
+{
+  // write
+  bool writable = mds->locker->inode_file_write_start(cur, req);
+  if (!writable)
+    return;   // fw or (wait for) lock
+
+  // check permissions
+
+  // do update
+  cur->inode.size = req->get_sizearg();
+  cur->mark_dirty();
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // build the reply first, then commit it together with the log event
+  MClientReply *ack = new MClientReply(req, 0);
+  commit_request(req, ack, cur,
+                 new EInodeUpdate(cur));
+}
+
+
+
+// ===========================
+// open, openc, close
+
+// OPEN: issues file capabilities on an existing inode and replies with them.
+//
+//   req - the client request (get_iarg() = open flags, get_iarg2() = file mode)
+//   cur - inode being opened
+//
+// Replies -EINVAL when the inode's mode lacks INODE_MODE_FILE, forwards
+// non-read/non-lazy opens to the inode's authority when this MDS is not
+// auth, and returns without replying when caps cannot be issued yet.
+void Server::handle_client_open(MClientRequest *req,
+                                CInode *cur)
+{
+  int oflags = req->get_iarg();
+  int fmode = req->get_iarg2();
+
+  dout(7) << "open " << oflags << " on " << *cur << endl;
+  dout(10) << "open flags = " << oflags << " mode = " << fmode << endl;
+
+  // is it a file?
+  if ((cur->inode.mode & INODE_MODE_FILE) == 0) {
+    dout(7) << "not a regular file" << endl;
+    reply_request(req, -EINVAL);    // FIXME what error do we want?
+    return;
+  }
+
+  // auth for write access: anything other than plain read or lazy mode has
+  // to be served by the inode's authority
+  bool readonly_or_lazy = (fmode == FILE_MODE_R || fmode == FILE_MODE_LAZY);
+  if (!readonly_or_lazy && !cur->is_auth()) {
+    int auth = cur->authority();
+    assert(auth != mds->get_nodeid());
+    dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl;
+
+    mdcache->request_forward(req, auth);
+    return;
+  }
+
+
+  // hmm, check permissions or something.
+
+
+  // can we issue the caps they want?
+  // (the data version is fetched before the caps are issued)
+  version_t datav = mds->locker->issue_file_data_version(cur);
+  Capability *caps = mds->locker->issue_new_caps(cur, fmode, req);
+  if (caps == 0) return;   // can't issue (yet), so wait!
+
+  dout(12) << "open gets caps " << cap_string(caps->pending()) << " for " << req->get_source() << " on " << *cur << endl;
+
+  mds->balancer->hit_inode(cur, META_POP_IRD);
+
+  // reply
+  MClientReply *ack = new MClientReply(req, 0);
+  ack->set_file_caps(caps->pending());
+  ack->set_file_caps_seq(caps->get_last_seq());
+  ack->set_file_data_version(datav);
+  reply_request(req, ack, cur);
+}
+
+
+
+// OPEN with O_CREAT: obtains the inode through mknod() (called with
+// okexist = true), forces its mode to a 0644 regular file, then continues
+// as a normal open.
+//
+//   req - the client request
+//   ref - inode passed on to mknod()
+void Server::handle_client_openc(MClientRequest *req, CInode *ref)
+{
+  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl;
+
+  CInode *filei = mknod(req, ref, true);
+  if (filei == 0) return;
+
+  filei->inode.mode = 0644 | INODE_MODE_FILE;   // wtf FIXME
+
+  handle_client_open(req, filei);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_SERVER_H
+#define __MDS_SERVER_H
+
+#include "MDS.h"
+
+class LogEvent;
+
+class Server { /*
+ * Client-request handling for one MDS.  Holds pointers to the owning MDS
+ * and to its cache, log and messenger, and declares one handler per client
+ * operation plus the shared reply/commit helpers.
+ */
+ MDS *mds;
+ MDCache *mdcache;
+ MDLog *mdlog;
+ Messenger *messenger;
+
+ __uint64_t stat_ops; // set to 0 by the constructor; not otherwise used in this header
+
+
+public:
+ Server(MDS *m) :
+ mds(m),
+ mdcache(mds->mdcache), mdlog(mds->mdlog), // these read the mds member, which is declared (and so initialized) first
+ messenger(mds->messenger),
+ stat_ops(0) {
+ }
+
+ void dispatch(Message *m);
+
+ // generic request helpers
+ void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0);
+ void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei);
+ void commit_request(MClientRequest *req,
+ MClientReply *reply,
+ CInode *tracei,
+ LogEvent *event,
+ LogEvent *event2 = 0);
+
+ bool try_open_dir(CInode *in, MClientRequest *req); // callers return immediately when this yields false
+
+
+ // clients
+ void handle_client_mount(class MClientMount *m);
+ void handle_client_unmount(Message *m);
+
+ void handle_client_request(MClientRequest *m);
+ void handle_client_request_2(MClientRequest *req,
+ vector<CDentry*>& trace,
+ int r);
+
+ // fs ops
+ void handle_client_fstat(MClientRequest *req);
+
+ // requests
+ void dispatch_request(Message *m, CInode *ref); // also the entry point used by C_MDS_RetryRequest
+
+ // inode requests (each takes the request and its reference inode)
+ void handle_client_stat(MClientRequest *req, CInode *ref);
+ void handle_client_utime(MClientRequest *req, CInode *ref);
+ void handle_client_inode_soft_update_2(MClientRequest *req,
+ MClientReply *reply,
+ CInode *ref);
+ void handle_client_chmod(MClientRequest *req, CInode *ref);
+ void handle_client_chown(MClientRequest *req, CInode *ref);
+ void handle_client_inode_hard_update_2(MClientRequest *req,
+ MClientReply *reply,
+ CInode *ref);
+
+ // readdir
+ void handle_client_readdir(MClientRequest *req, CInode *ref);
+ int encode_dir_contents(CDir *dir,
+ list<class InodeStat*>& inls,
+ list<string>& dnls);
+ void handle_hash_readdir(MHashReaddir *m);
+ void handle_hash_readdir_reply(MHashReaddirReply *m);
+ void finish_hash_readdir(MClientRequest *req, CDir *dir);
+
+ // namespace changes
+ void handle_client_mknod(MClientRequest *req, CInode *ref);
+ void handle_client_link(MClientRequest *req, CInode *ref);
+ void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace);
+ void handle_client_link_finish(MClientRequest *req, CInode *ref,
+ CDentry *dn, CInode *targeti);
+
+ void handle_client_unlink(MClientRequest *req, CInode *ref);
+ void handle_client_rename(MClientRequest *req, CInode *ref);
+ void handle_client_rename_2(MClientRequest *req,
+ CInode *ref,
+ CInode *srcdiri,
+ CDir *srcdir,
+ CDentry *srcdn,
+ filepath& destpath,
+ vector<CDentry*>& trace,
+ int r);
+ void handle_client_rename_local(MClientRequest *req, CInode *ref,
+ string& srcpath, CInode *srcdiri, CDentry *srcdn,
+ string& destpath, CDir *destdir, CDentry *destdn, string& name);
+
+ void handle_client_mkdir(MClientRequest *req, CInode *ref);
+ void handle_client_rmdir(MClientRequest *req, CInode *ref); // NOTE(review): handle_client_unlink checks for MDS_OP_RMDIR itself -- confirm this declaration has a definition
+ void handle_client_symlink(MClientRequest *req, CInode *ref);
+
+ // file
+ void handle_client_open(MClientRequest *req, CInode *ref);
+ void handle_client_openc(MClientRequest *req, CInode *ref);
+ void handle_client_release(MClientRequest *req, CInode *in);
+ void handle_client_truncate(MClientRequest *req, CInode *in);
+ void handle_client_fsync(MClientRequest *req, CInode *in);
+
+ CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc; callers treat a null return as "stop here"
+
+
+};
+
+// Context that re-submits a request: when finished it calls
+// dispatch_request(req, ref) on the MDS' server.  The result code passed to
+// finish() is ignored.
+class C_MDS_RetryRequest : public Context {
+  MDS *mds;
+  Message *req;   // MClientRequest or MLock
+  CInode *ref;
+ public:
+  C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref)
+    : mds(mds), req(req), ref(ref) {
+    // a retry without a reference inode is a caller bug
+    assert(ref);
+  }
+  virtual void finish(int r) {
+    mds->server->dispatch_request(req, ref);
+  }
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_EALLOC_H
+#define __MDS_EALLOC_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "../IdAllocator.h"
+
+#define EALLOC_EV_ALLOC 1
+#define EALLOC_EV_FREE 2
+
+ /**
+  * EAlloc - journal event describing one id allocation or release.
+  *
+  * The payload holds the id type, the id, the operation (EALLOC_EV_ALLOC or
+  * EALLOC_EV_FREE) and the id-allocator table version expected once the
+  * operation has been applied.  All fields are stored as raw in-memory bytes.
+  */
+ class EAlloc : public LogEvent {
+  protected:
+   int       idtype;
+   idno_t    id;
+   int       what;            // EALLOC_EV_ALLOC or EALLOC_EV_FREE
+   version_t table_version;   // allocator version after this event is applied
+
+  public:
+   // Decode-side constructor: fields stay unset until decode_payload() runs.
+   EAlloc() : LogEvent(EVENT_ALLOC) { }
+   EAlloc(int idtype, idno_t id, int what, version_t v) :
+     LogEvent(EVENT_ALLOC),
+     idtype(idtype), id(id), what(what), table_version(v) { }
+
+   // Append idtype, id, what, table_version (in that order) to bl.
+   void encode_payload(bufferlist& bl) {
+     bl.append((char*)&idtype, sizeof(idtype));
+     bl.append((char*)&id, sizeof(id));
+     bl.append((char*)&what, sizeof(what));
+     bl.append((char*)&table_version, sizeof(table_version));
+   }
+
+   // Read the fields back in encode order, advancing off past each one.
+   void decode_payload(bufferlist& bl, int& off) {
+     bl.copy(off, sizeof(idtype), (char*)&idtype);                off += sizeof(idtype);
+     bl.copy(off, sizeof(id), (char*)&id);                        off += sizeof(id);
+     bl.copy(off, sizeof(what), (char*)&what);                    off += sizeof(what);
+     bl.copy(off, sizeof(table_version), (char*)&table_version);  off += sizeof(table_version);
+   }
+
+   // Anything that is not EALLOC_EV_ALLOC is printed as "dealloc".
+   void print(ostream& out) {
+     out << (what == EALLOC_EV_ALLOC ? "alloc " : "dealloc ")
+         << hex << id << dec << " tablev " << table_version;
+   }
+
+
+   // live journal
+
+   // Expirable once the committed allocator table has reached our version.
+   bool can_expire(MDS *mds) {
+     return mds->idalloc->get_committed_version() >= table_version;
+   }
+
+   // Ask the allocator to save the table at our version; c is passed along.
+   void retire(MDS *mds, Context *c) {
+     mds->idalloc->save(c, table_version);
+   }
+
+
+   // recovery
+
+   // True when the in-memory allocator table is already at or past our version.
+   bool has_happened(MDS *mds) {
+     if (mds->idalloc->get_version() < table_version)
+       return false;
+     cout << " event " << table_version << " <= table " << mds->idalloc->get_version() << endl;
+     return true;
+   }
+
+   // Re-apply the operation; the table must be exactly one version behind.
+   void replay(MDS *mds) {
+     assert(table_version-1 == mds->idalloc->get_version());
+
+     switch (what) {
+     case EALLOC_EV_ALLOC:
+       {
+         idno_t nid = mds->idalloc->alloc_id(true);
+         assert(nid == id);  // replayed allocation must yield the logged id
+       }
+       break;
+     case EALLOC_EV_FREE:
+       mds->idalloc->reclaim_id(id, true);
+       break;
+     default:
+       assert(0);
+     }
+
+     assert(table_version == mds->idalloc->get_version());
+   }
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EDIRUPDATE_H
+#define __EDIRUPDATE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../CDir.h"
+#include "../MDCache.h"
+#include "../MDStore.h"
+
+
+
+ /**
+  * EDirUpdate - journal event recording an update to a directory.
+  *
+  * Stores a trace to the directory's inode, the directory ino and the
+  * directory version read at construction time.
+  */
+ class EDirUpdate : public LogEvent {
+  protected:
+   ETrace    trace;
+   inodeno_t dirino;
+   version_t version;
+
+  public:
+   EDirUpdate(CDir *dir) :
+     LogEvent(EVENT_DIRUPDATE),
+     trace(dir->inode),
+     dirino(dir->ino()),
+     version(dir->get_version()) {
+   }
+   // Decode-side constructor: dirino/version are filled by decode_payload().
+   EDirUpdate() : LogEvent(EVENT_DIRUPDATE) {
+   }
+
+   void print(ostream& out) {
+     out << "up dir " << dirino << " "
+         << trace
+         << "/ v " << version;
+   }
+
+   // Wire order: trace, version, dirino.
+   virtual void encode_payload(bufferlist& bl) {
+     trace.encode(bl);
+     bl.append((char*)&version, sizeof(version));
+     bl.append((char*)&dirino, sizeof(dirino));
+   }
+   void decode_payload(bufferlist& bl, int& off) {
+     trace.decode(bl, off);
+     bl.copy(off, sizeof(version), (char*)&version);  off += sizeof(version);
+     bl.copy(off, sizeof(dirino), (char*)&dirino);    off += sizeof(dirino);
+   }
+
+
+   // The event may be dropped when the dir is not cached, not ours, frozen,
+   // clean, or already committing a version newer than the one we logged.
+   virtual bool can_expire(MDS *mds) {
+     CInode *diri = mds->mdcache->get_inode(dirino);
+     CDir *dir = diri ? diri->dir : 0;
+     if (!dir) return true;
+
+     dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl;
+
+     if (!dir->is_auth() ||     // not mine!
+         dir->is_frozen() ||    // frozen -> exporting -> obsolete? FIXME
+         !dir->is_dirty())
+       return true;
+
+     return dir->get_committing_version() > version;
+   }
+
+   // Commit the directory; the inode and its dir must be in cache (asserted).
+   virtual void retire(MDS *mds, Context *c) {
+     CInode *diri = mds->mdcache->get_inode(dirino);
+     assert(diri);
+     CDir *dir = diri->dir;
+     assert(dir);
+
+     dout(10) << "EDirUpdate committing dir " << *dir << endl;
+     mds->mdstore->commit_dir(dir, c);
+   }
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EINODEUPDATE_H
+#define __EINODEUPDATE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+
+ /**
+  * EInodeUpdate - journal event recording an inode update.
+  *
+  * The whole payload is an ETrace ending at the updated inode; the inode
+  * number and version are read from the last trace segment.
+  */
+ class EInodeUpdate : public LogEvent {
+  protected:
+   ETrace trace;
+
+  public:
+   EInodeUpdate(CInode *in) :
+     LogEvent(EVENT_INODEUPDATE),
+     trace(in) { }
+   EInodeUpdate() :
+     LogEvent(EVENT_INODEUPDATE) { }
+
+   // Reads trace.back(), so the trace must not be empty here.
+   void print(ostream& out) {
+     out << "up inode " << trace.back().inode.ino;
+     out << " " << trace;
+     out << " v " << trace.back().inode.version;
+   }
+
+   virtual void encode_payload(bufferlist& bl) { trace.encode(bl); }
+   void decode_payload(bufferlist& bl, int& off) { trace.decode(bl, off); }
+
+   // defined out of line
+   bool can_expire(MDS *mds);
+   void retire(MDS *mds, Context *c);
+   bool has_happened(MDS *mds);
+   void replay(MDS *mds);
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EMKDIR_H
+#define __EMKDIR_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+ /**
+  * EMkdir - journal event for a directory creation.
+  *
+  * The payload is an ETrace built from the new directory's inode.  Recording
+  * the parent dir version separately (pdirv) is currently disabled.
+  */
+ class EMkdir : public LogEvent {
+  protected:
+   ETrace trace;
+   //version_t pdirv;
+
+  public:
+   EMkdir(CDir *dir) :
+     LogEvent(EVENT_MKDIR),
+     trace(dir->inode) {
+     //pdirv = dir->inode->get_parent_dir()->get_version();
+   }
+   EMkdir() :
+     LogEvent(EVENT_MKDIR) { }
+
+   void print(ostream& out) {
+     out << "mkdir " << trace;
+   }
+
+   virtual void encode_payload(bufferlist& bl) {
+     trace.encode(bl);
+     //bl.append((char*)&pdirv, sizeof(pdirv));
+   }
+   void decode_payload(bufferlist& bl, int& off) {
+     trace.decode(bl, off);
+     //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+     //off += sizeof(pdirv);
+   }
+
+   // live journal (defined out of line)
+   bool can_expire(MDS *mds);
+   void retire(MDS *mds, Context *c);
+
+   // recovery (defined out of line)
+   bool has_happened(MDS *mds);
+   void replay(MDS *mds);
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EMKNOD_H
+#define __EMKNOD_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+ /**
+  * EMknod - journal event for the creation of an inode.
+  *
+  * The payload is an ETrace built from the new inode.  Recording the parent
+  * dir version separately (pdirv) is currently disabled.
+  */
+ class EMknod : public LogEvent {
+  protected:
+   ETrace trace;
+   //version_t pdirv;
+
+  public:
+   EMknod(CInode *in) :
+     LogEvent(EVENT_MKNOD),
+     trace(in) {
+     //pdirv = in->get_parent_dir()->get_version();
+   }
+   EMknod() :
+     LogEvent(EVENT_MKNOD) { }
+
+   void print(ostream& out) {
+     out << "mknod ";
+     trace.print(out);
+   }
+
+   virtual void encode_payload(bufferlist& bl) {
+     trace.encode(bl);
+     //bl.append((char*)&pdirv, sizeof(pdirv));
+   }
+   void decode_payload(bufferlist& bl, int& off) {
+     trace.decode(bl, off);
+     //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+     //off += sizeof(pdirv);
+   }
+
+   // defined out of line
+   bool can_expire(MDS *mds);
+   void retire(MDS *mds, Context *c);
+   bool has_happened(MDS *mds);
+   void replay(MDS *mds);
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EPURGE_H
+#define __EPURGE_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+ /**
+  * EPurgeFinish - journal event carrying a single inode number
+  * (by its name, it marks the end of an inode purge).
+  *
+  * The inode number is stored as raw in-memory bytes.
+  */
+ class EPurgeFinish : public LogEvent {
+  protected:
+   inodeno_t ino;
+
+  public:
+   EPurgeFinish(inodeno_t i) :
+     LogEvent(EVENT_PURGEFINISH),
+     ino(i) { }
+   // Decode-side constructor: ino is filled in by decode_payload().
+   EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { }
+
+   void print(ostream& out) {
+     out << "purgefinish " << ino;
+   }
+
+   virtual void encode_payload(bufferlist& bl) {
+     bl.append((char*)&ino, sizeof(ino));
+   }
+
+   // Read ino at offset off and advance off past it, so that whatever is
+   // decoded next from the same buffer starts at the right position.
+   void decode_payload(bufferlist& bl, int& off) {
+     bl.copy(off, sizeof(ino), (char*)&ino);
+     off += sizeof(ino);
+   }
+
+   // defined out of line
+   bool can_expire(MDS *mds);
+   void retire(MDS *mds, Context *c);
+   bool has_happened(MDS *mds);
+   void replay(MDS *mds);
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __ESTRING_H
+#define __ESTRING_H
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+#include "../LogEvent.h"
+
+// generic log event
+ /**
+  * EString - generic journal event whose payload is one string.
+  *
+  * The string is stored NUL-terminated; decoding reads a C string starting
+  * at the current offset.
+  */
+ class EString : public LogEvent {
+  protected:
+   string event;
+
+  public:
+   EString(string e) :
+     LogEvent(EVENT_STRING),
+     event(e) { }
+   EString() :
+     LogEvent(EVENT_STRING) { }
+
+   // Take the C string found at bl.c_str()+off and skip it plus its NUL.
+   void decode_payload(bufferlist& bl, int& off) {
+     event.assign(bl.c_str() + off);
+     off += event.length() + 1;
+   }
+
+   // Append the text including its terminating NUL.
+   void encode_payload(bufferlist& bl) {
+     bl.append(event.c_str(), event.length()+1);
+   }
+
+   // Prints the text wrapped in double quotes.
+   void print(ostream& out) {
+     out << '"' << event << '"';
+   }
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_ETRACE_H
+#define __MDS_ETRACE_H
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+
+
+// path trace for use in journal events
+
+ /**
+  * ETrace - path trace for use in journal events.
+  *
+  * A trace is an ordered list of <dir, dentry, inode> segments.  When built
+  * from an inode, segments are collected by walking parent dentries upward
+  * and pushed at the front, so the list reads from the topmost directory
+  * reached down to the starting inode.
+  */
+ class ETrace {
+
+   // One <dir, dn, inode> segment.
+   struct bit {
+     inodeno_t dirino;   // ino of the directory holding the dentry
+     version_t dirv;     // that directory's version when the trace was built
+     string    dn;       // dentry name
+     inode_t   inode;    // copy of the inode linked by the dentry
+
+     bit(bufferlist& bl, int& off) { _decode(bl,off); }
+     bit(inodeno_t di, version_t dv, const string& d, inode_t i) :
+       dirino(di), dirv(dv), dn(d), inode(i) {}
+
+     // Wire order: dirino, dirv, dn, inode.
+     void _encode(bufferlist& bl) {
+       bl.append((char*)&dirino, sizeof(dirino));
+       bl.append((char*)&dirv, sizeof(dirv));
+       ::_encode(dn, bl);
+       bl.append((char*)&inode, sizeof(inode));
+     }
+     void _decode(bufferlist& bl, int& off) {
+       bl.copy(off, sizeof(dirino), (char*)&dirino);
+       off += sizeof(dirino);
+       bl.copy(off, sizeof(dirv), (char*)&dirv);
+       off += sizeof(dirv);
+       ::_decode(dn, bl, off);
+       bl.copy(off, sizeof(inode), (char*)&inode);
+       off += sizeof(inode);
+     }
+   };
+
+  public:
+   list<bit> trace;
+
+   // Build the trace for in (a null in yields an empty trace).  The walk
+   // stops at an inode with no parent dentry, a dentry with no dir, or right
+   // after recording a segment whose directory is an import.
+   ETrace(CInode *in = 0) {
+     if (!in) return;
+
+     for (;;) {
+       CDentry *dn = in->get_parent_dn();
+       if (!dn) break;
+       CDir *dir = dn->get_dir();
+       if (!dir) break;
+
+       trace.push_front(bit(dir->ino(),
+                            dir->get_version(),
+                            dn->get_name(),
+                            in->inode));
+
+       in = dir->get_inode();
+       if (dir->is_import()) break;
+     }
+   }
+
+   // Last (deepest) segment; the trace must not be empty.
+   bit& back() {
+     return trace.back();
+   }
+
+   // Read a segment count followed by that many segments, appending each
+   // one to the current list.
+   void decode(bufferlist& bl, int& off) {
+     int n;
+     bl.copy(off, sizeof(n), (char*)&n);
+     off += sizeof(n);
+     for (int left = n; left > 0; --left)
+       trace.push_back( bit(bl, off) );
+   }
+
+   // Write the segment count followed by every segment in list order.
+   void encode(bufferlist& bl) {
+     int n = trace.size();
+     bl.append((char*)&n, sizeof(n));
+     for (list<bit>::iterator seg = trace.begin(); seg != trace.end(); ++seg)
+       seg->_encode(bl);
+   }
+
+   // Prints "[<first dirino>]/name/name/..." and nothing for an empty trace.
+   void print(ostream& out) const {
+     list<bit>::const_iterator seg = trace.begin();
+     if (seg == trace.end()) return;
+
+     out << "[" << seg->dirino << "]/" << seg->dn;
+     for (++seg; seg != trace.end(); ++seg)
+       out << "/" << seg->dn;
+   }
+
+   CInode *restore_trace(MDS *mds);
+
+ };
+
+ // Stream an ETrace by delegating to ETrace::print(); returns the stream.
+ inline ostream& operator<<(ostream& out, const ETrace& t)
+ {
+   t.print(out);
+   return out;
+ }
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __EUNLINK_H
+#define __EUNLINK_H
+
+#include <assert.h>
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+#include "../CInode.h"
+#include "../CDentry.h"
+#include "../CDir.h"
+
+ /**
+  * EUnlink - journal event for removing a dentry.
+  *
+  * Stores a trace to the containing directory's inode, the directory
+  * version, the dentry name, and a trace built from the inode passed in.
+  */
+ class EUnlink : public LogEvent {
+  protected:
+   ETrace    diritrace;    // trace to the directory's inode
+   version_t dirv;         // directory version at construction time
+   string    dname;        // name of the dentry
+   ETrace    inodetrace;   // trace built from the inode given to the ctor
+
+  public:
+   EUnlink(CDir *dir, CDentry* dn, CInode *in) :
+     LogEvent(EVENT_UNLINK),
+     diritrace(dir->inode),
+     dirv(dir->get_version()),
+     dname(dn->get_name()),
+     inodetrace(in) { }
+   EUnlink() :
+     LogEvent(EVENT_UNLINK) { }
+
+   // Wire order: diritrace, dirv, dname, inodetrace.
+   virtual void encode_payload(bufferlist& bl) {
+     diritrace.encode(bl);
+     bl.append((char*)&dirv, sizeof(dirv));
+     ::_encode(dname, bl);
+     inodetrace.encode(bl);
+   }
+   void decode_payload(bufferlist& bl, int& off) {
+     diritrace.decode(bl,off);
+     bl.copy(off, sizeof(dirv), (char*)&dirv);  off += sizeof(dirv);
+     ::_decode(dname, bl, off);
+     inodetrace.decode(bl, off);
+   }
+
+   // defined out of line
+   bool can_expire(MDS *mds);
+   void retire(MDS *mds, Context *c);
+   bool has_happened(MDS *mds);
+   void replay(MDS *mds);
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "events/ETrace.h"
+#include "events/EMknod.h"
+#include "events/EMkdir.h"
+#include "events/EInodeUpdate.h"
+#include "events/EPurgeFinish.h"
+#include "events/EUnlink.h"
+
+#include "MDS.h"
+#include "MDCache.h"
+
+#include "config.h"
+ #undef dout // drop any earlier dout; the journal-tagged versions below both write to cout and need a local `mds` in scope
+ #define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+ #define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal "
+
+
+// -----------------------
+// ETrace
+
+ /**
+  * Rebuild in the cache the chain of directories, dentries and inodes
+  * described by this trace.
+  *
+  * For every segment, in order:
+  *  - the directory inode is looked up by dirino; if missing, a new
+  *    directory inode is created and added to the cache, and for the first
+  *    segment its dir is registered as an import (and as root when the dir's
+  *    ino is 1).  The inode's dir is opened in either case.
+  *  - the segment's inode is looked up by ino; if missing, it is created
+  *    from the logged inode, added to the cache and linked under the
+  *    directory through a new dentry that is marked dirty.  If present, its
+  *    inode contents are overwritten with the logged copy.
+  *
+  * @return the inode of the last segment, or 0 for an empty trace.
+  */
+ CInode *ETrace::restore_trace(MDS *mds)
+ {
+   CInode *in = 0;
+   for (list<bit>::iterator p = trace.begin();
+        p != trace.end();
+        ++p) {
+     // the dir
+     CInode *diri = mds->mdcache->get_inode(p->dirino);
+     if (!diri) {
+       dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl;
+       diri = new CInode(mds->mdcache);
+       diri->inode.ino = p->dirino;
+       diri->inode.mode = INODE_MODE_DIR;
+       mds->mdcache->add_inode(diri);
+
+       CDir *dir = diri->get_or_open_dir(mds);
+
+       // root? import?
+       if (p == trace.begin()) {
+         mds->mdcache->add_import(dir);
+         if (dir->ino() == 1)
+           mds->mdcache->set_root(diri);
+       }
+     } else {
+       dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl;
+       diri->get_or_open_dir(mds);
+     }
+     assert(diri->dir);
+     dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl;
+
+     // the inode
+     in = mds->mdcache->get_inode(p->inode.ino);
+     if (!in) {
+       dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl;
+       in = new CInode(mds->mdcache);
+       in->inode = p->inode;
+       mds->mdcache->add_inode(in);
+
+       // the dentry: check the pointer before using it
+       CDentry *dn = diri->dir->add_dentry( p->dn, in );
+       assert(dn);
+       dn->mark_dirty();
+     } else {
+       dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl;
+       in->inode = p->inode;
+     }
+     dout(20) << "ETrace.restore_trace in is " << *in << endl;
+   }
+   return in;
+ }
+
+
+// -----------------------
+// EMkdir
+// - trace goes to new dir's inode.
+
+ // The event can be dropped when the new dir's inode or its dir is no longer
+ // cached, or when neither the logged inode version nor the new dir itself
+ // is still waiting for a commit.
+ bool EMkdir::can_expire(MDS *mds)
+ {
+   CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+   if (!in) return true;      // obsolete
+   CDir *dir = in->dir;
+   if (!dir) return true;     // obsolete
+   CDir *pdir = in->get_parent_dir();
+   assert(pdir);
+
+   dout(10) << "EMkdir.can_expire in is " << *in << endl;
+   dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl;
+   dout(10) << "EMkdir.can_expire dir is " << *dir << endl;
+
+   const bool parent_pending = in->get_last_committed_version() < trace.back().inode.version;
+   const bool newdir_pending = dir->get_last_committed_version() == 0;
+
+   return !(parent_pending || newdir_pending);
+ }
+
+ // Commit whatever still needs it: the parent dir, the new dir, or both.
+ // When both are committed, c is wrapped in a C_Gather with one sub-context
+ // per commit.  If the parent does not need a commit, the new dir is
+ // committed regardless.
+ void EMkdir::retire(MDS *mds, Context *c)
+ {
+   CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+   assert(in);
+   CDir *dir = in->dir;
+   assert(dir);
+   CDir *pdir = in->get_parent_dir();
+   assert(pdir);
+
+   dout(10) << "EMkdir.retire in is " << *in << endl;
+   dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl;
+   dout(10) << "EMkdir.retire dir is " << *dir << endl;
+
+   const bool commitparent = in->get_last_committed_version() < trace.back().inode.version;
+   const bool commitnew = dir->get_last_committed_version() == 0;
+
+   if (!commitparent) {
+     // just new dir
+     dout(10) << "EMkdir.retire committing new dir " << *dir << endl;
+     mds->mdstore->commit_dir(dir, c);
+   } else if (!commitnew) {
+     // just parent
+     dout(10) << "EMkdir.retire committing parent dir " << *dir << endl;
+     mds->mdstore->commit_dir(pdir, c);
+   } else {
+     // both
+     dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl;
+     C_Gather *gather = new C_Gather(c);
+     mds->mdstore->commit_dir(pdir, gather->new_sub());
+     mds->mdstore->commit_dir(dir, gather->new_sub());
+   }
+ }
+
+ // Always reports the event as not yet applied.
+ bool EMkdir::has_happened(MDS *mds)
+ {
+   return false;
+ }
+
+ // Re-create the new directory from the trace, then mark its inode, its
+ // parent dir and the new dir itself dirty; the new dir is also marked
+ // complete.
+ void EMkdir::replay(MDS *mds)
+ {
+   dout(10) << "EMkdir.replay " << *this << endl;
+
+   CInode *diri = trace.restore_trace(mds);
+   diri->mark_dirty();   // the dir inode
+
+   // Parent dir: dirty it and stamp the logged version.
+   // this may end up being below water when dir is fetched from disk.
+   CDir *parent = diri->get_parent_dir();
+   if (!parent->is_dirty())
+     parent->mark_dirty();
+   parent->set_version(trace.back().dirv);
+
+   // New dir: dirty + complete.
+   CDir *newdir = diri->get_or_open_dir(mds);
+   newdir->mark_dirty();
+   newdir->mark_complete();
+ }
+
+
+
+// -----------------------
+// EMknod
+
+ // The event can be dropped when the inode is gone from cache, is no longer
+ // ours, has moved to a different version (another log entry follows), or
+ // has been committed at or past the logged version.
+ bool EMknod::can_expire(MDS *mds)
+ {
+   CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+
+   return !in
+     || !in->is_auth()                                                      // not my inode anymore!
+     || in->get_version() != trace.back().inode.version                     // i'm obsolete!
+     || in->get_last_committed_version() >= trace.back().inode.version;
+ }
+
+ // Commit the directory named by the last trace segment (the new inode's
+ // parent).  Both the directory inode and its dir must be cached (asserted).
+ void EMknod::retire(MDS *mds, Context *c)
+ {
+   CInode *parent_in = mds->mdcache->get_inode( trace.back().dirino );
+   assert(parent_in);
+   CDir *parent = parent_in->dir;
+   assert(parent);
+
+   dout(10) << "EMknod.retire committing parent dir " << *parent << endl;
+   mds->mdstore->commit_dir(parent, c);
+ }
+
+ // Always reports the event as not yet applied.
+ bool EMknod::has_happened(MDS *mds)
+ {
+   return false;
+ }
+
+ // Re-create the inode from the trace and mark it dirty, then dirty its
+ // parent dir and stamp that dir with the logged version.
+ void EMknod::replay(MDS *mds)
+ {
+   dout(10) << "EMknod.replay " << *this << endl;
+
+   CInode *created = trace.restore_trace(mds);
+   created->mark_dirty();
+
+   // this may end up being below water when dir is fetched from disk.
+   CDir *parent = created->get_parent_dir();
+   if (!parent->is_dirty())
+     parent->mark_dirty();
+   parent->set_version(trace.back().dirv);
+ }
+
+
+
+// -----------------------
+// EInodeUpdate
+
+ // The event can be dropped when the inode is gone from cache, is no longer
+ // ours, has moved to a different version (another log entry follows), or
+ // has been committed at or past the logged version.
+ bool EInodeUpdate::can_expire(MDS *mds)
+ {
+   CInode *in = mds->mdcache->get_inode( trace.back().inode.ino );
+
+   /*
+   // disabled: frozen -> exporting -> obsolete (FOR NOW?)
+   if (in->is_frozen())
+     return true;
+   */
+
+   return !in
+     || !in->is_auth()                                                      // not my inode anymore!
+     || in->get_version() != trace.back().inode.version                     // i'm obsolete!
+     || in->get_last_committed_version() >= trace.back().inode.version;
+ }
+
+ // Commit the directory named by the last trace segment (the updated
+ // inode's parent).  Both the directory inode and its dir must be cached
+ // (asserted).
+ void EInodeUpdate::retire(MDS *mds, Context *c)
+ {
+   // commit parent directory
+   CInode *diri = mds->mdcache->get_inode( trace.back().dirino );
+   assert(diri);
+   CDir *dir = diri->dir;
+   assert(dir);
+
+   // log under this event's own name, not EMknod's
+   dout(10) << "EInodeUpdate.retire committing parent dir " << *dir << endl;
+   mds->mdstore->commit_dir(dir, c);
+ }
+
+ // Always reports the event as not yet applied.
+ bool EInodeUpdate::has_happened(MDS *mds)
+ {
+   return false;
+ }
+
+ // Restore the inode from the trace and mark it dirty, then dirty its
+ // parent dir and stamp that dir with the logged version.
+ void EInodeUpdate::replay(MDS *mds)
+ {
+   dout(10) << "EInodeUpdate.replay " << *this << endl;
+
+   CInode *updated = trace.restore_trace(mds);
+   updated->mark_dirty();
+
+   // this may end up being below water when dir is fetched from disk.
+   CDir *parent = updated->get_parent_dir();
+   if (!parent->is_dirty())
+     parent->mark_dirty();
+   parent->set_version(trace.back().dirv);
+ }
+
+
+
+// -----------------------
+// EUnlink
+
+ // Not expirable while the cached directory is committed below dirv, or
+ // while the cached inode (if an inode trace was recorded) is committed
+ // below its logged version.  Objects missing from the cache do not block.
+ bool EUnlink::can_expire(MDS *mds)
+ {
+   // dir
+   CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
+   CDir *dir = diri ? diri->dir : 0;
+   if (dir && dir->get_last_committed_version() < dirv)
+     return false;
+
+   // inode
+   if (inodetrace.trace.empty())
+     return true;
+
+   CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino );
+   return !(in && in->get_last_committed_version() < inodetrace.back().inode.version);
+ }
+
+ // Commit the directory the dentry was removed from, at the logged version
+ // dirv.  The directory inode and its dir must both be cached (asserted,
+ // matching the other retire() implementations in this file).
+ void EUnlink::retire(MDS *mds, Context *c)
+ {
+   CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino );
+   assert(diri);   // check the lookup before dereferencing it
+   CDir *dir = diri->dir;
+   assert(dir);
+
+   // okay!
+   dout(7) << "commiting dirty (from unlink) dir " << *dir << endl;
+   mds->mdstore->commit_dir(dir, dirv, c);
+ }
+
+ // Always reports the event as already applied.
+ bool EUnlink::has_happened(MDS *mds)
+ {
+   return true;
+ }
+
+ // Replay is a no-op for unlink events.
+ void EUnlink::replay(MDS *mds)
+ {
+ }
+
+
+
+
+// -----------------------
+// EPurgeFinish
+
+
+ // A purge-finish event is always expirable.
+ bool EPurgeFinish::can_expire(MDS *mds)
+ {
+   return true;
+ }
+
+ // Nothing is committed here; note that c is not invoked by this function.
+ void EPurgeFinish::retire(MDS *mds, Context *c)
+ {
+ }
+
+ // Always reports the event as already applied.
+ bool EPurgeFinish::has_happened(MDS *mds)
+ {
+   return true;
+ }
+
+ // Replay is a no-op for purge-finish events.
+ void EPurgeFinish::replay(MDS *mds)
+ {
+ }
+
+
+
+
--- /dev/null
+#ifndef __MDSTYPES_H
+#define __MDSTYPES_H
+
+
+#include <math.h>
+#include <ostream>
+using namespace std;
+
+#include "config.h"
+#include "common/DecayCounter.h"
+
+#include <cassert>
+
+
+/* meta_load_t
+ * hierarchical load for an inode/dir and its children
+ */
+ #define META_POP_IRD 0 // index into meta_load_t::pop; printed as "rd" by operator<< — presumably inode reads, confirm
+ #define META_POP_IWR 1 // index into meta_load_t::pop; printed as "wr", counted twice in meta_load() — presumably inode writes, confirm
+ #define META_POP_DWR 2 // index into meta_load_t::pop; not used by meta_load() or operator<< — presumably dir writes, confirm
+ //#define META_POP_LOG 3
+ //#define META_POP_FDIR 4
+ //#define META_POP_CDIR 4
+ #define META_NPOP 3 // number of counters in meta_load_t::pop
+
+ /**
+  * meta_load_t - a set of META_NPOP decaying popularity counters.
+  */
+ class meta_load_t {
+  public:
+   DecayCounter pop[META_NPOP];
+
+   // Combined load: the IRD counter plus twice the IWR counter.
+   double meta_load() {
+     return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get();
+   }
+
+   // Copy every counter from other into this object, resetting each of
+   // other's counters right after it has been copied.
+   void take(meta_load_t& other) {
+     for (int which = 0; which < META_NPOP; ++which) {
+       pop[which] = other.pop[which];
+       other.pop[which].reset();
+     }
+   }
+ };
+
+ // Print a meta_load_t as "metaload<rd R, wr W>"; only the IRD and IWR
+ // counters are shown.
+ inline ostream& operator<<( ostream& out, meta_load_t& load )
+ {
+   out << "metaload<rd " << load.pop[META_POP_IRD].get();
+   out << ", wr " << load.pop[META_POP_IWR].get();
+   out << ">";
+   return out;
+ }
+
+
+ // Lower each counter of l by the current value of the matching counter
+ // in r; returns l.
+ inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r)
+ {
+   for (int which = 0; which < META_NPOP; ++which) {
+     l.pop[which].adjust(- r.pop[which].get());
+   }
+   return l;
+ }
+
+ // Raise each counter of l by the current value of the matching counter
+ // in r; returns l.
+ inline meta_load_t& operator+=(meta_load_t& l, meta_load_t& r)
+ {
+   for (int which = 0; which < META_NPOP; ++which) {
+     l.pop[which].adjust(r.pop[which].get());
+   }
+   return l;
+ }
+
+
+
+/* mds_load_t
+ * mds load
+ */
+
+// popularity classes
+#define MDS_POP_JUSTME 0 // just me (this dir or inode)
+#define MDS_POP_NESTED 1 // me + children, auth or not
+#define MDS_POP_CURDOM 2 // me + children in current auth domain
+#define MDS_POP_ANYDOM 3 // me + children in any (nested) auth domain
+//#define MDS_POP_DIRMOD 4 // just this dir, modifications only
+#define MDS_NPOP 4
+
+ /**
+  * mds_load_t - load summary for one mds: the root metadata load plus the
+  * request rate, cache hit rate and queue length.
+  */
+ class mds_load_t {
+  public:
+   meta_load_t root;
+
+   double req_rate;
+   double cache_hit_rate;
+   double queue_len;
+
+   mds_load_t() :
+     req_rate(0),
+     cache_hit_rate(0),
+     queue_len(0) { }
+
+   // Scalar load, selected by g_conf.mds_bal_mode:
+   //   0: root metadata load + request rate + 10 * queue length
+   //   1: request rate + 10 * queue length
+   // Any other mode trips an assert.
+   double mds_load() {
+     if (g_conf.mds_bal_mode == 0)
+       return root.meta_load()
+         + req_rate
+         + 10.0*queue_len;
+
+     if (g_conf.mds_bal_mode == 1)
+       return req_rate + 10.0*queue_len;
+
+     assert(0);
+     return 0;
+   }
+
+ };
+
+
+ // Print an mds_load_t as "mdsload<ROOT, req R, hr H, qlen Q>".
+ inline ostream& operator<<( ostream& out, mds_load_t& load )
+ {
+   out << "mdsload<" << load.root;
+   out << ", req " << load.req_rate;
+   out << ", hr " << load.cache_hit_rate;
+   out << ", qlen " << load.queue_len;
+   out << ">";
+   return out;
+ }
+
+/*
+inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r )
+{
+ l.root_pop += r.root_pop;
+ l.req_rate += r.req_rate;
+ l.queue_len += r.queue_len;
+ return l;
+}
+
+inline mds_load_t operator/( mds_load_t& a, double d )
+{
+ mds_load_t r;
+ r.root_pop = a.root_pop / d;
+ r.req_rate = a.req_rate / d;
+ r.queue_len = a.queue_len / d;
+ return r;
+}
+*/
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+/*
+
+
+OLD LOCK CRAP:
+ (old):
+ sync - soft metadata.. no reads/writes can proceed. (eg no stat)
+ lock - hard(+soft) metadata.. path traversals stop etc. (??)
+
+
+ replication consistency modes:
+ hard+soft - hard and soft are defined on all replicas.
+ all reads proceed (in absence of sync lock)
+ writes require sync lock, fw to auth
+ -> normal behavior.
+
+ hard - hard only, soft is undefined
+ reads require a sync
+ writes proceed if field updates are monotonic (e.g. size, m/c/atime)
+ -> 'softasync'
+
+ types of access by cache users:
+
+ hard soft
+ R - read_hard_try path traversal
+ R <= R read_soft_start stat
+ R <= W write_soft_start touch
+ W => W write_hard_start chmod
+
+ note on those implications:
+ read_soft_start() calls read_hard_try()
+ write_soft_start() calls read_hard_try()
+ a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a
+ lock is held)
+
+
+ relationship with frozen directories:
+
+ read_hard_try - can proceed, because any hard changes require a lock, which
+ requires an active authority, which implies things are unfrozen.
+ write_hard_start - waits (has to; only auth can initiate)
+ read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth)
+ write_soft_start - ???? waits for now. (FIXME: if (softasync & !syncbyauth))
+
+ if sticky is on, an export_dir will drop any sync or lock so that the freeze will
+ proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing().
+
+
+
+NAMESPACE:
+
+ none right now.
+
+
+*/
+
+
+/* soft sync locks: mtime, size, etc.
+ */
+
+ bool MDCache::read_soft_start(CInode *in, Message *m) // returns true if the caller may read now; false if m was queued for retry or forwarded to the auth
+ {
+ // (disabled) if (!read_hard_try(in, m))
+ // (disabled) return false;
+
+ // if frozen: i can't proceed (for now, see above)
+ if (in->is_frozen()) {
+ dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl;
+ in->add_waiter(CDIR_WAIT_UNFREEZE, // NOTE(review): a CDIR_* wait constant is used on an inode here — confirm this is intended
+ new C_MDS_RetryMessage(mds, m));
+ return false;
+ }
+
+
+ dout(5) << "read_soft_start " << *in << endl;
+
+ // what soft sync mode? (each branch either jumps to yes, returns, or falls through to the "we need sync" section)
+
+ if (in->is_softasync()) {
+ // softasync: hard consistency only
+
+ if (in->is_auth()) {
+ // i am auth: i need sync
+ if (in->is_syncbyme()) goto yes;
+ if (in->is_lockbyme()) goto yes; // lock => sync
+ if (!in->is_cached_by_anyone() &&
+ !in->is_open_write()) goto yes; // i'm alone
+ } else {
+ // i am replica: fw to auth
+ int auth = in->authority();
+ dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mds->messenger->send_message(m,
+ MSG_ADDR_MDS(auth), m->get_dest_port(),
+ MDS_PORT_CACHE);
+ return false;
+ }
+ } else {
+ // normal: soft+hard consistency
+
+ if (in->is_syncbyauth()) {
+ // wait for sync (fall through to the waiting code below)
+ } else {
+ // i'm consistent
+ goto yes;
+ }
+ }
+
+ // we need sync: only normal+syncbyauth or softasync+auth can reach this point
+ if (in->is_syncbyauth() && !in->is_softasync()) {
+ dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl;
+ } else if (in->is_softasync() && in->is_auth()) {
+ dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl;
+ } else
+ assert(2+2==5); // always-false assert: flags a state combination the branches above should not produce
+
+ if (!in->can_auth_pin()) {
+ dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl;
+ in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+ new C_MDS_RetryMessage(mds,m));
+ return false;
+ }
+
+ if (in->is_auth()) {
+ // wait for sync; start one unless a presync is already flagged
+ in->add_waiter(CINODE_WAIT_SYNC,
+ new C_MDS_RetryMessage(mds, m));
+
+ if (!in->is_presync())
+ inode_sync_start(in);
+ } else {
+ // wait for unsync; register the wait unless one is already flagged
+ in->add_waiter(CINODE_WAIT_UNSYNC,
+ new C_MDS_RetryMessage(mds, m));
+
+ assert(in->is_syncbyauth());
+
+ if (!in->is_waitonunsync())
+ inode_sync_wait(in);
+ }
+
+ return false;
+
+ yes: // read may proceed: record the hit with the balancer
+ mds->balancer->hit_inode(in, MDS_POP_SOFTRD);
+ mds->balancer->hit_inode(in, MDS_POP_ANY);
+ return true;
+ }
+
+
+ // End of a soft read: only logs the inode; there is nothing to release.
+ // Always returns 0.
+ int MDCache::read_soft_finish(CInode *in)
+ {
+   // " soft_sync_count " << in->soft_sync_count << endl;
+   dout(5) << "read_soft_finish " << *in << endl;
+
+   return 0;
+ }
+
+
+ /**
+  * Try to begin a write to an inode's soft metadata.
+  *
+  * Returns true when the caller may write now (the hit is recorded with the
+  * balancer).  Returns false when the request cannot proceed yet; in that
+  * case m has either been queued on the inode as a C_MDS_RetryMessage
+  * waiter or forwarded to the inode's authority.
+  *
+  * Decision, in order:
+  *  - frozen inode: wait for unfreeze.
+  *  - softasync mode: write unless the inode is syncbyauth.
+  *  - normal mode on the auth: write if we hold the sync or lock, or nobody
+  *    else caches the inode and it is not open for write; otherwise wait
+  *    for a sync.
+  *  - normal mode on a replica: forward m to the auth.
+  */
+ bool MDCache::write_soft_start(CInode *in, Message *m)
+ {
+   // if (!read_hard_try(in, m))
+   //return false;
+
+   // if frozen: i can't proceed (for now, see above)
+   if (in->is_frozen()) {
+     // log under this function's own name, not read_soft_start's
+     dout(7) << "write_soft_start " << *in << " is frozen, waiting" << endl;
+     // NOTE(review): a CDIR_* wait constant is used on an inode here — confirm intended
+     in->add_waiter(CDIR_WAIT_UNFREEZE,
+                    new C_MDS_RetryMessage(mds, m));
+     return false;
+   }
+
+   dout(5) << "write_soft_start " << *in << endl;
+   // what soft sync mode?
+
+   if (in->is_softasync()) {
+     // softasync: hard consistency only
+
+     if (in->is_syncbyauth()) {
+       // wait for sync release
+     } else {
+       // i'm inconsistent; write away!
+       goto yes;
+     }
+
+   } else {
+     // normal: soft+hard consistency
+
+     if (in->is_auth()) {
+       // i am auth: i need sync
+       if (in->is_syncbyme()) goto yes;
+       if (in->is_lockbyme()) goto yes;  // lock => sync
+       if (!in->is_cached_by_anyone() &&
+           !in->is_open_write()) goto yes;  // i'm alone
+     } else {
+       // i am replica: fw to auth
+       int auth = in->authority();
+       dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl;
+       assert(auth != mds->get_nodeid());
+       mds->messenger->send_message(m,
+                                    MSG_ADDR_MDS(auth), m->get_dest_port(),
+                                    MDS_PORT_CACHE);
+       return false;
+     }
+   }
+
+   // we need sync
+   if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) {
+     dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl;
+   } else if (!in->is_softasync() && in->is_auth()) {
+     dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl;
+   } else
+     assert(2+2==5);  // always-false assert: unexpected state combination
+
+   if (!in->can_auth_pin()) {
+     dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl;
+     in->add_waiter(CINODE_WAIT_AUTHPINNABLE,
+                    new C_MDS_RetryMessage(mds,m));
+     return false;
+   }
+
+   if (in->is_auth()) {
+     // wait for sync
+     in->add_waiter(CINODE_WAIT_SYNC,
+                    new C_MDS_RetryMessage(mds, m));
+
+     if (!in->is_presync())
+       inode_sync_start(in);
+   } else {
+     // wait for unsync
+     in->add_waiter(CINODE_WAIT_UNSYNC,
+                    new C_MDS_RetryMessage(mds, m));
+
+     assert(in->is_syncbyauth());
+     assert(in->is_softasync());
+
+     if (!in->is_waitonunsync())
+       inode_sync_wait(in);
+   }
+
+   return false;
+
+  yes:
+   mds->balancer->hit_inode(in, MDS_POP_SOFTWR);
+   mds->balancer->hit_inode(in, MDS_POP_ANY);
+   return true;
+ }
+
+
+int MDCache::write_soft_finish(CInode *in)
+{
+  // Nothing is released here: the call is only traced, then 0 is returned.
+  dout(5) << "write_soft_finish " << *in << endl;
+  return 0;
+}
+
+
+
+
+
+
+
+
+/* hard locks: owner, mode
+ */
+
+/*
+bool MDCache::read_hard_try(CInode *in,
+ Message *m)
+{
+ //dout(5) << "read_hard_try " << *in << endl;
+
+ if (in->is_auth()) {
+ // auth
+ goto yes; // fine
+ } else {
+ // replica
+ if (in->is_lockbyauth()) {
+ // locked by auth; wait!
+ dout(7) << "read_hard_try waiting on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m));
+ if (!in->is_waitonunlock())
+ inode_lock_wait(in);
+ return false;
+ } else {
+ // not locked.
+ goto yes;
+ }
+ }
+
+ yes:
+ mds->balancer->hit_inode(in, MDS_POP_HARDRD);
+ mds->balancer->hit_inode(in, MDS_POP_ANY);
+ return true;
+}
+
+
+bool MDCache::write_hard_start(CInode *in,
+ Message *m)
+{
+ // if frozen: i can't proceed; only auth can initiate lock
+ if (in->is_frozen()) {
+ dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl;
+ in->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryMessage(mds, m));
+ return false;
+ }
+
+ // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where
+ // the freeze is waiting for our lock to be released)
+
+
+ if (in->is_auth()) {
+ // auth
+ if (in->is_lockbyme()) goto success;
+ if (!in->is_cached_by_anyone()) goto success;
+
+ // need lock
+ if (!in->can_auth_pin()) {
+ dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl;
+ in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m));
+ return false;
+ }
+
+ in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m));
+
+ if (!in->is_prelock())
+ inode_lock_start(in);
+
+ return false;
+ } else {
+ // replica
+ // fw to auth
+ int auth = in->authority();
+ dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mds->messenger->send_message(m,
+ MSG_ADDR_MDS(auth), m->get_dest_port(),
+ MDS_PORT_CACHE);
+ return false;
+ }
+
+ success:
+ in->lock_active_count++;
+ dout(5) << "write_hard_start " << *in << " count now " << in->lock_active_count << endl;
+ assert(in->lock_active_count > 0);
+
+ mds->balancer->hit_inode(in, MDS_POP_HARDWR);
+ mds->balancer->hit_inode(in, MDS_POP_ANY);
+ return true;
+}
+
+void MDCache::write_hard_finish(CInode *in)
+{
+ in->lock_active_count--;
+ dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl;
+ assert(in->lock_active_count >= 0);
+
+ // release lock?
+ if (in->lock_active_count == 0 &&
+ in->is_lockbyme() &&
+ !g_conf.mdcache_sticky_lock) {
+ dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl;
+ inode_lock_release(in);
+ }
+}
+
+
+void MDCache::inode_lock_start(CInode *in)
+{
+ dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl;
+
+ assert(in->is_auth());
+ assert(!in->is_prelock());
+ assert(!in->is_lockbyme());
+ assert(!in->is_lockbyauth());
+
+ in->lock_waiting_for_ack = in->cached_by;
+ in->dist_state |= CINODE_DIST_PRELOCK;
+ in->get(CINODE_PIN_PRELOCK);
+ in->auth_pin();
+
+ // send messages
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()),
+ MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+ MDS_PORT_CACHE);
+ }
+}
+
+
+void MDCache::inode_lock_release(CInode *in)
+{
+ dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl;
+
+ assert(in->is_lockbyme());
+ assert(in->is_auth());
+
+ in->dist_state &= ~CINODE_DIST_LOCKBYME;
+
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ mds->messenger->send_message(new MInodeLockRelease(in),
+ MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+ MDS_PORT_CACHE);
+ }
+
+ in->auth_unpin();
+}
+
+void MDCache::inode_lock_wait(CInode *in)
+{
+ dout(5) << "lock_wait on " << *in << endl;
+ assert(!in->is_auth());
+ assert(in->is_lockbyauth());
+
+ in->dist_state |= CINODE_DIST_WAITONUNLOCK;
+ in->get(CINODE_PIN_WAITONUNLOCK);
+}
+
+
+void MDCache::handle_inode_lock_start(MInodeLockStart *m)
+{
+ // authority is requesting a lock
+ CInode *in = get_inode(m->get_ino());
+ if (!in) {
+ // don't have it anymore!
+ dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
+ mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false),
+ MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+ MDS_PORT_CACHE);
+ delete m; // done
+ return;
+ }
+
+ // we shouldn't be authoritative...
+ assert(!in->is_auth());
+
+ dout(7) << "handle_lock_start " << *in << ", sending ack" << endl;
+
+ // lock it
+ in->dist_state |= CINODE_DIST_LOCKBYAUTH;
+
+ // sanity check: make sure we know who _is_ authoritative!
+ assert(m->get_asker() == in->authority());
+
+ // send ack
+ mds->messenger->send_message(new MInodeLockAck(in->ino()),
+ MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+ MDS_PORT_CACHE);
+
+ delete m; // done
+}
+
+
+void MDCache::handle_inode_lock_ack(MInodeLockAck *m)
+{
+ CInode *in = get_inode(m->get_ino());
+ int from = m->get_source();
+ dout(7) << "handle_lock_ack from " << from << " on " << *in << endl;
+
+ assert(in);
+ assert(in->is_auth());
+ assert(in->dist_state & CINODE_DIST_PRELOCK);
+
+ // remove it from waiting list
+ in->lock_waiting_for_ack.erase(from);
+
+ if (!m->did_have()) {
+ // erase from cached_by too!
+ in->cached_by_remove(from);
+ }
+
+ if (in->lock_waiting_for_ack.size()) {
+
+ // more coming
+ dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl;
+
+ } else {
+
+ // yay!
+ dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl;
+
+ in->dist_state &= ~CINODE_DIST_PRELOCK;
+ in->dist_state |= CINODE_DIST_LOCKBYME;
+ in->put(CINODE_PIN_PRELOCK);
+
+ // do waiters!
+ in->finish_waiting(CINODE_WAIT_LOCK);
+ }
+
+ delete m; // done
+}
+
+
+void MDCache::handle_inode_lock_release(MInodeLockRelease *m)
+{
+ CInode *in = get_inode(m->get_ino());
+
+ if (!in) {
+ dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl;
+ delete m; // done
+ return;
+ }
+
+ if (!in->is_lockbyauth()) {
+ dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl;
+ assert(0); // i should have it, locked, or not have it at all!
+ delete m; // done
+ return;
+ }
+
+ dout(7) << "handle_lock_release " << *in << endl;
+ assert(!in->is_auth());
+
+ // release state
+ in->dist_state &= ~CINODE_DIST_LOCKBYAUTH;
+
+ // waiters?
+ if (in->is_waitonunlock()) {
+ in->put(CINODE_PIN_WAITONUNLOCK);
+ in->dist_state &= ~CINODE_DIST_WAITONUNLOCK;
+
+ // finish
+ in->finish_waiting(CINODE_WAIT_UNLOCK);
+ }
+
+ // done
+ delete m;
+}
+*/
+
+
+
+
+
+
+
+
+
+// sync interface
+
+/*
+ * inode_sync_wait
+ *
+ * Replica side: mark 'in' as waiting for the authority's sync to go away
+ * (WAITONUNSYNC flag plus the matching pin).  When the applicable
+ * sticky-sync option is on, also send the authority an MInodeSyncRecall.
+ */
+void MDCache::inode_sync_wait(CInode *in)
+{
+  assert(!in->is_auth());
+
+  const int auth_mds = in->authority();
+  dout(5) << "inode_sync_wait on " << *in << ", auth " << auth_mds << endl;
+
+  assert(in->is_syncbyauth());
+  assert(!in->is_waitonunsync());
+
+  in->dist_state |= CINODE_DIST_WAITONUNSYNC;
+  in->get(CINODE_PIN_WAITONUNSYNC);
+
+  // pick the sticky option that matches this inode's soft mode
+  bool sticky;
+  if (in->is_softasync())
+    sticky = g_conf.mdcache_sticky_sync_softasync;
+  else
+    sticky = g_conf.mdcache_sticky_sync_normal;
+
+  // if !sticky, auth will immediately release; no recall needed.
+  if (!sticky)
+    return;
+
+  dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl;
+  mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino),
+                               MSG_ADDR_MDS(auth_mds), MDS_PORT_CACHE,
+                               MDS_PORT_CACHE);
+}
+
+
+/*
+ * inode_sync_start
+ *
+ * Authority side: enter PRESYNC on 'in' (flag, pin, auth_pin), then send an
+ * MInodeSyncStart to every MDS in cached_by and to every distinct client
+ * that has the inode open for write.  Each recipient is recorded in
+ * sync_waiting_for_ack.
+ */
+void MDCache::inode_sync_start(CInode *in)
+{
+  dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl;
+
+  assert(in->is_auth());
+  assert(!in->is_presync());
+  assert(!in->is_sync());
+
+  in->sync_waiting_for_ack.clear();
+  in->dist_state |= CINODE_DIST_PRESYNC;
+  in->get(CINODE_PIN_PRESYNC);
+  in->auth_pin();
+
+  in->sync_replicawantback = false;
+
+  // replicas
+  set<int>::iterator rep = in->cached_by_begin();
+  while (rep != in->cached_by_end()) {
+    in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*rep));
+    mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*rep), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    ++rep;
+  }
+
+  // writing clients: only 1 message per client, even if open multiple times.
+  // equal ids are adjacent in the multiset, so remembering the previous id is enough.
+  int prev_client = -1;
+  for (multiset<int>::iterator wr = in->get_open_write().begin();
+       wr != in->get_open_write().end();
+       ++wr) {
+    const int client = *wr;
+    if (client == prev_client)
+      continue;
+    prev_client = client;
+
+    in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(client));
+    mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_CLIENT(client), 0,
+                                 MDS_PORT_CACHE);
+  }
+}
+
+/*
+ * inode_sync_release
+ *
+ * Authority side: clear SYNCBYME on 'in', send an MInodeSyncRelease to every
+ * MDS in cached_by and to every entry of the open-for-write set, then drop
+ * the auth pin.
+ */
+void MDCache::inode_sync_release(CInode *in)
+{
+  dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl;
+
+  assert(in->is_syncbyme());
+  assert(in->is_auth());
+
+  in->dist_state &= ~CINODE_DIST_SYNCBYME;
+
+  // replicas
+  set<int>::iterator rep = in->cached_by_begin();
+  while (rep != in->cached_by_end()) {
+    mds->messenger->send_message(new MInodeSyncRelease(in),
+                                 MSG_ADDR_MDS(*rep), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    ++rep;
+  }
+
+  // writers (one message per entry of the multiset)
+  multiset<int>::iterator wr = in->get_open_write().begin();
+  while (wr != in->get_open_write().end()) {
+    mds->messenger->send_message(new MInodeSyncRelease(in),
+                                 MSG_ADDR_CLIENT(*wr), 0,
+                                 MDS_PORT_CACHE);
+    ++wr;
+  }
+
+  in->auth_unpin();
+}
+
+
+
+
+// messages
+/*
+ * handle_inode_sync_start
+ *
+ * Replica side: the authority asks us to sync an inode.
+ *
+ *  - inode not in cache: reply with a negative MInodeSyncAck and free m.
+ *  - no local write clients: ack immediately (inode_sync_ack frees m).
+ *  - write clients present: send each distinct client an MInodeSyncStart,
+ *    remember them in sync_waiting_for_ack, and park m in
+ *    pending_sync_request until the client acks have come back.
+ */
+void MDCache::handle_inode_sync_start(MInodeSyncStart *m)
+{
+  // assume asker == authority for now.
+
+  // authority is requesting a lock
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    // don't have it anymore!
+    dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
+    mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false),
+                                 MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    delete m; // done
+    return;
+  }
+
+  dout(10) << "handle_sync_start " << *in << endl;
+
+  // we shouldn't be authoritative...
+  assert(!in->is_auth());
+
+  // sanity check: make sure we know who _is_ authoritative!
+  assert(m->get_asker() == in->authority());
+
+  // lock it
+  in->dist_state |= CINODE_DIST_SYNCBYAUTH;
+
+  if (!in->is_open_write()) {
+    // no writers, ack.
+    dout(7) << "handle_sync_start " << *in << ", sending ack" << endl;
+    inode_sync_ack(in, m);
+    return;
+  }
+
+  dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl;
+
+  // sync clients.  only 1 message per client, even if it has the file open
+  // several times: the waiting set holds each client once, so a second ack
+  // from the same client would arrive after pending_sync_request was cleared.
+  // equal ids are adjacent in the multiset, so tracking the previous id suffices.
+  in->sync_waiting_for_ack.clear();
+  int last = -1;
+  for (multiset<int>::iterator it = in->get_open_write().begin();
+       it != in->get_open_write().end();
+       ++it) {
+    if (*it == last)
+      continue;
+    last = *it;
+
+    in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it));
+    mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_CLIENT(*it), 0,
+                                 MDS_PORT_CACHE);
+  }
+
+  // keep the request until the last client ack arrives
+  in->pending_sync_request = m;
+}
+
+/*
+ * inode_sync_ack
+ *
+ * Send a positive MInodeSyncAck for 'in' (carrying 'wantback') to the MDS
+ * that issued the sync request m, then free m.
+ */
+void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback)
+{
+  dout(7) << "sending inode_sync_ack " << *in << endl;
+
+  MInodeSyncAck *ack = new MInodeSyncAck(in->ino(), true, wantback);
+  mds->messenger->send_message(ack,
+                               MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                               MDS_PORT_CACHE);
+
+  // the request is consumed here
+  delete m;
+}
+
+/*
+ * handle_inode_sync_ack
+ *
+ * One recipient of an MInodeSyncStart has answered.  Remove it from
+ * sync_waiting_for_ack; once the set is empty:
+ *   - on a replica, pass the ack on to the authority for the parked request;
+ *   - on the authority, move PRESYNC -> SYNCBYME, wake the SYNC waiters,
+ *     and then possibly release the sync again right away.
+ * m is freed on every path.
+ */
+void MDCache::handle_inode_sync_ack(MInodeSyncAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl;
+
+  if (!in->is_auth()) {
+    assert(in->is_syncbyauth());
+    assert(in->pending_sync_request);
+  } else {
+    assert(in->is_presync());
+  }
+
+  // remove it from waiting list
+  in->sync_waiting_for_ack.erase(m->get_source());
+
+  // NOTE(review): cached_by is iterated as MDS ids elsewhere in this file,
+  // yet this branch only fires for client sources -- confirm the intent.
+  if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) {
+    // erase from cached_by too!
+    in->cached_by_remove(m->get_source());
+  }
+
+  if (m->replica_wantsback())
+    in->sync_replicawantback = true;
+
+  // more coming?
+  if (in->sync_waiting_for_ack.size()) {
+    dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl;
+    delete m; // done
+    return;
+  }
+
+  // yay!
+  dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl;
+
+  if (!in->is_auth()) {
+    // replica, sync ack back to auth (inode_sync_ack frees the parked request)
+    assert(in->pending_sync_request);
+    inode_sync_ack(in, in->pending_sync_request, true);
+    in->pending_sync_request = 0;
+    delete m;
+    return;
+  }
+
+  in->dist_state &= ~CINODE_DIST_PRESYNC;
+  in->dist_state |= CINODE_DIST_SYNCBYME;
+  in->put(CINODE_PIN_PRESYNC);
+
+  // do waiters!
+  in->finish_waiting(CINODE_WAIT_SYNC);
+
+  // the waiters may have dropped the sync already
+  if (!in->is_syncbyme()) {
+    dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl;
+    delete m; // done
+    return;
+  }
+
+  // release sync right away?
+  bool release_now = true;
+  if (in->is_freezing()) {
+    dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl;
+  } else if (in->sync_replicawantback) {
+    dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl;
+  } else {
+    bool sticky;
+    if (in->is_softasync())
+      sticky = g_conf.mdcache_sticky_sync_softasync;
+    else
+      sticky = g_conf.mdcache_sticky_sync_normal;
+
+    if (!sticky) {
+      dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl;
+    } else {
+      dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl;
+      release_now = false;
+    }
+  }
+
+  if (release_now)
+    inode_sync_release(in);
+
+  delete m; // done
+}
+
+
+/*
+ * handle_inode_sync_release
+ *
+ * Replica side: the authority dropped its sync on an inode.  Clear
+ * SYNCBYAUTH, wake anything waiting on UNSYNC, and pass the release on to
+ * the entries of the open-for-write set.  m is freed on every path.
+ */
+void MDCache::handle_inode_sync_release(MInodeSyncRelease *m)
+{
+  CInode *in = get_inode(m->get_ino());
+
+  if (in == 0) {
+    dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl;
+    delete m; // done
+    return;
+  }
+
+  if (!in->is_syncbyauth()) {
+    dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl;
+    assert(0); // this shouldn't happen.
+    delete m; // done
+    return;
+  }
+
+  dout(7) << "handle_sync_release " << *in << endl;
+  assert(!in->is_auth());
+
+  // release state
+  in->dist_state &= ~CINODE_DIST_SYNCBYAUTH;
+
+  // waiters?  drop the wait pin and flag, then run them
+  if (in->is_waitonunsync()) {
+    in->put(CINODE_PIN_WAITONUNSYNC);
+    in->dist_state &= ~CINODE_DIST_WAITONUNSYNC;
+    in->finish_waiting(CINODE_WAIT_UNSYNC);
+  }
+
+  // clients holding the inode open for write?
+  if (in->is_open_write()) {
+    dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl;
+    multiset<int>::iterator wr = in->get_open_write().begin();
+    while (wr != in->get_open_write().end()) {
+      mds->messenger->send_message(new MInodeSyncRelease(in),
+                                   MSG_ADDR_CLIENT(*wr), 0,
+                                   MDS_PORT_CACHE);
+      ++wr;
+    }
+  }
+
+  // done
+  delete m;
+}
+
+
+/*
+ * handle_inode_sync_recall
+ *
+ * A replica asks for the sync on an inode to be given up.  If we are not
+ * the authority the message is handed to do_ino_proxy.  Otherwise: release
+ * the sync if we hold it, flag sync_replicawantback if it is still being
+ * acquired, and drop the request in any other state.
+ */
+void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m)
+{
+  CInode *in = get_inode(m->get_ino());
+
+  if (in == 0) {
+    dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl;
+    assert(0); // shouldn't happen
+    delete m; // done
+    return;
+  }
+
+  if (!in->is_auth()) {
+    // not ours to decide; m is passed along and not freed here
+    do_ino_proxy(in, m);
+    return;
+  }
+
+  if (in->is_syncbyme()) {
+    dout(7) << "handle_sync_recall " << *in << ", releasing" << endl;
+    inode_sync_release(in);
+  } else if (in->is_presync()) {
+    dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl;
+    in->sync_replicawantback = true;
+  } else {
+    dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl;
+  }
+
+  // done
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+// DIR SYNC
+
+/*
+
+ dir sync
+
+ - these are used only when a directory is HASHED. namely,
+ - to stat the dir inode we need an accurate directory size (????)
+ - for a readdir
+
+*/
+
+/*
+ * dir_sync_start
+ *
+ * Authority side, hashed directories only: enter PRESYNC on 'dir', take an
+ * auth pin, and send an MDirSyncStart to every MDS in the cluster's MDS set.
+ * NOTE(review): the waiting set is copied straight from get_mds_set();
+ * whether it contains this node is not visible here.
+ */
+void MDCache::dir_sync_start(CDir *dir)
+{
+  dout(5) << "sync_start on " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(!dir->is_presync());
+  assert(!dir->is_sync());
+
+  // wait for all replicas
+  dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set();
+  dir->state_set(CDIR_STATE_PRESYNC);
+  dir->auth_pin();
+
+  // send messages
+  set<int>::iterator peer = dir->sync_waiting_for_ack.begin();
+  while (peer != dir->sync_waiting_for_ack.end()) {
+    mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*peer), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    ++peer;
+  }
+}
+
+
+void MDCache::dir_sync_release(CDir *dir)
+{
+  // stub: the body is empty, so calling this does nothing and 'dir' is unused.
+
+}
+
+void MDCache::dir_sync_wait(CDir *dir)
+{
+  // stub: the body is empty, so calling this does nothing and 'dir' is unused.
+}
+
+
+void handle_dir_sync_start(MDirSyncStart *m)
+{  // stub: empty body, m is neither read nor freed.  NOTE(review): unlike its neighbours this is not qualified with MDCache:: -- confirm that a free function is intended.
+}
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MANCHORREPLY_H
+#define __MANCHORREPLY_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+#include "MAnchorRequest.h"
+
+
+// Reply to an MAnchorRequest: echoes the request's op and ino and carries a
+// trace of Anchor pointers.  The Anchors in the trace are deleted by the
+// destructor.
+class MAnchorReply : public Message {
+  int op;                   // copied from the request
+  inodeno_t ino;            // copied from the request
+  vector<Anchor*> trace;    // every element is deleted in ~MAnchorReply
+
+ public:
+  // used before decode_payload(); op and ino are left unset
+  MAnchorReply() {}
+  MAnchorReply(MAnchorRequest *req)
+    : Message(MSG_MDS_ANCHORREPLY),
+      op(req->get_op()),
+      ino(req->get_ino()) {}
+  ~MAnchorReply() {
+    for (vector<Anchor*>::iterator a = trace.begin(); a != trace.end(); ++a)
+      delete *a;
+  }
+  virtual char *get_type_name() { return "arep"; }
+
+  // copies the pointers only; the Anchors they point to will be deleted by
+  // this message's destructor.
+  void set_trace(vector<Anchor*>& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector<Anchor*>& get_trace() { return trace; }
+
+  // payload layout: op, ino, int count, then count encoded Anchors
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+
+    int count;
+    payload.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+
+    for (int left = count; left > 0; --left) {
+      Anchor *anchor = new Anchor;
+      anchor->_decode(payload, off);
+      trace.push_back(anchor);
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+
+    int count = trace.size();
+    payload.append((char*)&count, sizeof(int));
+    for (int idx = 0; idx < count; ++idx)
+      trace[idx]->_encode(payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MANCHORREQUEST_H
+#define __MANCHORREQUEST_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+#define ANCHOR_OP_CREATE 1
+#define ANCHOR_OP_DESTROY 2
+#define ANCHOR_OP_LOOKUP 3
+#define ANCHOR_OP_UPDATE 4
+
+// Anchor table request: an operation code (ANCHOR_OP_*), the inode it
+// applies to, and a trace of Anchor pointers.  The Anchors in the trace are
+// deleted by the destructor.
+class MAnchorRequest : public Message {
+  int op;                   // one of the ANCHOR_OP_* values
+  inodeno_t ino;
+  vector<Anchor*> trace;    // every element is deleted in ~MAnchorRequest
+
+ public:
+  // used before decode_payload(); op and ino are left unset
+  MAnchorRequest() {}
+  MAnchorRequest(int op, inodeno_t ino)
+    : Message(MSG_MDS_ANCHORREQUEST),
+      op(op),
+      ino(ino) {}
+  ~MAnchorRequest() {
+    for (vector<Anchor*>::iterator a = trace.begin(); a != trace.end(); ++a)
+      delete *a;
+  }
+  virtual char *get_type_name() { return "areq"; }
+
+  // copies the pointers only; the Anchors they point to will be deleted by
+  // this message's destructor.
+  void set_trace(vector<Anchor*>& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector<Anchor*>& get_trace() { return trace; }
+
+  // payload layout: op, ino, int count, then count encoded Anchors
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+
+    int count;
+    payload.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+
+    for (int left = count; left > 0; --left) {
+      Anchor *anchor = new Anchor;
+      anchor->_decode(payload, off);
+      trace.push_back(anchor);
+    }
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+
+    int count = trace.size();
+    payload.append((char*)&count, sizeof(int));
+    for (int idx = 0; idx < count; ++idx)
+      trace[idx]->_encode(payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCACHEEXPIRE_H
+#define __MCACHEEXPIRE_H
+
+
+// Cache expiration notice: two ino -> nonce maps (inodes and dirs) plus the
+// sender id given to the constructor.
+class MCacheExpire : public Message {
+  map<inodeno_t, int> inodes;
+  map<inodeno_t, int> dirs;
+  int from;
+
+  // counterpart of rope_map(): read an int count followed by that many
+  // (ino, nonce) pairs from s at off, inserting them into mp.
+  void unrope_map(crope& s, int& off, map<inodeno_t,int>& mp) {
+    int count;
+    s.copy(off, sizeof(int), (char*)&count);
+    off += sizeof(int);
+    for (int left = count; left > 0; --left) {
+      inodeno_t ino;
+      int nonce;
+      s.copy(off, sizeof(ino), (char*)&ino);
+      off += sizeof(ino);
+      s.copy(off, sizeof(int), (char*)&nonce);
+      off += sizeof(int);
+      mp.insert(pair<inodeno_t, int>(ino, nonce));
+    }
+  }
+
+ public:
+  map<inodeno_t,int>& get_inodes() { return inodes; }
+  map<inodeno_t,int>& get_dirs() { return dirs; }
+  int get_from() { return from; }
+
+  // used before decode_payload(); 'from' is left unset
+  MCacheExpire() {}
+  MCacheExpire(int from)
+    : Message(MSG_MDS_CACHEEXPIRE),
+      from(from) {}
+  virtual char *get_type_name() { return "CEx";}
+
+  // insert() keeps an existing entry, so the first nonce added for an ino wins
+  void add_inode(inodeno_t ino, int nonce) {
+    inodes.insert(pair<inodeno_t,int>(ino, nonce));
+  }
+  void add_dir(inodeno_t ino, int nonce) {
+    dirs.insert(pair<inodeno_t,int>(ino, nonce));
+  }
+
+  // payload layout: from, inode map, dir map
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+
+    unrope_map(s, off, inodes);
+    unrope_map(s, off, dirs);
+  }
+
+  // append an int count followed by every (ino, nonce) pair of mp
+  void rope_map(crope& s, map<inodeno_t,int>& mp) {
+    int count = mp.size();
+    s.append((char*)&count, sizeof(int));
+
+    map<inodeno_t,int>::iterator entry = mp.begin();
+    while (entry != mp.end()) {
+      inodeno_t ino = entry->first;
+      int nonce = entry->second;
+      s.append((char*)&ino, sizeof(ino));
+      s.append((char*)&nonce, sizeof(nonce));
+      ++entry;
+    }
+  }
+
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&from, sizeof(from));
+    rope_map(s, inodes);
+    rope_map(s, dirs);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTBOOT_H
+#define __MCLIENTBOOT_H
+
+#include "msg/Message.h"
+
+// Message of type MSG_CLIENT_BOOT.  It carries no payload: both payload
+// hooks are empty.
+class MClientBoot : public Message {
+ public:
+  MClientBoot() : Message(MSG_CLIENT_BOOT) {}
+
+  char *get_type_name() { return "Cboot"; }
+
+  // nothing to read
+  virtual void decode_payload(crope& s, int& off) {}
+  // nothing to write
+  virtual void encode_payload(crope& s) {}
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTFILECAPS_H
+#define __MCLIENTFILECAPS_H
+
+#define CLIENT_FILECAP_RELEASE 1 // mds closed the cap
+#define CLIENT_FILECAP_STALE 2 // mds has exported the cap
+#define CLIENT_FILECAP_REAP 3 // mds has imported the cap from get_mds()
+
+// File capability message: an inode_t together with caps, wanted and seq
+// values, plus a 'special' code and an mds id used for cap migration.
+// All fields are copied to and from the payload as raw bytes.
+class MClientFileCaps : public Message {
+ public:
+  // same values as the CLIENT_FILECAP_* macros
+  static const int FILECAP_RELEASE = 1;
+  static const int FILECAP_STALE = 2;
+  static const int FILECAP_REAP = 3;
+
+ private:
+  inode_t inode;
+  int caps;
+  long seq;
+  int wanted;
+  //int client;
+
+  int special;   // stale || reap;  in conjunction w/ mds value
+  int mds;
+
+ public:
+  inodeno_t get_ino() { return inode.ino; }
+  inode_t& get_inode() { return inode; }
+  int get_caps() { return caps; }
+  int get_wanted() { return wanted; }
+  long get_seq() { return seq; }
+  //int get_client() { return client; }
+
+  // for cap migration
+  int get_mds() { return mds; }
+  int get_special() { return special; }
+
+  //void set_client(int c) { client = c; }
+  void set_caps(int c) { caps = c; }
+  void set_wanted(int w) { wanted = w; }
+
+  void set_mds(int m) { mds = m; }
+  void set_special(int s) { special = s; }
+
+  // used before decode_payload(); fields are left unset
+  MClientFileCaps() {}
+  MClientFileCaps(inode_t& inode,
+                  long seq,
+                  int caps,
+                  int wanted,
+                  int special=0,
+                  int mds=0)
+    : Message(MSG_CLIENT_FILECAPS),
+      inode(inode),
+      caps(caps),
+      seq(seq),
+      wanted(wanted),
+      special(special),
+      mds(mds) {}
+  virtual char *get_type_name() { return "Cfcap";}
+
+  // payload order: seq, inode, caps, wanted, mds, special
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(seq), (char*)&seq);
+    off += sizeof(seq);
+    s.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    s.copy(off, sizeof(caps), (char*)&caps);
+    off += sizeof(caps);
+    s.copy(off, sizeof(wanted), (char*)&wanted);
+    off += sizeof(wanted);
+    s.copy(off, sizeof(mds), (char*)&mds);
+    off += sizeof(mds);
+    s.copy(off, sizeof(special), (char*)&special);
+    off += sizeof(special);
+  }
+
+  // must append in the same order decode_payload() reads
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&seq, sizeof(seq));
+    s.append((char*)&inode, sizeof(inode));
+    s.append((char*)&caps, sizeof(caps));
+    s.append((char*)&wanted, sizeof(wanted));
+    s.append((char*)&mds, sizeof(mds));
+    s.append((char*)&special, sizeof(special));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTINODEAUTHUPDATE_H
+#define __MCLIENTINODEAUTHUPDATE_H
+
+// Message of type MSG_CLIENT_INODEAUTHUPDATE carrying an inode number and
+// the id stored as its new authority.
+class MClientInodeAuthUpdate : public Message {
+  inodeno_t ino;
+  int newauth;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  int get_auth() { return newauth; }
+
+  // used before decode_payload(); fields are left unset
+  MClientInodeAuthUpdate() {}
+  MClientInodeAuthUpdate(inodeno_t ino, int newauth)
+    : Message(MSG_CLIENT_INODEAUTHUPDATE),
+      ino(ino),
+      newauth(newauth) {}
+  virtual char *get_type_name() { return "Ciau";}
+
+  // payload order: ino, newauth (raw bytes)
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(newauth), (char*)&newauth);
+    off += sizeof(newauth);
+  }
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&newauth, sizeof(newauth));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTMOUNT_H
+#define __MCLIENTMOUNT_H
+
+#include "msg/Message.h"
+
+// Message of type MSG_CLIENT_MOUNT.  It carries no payload: both payload
+// hooks are empty.
+class MClientMount : public Message {
+ public:
+  MClientMount() : Message(MSG_CLIENT_MOUNT) {}
+
+  char *get_type_name() { return "Cmnt"; }
+
+  // nothing to read
+  virtual void decode_payload(crope& s, int& off) {}
+  // nothing to write
+  virtual void encode_payload(crope& s) {}
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTMOUNTACK_H
+#define __MCLIENTMOUNTACK_H
+
+#include "msg/Message.h"
+#include "MClientMount.h"
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+
+
+// Answer to an MClientMount: echoes the mount request's pcid and carries the
+// encoded MDS map and OSD map as bufferlists.
+class MClientMountAck : public Message {
+  long pcid;
+  bufferlist osd_map_state;
+  bufferlist mds_map_state;
+
+ public:
+  // used before decode_payload(); pcid is left unset
+  MClientMountAck() {}
+  MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap)
+    : Message(MSG_CLIENT_MOUNTACK),
+      pcid(mnt->get_pcid()) {
+    // snapshot both maps into their bufferlists
+    mdsmap->encode(mds_map_state);
+    osdmap->encode(osd_map_state);
+  }
+
+  bufferlist& get_mds_map_state() { return mds_map_state; }
+  bufferlist& get_osd_map_state() { return osd_map_state; }
+
+  void set_pcid(long pcid) { this->pcid = pcid; }
+  long get_pcid() { return pcid; }
+
+  char *get_type_name() { return "CmntA"; }
+
+  // payload order: pcid, mds map, osd map
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(pcid), (char*)&pcid);
+    off += sizeof(pcid);
+    ::_decode(mds_map_state, payload, off);
+    ::_decode(osd_map_state, payload, off);
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&pcid, sizeof(pcid));
+    ::_encode(mds_map_state, payload);
+    ::_encode(osd_map_state, payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTREPLY_H
+#define __MCLIENTREPLY_H
+
+#include "include/types.h"
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+
+#include <vector>
+using namespace std;
+
+class CInode;
+
+/***
+ *
+ * MClientReply - container message for MDS reply to a client's MClientRequest
+ *
+ * key fields:
+ * long tid - transaction id, so the client can match up with pending request
+ * int result - error code, or fh if it was open
+ *
+ * for most requests:
+ * trace is a vector of c_inode_info's tracing from root to the file/dir/whatever
+ * the operation referred to, so that the client can update its info about what
+ * metadata lives on what MDS.
+ *
+ * for readdir replies:
+ * dir_contents is a vector c_inode_info*'s.
+ *
+ * that's mostly it, i think!
+ *
+ */
+
+// Per-inode record used in client replies: a copy of the inode_t, the
+// symlink target (if any), and hints about where the inode's directory
+// lives in the MDS cluster.
+class InodeStat {
+
+ public:
+  inode_t inode;
+  string symlink;   // symlink content (if symlink)
+
+
+  // mds distribution hints
+  int dir_auth;             // in->dir->get_dir_auth(), or -1 when the inode has no dir
+  bool hashed, replicated;
+  bool spec_defined;        // true only when 'dist' was filled in by the dir's authority
+  set<int> dist;            // where am i replicated?
+
+ public:
+  // Used before _decode().  The scalar hints get defined values so a
+  // default-constructed object never holds indeterminate ints/bools; -1 is
+  // the same "no dir" value the other constructor uses.  'inode' is left
+  // as the inode_t default (its layout is not visible here).
+  InodeStat()
+    : dir_auth(-1),
+      hashed(false),
+      replicated(false),
+      spec_defined(false) {}
+
+  // Build the record from a cached inode.  'whoami' is passed through to
+  // CDir::get_dist_spec().
+  InodeStat(CInode *in, int whoami) :
+    inode(in->inode)
+  {
+    // inode.mask
+    // NOTE(review): filelock gates PERM and hardlock gates SIZE|MTIME here;
+    // confirm this pairing is intended (the original carries a fixme).
+    inode.mask = INODE_MASK_BASE;
+    if (in->filelock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_PERM;
+    if (in->hardlock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME;  // fixme when we separate this out.
+
+    // symlink content?
+    if (in->is_symlink())
+      symlink = in->symlink;
+
+    // replicated where?  only the dir's authority fills in the spec.
+    if (in->dir && in->dir->is_auth()) {
+      spec_defined = true;
+      in->dir->get_dist_spec(this->dist, whoami);
+    } else {
+      spec_defined = false;
+    }
+
+    if (in->dir)
+      dir_auth = in->dir->get_dir_auth();
+    else
+      dir_auth = -1;
+
+    // dir info
+    hashed = (in->dir && in->dir->is_hashed());   // FIXME not quite right.
+    replicated = (in->dir && in->dir->is_rep());
+  }
+
+  // Wire order: inode, spec_defined, dir_auth, hashed, replicated (raw
+  // bytes), then symlink and dist via the global _encode helpers.
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&spec_defined, sizeof(spec_defined));
+    bl.append((char*)&dir_auth, sizeof(dir_auth));
+    bl.append((char*)&hashed, sizeof(hashed));
+    bl.append((char*)&replicated, sizeof(replicated));
+
+    ::_encode(symlink, bl);
+    ::_encode(dist, bl);  // distn
+  }
+
+  // Reads the fields back in the order _encode() wrote them, advancing off.
+  void _decode(bufferlist &bl, int& off) {
+    bl.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    bl.copy(off, sizeof(spec_defined), (char*)&spec_defined);
+    off += sizeof(spec_defined);
+    bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+    bl.copy(off, sizeof(hashed), (char*)&hashed);
+    off += sizeof(hashed);
+    bl.copy(off, sizeof(replicated), (char*)&replicated);
+    off += sizeof(replicated);
+
+    ::_decode(symlink, bl, off);
+    ::_decode(dist, bl, off);
+  }
+};
+
+
+// Fixed-size header of an MClientReply. MClientReply::encode_payload() and
+// decode_payload() copy this struct to and from the payload as raw bytes
+// (sizeof(st)), so field order and types define the encoded layout.
+typedef struct {
+ long pcid; // procedure call id, copied from the request
+ long tid; // transaction id, copied from the request
+ int op; // op code, copied from the request
+ int result; // error code
+ unsigned char file_caps; // for open
+ long file_caps_seq;
+ __uint64_t file_data_version; // for client buffercache consistency
+
+ int _num_trace_in; // number of InodeStat entries in the trace
+ int _dir_size; // number of (inode, dentry name) pairs in the dir listing
+} MClientReply_st;
+
+// MClientReply - MDS reply to a client's MClientRequest.
+//
+// Carries a fixed header (MClientReply_st), the request path, a "trace" of
+// InodeStat records with the dentry names linking them, and a list of
+// (InodeStat, dentry name) pairs for directory contents.  The message owns
+// every InodeStat stored in trace_in and dir_in and deletes them in its
+// destructor.
+class MClientReply : public Message {
+  // reply data
+  MClientReply_st st;
+
+  string path;
+  list<InodeStat*> trace_in;
+  list<string> trace_dn;
+
+  list<InodeStat*> dir_in;
+  list<string> dir_dn;
+
+ public:
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+
+  int get_result() { return st.result; }
+  const string& get_path() { return path; }
+
+  // Both accessors read trace_in.back(); they must only be called when the
+  // trace is non-empty.
+  inodeno_t get_ino() { return trace_in.back()->inode.ino; }
+  const inode_t& get_inode() { return trace_in.back()->inode; }
+
+  const list<InodeStat*>& get_trace_in() { return trace_in; }
+  const list<string>& get_trace_dn() { return trace_dn; }
+
+  const list<InodeStat*>& get_dir_in() { return dir_in; }
+  const list<string>& get_dir_dn() { return dir_dn; }
+
+  unsigned char get_file_caps() { return st.file_caps; }
+  long get_file_caps_seq() { return st.file_caps_seq; }
+  __uint64_t get_file_data_version() { return st.file_data_version; }
+
+  void set_result(int r) { st.result = r; }
+  void set_file_caps(unsigned char c) { st.file_caps = c; }
+  void set_file_caps_seq(long s) { st.file_caps_seq = s; }
+  void set_file_data_version(__uint64_t v) { st.file_data_version = v; }
+
+  // Empty message, to be filled in by decode_payload().  The header is
+  // zeroed so that the ids and the _num_trace_in/_dir_size counts are well
+  // defined even before anything has been decoded.
+  MClientReply() {
+    memset(&st, 0, sizeof(st));
+  }
+  // Build a reply to req, copying its ids, op and path.
+  // NOTE(review): this relies on MClientRequest::get_pcid(); confirm the
+  // request class in this tree still provides it.
+  MClientReply(MClientRequest *req, int result = 0) :
+    Message(MSG_CLIENT_REPLY) {
+    memset(&st, 0, sizeof(st));
+    this->st.pcid = req->get_pcid();    // match up procedure call id!!!
+    this->st.tid = req->get_tid();
+    this->st.op = req->get_op();
+    this->path = req->get_path();
+
+    this->st.result = result;
+
+    st._dir_size = 0;
+    st._num_trace_in = 0;
+  }
+  virtual ~MClientReply() {
+    list<InodeStat*>::iterator it;
+
+    for (it = trace_in.begin(); it != trace_in.end(); ++it)
+      delete *it;
+    for (it = dir_in.begin(); it != dir_in.end(); ++it)
+      delete *it;
+  }
+  virtual char *get_type_name() { return "creply"; }
+
+
+  // serialization
+
+  // Payload layout: header, path, then _num_trace_in trace entries (each
+  // one except the first preceded by its dentry name), then _dir_size
+  // (InodeStat, dentry name) pairs.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    _decode(path, payload, off);
+
+    for (int i=0; i<st._num_trace_in; ++i) {
+      if (i) {
+        string ref_dn;
+        ::_decode(ref_dn, payload, off);
+        trace_dn.push_back(ref_dn);
+      }
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      trace_in.push_back(ci);
+    }
+
+    for (int i=0; i<st._dir_size; ++i) {
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      dir_in.push_back(ci);
+      string dn;
+      ::_decode(dn, payload, off);
+      dir_dn.push_back(dn);
+    }
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    _encode(path, payload);
+
+    // trace: the first inode has no referring dentry name
+    list<string>::iterator pdn = trace_dn.begin();
+    list<InodeStat*>::iterator pin;
+    for (pin = trace_in.begin();
+         pin != trace_in.end();
+         ++pin) {
+      if (pin != trace_in.begin()) {
+        ::_encode(*pdn, payload);
+        ++pdn;
+      }
+      (*pin)->_encode(payload);
+    }
+
+    // dir contents
+    pdn = dir_dn.begin();
+    for (pin = dir_in.begin();
+         pin != dir_in.end();
+         ++pin, ++pdn) {
+      (*pin)->_encode(payload);
+      ::_encode(*pdn, payload);
+    }
+  }
+
+  // builders
+  /*
+  void add_dir_item(string& dn, InodeStat *in) {
+    dir_dn.push_back(dn);
+    dir_in.push_back(in);
+    ++st._dir_size;
+  }*/
+
+  // Swap the caller's lists into this message (the caller receives the
+  // previous contents) and record num as the directory size.
+  void take_dir_items(list<InodeStat*>& inls,
+                      list<string>& dnls,
+                      int num) {
+    dir_in.swap(inls);
+    dir_dn.swap(dnls);
+    st._dir_size = num;
+  }
+  // Append deep copies of the given (inode, dentry name) pairs.
+  void copy_dir_items(const list<InodeStat*>& inls,
+                      const list<string>& dnls) {
+    list<string>::const_iterator pdn = dnls.begin();
+    list<InodeStat*>::const_iterator pin = inls.begin();
+    // Stop at the shorter list: every inode needs a matching dentry name,
+    // and walking past dnls.end() would be undefined behaviour.
+    while (pin != inls.end() && pdn != dnls.end()) {
+      // copy!  this message owns (and later deletes) the new object
+      dir_in.push_back(new InodeStat(**pin));
+      dir_dn.push_back(*pdn);
+      ++pin;
+      ++pdn;
+      ++st._dir_size;
+    }
+  }
+
+  // Walk from in up through its parent inodes, prepending an InodeStat
+  // (and the referring dentry name, when there is one) for each step.
+  void set_trace_dist(CInode *in, int whoami) {
+    st._num_trace_in = 0;
+    while (in) {
+      // add this inode to trace, along with referring dentry name
+      if (in->get_parent_dn())
+        trace_dn.push_front(in->get_parent_dn()->get_name());
+      trace_in.push_front(new InodeStat(in, whoami));
+      ++st._num_trace_in;
+
+      in = in->get_parent_inode();
+    }
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTREQUEST_H
+#define __MCLIENTREQUEST_H
+
+#include <vector>
+
+#include "msg/Message.h"
+#include "include/filepath.h"
+#include "mds/MDS.h"
+
+/**
+ *
+ * MClientRequest - container for a client METADATA request. created/sent by clients.
+ * can be forwarded around between MDS's.
+ *
+ * int client - the originating client
+ * long pcid - procedure call id, used to match request+response.
+ * long tid - transaction id, unique among requests for that client. probably just a counter!
+ * -> the MDS passes the Request to the Reply constructor, so this always matches.
+ *
+ * int op - the metadata op code. MDS_OP_RENAME, etc.
+ * int caller_uid, _gid - guess
+ *
+ * arguments: one or more of these are defined, depending on the metadata op:
+ * inodeno ino - used by close(), along with fh. not strictly necessary except MDS is currently coded lame.
+ * filepath path - main file argument (almost everything)
+ * string sarg - string argument (if a second arg is needed, e.g. rename, symlink)
+ * int iarg - int arg... file mode for open, fh for close, mode for mkdir, etc.
+ * int iarg2 - second int arg... gid for chown (iarg is uid)
+ * time_t targ, targ2 - time args, used by utime
+ *
+ * That's basically it!
+ *
+ */
+
+
+// Fixed-size header of an MClientRequest. The message copies this struct to
+// and from the payload as raw bytes (sizeof(st)) in encode_payload() and
+// decode_payload(), so field order and types define the encoded layout.
+typedef struct {
+ long tid; // transaction id
+ int client; // originating client
+ int op; // metadata op code
+
+ entity_inst_t client_inst;
+
+ int caller_uid, caller_gid;
+ inodeno_t ino;
+
+ int iarg, iarg2; // integer arguments; meaning depends on op
+ time_t targ, targ2; // time arguments
+
+ inodeno_t mds_wants_replica_in_dirino;
+
+ size_t sizearg;
+} MClientRequest_st;
+
+
+// MClientRequest - a client metadata request: a fixed header
+// (MClientRequest_st) plus a filepath and two string arguments.
+class MClientRequest : public Message {
+  MClientRequest_st st;
+  filepath path;
+  string sarg;
+  string sarg2;
+
+
+ public:
+  // Empty message, to be filled in by decode_payload().  The header is
+  // zeroed, as in the other constructor, so no field is ever read while
+  // indeterminate.
+  MClientRequest() {
+    memset(&st, 0, sizeof(st));
+  }
+  MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) {
+    memset(&st, 0, sizeof(st));   // also leaves iarg and friends at 0
+    this->st.op = op;
+    this->st.client = client;
+  }
+  virtual char *get_type_name() { return "creq"; }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  //void set_pcid(long pcid) { this->st.pcid = pcid; }
+  //long get_pcid() { return st.pcid; }
+
+  // normal fields
+  void set_tid(long t) { st.tid = t; }
+  void set_path(string& p) { path.set_path(p); }
+  void set_path(const char *p) { path.set_path(p); }
+  void set_path(const filepath& fp) { path = fp; }
+  void set_caller_uid(int u) { st.caller_uid = u; }
+  void set_caller_gid(int g) { st.caller_gid = g; }
+  void set_ino(inodeno_t ino) { st.ino = ino; }
+  void set_iarg(int i) { st.iarg = i; }
+  void set_iarg2(int i) { st.iarg2 = i; }
+  void set_targ(time_t& t) { st.targ = t; }
+  void set_targ2(time_t& t) { st.targ2 = t; }
+  void set_sarg(string& arg) { this->sarg = arg; }
+  void set_sarg(const char *arg) { this->sarg = arg; }
+  void set_sarg2(string& arg) { this->sarg2 = arg; }
+  void set_sizearg(size_t s) { st.sizearg = s; }
+  void set_mds_wants_replica_in_dirino(inodeno_t dirino) {
+    st.mds_wants_replica_in_dirino = dirino; }
+
+  void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+  const entity_inst_t& get_client_inst() { return st.client_inst; }
+
+  int get_client() { return st.client; }
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+  int get_caller_uid() { return st.caller_uid; }
+  int get_caller_gid() { return st.caller_gid; }
+  inodeno_t get_ino() { return st.ino; }
+  string& get_path() { return path.get_path(); }
+  filepath& get_filepath() { return path; }
+  int get_iarg() { return st.iarg; }
+  int get_iarg2() { return st.iarg2; }
+  time_t get_targ() { return st.targ; }
+  time_t get_targ2() { return st.targ2; }
+  string& get_sarg() { return sarg; }
+  string& get_sarg2() { return sarg2; }
+  size_t get_sizearg() { return st.sizearg; }
+  inodeno_t get_mds_wants_replica_in_dirino() {
+    return st.mds_wants_replica_in_dirino; }
+
+  // Payload layout: raw header, then path, sarg, sarg2.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    path._decode(payload, off);
+    _decode(sarg, payload, off);
+    _decode(sarg2, payload, off);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    path._encode(payload);
+    _encode(sarg, payload);
+    _encode(sarg2, payload);
+  }
+
+  // Write a one-line description: client, tid, op name (or "unknown=<op>"),
+  // then the path and first string argument when they are non-empty.
+  void print(ostream& out) {
+    out << "clientreq(client" << get_client()
+        << "." << get_tid()
+        //<< ".pcid=" << get_pcid()
+        << ":";
+
+    const char *opname = 0;   // stays null for an op we have no name for
+    switch(get_op()) {
+    case MDS_OP_STAT:     opname = "stat"; break;
+    case MDS_OP_LSTAT:    opname = "lstat"; break;
+    case MDS_OP_UTIME:    opname = "utime"; break;
+    case MDS_OP_CHMOD:    opname = "chmod"; break;
+    case MDS_OP_CHOWN:    opname = "chown"; break;
+
+    case MDS_OP_READDIR:  opname = "readdir"; break;
+    case MDS_OP_MKNOD:    opname = "mknod"; break;
+    case MDS_OP_LINK:     opname = "link"; break;
+    case MDS_OP_UNLINK:   opname = "unlink"; break;
+    case MDS_OP_RENAME:   opname = "rename"; break;
+
+    case MDS_OP_MKDIR:    opname = "mkdir"; break;
+    case MDS_OP_RMDIR:    opname = "rmdir"; break;
+    case MDS_OP_SYMLINK:  opname = "symlink"; break;
+
+    case MDS_OP_OPEN:     opname = "open"; break;
+    case MDS_OP_TRUNCATE: opname = "truncate"; break;
+    case MDS_OP_FSYNC:    opname = "fsync"; break;
+    case MDS_OP_RELEASE:  opname = "release"; break;
+    default:
+      break;
+    }
+    if (opname)
+      out << opname;
+    else
+      out << "unknown=" << get_op();
+
+    if (get_path().length())
+      out << "=" << get_path();
+    if (get_sarg().length())
+      out << " " << get_sarg();
+    out << ")";
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDENTRYUNLINK_H
+#define __MDENTRYUNLINK_H
+
+// MDentryUnlink - message identifying a dentry by the inode number of its
+// directory plus the dentry name.
+class MDentryUnlink : public Message {
+  inodeno_t dirino;
+  string dn;
+
+ public:
+  MDentryUnlink() {}
+  MDentryUnlink(inodeno_t dirino, string& dn)
+    : Message(MSG_MDS_DENTRYUNLINK)
+  {
+    this->dn = dn;
+    this->dirino = dirino;
+  }
+
+  // accessors
+  inodeno_t get_dirino() { return dirino; }
+  string& get_dn() { return dn; }
+
+  virtual char *get_type_name() { return "Dun";}
+
+  // Read dirino (raw bytes) and then dn from s, starting at off; off is
+  // advanced past what was consumed.
+  virtual void decode_payload(crope& s, int& off) {
+    char *raw = (char*)&dirino;
+    s.copy(off, sizeof(dirino), raw);
+    off += sizeof(dirino);
+    _unrope(dn, s, off);
+  }
+  // Append dirino (raw bytes) and then dn to s.
+  virtual void encode_payload(crope& s) {
+    char *raw = (char*)&dirino;
+    s.append(raw, sizeof(dirino));
+    _rope(dn, s);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDIREXPIRE_H
+#define __MDIREXPIRE_H
+
+// Payload of MDirExpire; the message encodes and decodes it as the raw
+// bytes of this struct, so field order and types define the layout.
+typedef struct {
+ inodeno_t ino;
+ int nonce;
+ int from;
+} MDirExpire_st;
+
+// MDirExpire - message carrying (ino, from, nonce), held in an
+// MDirExpire_st and exchanged as the raw bytes of that struct.
+class MDirExpire : public Message {
+  MDirExpire_st st;
+
+ public:
+  MDirExpire() {}
+  MDirExpire(inodeno_t ino, int from, int nonce)
+    : Message(MSG_MDS_DIREXPIRE)
+  {
+    st.nonce = nonce;
+    st.from = from;
+    st.ino = ino;
+  }
+
+  // accessors
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  virtual char *get_type_name() { return "DirEx";}
+
+  // Read the struct from s at offset off and advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    char *raw = (char*)&st;
+    s.copy(off, sizeof(st), raw);
+    off += sizeof(st);
+  }
+  // Append the raw bytes of the struct to s.
+  virtual void encode_payload(crope& s) {
+    char *raw = (char*)&st;
+    s.append(raw, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDIREXPIREREQ_H
+#define __MDIREXPIREREQ_H
+
+// Payload of the message class declared right after this struct; it is
+// encoded and decoded as the raw bytes of the struct, so field order and
+// types define the layout.
+typedef struct {
+ inodeno_t ino;
+ int nonce;
+ int from;
+} MDirExpireReq_st;
+
+// Message of type MSG_MDS_DIREXPIREREQ carrying (ino, from, nonce) as the
+// raw bytes of an MDirExpireReq_st.
+// NOTE(review): the include guard is __MDIREXPIREREQ_H and the type name is
+// "DirExR", yet the class is called MDirExpire -- presumably it was meant to
+// be MDirExpireReq; confirm before including this next to another
+// definition of MDirExpire.
+class MDirExpire : public Message {
+  MDirExpireReq_st st;
+
+ public:
+  MDirExpire() {}
+  MDirExpire(inodeno_t ino, int from, int nonce)
+    : Message(MSG_MDS_DIREXPIREREQ)
+  {
+    st.nonce = nonce;
+    st.from = from;
+    st.ino = ino;
+  }
+
+  // accessors
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  virtual char *get_type_name() { return "DirExR";}
+
+  // Read the struct from the start of s (this overload takes no offset).
+  virtual void decode_payload(crope& s) {
+    char *raw = (char*)&st;
+    s.copy(0, sizeof(st), raw);
+  }
+  // Append the raw bytes of the struct to s.
+  virtual void encode_payload(crope& s) {
+    char *raw = (char*)&st;
+    s.append(raw, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDIRUPDATE_H
+#define __MDIRUPDATE_H
+
+#include "msg/Message.h"
+
+// Fixed-size part of MDirUpdate; the message encodes and decodes it as the
+// raw bytes of this struct, so field order and types define the layout.
+typedef struct {
+ inodeno_t ino;
+ int dir_rep;
+ int discover; // counter: should_discover() is true while it is > 0; tried_discover() decrements it
+} MDirUpdate_st;
+
+// MDirUpdate - message carrying a directory's inode number, its dir_rep
+// value, the set dir_rep_by, a path, and a countdown of discover attempts.
+class MDirUpdate : public Message {
+  MDirUpdate_st st;
+  set<int> dir_rep_by;
+  string path;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_dir_rep() { return st.dir_rep; }
+  set<int>& get_dir_rep_by() { return dir_rep_by; }
+  bool should_discover() { return st.discover > 0; }
+  string& get_path() { return path; }
+
+  // Use up one discover attempt (never goes below zero).
+  void tried_discover() {
+    if (st.discover) st.discover--;
+  }
+
+  MDirUpdate() {}
+  // discover = true grants 5 discover attempts; false grants none.
+  MDirUpdate(inodeno_t ino,
+             int dir_rep,
+             set<int>& dir_rep_by,
+             string& path,
+             bool discover = false) :
+    Message(MSG_MDS_DIRUPDATE) {
+    this->st.ino = ino;
+    this->st.dir_rep = dir_rep;
+    this->dir_rep_by = dir_rep_by;
+    // Always give the counter a value: should_discover() and the encoder
+    // read it, and it was previously left unset when discover was false.
+    this->st.discover = discover ? 5 : 0;
+    this->path = path;
+  }
+  virtual char *get_type_name() { return "dup"; }
+
+  // Payload layout: raw MDirUpdate_st, then dir_rep_by, then path.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    _unrope(dir_rep_by, s, off);
+    _unrope(path, s, off);
+  }
+
+  virtual void encode_payload(crope& r) {
+    r.append((char*)&st, sizeof(st));
+    _rope(dir_rep_by, r);
+    _rope(path, r);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDISCOVER_H
+#define __MDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CDir.h"
+#include "include/filepath.h"
+
+#include <vector>
+#include <string>
+using namespace std;
+
+
+// MDiscover - discover request: who is asking, the base inode number, and
+// the filepath wanted relative to it.
+class MDiscover : public Message {
+  int asker;
+  inodeno_t base_ino;      // 0 -> none, want root
+  bool want_base_dir;
+  bool want_root_inode;
+
+  filepath want;           // ... [/]need/this/stuff
+
+ public:
+  int get_asker() { return asker; }
+  inodeno_t get_base_ino() { return base_ino; }
+  filepath& get_want() { return want; }
+  const string& get_dentry(int n) { return want[n]; }
+  bool wants_base_dir() { return want_base_dir; }
+
+  // Empty message, to be filled in by decode_payload().  want_root_inode
+  // is not part of the encoded payload, so decoding never sets it; give it
+  // a defined value here instead of leaving it indeterminate.
+  MDiscover() : want_root_inode(false) { }
+  MDiscover(int asker,
+            inodeno_t base_ino,
+            filepath& want,
+            bool want_base_dir = true,
+            bool want_root_inode = false) :
+    Message(MSG_MDS_DISCOVER) {
+    this->asker = asker;
+    this->base_ino = base_ino;
+    this->want = want;
+    this->want_base_dir = want_base_dir;
+    this->want_root_inode = want_root_inode;
+  }
+  virtual char *get_type_name() { return "Dis"; }
+
+  // Payload layout: asker, base_ino, want_base_dir (each as raw bytes),
+  // then the want filepath.
+  virtual void decode_payload(crope& r, int& off) {
+    r.copy(off, sizeof(asker), (char*)&asker);
+    off += sizeof(asker);
+    r.copy(off, sizeof(base_ino), (char*)&base_ino);
+    off += sizeof(base_ino);
+    r.copy(off, sizeof(bool), (char*)&want_base_dir);
+    off += sizeof(bool);
+    want._unrope(r, off);
+  }
+  virtual void encode_payload(crope& r) {
+    r.append((char*)&asker, sizeof(asker));
+    r.append((char*)&base_ino, sizeof(base_ino));
+    r.append((char*)&want_base_dir, sizeof(want_base_dir));
+    want._rope(r);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MDISCOVERREPLY_H
+#define __MDISCOVERREPLY_H
+
+#include "msg/Message.h"
+#include "mds/CDir.h"
+#include "mds/CInode.h"
+#include "include/filepath.h"
+
+#include <vector>
+#include <string>
+using namespace std;
+
+// NOTE(review): function-like macro. Each argument appears twice in the
+// expansion, so an argument with side effects is evaluated twice, and any
+// later use of max(...) (including std::max) in code that sees this header
+// is rewritten by the preprocessor.
+#define max(a,b) ((a)>(b) ? (a):(b))
+
+
+/**
+ * MDiscoverReply - return new replicas (of inodes, dirs, dentries)
+ *
+ * we group returned items by (dir, dentry, inode). each
+ * item in each set shares an index (its "depth").
+ *
+ * we can start and end with any type.
+ * no_base_dir = true if the first group has an inode but no dir
+ * no_base_dentry = true if the first group has an inode but no dentry
+ * they are false if there is no returned data, ie the first group is empty.
+ *
+ * we also return errors:
+ * error_flag_dn(string) - the specified dentry dne
+ * error_flag_dir - the last item wasn't a dir, so we couldn't continue.
+ *
+ * depth() gives us the number of depth units/indices for which we have
+ * information. this INCLUDES those for which we have errors but no data.
+ *
+ * see MDCache::handle_discover, handle_discover_reply.
+ *
+
+ old crap, maybe not accurate:
+
+ // dir [ + ... ] : discover want_base_dir=true
+
+ // dentry [ + inode [ + ... ] ] : discover want_base_dir=false
+ // no_base_dir=true
+ // -> we only exclude inode if dentry is null+xlock
+
+ // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino,
+ // no_base_dir=no_base_dentry=true
+
+ *
+ */
+
+// MDiscoverReply - reply carrying discovered dirs, dentries (as a filepath
+// plus a per-dentry xlock flag) and inodes, together with error flags.
+// The message owns the CDirDiscover and CInodeDiscover objects it holds
+// and deletes them in its destructor.
+class MDiscoverReply : public Message {
+  inodeno_t base_ino;
+  bool no_base_dir;       // no base dir (but IS dentry+inode)
+  bool no_base_dentry;    // no base dentry (but IS inode)
+  bool flag_error_dn;
+  bool flag_error_dir;
+  string error_dentry;    // dentry that was not found (to trigger waiters on asker)
+
+
+  vector<CDirDiscover*> dirs;       // not inode-aligned if no_base_dir = true.
+  filepath path;                    // not inode-aligned if no_base_dentry = true
+  vector<bool> path_xlock;
+  vector<CInodeDiscover*> inodes;
+
+ public:
+  // accessors
+  inodeno_t get_base_ino() { return base_ino; }
+  int get_num_inodes() { return inodes.size(); }
+  int get_num_dentries() { return path.depth(); }
+  int get_num_dirs() { return dirs.size(); }
+
+  int get_depth() {   // return depth of deepest object (in dir/dentry/inode units)
+    return max( inodes.size(),                                           // at least this many
+                max( no_base_dentry + path.depth() + flag_error_dn,      // inode start + path + possible error
+                     dirs.size() + no_base_dir ));                       // dn/inode + dirs
+  }
+
+  bool has_base_dir() { return !no_base_dir && dirs.size(); }
+  bool has_base_dentry() { return !no_base_dentry && path.depth(); }
+  bool has_root() {
+    if (base_ino == 0) {
+      assert(no_base_dir && no_base_dentry);
+      return true;
+    }
+    return false;
+  }
+  const string& get_path() { return path.get_path(); }
+  bool get_path_xlock(int i) { return path_xlock[i]; }
+
+  // bool is_flag_forward() { return flag_forward; }
+  bool is_flag_error_dn() { return flag_error_dn; }
+  bool is_flag_error_dir() { return flag_error_dir; }
+  string& get_error_dentry() { return error_dentry; }
+
+  // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set.
+  CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); }
+  const string& get_dentry(int n) { return path[n - no_base_dentry]; }
+  bool get_dentry_xlock(int n) { return path_xlock[n - no_base_dentry]; }
+  CInodeDiscover& get_inode(int n) { return *(inodes[n]); }
+  inodeno_t get_ino(int n) { return inodes[n]->get_ino(); }
+
+  // cons
+  MDiscoverReply() {}
+  MDiscoverReply(inodeno_t base_ino) :
+    Message(MSG_MDS_DISCOVERREPLY) {
+    this->base_ino = base_ino;
+    flag_error_dn = false;
+    flag_error_dir = false;
+    no_base_dir = no_base_dentry = false;
+  }
+  ~MDiscoverReply() {
+    for (vector<CDirDiscover*>::iterator it = dirs.begin();
+         it != dirs.end();
+         ++it)
+      delete *it;
+    for (vector<CInodeDiscover*>::iterator it = inodes.begin();
+         it != inodes.end();
+         ++it)
+      delete *it;
+  }
+  virtual char *get_type_name() { return "DisR"; }
+
+  // builders
+  bool is_empty() {
+    return dirs.empty() && path.depth() == 0 &&
+      inodes.empty() &&
+      !flag_error_dn &&
+      !flag_error_dir;
+  }
+  void set_path(const filepath& dp) { path = dp; }
+  // Append a dentry; if nothing precedes it, the reply has no base dir.
+  void add_dentry(const string& dn, bool xlock) {
+    if (path.depth() == 0 && dirs.empty()) no_base_dir = true;
+    path.add_dentry(dn);
+    path_xlock.push_back(xlock);
+  }
+
+  // Append an inode (ownership passes to the message); if it is the first
+  // item and no dentry precedes it, the reply has no base dir or dentry.
+  void add_inode(CInodeDiscover* din) {
+    if (inodes.empty() && path.depth() == 0) no_base_dir = no_base_dentry = true;
+    inodes.push_back( din );
+  }
+
+  // Append a dir (ownership passes to the message).
+  void add_dir(CDirDiscover* dir) {
+    dirs.push_back( dir );
+  }
+
+  // void set_flag_forward() { flag_forward = true; }
+  void set_flag_error_dn(const string& dn) {
+    flag_error_dn = true;
+    error_dentry = dn;
+  }
+  void set_flag_error_dir() {
+    flag_error_dir = true;
+  }
+
+
+  // ...
+  // Payload layout: base_ino, no_base_dir, no_base_dentry, flag_error_dn,
+  // error_dentry, flag_error_dir, then counted lists of dirs and inodes,
+  // the path, and a counted list of xlock flags.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(base_ino), (char*)&base_ino);
+    off += sizeof(base_ino);
+    payload.copy(off, sizeof(bool), (char*)&no_base_dir);
+    off += sizeof(bool);
+    payload.copy(off, sizeof(bool), (char*)&no_base_dentry);
+    off += sizeof(bool);
+    // payload.copy(off, sizeof(bool), (char*)&flag_forward);
+    //off += sizeof(bool);
+    payload.copy(off, sizeof(bool), (char*)&flag_error_dn);
+    off += sizeof(bool);
+
+    _decode(error_dentry, payload, off);
+    payload.copy(off, sizeof(bool), (char*)&flag_error_dir);
+    off += sizeof(bool);
+
+    // dirs
+    int n;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      // decode into the new object itself rather than dirs[i], which is
+      // only the same element when dirs was empty beforehand
+      CDirDiscover *dir = new CDirDiscover();
+      dirs.push_back( dir );
+      dir->_decode(payload, off);
+    }
+    //dout(12) << n << " dirs out" << endl;
+
+    // inodes
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      // same reasoning as for dirs above
+      CInodeDiscover *din = new CInodeDiscover();
+      inodes.push_back( din );
+      din->_decode(payload, off);
+    }
+    //dout(12) << n << " inodes out" << endl;
+
+    // filepath
+    path._decode(payload, off);
+    //dout(12) << path.depth() << " dentries out" << endl;
+
+    // path_xlock
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    for (int i=0; i<n; i++) {
+      bool b;
+      payload.copy(off, sizeof(bool), (char*)&b);
+      off += sizeof(bool);
+      path_xlock.push_back(b);
+    }
+  }
+  void encode_payload() {
+    payload.append((char*)&base_ino, sizeof(base_ino));
+    payload.append((char*)&no_base_dir, sizeof(bool));
+    payload.append((char*)&no_base_dentry, sizeof(bool));
+    // payload.append((char*)&flag_forward, sizeof(bool));
+    payload.append((char*)&flag_error_dn, sizeof(bool));
+
+    _encode(error_dentry, payload);
+    payload.append((char*)&flag_error_dir, sizeof(bool));
+
+    // dirs
+    int n = dirs.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<CDirDiscover*>::iterator it = dirs.begin();
+         it != dirs.end();
+         ++it)
+      (*it)->_encode( payload );
+    //dout(12) << n << " dirs in" << endl;
+
+    // inodes
+    n = inodes.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<CInodeDiscover*>::iterator it = inodes.begin();
+         it != inodes.end();
+         ++it)
+      (*it)->_encode( payload );
+    //dout(12) << n << " inodes in" << endl;
+
+    // path
+    path._encode( payload );
+    //dout(12) << path.depth() << " dentries in" << endl;
+
+    // path_xlock: vector<bool> has no addressable elements, so each flag
+    // is copied into a real bool before its bytes are appended
+    n = path_xlock.size();
+    payload.append((char*)&n, sizeof(int));
+    for (vector<bool>::iterator it = path_xlock.begin();
+         it != path_xlock.end();
+         ++it) {
+      bool b = *it;
+      payload.append((char*)&b, sizeof(bool));
+    }
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIR_H
+#define __MEXPORTDIR_H
+
+#include "msg/Message.h"
+
+
+// MExportDir - message carrying the inode number of a directory, a count of
+// dirs whose encoded state has been appended to the state bufferlist, and a
+// list of inode numbers added through add_export().
+class MExportDir : public Message {
+  inodeno_t ino;
+
+  int ndirs;
+  bufferlist state;
+
+  list<inodeno_t> exports;
+
+  // hashed pre-discovers
+  //map<inodeno_t, set<string> > hashed_prediscover;
+
+ public:
+  MExportDir() {}
+  MExportDir(CInode *in) :
+    Message(MSG_MDS_EXPORTDIR) {
+    ndirs = 0;
+    this->ino = in->inode.ino;
+  }
+  virtual char *get_type_name() { return "Ex"; }
+
+  // accessors
+  inodeno_t get_ino() { return ino; }
+  int get_ndirs() { return ndirs; }
+  bufferlist& get_state() { return state; }
+  list<inodeno_t>& get_exports() { return exports; }
+
+  // Move the contents of dir onto the end of state and count one more dir.
+  void add_dir(bufferlist& dir) {
+    state.claim_append( dir );
+    ndirs++;
+  }
+  // Record dir's inode number in the exports list.
+  void add_export(CDir *dir) { exports.push_back(dir->ino()); }
+
+
+  // Payload layout: ino, ndirs, export count followed by that many inode
+  // numbers, then the length of the dir state followed by the state bytes.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(ndirs), (char*)&ndirs);
+    off += sizeof(ndirs);
+
+    // exports
+    int num_exports;
+    payload.copy(off, sizeof(num_exports), (char*)&num_exports);
+    off += sizeof(int);
+    dout(12) << num_exports << " nested exports out" << endl;
+    int remaining = num_exports;
+    while (remaining > 0) {
+      inodeno_t dirino;
+      payload.copy(off, sizeof(dirino), (char*)&dirino);
+      off += sizeof(dirino);
+      exports.push_back(dirino);
+      --remaining;
+    }
+
+    // dir data
+    size_t state_len;
+    payload.copy(off, sizeof(state_len), (char*)&state_len);
+    off += sizeof(state_len);
+    state.substr_of(payload, off, state_len);
+    off += state_len;
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&ndirs, sizeof(ndirs));
+
+    // exports
+    int num_exports = exports.size();
+    dout(12) << num_exports << " nested exports in" << endl;
+    payload.append((char*)&num_exports, sizeof(int));
+    list<inodeno_t>::iterator ex = exports.begin();
+    while (ex != exports.end()) {
+      inodeno_t dirino = *ex;     // local copy, so its bytes can be appended
+      payload.append((char*)&dirino, sizeof(dirino));
+      ex++;
+    }
+
+    // dir data (claim_append moves the bytes out of state)
+    size_t state_len = state.length();
+    payload.append((char*)&state_len, sizeof(state_len));
+    payload.claim_append(state);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRACK_H
+#define __MEXPORTDIRACK_H
+
+#include "MExportDir.h"
+
+// MExportDirAck - message carrying the inode number taken from the
+// MExportDir it is constructed from.
+class MExportDirAck : public Message {
+  inodeno_t ino;
+
+ public:
+  MExportDirAck() {}
+  MExportDirAck(MExportDir *req)
+    : Message(MSG_MDS_EXPORTDIRACK)
+  {
+    this->ino = req->get_ino();
+  }
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "ExAck"; }
+
+  // Read ino from the start of s (this overload takes no offset).
+  virtual void decode_payload(crope& s) {
+    char *raw = (char*)&ino;
+    s.copy(0, sizeof(ino), raw);
+  }
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    char *raw = (char*)&ino;
+    s.append(raw, sizeof(ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRDISCOVER_H
+#define __MEXPORTDIRDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// MExportDirDiscover - message carrying an inode number and the path that
+// CInode::make_path() produced for that inode.
+class MExportDirDiscover : public Message {
+  inodeno_t ino;
+  string path;
+
+ public:
+  MExportDirDiscover() {}
+  MExportDirDiscover(CInode *in)
+    : Message(MSG_MDS_EXPORTDIRDISCOVER)
+  {
+    in->make_path(path);
+    ino = in->ino();
+  }
+
+  // accessors
+  inodeno_t get_ino() { return ino; }
+  string& get_path() { return path; }
+
+  virtual char *get_type_name() { return "ExDis"; }
+
+
+  // Read ino (raw bytes) and then path from s, starting at off; off is
+  // advanced past what was consumed.
+  virtual void decode_payload(crope& s, int& off) {
+    char *raw = (char*)&ino;
+    s.copy(off, sizeof(ino), raw);
+    off += sizeof(ino);
+    _unrope(path, s, off);
+  }
+
+  // Append ino (raw bytes) and then path to s.
+  virtual void encode_payload(crope& s) {
+    char *raw = (char*)&ino;
+    s.append(raw, sizeof(ino));
+    _rope(path, s);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRDISCOVERACK_H
+#define __MEXPORTDIRDISCOVERACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// MExportDirDiscoverAck - message carrying an inode number and a success
+// flag.
+class MExportDirDiscoverAck : public Message {
+  inodeno_t ino;
+  bool success;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  bool is_success() { return success; }
+
+  MExportDirDiscoverAck() {}
+  // success defaults to true; pass false to report a failure.
+  MExportDirDiscoverAck(inodeno_t ino, bool success=true) :
+    Message(MSG_MDS_EXPORTDIRDISCOVERACK) {
+    this->ino = ino;
+    // store the caller's value; this used to be hard-coded to false, so
+    // is_success() could never report true on a freshly built ack
+    this->success = success;
+  }
+  virtual char *get_type_name() { return "ExDisA"; }
+
+
+  // Read ino and then success (both raw bytes) from s, starting at off;
+  // off is advanced past what was consumed.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(success), (char*)&success);
+    off += sizeof(success);
+  }
+
+  // Append ino and then success (both raw bytes) to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&success, sizeof(success));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRFINISH_H
+#define __MEXPORTDIRFINISH_H
+
+#include "MExportDir.h"
+
+// Message of type MSG_MDS_EXPORTDIRFINISH carrying a single inode number.
+class MExportDirFinish : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MExportDirFinish() {}
+  MExportDirFinish(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRFINISH),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "ExFin"; }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from s at offset off, then advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRNOTIFY_H
+#define __MEXPORTDIRNOTIFY_H
+
+#include "msg/Message.h"
+#include <string>
+using namespace std;
+
+// Message of type MSG_MDS_EXPORTDIRNOTIFY.
+// Payload, in order: new_auth, old_auth, ino, a counted list of export
+// inode numbers, and a counted list of subdir inode numbers.
+class MExportDirNotify : public Message {
+  int new_auth;
+  int old_auth;
+  inodeno_t ino;
+
+  list<inodeno_t> exports;  // bounds; these dirs are _not_ included (tho the inodes are)
+  list<inodeno_t> subdirs;
+
+ public:
+  // Leaves the scalar fields unset; decode_payload() populates them.
+  MExportDirNotify() {}
+  MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) :
+    Message(MSG_MDS_EXPORTDIRNOTIFY),
+    new_auth(new_auth),
+    old_auth(old_auth),
+    ino(ino) {}
+
+  virtual char *get_type_name() { return "ExNot"; }
+
+  inodeno_t get_ino() { return ino; }
+  int get_new_auth() { return new_auth; }
+  int get_old_auth() { return old_auth; }
+  list<inodeno_t>& get_exports() { return exports; }
+  list<inodeno_t>::iterator subdirs_begin() { return subdirs.begin(); }
+  list<inodeno_t>::iterator subdirs_end() { return subdirs.end(); }
+  int num_subdirs() { return subdirs.size(); }
+
+  // Replace the stored lists with copies of the caller's lists.
+  void copy_subdirs(list<inodeno_t>& s) {
+    subdirs = s;
+  }
+  void copy_exports(list<inodeno_t>& ex) {
+    exports = ex;
+  }
+
+  // Read the fields back in the order encode_payload() writes them,
+  // advancing off past every item.  Decoded list entries are appended to
+  // whatever the lists already hold.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(int), reinterpret_cast<char*>(&new_auth));
+    off += sizeof(int);
+    s.copy(off, sizeof(int), reinterpret_cast<char*>(&old_auth));
+    off += sizeof(int);
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+
+    // exports: count, then that many inode numbers
+    int count;
+    s.copy(off, sizeof(int), reinterpret_cast<char*>(&count));
+    off += sizeof(int);
+    for (int left = count; left > 0; --left) {
+      inodeno_t item;
+      s.copy(off, sizeof(item), reinterpret_cast<char*>(&item));
+      off += sizeof(inodeno_t);
+      exports.push_back(item);
+    }
+
+    // subdirs: count, then that many inode numbers
+    s.copy(off, sizeof(int), reinterpret_cast<char*>(&count));
+    off += sizeof(int);
+    for (int left = count; left > 0; --left) {
+      inodeno_t item;
+      s.copy(off, sizeof(item), reinterpret_cast<char*>(&item));
+      off += sizeof(inodeno_t);
+      subdirs.push_back(item);
+    }
+  }
+
+  // Append the scalar fields and then both counted lists to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&new_auth), sizeof(int));
+    s.append(reinterpret_cast<char*>(&old_auth), sizeof(int));
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+
+    // exports
+    int count = exports.size();
+    s.append(reinterpret_cast<char*>(&count), sizeof(int));
+    for (list<inodeno_t>::iterator p = exports.begin(); p != exports.end(); ++p) {
+      inodeno_t item = *p;
+      s.append(reinterpret_cast<char*>(&item), sizeof(item));
+    }
+
+    // subdirs
+    count = subdirs.size();
+    s.append(reinterpret_cast<char*>(&count), sizeof(int));
+    for (list<inodeno_t>::iterator p = subdirs.begin(); p != subdirs.end(); ++p) {
+      inodeno_t item = *p;
+      s.append(reinterpret_cast<char*>(&item), sizeof(item));
+    }
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRNOTIFYACK_H
+#define __MEXPORTDIRNOTIFYACK_H
+
+#include "msg/Message.h"
+#include <string>
+using namespace std;
+
+// Message of type MSG_MDS_EXPORTDIRNOTIFYACK carrying a single inode number.
+class MExportDirNotifyAck : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MExportDirNotifyAck() {}
+  MExportDirNotifyAck(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRNOTIFYACK),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "ExNotA"; }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from s at offset off, then advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRPREP_H
+#define __MEXPORTDIRPREP_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_EXPORTDIRPREP.
+// Carries a base inode number plus: a list of export inode numbers, a list
+// of inode discover records (each with its dentry name and containing dir
+// ino), and a map of dir discover records.  The CInodeDiscover and
+// CDirDiscover objects held here are deleted by the destructor.
+class MExportDirPrep : public Message {
+  inodeno_t ino;
+
+  /* nested export discover payload.
+     not all inodes will have dirs; they may require a separate discover.
+     dentries are the links to each inode.
+     dirs map includes base dir (ino)
+  */
+  list<inodeno_t> exports;
+
+  list<CInodeDiscover*> inodes;
+  map<inodeno_t,inodeno_t> inode_dirino;   // inode ino -> containing dir ino
+  map<inodeno_t,string> inode_dentry;      // inode ino -> dentry name
+
+  map<inodeno_t,CDirDiscover*> dirs;       // dir ino -> dir discover record
+
+  bool b_did_assim;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MExportDirPrep() : b_did_assim(false) {}
+  MExportDirPrep(CInode *in) :
+    Message(MSG_MDS_EXPORTDIRPREP),
+    ino(in->ino()),
+    b_did_assim(false) {}
+
+  // Free every discover record stored in the message.
+  ~MExportDirPrep() {
+    list<CInodeDiscover*>::iterator ip = inodes.begin();
+    while (ip != inodes.end()) {
+      delete *ip;
+      ++ip;
+    }
+    map<inodeno_t,CDirDiscover*>::iterator dp = dirs.begin();
+    while (dp != dirs.end()) {
+      delete dp->second;
+      ++dp;
+    }
+  }
+
+  virtual char *get_type_name() { return "ExP"; }
+
+  inodeno_t get_ino() { return ino; }
+  list<inodeno_t>& get_exports() { return exports; }
+  list<CInodeDiscover*>& get_inodes() { return inodes; }
+
+  // The three lookups below use map::operator[], so asking about an ino
+  // that is not present inserts a default-constructed entry.
+  inodeno_t get_containing_dirino(inodeno_t ino) {
+    return inode_dirino[ino];
+  }
+  string& get_dentry(inodeno_t ino) {
+    return inode_dentry[ino];
+  }
+  CDirDiscover* get_dir(inodeno_t ino) {
+    return dirs[ino];
+  }
+  bool have_dir(inodeno_t ino) {
+    return dirs.count(ino) != 0;
+  }
+
+  bool did_assim() { return b_did_assim; }
+  void mark_assim() { b_did_assim = true; }
+
+  void add_export(inodeno_t dirino) {
+    exports.push_back(dirino);
+  }
+  // Store an inode record together with its containing dir and dentry name.
+  // insert() keeps any mapping already recorded for the same inode number.
+  void add_inode(inodeno_t dirino, string& dentry, CInodeDiscover *in) {
+    inodes.push_back(in);
+    inode_dirino.insert(pair<inodeno_t, inodeno_t>(in->get_ino(), dirino));
+    inode_dentry.insert(pair<inodeno_t, string>(in->get_ino(), dentry));
+  }
+  void add_dir(CDirDiscover *dir) {
+    dirs.insert(pair<inodeno_t, CDirDiscover*>(dir->get_ino(), dir));
+  }
+
+  // Rebuild the message contents from payload, reading sections in the
+  // order encode_payload() writes them.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+
+    // exports: count, then that many inode numbers
+    int num_exports;
+    payload.copy(off, sizeof(int), reinterpret_cast<char*>(&num_exports));
+    off += sizeof(int);
+    for (int left = num_exports; left > 0; --left) {
+      inodeno_t dirino;
+      payload.copy(off, sizeof(dirino), reinterpret_cast<char*>(&dirino));
+      off += sizeof(dirino);
+      exports.push_back(dirino);
+    }
+
+    // inodes: count, then (inode record, dentry name, dir ino) triples
+    int num_inodes;
+    payload.copy(off, sizeof(int), reinterpret_cast<char*>(&num_inodes));
+    off += sizeof(int);
+    for (int left = num_inodes; left > 0; --left) {
+      // inode
+      CInodeDiscover *in = new CInodeDiscover;
+      in->_decode(payload, off);
+      inodes.push_back(in);
+
+      // dentry
+      string dname;
+      _decode(dname, payload, off);
+      inode_dentry[in->get_ino()] = dname;
+
+      // dir ino
+      inodeno_t dirino;
+      payload.copy(off, sizeof(dirino), reinterpret_cast<char*>(&dirino));
+      off += sizeof(dirino);
+      inode_dirino[in->get_ino()] = dirino;
+    }
+
+    // dirs: count, then that many dir records
+    int num_dirs;
+    payload.copy(off, sizeof(int), reinterpret_cast<char*>(&num_dirs));
+    off += sizeof(int);
+    for (int left = num_dirs; left > 0; --left) {
+      CDirDiscover *dir = new CDirDiscover;
+      dir->_decode(payload, off);
+      dirs[dir->get_ino()] = dir;
+    }
+  }
+
+  // Append ino, then the exports, inodes and dirs sections, to payload.
+  virtual void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+
+    // exports
+    int num_exports = exports.size();
+    payload.append(reinterpret_cast<char*>(&num_exports), sizeof(int));
+    for (list<inodeno_t>::iterator ep = exports.begin(); ep != exports.end(); ++ep) {
+      inodeno_t dirino = *ep;
+      payload.append(reinterpret_cast<char*>(&dirino), sizeof(dirino));
+    }
+
+    // inodes
+    int num_inodes = inodes.size();
+    payload.append(reinterpret_cast<char*>(&num_inodes), sizeof(int));
+    for (list<CInodeDiscover*>::iterator ip = inodes.begin(); ip != inodes.end(); ++ip) {
+      (*ip)->_encode(payload);
+
+      // dentry
+      _encode(inode_dentry[(*ip)->get_ino()], payload);
+
+      // dir ino
+      inodeno_t dirino = inode_dirino[(*ip)->get_ino()];
+      payload.append(reinterpret_cast<char*>(&dirino), sizeof(dirino));
+    }
+
+    // dirs
+    int num_dirs = dirs.size();
+    payload.append(reinterpret_cast<char*>(&num_dirs), sizeof(int));
+    for (map<inodeno_t,CDirDiscover*>::iterator dp = dirs.begin(); dp != dirs.end(); ++dp)
+      dp->second->_encode(payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRPREPACK_H
+#define __MEXPORTDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_EXPORTDIRPREPACK carrying a single inode number.
+class MExportDirPrepAck : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MExportDirPrepAck() {}
+  MExportDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRPREPACK),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "ExPAck"; }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from s at offset off, then advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MEXPORTDIRWARNING_H
+#define __MEXPORTDIRWARNING_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_EXPORTDIRWARNING carrying a single inode number.
+class MExportDirWarning : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MExportDirWarning() {}
+  MExportDirWarning(inodeno_t ino) :
+    Message(MSG_MDS_EXPORTDIRWARNING),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "ExW"; }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from s at offset off, then advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MFAILURE_H
+#define __MFAILURE_H
+
+#include "msg/Message.h"
+
+
+// Message of type MSG_FAILURE.
+// Payload: the raw bytes of a msg_addr_t followed by the raw bytes of an
+// entity_inst_t.
+class MFailure : public Message {
+ public:
+  msg_addr_t failed;
+  entity_inst_t inst;
+
+  // Default-constructs both fields; decode_payload() overwrites them.
+  MFailure() {}
+  MFailure(msg_addr_t f, entity_inst_t& i) :
+    Message(MSG_FAILURE),
+    failed(f), inst(i) {}
+
+  virtual char *get_type_name() { return "fail"; }
+
+  msg_addr_t get_failed() { return failed; }
+  entity_inst_t& get_inst() { return inst; }
+
+  // Append failed and then inst to payload as raw bytes.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&failed), sizeof(failed));
+    payload.append(reinterpret_cast<char*>(&inst), sizeof(inst));
+  }
+
+  // Read failed and then inst from the start of payload.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(failed), reinterpret_cast<char*>(&failed));
+    off += sizeof(failed);
+    payload.copy(off, sizeof(inst), reinterpret_cast<char*>(&inst));
+    off += sizeof(inst);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MFAILUREACK_H
+#define __MFAILUREACK_H
+
+#include "MFailure.h"
+
+
+// Message of type MSG_FAILURE_ACK.
+// Payload: the raw bytes of the msg_addr_t taken from the MFailure being
+// acknowledged.
+class MFailureAck : public Message {
+ public:
+  msg_addr_t failed;
+
+  // Build an ack for m by copying its failed address.
+  MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) {
+    this->failed = m->get_failed();
+  }
+  // Default-constructs failed; decode_payload() overwrites it.
+  MFailureAck() {}
+
+  msg_addr_t get_failed() { return failed; }
+
+  // Read failed from s at offset off, then advance off past it.
+  // The read must start at off (not at a fixed 0) so that it matches the
+  // amount by which off is advanced.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(failed), (char*)&failed);
+    off += sizeof(failed);
+  }
+  // Append the raw bytes of failed to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&failed, sizeof(failed));
+  }
+
+  virtual char *get_type_name() { return "faila"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MGENERICMESSAGE_H
+#define __MGENERICMESSAGE_H
+
+#include "msg/Message.h"
+
+// Message with no payload of its own; only the type passed to the
+// constructor distinguishes one instance from another.
+class MGenericMessage : public Message {
+  char tname[20];   // "generic<type>", built once in the constructor
+  //long pcid;
+
+ public:
+  MGenericMessage(int t) : Message(t) {
+    // Bounded write so the name can never run past tname, whatever value
+    // get_type() returns.
+    snprintf(tname, sizeof(tname), "generic%d", get_type());
+  }
+
+  //void set_pcid(long pcid) { this->pcid = pcid; }
+  //long get_pcid() { return pcid; }
+
+  // Returns the name built in the constructor.
+  char *get_type_name() { return tname; }
+
+  // Nothing to decode: the message carries no fields.
+  virtual void decode_payload() {
+    //int off = 0;
+    //payload.copy(off, sizeof(pcid), (char*)&pcid);
+    //off += sizeof(pcid);
+  }
+  // Nothing to encode: the message carries no fields.
+  virtual void encode_payload() {
+    //payload.append((char*)&pcid, sizeof(pcid));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIR_H
+#define __MHASHDIR_H
+
+#include "msg/Message.h"
+
+// Message of type MSG_MDS_HASHDIR.
+// Payload: ino, nden, the byte length of the state buffer (as a size_t),
+// then the state buffer itself.
+class MHashDir : public Message {
+  inodeno_t ino;
+  bufferlist state;
+  int nden;
+
+ public:
+  // Leaves ino and nden unset; decode_payload() populates them.
+  MHashDir() {}
+  MHashDir(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIR),
+    ino(ino),
+    nden(0) {}
+
+  virtual char *get_type_name() { return "Ha"; }
+
+  inodeno_t get_ino() { return ino; }
+  int get_nden() { return nden; }
+  bufferlist& get_state() { return state; }
+  bufferlist* get_state_ptr() { return &state; }
+
+  void inc_nden() { nden++; }
+  void set_nden(int n) { nden = n; }
+
+  // Append ino, nden and the state length, then hand the state buffer to
+  // payload via claim_append().
+  // NOTE(review): claim_append presumably moves the buffers out of state
+  // rather than copying them -- confirm against bufferlist.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+    payload.append(reinterpret_cast<char*>(&nden), sizeof(nden));
+    size_t state_len = state.length();
+    payload.append(reinterpret_cast<char*>(&state_len), sizeof(state_len));
+    payload.claim_append(state);
+  }
+
+  // Read ino, nden and the state length from the start of payload, then
+  // make state refer to the following state_len bytes of payload.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+    payload.copy(off, sizeof(nden), reinterpret_cast<char*>(&nden));
+    off += sizeof(nden);
+
+    size_t state_len;
+    payload.copy(off, sizeof(state_len), reinterpret_cast<char*>(&state_len));
+    off += sizeof(state_len);
+    state.substr_of(payload, off, state_len);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRACK_H
+#define __MHASHDIRACK_H
+
+#include "MHashDir.h"
+
+// Message of type MSG_MDS_HASHDIRACK carrying a single inode number.
+class MHashDirAck : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MHashDirAck() {}
+  MHashDirAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRACK),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "HAck"; }
+
+  // Append the raw bytes of ino to payload.
+  virtual void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from the start of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), reinterpret_cast<char*>(&ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRDISCOVER_H
+#define __MHASHDIRDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_HASHDIRDISCOVER.
+// Payload: an inode number followed by a path string; both are taken from
+// the CInode handed to the constructor.
+class MHashDirDiscover : public Message {
+  inodeno_t ino;   // set from CInode::ino()
+  string path;     // filled in by CInode::make_path()
+
+ public:
+  // Leaves ino unset; decode_payload() populates the fields.
+  MHashDirDiscover() {}
+
+  // Build the message from an inode: record its path, then its number.
+  MHashDirDiscover(CInode *in) :
+    Message(MSG_MDS_HASHDIRDISCOVER) {
+    in->make_path(path);
+    ino = in->ino();
+  }
+
+  inodeno_t get_ino() { return ino; }
+  string& get_path() { return path; }
+
+  virtual char *get_type_name() { return "HDis"; }
+
+  // Append the raw bytes of ino, then the path, to payload.
+  void encode_payload() {
+    char *raw_ino = reinterpret_cast<char*>(&ino);
+    payload.append(raw_ino, sizeof(ino));
+    _encode(path, payload);
+  }
+
+  // Read ino from the start of payload, then the path via _decode().
+  void decode_payload() {
+    int off = 0;
+    char *raw_ino = reinterpret_cast<char*>(&ino);
+    payload.copy(off, sizeof(ino), raw_ino);
+    off += sizeof(ino);
+    _decode(path, payload, off);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRDISCOVERACK_H
+#define __MHASHDIRDISCOVERACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_HASHDIRDISCOVERACK.
+// Payload: an inode number followed by a success flag.
+class MHashDirDiscoverAck : public Message {
+  inodeno_t ino;
+  bool success;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  bool is_success() { return success; }
+
+  // Leaves both fields unset; decode_payload() populates them.
+  MHashDirDiscoverAck() {}
+
+  // ino:     inode number this ack refers to.
+  // success: outcome to report; defaults to true.
+  MHashDirDiscoverAck(inodeno_t ino, bool success=true) :
+    Message(MSG_MDS_HASHDIRDISCOVERACK) {
+    this->ino = ino;
+    // Store the caller's flag; a fixed value here would make is_success()
+    // ignore the argument (and its default of true).
+    this->success = success;
+  }
+  virtual char *get_type_name() { return "HDisA"; }
+
+
+  // Read ino and then success from the start of payload.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(success), (char*)&success);
+    off += sizeof(success);
+  }
+
+  // Append the raw bytes of ino and then success to payload.
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&success, sizeof(success));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRNOTIFY_H
+#define __MHASHDIRNOTIFY_H
+
+#include "msg/Message.h"
+
+// Message of type MSG_MDS_HASHDIRNOTIFY.
+// Payload: an inode number followed by an integer "from" value.
+class MHashDirNotify : public Message {
+  inodeno_t ino;
+  int from;
+
+ public:
+  // Leaves both fields unset; decode_payload() populates them.
+  MHashDirNotify() {}
+  MHashDirNotify(inodeno_t ino, int from) :
+    Message(MSG_MDS_HASHDIRNOTIFY),
+    ino(ino),
+    from(from) {}
+
+  inodeno_t get_ino() { return ino; }
+  int get_from() { return from; }
+
+  virtual char *get_type_name() { return "HN"; }
+
+  // Append the raw bytes of ino and then from to payload.
+  virtual void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+    payload.append(reinterpret_cast<char*>(&from), sizeof(from));
+  }
+
+  // Read ino and then from, starting at the beginning of payload.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+    payload.copy(off, sizeof(from), reinterpret_cast<char*>(&from));
+    off += sizeof(from);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRPREP_H
+#define __MHASHDIRPREP_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_HASHDIRPREP.
+// Payload: an inode number followed by a counted sequence of
+// (dentry name, inode discover record) pairs.  The CInodeDiscover objects
+// stored in the map are deleted by the destructor.
+class MHashDirPrep : public Message {
+  inodeno_t ino;
+  bool assim;
+
+  // subdir dentry names + inodes
+  map<string,CInodeDiscover*> inodes;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MHashDirPrep() : assim(false) { }
+  MHashDirPrep(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREP),
+    ino(ino),
+    assim(false) { }
+
+  // Free every inode record stored in the message.
+  ~MHashDirPrep() {
+    map<string,CInodeDiscover*>::iterator p = inodes.begin();
+    while (p != inodes.end()) {
+      delete p->second;
+      ++p;
+    }
+  }
+
+  virtual char *get_type_name() { return "HP"; }
+
+  inodeno_t get_ino() { return ino; }
+  map<string,CInodeDiscover*>& get_inodes() { return inodes; }
+
+  bool did_assim() { return assim; }
+  // May only be called once: asserts that the flag is not already set.
+  void mark_assim() { assert(!assim); assim = true; }
+
+  // Store in under dentry, replacing any pointer already stored there.
+  void add_inode(const string& dentry, CInodeDiscover *in) {
+    inodes[dentry] = in;
+  }
+
+  // Append ino, the entry count, and each (dentry, inode) pair to payload.
+  virtual void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+
+    // inodes
+    int num_inodes = inodes.size();
+    payload.append(reinterpret_cast<char*>(&num_inodes), sizeof(int));
+    for (map<string,CInodeDiscover*>::iterator p = inodes.begin(); p != inodes.end(); ++p) {
+      _encode(p->first, payload);     // dentry
+      p->second->_encode(payload);    // inode
+    }
+  }
+
+  // Read ino and the entry count from the start of payload, then rebuild
+  // the map one (dentry, inode) pair at a time.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+
+    // inodes
+    int num_inodes;
+    payload.copy(off, sizeof(int), reinterpret_cast<char*>(&num_inodes));
+    off += sizeof(int);
+    for (int left = num_inodes; left > 0; --left) {
+      // dentry
+      string dname;
+      _decode(dname, payload, off);
+
+      // inode
+      CInodeDiscover *in = new CInodeDiscover;
+      in->_decode(payload, off);
+
+      inodes[dname] = in;
+    }
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRPREPACK_H
+#define __MHASHDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+// Message of type MSG_MDS_HASHDIRPREPACK carrying a single inode number.
+class MHashDirPrepAck : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MHashDirPrepAck() {}
+  MHashDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREPACK),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "HPAck"; }
+
+  // Append the raw bytes of ino to payload.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from the start of payload.
+  void decode_payload() {
+    payload.copy(0, sizeof(ino), reinterpret_cast<char*>(&ino));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHREADDIR_H
+#define __MHASHREADDIR_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+// Message of type MSG_MDS_HASHREADDIR carrying a single inode number.
+class MHashReaddir : public Message {
+  inodeno_t ino;
+
+ public:
+  // Leaves ino unset; decode_payload() populates it.
+  MHashReaddir() { }
+  MHashReaddir(inodeno_t ino) :
+    Message(MSG_MDS_HASHREADDIR),
+    ino(ino) { }
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "Hls"; }
+
+  // Append the raw bytes of ino to payload.
+  virtual void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+  }
+
+  // Read ino from the start of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), reinterpret_cast<char*>(&ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHREADDIRREPLY_H
+#define __MHASHREADDIRREPLY_H
+
+#include "MClientReply.h"
+
+// Message of type MSG_MDS_HASHREADDIRREPLY.
+// Payload: an inode number, an entry count, then that many
+// (dentry name, InodeStat) pairs.  The InodeStat objects held in dir_in
+// are deleted by the destructor.
+class MHashReaddirReply : public Message {
+  inodeno_t ino;
+
+  list<InodeStat*> dir_in;   // one InodeStat per entry
+  list<string> dir_dn;       // dentry names, parallel to dir_in
+
+  int num;                   // set by the constructor; not part of the payload
+
+ public:
+  // Leaves ino and num unset; decode_payload() populates ino and the lists.
+  MHashReaddirReply() { }
+
+  // Takes over the contents of inls and dnls by swapping them into the
+  // message, so the caller's lists hold this message's previous (empty)
+  // contents afterwards.
+  MHashReaddirReply(inodeno_t _ino, list<InodeStat*>& inls, list<string>& dnls, int n) :
+    Message(MSG_MDS_HASHREADDIRREPLY),
+    ino(_ino),
+    num(n) {
+    dir_in.swap(inls);
+    dir_dn.swap(dnls);
+  }
+  ~MHashReaddirReply() {
+    for (list<InodeStat*>::iterator it = dir_in.begin(); it != dir_in.end(); it++)
+      delete *it;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  list<InodeStat*>& get_in() { return dir_in; }
+  list<string>& get_dn() { return dir_dn; }
+
+  // NOTE(review): this is the same name string MHashReaddir returns --
+  // confirm whether the reply was meant to have a distinct name.
+  virtual char *get_type_name() { return "Hls"; }
+
+  // Read ino and the entry count from the start of payload, then rebuild
+  // both lists one (dentry, InodeStat) pair at a time.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n;
+    // The count sits immediately after ino, so it must be read at off;
+    // n cannot serve as the offset because it has no value yet.
+    payload.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    for (int i=0; i<n; i++) {
+      string dn;
+      ::_decode(dn, payload, off);
+      dir_dn.push_back(dn);
+
+      InodeStat *ci = new InodeStat;
+      ci->_decode(payload, off);
+      dir_in.push_back(ci);
+    }
+  }
+
+  // Append ino, the number of entries in dir_in, and each
+  // (dentry, InodeStat) pair to payload.  dir_dn is walked in step with
+  // dir_in, so it must hold at least as many entries.
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    int n = dir_in.size(); // FIXME?
+    payload.append((char*)&n, sizeof(n));
+    list<string>::iterator pdn = dir_dn.begin();
+    for (list<InodeStat*>::iterator pin = dir_in.begin();
+         pin != dir_in.end();
+         ++pin, ++pdn) {
+      ::_encode(*pdn, payload);
+      (*pin)->_encode(payload);
+    }
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHEARTBEAT_H
+#define __MHEARTBEAT_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+// Message of type MSG_MDS_HEARTBEAT.
+// Payload: the raw bytes of an mds_load_t, the beat number, then a counted
+// sequence of (int, float) pairs from import_map.
+class MHeartbeat : public Message {
+  mds_load_t load;
+  int beat;
+  map<int, float> import_map;
+
+ public:
+  // Leaves beat unset; decode_payload() populates the fields.
+  MHeartbeat() {}
+  MHeartbeat(mds_load_t& load, int beat) :
+    Message(MSG_MDS_HEARTBEAT) {
+    this->load = load;
+    this->beat = beat;
+  }
+
+  virtual char *get_type_name() { return "HB"; }
+
+  mds_load_t& get_load() { return load; }
+  int get_beat() { return beat; }
+  map<int, float>& get_import_map() {
+    return import_map;
+  }
+
+  // Append load, beat, the map size, and every (key, value) pair to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&load), sizeof(load));
+    s.append(reinterpret_cast<char*>(&beat), sizeof(beat));
+
+    int count = import_map.size();
+    s.append(reinterpret_cast<char*>(&count), sizeof(count));
+    for (map<int, float>::iterator p = import_map.begin(); p != import_map.end(); ++p) {
+      // Copy into locals so each value has a writable address to pass.
+      int key = p->first;
+      s.append(reinterpret_cast<char*>(&key), sizeof(key));
+      float val = p->second;
+      s.append(reinterpret_cast<char*>(&val), sizeof(val));
+    }
+
+  }
+
+  // Read the fields back in the order encode_payload() writes them,
+  // advancing off past every item.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(load), reinterpret_cast<char*>(&load));
+    off += sizeof(load);
+    s.copy(off, sizeof(beat), reinterpret_cast<char*>(&beat));
+    off += sizeof(beat);
+
+    int remaining;
+    s.copy(off, sizeof(remaining), reinterpret_cast<char*>(&remaining));
+    off += sizeof(remaining);
+    // Counts down to zero, one pair per iteration.
+    for (; remaining != 0; --remaining) {
+      int key;
+      s.copy(off, sizeof(key), reinterpret_cast<char*>(&key));
+      off += sizeof(key);
+      float val;
+      s.copy(off, sizeof(val), reinterpret_cast<char*>(&val));
+      off += sizeof(val);
+      import_map[key] = val;
+    }
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEEXPIRE_H
+#define __MINODEEXPIRE_H
+
+// Fixed-layout body of MInodeExpire.  The message copies this struct to
+// and from its payload as raw bytes (sizeof(st)), so the field order and
+// types here define the encoded layout.
+typedef struct {
+ inodeno_t ino;
+ int nonce;
+ int from;
+} MInodeExpire_st;
+
+// Message of type MSG_MDS_INODEEXPIRE.
+// Payload: one MInodeExpire_st (ino, nonce, from) copied as raw bytes.
+class MInodeExpire : public Message {
+  MInodeExpire_st st;
+
+ public:
+  // Leaves st unset; decode_payload() populates it.
+  MInodeExpire() {}
+  MInodeExpire(inodeno_t ino, int from, int nonce) :
+    Message(MSG_MDS_INODEEXPIRE) {
+    st.ino = ino;
+    st.nonce = nonce;
+    st.from = from;
+  }
+
+  inodeno_t get_ino() { return st.ino; }
+  int get_nonce() { return st.nonce; }
+  int get_from() { return st.from; }
+
+  virtual char *get_type_name() { return "InEx";}
+
+  // Append the raw bytes of st to s.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&st), sizeof(st));
+  }
+
+  // Read st from s at offset off, then advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(st), reinterpret_cast<char*>(&st));
+    off += sizeof(st);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEFILECAPS_H
+#define __MINODEFILECAPS_H
+
+// Message of type MSG_MDS_INODEFILECAPS.
+// Payload, in order: from, ino, caps (each as raw bytes).
+class MInodeFileCaps : public Message {
+  inodeno_t ino;
+  int from;
+  int caps;
+
+ public:
+  // Leaves all fields unset; decode_payload() populates them.
+  MInodeFileCaps() {}
+  // from auth
+  MInodeFileCaps(inodeno_t ino, int from, int caps) :
+    Message(MSG_MDS_INODEFILECAPS),
+    ino(ino),
+    from(from),
+    caps(caps) {}
+
+  inodeno_t get_ino() { return ino; }
+  int get_from() { return from; }
+  int get_caps() { return caps; }
+
+  virtual char *get_type_name() { return "Icap";}
+
+  // Append from, ino and caps to s.  Note that the encoded order differs
+  // from the member declaration order.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&from), sizeof(from));
+    s.append(reinterpret_cast<char*>(&ino), sizeof(ino));
+    s.append(reinterpret_cast<char*>(&caps), sizeof(caps));
+  }
+
+  // Read from, ino and caps from s at offset off, advancing off past each.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(from), reinterpret_cast<char*>(&from));
+    off += sizeof(from);
+    s.copy(off, sizeof(ino), reinterpret_cast<char*>(&ino));
+    off += sizeof(ino);
+    s.copy(off, sizeof(caps), reinterpret_cast<char*>(&caps));
+    off += sizeof(caps);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODELINK_H
+#define __MINODELINK_H
+
+// Payload record of MInodeLink; the message copies it to and from the
+// wire as raw bytes, so field order and types define the wire layout.
+struct MInodeLink_st {
+  inodeno_t ino;
+  int from;
+};
+
+// Message of type MSG_MDS_INODELINK. Its entire payload is a single
+// MInodeLink_st (ino, from) transferred as raw bytes.
+class MInodeLink : public Message {
+  MInodeLink_st st;
+
+ public:
+  // Field accessors.
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  // Default constructor: assigns no field of `st`.
+  MInodeLink() {}
+
+  MInodeLink(inodeno_t ino, int from)
+    : Message(MSG_MDS_INODELINK)
+  {
+    st.from = from;
+    st.ino = ino;
+  }
+
+  virtual char *get_type_name() { return "InL"; }
+
+  // Copies sizeof(st) bytes from `s` at `off` over `st`; advances `off`.
+  virtual void decode_payload(crope& s, int& off) {
+    char *dst = reinterpret_cast<char*>(&st);
+    s.copy(off, sizeof(st), dst);
+    off += sizeof(st);
+  }
+
+  // Appends the raw bytes of `st` to `s`.
+  virtual void encode_payload(crope& s) {
+    char *src = reinterpret_cast<char*>(&st);
+    s.append(src, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODELINKACK_H
+#define __MINODELINKACK_H
+
+// Payload record of MInodeLinkAck; the message copies it to and from the
+// wire as raw bytes, so field order and types define the wire layout.
+struct MInodeLinkAck_st {
+  inodeno_t ino;
+  bool success;
+};
+
+// Message of type MSG_MDS_INODELINKACK. Its entire payload is a single
+// MInodeLinkAck_st (ino, success) transferred as raw bytes.
+class MInodeLinkAck : public Message {
+  MInodeLinkAck_st st;
+
+ public:
+  // Field accessors.
+  inodeno_t get_ino() { return st.ino; }
+  bool is_success() { return st.success; }
+
+  // Default constructor: assigns no field of `st`.
+  MInodeLinkAck() {}
+
+  MInodeLinkAck(inodeno_t ino, bool success)
+    : Message(MSG_MDS_INODELINKACK)
+  {
+    st.success = success;
+    st.ino = ino;
+  }
+
+  virtual char *get_type_name() { return "InLA"; }
+
+  // Copies sizeof(st) bytes from `s` at `off` over `st`; advances `off`.
+  virtual void decode_payload(crope& s, int& off) {
+    char *dst = reinterpret_cast<char*>(&st);
+    s.copy(off, sizeof(st), dst);
+    off += sizeof(st);
+  }
+
+  // Appends the raw bytes of `st` to `s`.
+  virtual void encode_payload(crope& s) {
+    char *src = reinterpret_cast<char*>(&st);
+    s.append(src, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUNLINK_H
+#define __MINODEUNLINK_H
+
+// Payload record of MInodeUnlink; the message copies it to and from the
+// wire as raw bytes, so field order and types define the wire layout.
+struct MInodeUnlink_st {
+  inodeno_t ino;
+  int from;
+};
+
+// Message of type MSG_MDS_INODEUNLINK. Its entire payload is a single
+// MInodeUnlink_st (ino, from) transferred as raw bytes.
+class MInodeUnlink : public Message {
+  MInodeUnlink_st st;
+
+ public:
+  // Field accessors.
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  // Default constructor: assigns no field of `st`.
+  MInodeUnlink() {}
+
+  MInodeUnlink(inodeno_t ino, int from)
+    : Message(MSG_MDS_INODEUNLINK)
+  {
+    st.from = from;
+    st.ino = ino;
+  }
+
+  virtual char *get_type_name() { return "InUl"; }
+
+  // Copies sizeof(st) bytes from `s` at `off` over `st`; advances `off`.
+  virtual void decode_payload(crope& s, int& off) {
+    char *dst = reinterpret_cast<char*>(&st);
+    s.copy(off, sizeof(st), dst);
+    off += sizeof(st);
+  }
+
+  // Appends the raw bytes of `st` to `s`.
+  virtual void encode_payload(crope& s) {
+    char *src = reinterpret_cast<char*>(&st);
+    s.append(src, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUNLINKACK_H
+#define __MINODEUNLINKACK_H
+
+// Payload record of MInodeUnlinkAck; the message copies it to and from
+// the wire as raw bytes.
+struct MInodeUnlinkAck_st {
+  inodeno_t ino;
+};
+
+// Message of type MSG_MDS_INODEUNLINKACK. Its entire payload is a single
+// MInodeUnlinkAck_st (just an inode number) transferred as raw bytes.
+class MInodeUnlinkAck : public Message {
+  MInodeUnlinkAck_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+
+  // Default constructor: assigns no field of `st`.
+  MInodeUnlinkAck() {}
+
+  MInodeUnlinkAck(inodeno_t ino)
+    : Message(MSG_MDS_INODEUNLINKACK)
+  {
+    st.ino = ino;
+  }
+
+  virtual char *get_type_name() { return "InUlA"; }
+
+  // Copies sizeof(st) bytes from `s` at `off` over `st`; advances `off`.
+  virtual void decode_payload(crope& s, int& off) {
+    char *dst = reinterpret_cast<char*>(&st);
+    s.copy(off, sizeof(st), dst);
+    off += sizeof(st);
+  }
+
+  // Appends the raw bytes of `st` to `s`.
+  virtual void encode_payload(crope& s) {
+    char *src = reinterpret_cast<char*>(&st);
+    s.append(src, sizeof(st));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUPDATE_H
+#define __MINODEUPDATE_H
+
+#include "msg/Message.h"
+
+#include <set>
+using namespace std;
+
+// Message of type MSG_MDS_INODEUPDATE carrying a nonce plus an opaque
+// byte string produced by CInode::encode_basic_state().
+//
+// Wire order: nonce (int), length of the state (size_t), state bytes.
+class MInodeUpdate : public Message {
+  int nonce;
+  crope inode_basic_state;
+
+ public:
+  // Interprets the first sizeof(inodeno_t) bytes of the stored state as
+  // an inodeno_t and returns it.
+  inodeno_t get_ino() {
+    inodeno_t result;
+    inode_basic_state.copy(0, sizeof(result), reinterpret_cast<char*>(&result));
+    return result;
+  }
+  int get_nonce() { return nonce; }
+
+  // Default constructor: assigns neither field.
+  MInodeUpdate() {}
+
+  MInodeUpdate(CInode *in, int nonce)
+    : Message(MSG_MDS_INODEUPDATE)
+  {
+    this->nonce = nonce;
+    inode_basic_state = in->encode_basic_state();
+  }
+
+  virtual char *get_type_name() { return "Iup"; }
+
+  // Reads nonce, then a size_t length, then that many bytes of state from
+  // `s` starting at `off`; `off` ends just past the state bytes.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(int), reinterpret_cast<char*>(&nonce));
+    off += sizeof(int);
+
+    size_t state_len;
+    s.copy(off, sizeof(state_len), reinterpret_cast<char*>(&state_len));
+    off += sizeof(state_len);
+
+    inode_basic_state = s.substr(off, state_len);
+    off += state_len;
+  }
+
+  // Appends nonce, the state length (size_t) and the state bytes to `s`.
+  virtual void encode_payload(crope& s) {
+    s.append(reinterpret_cast<char*>(&nonce), sizeof(int));
+
+    size_t state_len = inode_basic_state.length();
+    s.append(reinterpret_cast<char*>(&state_len), sizeof(state_len));
+    s.append(inode_basic_state);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MLOCK_H
+#define __MLOCK_H
+
+#include "msg/Message.h"
+
+// Object-type tags stored in MLock::otype.
+#define LOCK_OTYPE_IHARD 1
+#define LOCK_OTYPE_IFILE 2
+#define LOCK_OTYPE_DIR 3
+#define LOCK_OTYPE_DN 4
+
+// Action codes. The value alone decides the direction: see
+// LOCK_AC_FOR_REPLICA / LOCK_AC_FOR_AUTH below (<= 10 vs >= 11).
+
+// for replicas
+#define LOCK_AC_SYNC 0
+#define LOCK_AC_MIXED 1
+#define LOCK_AC_LOCK 2
+
+#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock
+#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock
+// NOTE(review): LOCK_AC_LOCKNAK is listed in the "for replicas" group, but
+// its value 12 satisfies LOCK_AC_FOR_AUTH, not LOCK_AC_FOR_REPLICA --
+// confirm which direction is intended.
+#define LOCK_AC_LOCKNAK 12 // for dentry xlock
+
+
+// Direction predicates over an action code `a`.
+#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10)
+#define LOCK_AC_FOR_AUTH(a) ((a) >= 11)
+
+// for auth
+
+#define LOCK_AC_SYNCACK 13
+#define LOCK_AC_MIXEDACK 14
+#define LOCK_AC_LOCKACK 15
+
+
+#define LOCK_AC_REQREAD 19
+#define LOCK_AC_REQWRITE 20
+
+#define LOCK_AC_REQXLOCK 21
+#define LOCK_AC_REQXLOCKC 22 // create if necessary
+#define LOCK_AC_UNXLOCK 23
+
+// NOTE(review): expands to nothing, so any use yields no tokens at all --
+// presumably a placeholder for an action-code-to-name mapping; confirm.
+#define lock_ac_name(x)
+
+
+// Lock message (type MSG_MDS_LOCK).
+//
+// Carries an action code (one of the LOCK_AC_* values), the id of the
+// asker, an object-type tag (LOCK_OTYPE_*) with the inode number it refers
+// to, and optionally a dentry name, a path and a data blob.
+//
+// Wire order (encode_payload / decode_payload):
+//   action, asker, otype, ino (raw bytes), then dn, path, data via
+//   ::_encode / ::_decode.
+class MLock : public Message {
+  int asker;  // who is initiating this request
+  int action;  // action type
+
+  char otype;  // lock object type
+  inodeno_t ino;  // ino ref, or possibly
+  string dn;  // dentry name
+  bufferlist data;  // and possibly some data
+  string path;  // possibly a path too (for dentry lock discovers)
+
+ public:
+  // Accessors. The reference-returning ones expose the stored member itself.
+  inodeno_t get_ino() { return ino; }
+  string& get_dn() { return dn; }
+  bufferlist& get_data() { return data; }
+  int get_asker() { return asker; }
+  int get_action() { return action; }
+  int get_otype() { return otype; }
+  string& get_path() { return path; }
+
+  // Default constructor: assigns none of the scalar fields.
+  MLock() {}
+
+  // Sets only action and asker; otype/ino stay unassigned until one of
+  // set_ino / set_dirino / set_dn is called.
+  MLock(int action, int asker) :
+    Message(MSG_MDS_LOCK) {
+    this->action = action;
+    this->asker = asker;
+  }
+  virtual char *get_type_name() { return "ILock"; }
+
+  // Target an inode with a caller-chosen object type `ot`.
+  void set_ino(inodeno_t ino, char ot) {
+    otype = ot;
+    this->ino = ino;
+  }
+  // Target a directory: tags the message LOCK_OTYPE_DIR and records
+  // `dirino` as the inode number.
+  void set_dirino(inodeno_t dirino) {
+    otype = LOCK_OTYPE_DIR;
+    // Store the argument. This used to read `this->ino = ino`, which
+    // assigned the member to itself and silently dropped `dirino`.
+    this->ino = dirino;
+  }
+  // Target a dentry: tags the message LOCK_OTYPE_DN and records the
+  // directory inode number together with the dentry name.
+  void set_dn(inodeno_t dirino, string& dn) {
+    otype = LOCK_OTYPE_DN;
+    this->ino = dirino;
+    this->dn = dn;
+  }
+  // Hands `data` to the member via bufferlist::claim().
+  // NOTE(review): presumably this takes over the caller's buffers rather
+  // than copying them -- confirm against bufferlist.
+  void set_data(bufferlist& data) {
+    this->data.claim( data );
+  }
+  void set_path(const string& p) {
+    path = p;
+  }
+
+  // Reads every field back from `payload`, in the order encode_payload
+  // wrote them.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off,sizeof(action), (char*)&action);
+    off += sizeof(action);
+    payload.copy(off,sizeof(asker), (char*)&asker);
+    off += sizeof(asker);
+    payload.copy(off,sizeof(otype), (char*)&otype);
+    off += sizeof(otype);
+    payload.copy(off,sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    ::_decode(dn, payload, off);
+    ::_decode(path, payload, off);
+    ::_decode(data, payload, off);
+  }
+  // Appends every field to `payload`: the four fixed-size fields as raw
+  // bytes, then dn, path and data through ::_encode.
+  virtual void encode_payload() {
+    payload.append((char*)&action, sizeof(action));
+    payload.append((char*)&asker, sizeof(asker));
+    payload.append((char*)&otype, sizeof(otype));
+    payload.append((char*)&ino, sizeof(inodeno_t));
+    ::_encode(dn, payload);
+    ::_encode(path, payload);
+    ::_encode(data, payload);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSBOOT_H
+#define __MMDSBOOT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MDS_BOOT. It currently has no payload: both
+// payload hooks are empty (the superblock code below is disabled).
+class MMDSBoot : public Message {
+ public:
+  MMDSBoot()
+    : Message(MSG_MDS_BOOT)
+  {}
+
+  char *get_type_name()
+  {
+    return "mdsboot";
+  }
+
+  // Nothing is appended to the payload.
+  void encode_payload()
+  {
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+
+  // Nothing is read from the payload.
+  void decode_payload()
+  {
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSGETMAP_H
+#define __MMDSGETMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MDS_GETMAP. It currently has no payload: both
+// payload hooks are empty (the superblock code below is disabled).
+class MMDSGetMap : public Message {
+ public:
+  MMDSGetMap()
+    : Message(MSG_MDS_GETMAP)
+  {}
+
+  char *get_type_name()
+  {
+    return "mdsgetmap";
+  }
+
+  // Nothing is appended to the payload.
+  void encode_payload()
+  {
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+
+  // Nothing is read from the payload.
+  void decode_payload()
+  {
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMDSMAP_H
+#define __MMDSMAP_H
+
+#include "msg/Message.h"
+#include "mds/MDSMap.h"
+
+
+// Message of type MSG_MDS_MAP holding encoded maps keyed by epoch, in two
+// collections: `maps` and `incremental_maps`.
+class MMDSMap : public Message {
+ public:
+  map<epoch_t, bufferlist> maps;
+  map<epoch_t, bufferlist> incremental_maps;
+
+  // Lowest epoch key found in either collection, or 0 if both are empty.
+  // An epoch of 0 taken from `maps` is treated as "nothing yet" and is
+  // replaced by the first key of `incremental_maps` when that exists.
+  epoch_t get_first() {
+    epoch_t lowest = 0;
+    if (!maps.empty())
+      lowest = maps.begin()->first;
+    if (!incremental_maps.empty()) {
+      const epoch_t inc = incremental_maps.begin()->first;
+      if (lowest == 0 || inc < lowest)
+        lowest = inc;
+    }
+    return lowest;
+  }
+
+  // Highest epoch key found in either collection, or 0 if both are empty
+  // (same treatment of 0 as in get_first).
+  epoch_t get_last() {
+    epoch_t highest = 0;
+    if (!maps.empty())
+      highest = maps.rbegin()->first;
+    if (!incremental_maps.empty()) {
+      const epoch_t inc = incremental_maps.rbegin()->first;
+      if (highest == 0 || inc > highest)
+        highest = inc;
+    }
+    return highest;
+  }
+
+
+  MMDSMap()
+    : Message(MSG_MDS_MAP)
+  {}
+
+  // Encodes `mm` into the `maps` entry keyed by mm's own epoch.
+  MMDSMap(MDSMap *mm)
+    : Message(MSG_MDS_MAP)
+  {
+    bufferlist& slot = maps[mm->get_epoch()];
+    mm->encode(slot);
+  }
+
+
+  // marshalling: `maps` first, then `incremental_maps`
+  virtual void decode_payload() {
+    int pos = 0;
+    ::_decode(maps, payload, pos);
+    ::_decode(incremental_maps, payload, pos);
+  }
+  virtual void encode_payload() {
+    ::_encode(maps, payload);
+    ::_encode(incremental_maps, payload);
+  }
+
+  virtual char *get_type_name() { return "mdsmap"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONACK_H
+#define __MMONELECTIONACK_H
+
+#include "msg/Message.h"
+
+
+// Message of type MSG_MON_ELECTION_ACK; it declares no fields and no
+// payload hooks of its own.
+class MMonElectionAck : public Message {
+ public:
+  MMonElectionAck()
+    : Message(MSG_MON_ELECTION_ACK)
+  {}
+
+  virtual char *get_type_name()
+  {
+    return "election_ack";
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONCOLLECT_H
+#define __MMONELECTIONCOLLECT_H
+
+#include "msg/Message.h"
+
+
+// Message of type MSG_MON_ELECTION_COLLECT whose payload is a single int,
+// `read_num`, stored as raw bytes.
+class MMonElectionCollect : public Message {
+ public:
+  int read_num;
+
+  // Default constructor: does not assign read_num.
+  MMonElectionCollect() {}
+
+  MMonElectionCollect(int n)
+    : Message(MSG_MON_ELECTION_COLLECT),
+      read_num(n)
+  {}
+
+  // Reads read_num from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(read_num), reinterpret_cast<char*>(&read_num));
+    pos += sizeof(read_num);
+  }
+
+  // Appends read_num to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&read_num), sizeof(read_num));
+  }
+
+  virtual char *get_type_name() { return "MonElCollect"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONPROPOSE_H
+#define __MMONELECTIONPROPOSE_H
+
+#include "msg/Message.h"
+
+
+// Message of type MSG_MON_ELECTION_PROPOSE; it declares no fields and no
+// payload hooks of its own.
+class MMonElectionPropose : public Message {
+ public:
+  MMonElectionPropose()
+    : Message(MSG_MON_ELECTION_PROPOSE)
+  {}
+
+  virtual char *get_type_name()
+  {
+    return "election_propose";
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONREFRESH_H
+#define __MMONELECTIONREFRESH_H
+
+#include "msg/Message.h"
+
+#include "mon/Elector.h"
+
+// Message of type MSG_MON_ELECTION_REFRESH.
+//
+// Wire order: p (int), state (Elector::State, raw bytes), refresh_num (int).
+class MMonElectionRefresh : public Message {
+ public:
+  int p;
+  Elector::State state;
+  int refresh_num;
+
+  // Default constructor: does not assign p or refresh_num.
+  MMonElectionRefresh() {}
+
+  MMonElectionRefresh(int _p, Elector::State& s, int r)
+    : Message(MSG_MON_ELECTION_REFRESH),
+      p(_p),
+      state(s),
+      refresh_num(r)
+  {}
+
+  // Reads p, state, refresh_num (in that order) from `payload`.
+  void decode_payload() {
+    int pos = 0;
+
+    payload.copy(pos, sizeof(p), reinterpret_cast<char*>(&p));
+    pos += sizeof(p);
+
+    payload.copy(pos, sizeof(state), reinterpret_cast<char*>(&state));
+    pos += sizeof(state);
+
+    payload.copy(pos, sizeof(refresh_num), reinterpret_cast<char*>(&refresh_num));
+    pos += sizeof(refresh_num);
+  }
+
+  // Appends p, state, refresh_num (in that order) to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&p), sizeof(p));
+    payload.append(reinterpret_cast<char*>(&state), sizeof(state));
+    payload.append(reinterpret_cast<char*>(&refresh_num), sizeof(refresh_num));
+  }
+
+  virtual char *get_type_name() { return "MonElRefresh"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONSTATUS_H
+#define __MMONELECTIONSTATUS_H
+
+#include "msg/Message.h"
+
+#include "mon/Elector.h"
+
+// Message of type MSG_MON_ELECTION_STATUS.
+//
+// Wire order: q (int), read_num (int), then `registry` via ::_encode.
+class MMonElectionStatus : public Message {
+ public:
+  int q;
+  int read_num;
+  map<int,Elector::State> registry;
+
+  // Default constructor: does not assign q or read_num.
+  MMonElectionStatus() {}
+
+  // `reg` is taken by value and copied into `registry`.
+  MMonElectionStatus(int _q, int r, map<int,Elector::State> reg)
+    : Message(MSG_MON_ELECTION_STATUS),
+      q(_q),
+      read_num(r),
+      registry(reg)
+  {}
+
+  // Reads q, read_num and registry (in that order) from `payload`.
+  void decode_payload() {
+    int pos = 0;
+
+    payload.copy(pos, sizeof(q), reinterpret_cast<char*>(&q));
+    pos += sizeof(q);
+
+    payload.copy(pos, sizeof(read_num), reinterpret_cast<char*>(&read_num));
+    pos += sizeof(read_num);
+
+    ::_decode(registry, payload, pos);
+  }
+
+  // Appends q, read_num and registry (in that order) to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&q), sizeof(q));
+    payload.append(reinterpret_cast<char*>(&read_num), sizeof(read_num));
+    ::_encode(registry, payload);
+  }
+
+  virtual char *get_type_name() { return "MonElStatus"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMONELECTIONVICTORY_H
+#define __MMONELECTIONVICTORY_H
+
+#include "msg/Message.h"
+
+
+// Message of type MSG_MON_ELECTION_VICTORY. It has no active fields or
+// payload hooks; the `active_set` member and its marshalling are
+// commented out below.
+class MMonElectionVictory : public Message {
+ public:
+  //set<int> active_set;
+
+  MMonElectionVictory(/*set<int>& as*/)
+    : Message(MSG_MON_ELECTION_VICTORY)
+      //, active_set(as)
+  {}
+
+  virtual char *get_type_name()
+  {
+    return "election_victory";
+  }
+
+  /*
+  void encode_payload() {
+    ::_encode(active_set, payload);
+  }
+  void decode_payload() {
+    int off = 0;
+    ::_decode(active_set, payload, off);
+  }
+  */
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPINFO_H
+#define __MMONOSDMAPINFO_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message carrying two epochs, `epoch` and `mon_epoch`, written to the
+// payload as raw bytes in that order.
+//
+// NOTE(review): this class is constructed with the type tag
+// MSG_MON_OSDMAP_UPDATE_PREPARE, the same tag MMonOSDMapUpdatePrepare
+// uses -- looks like a copy/paste leftover; confirm whether a dedicated
+// "info" message type should be used instead.
+class MMonOSDMapInfo : public Message {
+ public:
+  epoch_t epoch;
+  epoch_t mon_epoch;
+
+  epoch_t get_epoch() { return epoch; }
+  epoch_t get_mon_epoch() { return mon_epoch; }
+
+  MMonOSDMapInfo(epoch_t e, epoch_t me)
+    : Message(MSG_MON_OSDMAP_UPDATE_PREPARE),
+      epoch(e),
+      mon_epoch(me)
+  {}
+
+  char *get_type_name() { return "omap_info"; }
+
+  // Appends epoch, then mon_epoch.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+    payload.append(reinterpret_cast<char*>(&mon_epoch), sizeof(mon_epoch));
+  }
+
+  // Reads epoch, then mon_epoch, from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+
+    payload.copy(pos, sizeof(mon_epoch), reinterpret_cast<char*>(&mon_epoch));
+    pos += sizeof(mon_epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPLEASE_H
+#define __MMONOSDMAPLEASE_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MON_OSDMAP_LEASE carrying an epoch and a utime_t
+// `lease_expire`, written to the payload as raw bytes in that order.
+class MMonOSDMapLease : public Message {
+  epoch_t epoch;
+  utime_t lease_expire;
+
+ public:
+  epoch_t get_epoch() { return epoch; }
+  const utime_t& get_lease_expire() { return lease_expire; }
+
+  MMonOSDMapLease(epoch_t e, utime_t le)
+    : Message(MSG_MON_OSDMAP_LEASE),
+      epoch(e),
+      lease_expire(le)
+  {}
+
+  char *get_type_name() { return "omap_lease"; }
+
+  // Appends epoch, then lease_expire.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+    payload.append(reinterpret_cast<char*>(&lease_expire), sizeof(lease_expire));
+  }
+
+  // Reads epoch, then lease_expire, from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+
+    payload.copy(pos, sizeof(lease_expire), reinterpret_cast<char*>(&lease_expire));
+    pos += sizeof(lease_expire);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPLEASEACK_H
+#define __MMONOSDMAPLEASEACK_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MON_OSDMAP_LEASE_ACK whose payload is a single
+// epoch stored as raw bytes.
+class MMonOSDMapLeaseAck : public Message {
+  epoch_t epoch;
+
+public:
+  epoch_t get_epoch() { return epoch; }
+
+  MMonOSDMapLeaseAck(epoch_t e)
+    : Message(MSG_MON_OSDMAP_LEASE_ACK),
+      epoch(e)
+  {}
+
+  char *get_type_name() { return "omap_lease_ack"; }
+
+  // Appends epoch to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+  }
+
+  // Reads epoch from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPUPDATEACK_H
+#define __MMONOSDMAPUPDATEACK_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MON_OSDMAP_UPDATE_ACK whose payload is a single
+// epoch stored as raw bytes.
+class MMonOSDMapUpdateAck : public Message {
+public:
+  epoch_t epoch;
+
+  MMonOSDMapUpdateAck(epoch_t e)
+    : Message(MSG_MON_OSDMAP_UPDATE_ACK),
+      epoch(e)
+  {}
+
+  char *get_type_name() { return "omap_update_ack"; }
+
+  // Appends epoch to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+  }
+
+  // Reads epoch from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPUPDATECOMMIT_H
+#define __MMONOSDMAPUPDATECOMMIT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MON_OSDMAP_UPDATE_COMMIT whose payload is a single
+// epoch stored as raw bytes.
+class MMonOSDMapUpdateCommit : public Message {
+ public:
+  epoch_t epoch;
+
+  MMonOSDMapUpdateCommit(epoch_t e)
+    : Message(MSG_MON_OSDMAP_UPDATE_COMMIT),
+      epoch(e)
+  {}
+
+  char *get_type_name() { return "omap_update_commit"; }
+
+  // Appends epoch to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+  }
+
+  // Reads epoch from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMONOSDMAPUPDATEPREPARE_H
+#define __MMONOSDMAPUPDATEPREPARE_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message of type MSG_MON_OSDMAP_UPDATE_PREPARE.
+//
+// Wire order: epoch (raw bytes), then map_bl and inc_map_bl via ::_encode.
+class MMonOSDMapUpdatePrepare : public Message {
+ public:
+  epoch_t epoch;
+  bufferlist map_bl;
+  bufferlist inc_map_bl;
+
+  epoch_t get_epoch() { return epoch; }
+
+  // Both bufferlists are copy-constructed from the arguments.
+  MMonOSDMapUpdatePrepare(epoch_t e, bufferlist& mbl, bufferlist& incmbl)
+    : Message(MSG_MON_OSDMAP_UPDATE_PREPARE),
+      epoch(e),
+      map_bl(mbl),
+      inc_map_bl(incmbl)
+  {}
+
+  char *get_type_name() { return "omap_update_prepare"; }
+
+  // Appends epoch, map_bl, inc_map_bl (in that order).
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&epoch), sizeof(epoch));
+    ::_encode(map_bl, payload);
+    ::_encode(inc_map_bl, payload);
+  }
+
+  // Reads epoch, map_bl, inc_map_bl (in that order) from `payload`.
+  void decode_payload() {
+    int pos = 0;
+
+    payload.copy(pos, sizeof(epoch), reinterpret_cast<char*>(&epoch));
+    pos += sizeof(epoch);
+
+    ::_decode(map_bl, payload, pos);
+    ::_decode(inc_map_bl, payload, pos);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSCONNECT_H
+#define __MNSCONNECT_H
+
+#include "msg/Message.h"
+#include "msg/tcp.h"
+
+// Message of type MSG_NS_CONNECT whose payload is one tcpaddr_t stored
+// as raw bytes.
+class MNSConnect : public Message {
+  tcpaddr_t tcpaddr;
+
+ public:
+  // Default constructor: does not assign tcpaddr.
+  MNSConnect() {}
+
+  MNSConnect(tcpaddr_t t)
+    : Message(MSG_NS_CONNECT)
+  {
+    tcpaddr = t;
+  }
+
+  char *get_type_name() { return "NSCon"; }
+
+  // Returns a reference to the stored address itself.
+  tcpaddr_t& get_addr() { return tcpaddr; }
+
+  // Appends tcpaddr to `payload`.
+  void encode_payload() {
+    char *src = reinterpret_cast<char*>(&tcpaddr);
+    payload.append(src, sizeof(tcpaddr));
+  }
+
+  // Reads tcpaddr from offset 0 of `payload`.
+  void decode_payload() {
+    char *dst = reinterpret_cast<char*>(&tcpaddr);
+    payload.copy(0, sizeof(tcpaddr), dst);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSCONNECTACK_H
+#define __MNSCONNECTACK_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+// Message of type MSG_NS_CONNECTACK carrying two ints, `rank` and `inst`,
+// written to the payload as raw bytes in that order.
+class MNSConnectAck : public Message {
+  int rank;
+  int inst;
+
+ public:
+  // Default constructor: assigns neither field.
+  MNSConnectAck() {}
+
+  // `g` (stored as inst) defaults to 0.
+  MNSConnectAck(int r, int g=0)
+    : Message(MSG_NS_CONNECTACK)
+  {
+    inst = g;
+    rank = r;
+  }
+
+  char *get_type_name() { return "NSConA"; }
+
+  int get_rank() { return rank; }
+  int get_inst() { return inst; }
+
+  // Appends rank, then inst.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&rank), sizeof(rank));
+    payload.append(reinterpret_cast<char*>(&inst), sizeof(inst));
+  }
+
+  // Reads rank, then inst, from the start of `payload`.
+  void decode_payload() {
+    unsigned pos = 0;
+
+    payload.copy(pos, sizeof(rank), reinterpret_cast<char*>(&rank));
+    pos += sizeof(rank);
+
+    payload.copy(pos, sizeof(inst), reinterpret_cast<char*>(&inst));
+    pos += sizeof(inst);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSFAILURE_H
+#define __MNSFAILURE_H
+
+#include "msg/Message.h"
+#include "msg/tcp.h"
+
+// Message of type MSG_NS_FAILURE whose payload is one entity_inst_t
+// stored as raw bytes. The former `entity` field is commented out.
+class MNSFailure : public Message {
+  //msg_addr_t entity;
+  entity_inst_t inst;
+
+ public:
+  MNSFailure() {}
+
+  MNSFailure(entity_inst_t& i)
+    : Message(MSG_NS_FAILURE),
+      //entity(w),
+      inst(i)
+  {}
+
+  char *get_type_name() { return "NSFail"; }
+
+  //msg_addr_t &get_entity() { return entity; }
+  // Returns a reference to the stored instance itself.
+  entity_inst_t &get_inst() { return inst; }
+
+  // Appends inst to `payload`.
+  void encode_payload() {
+    //payload.append((char*)&entity, sizeof(entity));
+    payload.append(reinterpret_cast<char*>(&inst), sizeof(inst));
+  }
+
+  // Reads inst from the start of `payload`.
+  void decode_payload() {
+    unsigned pos = 0;
+    //payload.copy(off, sizeof(entity), (char*)&entity);
+    //off += sizeof(entity);
+    payload.copy(pos, sizeof(inst), reinterpret_cast<char*>(&inst));
+    pos += sizeof(inst);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSLOOKUP_H
+#define __MNSLOOKUP_H
+
+#include "msg/Message.h"
+
+// Message of type MSG_NS_LOOKUP whose payload is one msg_addr_t stored
+// as raw bytes.
+class MNSLookup : public Message {
+  msg_addr_t entity;
+
+ public:
+  // Default constructor: does not assign entity.
+  MNSLookup() {}
+
+  MNSLookup(msg_addr_t e)
+    : Message(MSG_NS_LOOKUP)
+  {
+    entity = e;
+  }
+
+  char *get_type_name() { return "NSLook"; }
+
+  msg_addr_t get_entity() { return entity; }
+
+  // Appends entity to `payload`.
+  void encode_payload() {
+    payload.append(reinterpret_cast<char*>(&entity), sizeof(entity));
+  }
+
+  // Reads entity from the start of `payload`.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(entity), reinterpret_cast<char*>(&entity));
+    pos += sizeof(entity);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSLOOKUPREPLY_H
+#define __MNSLOOKUPREPLY_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+// Message of type MSG_NS_LOOKUPREPLY whose payload is `entity_map`,
+// marshalled through ::_encode / ::_decode.
+class MNSLookupReply : public Message {
+ public:
+  map<msg_addr_t, entity_inst_t> entity_map;
+
+  MNSLookupReply() {}
+
+  // The MNSLookup argument is accepted but not used; entity_map starts
+  // out empty.
+  MNSLookupReply(MNSLookup *m)
+    : Message(MSG_NS_LOOKUPREPLY)
+  {}
+
+  char *get_type_name() { return "NSLookR"; }
+
+  void encode_payload() {
+    ::_encode(entity_map, payload);
+  }
+
+  void decode_payload() {
+    int pos = 0;
+    ::_decode(entity_map, payload, pos);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSREGISTER_H
+#define __MNSREGISTER_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+// Message of type MSG_NS_REGISTER.
+//
+// Wire order: addr (msg_addr_t), rank (int), tid (long), each written as
+// its raw in-memory bytes.
+class MNSRegister : public Message {
+  msg_addr_t addr;
+  int rank;
+  long tid;
+
+ public:
+  // Default constructor: assigns none of the fields.
+  MNSRegister() {}
+
+  // `ti` is taken as long to match the `tid` member and get_tid(); it was
+  // previously declared int, which truncated a long tid wherever long is
+  // wider than int. Callers passing an int are unaffected.
+  MNSRegister(msg_addr_t a, int r, long ti) :
+    Message(MSG_NS_REGISTER) {
+    addr = a;
+    rank = r;
+    tid = ti;
+  }
+
+  char *get_type_name() { return "NSReg"; }
+
+  msg_addr_t get_entity() { return addr; }
+  int get_rank() { return rank; }
+  long get_tid() { return tid; }
+
+  // Appends addr, rank, tid (in that order) to `payload`.
+  void encode_payload() {
+    payload.append((char*)&addr, sizeof(addr));
+    payload.append((char*)&rank, sizeof(rank));
+    payload.append((char*)&tid, sizeof(tid));
+  }
+  // Reads addr, rank, tid (in that order) from the start of `payload`.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(addr), (char*)&addr);
+    off += sizeof(addr);
+    payload.copy(off, sizeof(rank), (char*)&rank);
+    off += sizeof(rank);
+    payload.copy(off, sizeof(tid), (char*)&tid);
+    off += sizeof(tid);
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MNSREGISTERACK_H
+#define __MNSREGISTERACK_H
+
+#include "msg/Message.h"
+#include "msg/TCPMessenger.h"
+
+// Message tagged MSG_NS_REGISTERACK holding an entity address and a tid.
+// Payload layout: raw bytes of entity followed by raw bytes of tid.
+class MNSRegisterAck : public Message {
+  msg_addr_t entity;
+  long tid;
+
+ public:
+  // Assigns none of the fields.
+  MNSRegisterAck() {}
+
+  // Note the argument order: tid first, then the entity.
+  MNSRegisterAck(long t, msg_addr_t e) :
+    Message(MSG_NS_REGISTERACK),
+    entity(e), tid(t) {
+  }
+
+  char *get_type_name() { return "NSRegA"; }
+
+  long get_tid() { return tid; }
+  msg_addr_t get_entity() { return entity; }
+
+  // Reads entity, then tid, from the front of payload.
+  void decode_payload() {
+    const int tid_pos = sizeof(entity);
+    payload.copy(0, sizeof(entity), (char*)&entity);
+    payload.copy(tid_pos, sizeof(tid), (char*)&tid);
+  }
+
+  // Appends entity, then tid, to payload as raw bytes.
+  void encode_payload() {
+    payload.append((char*)&entity, sizeof(entity));
+    payload.append((char*)&tid, sizeof(tid));
+  }
+};
+
+
+#endif
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MOSDBOOT_H
+#define __MOSDBOOT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message tagged MSG_OSD_BOOT whose entire payload is one OSDSuperblock,
+// copied as raw bytes.
+class MOSDBoot : public Message {
+ public:
+  OSDSuperblock sb;
+
+  // Leaves sb as default-constructed.
+  MOSDBoot() {}
+
+  // Stores a copy of s in sb.
+  MOSDBoot(OSDSuperblock& s) :
+    Message(MSG_OSD_BOOT),
+    sb(s) {
+  }
+
+  char *get_type_name() { return "oboot"; }
+
+  // Overwrites sb with the first sizeof(sb) bytes of payload.
+  void decode_payload() {
+    payload.copy(0, sizeof(sb), (char*)&sb);
+  }
+
+  // Appends the raw bytes of sb to payload.
+  void encode_payload() {
+    payload.append((char*)&sb, sizeof(sb));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDFAILURE_H
+#define __MOSDFAILURE_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_FAILURE carrying three fields: failed, inst, epoch.
+// They are serialized back to back as raw bytes in that order.
+class MOSDFailure : public Message {
+ public:
+  msg_addr_t failed;
+  entity_inst_t inst;
+  epoch_t epoch;
+
+  // Assigns none of the fields.
+  MOSDFailure() {}
+
+  // f -> failed, i -> inst, e -> epoch
+  MOSDFailure(msg_addr_t f, const entity_inst_t& i, epoch_t e) :
+    Message(MSG_OSD_FAILURE),
+    failed(f), inst(i), epoch(e) {}
+
+  virtual char *get_type_name() { return "osdfail"; }
+
+  msg_addr_t get_failed() { return failed; }
+  entity_inst_t& get_inst() { return inst; }
+  epoch_t get_epoch() { return epoch; }
+
+  // Appends failed, inst, epoch to payload as raw bytes.
+  void encode_payload() {
+    payload.append((char*)&failed, sizeof(failed));
+    payload.append((char*)&inst, sizeof(inst));
+    payload.append((char*)&epoch, sizeof(epoch));
+  }
+
+  // Reads the three fields back in the order encode_payload() wrote them.
+  void decode_payload() {
+    const int inst_pos = sizeof(failed);
+    const int epoch_pos = inst_pos + sizeof(inst);
+    payload.copy(0, sizeof(failed), (char*)&failed);
+    payload.copy(inst_pos, sizeof(inst), (char*)&inst);
+    payload.copy(epoch_pos, sizeof(epoch), (char*)&epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MOSDGETMAP_H
+#define __MOSDGETMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+// Message tagged MSG_OSD_GETMAP carrying a single epoch, `since`.
+class MOSDGetMap : public Message {
+ public:
+  epoch_t since;
+
+  // The defaulted argument lets this constructor double as the default
+  // constructor; in that case since is 0.
+  MOSDGetMap(epoch_t s=0) :
+    Message(MSG_OSD_GETMAP),
+    since(s) {
+  }
+
+  char *get_type_name() { return "getomap"; }
+
+  epoch_t get_since() { return since; }
+
+  // Reads since from the first sizeof(since) bytes of payload.
+  void decode_payload() {
+    payload.copy(0, sizeof(since), (char*)&since);
+  }
+
+  // Appends the raw bytes of since to payload.
+  void encode_payload() {
+    payload.append((char*)&since, sizeof(since));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MOSDIN_H
+#define __MOSDIN_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_IN whose payload is a single epoch, map_epoch.
+class MOSDIn : public Message {
+ public:
+  epoch_t map_epoch;
+
+  // Assigns no field.
+  MOSDIn() {}
+
+  // Stores e as map_epoch.
+  MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) {
+  }
+
+  virtual char *get_type_name() { return "oin"; }
+
+  // Appends the raw bytes of map_epoch to payload.
+  virtual void encode_payload() {
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+  }
+
+  // Reads map_epoch from the first sizeof(map_epoch) bytes of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(map_epoch), (char*)&map_epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDGETMAPACK_H
+#define __MOSDGETMAPACK_H
+
+#include "msg/Message.h"
+#include "osd/OSDMap.h"
+
+
+// Message tagged MSG_OSD_MAP holding two epoch-keyed collections of encoded
+// buffers: `maps` and `incremental_maps`.
+class MOSDMap : public Message {
+ public:
+  map<epoch_t, bufferlist> maps;
+  map<epoch_t, bufferlist> incremental_maps;
+
+  // Smallest key present in either collection, or 0 when both are empty.
+  // A result of 0 from `maps` is treated as "nothing yet", so any key in
+  // incremental_maps replaces it.
+  epoch_t get_first() {
+    epoch_t lowest = maps.empty() ? 0 : maps.begin()->first;
+    if (!incremental_maps.empty()) {
+      epoch_t inc = incremental_maps.begin()->first;
+      if (lowest == 0 || inc < lowest)
+        lowest = inc;
+    }
+    return lowest;
+  }
+
+  // Largest key present in either collection, or 0 when both are empty.
+  epoch_t get_last() {
+    epoch_t highest = maps.empty() ? 0 : maps.rbegin()->first;
+    if (!incremental_maps.empty()) {
+      epoch_t inc = incremental_maps.rbegin()->first;
+      if (highest == 0 || inc > highest)
+        highest = inc;
+    }
+    return highest;
+  }
+
+
+  // Starts with both collections empty.
+  MOSDMap() :
+    Message(MSG_OSD_MAP) {}
+
+  // Encodes *oc into maps, keyed by oc->get_epoch().
+  MOSDMap(OSDMap *oc) :
+    Message(MSG_OSD_MAP) {
+    bufferlist& slot = maps[oc->get_epoch()];
+    oc->encode(slot);
+  }
+
+
+  // marshalling: maps first, incremental_maps second, both directions.
+  virtual void encode_payload() {
+    ::_encode(maps, payload);
+    ::_encode(incremental_maps, payload);
+  }
+  virtual void decode_payload() {
+    int pos = 0;
+    ::_decode(maps, payload, pos);
+    ::_decode(incremental_maps, payload, pos);
+  }
+
+  virtual char *get_type_name() { return "omap"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDOP_H
+#define __MOSDOP_H
+
+#include "msg/Message.h"
+
+/*
+ * OSD op
+ *
+ * oid - object id
+ * op - OSD_OP_DELETE, etc.
+ *
+ */
+
+// Numeric op codes.  MOSDOp stores one of these in MOSDOp_st::op, and
+// MOSDOp::get_opname() maps each active code to a printable name.
+// OSD_OP_MKFS is disabled: its value 20 is now used by OSD_OP_WRLOCK.
+//#define OSD_OP_MKFS 20
+
+// client ops
+#define OSD_OP_READ 1
+#define OSD_OP_STAT 2
+
+// codes 10-14
+#define OSD_OP_WRNOOP 10
+#define OSD_OP_WRITE 11
+#define OSD_OP_DELETE 12
+#define OSD_OP_TRUNCATE 13
+#define OSD_OP_ZERO 14
+
+// lock-related codes 20-25
+#define OSD_OP_WRLOCK 20
+#define OSD_OP_WRUNLOCK 21
+#define OSD_OP_RDLOCK 22
+#define OSD_OP_RDUNLOCK 23
+#define OSD_OP_UPLOCK 24
+#define OSD_OP_DNLOCK 25
+
+// codes 30-31
+#define OSD_OP_PULL 30
+#define OSD_OP_PUSH 31
+
+
+// Fixed-size header of an MOSDOp.  MOSDOp copies this struct to and from the
+// payload as raw bytes, so the field order and types below define the wire
+// layout and must not be rearranged.
+struct MOSDOp_st {
+  long pcid;                 // procedure call id, matches request and reply
+
+  // who's asking?
+  tid_t tid;
+  msg_addr_t client;
+  entity_inst_t client_inst;
+
+  // for replication
+  tid_t rep_tid;
+
+  object_t oid;
+  objectrev_t rev;
+  pg_t pg;
+
+  epoch_t map_epoch;
+
+  eversion_t pg_trim_to;     // primary->replica: trim to here
+
+  int op;                    // one of the OSD_OP_* codes
+  size_t length;
+  size_t offset;
+  eversion_t version;
+  eversion_t old_version;
+
+  bool want_ack;
+  bool want_commit;
+};
+
+// Message tagged MSG_OSD_OP.
+//
+// It consists of a fixed header (MOSDOp_st), an attribute set and a data
+// buffer.  On the wire the header is written as raw bytes, followed by the
+// encoded attrset and the encoded data.
+class MOSDOp : public Message {
+public:
+  // Returns a printable name for an OSD_OP_* code.
+  //
+  // An unknown code is a programming error and trips the assert.  When
+  // asserts are compiled out the function returns "???" rather than a null
+  // pointer, because callers such as operator<< stream the result directly
+  // and streaming a null const char* is undefined behaviour.
+  static const char* get_opname(int op) {
+    switch (op) {
+    case OSD_OP_READ: return "read";
+    case OSD_OP_STAT: return "stat";
+
+    case OSD_OP_WRNOOP: return "wrnoop";
+    case OSD_OP_WRITE: return "write";
+    case OSD_OP_ZERO: return "zero";
+    case OSD_OP_DELETE: return "delete";
+    case OSD_OP_TRUNCATE: return "truncate";
+    case OSD_OP_WRLOCK: return "wrlock";
+    case OSD_OP_WRUNLOCK: return "wrunlock";
+    case OSD_OP_RDLOCK: return "rdlock";
+    case OSD_OP_RDUNLOCK: return "rdunlock";
+    case OSD_OP_UPLOCK: return "uplock";
+    case OSD_OP_DNLOCK: return "dnlock";
+
+    case OSD_OP_PULL: return "pull";
+    case OSD_OP_PUSH: return "push";
+    default: break;
+    }
+    assert(0);
+    return "???";
+  }
+
+private:
+  MOSDOp_st st;                    // fixed header, serialized raw
+  bufferlist data;
+  map<string,bufferptr> attrset;
+
+  // MOSDOpReply copies fields straight out of st.
+  friend class MOSDOpReply;
+
+ public:
+  // --- header accessors ---
+  const tid_t get_tid() { return st.tid; }
+  const msg_addr_t& get_client() { return st.client; }
+  const entity_inst_t& get_client_inst() { return st.client_inst; }
+  void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+
+  const tid_t get_rep_tid() { return st.rep_tid; }
+  void set_rep_tid(tid_t t) { st.rep_tid = t; }
+
+  const object_t get_oid() { return st.oid; }
+  const pg_t get_pg() { return st.pg; }
+  const epoch_t get_map_epoch() { return st.map_epoch; }
+
+  //const int get_pg_role() { return st.pg_role; } // who am i asking for?
+  const eversion_t get_version() { return st.version; }
+  //const eversion_t get_old_version() { return st.old_version; }
+
+  void set_rev(objectrev_t r) { st.rev = r; }
+  objectrev_t get_rev() { return st.rev; }
+
+  const eversion_t get_pg_trim_to() { return st.pg_trim_to; }
+  void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; }
+
+  const int get_op() { return st.op; }
+  void set_op(int o) { st.op = o; }
+
+  const size_t get_length() { return st.length; }
+  const size_t get_offset() { return st.offset; }
+
+  map<string,bufferptr>& get_attrset() { return attrset; }
+  void set_attrset(map<string,bufferptr> &as) { attrset = as; }
+
+  const bool wants_ack() { return st.want_ack; }
+  const bool wants_commit() { return st.want_commit; }
+
+
+  // --- data payload ---
+  // Hands d to data via bufferlist::claim().
+  void set_data(bufferlist &d) {
+    data.claim(d);
+  }
+  bufferlist& get_data() {
+    return data;
+  }
+  size_t get_data_len() { return data.length(); }
+
+
+  // keep a pcid (procedure call id) to match up request+reply
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+  // Builds a request.  The header is zeroed first, so every field not named
+  // here starts at 0; want_ack and want_commit default to true.
+  MOSDOp(long tid, msg_addr_t asker,
+         object_t oid, pg_t pg, epoch_t mapepoch, int op) :
+    Message(MSG_OSD_OP) {
+    memset(&st, 0, sizeof(st));
+    this->st.client = asker;
+    this->st.tid = tid;
+    this->st.rep_tid = 0;
+
+    this->st.oid = oid;
+    this->st.pg = pg;
+    this->st.map_epoch = mapepoch;
+    this->st.op = op;
+
+    this->st.want_ack = true;
+    this->st.want_commit = true;
+  }
+
+  // Does not touch st.
+  MOSDOp() {}
+
+  //void set_pg_role(int r) { st.pg_role = r; }
+  //void set_rg_nrep(int n) { st.rg_nrep = n; }
+
+  void set_length(size_t l) { st.length = l; }
+  void set_offset(size_t o) { st.offset = o; }
+  void set_version(eversion_t v) { st.version = v; }
+  void set_old_version(eversion_t ov) { st.old_version = ov; }
+
+  void set_want_ack(bool b) { st.want_ack = b; }
+  void set_want_commit(bool b) { st.want_commit = b; }
+
+  // --- marshalling ---
+  // Reads the raw header, then the attrset, then the data.
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    ::_decode(attrset, payload, off);
+    ::_decode(data, payload, off);
+  }
+  // Writes the raw header, then the attrset, then the data.
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    ::_encode(attrset, payload);
+    ::_encode(data, payload);
+  }
+
+  virtual char *get_type_name() { return "oop"; }
+};
+
+// Writes a one-line description of op to out: client, tid, op name, oid
+// (printed in hex) and the address of the message object.
+inline ostream& operator<<(ostream& out, MOSDOp& op)
+{
+  out << "MOSDOp(" << op.get_client() << "." << op.get_tid();
+  out << " op " << MOSDOp::get_opname(op.get_op());
+  out << " oid " << hex << op.get_oid() << dec;
+  out << " " << &op << ")";
+  return out;
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDOPREPLY_H
+#define __MOSDOPREPLY_H
+
+#include "msg/Message.h"
+
+#include "MOSDOp.h"
+#include "osd/ObjectStore.h"
+
+/*
+ * OSD op reply
+ *
+ * oid - object id
+ * op - OSD_OP_DELETE, etc.
+ *
+ */
+
+
+// Fixed-size header of an MOSDOpReply.  It is copied to and from the payload
+// as raw bytes, so field order and types define the wire layout.
+struct MOSDOpReply_st {
+  // req: values copied from the request
+  long pcid;
+  tid_t tid;
+  tid_t rep_tid;
+
+  object_t oid;
+  pg_t pg;
+
+  int op;
+
+  // reply
+  int result;
+  bool commit;
+  size_t length;
+  size_t offset;
+  size_t object_size;
+  eversion_t version;
+
+  eversion_t pg_complete_thru;
+
+  epoch_t map_epoch;
+};
+
+
+// Message tagged MSG_OSD_OPREPLY: a fixed header (MOSDOpReply_st), an
+// attribute set and a data buffer, serialized in that order.
+class MOSDOpReply : public Message {
+  MOSDOpReply_st st;
+  bufferlist data;
+  map<string,bufferptr> attrset;
+
+ public:
+  // --- values copied from the request ---
+  long get_tid() { return st.tid; }
+  long get_rep_tid() { return st.rep_tid; }
+  object_t get_oid() { return st.oid; }
+  pg_t get_pg() { return st.pg; }
+  int get_op() { return st.op; }
+
+  void set_op(int op) { st.op = op; }
+  void set_tid(tid_t t) { st.tid = t; }
+  void set_rep_tid(tid_t t) { st.rep_tid = t; }
+
+  // --- reply values ---
+  bool get_commit() { return st.commit; }
+  int get_result() { return st.result; }
+  size_t get_length() { return st.length; }
+  size_t get_offset() { return st.offset; }
+  size_t get_object_size() { return st.object_size; }
+  eversion_t get_version() { return st.version; }
+  map<string,bufferptr>& get_attrset() { return attrset; }
+
+  void set_result(int r) { st.result = r; }
+  void set_length(size_t s) { st.length = s; }
+  void set_offset(size_t o) { st.offset = o; }
+  void set_object_size(size_t s) { st.object_size = s; }
+  void set_version(eversion_t v) { st.version = v; }
+  void set_attrset(map<string,bufferptr> &as) { attrset = as; }
+
+  eversion_t get_pg_complete_thru() { return st.pg_complete_thru; }
+  void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; }
+
+  // data payload; set_data() hands d over via bufferlist::claim()
+  void set_data(bufferlist &d) {
+    data.claim(d);
+  }
+  bufferlist& get_data() {
+    return data;
+  }
+
+  // osdmap
+  epoch_t get_map_epoch() { return st.map_epoch; }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+public:
+  // Builds a reply to *req.  The header is zeroed, then the identifying
+  // fields of the request are copied over and result, commit and the map
+  // epoch e are filled in.
+  MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) :
+    Message(MSG_OSD_OPREPLY) {
+    memset(&st, 0, sizeof(st));
+
+    const MOSDOp_st& rq = req->st;
+    st.pcid = rq.pcid;
+
+    st.op = rq.op;
+    st.tid = rq.tid;
+    st.rep_tid = rq.rep_tid;
+
+    st.oid = rq.oid;
+    st.pg = rq.pg;
+    st.result = result;
+    st.commit = commit;
+
+    // speculative... OSD should ensure these are correct
+    st.length = rq.length;
+    st.offset = rq.offset;
+    st.version = rq.version;
+
+    st.map_epoch = e;
+  }
+
+  // Does not touch st.
+  MOSDOpReply() {}
+
+
+  // --- marshalling ---
+  // Copies the raw header out of payload and removes those bytes from it,
+  // then decodes attrset and data from what is left, starting at offset 0.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(st), (char*)&st);
+    payload.splice(0, sizeof(st));
+    int pos = 0;
+    ::_decode(attrset, payload, pos);
+    ::_decode(data, payload, pos);
+  }
+  // Writes the raw header, then the attrset, then the data.
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    ::_encode(attrset, payload);
+    ::_encode(data, payload);
+  }
+
+  virtual char *get_type_name() { return "oopr"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MOSDOUT_H
+#define __MOSDOUT_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_OUT whose payload is a single epoch, map_epoch.
+class MOSDOut : public Message {
+ public:
+  epoch_t map_epoch;
+
+  // Assigns no field.
+  MOSDOut() {}
+
+  // Stores e as map_epoch.
+  MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) {
+  }
+
+  virtual char *get_type_name() { return "oout"; }
+
+  // Appends the raw bytes of map_epoch to payload.
+  virtual void encode_payload() {
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+  }
+
+  // Reads map_epoch from the first sizeof(map_epoch) bytes of payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(map_epoch), (char*)&map_epoch);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGLOG_H
+#define __MOSDPGLOG_H
+
+#include "msg/Message.h"
+
+// Message tagged MSG_OSD_PG_LOG.  It carries an epoch, a pg id, and three PG
+// structures: info (sent as raw bytes), log and missing (sent via their own
+// _encode/_decode).
+class MOSDPGLog : public Message {
+  epoch_t epoch;
+  pg_t pgid;
+
+public:
+  PG::Info info;
+  PG::Log log;
+  PG::Missing missing;
+
+  // Leaves epoch and pgid unassigned.
+  MOSDPGLog() {}
+
+  // mv is stored as epoch, pgid as pgid; info, log and missing are left
+  // default-constructed for the caller to fill in.
+  MOSDPGLog(version_t mv, pg_t pgid) :
+    Message(MSG_OSD_PG_LOG),
+    epoch(mv), pgid(pgid) {
+  }
+
+  char *get_type_name() { return "PGlog"; }
+
+  epoch_t get_epoch() { return epoch; }
+  pg_t get_pgid() { return pgid; }
+
+  // Reads epoch, pgid and info as raw bytes, then lets log and missing
+  // decode themselves from the running offset.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(epoch), (char*)&epoch);
+    pos += sizeof(epoch);
+    payload.copy(pos, sizeof(pgid), (char*)&pgid);
+    pos += sizeof(pgid);
+    payload.copy(pos, sizeof(info), (char*)&info);
+    pos += sizeof(info);
+    log._decode(payload, pos);
+    missing._decode(payload, pos);
+  }
+
+  // Writes the same five items in the same order.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&info, sizeof(info));
+    log._encode(payload);
+    missing._encode(payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MOSDPGPEERNOTIFY_H
+#define __MOSDPGPEERNOTIFY_H
+
+#include "msg/Message.h"
+
+#include "osd/PG.h"
+
+/*
+ * PGNotify - notify primary of my PGs and versions.
+ */
+
+// Message tagged MSG_OSD_PG_NOTIFY: an epoch plus a list of PG::Info.
+class MOSDPGNotify : public Message {
+  epoch_t epoch;
+  list<PG::Info> pg_list;
+
+ public:
+  // Leaves epoch unassigned and pg_list empty.
+  MOSDPGNotify() {}
+
+  // Stores e and moves every element of l into pg_list by splicing,
+  // which leaves l empty.
+  MOSDPGNotify(epoch_t e, list<PG::Info>& l) :
+    Message(MSG_OSD_PG_NOTIFY),
+    epoch(e) {
+    pg_list.splice(pg_list.begin(), l);
+  }
+
+  char *get_type_name() { return "PGnot"; }
+
+  version_t get_epoch() { return epoch; }
+  list<PG::Info>& get_pg_list() { return pg_list; }
+
+  // Reads the raw epoch, then decodes pg_list from just after it.
+  void decode_payload() {
+    payload.copy(0, sizeof(epoch), (char*)&epoch);
+    int pos = sizeof(epoch);
+    _decode(pg_list, payload, pos);
+  }
+
+  // Writes the raw epoch, then the encoded pg_list.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    _encode(pg_list, payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGPEER_H
+#define __MOSDPGPEER_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_PG_PEER: a map version, a `complete` flag and a
+// list of pg ids.
+class MOSDPGPeer : public Message {
+  __uint64_t map_version;
+  list<pg_t> pg_list;
+
+  bool complete;
+
+ public:
+  // Leaves map_version and complete unassigned.
+  MOSDPGPeer() {}
+
+  // Stores v and c, and moves every element of l into pg_list by splicing,
+  // which leaves l empty.
+  MOSDPGPeer(__uint64_t v, list<pg_t>& l, bool c=false) :
+    Message(MSG_OSD_PG_PEER),
+    map_version(v), complete(c) {
+    pg_list.splice(pg_list.begin(), l);
+  }
+
+  char *get_type_name() { return "PGPeer"; }
+
+  __uint64_t get_version() { return map_version; }
+  list<pg_t>& get_pg_list() { return pg_list; }
+  bool get_complete() { return complete; }
+
+  // Reads raw map_version and complete, then decodes pg_list.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(map_version), (char*)&map_version);
+    pos += sizeof(map_version);
+    payload.copy(pos, sizeof(complete), (char*)&complete);
+    pos += sizeof(complete);
+    _decode(pg_list, payload, pos);
+  }
+
+  // Writes raw map_version and complete, then the encoded pg_list.
+  void encode_payload() {
+    payload.append((char*)&map_version, sizeof(map_version));
+    payload.append((char*)&complete, sizeof(complete));
+    _encode(pg_list, payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGPEERACK_H
+#define __MOSDPGPEERACK_H
+
+#include "msg/Message.h"
+#include "osd/OSD.h"
+
+// Message tagged MSG_OSD_PG_PEERACK.  It carries a map version, a list of
+// pg ids (pg_dne) and a per-pg PGReplicaInfo map (pg_state).
+class MOSDPGPeerAck : public Message {
+  __uint64_t map_version;
+
+ public:
+  list<pg_t> pg_dne;                    // pg dne
+  map<pg_t, PGReplicaInfo > pg_state;   // state, lists, etc.
+
+  __uint64_t get_version() { return map_version; }
+
+  // Leaves map_version unassigned.
+  MOSDPGPeerAck() {}
+
+  // Stores v as map_version; pg_dne and pg_state start empty.
+  MOSDPGPeerAck(__uint64_t v) :
+    Message(MSG_OSD_PG_PEERACK),
+    map_version(v) {
+  }
+
+  // This used to return "PGPeer", the same string MOSDPGPeer reports, so the
+  // two message types could not be told apart by name.
+  char *get_type_name() { return "PGPeerAck"; }
+
+  // Layout: raw map_version, encoded pg_dne, an int entry count, then for
+  // each pg_state entry the raw pg id followed by the entry's own encoding.
+  void encode_payload() {
+    payload.append((char*)&map_version, sizeof(map_version));
+    _encode(pg_dne, payload);
+
+    int n = pg_state.size();
+    payload.append((char*)&n, sizeof(n));
+    for (map<pg_t, PGReplicaInfo >::iterator it = pg_state.begin();
+         it != pg_state.end();
+         ++it) {
+      payload.append((char*)&it->first, sizeof(it->first));
+      it->second._encode(payload);
+    }
+  }
+
+  // Mirror of encode_payload().  Entries are decoded straight into
+  // pg_state[pgid]; existing entries with other keys are left in place.
+  void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(map_version), (char*)&map_version);
+    off += sizeof(map_version);
+    _decode(pg_dne, payload, off);
+
+    int n;
+    payload.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    for (int i=0; i<n; i++) {
+      pg_t pgid;
+      payload.copy(off, sizeof(pgid), (char*)&pgid);
+      off += sizeof(pgid);
+      pg_state[pgid]._decode(payload, off);
+    }
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPEERREQUEST_H
+#define __MOSDPEERREQUEST_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_PG_PEERREQUEST: a map version plus a list of
+// repgroup_t.
+class MOSDPGPeerRequest : public Message {
+  __uint64_t map_version;
+  list<repgroup_t> pg_list;
+
+ public:
+  // Leaves map_version unassigned.
+  MOSDPGPeerRequest() {}
+
+  // Stores v and moves every element of l into pg_list by splicing, which
+  // leaves l empty.
+  MOSDPGPeerRequest(__uint64_t v, list<repgroup_t>& l) :
+    Message(MSG_OSD_PG_PEERREQUEST),
+    map_version(v) {
+    pg_list.splice(pg_list.begin(), l);
+  }
+
+  char *get_type_name() { return "PGPR"; }
+
+  __uint64_t get_version() { return map_version; }
+  list<repgroup_t>& get_pg_list() { return pg_list; }
+
+  // Reads the raw map_version, then decodes pg_list from just after it.
+  void decode_payload() {
+    payload.copy(0, sizeof(map_version), (char*)&map_version);
+    int pos = sizeof(map_version);
+    _decode(pg_list, payload, pos);
+  }
+
+  // Writes the raw map_version, then the encoded pg_list.
+  void encode_payload() {
+    payload.append((char*)&map_version, sizeof(map_version));
+    _encode(pg_list, payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGQUERY_H
+#define __MOSDPGQUERY_H
+
+#include "msg/Message.h"
+
+/*
+ * PGQuery - query another OSD as to the contents of their PGs
+ */
+
+// Message tagged MSG_OSD_PG_QUERY: an epoch plus a map from pg id to
+// PG::Query.
+class MOSDPGQuery : public Message {
+  version_t epoch;
+
+ public:
+  map<pg_t,PG::Query> pg_list;
+
+  // Leaves epoch unassigned and pg_list empty.
+  MOSDPGQuery() {}
+
+  // Stores e and a copy of ls.
+  MOSDPGQuery(epoch_t e, map<pg_t,PG::Query>& ls) :
+    Message(MSG_OSD_PG_QUERY),
+    epoch(e), pg_list(ls) {
+  }
+
+  char *get_type_name() { return "PGq"; }
+
+  version_t get_epoch() { return epoch; }
+
+  // Reads the raw epoch, then decodes pg_list from just after it.
+  void decode_payload() {
+    payload.copy(0, sizeof(epoch), (char*)&epoch);
+    int pos = sizeof(epoch);
+    ::_decode(pg_list, payload, pos);
+  }
+
+  // Writes the raw epoch, then the encoded pg_list.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    ::_encode(pg_list, payload);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGREMOVE_H
+#define __MOSDPGREMOVE_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_PG_REMOVE: an epoch plus a set of pg ids.
+class MOSDPGRemove : public Message {
+  epoch_t epoch;
+
+ public:
+  set<pg_t> pg_list;
+
+  // Leaves epoch unassigned and pg_list empty.
+  MOSDPGRemove() {}
+
+  // Stores e and a copy of l (unlike the list-based messages, l is not
+  // emptied).
+  MOSDPGRemove(epoch_t e, set<pg_t>& l) :
+    Message(MSG_OSD_PG_REMOVE),
+    epoch(e), pg_list(l) {
+  }
+
+  char *get_type_name() { return "PGrm"; }
+
+  epoch_t get_epoch() { return epoch; }
+
+  // Reads the raw epoch, then decodes pg_list from just after it.
+  void decode_payload() {
+    payload.copy(0, sizeof(epoch), (char*)&epoch);
+    int pos = sizeof(epoch);
+    _decode(pg_list, payload, pos);
+  }
+
+  // Writes the raw epoch, then the encoded pg_list.
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    _encode(pg_list, payload);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGQUERYREPLY_H
+#define __MOSDPGQUERYREPLY_H
+
+#include "msg/Message.h"
+
+// Message tagged MSG_OSD_PG_SUMMARY.  It carries an epoch, a pg id, a
+// PG::PGInfo (sent raw) and an already-encoded summary blob (sumbl) that
+// makes up the rest of the payload.
+class MOSDPGSummary : public Message {
+  epoch_t epoch;
+  pg_t pgid;
+
+public:
+  PG::PGInfo info;
+  bufferlist sumbl;
+
+  // Leaves epoch and pgid unassigned.
+  MOSDPGSummary() {}
+
+  // Stores mv as epoch and pgid as pgid, and encodes summary into sumbl.
+  MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) :
+    Message(MSG_OSD_PG_SUMMARY),
+    epoch(mv), pgid(pgid) {
+    summary._encode(sumbl);
+  }
+
+  char *get_type_name() { return "PGsum"; }
+
+  epoch_t get_epoch() { return epoch; }
+  pg_t get_pgid() { return pgid; }
+  bufferlist& get_summary_bl() {
+    return sumbl;
+  }
+
+  // Reads the three raw fields, strips them from the front of payload, and
+  // hands whatever remains to sumbl.
+  void decode_payload() {
+    int consumed = 0;
+    payload.copy(consumed, sizeof(epoch), (char*)&epoch);
+    consumed += sizeof(epoch);
+    payload.copy(consumed, sizeof(pgid), (char*)&pgid);
+    consumed += sizeof(pgid);
+    payload.copy(consumed, sizeof(info), (char*)&info);
+    consumed += sizeof(info);
+
+    payload.splice(0, consumed);
+    sumbl.claim(payload);
+  }
+
+  // Writes the three raw fields, then appends sumbl via claim_append().
+  void encode_payload() {
+    payload.append((char*)&epoch, sizeof(epoch));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&info, sizeof(info));
+    payload.claim_append(sumbl);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MOSDPGUPDATE_H
+#define __MOSDPGUPDATE_H
+
+#include "msg/Message.h"
+
+// Message tagged MSG_OSD_PG_UPDATE with four raw-serialized fields:
+// map_version, pgid, complete, last_any_complete (in that order).
+class MOSDPGUpdate : public Message {
+  version_t map_version;
+  pg_t pgid;
+  //pginfo_t info;
+  bool complete;
+  version_t last_any_complete;
+
+ public:
+  // Assigns none of the fields.
+  MOSDPGUpdate() {}
+
+  // Each argument is stored in the member of the matching name; mv goes to
+  // map_version.
+  MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) :
+    Message(MSG_OSD_PG_UPDATE),
+    map_version(mv),
+    pgid(pgid),
+    complete(complete),
+    last_any_complete(last_any_complete) {
+  }
+
+  char *get_type_name() { return "PGUp"; }
+
+  version_t get_version() { return map_version; }
+  pg_t get_pgid() { return pgid; }
+  //pginfo_t& get_pginfo() { return info; }
+  bool is_complete() { return complete; }
+  version_t get_last_any_complete() { return last_any_complete; }
+
+  // Reads the four fields back from consecutive offsets.
+  void decode_payload() {
+    const int pgid_pos = sizeof(map_version);
+    const int complete_pos = pgid_pos + sizeof(pgid);
+    const int lac_pos = complete_pos + sizeof(complete);
+    payload.copy(0, sizeof(map_version), (char*)&map_version);
+    payload.copy(pgid_pos, sizeof(pgid), (char*)&pgid);
+    payload.copy(complete_pos, sizeof(complete), (char*)&complete);
+    payload.copy(lac_pos, sizeof(last_any_complete), (char*)&last_any_complete);
+  }
+
+  // Appends the four fields as raw bytes.
+  void encode_payload() {
+    payload.append((char*)&map_version, sizeof(map_version));
+    payload.append((char*)&pgid, sizeof(pgid));
+    payload.append((char*)&complete, sizeof(complete));
+    payload.append((char*)&last_any_complete, sizeof(last_any_complete));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MOSDPING_H
+#define __MOSDPING_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_OSD_PING with three raw-serialized fields: map_epoch,
+// ack, avg_qlen (in that order).
+class MOSDPing : public Message {
+ public:
+  epoch_t map_epoch;
+  bool ack;
+  float avg_qlen;
+
+  // Assigns none of the fields.
+  MOSDPing() {}
+
+  // e -> map_epoch, aq -> avg_qlen, a -> ack (false unless given).
+  MOSDPing(epoch_t e,
+           float aq,
+           bool a=false) :
+    Message(MSG_OSD_PING),
+    map_epoch(e), ack(a), avg_qlen(aq) {
+  }
+
+  virtual char *get_type_name() { return "oping"; }
+
+  // Appends the three fields as raw bytes.
+  virtual void encode_payload() {
+    payload.append((char*)&map_epoch, sizeof(map_epoch));
+    payload.append((char*)&ack, sizeof(ack));
+    payload.append((char*)&avg_qlen, sizeof(avg_qlen));
+  }
+
+  // Reads the three fields back from consecutive offsets.
+  virtual void decode_payload() {
+    const int ack_pos = sizeof(map_epoch);
+    const int qlen_pos = ack_pos + sizeof(ack);
+    payload.copy(0, sizeof(map_epoch), (char*)&map_epoch);
+    payload.copy(ack_pos, sizeof(ack), (char*)&ack);
+    payload.copy(qlen_pos, sizeof(avg_qlen), (char*)&avg_qlen);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MPING_H
+#define __MPING_H
+
+#include "msg/Message.h"
+
+
+// Message tagged MSG_PING carrying a single int sequence number.
+//
+// NOTE(review): the encode/decode methods here take a crope argument, while
+// the OSD message headers alongside this one use argument-less
+// encode_payload()/decode_payload() on `payload`.  Confirm these overloads
+// still match the Message interface.
+class MPing : public Message {
+ public:
+  int seq;
+
+  // Stores s as seq.
+  MPing(int s) : Message(MSG_PING) {
+    seq = s;
+  }
+  // Leaves seq unassigned.
+  MPing() : Message(MSG_PING) {}
+
+  // Reads seq from s at position off and advances off past it.
+  // The read previously always started at position 0 even though off was
+  // still advanced, so any caller with a non-zero off got the wrong bytes.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(seq), (char*)&seq);
+    off += sizeof(seq);
+  }
+  // Appends the raw bytes of seq to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&seq, sizeof(seq));
+  }
+
+  virtual char *get_type_name() { return "ping"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MPINGACK_H
+#define __MPINGACK_H
+
+#include "MPing.h"
+
+
+/**
+ * MPingAck - acknowledgement (MSG_PING_ACK) for an MPing; echoes its
+ * sequence number.
+ *
+ * The payload is the raw bytes of seq.
+ */
+class MPingAck : public Message {
+ public:
+  int seq;   // sequence number copied from the MPing being acknowledged
+
+  MPingAck() {}
+  MPingAck(MPing *p) : Message(MSG_PING_ACK), seq(p->seq) {}
+
+  /**
+   * Read seq from rope s starting at off, and advance off past it.
+   *
+   * The read starts at the caller's offset (not at 0), matching the other
+   * crope-based decoders and the way off is advanced afterwards.
+   */
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(seq), (char*)&seq);
+    off += sizeof(seq);
+  }
+
+  // Append the raw bytes of seq to rope s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&seq, sizeof(seq));
+  }
+
+  virtual char *get_type_name() { return "pinga"; }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAME_H
+#define __MRENAME_H
+
+/**
+ * MRename - rename message of type MSG_MDS_RENAME.
+ *
+ * Carries the initiator, the source and destination directory inode
+ * numbers and names, and an inode_state buffer.
+ *
+ * Payload layout, in order: initiator, srcdirino, destdirino (raw bytes),
+ * srcname, destname (via _encode), a size_t length, then the inode_state
+ * bytes.
+ */
+class MRename : public Message {
+  inodeno_t  srcdirino;
+  string     srcname;
+  inodeno_t  destdirino;
+  string     destname;
+  int        initiator;
+
+  bufferlist inode_state;
+
+ public:
+  MRename() {}
+
+  // The passed-in inode_state list is handed to bufferlist::claim().
+  MRename(int initiator,
+          inodeno_t srcdirino,
+          const string& srcname,
+          inodeno_t destdirino,
+          const string& destname,
+          bufferlist& inode_state) :
+    Message(MSG_MDS_RENAME),
+    srcdirino(srcdirino),
+    srcname(srcname),
+    destdirino(destdirino),
+    destname(destname),
+    initiator(initiator) {
+    this->inode_state.claim( inode_state );
+  }
+
+  // accessors
+  int         get_initiator()   { return initiator; }
+  inodeno_t   get_srcdirino()   { return srcdirino; }
+  string&     get_srcname()     { return srcname; }
+  inodeno_t   get_destdirino()  { return destdirino; }
+  string&     get_destname()    { return destname; }
+  bufferlist& get_inode_state() { return inode_state; }
+
+  virtual char *get_type_name() { return "Rn";}
+
+  // Unpack the fields in the same order encode_payload() wrote them.
+  virtual void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(initiator), (char*)&initiator);
+    pos += sizeof(initiator);
+    payload.copy(pos, sizeof(srcdirino), (char*)&srcdirino);
+    pos += sizeof(srcdirino);
+    payload.copy(pos, sizeof(destdirino), (char*)&destdirino);
+    pos += sizeof(destdirino);
+    _decode(srcname, payload, pos);
+    _decode(destname, payload, pos);
+
+    // length-prefixed inode state
+    size_t state_len;
+    payload.copy(pos, sizeof(state_len), (char*)&state_len);
+    pos += sizeof(state_len);
+    inode_state.substr_of(payload, pos, state_len);
+  }
+
+  // Pack the fields; inode_state is moved onto the payload via claim_append().
+  virtual void encode_payload() {
+    payload.append((char*)&initiator, sizeof(initiator));
+    payload.append((char*)&srcdirino, sizeof(srcdirino));
+    payload.append((char*)&destdirino, sizeof(destdirino));
+    _encode(srcname, payload);
+    _encode(destname, payload);
+
+    size_t state_len = inode_state.length();
+    payload.append((char*)&state_len, sizeof(state_len));
+    payload.claim_append(inode_state);
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMEACK_H
+#define __MRENAMEACK_H
+
+/* FIXME: relative to dn, not inode */
+
+/**
+ * MRenameAck - message of type MSG_MDS_RENAMEACK carrying a single inode
+ * number as raw bytes.
+ */
+class MRenameAck : public Message {
+  inodeno_t ino;
+
+ public:
+  MRenameAck() {}
+  MRenameAck(inodeno_t ino) : Message(MSG_MDS_RENAMEACK), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "RnAck";}
+
+  // Read ino from s at off and advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMENOTIFY_H
+#define __MRENAMENOTIFY_H
+
+/**
+ * MRenameNotify - message of type MSG_MDS_RENAMENOTIFY.
+ *
+ * Payload layout, in order: ino, srcdirino, destdirino (raw bytes),
+ * srcname, destname, destdirpath (via _rope), then srcauth (raw bytes).
+ */
+class MRenameNotify : public Message {
+  inodeno_t ino;
+  inodeno_t srcdirino;
+  string    srcname;
+  inodeno_t destdirino;
+  string    destname;
+  string    destdirpath;
+  int       srcauth;
+
+ public:
+  MRenameNotify() {}
+
+  // Note the argument order: destdirpath comes before destname.
+  MRenameNotify(inodeno_t ino,
+                inodeno_t srcdirino,
+                const string& srcname,
+                inodeno_t destdirino,
+                const string& destdirpath,
+                const string& destname,
+                int srcauth) :
+    Message(MSG_MDS_RENAMENOTIFY),
+    ino(ino),
+    srcdirino(srcdirino),
+    srcname(srcname),
+    destdirino(destdirino),
+    destname(destname),
+    destdirpath(destdirpath),
+    srcauth(srcauth) {}
+
+  // accessors
+  inodeno_t get_ino()         { return ino; }
+  inodeno_t get_srcdirino()   { return srcdirino; }
+  string&   get_srcname()     { return srcname; }
+  inodeno_t get_destdirino()  { return destdirino; }
+  string&   get_destname()    { return destname; }
+  string&   get_destdirpath() { return destdirpath; }
+  int       get_srcauth()     { return srcauth; }
+
+  virtual char *get_type_name() { return "Rnot";}
+
+  // Unpack from s starting at off; off is advanced past everything read.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+
+    _unrope(srcname, s, off);
+    _unrope(destname, s, off);
+    _unrope(destdirpath, s, off);
+
+    s.copy(off, sizeof(srcauth), (char*)&srcauth);
+    off += sizeof(srcauth);
+  }
+
+  // Pack all fields onto s in the order documented on the class.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&srcdirino, sizeof(srcdirino));
+    s.append((char*)&destdirino, sizeof(destdirino));
+
+    _rope(srcname, s);
+    _rope(destname, s);
+    _rope(destdirpath, s);
+
+    s.append((char*)&srcauth, sizeof(srcauth));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMENOTIFYACK_H
+#define __MRENAMENOTIFYACK_H
+
+/**
+ * MRenameNotifyAck - message of type MSG_MDS_RENAMENOTIFYACK carrying a
+ * single inode number as raw bytes.
+ */
+class MRenameNotifyAck : public Message {
+  inodeno_t ino;
+
+ public:
+  MRenameNotifyAck() {}
+  MRenameNotifyAck(inodeno_t ino) : Message(MSG_MDS_RENAMENOTIFYACK), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "RnotA";}
+
+  // Read ino from s at off and advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMEPREP_H
+#define __MRENAMEPREP_H
+
+/**
+ * MRenamePrep - message of type MSG_MDS_RENAMEPREP.
+ *
+ * Payload layout, in order: initiator, srcdirino, destdirino (raw bytes),
+ * srcname, srcpath, destname, destpath (via _rope), then srcauth (raw
+ * bytes).
+ */
+class MRenamePrep : public Message {
+  inodeno_t srcdirino;
+  string    srcname;
+  string    srcpath;
+  inodeno_t destdirino;
+  string    destname;
+  string    destpath;
+  int       initiator;
+  int       srcauth;
+
+ public:
+  MRenamePrep() {}
+  MRenamePrep(int initiator,
+              inodeno_t srcdirino,
+              const string& srcname,
+              const string& srcpath,
+              inodeno_t destdirino,
+              const string& destname,
+              const string& destpath,
+              int srcauth) :
+    Message(MSG_MDS_RENAMEPREP),
+    srcdirino(srcdirino),
+    srcname(srcname),
+    srcpath(srcpath),
+    destdirino(destdirino),
+    destname(destname),
+    destpath(destpath),
+    initiator(initiator),
+    srcauth(srcauth) {}
+
+  // accessors
+  int       get_initiator()  { return initiator; }
+  inodeno_t get_srcdirino()  { return srcdirino; }
+  string&   get_srcname()    { return srcname; }
+  string&   get_srcpath()    { return srcpath; }
+  int       get_srcauth()    { return srcauth; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string&   get_destname()   { return destname; }
+  string&   get_destpath()   { return destpath; }
+
+  virtual char *get_type_name() { return "RnP";}
+
+  // Unpack from s starting at off; off is advanced past everything read.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+
+    _unrope(srcname, s, off);
+    _unrope(srcpath, s, off);
+    _unrope(destname, s, off);
+    _unrope(destpath, s, off);
+
+    s.copy(off, sizeof(srcauth), (char*)&srcauth);
+    off += sizeof(srcauth);
+  }
+
+  // Pack all fields onto s in the order documented on the class.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&initiator, sizeof(initiator));
+    s.append((char*)&srcdirino, sizeof(srcdirino));
+    s.append((char*)&destdirino, sizeof(destdirino));
+
+    _rope(srcname, s);
+    _rope(srcpath, s);
+    _rope(destname, s);
+    _rope(destpath, s);
+
+    s.append((char*)&srcauth, sizeof(srcauth));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMEREQ_H
+#define __MRENAMEREQ_H
+
+/**
+ * MRenameReq - message of type MSG_MDS_RENAMEREQ.
+ *
+ * Payload layout, in order: initiator, srcdirino, destdirino (raw bytes),
+ * srcname, destname, destpath (via _rope), then destauth (raw bytes).
+ */
+class MRenameReq : public Message {
+  int       initiator;
+  inodeno_t srcdirino;
+  string    srcname;
+  inodeno_t destdirino;
+  string    destname;
+  string    destpath;
+  int       destauth;
+
+ public:
+  MRenameReq() {}
+  MRenameReq(int initiator,
+             inodeno_t srcdirino,
+             const string& srcname,
+             inodeno_t destdirino,
+             const string& destname,
+             const string& destpath,
+             int destauth) :
+    Message(MSG_MDS_RENAMEREQ),
+    initiator(initiator),
+    srcdirino(srcdirino),
+    srcname(srcname),
+    destdirino(destdirino),
+    destname(destname),
+    destpath(destpath),
+    destauth(destauth) {}
+
+  // accessors
+  int       get_initiator()  { return initiator; }
+  inodeno_t get_srcdirino()  { return srcdirino; }
+  string&   get_srcname()    { return srcname; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string&   get_destname()   { return destname; }
+  string&   get_destpath()   { return destpath; }
+  int       get_destauth()   { return destauth; }
+
+  virtual char *get_type_name() { return "RnReq";}
+
+  // Unpack from s starting at off; off is advanced past everything read.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+
+    _unrope(srcname, s, off);
+    _unrope(destname, s, off);
+    _unrope(destpath, s, off);
+
+    s.copy(off, sizeof(destauth), (char*)&destauth);
+    off += sizeof(destauth);
+  }
+
+  // Pack all fields onto s in the order documented on the class.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&initiator, sizeof(initiator));
+    s.append((char*)&srcdirino, sizeof(srcdirino));
+    s.append((char*)&destdirino, sizeof(destdirino));
+
+    _rope(srcname, s);
+    _rope(destname, s);
+    _rope(destpath, s);
+
+    s.append((char*)&destauth, sizeof(destauth));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MRENAMEWARNING_H
+#define __MRENAMEWARNING_H
+
+/**
+ * MRenameWarning - message of type MSG_MDS_RENAMEWARNING carrying a single
+ * inode number as raw bytes.
+ */
+class MRenameWarning : public Message {
+  inodeno_t ino;
+
+ public:
+  MRenameWarning() {}
+  MRenameWarning(inodeno_t ino) : Message(MSG_MDS_RENAMEWARNING), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "RnW";}
+
+  // Read ino from s at off and advance off past it.
+  virtual void decode_payload(crope& s, int& off) {
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+
+  // Append the raw bytes of ino to s.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIR_H
+#define __MUNHASHDIR_H
+
+#include "msg/Message.h"
+
+/**
+ * MUnhashDir - message of type MSG_MDS_UNHASHDIR; the payload is the raw
+ * bytes of a single inode number.
+ */
+class MUnhashDir : public Message {
+  inodeno_t ino;
+
+ public:
+  MUnhashDir() {}
+  MUnhashDir(inodeno_t ino) : Message(MSG_MDS_UNHASHDIR), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "UH"; }
+
+  // ino sits at the very start of the payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIRACK_H
+#define __MUNHASHDIRACK_H
+
+#include "msg/Message.h"
+
+/**
+ * MUnhashDirAck - message of type MSG_MDS_UNHASHDIRACK.
+ *
+ * Payload layout, in order: ino, nden (raw bytes), a size_t length, then
+ * the bytes of the state buffer.
+ */
+class MUnhashDirAck : public Message {
+  inodeno_t  ino;
+  bufferlist state;
+  int        nden;
+
+ public:
+  MUnhashDirAck() {}
+
+  // bl is handed to bufferlist::claim() to become this message's state.
+  MUnhashDirAck(inodeno_t ino, bufferlist& bl, int nden) :
+    Message(MSG_MDS_UNHASHDIRACK),
+    ino(ino),
+    nden(nden) {
+    state.claim(bl);
+  }
+
+  virtual char *get_type_name() { return "UHaA"; }
+
+  // accessors
+  inodeno_t   get_ino()       { return ino; }
+  bufferlist& get_state()     { return state; }
+  bufferlist* get_state_ptr() { return &state; }
+  int         get_nden()      { return nden; }
+
+  // (setters for nden are currently disabled)
+  //void set_nden(int n) { nden = n; }
+  //void inc_nden() { nden++; }
+
+  // Unpack ino, nden and the length-prefixed state from the payload.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);
+    pos += sizeof(ino);
+    payload.copy(pos, sizeof(nden), (char*)&nden);
+    pos += sizeof(nden);
+
+    size_t state_len;
+    payload.copy(pos, sizeof(state_len), (char*)&state_len);
+    pos += sizeof(state_len);
+    state.substr_of(payload, pos, state_len);
+  }
+
+  // Pack ino, nden, then the state length and the state itself
+  // (state is moved onto the payload via claim_append()).
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&nden, sizeof(nden));
+
+    size_t state_len = state.length();
+    payload.append((char*)&state_len, sizeof(state_len));
+    payload.claim_append(state);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIRNOTIFY_H
+#define __MUNHASHDIRNOTIFY_H
+
+#include "msg/Message.h"
+
+/**
+ * MUnhashDirNotify - message of type MSG_MDS_UNHASHDIRNOTIFY; the payload
+ * is the raw bytes of a single inode number.
+ *
+ * A "peer" field exists only as commented-out code and is not part of the
+ * message.
+ */
+class MUnhashDirNotify : public Message {
+  inodeno_t ino;
+  //int peer;
+
+ public:
+  MUnhashDirNotify() {}
+  MUnhashDirNotify(inodeno_t ino/*, int peer*/) :
+    Message(MSG_MDS_UNHASHDIRNOTIFY),
+    ino(ino) {
+    //this->peer = peer;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  //int get_peer() { return peer; }
+
+  virtual char *get_type_name() { return "UHN"; }
+
+  // ino is the only field and sits at the start of the payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+    //payload.copy(sizeof(ino), sizeof(peer), (char*)&peer);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+    //payload.append((char*)&peer, sizeof(peer));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIRNOTIFYACK_H
+#define __MUNHASHDIRNOTIFYACK_H
+
+#include "msg/Message.h"
+
+/**
+ * MUnhashDirNotifyAck - message of type MSG_MDS_UNHASHDIRNOTIFYACK; the
+ * payload is the raw bytes of a single inode number.
+ */
+class MUnhashDirNotifyAck : public Message {
+  inodeno_t ino;
+
+ public:
+  MUnhashDirNotifyAck() {}
+  MUnhashDirNotifyAck(inodeno_t ino) :
+    Message(MSG_MDS_UNHASHDIRNOTIFYACK),
+    ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "UHNa"; }
+
+  // ino sits at the very start of the payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIRPREP_H
+#define __MUNHASHDIRPREP_H
+
+#include "msg/Message.h"
+
+/**
+ * MUnhashDirPrep - message of type MSG_MDS_UNHASHDIRPREP; the payload is
+ * the raw bytes of a single inode number.
+ */
+class MUnhashDirPrep : public Message {
+  inodeno_t ino;
+
+ public:
+  MUnhashDirPrep() {}
+  MUnhashDirPrep(inodeno_t ino) : Message(MSG_MDS_UNHASHDIRPREP), ino(ino) {}
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "UHP"; }
+
+  // ino sits at the very start of the payload.
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MUNHASHDIRPREPACK_H
+#define __MUNHASHDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+/**
+ * MUnhashDirPrepAck - message of type MSG_MDS_UNHASHDIRPREPACK.
+ *
+ * Holds an inode number plus a map of dentry name -> CInodeDiscover*.
+ * The message deletes every CInodeDiscover in the map when destroyed.
+ *
+ * Payload layout, in order: ino (raw bytes), an int entry count, then for
+ * each entry the dentry name (via _encode) followed by the encoded
+ * CInodeDiscover.  The assim flag is not part of the payload.
+ */
+class MUnhashDirPrepAck : public Message {
+  inodeno_t ino;
+  bool      assim;
+
+  // subdir dentry names + inodes
+  map<string,CInodeDiscover*> inodes;
+
+ public:
+  MUnhashDirPrepAck() : assim(false) { }
+  MUnhashDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_UNHASHDIRPREPACK),
+    ino(ino),
+    assim(false) { }
+
+  // Free every CInodeDiscover held in the map.
+  ~MUnhashDirPrepAck() {
+    map<string,CInodeDiscover*>::iterator p = inodes.begin();
+    while (p != inodes.end()) {
+      delete p->second;
+      p++;
+    }
+  }
+
+  inodeno_t get_ino() { return ino; }
+  map<string,CInodeDiscover*>& get_inodes() { return inodes; }
+
+  // assim flag: may be set exactly once (asserted).
+  bool did_assim() { return assim; }
+  void mark_assim() {
+    assert(!assim);
+    assim = true;
+  }
+
+  virtual char *get_type_name() { return "HP"; }
+
+  // Store in under the given dentry name; the pointer is deleted by the
+  // destructor.
+  void add_inode(const string& dentry, CInodeDiscover *in) {
+    inodes[dentry] = in;
+  }
+
+  // Rebuild ino and the inode map from the payload.
+  void decode_payload() {
+    int pos = 0;
+    payload.copy(pos, sizeof(ino), (char*)&ino);
+    pos += sizeof(ino);
+
+    // entry count
+    int remaining;
+    payload.copy(pos, sizeof(int), (char*)&remaining);
+    pos += sizeof(int);
+
+    for (; remaining > 0; remaining--) {
+      // dentry name
+      string dname;
+      _decode(dname, payload, pos);
+
+      // inode
+      CInodeDiscover *discover = new CInodeDiscover;
+      discover->_decode(payload, pos);
+
+      inodes[dname] = discover;
+    }
+  }
+
+  // Write ino, the entry count, then each (dentry name, inode) pair.
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+
+    int count = inodes.size();
+    payload.append((char*)&count, sizeof(int));
+
+    map<string,CInodeDiscover*>::iterator p = inodes.begin();
+    while (p != inodes.end()) {
+      _encode(p->first, payload);     // dentry
+      p->second->_encode(payload);    // inode
+      p++;
+    }
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+
+
+/**
+ * Parse a dotted-quad "a.b.c.d:port" string into a tcpaddr_t.
+ *
+ * @param s        NUL-terminated string to parse
+ * @param tcpaddr  on success, sin_addr.s_addr receives the four octets in
+ *                 the order written and sin_port receives the port value
+ *                 exactly as parsed (no byte-order conversion is applied)
+ * @return true if s is exactly four decimal octets (0-255) separated by
+ *         '.', then ':' and a decimal port (0-65535) with nothing after
+ *         it; false otherwise, in which case tcpaddr is left untouched
+ */
+bool parse_ip_port(const char *s, tcpaddr_t& tcpaddr)
+{
+  unsigned char addr[4];
+  int port = 0;
+
+  // fields 0..3 are the address octets, field 4 is the port
+  for (int count = 0; count < 5; count++) {
+    // parse the number
+    int val = 0;
+    int numdigits = 0;
+
+    while (*s >= '0' && *s <= '9') {
+      val = val * 10 + (*s - '0');
+      // nothing we accept exceeds 65535; bailing out here also keeps
+      // val from overflowing on very long digit strings
+      if (val > 65535) return false;
+      numdigits++;
+      s++;
+    }
+
+    if (numdigits == 0) return false;     // no digits
+
+    if (count < 4) {
+      if (val > 255) return false;        // octet would not fit in a byte
+      // three periods between octets, then a colon before the port
+      const char sep = (count < 3) ? '.' : ':';
+      if (*s != sep) return false;
+      s++;
+      addr[count] = val;
+    } else {
+      if (*s != '\0') return false;       // trailing garbage after the port
+      port = val;
+    }
+  }
+
+  // copy into inst
+  memcpy((char*)&tcpaddr.sin_addr.s_addr, (char*)addr, 4);
+  tcpaddr.sin_port = port;
+
+  return true;
+}
+
+
+/**
+ * mkmonmap entry point.
+ *
+ * Usage: mkmonmap [--out <file>] ip:port [...]
+ *
+ * Every non-option argument is parsed as ip:port and added to a MonMap,
+ * which is then written to the output file (".ceph_monmap" unless --out
+ * is given).
+ *
+ * @return 0 on success, -1 on a usage/parse error or if the map could not
+ *         be written
+ */
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  MonMap monmap;
+
+  char *outfn = ".ceph_monmap";
+
+  for (unsigned i=0; i<args.size(); i++) {
+    if (strcmp(args[i], "--out") == 0) {
+      // --out needs a following filename; don't read past the end of args
+      if (i+1 >= args.size()) {
+        cerr << "mkmonmap: --out requires a filename" << endl;
+        return -1;
+      }
+      outfn = args[++i];
+    } else {
+      // parse ip:port
+      tcpaddr_t addr;
+      if (!parse_ip_port(args[i], addr)) {
+        cerr << "mkmonmap: invalid ip:port '" << args[i] << "'" << endl;
+        return -1;
+      }
+      entity_inst_t inst;
+      inst.set_addr(addr);
+      cout << "mkmonmap: mon" << monmap.num_mon << " " << inst << endl;
+      monmap.add_mon(inst);
+    }
+  }
+
+  if (monmap.num_mon == 0) {
+    cerr << "usage: mkmonmap ip:port [...]" << endl;
+    return -1;
+  }
+
+  // write it out
+  cout << "mkmonmap: writing monmap to " << outfn << " (" << monmap.num_mon << " monitors)" << endl;
+  int r = monmap.write(outfn);
+  if (r < 0) {
+    // report the failure explicitly instead of relying on assert(), which
+    // vanishes under NDEBUG
+    cerr << "mkmonmap: error writing monmap to " << outfn << endl;
+    return -1;
+  }
+
+  return 0;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "ClientMonitor.h"
+#include "Monitor.h"
+#include "MDSMonitor.h"
+
+#include "messages/MClientBoot.h"
+#include "messages/MMDSMap.h"
+//#include "messages/MMDSFailure.h"
+
+#include "common/Timer.h"
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client "
+
+
+
+
+/*
+ * Route an incoming message to its handler.  Only MSG_CLIENT_BOOT is
+ * handled; any other type trips assert(0).
+ */
+void ClientMonitor::dispatch(Message *m)
+{
+  if (m->get_type() == MSG_CLIENT_BOOT) {
+    handle_client_boot((MClientBoot*)m);
+    return;
+  }
+
+  /* disabled:
+  case MSG_client_FAILURE:
+    handle_client_failure((MClientFailure*)m);
+    break;
+  */
+
+  // unknown message type
+  assert(0);
+}
+
+/*
+ * Handle a client boot message: settle on a client id, record the client's
+ * instance in client_map, and have the MDS monitor send it the latest map.
+ * The message is deleted before returning.
+ */
+void ClientMonitor::handle_client_boot(MClientBoot *m)
+{
+  dout(7) << "client_boot from " << m->get_source() << " at " << m->get_source_inst() << endl;
+  assert(m->get_source().is_client());
+
+  int from = m->get_source().num();
+
+  // choose a client id: assign a fresh one if the client has none yet
+  // (negative), or if its address is already registered to a different
+  // instance.
+  bool assign_new = (from < 0);
+  if (!assign_new && client_map.count(m->get_source()))
+    assign_new = (client_map[m->get_source()] != m->get_source_inst());
+
+  if (assign_new) {
+    from = ++num_clients;
+    dout(10) << "client_boot assigned client" << from << endl;
+  }
+
+  client_map[MSG_ADDR_CLIENT(from)] = m->get_source_inst();
+
+  // reply with latest mds map
+  mon->mdsmon->send_latest(MSG_ADDR_CLIENT(from), m->get_source_inst());
+
+  delete m;
+}
+
+/*
+void ClientMonitor::handle_mds_shutdown(Message *m)
+{
+ assert(m->get_source().is_mds());
+ int from = m->get_source().num();
+
+ mdsmap.mds_inst.erase(from);
+ mdsmap.all_mds.erase(from);
+
+ dout(7) << "mds_shutdown from " << m->get_source()
+ << ", still have " << mdsmap.all_mds
+ << endl;
+
+ // tell someone?
+ // fixme
+
+ delete m;
+}
+
+*/
+
+/*
+void ClientMonitor::bcast_latest_mds()
+{
+ dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl;
+
+ // tell mds
+ for (set<int>::iterator p = mdsmap.get_mds().begin();
+ p != mdsmap.get_mds().end();
+ p++) {
+ if (mdsmap.is_down(*p)) continue;
+ send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p));
+ }
+}
+
+*/
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __CLIENTMONITOR_H
+#define __CLIENTMONITOR_H
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "mds/MDSMap.h"
+
+class Monitor;
+
+/**
+ * ClientMonitor - Dispatcher that keeps a map of client address ->
+ * instance and a counter used to hand out new client ids.
+ */
+class ClientMonitor : public Dispatcher {
+  Monitor   *mon;
+  Messenger *messenger;
+  Mutex     &lock;
+
+  int num_clients;                            // last client id handed out
+  map<msg_addr_t,entity_inst_t> client_map;   // client address -> instance
+
+  void bcast_latest_mds();
+
+  //void accept_pending();   // accept pending, new map.
+  //void send_incremental(epoch_t since, msg_addr_t dest);
+
+  void handle_client_boot(class MClientBoot *m);
+
+ public:
+  ClientMonitor(Monitor *mn, Messenger *m, Mutex& l)
+    : mon(mn),
+      messenger(m),
+      lock(l),
+      num_clients(0) { }
+
+  void dispatch(Message *m);
+  void tick();   // check state, take actions
+};
+
+#endif
--- /dev/null
+
+#include "Elector.h"
+#include "Monitor.h"
+
+#include "common/Timer.h"
+
+#include "messages/MMonElectionPropose.h"
+#include "messages/MMonElectionAck.h"
+#include "messages/MMonElectionVictory.h"
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " "
+
+
+/*
+ * Begin an election round with this monitor as the candidate: reset the
+ * ack state, send a propose message to every other monitor, and (re)arm
+ * the election timer.
+ */
+void Elector::start()
+{
+  dout(5) << "start -- can i be leader?" << endl;
+
+  // we are no longer deferring to anybody
+  leader_acked = -1;
+
+  // nominate myself; i count as my own first ack
+  electing_me = true;
+  start_stamp = g_clock.now();
+  acked_me.clear();
+  acked_me.insert(whoami);
+
+  // propose to all the other monitors
+  for (unsigned i=0; i<mon->monmap->num_mon; ++i) {
+    if (i != whoami) {
+      mon->messenger->send_message(new MMonElectionPropose,
+                                   MSG_ADDR_MON(i), mon->monmap->get_inst(i));
+    }
+  }
+
+  reset_timer();
+}
+
+/*
+ * Give up any candidacy of our own and acknowledge monitor `who` as the
+ * candidate, then (re)arm the election timer.
+ */
+void Elector::defer(int who)
+{
+  dout(5) << "defer -- i'm deferring to " << who << endl;
+
+  // abandon my own bid, if any
+  if (electing_me) {
+    electing_me = false;
+    acked_me.clear();
+  }
+
+  // remember whom we acked, and when
+  leader_acked = who;
+  ack_stamp = g_clock.now();
+
+  // send the ack
+  mon->messenger->send_message(new MMonElectionAck,
+                               MSG_ADDR_MON(who), mon->monmap->get_inst(who));
+
+  reset_timer();
+}
+
+
+/*
+ * Timer callback: calls Elector::expire() on the elector it was created
+ * with.  The completion code r is ignored.
+ */
+class C_Mon_ElectionExpire : public Context {
+  Elector *target;
+public:
+  C_Mon_ElectionExpire(Elector *e) : target(e) { }
+  void finish(int r) {
+    target->expire();
+  }
+};
+
+/*
+ * Cancel any pending election timer and schedule a new expiry event
+ * g_conf.mon_lease from now.
+ */
+void Elector::reset_timer()
+{
+  cancel_timer();
+
+  Context *ev = new C_Mon_ElectionExpire(this);
+  expire_event = ev;
+  g_timer.add_event_after(g_conf.mon_lease, ev);
+}
+
+/*
+ * Cancel the pending election timer event, if there is one.
+ *
+ * The stored pointer is cleared afterwards so a later call does not hand
+ * the same (already cancelled) event back to the timer.
+ * NOTE(review): whether g_timer.cancel_event() frees the Context is not
+ * visible here -- confirm ownership against common/Timer.
+ */
+void Elector::cancel_timer()
+{
+  if (expire_event) {
+    g_timer.cancel_event(expire_event);
+    expire_event = 0;
+  }
+}
+
+/*
+ * Election timer fired.  If we are the candidate and more than half of the
+ * monitors have acked us, declare victory; otherwise start a new round.
+ */
+void Elector::expire()
+{
+  dout(5) << "election timer expired" << endl;
+
+  // did i win?  (need a strict majority of acks)
+  const bool won = electing_me &&
+                   acked_me.size() > mon->monmap->num_mon / 2;
+
+  if (!won) {
+    // either i lost, or whoever i deferred to didn't declare victory
+    // quickly enough.  try again.
+    start();
+    return;
+  }
+
+  victory();
+}
+
+
+/*
+ * Announce our victory to every other monitor, then tell the local
+ * Monitor it won, passing along the set of monitors that acked us.
+ */
+void Elector::victory()
+{
+  // announce to everyone but myself
+  for (unsigned i=0; i<mon->monmap->num_mon; ++i) {
+    if (i != whoami) {
+      mon->messenger->send_message(new MMonElectionVictory,
+                                   MSG_ADDR_MON(i), mon->monmap->get_inst(i));
+    }
+  }
+
+  // inform the local monitor
+  mon->win_election(acked_me);
+}
+
+
+/*
+ * Handle a propose from another monitor.  Lower-numbered monitors win:
+ * if the proposer ranks below us we (re)start our own bid, otherwise we
+ * defer to it unless we already acked someone who beats it.
+ * The message is deleted before returning.
+ */
+void Elector::handle_propose(MMonElectionPropose *m)
+{
+  dout(5) << "propose from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  if (from > whoami) {
+    // wait, i should win!
+    if (!electing_me)
+      start();
+  } else if (leader_acked < 0 ||      // haven't acked anyone yet, or
+             leader_acked > from) {   // they beat whoever i did ack
+    defer(from);
+  } else {
+    // already acked someone at least as good; ignore them
+    dout(5) << "no, we already acked " << leader_acked << endl;
+  }
+
+  delete m;
+}
+
+/*
+ * Handle an ack from another monitor.  While we are the candidate the
+ * sender is recorded; once every monitor has acked we declare victory
+ * right away.  Acks are ignored if we are not a candidate.
+ * The message is deleted before returning.
+ */
+void Elector::handle_ack(MMonElectionAck *m)
+{
+  dout(5) << "ack from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  if (!electing_me) {
+    // ignore, i'm deferring already.
+    delete m;
+    return;
+  }
+
+  // thanks
+  acked_me.insert(from);
+  dout(5) << " so far i have " << acked_me << endl;
+
+  // is that _everyone_?  if so, shortcut to election finish
+  if (acked_me.size() == mon->monmap->num_mon)
+    victory();
+
+  delete m;
+}
+
+/*
+ * Handle a victory announcement from another monitor.  A lower-numbered
+ * monitor is accepted as the winner (the local Monitor is told it lost
+ * and our timer is cancelled); a claim from anyone else is rejected by
+ * starting a new election.
+ * The message is deleted before returning, as in the other handlers.
+ */
+void Elector::handle_victory(MMonElectionVictory *m)
+{
+  dout(5) << "victory from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  if (from < whoami) {
+    // ok, fine, they win
+    mon->lose_election(from);
+
+    // cancel my timer
+    cancel_timer();
+  } else {
+    // no, that makes no sense, i should win.  start over!
+    start();
+  }
+
+  // we own the message; free it like handle_propose/handle_ack do
+  delete m;
+}
+
+
+
+
+/*
+ * Route an election message to the matching handler.  Any type other
+ * than propose / ack / victory trips assert(0).
+ */
+void Elector::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  case MSG_MON_ELECTION_PROPOSE:
+    handle_propose((MMonElectionPropose*)m);
+    break;
+
+  case MSG_MON_ELECTION_ACK:
+    handle_ack((MMonElectionAck*)m);
+    break;
+
+  case MSG_MON_ELECTION_VICTORY:
+    handle_victory((MMonElectionVictory*)m);
+    break;
+
+  default:
+    // not an election message
+    assert(0);
+  }
+}
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MON_ELECTOR_H
+#define __MON_ELECTOR_H
+
+#include <map>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+#include "include/Context.h"
+
+#include "common/Timer.h"
+
+class Monitor;
+
+
+/**
+ * Elector - leader election state for one monitor.
+ *
+ * Tracks whether this monitor is currently a candidate (and who has acked
+ * it) or has deferred to another monitor, plus the pending election timer
+ * event.
+ */
+class Elector {
+ private:
+  Monitor *mon;
+  int whoami;
+
+  Context *expire_event;   // pending election timer event, or 0 if none
+
+  void reset_timer();
+  void cancel_timer();
+
+  // electing me
+  bool     electing_me;
+  utime_t  start_stamp;
+  set<int> acked_me;
+
+  // electing them
+  int      leader_acked;  // who i've acked (-1 = nobody)
+  utime_t  ack_stamp;     // and when
+
+ public:
+
+  void start();   // start electing me
+  void defer(int who);
+  void expire();  // timer goes off
+  void victory();
+
+  void handle_propose(class MMonElectionPropose *m);
+  void handle_ack(class MMonElectionAck *m);
+  void handle_victory(class MMonElectionVictory *m);
+
+
+ public:
+  // Start with no timer pending, not a candidate, and nobody acked, so
+  // that cancel_timer() and handle_propose() never read indeterminate
+  // values before the first start()/defer().
+  Elector(Monitor *m, int w) :
+    mon(m), whoami(w),
+    expire_event(0),
+    electing_me(false),
+    leader_acked(-1) {
+  }
+
+  void dispatch(Message *m);
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDSMonitor.h"
+#include "Monitor.h"
+
+#include "messages/MMDSBoot.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMDSGetMap.h"
+//#include "messages/MMDSFailure.h"
+
+#include "common/Timer.h"
+
+#include "config.h"
+#undef dout
+// Debug logging helpers for this file.  dout(l) / derr(l) stream to cout /
+// cerr when l <= g_conf.debug or l <= g_conf.debug_mon.  Each line is prefixed
+// with the clock time, the monitor id, its role (starting/leader/peon) and the
+// current mdsmap epoch.
+// NOTE: each macro expands to a bare `if (...) stream`, so using it as the
+// unbraced body of an if/else would capture the following `else`.
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " "
+
+
+
+/********* MDS map **************/
+
+// Build the initial MDS map: epoch 0, creation time set to now, and ids
+// 0 .. g_conf.num_mds-1 registered both as known and as down.
+void MDSMonitor::create_initial()
+{
+  // epoch stays at 0 until everyone boots
+  mdsmap.epoch = 0;
+  mdsmap.ctime = g_clock.now();
+
+  int id = 0;
+  while (id < g_conf.num_mds) {
+    mdsmap.all_mds.insert(id);
+    mdsmap.down_mds.insert(id);
+    ++id;
+  }
+}
+
+// Route an MDS-related message to its handler according to the message type.
+// Unknown types trip an assertion.
+void MDSMonitor::dispatch(Message *m)
+{
+  const int type = m->get_type();
+
+  if (type == MSG_MDS_BOOT) {
+    handle_mds_boot((MMDSBoot*)m);
+  } else if (type == MSG_MDS_GETMAP) {
+    handle_mds_getmap((MMDSGetMap*)m);
+  } else if (type == MSG_SHUTDOWN) {
+    handle_mds_shutdown(m);
+  } else {
+    // failure reports are not handled yet:
+    /*
+    case MSG_MDS_FAILURE:
+      handle_mds_failure((MMDSFailure*)m);
+      break;
+    */
+    assert(0);
+  }
+}
+
+// Handle a boot announcement from an MDS.
+// The sender keeps its own id when that id is currently marked down;
+// otherwise the first id reported down by the map is assigned.  While the map
+// is at epoch 0 the instance is recorded; once the number of recorded
+// instances equals get_num_mds() the epoch is bumped, the encoded map is
+// stored in maps[] and sent to all up MDSs and to queued requesters.
+// A boot after epoch 0 is not implemented and asserts.
+// The message is released on every path (it used to be leaked).
+void MDSMonitor::handle_mds_boot(MMDSBoot *m)
+{
+  dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl;
+  assert(m->get_source().is_mds());
+  int from = m->get_source().num();
+
+  // choose an MDS id
+  if (from < 0 || !mdsmap.is_down(from)) {
+    // NOTE(review): this scan has no upper bound; it only terminates if
+    // is_down() eventually returns true for some id.
+    for (from=0; ; ++from)
+      if (mdsmap.is_down(from)) break;
+    dout(10) << "mds_boot assigned mds" << from << endl;
+  }
+
+  if (mdsmap.get_epoch() == 0) {
+    // waiting for boot!
+    mdsmap.mds_inst[from] = m->get_source_inst();
+    mdsmap.down_mds.erase(from);
+
+    if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) {
+      mdsmap.inc_epoch();
+      dout(-7) << "mds_boot all MDSs booted." << endl;
+      mdsmap.encode(maps[mdsmap.get_epoch()]);  // 1
+
+      bcast_latest_mds();
+      send_current();
+    } else {
+      dout(7) << "mds_boot waiting for "
+              << (mdsmap.get_num_mds() - mdsmap.mds_inst.size())
+              << " mdss to boot" << endl;
+    }
+  } else {
+    dout(0) << "mds_boot everyone already booted, so who is this? write me." << endl;
+    assert(0);
+  }
+
+  // the instance was copied into the map above; we own the message.
+  delete m;
+}
+
+// Handle a shutdown notice from an MDS: drop the sender from the set of known
+// MDSs and from the instance table, log what remains, and free the message.
+void MDSMonitor::handle_mds_shutdown(Message *m)
+{
+  assert(m->get_source().is_mds());
+  const int who = m->get_source().num();
+
+  // forget the departing mds
+  mdsmap.all_mds.erase(who);
+  mdsmap.mds_inst.erase(who);
+
+  dout(7) << "mds_shutdown from " << m->get_source()
+          << ", still have " << mdsmap.all_mds
+          << endl;
+
+  // tell someone?
+  // fixme
+
+  delete m;
+}
+
+
+// Handle a request for the MDS map.
+// Delegates to send_latest(), which sends the map right away once the epoch is
+// above 0 and otherwise queues the requester in awaiting_map.
+// The message is released afterwards (it used to be leaked).
+void MDSMonitor::handle_mds_getmap(MMDSGetMap *m)
+{
+  dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl;
+
+  // same send-now-or-queue decision as any other requester
+  send_latest(m->get_source(), m->get_source_inst());
+
+  delete m;
+}
+
+
+// Send the current MDS map to every MDS in the map that is not marked down.
+void MDSMonitor::bcast_latest_mds()
+{
+  dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl;
+
+  // walk all known mds ids, skipping the ones that are down
+  set<int>::iterator cur = mdsmap.get_mds().begin();
+  while (cur != mdsmap.get_mds().end()) {
+    if (!mdsmap.is_down(*cur)) {
+      send_full(MSG_ADDR_MDS(*cur), mdsmap.get_inst(*cur));
+    }
+    cur++;
+  }
+}
+
+// Send a full copy of the current MDS map to `dest` at instance `inst`.
+void MDSMonitor::send_full(msg_addr_t dest, const entity_inst_t& inst)
+{
+  dout(11) << "send_full to " << dest << " inst " << inst << endl;
+
+  MMDSMap *reply = new MMDSMap(&mdsmap);
+  messenger->send_message(reply, dest, inst);
+}
+
+// Send the current map to everyone queued in awaiting_map, then empty the
+// queue.
+void MDSMonitor::send_current()
+{
+  dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl;
+
+  map<msg_addr_t,entity_inst_t>::iterator waiter = awaiting_map.begin();
+  while (waiter != awaiting_map.end()) {
+    send_full(waiter->first, waiter->second);
+    ++waiter;
+  }
+
+  awaiting_map.clear();
+}
+
+// Give `dest` the latest map: send it now when the epoch is above 0,
+// otherwise remember the requester in awaiting_map.
+void MDSMonitor::send_latest(msg_addr_t dest, const entity_inst_t& inst)
+{
+  // FIXME: check if we're locked, etc.
+  if (!(mdsmap.get_epoch() > 0)) {
+    // no map published yet; queue the request
+    awaiting_map[dest] = inst;
+    return;
+  }
+
+  send_full(dest, inst);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDSMONITOR_H
+#define __MDSMONITOR_H
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "mds/MDSMap.h"
+
+class Monitor;
+
+// MDSMonitor: the part of a Monitor that maintains the MDS map and hands it
+// out to MDSs and other requesters.
+class MDSMonitor : public Dispatcher {
+ // owning monitor, messenger used for sends, and a reference to a mutex
+ // supplied by the creator
+ Monitor *mon;
+ Messenger *messenger;
+ Mutex &lock;
+
+ // mds maps
+ public:
+ // the current map
+ MDSMap mdsmap;
+
+ private:
+ // encoded maps, keyed by epoch (filled in handle_mds_boot)
+ map<epoch_t, bufferlist> maps;
+
+ //map<epoch_t, bufferlist> inc_maps;
+ //MDSMap::Incremental pending_inc;
+
+ // requesters that asked for the map while the epoch was still 0
+ map<msg_addr_t,entity_inst_t> awaiting_map;
+
+
+ // maps
+ void create_initial();
+ void send_current(); // send current map to waiters.
+ void send_full(msg_addr_t dest, const entity_inst_t& inst);
+ void bcast_latest_mds();
+
+ //void accept_pending(); // accept pending, new map.
+ //void send_incremental(epoch_t since, msg_addr_t dest);
+
+ // message handlers, reached through dispatch().
+ // NOTE(review): the MSG_MDS_FAILURE case in dispatch() is commented out, so
+ // handle_mds_failure is currently not reachable from there.
+ void handle_mds_boot(class MMDSBoot *m);
+ void handle_mds_failure(class MMDSFailure *m);
+ void handle_mds_getmap(class MMDSGetMap *m);
+ void handle_mds_shutdown(Message *m);
+
+
+
+ public:
+ // Stores the collaborators and builds the initial (epoch 0) map.
+ MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) {
+ create_initial();
+ }
+
+ void dispatch(Message *m);
+ void tick(); // check state, take actions
+
+ // send the map to dest now, or queue the request if no map exists yet
+ void send_latest(msg_addr_t dest, const entity_inst_t& inst);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MONMAP_H
+#define __MONMAP_H
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+// MonMap: the list of monitor instances, plus helpers to pick a monitor and
+// to encode/decode the map to a bufferlist or a file.
+class MonMap {
+ public:
+  epoch_t epoch;       // what epoch of the osd cluster descriptor is this
+  int num_mon;         // number of monitors
+  vector<entity_inst_t> mon_inst;   // instance of each monitor, by id
+
+  int last_mon;        // last mon i talked to
+
+  MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {}
+
+  // Append a monitor instance and bump the count.
+  void add_mon(entity_inst_t inst) {
+    mon_inst.push_back(inst);
+    num_mon++;
+  }
+
+  // pick a mon.
+  // choice should be stable, unless we explicitly ask for a new one.
+  int pick_mon(bool newmon=false) {
+    if (newmon || (last_mon < 0)) {
+      last_mon = 0;  //last_mon = rand() % num_mon;
+    }
+    return last_mon;
+  }
+
+  // Return the instance of monitor m; m must be in [0, num_mon).
+  const entity_inst_t get_inst(int m) {
+    assert(m >= 0 && m < num_mon);   // a negative id would index out of bounds
+    return mon_inst[m];
+  }
+
+  // Append epoch, num_mon and the instance vector to blist.
+  void encode(bufferlist& blist) {
+    blist.append((char*)&epoch, sizeof(epoch));
+    blist.append((char*)&num_mon, sizeof(num_mon));
+
+    _encode(mon_inst, blist);
+  }
+
+  // Inverse of encode(): read epoch, num_mon and the instances from blist.
+  void decode(bufferlist& blist) {
+    int off = 0;
+    blist.copy(off, sizeof(epoch), (char*)&epoch);
+    off += sizeof(epoch);
+    blist.copy(off, sizeof(num_mon), (char*)&num_mon);
+    off += sizeof(num_mon);
+
+    _decode(mon_inst, blist, off);
+  }
+
+  // Write the encoded map to file fn.
+  // Returns 0 on success and a negative value when the file cannot be opened
+  // or is not written completely.
+  int write(char *fn) {
+    // encode
+    bufferlist bl;
+    encode(bl);
+
+    // write
+    // O_CREAT requires an explicit mode argument; O_TRUNC drops any stale
+    // tail left over from a previously written, longer map.
+    int fd = ::open(fn, O_RDWR|O_CREAT|O_TRUNC, 0644);
+    if (fd < 0) return fd;
+    ::fchmod(fd, 0644);   // the creation mode is subject to the umask
+    ssize_t written = ::write(fd, (void*)bl.c_str(), bl.length());
+    int closed = ::close(fd);
+    if (written < 0 || (size_t)written != (size_t)bl.length() || closed < 0)
+      return -1;
+    return 0;
+  }
+
+  // Read and decode the map from file fn.
+  // Returns 0 on success and a negative value when the file cannot be opened,
+  // stat'ed or read completely, or is too short to hold the fixed header.
+  int read(char *fn) {
+    // read
+    bufferlist bl;
+    int fd = ::open(fn, O_RDONLY);
+    if (fd < 0) return fd;
+    struct stat st;
+    if (::fstat(fd, &st) < 0) {
+      ::close(fd);
+      return -1;
+    }
+    bufferptr bp(st.st_size);
+    bl.append(bp);
+    ssize_t got = ::read(fd, (void*)bl.c_str(), bl.length());
+    ::close(fd);
+    if (got < 0 || (size_t)got != (size_t)bl.length())
+      return -1;
+    // decode() copies epoch and num_mon unconditionally; refuse short files.
+    if ((size_t)bl.length() < sizeof(epoch) + sizeof(num_mon))
+      return -1;
+
+    // decode
+    decode(bl);
+    return 0;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer
+
+#include "Monitor.h"
+
+#include "osd/OSDMap.h"
+
+#include "ebofs/Ebofs.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MGenericMessage.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#include "OSDMonitor.h"
+#include "MDSMonitor.h"
+#include "ClientMonitor.h"
+
+#include "config.h"
+#undef dout
+// Debug logging helpers for this file.  dout(l) / derr(l) stream to cout /
+// cerr when l <= g_conf.debug or l <= g_conf.debug_mon.  Each line is prefixed
+// with the clock time, the monitor id and its role (starting/leader/peon).
+// NOTE: each macro expands to a bare `if (...) stream`, so using it as the
+// unbraced body of an if/else would capture the following `else`.
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " "
+
+
+
+// Bring the monitor up: open (optionally mkfs) and mount the local store,
+// create the osd/mds/client services, register as the messenger's
+// dispatcher, start the tick timer and, with three or more monitors, call
+// an election.  A two-monitor map trips an assertion.
+void Monitor::init()
+{
+  dout(1) << "init" << endl;
+
+  // local store lives at dev/mon<id>
+  char path[80];
+  sprintf(path, "dev/mon%d", whoami);
+  store = new Ebofs(path);
+
+  if (g_conf.mkfs) {
+    store->mkfs();
+  }
+  int mounted = store->mount();
+  assert(mounted >= 0);
+
+  // services
+  osdmon = new OSDMonitor(this, messenger, lock);
+  mdsmon = new MDSMonitor(this, messenger, lock);
+  clientmon = new ClientMonitor(this, messenger, lock);
+
+  // ready to receive messages
+  messenger->set_dispatcher(this);
+
+  // periodic tick
+  reset_tick();
+
+  // election only makes sense with at least three monitors
+  assert(monmap->num_mon != 2);
+  if (monmap->num_mon < 3) {
+    return;
+  }
+  call_election();
+}
+
+// Shut the monitor down: stop the tick timer, unmount and free the store,
+// send MSG_SHUTDOWN to every OSD that is not down and to every other monitor,
+// free the maps/services and finally shut down and free the messenger.
+// Every pointer is checked before use and reset to 0 after deletion, so a
+// monitor that was never init()ed, or a repeated call, does not dereference
+// or double-free a stale pointer.
+void Monitor::shutdown()
+{
+  dout(1) << "shutdown" << endl;
+
+  cancel_tick();
+
+  if (store) {
+    store->umount();
+    delete store;
+    store = 0;
+  }
+
+  // stop osds.
+  if (osdmon && messenger) {
+    for (set<int>::iterator it = osdmon->osdmap.get_osds().begin();
+         it != osdmon->osdmap.get_osds().end();
+         it++) {
+      if (osdmon->osdmap.is_down(*it)) continue;
+      dout(10) << "sending shutdown to osd" << *it << endl;
+      messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
+                              MSG_ADDR_OSD(*it), osdmon->osdmap.get_inst(*it));
+    }
+  }
+
+  // monitors too.
+  if (monmap && messenger) {
+    for (int i=0; i<monmap->num_mon; i++) {
+      if (i == whoami) continue;
+      messenger->send_message(new MGenericMessage(MSG_SHUTDOWN),
+                              MSG_ADDR_MON(i), monmap->get_inst(i));
+    }
+  }
+
+  // clean up (deleting a null pointer is a no-op)
+  delete monmap;     monmap = 0;
+  delete osdmon;     osdmon = 0;
+  delete mdsmon;     mdsmon = 0;
+  delete clientmon;  clientmon = 0;
+
+  // die.
+  if (messenger) {
+    messenger->shutdown();
+    delete messenger;
+    messenger = 0;
+  }
+}
+
+
+// Start a new election: enter STATE_STARTING, kick the elector and tell the
+// osd service.  Does nothing when the map holds a single monitor.
+void Monitor::call_election()
+{
+  if (monmap->num_mon != 1) {
+    dout(10) << "call_election" << endl;
+    state = STATE_STARTING;
+
+    elector.start();
+
+    osdmon->election_starting();
+    //mdsmon->election_starting();
+  }
+}
+
+
+
+
+
+// Top-level message entry point.  Takes the monitor lock and routes the
+// message, by type, to the osd / mds / client service or to the elector.
+// Unknown types are logged and trip an assertion.
+//
+// MSG_SHUTDOWN is routed by sender: from an MDS it goes to the mds service
+// (and the whole monitor shuts down once the mds map reports no MDSs); from
+// an OSD it goes to the osd service.  From any other sender it is logged and
+// freed - previously it was dropped silently and leaked.
+// NOTE(review): handle_shutdown() exists but is not wired in here; whether a
+// shutdown from a peer monitor should stop this monitor needs a decision.
+void Monitor::dispatch(Message *m)
+{
+  lock.Lock();
+  {
+    switch (m->get_type()) {
+
+      // misc
+    case MSG_PING_ACK:
+      handle_ping_ack((MPingAck*)m);
+      break;
+
+    case MSG_SHUTDOWN:
+      if (m->get_source().is_mds()) {
+        mdsmon->dispatch(m);
+        if (mdsmon->mdsmap.get_num_mds() == 0)
+          shutdown();
+      }
+      else if (m->get_source().is_osd()) {
+        osdmon->dispatch(m);
+      }
+      else {
+        // nobody consumes this one; do not leak it.
+        dout(1) << "ignoring shutdown from " << m->get_source() << endl;
+        delete m;
+      }
+      break;
+
+
+      // OSDs
+    case MSG_OSD_GETMAP:
+    case MSG_OSD_FAILURE:
+    case MSG_OSD_BOOT:
+    case MSG_OSD_IN:
+    case MSG_OSD_OUT:
+      osdmon->dispatch(m);
+      break;
+
+
+      // MDSs
+    case MSG_MDS_BOOT:
+    case MSG_MDS_GETMAP:
+      mdsmon->dispatch(m);
+      break;
+
+      // clients
+    case MSG_CLIENT_BOOT:
+      clientmon->dispatch(m);
+      break;
+
+
+      // elector messages
+    case MSG_MON_ELECTION_PROPOSE:
+    case MSG_MON_ELECTION_ACK:
+    case MSG_MON_ELECTION_VICTORY:
+      elector.dispatch(m);
+      break;
+
+
+    default:
+      dout(0) << "unknown message " << *m << endl;
+      assert(0);
+    }
+  }
+  lock.Unlock();
+}
+
+
+// Handle a shutdown request: log the sender, shut the monitor down, then
+// free the message.  The sender is logged first because shutdown() tears
+// down the messenger and services.
+void Monitor::handle_shutdown(Message *m)
+{
+ dout(1) << "shutdown from " << m->get_source() << endl;
+
+ shutdown();
+ delete m;
+}
+
+// Handle a ping acknowledgement.  Nothing is done with it yet; the message
+// is simply freed.
+void Monitor::handle_ping_ack(MPingAck *m)
+{
+ // ...
+
+ delete m;
+}
+
+
+
+
+/************ TIMER ***************/
+
+// Timer callback that forwards to Monitor::tick(), passing itself so the
+// monitor can tell whether this is still the timer it armed.
+class C_Mon_Tick : public Context {
+public:
+  C_Mon_Tick(Monitor *m) : mon(m) {}
+
+  void finish(int r) {
+    mon->tick(this);
+  }
+
+private:
+  Monitor *mon;
+};
+
+
+// Cancel the pending tick timer, if any.
+// When g_timer.cancel_event() reports success nothing else is needed.  When
+// it does not, the event is treated as already dispatched: tick_timer is set
+// to a sentinel that cannot equal the real event, so Monitor::tick() takes
+// its "canceled" branch, resets tick_timer to 0 and signals tick_timer_cond;
+// this function waits on that condition until tick_timer is 0.
+// NOTE(review): presumably called with `lock` held, since it is handed to
+// Wait() - confirm against callers.
+void Monitor::cancel_tick()
+{
+ if (!tick_timer) return;
+
+ if (g_timer.cancel_event(tick_timer)) {
+ dout(10) << "cancel_tick canceled" << endl;
+ } else {
+ // already dispatched!
+ dout(10) << "cancel_tick timer dispatched, waiting to cancel" << endl;
+ tick_timer = (Context*)1; // hackish.
+ while (tick_timer)
+ tick_timer_cond.Wait(lock);
+ }
+}
+
+// (Re)arm the tick timer: cancel any pending one, then schedule a fresh
+// C_Mon_Tick after g_conf.mon_tick_interval.
+void Monitor::reset_tick()
+{
+  // cancel_tick() returns immediately when no timer is set
+  cancel_tick();
+
+  Context *next = new C_Mon_Tick(this);
+  tick_timer = next;
+  g_timer.add_event_after(g_conf.mon_tick_interval, next);
+}
+
+
+// Timer callback body.  Under the monitor lock: if `timer` is still the
+// event we armed, clear tick_timer, run the osd service's tick and re-arm;
+// otherwise the tick was canceled while in flight, so clear tick_timer and
+// signal tick_timer_cond for the waiter in cancel_tick().
+void Monitor::tick(Context *timer)
+{
+  lock.Lock();
+
+  if (tick_timer == timer) {
+    tick_timer = 0;
+
+    // ok go.
+    dout(10) << "tick" << endl;
+
+    osdmon->tick();
+
+    // next tick!
+    reset_tick();
+  } else {
+    dout(10) << "tick - canceled" << endl;
+    tick_timer = 0;
+    tick_timer_cond.Signal();
+  }
+
+  lock.Unlock();
+}
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MONITOR_H
+#define __MONITOR_H
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "MonMap.h"
+#include "Elector.h"
+
+class ObjectStore;
+class OSDMonitor;
+class MDSMonitor;
+class ClientMonitor;
+
+// Monitor: one cluster monitor.  It receives every incoming message through
+// dispatch(), forwards it to the osd / mds / client service or to the
+// elector, and drives a periodic tick.
+class Monitor : public Dispatcher {
+protected:
+ // me
+ int whoami;
+ Messenger *messenger;
+ Mutex lock;
+
+ // map of all monitors; deleted in shutdown()
+ MonMap *monmap;
+
+ // timer.
+ // tick_timer is the currently armed tick event (0 when none);
+ // tick_timer_cond is signalled by tick() when a canceled tick has drained.
+ Context *tick_timer;
+ Cond tick_timer_cond;
+ void cancel_tick();
+ void reset_tick();
+ friend class C_Mon_Tick;
+
+ // my local store
+ ObjectStore *store;
+
+ // first component of the object ids used in the local store
+ const static int INO_ELECTOR = 1;
+ const static int INO_MON_MAP = 2;
+ const static int INO_OSD_MAP = 10;
+ const static int INO_OSD_INC_MAP = 11;
+ const static int INO_MDS_MAP = 20;
+
+ // elector
+ Elector elector;
+ friend class Elector;
+
+ epoch_t mon_epoch; // monitor epoch (election instance)
+ set<int> quorum; // current active set of monitors (if !starting)
+
+ //void call_election();
+
+ // monitor state
+ const static int STATE_STARTING = 0; // electing
+ const static int STATE_LEADER = 1;
+ const static int STATE_PEON = 2;
+ int state;
+
+ int leader; // current leader (to best of knowledge)
+ utime_t last_called_election; // [starting] last time i called an election
+
+ bool is_starting() { return state == STATE_STARTING; }
+ bool is_leader() { return state == STATE_LEADER; }
+ bool is_peon() { return state == STATE_PEON; }
+
+ // my public services
+ // (created in init(), deleted in shutdown())
+ OSDMonitor *osdmon;
+ MDSMonitor *mdsmon;
+ ClientMonitor *clientmon;
+
+ // messages
+ void handle_shutdown(Message *m);
+ void handle_ping_ack(class MPingAck *m);
+
+ friend class OSDMonitor;
+ friend class MDSMonitor;
+ friend class ClientMonitor;
+
+
+ // initiate election
+ void call_election();
+
+ // called by Elector when it's finished
+ // we won: record ourselves as leader together with the active set
+ void win_election(set<int>& active) {
+ leader = whoami;
+ quorum = active;
+ state = STATE_LEADER;
+ }
+ // we lost: become a peon and remember leader l
+ void lose_election(int l) {
+ state = STATE_PEON;
+ leader = l;
+ }
+
+ public:
+ // w is our monitor id, m the messenger and mm the monitor map.
+ // The state is initialised to STATE_STARTING and then overridden in the
+ // body: monitor 0 starts as leader, every other monitor as peon.
+ Monitor(int w, Messenger *m, MonMap *mm) :
+ whoami(w),
+ messenger(m),
+ monmap(mm),
+ tick_timer(0),
+ store(0),
+ elector(this, w),
+ mon_epoch(0),
+ state(STATE_STARTING),
+ leader(0),
+ osdmon(0), mdsmon(0), clientmon(0)
+ {
+ // hack leader, until election works.
+ if (whoami == 0)
+ state = STATE_LEADER;
+ else
+ state = STATE_PEON;
+ }
+
+
+ void init();
+ void shutdown();
+ void dispatch(Message *m);
+ void tick(Context *timer);
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "OSDMonitor.h"
+#include "Monitor.h"
+#include "MDSMonitor.h"
+
+#include "osd/ObjectStore.h"
+
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "messages/MMonOSDMapInfo.h"
+#include "messages/MMonOSDMapLease.h"
+#include "messages/MMonOSDMapLeaseAck.h"
+#include "messages/MMonOSDMapUpdatePrepare.h"
+#include "messages/MMonOSDMapUpdateAck.h"
+#include "messages/MMonOSDMapUpdateCommit.h"
+
+#include "common/Timer.h"
+
+#include "config.h"
+#undef dout
+// Debug logging helpers for this file.  dout(l) / derr(l) stream to cout /
+// cerr when l <= g_conf.debug or l <= g_conf.debug_mon.  Each line is prefixed
+// with the clock time, the monitor id and role, this service's state
+// (init/sync/lock/updating) and the current osdmap epoch.
+// NOTE: each macro expands to a bare `if (...) stream`, so using it as the
+// unbraced body of an if/else would capture the following `else`.
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " "
+
+
+// Timer callback that injects a fake failure for one osd by calling
+// OSDMonitor::fake_osd_failure(osd, down).
+class C_Mon_FakeOSDFailure : public Context {
+public:
+  C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d)
+    : mon(m), osd(o), down(d)
+  {
+  }
+
+  void finish(int r) {
+    mon->fake_osd_failure(osd, down);
+  }
+
+private:
+  OSDMonitor *mon;
+  int osd;      // osd id to fail
+  bool down;    // true: mark DOWN, false: mark OUT
+};
+
+
+// Testing aid: commit whatever is pending as a new map epoch and send the
+// resulting incremental to one randomly chosen osd.
+void OSDMonitor::fake_osdmap_update()
+{
+  dout(1) << "fake_osdmap_update" << endl;
+  accept_pending();
+
+  // tell a random osd
+  const int target = rand() % g_conf.num_osd;
+  send_incremental(osdmap.get_epoch()-1,   // ick! FIXME
+                   MSG_ADDR_OSD(target),
+                   osdmap.get_inst(target));
+}
+
+
+// Testing aid: pick a random osd, flip it between in and out, commit the
+// change as a new epoch and send that osd the incremental.
+void OSDMonitor::fake_reorg()
+{
+  const int victim = rand() % g_conf.num_osd;
+
+  if (!osdmap.is_out(victim)) {
+    dout(1) << "fake_reorg marking osd" << victim << " out" << endl;
+    pending_inc.new_out.push_back(victim);
+  } else {
+    dout(1) << "fake_reorg marking osd" << victim << " in" << endl;
+    pending_inc.new_in.push_back(victim);
+  }
+
+  accept_pending();
+
+  // tell him!
+  send_incremental(osdmap.get_epoch()-1,
+                   MSG_ADDR_OSD(victim),
+                   osdmap.get_inst(victim));
+
+  // do it again?
+  /*
+  if (g_conf.num_osd - d > 4 &&
+      g_conf.num_osd - d > g_conf.num_osd/2)
+    g_timer.add_event_after(g_conf.fake_osdmap_expand,
+                            new C_Mon_Faker(this));
+  */
+}
+
+
+
+// Initialise the osd map: load the stored map (object for epoch 0) when the
+// store has one and prime pending_inc for the next epoch; otherwise, if this
+// monitor is the leader, generate the initial map and issue leases.
+void OSDMonitor::init()
+{
+  // start with blank map
+
+  // load my last state from the store
+  bufferlist stored;
+  if (!get_map_bl(0, stored)) {  // FIXME
+    // nothing stored.  FIXME. when elections work!
+    if (mon->is_leader()) {
+      create_initial();
+      issue_leases();
+    }
+    return;
+  }
+
+  // yay!
+  osdmap.decode(stored);
+  dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl;
+
+  // set up pending_inc
+  pending_inc.epoch = osdmap.get_epoch()+1;
+}
+
+
+
+
+/************ MAPS ****************/
+
+
+// Generate the initial osd map from g_conf.
+//  - pg bits come from g_conf.osd_pg_bits, or are derived from num_osd.
+//  - with 12 or more osds, the osds are spread over g_conf.osd_max_rep
+//    "failure domain" buckets under a list-bucket root; with fewer, all osds
+//    go into a single uniform bucket.
+//  - rules 1..osd_max_rep are created for the chosen layout.
+//  - with g_conf.mds_local_osd, ids 10000+i are added for each mds but kept
+//    out of the crush buckets.
+//  - finally the fake DOWN/OUT events from g_fake_osd_down/out are scheduled.
+//
+// Fix: the domain-filling loops now stop as soon as num_osd ids have been
+// placed.  The old inner `break` only left the inner loop, so when the last
+// osd landed before the final domain the remaining domains were filled with
+// ids >= num_osd.
+// NOTE(review): domain[]/domid[] are variable-length arrays (a compiler
+// extension); ndom is assumed to be > 0.
+void OSDMonitor::create_initial()
+{
+  dout(1) << "create_initial generating osdmap from g_conf" << endl;
+
+  // <HACK set up OSDMap from g_conf>
+  osdmap.mon_epoch = mon->mon_epoch;
+  osdmap.ctime = g_clock.now();
+
+  if (g_conf.osd_pg_bits) {
+    osdmap.set_pg_bits(g_conf.osd_pg_bits);
+  } else {
+    // one more than the number of significant bits in num_osd
+    int osdbits = 1;
+    int n = g_conf.num_osd;
+    while (n) {
+      n = n >> 1;
+      osdbits++;
+    }
+
+    // 2 bits per osd.
+    osdmap.set_pg_bits(osdbits + 2);
+  }
+
+  // start at epoch 0 until all osds boot
+  //osdmap.inc_epoch();  // = 1
+  //assert(osdmap.get_epoch() == 1);
+
+  if (g_conf.num_osd >= 12) {
+    int ndom = g_conf.osd_max_rep;
+    UniformBucket *domain[ndom];
+    int domid[ndom];
+    for (int i=0; i<ndom; i++) {
+      domain[i] = new UniformBucket(1, 0);
+      domid[i] = osdmap.crush.add_bucket(domain[i]);
+    }
+
+    // add osds
+    int nper = ((g_conf.num_osd - 1) / ndom) + 1;
+    cerr << ndom << " failure domains, " << nper << " osds each" << endl;
+    int i = 0;
+    for (int dom=0; dom<ndom && i<g_conf.num_osd; dom++) {
+      for (int j=0; j<nper && i<g_conf.num_osd; j++) {
+        osdmap.osds.insert(i);
+        domain[dom]->add_item(i, 1.0);
+        //cerr << "osd" << i << " in domain " << dom << endl;
+        i++;
+      }
+    }
+
+    // root
+    Bucket *root = new ListBucket(2);
+    for (int i=0; i<ndom; i++) {
+      //cerr << "dom " << i << " w " << domain[i]->get_weight() << endl;
+      root->add_item(domid[i], domain[i]->get_weight());
+    }
+    int nroot = osdmap.crush.add_bucket(root);
+
+    // rules
+    for (int i=1; i<=ndom; i++) {
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    }
+
+    // test
+    //vector<int> out;
+    //osdmap.pg_to_osds(0x40200000110ULL, out);
+
+  } else {
+    // one bucket
+    Bucket *b = new UniformBucket(1, 0);
+    int root = osdmap.crush.add_bucket(b);
+    for (int i=0; i<g_conf.num_osd; i++) {
+      osdmap.osds.insert(i);
+      b->add_item(i, 1.0);
+    }
+
+    for (int i=1; i<=g_conf.osd_max_rep; i++) {
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0));
+      osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    }
+  }
+
+  if (g_conf.mds_local_osd) {
+    // add mds osds, but don't put them in the crush mapping func
+    for (int i=0; i<g_conf.num_mds; i++)
+      osdmap.osds.insert(i+10000);
+  }
+
+  // </HACK>
+
+  // fake osd failures
+  for (map<int,float>::iterator i = g_fake_osd_down.begin();
+       i != g_fake_osd_down.end();
+       i++) {
+    dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl;
+    g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1));
+  }
+  for (map<int,float>::iterator i = g_fake_osd_out.begin();
+       i != g_fake_osd_out.end();
+       i++) {
+    dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl;
+    g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0));
+  }
+}
+
+
+// Load the encoded full map for `epoch` from the local store into `bl`.
+// Returns false when the store has no such object; a stored object that
+// reads back empty or with an error trips an assertion.
+bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl)
+{
+  object_t oid(Monitor::INO_OSD_MAP, epoch);
+
+  if (mon->store->exists(oid)) {
+    int got = mon->store->read(oid, 0, 0, bl);
+    assert(got > 0);
+    return true;
+  }
+  return false;
+}
+
+// Load the encoded incremental map for `epoch` from the local store into
+// `bl`.  Returns false when the store has no such object; a stored object
+// that reads back empty or with an error trips an assertion.
+bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl)
+{
+  object_t oid(Monitor::INO_OSD_INC_MAP, epoch);
+
+  if (mon->store->exists(oid)) {
+    int got = mon->store->read(oid, 0, 0, bl);
+    assert(got > 0);
+    return true;
+  }
+  return false;
+}
+
+
+// Persist the current full map: the encoded map is written both to the
+// epoch-0 object and to the object for the current epoch in a single
+// transaction, followed by a store sync.
+void OSDMonitor::save_map()
+{
+  bufferlist encoded;
+  osdmap.encode(encoded);
+
+  ObjectStore::Transaction txn;
+  // epoch-0 slot
+  txn.write(object_t(Monitor::INO_OSD_MAP,0), 0, encoded.length(), encoded);
+  // slot for this epoch
+  txn.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, encoded.length(), encoded);
+
+  mon->store->apply_transaction(txn);
+  mon->store->sync();
+}
+
+// Persist the current full map together with the incremental `inc`: the full
+// map goes to the epoch-0 object and to the current epoch's object, the
+// incremental to the current epoch's incremental object, all in one
+// transaction followed by a store sync.
+void OSDMonitor::save_inc_map(OSDMap::Incremental &inc)
+{
+  bufferlist full;
+  osdmap.encode(full);
+
+  bufferlist delta;
+  inc.encode(delta);
+
+  ObjectStore::Transaction txn;
+  txn.write(object_t(Monitor::INO_OSD_MAP,0), 0, full.length(), full);
+  // not strictly needed??
+  txn.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, full.length(), full);
+  txn.write(object_t(Monitor::INO_OSD_INC_MAP,osdmap.get_epoch()), 0, delta.length(), delta);
+
+  mon->store->apply_transaction(txn);
+  mon->store->sync();
+}
+
+
+
+// Route an osd-service message to its handler according to the message type:
+// osd requests (getmap / failure / boot / in / out) and the monitor-to-monitor
+// replication messages.  Unknown types trip an assertion.
+// NOTE(review): there is no MSG_SHUTDOWN case here, although
+// Monitor::dispatch forwards osd shutdowns to this function.
+void OSDMonitor::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+  // services
+  case MSG_OSD_GETMAP:   handle_osd_getmap((MOSDGetMap*)m);   break;
+  case MSG_OSD_FAILURE:  handle_osd_failure((MOSDFailure*)m); break;
+  case MSG_OSD_BOOT:     handle_osd_boot((MOSDBoot*)m);       break;
+  case MSG_OSD_IN:       handle_osd_in((MOSDIn*)m);           break;
+  case MSG_OSD_OUT:      handle_osd_out((MOSDOut*)m);         break;
+
+  // replication
+  case MSG_MON_OSDMAP_INFO:
+    handle_info((MMonOSDMapInfo*)m);
+    break;
+  case MSG_MON_OSDMAP_LEASE:
+    handle_lease((MMonOSDMapLease*)m);
+    break;
+  case MSG_MON_OSDMAP_LEASE_ACK:
+    handle_lease_ack((MMonOSDMapLeaseAck*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_PREPARE:
+    handle_update_prepare((MMonOSDMapUpdatePrepare*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_ACK:
+    handle_update_ack((MMonOSDMapUpdateAck*)m);
+    break;
+  case MSG_MON_OSDMAP_UPDATE_COMMIT:
+    handle_update_commit((MMonOSDMapUpdateCommit*)m);
+    break;
+
+  default:
+    assert(0);
+  }
+}
+
+
+
+// Handle a failure report about an osd.
+// The report is accepted when the osd is currently up and either has no
+// recorded instance or its recorded instance equals the reported one.  An
+// accepted report marks the osd down (and, if it is "in", starts its
+// down->out clock), commits a new epoch, and sends the update to the
+// reporter, to queued map requesters and to the MDSs.
+// The message is freed in every case.
+void OSDMonitor::handle_osd_failure(MOSDFailure *m)
+{
+  dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl;
+
+  // FIXME
+  // take their word for it
+  int failed = m->get_failed().num();
+
+  // stale: the osd is not up, or it is known under a different instance
+  const bool stale =
+    !osdmap.is_up(failed) ||
+    (osdmap.osd_inst.count(failed) != 0 &&
+     !(osdmap.osd_inst[failed] == m->get_inst()));
+
+  if (!stale) {
+    pending_inc.new_down[failed] = m->get_inst();
+
+    if (osdmap.is_in(failed)) {
+      down_pending_out[failed] = g_clock.now();
+    }
+
+    //awaiting_maps[pending_inc.epoch][m->get_source()] =
+
+    accept_pending();
+
+    send_incremental(m->get_epoch(), m->get_source(), m->get_source_inst());
+
+    send_waiting();
+    bcast_latest_mds();
+  }
+
+  delete m;
+}
+
+
+// Testing aid: under the lock, mark `osd` DOWN (down == true) or OUT
+// (down == false), commit the change as a new epoch and broadcast it to the
+// osds and the MDSs.
+void OSDMonitor::fake_osd_failure(int osd, bool down)
+{
+  lock.Lock();
+
+  if (!down) {
+    dout(1) << "fake_osd_failure OUT osd" << osd << endl;
+    pending_inc.new_out.push_back(osd);
+  } else {
+    dout(1) << "fake_osd_failure DOWN osd" << osd << endl;
+    pending_inc.new_down[osd] = osdmap.osd_inst[osd];
+  }
+
+  accept_pending();
+  bcast_latest_osd();
+  bcast_latest_mds();
+
+  lock.Unlock();
+}
+
+
+// Handle a boot announcement from an osd.
+// While the map is at epoch 0 the instance is only recorded; once every osd
+// in osdmap.osds has an instance the epoch is bumped, the map is saved and
+// broadcast to osds and MDSs.
+// After epoch 0: an osd that is still marked up is first marked down in its
+// own epoch, then the osd is marked up (and in, if it was out) in a new
+// epoch; the incremental is sent to the booting osd and to the MDSs.
+// The message is released on every path (the epoch-0 path used to leak it).
+void OSDMonitor::handle_osd_boot(MOSDBoot *m)
+{
+  dout(7) << "osd_boot from " << m->get_source() << endl;
+  assert(m->get_source().is_osd());
+  int from = m->get_source().num();
+
+  if (osdmap.get_epoch() == 0) {
+    // waiting for boot!
+    osdmap.osd_inst[from] = m->get_source_inst();
+
+    if (osdmap.osd_inst.size() == osdmap.osds.size()) {
+      dout(-7) << "osd_boot all osds booted." << endl;
+      osdmap.inc_epoch();
+
+      save_map();
+
+      pending_inc.epoch = osdmap.get_epoch()+1;  // 2
+
+      bcast_latest_osd();
+      bcast_latest_mds();
+    } else {
+      dout(7) << "osd_boot waiting for "
+              << (osdmap.osds.size() - osdmap.osd_inst.size())
+              << " osds to boot" << endl;
+    }
+
+    // the instance was copied into the map above; we own the message.
+    delete m;
+    return;
+  }
+
+  // already up?  mark down first?
+  if (osdmap.is_up(from)) {
+    pending_inc.new_down[from] = osdmap.osd_inst[from];
+    accept_pending();
+  }
+
+  // mark up.
+  down_pending_out.erase(from);
+  assert(osdmap.is_down(from));
+  pending_inc.new_up[from] = m->get_source_inst();
+
+  // mark in?
+  if (osdmap.out_osds.count(from))
+    pending_inc.new_in.push_back(from);
+
+  accept_pending();
+
+  // the booting osd will spread word
+  send_incremental(m->sb.current_epoch, m->get_source(), m->get_source_inst());
+  delete m;
+
+  // tell mds
+  bcast_latest_mds();
+}
+
+// Handle an osd asking to be marked "in": queue the change if the osd is
+// currently out, commit a new epoch and send the sender the incremental
+// since its map_epoch.
+// The message is released afterwards (it used to be leaked).
+// NOTE(review): unlike handle_osd_out(), a new epoch is committed even when
+// the osd was already in - confirm whether that is intended.
+void OSDMonitor::handle_osd_in(MOSDIn *m)
+{
+  dout(7) << "osd_in from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  if (osdmap.is_out(from))
+    pending_inc.new_in.push_back(from);
+  accept_pending();
+  send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+
+  delete m;
+}
+
+// Handle an osd asking to be marked "out": if the osd is currently in, queue
+// the change, commit a new epoch and send the sender the incremental since
+// its map_epoch.  Nothing happens when the osd is already out.
+// The message is released afterwards (it used to be leaked).
+void OSDMonitor::handle_osd_out(MOSDOut *m)
+{
+  dout(7) << "osd_out from " << m->get_source() << endl;
+  int from = m->get_source().num();
+  if (osdmap.is_in(from)) {
+    pending_inc.new_out.push_back(from);
+    accept_pending();
+    send_incremental(m->map_epoch, m->get_source(), m->get_source_inst());
+  }
+
+  delete m;
+}
+
+// Handle a request for the osd map.  While the map is still at epoch 0 the
+// requester (instance and "since" epoch) is queued in awaiting_map;
+// otherwise the incrementals after m->get_since() are sent right away.
+// The message is freed in both cases.
+void OSDMonitor::handle_osd_getmap(MOSDGetMap *m)
+{
+  dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl;
+
+  if (osdmap.get_epoch() != 0) {
+    //if (m->get_since())
+    send_incremental(m->get_since(), m->get_source(), m->get_source_inst());
+    //else
+    //send_full(m->get_source(), m->get_source_inst());
+  } else {
+    // no map yet: remember who asked, and from which epoch
+    pair<entity_inst_t,epoch_t>& waiter = awaiting_map[m->get_source()];
+    waiter.first = m->get_source_inst();
+    waiter.second = m->get_since();
+  }
+
+  delete m;
+}
+
+
+
+// Commit pending_inc as a new map epoch: stamp it, apply it to osdmap, save
+// the map and the incremental, log each up/down/in/out change (telling the
+// messenger about up and down instances), and start a fresh pending_inc for
+// the following epoch.
+void OSDMonitor::accept_pending()
+{
+  dout(-10) << "accept_pending " << osdmap.get_epoch() << " -> " << pending_inc.epoch << endl;
+
+  // stamp the incremental
+  pending_inc.ctime = g_clock.now();
+  pending_inc.mon_epoch = mon->mon_epoch;
+
+  // advance, then persist
+  osdmap.apply_incremental(pending_inc);
+  save_inc_map( pending_inc );
+
+  // tell me about it
+  map<int,entity_inst_t>::iterator up = pending_inc.new_up.begin();
+  while (up != pending_inc.new_up.end()) {
+    dout(0) << "osd" << up->first << " UP " << up->second << endl;
+    derr(0) << "osd" << up->first << " UP " << up->second << endl;
+    messenger->mark_up(MSG_ADDR_OSD(up->first), up->second);
+    up++;
+  }
+
+  map<int,entity_inst_t>::iterator dn = pending_inc.new_down.begin();
+  while (dn != pending_inc.new_down.end()) {
+    dout(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    derr(0) << "osd" << dn->first << " DOWN " << dn->second << endl;
+    messenger->mark_down(MSG_ADDR_OSD(dn->first), dn->second);
+    dn++;
+  }
+
+  list<int>::iterator in = pending_inc.new_in.begin();
+  while (in != pending_inc.new_in.end()) {
+    dout(0) << "osd" << *in << " IN" << endl;
+    derr(0) << "osd" << *in << " IN" << endl;
+    in++;
+  }
+
+  list<int>::iterator out = pending_inc.new_out.begin();
+  while (out != pending_inc.new_out.end()) {
+    dout(0) << "osd" << *out << " OUT" << endl;
+    derr(0) << "osd" << *out << " OUT" << endl;
+    out++;
+  }
+
+  // clear new pending
+  OSDMap::Incremental fresh(osdmap.get_epoch() + 1);
+  pending_inc = fresh;
+}
+
+// Send every requester queued in awaiting_map the incrementals after the
+// epoch it asked from.  The queue itself is left untouched.
+void OSDMonitor::send_waiting()
+{
+  dout(10) << "send_waiting " << osdmap.get_epoch() << endl;
+
+  map<msg_addr_t,pair<entity_inst_t,epoch_t> >::iterator w = awaiting_map.begin();
+  while (w != awaiting_map.end()) {
+    // key: requester; value: (instance, since-epoch)
+    send_incremental(w->second.second, w->first, w->second.first);
+    w++;
+  }
+}
+
+
+// Send a full copy of the current osd map to `who` at instance `inst`.
+void OSDMonitor::send_full(msg_addr_t who, const entity_inst_t& inst)
+{
+  MOSDMap *reply = new MOSDMap(&osdmap);
+  messenger->send_message(reply, who, inst);
+}
+
+// Send `dest` every map epoch after `since`, up to the current epoch.
+// For each epoch the stored incremental is preferred; the stored full map is
+// the fallback.  An epoch for which neither exists trips an assertion.
+void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst)
+{
+  dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch()
+          << " to " << dest << endl;
+
+  MOSDMap *reply = new MOSDMap;
+
+  // walk backwards from the current epoch down to since+1
+  epoch_t e = osdmap.get_epoch();
+  while (e > since) {
+    bufferlist bl;
+    if (get_inc_map_bl(e, bl)) {
+      dout(10) << "osd_send_incremental inc " << e << endl;
+      reply->incremental_maps[e] = bl;
+    } else if (get_map_bl(e, bl)) {
+      dout(10) << "osd_send_incremental full " << e << endl;
+      reply->maps[e] = bl;
+    } else {
+      assert(0);  // we should have all maps.
+    }
+    e--;
+  }
+
+  messenger->send_message(reply, dest, inst);
+}
+
+
+
+// Send the newest epoch (as an incremental since epoch-1) to every MDS in
+// the mds map that is not marked down.
+void OSDMonitor::bcast_latest_mds()
+{
+  epoch_t latest = osdmap.get_epoch();
+  dout(1) << "bcast_latest_mds epoch " << latest << endl;
+
+  set<int>::iterator mds = mon->mdsmon->mdsmap.get_mds().begin();
+  while (mds != mon->mdsmon->mdsmap.get_mds().end()) {
+    if (!mon->mdsmon->mdsmap.is_down(*mds)) {
+      send_incremental(latest - 1, MSG_ADDR_MDS(*mds), mon->mdsmon->mdsmap.get_inst(*mds));
+    }
+    ++mds;
+  }
+}
+
+// Send the newest epoch (as an incremental since epoch-1) to every OSD
+// known to the osdmap that is not marked down.
+void OSDMonitor::bcast_latest_osd()
+{
+  epoch_t latest = osdmap.get_epoch();
+  dout(1) << "bcast_latest_osd epoch " << latest << endl;
+
+  set<int> all_osds;
+  osdmap.get_all_osds(all_osds);
+
+  set<int>::iterator osd = all_osds.begin();
+  while (osd != all_osds.end()) {
+    if (!osdmap.is_down(*osd)) {
+      send_incremental(latest - 1, MSG_ADDR_OSD(*osd), osdmap.get_inst(*osd));
+    }
+    ++osd;
+  }
+}
+
+
+
+// Periodic check: any osd that has sat in down_pending_out for at least
+// g_conf.mon_osd_down_out_interval seconds is moved into the pending
+// incremental's new_out list.  If anything was moved, the pending
+// incremental is accepted and the new map is broadcast to the osds.
+void OSDMonitor::tick()
+{
+  utime_t now = g_clock.now();
+
+  // collect first; down_pending_out is modified only after the scan.
+  list<int> expired;
+  map<int,utime_t>::iterator p = down_pending_out.begin();
+  while (p != down_pending_out.end()) {
+    utime_t elapsed = now;
+    elapsed -= p->second;
+
+    if (elapsed.sec() >= g_conf.mon_osd_down_out_interval) {
+      dout(10) << "tick marking osd" << p->first << " OUT after " << elapsed << " sec" << endl;
+      expired.push_back(p->first);
+    }
+    ++p;
+  }
+
+  if (expired.empty())
+    return;
+
+  for (list<int>::iterator o = expired.begin(); o != expired.end(); ++o) {
+    down_pending_out.erase(*o);
+    pending_inc.new_out.push_back(*o);
+  }
+
+  accept_pending();
+
+  // hrmpf.  bcast map for now.  FIXME FIXME.
+  bcast_latest_osd();
+}
+
+void OSDMonitor::election_starting() // the class declaration annotates this as "abort whatever"; the body currently only logs
+{
+ dout(10) << "election_starting" << endl; // no state is modified here
+}
+
+// Reset our state once an election has completed.
+//  - leader with a single-monitor monmap: go straight to STATE_SYNC.
+//  - peon: report our map epoch and mon_epoch to the leader.
+// In every other case the state is left at STATE_INIT.
+void OSDMonitor::election_finished()
+{
+  // (the original logged "election_starting" here, a copy/paste slip that
+  // made the two hooks indistinguishable in the log.)
+  dout(10) << "election_finished" << endl;
+
+  state = STATE_INIT;
+
+  if (mon->is_leader()) {
+    // leader.
+    if (mon->monmap->num_mon == 1) {
+      // hmm, it's just me!
+      state = STATE_SYNC;
+    }
+  }
+  else if (mon->is_peon()) {
+    // peon.  send info
+    messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch),
+			    MSG_ADDR_MON(mon->leader), mon->monmap->get_inst(mon->leader));
+  }
+
+}
+
+
+
+// Handle a map-info report (epoch + mon_epoch) from another monitor.
+//
+// If the reported epoch was produced under a different mon_epoch than our
+// copy of that epoch, the sender's copy is treated as divergent and we
+// back up one epoch.  Finally, if the sender is behind us, it is sent the
+// maps it is missing.  Takes ownership of m.
+void OSDMonitor::handle_info(MMonOSDMapInfo *m)
+{
+  dout(10) << "handle_info from " << m->get_source()
+	   << " epoch " << m->get_epoch() << " in mon_epoch " << m->get_mon_epoch()
+	   << endl;
+
+  epoch_t epoch = m->get_epoch();
+
+  // did they have anything?
+  if (epoch > 0) {
+    // make sure it's current.
+    if (epoch == osdmap.get_epoch()) {
+      if (osdmap.mon_epoch != m->get_mon_epoch()) {
+	dout(10) << "handle_info had divergent epoch " << m->get_epoch()
+		 << ", mon_epoch " << m->get_mon_epoch() << " != " << osdmap.mon_epoch << endl;
+	epoch--;
+      }
+    } else {
+      bufferlist bl;
+      // Only decode if we actually have a full map stored for that epoch;
+      // previously the result was ignored and an empty buffer was decoded.
+      if (get_map_bl(epoch, bl)) {
+	OSDMap old;
+	old.decode(bl);
+
+	if (old.mon_epoch != m->get_mon_epoch()) {
+	  dout(10) << "handle_info had divergent epoch " << m->get_epoch()
+		   << ", mon_epoch " << m->get_mon_epoch() << " != " << old.mon_epoch << endl;
+	  epoch--;
+	}
+      } else {
+	dout(10) << "handle_info no full map stored for epoch " << epoch
+		 << ", skipping divergence check" << endl;
+      }
+    }
+  }
+
+  // bring up to date
+  if (epoch < osdmap.get_epoch()) {
+    send_incremental(epoch, m->get_source(), m->get_source_inst());
+  }
+
+  delete m;
+}
+
+
+// [leader] Start a new lease round: the lease runs for g_conf.mon_lease
+// from now, and every other quorum member is sent the current epoch and
+// expiry.  pending_ack is reset to exactly the monitors we sent to.
+void OSDMonitor::issue_leases()
+{
+  dout(10) << "issue_leases" << endl;
+  assert(mon->is_leader());
+
+  // set lease endpoint
+  lease_expire = g_clock.now();
+  lease_expire += g_conf.mon_lease;
+
+  pending_ack.clear();
+
+  set<int>::iterator peer = mon->quorum.begin();
+  while (peer != mon->quorum.end()) {
+    if (*peer != mon->whoami) {
+      messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire),
+			      MSG_ADDR_MON(*peer), mon->monmap->get_inst(*peer));
+      pending_ack.insert(*peer);
+    }
+    ++peer;
+  }
+}
+
+// Handle a lease message: adopt the expiry time it carries.
+// A lease for an unexpected epoch is treated as fatal.  Takes ownership of m.
+void OSDMonitor::handle_lease(MMonOSDMapLease *m)
+{
+  // NOTE(review): issue_leases() stamps leases with osdmap.get_epoch(),
+  // while this check expects get_epoch() + 1 -- confirm which side is
+  // intended before relying on this path.
+  if (m->get_epoch() != osdmap.get_epoch() + 1) {
+    dout(10) << "map_lease from " << m->get_source()
+	     << " on epoch " << m->get_epoch() << ", but i am " << osdmap.get_epoch() << endl;
+    assert(0);
+    delete m;
+    return;
+  }
+
+  // Store the new expiry first so the log line reports the lease we just
+  // received rather than the previous one.
+  lease_expire = m->get_lease_expire();
+  dout(10) << "map_lease from " << m->get_source() << " expires " << lease_expire << endl;
+
+  delete m;
+}
+
+// Handle a lease ack.  Acks for another epoch are dropped; an ack that
+// arrives at or after lease_expire triggers an election.  Otherwise the
+// sender is removed from pending_ack.  Takes ownership of m.
+void OSDMonitor::handle_lease_ack(MMonOSDMapLeaseAck *m)
+{
+  // right epoch?
+  const bool epoch_matches = (m->get_epoch() == osdmap.get_epoch());
+  if (!epoch_matches) {
+    dout(10) << "map_lease_ack from " << m->get_source()
+	     << " on old epoch " << m->get_epoch() << ", dropping" << endl;
+    delete m;
+    return;
+  }
+
+  // within time limit?
+  if (g_clock.now() >= lease_expire) {
+    dout(10) << "map_lease_ack from " << m->get_source()
+	     << ", but lease expired, calling election" << endl;
+    mon->call_election();
+    delete m;
+    return;
+  }
+
+  assert(m->get_source().is_mon());
+  int sender = m->get_source().num();
+
+  // the ack must come from someone we are still waiting on.
+  assert(pending_ack.count(sender));
+  pending_ack.erase(sender);
+
+  if (!pending_ack.empty()) {
+    dout(10) << "map_lease_ack from " << m->get_source()
+	     << ", still waiting on " << pending_ack << endl;
+  } else {
+    dout(10) << "map_lease_ack from " << m->get_source()
+	     << ", last one" << endl;
+  }
+
+  delete m;
+}
+
+
+// [leader] Push the current epoch to the rest of the quorum: enter
+// STATE_UPDATING, extend the lease by g_conf.mon_lease, and send every
+// other quorum member an update-prepare carrying the incremental for the
+// current epoch (or the full map when no incremental is available).
+// pending_ack is reset to the set of monitors we sent to.
+void OSDMonitor::update_map()
+{
+  // lock map
+  state = STATE_UPDATING;
+  pending_ack.clear();
+
+  // set lease endpoint
+  lease_expire += g_conf.mon_lease;
+
+  // pick the payload
+  epoch_t epoch = osdmap.get_epoch();
+  bufferlist map_bl, inc_map_bl;
+  bool have_inc = get_inc_map_bl(epoch, inc_map_bl);
+  if (!have_inc) {
+    get_map_bl(epoch, map_bl);
+  }
+
+  // send prepare
+  set<int>::iterator peer = mon->quorum.begin();
+  while (peer != mon->quorum.end()) {
+    if (*peer != mon->whoami) {
+      messenger->send_message(new MMonOSDMapUpdatePrepare(epoch,
+							  map_bl, inc_map_bl),
+			      MSG_ADDR_MON(*peer), mon->monmap->get_inst(*peer));
+      pending_ack.insert(*peer);
+    }
+    ++peer;
+  }
+}
+
+
+
+// Apply a map update sent in an update-prepare message.  The message must
+// be for exactly our epoch + 1.  A non-empty incremental is decoded into
+// pending_inc and accepted; otherwise the full map replaces osdmap.  We
+// then enter STATE_LOCK and ack with our resulting epoch.
+// Takes ownership of m.
+void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m)
+{
+  dout(10) << "map_update_prepare from " << m->get_source() << " epoch " << m->get_epoch() << endl;
+
+  // accept map
+  assert(m->get_epoch() == osdmap.get_epoch() + 1);
+
+  const bool has_incremental = m->inc_map_bl.length();
+  if (!has_incremental) {
+    osdmap.decode(m->map_bl);
+  } else {
+    int off = 0;
+    pending_inc.decode(m->inc_map_bl, off);
+    accept_pending();
+  }
+
+  // state
+  state = STATE_LOCK;
+  //lease_expire = m->lease_expire;
+
+  // ack
+  MMonOSDMapUpdateAck *ack = new MMonOSDMapUpdateAck(osdmap.get_epoch());
+  messenger->send_message(ack, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+// Handle an update ack.  The real logic is still disabled (kept below for
+// reference); for now the message is simply consumed.  Every other handler
+// in this file frees its message, so the stub must too or each ack leaks.
+void OSDMonitor::handle_update_ack(MMonOSDMapUpdateAck *m)
+{
+  /*
+  // right epoch?
+  if (m->get_epoch() != osdmap.get_epoch()) {
+    dout(10) << "map_update_ack from " << m->get_source()
+	     << " on old epoch " << m->get_epoch() << ", dropping" << endl;
+    delete m;
+    return;
+  }
+
+  // within time limit?
+  if (g_clock.now() >= lease_expire) {
+    dout(10) << "map_update_ack from " << m->get_source()
+	     << ", but lease expired, calling election" << endl;
+    state = STATE_SYNC;
+    mon->call_election();
+    return;
+  }
+
+  assert(m->get_source().is_mon());
+  int from = m->get_source().num();
+
+  assert(pending_lease_ack.count(from));
+  pending_lease_ack.erase(from);
+
+  if (pending_lease_ack.empty()) {
+    dout(10) << "map_update_ack from " << m->get_source()
+	     << ", last one" << endl;
+    state = STATE_SYNC;
+
+    // send lease commit
+    for (map<int>::iterator i = mon->quorum.begin();
+	 i != mon->quorum.end();
+	 i++) {
+      if (i == mon->whoami) continue;
+      messenger->send_message(new MMonOSDMapLeaseCommit(osdmap),
+			      MSG_ADDR_MON(*i), mon->monmap->get_inst(*i));
+    }
+  } else {
+    dout(10) << "map_update_ack from " << m->get_source()
+	     << ", still waiting on " << pending_lease_ack << endl;
+  }
+*/
+
+  delete m;
+}
+
+// Handle an update commit.  Not implemented yet; the message is consumed
+// so that it does not leak (the other handlers in this file free theirs).
+void OSDMonitor::handle_update_commit(MMonOSDMapUpdateCommit *m)
+{
+  delete m;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __OSDMONITOR_H
+#define __OSDMONITOR_H
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/types.h"
+#include "msg/Messenger.h"
+
+#include "osd/OSDMap.h"
+
+class Monitor;
+
+class OSDMonitor : public Dispatcher { // monitor-side owner of the osd map: stores it, updates it, and sends it out
+ Monitor *mon; // enclosing monitor; consulted for quorum, leader/peon role and the monmap
+ Messenger *messenger; // all outgoing messages go through this
+ Mutex &lock; // mutex handed in by the creator; not taken by the methods visible in this diff
+
+ // osd maps
+public:
+ OSDMap osdmap; // the current map
+
+private:
+ map<msg_addr_t, pair<entity_inst_t, epoch_t> > awaiting_map; // requester -> (instance, epoch to send from); walked by send_waiting()
+
+ void create_initial();
+ bool get_map_bl(epoch_t epoch, bufferlist &bl); // fills bl with the full map for epoch; callers treat false as "not available"
+ bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); // same, for the incremental that produced epoch
+
+ void save_map();
+ void save_inc_map(OSDMap::Incremental &inc);
+
+ // [leader]
+ OSDMap::Incremental pending_inc; // changes accumulated for the next epoch; applied by accept_pending()
+ map<int,utime_t> down_pending_out; // osd down -> out; value is compared against now in tick()
+
+ set<int> pending_ack; // monitors we have sent a lease/update to and not yet heard back from
+
+ // we are distributed
+ const static int STATE_INIT = 0; // startup
+ const static int STATE_SYNC = 1; // sync map copy (readonly)
+ const static int STATE_LOCK = 2; // [peon] map locked
+ const static int STATE_UPDATING = 3; // [leader] map locked, waiting for peon ack
+
+ int state; // one of the STATE_* values above
+ utime_t lease_expire; // when lease expires
+
+ void init();
+
+ // maps
+ void accept_pending(); // accept pending, new map.
+ void send_waiting(); // send current map to waiters.
+ void send_full(msg_addr_t dest, const entity_inst_t& inst); // send the whole current map
+ void send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst); // send every epoch in (since, current]
+ void bcast_latest_mds(); // newest epoch to every mds not marked down
+ void bcast_latest_osd(); // newest epoch to every osd not marked down
+
+ void update_map(); // [leader] send update-prepare for the current epoch to the quorum
+
+ void handle_osd_boot(class MOSDBoot *m);
+ void handle_osd_in(class MOSDIn *m);
+ void handle_osd_out(class MOSDOut *m);
+ void handle_osd_failure(class MOSDFailure *m);
+ void handle_osd_getmap(class MOSDGetMap *m);
+
+ void handle_info(class MMonOSDMapInfo*);
+ void handle_lease(class MMonOSDMapLease*);
+ void handle_lease_ack(class MMonOSDMapLeaseAck*);
+ void handle_update_prepare(class MMonOSDMapUpdatePrepare*);
+ void handle_update_ack(class MMonOSDMapUpdateAck*);
+ void handle_update_commit(class MMonOSDMapUpdateCommit*);
+
+ public:
+ OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : // starts in STATE_SYNC and runs init() immediately
+ mon(mn), messenger(m), lock(l),
+ state(STATE_SYNC) {
+ init();
+ }
+
+ void dispatch(Message *m);
+ void tick(); // check state, take actions
+
+ void election_starting(); // abort whatever.
+ void election_finished(); // reinitialize whatever.
+
+ void issue_leases(); // [leader] asserts is_leader(); sends a fresh lease to the rest of the quorum
+
+ void fake_osd_failure(int osd, bool down);
+ void fake_osdmap_update();
+ void fake_reorg();
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "Dispatcher.h"
+#include "Messenger.h"
+
+#include "mds/MDS.h"
+
+/*
+int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port)
+{
+ assert(0);
+ //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port!
+}
+*/
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __DISPATCHER_H
+#define __DISPATCHER_H
+
+#include "Message.h"
+
+class Messenger;
+
+// Interface implemented by anything that receives messages from a
+// Messenger.
+class Dispatcher {
+ public:
+  virtual ~Dispatcher() { }
+
+  // Entry point for every incoming message.
+  virtual void dispatch(Message *m) = 0;
+
+  // Called when a message could not be delivered.  The default simply
+  // discards the message.
+  virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) {
+    delete m;
+  }
+
+  // Address lookup hook.  The default is fatal: implementations that can
+  // be asked must override it.
+  virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst) {
+    assert(0);
+    return false;
+  }
+
+  // this is how i send messages
+  //int send_message(Message *m, msg_addr_t dest, int dest_port);
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "Message.h"
+#include "FakeMessenger.h"
+#include "mds/MDS.h"
+
+#include "common/Timer.h"
+
+#include "common/LogType.h"
+#include "common/Logger.h"
+
+#include "config.h"
+
+#undef dout
+#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " "
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <map>
+#include <cassert>
+#include <iostream>
+
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include <pthread.h>
+
+
+// global queue.
+
+int nranks = 0; // next rank to hand out; each new FakeMessenger takes one as its entity_inst_t rank
+
+map<int, FakeMessenger*> directory; // rank -> messenger; filled by the constructor, emptied via shutdown_set
+hash_map<int, Logger*> loggers; // only referenced from LOG_MESSAGES / commented-out code in this file
+LogType fakemsg_logtype;
+
+set<int> shutdown_set; // ranks whose messenger called shutdown(); removed from directory at the end of a loop pass
+
+Mutex lock; // taken around accesses to directory and shutdown_set in this file
+Cond cond; // signalled to wake the loop thread
+
+bool pending_timer = false; // set by the kick callbacks; its only reader is currently commented out
+
+bool awake = false; // false while the loop thread is blocked in cond.Wait()
+bool fm_shutdown = false; // asks the loop thread to exit
+pthread_t thread_id; // loop thread created by fakemessenger_startthread()
+
+
+
+// Timer callback that pokes the fakemessenger loop thread.
+class C_FakeKicker : public Context {
+  void finish(int r) {
+    dout(18) << "timer kick" << endl;
+    // pending_timer is shared with the loop thread, so set it while
+    // holding the lock rather than before taking it.
+    lock.Lock();
+    pending_timer = true;
+    cond.Signal();  // why not
+    lock.Unlock();
+  }
+};
+
+// Wake the loop thread so that it runs pending callbacks.
+void FakeMessenger::callback_kick()
+{
+  // pending_timer is shared state: write it under the lock, not before it.
+  lock.Lock();
+  pending_timer = true;
+  cond.Signal();  // why not
+  lock.Unlock();
+}
+
+void *fakemessenger_thread(void *ptr) // body of the loop thread: sleep on cond, run one delivery pass per wakeup; ptr is unused
+{
+ //dout(1) << "thread start, setting timer kicker" << endl;
+ //g_timer.set_messenger_kicker(new C_FakeKicker());
+ //msgr_callback_kicker = new C_FakeKicker();
+
+ lock.Lock(); // held for the whole loop except inside cond.Wait() and where do_loop_2 drops it
+ while (1) {
+ dout(20) << "thread waiting" << endl;
+ if (fm_shutdown) break;
+ awake = false; // senders signal cond only while this is false
+ cond.Wait(lock); // NOTE(review): no predicate is re-checked here, so a signal sent before we wait is not seen -- confirm callers tolerate this
+ awake = true;
+ dout(20) << "thread woke up" << endl;
+ if (fm_shutdown) break;
+
+ fakemessenger_do_loop_2(); // deliver until no messenger has anything queued
+
+ if (directory.empty()) break; // every messenger has shut down
+ }
+ lock.Unlock();
+
+ //cout << "unsetting messenger" << endl;
+ //g_timer.unset_messenger_kicker();
+ //g_timer.unset_messenger();
+ //msgr_callback_kicker = 0;
+
+ dout(1) << "thread finish (i woke up but no messages, bye)" << endl;
+ return 0;
+}
+
+
+// Start the fakemessenger loop thread.  Failure to create it is fatal:
+// fakemessenger_wait() would otherwise join an uninitialised thread id.
+void fakemessenger_startthread() {
+  int r = pthread_create(&thread_id, NULL, fakemessenger_thread, 0);
+  assert(r == 0);
+}
+
+// Ask the loop thread to exit and block until it has done so.
+void fakemessenger_stopthread() {
+  cout << "fakemessenger_stopthread setting stop flag" << endl;
+
+  // raise the flag under the lock ...
+  lock.Lock();
+  fm_shutdown = true;
+  lock.Unlock();
+
+  // ... then wake the thread so that it notices.
+  cond.Signal();
+
+  fakemessenger_wait();
+}
+
+// Block until the loop thread has exited.  Its return value is not used.
+void fakemessenger_wait()
+{
+  cout << "fakemessenger_wait waiting" << endl;
+  pthread_join(thread_id, NULL);
+}
+
+
+
+
+// lame main looper
+
+// Run one full delivery pass on the calling thread (no loop thread
+// involved), then shut the global timer down.  Always returns 0.
+int fakemessenger_do_loop()
+{
+  // do_loop_2 expects to be entered with the lock held.
+  lock.Lock();
+  fakemessenger_do_loop_2();
+  lock.Unlock();
+
+  g_timer.shutdown();
+  return 0;
+}
+
+
+int fakemessenger_do_loop_2() // delivery loop; must be entered with `lock` held and returns with it held. always returns 0
+{
+ //lock.Lock();
+ dout(18) << "do_loop begin." << endl;
+
+ while (1) { // one pass = callbacks + at most one message per messenger; repeat until a pass delivers nothing
+ bool didone = false;
+
+ dout(18) << "do_loop top" << endl;
+
+ /*// timer?
+ if (pending_timer) {
+ pending_timer = false;
+ dout(5) << "pending timer" << endl;
+ g_timer.execute_pending();
+ }
+ */
+
+ // callbacks
+ lock.Unlock(); // callbacks run without the lock
+ Messenger::do_callbacks();
+ lock.Lock();
+
+ // messages
+ map<int, FakeMessenger*>::iterator it = directory.begin();
+ while (it != directory.end()) {
+ FakeMessenger *mgr = it->second;
+
+ dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has " << mgr->num_incoming() << " queued" << endl;
+
+
+ if (!mgr->is_ready()) {
+ dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has no dispatcher, skipping" << endl;
+ it++;
+ continue;
+ }
+
+ Message *m = mgr->get_message(); // NULL when this messenger's queue is empty
+ it++; // advance before dispatching: the lock is dropped below
+
+ if (m) {
+ //dout(18) << "got " << m << endl;
+ dout(1) << "---- '" << m->get_type_name()
+ << "' from " << m->get_source() // << ':' << m->get_source_port()
+ << " to " << m->get_dest() //<< ':' << m->get_dest_port()
+ << " ---- " << m
+ << endl;
+
+ if (g_conf.fakemessenger_serialize) { // optional encode/decode round trip of the message before delivery
+ // encode
+ if (m->empty_payload())
+ m->encode_payload();
+ msg_envelope_t env = m->get_envelope();
+ bufferlist bl;
+ bl.claim( m->get_payload() );
+ //bl.c_str(); // condense into 1 buffer
+
+ delete m; // the original is replaced by the decoded copy
+
+ // decode
+ m = decode_message(env, bl);
+ assert(m);
+ }
+
+ didone = true;
+
+ lock.Unlock(); // dispatch runs without the lock
+ mgr->dispatch(m);
+ lock.Lock();
+ }
+ }
+
+ // deal with shutdowns.. delayed to avoid concurrent directory modification
+ if (!shutdown_set.empty()) {
+ for (set<int>::iterator it = shutdown_set.begin();
+ it != shutdown_set.end();
+ it++) {
+ dout(7) << "fakemessenger: removing " << *it << " from directory" << endl;
+ assert(directory.count(*it));
+ directory.erase(*it);
+ if (directory.empty()) {
+ dout(1) << "fakemessenger: last shutdown" << endl;
+ ::fm_shutdown = true; // nothing left to serve: tell the loop thread to exit
+ }
+ }
+ shutdown_set.clear();
+ }
+
+ if (!didone)
+ break;
+ }
+
+
+ dout(18) << "do_loop end (no more messages)." << endl;
+ //lock.Unlock();
+ return 0;
+}
+
+
+// Create a messenger for address `me`, give it the next free rank, and
+// register it in the global directory.
+//
+// logger and qlen are initialised up front: the object becomes visible to
+// the loop thread as soon as it is in the directory, and that thread reads
+// qlen through num_incoming().  logger was previously never initialised.
+FakeMessenger::FakeMessenger(msg_addr_t me) : Messenger(me), logger(0), qlen(0)
+{
+  entity_inst_t fakeinst;
+  lock.Lock();
+  {
+    // assign rank (the rank doubles as the fake port number)
+    fakeinst.rank = nranks++;
+    fakeinst.addr.sin_port = fakeinst.rank;
+    set_myinst(fakeinst);
+
+    // add to directory
+    directory[ fakeinst.rank ] = this;
+  }
+  lock.Unlock();
+
+
+  cout << "fakemessenger " << get_myaddr() << " messenger is " << this << " at " << fakeinst << endl;
+
+  //g_timer.set_messenger(this);
+
+  /*
+  string name;
+  name = "m.";
+  name += MSG_ADDR_TYPE(myaddr);
+  int w = MSG_ADDR_NUM(myaddr);
+  if (w >= 1000) name += ('0' + ((w/1000)%10));
+  if (w >= 100) name += ('0' + ((w/100)%10));
+  if (w >= 10) name += ('0' + ((w/10)%10));
+  name += ('0' + ((w/1)%10));
+
+  loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype);
+  */
+}
+
+FakeMessenger::~FakeMessenger() // intentionally empty; NOTE(review): messages still queued in `incoming` are not freed here -- confirm the queue is always drained first
+{
+
+}
+
+
+// Request removal of this messenger from the directory.  The rank is only
+// recorded in shutdown_set here; the delivery loop does the actual erase
+// at the end of its current pass.  Always returns 0.
+int FakeMessenger::shutdown()
+{
+  //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl;
+  lock.Lock();
+
+  const int my_rank = get_myinst().rank;
+  assert(directory.count(my_rank) == 1);
+  shutdown_set.insert(my_rank);
+
+  /*
+  directory.erase(myaddr);
+  if (directory.empty()) {
+    dout(1) << "fakemessenger: last shutdown" << endl;
+    ::fm_shutdown = true;
+    cond.Signal();  // why not
+  }
+  */
+
+  /*
+  if (loggers[myaddr]) {
+    delete loggers[myaddr];
+    loggers.erase(myaddr);
+  }
+  */
+
+  lock.Unlock();
+  return 0;
+}
+
+/*
+void FakeMessenger::trigger_timer(Timer *t)
+{
+ // note timer to call
+ pending_timer = t;
+
+ // wake up thread?
+ cond.Signal(); // why not
+}
+*/
+
+// Change the address this messenger reports as its own.  The rank, and
+// therefore the directory entry, is not touched.
+void FakeMessenger::reset_myaddr(msg_addr_t m)
+{
+  dout(1) << "reset_myaddr from " << get_myaddr() << " to " << m << endl;
+
+  _set_myaddr(m);
+}
+
+
+// Queue m on the messenger registered under inst.rank and wake the loop
+// thread if it is sleeping.  The message is stamped with our address and
+// instance as its source.  An unknown destination rank is fatal.
+// Always returns 0.
+int FakeMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port, int fromport)
+{
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  //m->set_lamport_send_stamp( get_lamport() );
+
+  m->set_source_inst(get_myinst());
+
+  lock.Lock();
+
+  // deliver
+  try {
+#ifdef LOG_MESSAGES
+    // stats
+    loggers[get_myaddr()]->inc("+send",1);
+    loggers[dest]->inc("-recv",1);
+
+    char s[20];
+    sprintf(s,"+%s", m->get_type_name());
+    loggers[get_myaddr()]->inc(s);
+    sprintf(s,"-%s", m->get_type_name());
+    loggers[dest]->inc(s);
+#endif
+
+    // queue.  Look the rank up with find(): operator[] would insert a
+    // NULL entry for an unknown rank, which the delivery loop would then
+    // walk over.
+    map<int, FakeMessenger*>::iterator p = directory.find(inst.rank);
+    FakeMessenger *dm = (p == directory.end()) ? 0 : p->second;
+    if (!dm) {
+      dout(1) << "** destination " << dest << " (" << inst << ") dne" << endl;
+      assert(dm);
+    }
+    dm->queue_incoming(m);
+
+    dout(1) << "--> " << get_myaddr() << " sending " << m << " '" << m->get_type_name() << "'"
+	    << " to " << dest
+	    << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl;
+
+  }
+  catch (...) {
+    cout << "no destination " << dest << endl;
+    assert(0);
+  }
+
+
+  // wake up loop?
+  if (!awake) {
+    dout(10) << "waking up fakemessenger thread" << endl;
+    cond.Signal();
+  }
+  lock.Unlock();
+
+  return 0;
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __FAKEMESSENGER_H
+#define __FAKEMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+
+#include <list>
+#include <map>
+
+class Timer;
+
+// In-process Messenger: messages are appended to the destination
+// messenger's `incoming` list and handed to its dispatcher by the
+// fakemessenger loop.
+class FakeMessenger : public Messenger {
+ protected:
+  class Logger *logger;
+
+  int qlen;                    // element count of `incoming`, maintained by hand
+  list<Message*> incoming;     // incoming queue
+
+ public:
+  FakeMessenger(msg_addr_t me);
+  ~FakeMessenger();
+
+  virtual int shutdown();
+
+  void reset_myaddr(msg_addr_t m);
+
+  // msg interface
+  virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port=0, int fromport=0);
+
+  // events
+  //virtual void trigger_timer(Timer *t);
+
+  int get_dispatch_queue_len() { return qlen; }
+
+  void callback_kick();
+
+  // -- incoming queue --
+
+  // Pop and return the oldest queued message, or NULL if none is queued.
+  Message *get_message() {
+    if (incoming.empty())
+      return NULL;
+
+    Message *head = incoming.front();
+    incoming.pop_front();
+    --qlen;
+    return head;
+  }
+
+  // Append m to the queue.  Always returns true.
+  bool queue_incoming(Message *m) {
+    incoming.push_back(m);
+    ++qlen;
+    return true;
+  }
+
+  // Number of queued messages, from the hand-maintained counter.
+  int num_incoming() {
+    //return incoming.size();
+    return qlen;
+  }
+
+};
+
+int fakemessenger_do_loop();
+int fakemessenger_do_loop_2();
+void fakemessenger_startthread();
+void fakemessenger_stopthread();
+void fakemessenger_wait();
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "HostMonitor.h"
+
+#include "msg/Message.h"
+#include "msg/Messenger.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MFailure.h"
+#include "messages/MFailureAck.h"
+
+#include "common/Timer.h"
+#include "common/Clock.h"
+
+#define DBL 10
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: "
+
+
+// timer contexts
+
+// Timer context: when it fires, it removes itself from the monitor's
+// pending_events and starts a heartbeat round.
+class C_HM_InitiateHeartbeat : public Context {
+  HostMonitor *hm;
+public:
+  C_HM_InitiateHeartbeat(HostMonitor *monitor) : hm(monitor) { }
+
+  void finish(int r) {
+    //cout << "HEARTBEAT" << endl;
+    hm->pending_events.erase(this);
+    hm->initiate_heartbeat();
+  }
+};
+
+// Timer context: when it fires, it removes itself from the monitor's
+// pending_events and evaluates the pings that are still unanswered.
+class C_HM_CheckHeartbeat : public Context {
+  HostMonitor *hm;
+public:
+  C_HM_CheckHeartbeat(HostMonitor *monitor) : hm(monitor) { }
+
+  void finish(int r) {
+    //cout << "CHECK" << endl;
+    hm->pending_events.erase(this);
+    hm->check_heartbeat();
+  }
+};
+
+
+
+// startup/shutdown
+
+// Set the (currently hard-coded) timing parameters and schedule the first
+// heartbeat.
+void HostMonitor::init()
+{
+  dout(DBL) << "init" << endl;
+
+  // hack params for now
+  max_heartbeat_misses = 3;
+  max_ping_time = 2;
+  notify_retry_interval = 10;
+  heartbeat_interval = 10;
+
+  // schedule first hb (reads heartbeat_interval, so it comes last)
+  schedule_heartbeat();
+}
+
+
+// Cancel and free every timer event we still have outstanding.
+void HostMonitor::shutdown()
+{
+  set<Context*>::iterator ev = pending_events.begin();
+  while (ev != pending_events.end()) {
+    g_timer.cancel_event(*ev);
+    delete *ev;
+    ++ev;
+  }
+  pending_events.clear();
+}
+
+
+// schedule next heartbeat
+
+// Arm a timer that starts the next heartbeat round after
+// heartbeat_interval.  The event is tracked in pending_events.
+void HostMonitor::schedule_heartbeat()
+{
+  dout(DBL) << "schedule_heartbeat" << endl;
+
+  Context *next = new C_HM_InitiateHeartbeat(this);
+  pending_events.insert(next);
+  g_timer.add_event_after(heartbeat_interval, next);
+}
+
+
+// take note of a live host
+
+// Record that we just heard from `host`.  Hosts we do not monitor are
+// ignored.
+void HostMonitor::host_is_alive(msg_addr_t host)
+{
+  if (!hosts.count(host))
+    return;
+
+  status[host].last_heard_from = g_clock.gettime();
+}
+
+
+// do heartbeat
+
+// Start a heartbeat round: ping every monitored host we have not heard
+// from within the last heartbeat_interval, arm a timer to evaluate the
+// replies after max_ping_time, and schedule the following round.
+void HostMonitor::initiate_heartbeat()
+{
+  time_t now = g_clock.gettime();
+
+  // send out pings
+  inflight_pings.clear();
+  set<msg_addr_t>::iterator host = hosts.begin();
+  while (host != hosts.end()) {
+    // have i heard from them recently?
+    const bool heard_recently = now - status[*host].last_heard_from < heartbeat_interval;
+    if (!heard_recently) {
+      dout(DBL) << "pinging " << *host << endl;
+      status[*host].last_pinged = now;
+      inflight_pings.insert(*host);
+
+      messenger->send_message(new MPing(1), *host, 0);
+    } else {
+      dout(DBL) << "skipping " << *host << ", i heard from them recently" << endl;
+    }
+    ++host;
+  }
+
+  // set timer to check results
+  Context *check = new C_HM_CheckHeartbeat(this);
+  pending_events.insert(check);
+  g_timer.add_event_after(max_ping_time, check);
+  dout(10) << "scheduled check " << check << endl;
+
+  schedule_heartbeat();  // schedule next heartbeat
+}
+
+
+// check results
+
+// Evaluate a heartbeat round.  Every host still in inflight_pings missed
+// this beat; once a host reaches max_heartbeat_misses it is recorded as an
+// unacked failure unless it is already recorded as failed.  Notification
+// of the failure is currently disabled (see the commented block).
+void HostMonitor::check_heartbeat()
+{
+  dout(DBL) << "check_heartbeat()" << endl;
+
+  // check inflight pings
+  for (set<msg_addr_t>::iterator it = inflight_pings.begin();
+       it != inflight_pings.end();
+       ++it) {
+    const msg_addr_t &host = *it;
+    monitor_rec_t &rec = status[host];
+
+    rec.num_heartbeats_missed++;
+
+    dout(DBL) << "no response from " << host << " for " << rec.num_heartbeats_missed << " beats" << endl;
+
+    // still within tolerance?
+    if (rec.num_heartbeats_missed < max_heartbeat_misses)
+      continue;
+
+    if (acked_failures.count(host)) {
+      dout(DBL) << host << " is already failed" << endl;
+      continue;
+    }
+
+    if (unacked_failures.count(host)) {
+      dout(DBL) << host << " is already failed, but unacked, sending another failure message" << endl;
+    } else {
+      dout(DBL) << "failing " << host << endl;
+      unacked_failures.insert(host);
+    }
+
+    /*if (false)   // do this in NewMessenger for now!  FIXME
+    for (set<msg_addr_t>::iterator nit = notify.begin();
+         nit != notify.end();
+         nit++) {
+      messenger->send_message(new MFailure(*it, messenger->get_inst(*it)),
+                              *nit, notify_port, 0);
+    }
+    */
+  }
+
+  // forget about the pings.
+  inflight_pings.clear();
+}
+
+
+// incoming messages
+
+// Route an incoming message to its handler.  Only ping acks and failure
+// acks are handled; any other type is left untouched.
+void HostMonitor::proc_message(Message *m)
+{
+  const int type = m->get_type();
+
+  if (type == MSG_PING_ACK) {
+    handle_ping_ack((MPingAck*)m);
+  } else if (type == MSG_FAILURE_ACK) {
+    handle_failure_ack((MFailureAck*)m);
+  }
+}
+
+// A host answered our ping: reset its miss counter and stop waiting for
+// it in this round.  Takes ownership of m.
+void HostMonitor::handle_ping_ack(MPingAck *m)
+{
+  msg_addr_t from = m->get_source();
+
+  dout(DBL) << "ping ack from " << from << endl;
+
+  monitor_rec_t &rec = status[from];
+  // NOTE(review): this stamps last_pinged, not last_heard_from -- confirm
+  // that is the intended field.
+  rec.last_pinged = g_clock.gettime();
+  rec.num_heartbeats_missed = 0;
+
+  inflight_pings.erase(from);
+
+  delete m;
+}
+
+// Our failure notification was acknowledged: move the host from the
+// unacked set to the acked set so that we stop resending.
+// Takes ownership of m.
+//
+// FIXME: this doesn't handle failed -> alive transitions gracefully at all..
+void HostMonitor::handle_failure_ack(MFailureAck *m)
+{
+  msg_addr_t failed = m->get_failed();
+  dout(DBL) << "handle_failure_ack " << failed << endl;
+
+  acked_failures.insert(failed);
+  unacked_failures.erase(failed);
+
+  delete m;
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __HOSTMONITOR_H
+#define __HOSTMONITOR_H
+
+#include <time.h>
+
+#include <map>
+#include <set>
+using namespace std;
+
+#include "include/Context.h"
+#include "msg/Message.h"
+
+class Message;
+class Messenger;
+
+typedef struct { // per-host bookkeeping kept in HostMonitor::status
+ time_t last_heard_from; // set by host_is_alive(); compared against heartbeat_interval to decide whether to ping
+ time_t last_pinged; // set when a ping is sent and again when its ack arrives
+ int num_heartbeats_missed; // incremented per unanswered round in check_heartbeat(); reset to 0 on a ping ack
+} monitor_rec_t;
+
+// Pings a set of hosts at a fixed interval and tracks which of them have
+// stopped answering.
+class HostMonitor {
+  Messenger *messenger;
+  string whoami;                       // prefix used in this monitor's log lines
+
+  // hosts i monitor
+  set<msg_addr_t> hosts;
+
+  // who i tell when they fail
+  set<msg_addr_t> notify;
+  int notify_port;
+
+  // their status
+  map<msg_addr_t,monitor_rec_t> status;
+
+  set<msg_addr_t> inflight_pings;      // pings sent this round that have not been answered
+
+  set<msg_addr_t> unacked_failures;    // failed hosts whose failure has not been acked yet
+  set<msg_addr_t> acked_failures;      // failed hosts whose failure has been acked
+
+  float heartbeat_interval;            // how often to do a heartbeat
+  float max_ping_time;                 // how long before it's a miss
+  int max_heartbeat_misses;            // how many misses before i tell
+  float notify_retry_interval;         // how often to retry failure notification
+
+ public:
+  set<Context*> pending_events;        // outstanding timer events (the timer contexts erase themselves)
+
+ private:
+  void schedule_heartbeat();
+
+ public:
+  // The timing parameters are not set here; init() does that.
+  HostMonitor(Messenger *m, string& whoami)
+    : messenger(m), whoami(whoami), notify_port(0) { }
+
+  set<msg_addr_t>& get_hosts() { return hosts; }
+  set<msg_addr_t>& get_notify() { return notify; }
+  void set_notify_port(int p) { notify_port = p; }
+
+  // Stop monitoring h and forget everything recorded about it.
+  void remove_host(msg_addr_t h) {
+    acked_failures.erase(h);
+    unacked_failures.erase(h);
+    status.erase(h);
+    hosts.erase(h);
+  }
+
+  void init();
+  void shutdown();
+
+  void host_is_alive(msg_addr_t who);
+
+  void proc_message(Message *m);
+  void handle_ping_ack(class MPingAck *m);
+  void handle_failure_ack(class MFailureAck *m);
+
+  void initiate_heartbeat();
+  void check_heartbeat();
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "config.h"
+#include "include/error.h"
+
+#include "common/Timer.h"
+#include "common/Mutex.h"
+
+#include "MPIMessenger.h"
+#include "Message.h"
+
+#include <iostream>
+#include <cassert>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <unistd.h>
+#include <mpi.h>
+
+/*
+ * We make a directory, so that we can have multiple Messengers in the
+ * same process (rank). This is useful for benchmarking and creating lots of
+ * simulated clients, e.g.
+ */
+
+hash_map<int, MPIMessenger*> directory; // the messengers living in this process (see the comment block above)
+list<Message*> outgoing, incoming;
+list<MPI_Request*> unfinished_sends; // heap-allocated requests from mpi_prep_send_req(); freed in mpi_reap_sends()
+map<MPI_Request*, Message*> unfinished_send_message; // request -> Message to delete once that send has completed
+
+/* this process */
+int mpi_world; // communicator size, filled in by mpimessenger_init()
+int mpi_rank; // our rank in MPI_COMM_WORLD, filled in by mpimessenger_init()
+bool mpi_done = false; // set this flag to stop the event loop
+
+
+#define FUNNEL_MPI // if we want to funnel mpi through a single thread
+#define TAG_UNSOLICITED 0
+#define DBLVL 18
+
+// the key used to fetch the tag for the current thread.
+pthread_key_t tag_key;
+pthread_t thread_id = 0; // thread id of the event loop. init value == nobody
+
+Mutex sender_lock; // held by mpi_reap_sends() while it walks unfinished_sends
+Mutex out_queue_lock;
+
+bool pending_timer;
+
+
+// our lock for any common data; it's okay to have only the one global mutex
+// because our common data isn't a whole lot.
+//static pthread_mutex_t mutex;
+
+// the number of distinct threads we've seen so far; used to generate
+// a unique tag for each thread.
+//static int nthreads = 10;
+
+//#define TAG_UNSOLICITED 0
+
+// debug
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] "
+
+
+
+/*****
+ * MPI global methods for process-wide startup, shutdown.
+ */
+
+// Process-wide MPI startup: initialise MPI, record the world size and our
+// rank in mpi_world / mpi_rank, and check that the world has more ranks
+// than the configured osds plus mds's.  Returns our rank.
+int mpimessenger_init(int& argc, char**& argv)
+{
+  MPI_Init(&argc, &argv);
+
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  // gethostname() may fail, and on truncation it need not NUL-terminate
+  // the buffer; make sure what we print is always a valid C string.
+  char hostname[100];
+  if (gethostname(hostname, sizeof(hostname)) != 0)
+    hostname[0] = '\0';
+  hostname[sizeof(hostname) - 1] = '\0';
+  int pid = getpid();
+
+  dout(12) << "init: i am " << hostname << " pid " << pid << endl;
+
+  assert(mpi_world > g_conf.num_osd+g_conf.num_mds);
+
+  return mpi_rank;
+}
+
+// Process-wide MPI teardown: wait at a barrier until every rank has
+// arrived, then finalize MPI.  Always returns 0.
+int mpimessenger_shutdown()
+{
+  dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl;
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl;
+  MPI_Finalize();
+
+  return 0;
+}
+
+// Returns the size of MPI_COMM_WORLD as recorded by mpimessenger_init().
+int mpimessenger_world()
+{
+ return mpi_world;
+}
+
+
+
+/***
+ * internal send/recv
+ */
+
+
+/*
+ * get fresh MPI_Request* (on heap) for a new async MPI_Isend
+ */
+
+// Allocate a new MPI_Request, append it to unfinished_sends and return it.
+// mpi_reap_sends() deletes the request once the matching send has completed.
+MPI_Request *mpi_prep_send_req() {
+  unfinished_sends.push_back(new MPI_Request);
+  MPI_Request *fresh = unfinished_sends.back();
+  dout(DBLVL) << "prep_send_req " << fresh << endl;
+  return fresh;
+}
+
+
+/*
+ * clean up MPI_Request*'s for Isends that have completed.
+ * also, hose any associated Message*'s for Messages that are completely sent.
+ *
+ * if wait=true, block and wait for sends to finish.
+ */
+
+// Walk unfinished_sends from the front, freeing each request whose Isend has
+// completed and deleting the Message attached to it (if any).
+//
+//  wait == false: stop at the first request that has not completed yet.
+//  wait == true : block on each request in turn until the list is drained.
+//
+// Runs entirely under sender_lock.
+void mpi_reap_sends(bool wait=false) {
+  sender_lock.Lock();
+
+  while (!unfinished_sends.empty()) {
+    MPI_Request *oldest = unfinished_sends.front();
+    MPI_Status status;
+
+    if (wait) {
+      MPI_Wait(oldest, &status);
+    } else {
+      int finished;
+      MPI_Test(oldest, &finished, &status);
+      if (!finished) break; // not finished yet
+    }
+
+    dout(DBLVL) << "send " << oldest << " completed" << endl;
+
+    // only the last Isend of a message has a Message attached to it
+    map<MPI_Request*, Message*>::iterator owner = unfinished_send_message.find(oldest);
+    if (owner != unfinished_send_message.end()) {
+      dout(DBLVL) << "send message " << owner->second << " completed" << endl;
+      delete owner->second;
+      unfinished_send_message.erase(owner);
+    }
+
+    delete oldest;
+    unfinished_sends.pop_front();
+  }
+
+  dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl;
+
+  sender_lock.Unlock();
+}
+
+
+// Block until every outstanding Isend has completed and been reaped.
+void mpi_finish_sends() {
+ mpi_reap_sends(true);
+}
+
+
+/*
+ * recv a Message*
+ */
+/*
+ * Block until a message with the given tag arrives from any rank, then
+ * receive its envelope followed by env.nchunks payload fragments from the
+ * same source, and decode the result.
+ *
+ * Returns the decoded Message, or 0 if the envelope has type 0 (the "kick"
+ * sent by mpimessenger_kick_loop to wake the event loop).
+ * NOTE(review): MSG_CLOSE is also defined as 0, so a real MSG_CLOSE envelope
+ * would be treated as a kick here -- confirm that is intended.
+ *
+ * Sizes are obtained with MPI_Get_count(); MPI_Status only guarantees the
+ * MPI_SOURCE, MPI_TAG and MPI_ERROR fields, so reading a `count` member
+ * directly is not portable across MPI implementations.
+ */
+Message *mpi_recv(int tag)
+{
+  // envelope
+  dout(DBLVL) << "mpi_recv waiting for message tag " << tag << endl;
+
+  MPI_Status status;
+  msg_envelope_t env;
+
+  ASSERT(MPI_Recv((void*)&env,
+                  sizeof(env),
+                  MPI_CHAR,
+                  MPI_ANY_SOURCE,
+                  tag,
+                  MPI_COMM_WORLD,
+                  &status) == MPI_SUCCESS);
+
+  int env_len = 0;
+  ASSERT(MPI_Get_count(&status, MPI_CHAR, &env_len) == MPI_SUCCESS);
+  assert(env_len == MSG_ENVELOPE_LEN);
+
+  if (env.type == 0) {
+    dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl;
+    return 0;
+  }
+
+  dout(DBLVL) << "mpi_recv got envelope " << env_len << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl;
+
+  // payload: one MPI message per buffer, all from the envelope's sender
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    MPI_Status fragstatus;
+
+    // probe first so the buffer can be sized to the fragment
+    ASSERT(MPI_Probe(status.MPI_SOURCE,
+                     tag,
+                     MPI_COMM_WORLD,
+                     &fragstatus) == MPI_SUCCESS);
+
+    int frag_len = 0;
+    ASSERT(MPI_Get_count(&fragstatus, MPI_CHAR, &frag_len) == MPI_SUCCESS);
+
+    bufferptr bp = new buffer(frag_len);
+
+    ASSERT(MPI_Recv(bp.c_str(),
+                    frag_len,
+                    MPI_CHAR,
+                    status.MPI_SOURCE,
+                    tag,
+                    MPI_COMM_WORLD,
+                    &fragstatus) == MPI_SUCCESS);
+
+    blist.push_back(bp);
+
+    dout(DBLVL) << "mpi_recv got frag " << i << " of " << env.nchunks << " len " << frag_len << endl;
+  }
+
+  dout(DBLVL) << "mpi_recv got " << blist.length() << " byte message tag " << status.MPI_TAG << endl;
+
+  // unmarshall message
+  Message *m = decode_message(env, blist);
+  return m;
+}
+
+
+/*
+ * send a Message* over the wire. ** do not block **.
+ */
+/*
+ * Start sending a Message without blocking.
+ *
+ * If the destination maps to this rank, the message is appended to
+ * `incoming` for local dispatch. Otherwise the envelope and then each payload
+ * buffer are posted with MPI_Isend, and the Message is attached to the last
+ * request so that mpi_reap_sends() deletes it once everything has gone out.
+ *
+ * The request bookkeeping (unfinished_sends / unfinished_send_message) is
+ * always updated under sender_lock: mpimessenger_kick_loop() appends to
+ * unfinished_sends under that lock from other threads, so doing it unlocked
+ * here races with it even when all real sends are funneled through the loop.
+ *
+ * Always returns 0.
+ */
+int mpi_send(Message *m, int tag)
+{
+  int rank = MPI_DEST_TO_RANK(m->get_dest(), mpi_world);
+
+  // local?
+  if (rank == mpi_rank) {
+    dout(DBLVL) << "queuing local delivery" << endl;
+    incoming.push_back(m);
+    return 0;
+  }
+
+  // marshall
+  if (m->empty_payload())
+    m->encode_payload();
+  msg_envelope_t *env = &m->get_envelope();
+  env->nchunks = m->get_payload().buffers().size();
+
+  dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl;
+
+  sender_lock.Lock();
+
+  // send envelope; remember the request we created rather than peeking at
+  // the back of the shared list afterwards.
+  MPI_Request *last_req = mpi_prep_send_req();
+  ASSERT(MPI_Isend((void*)env,
+                   sizeof(*env),
+                   MPI_CHAR,
+                   rank,
+                   tag,
+                   MPI_COMM_WORLD,
+                   last_req) == MPI_SUCCESS);
+
+  // payload
+  int i = 0;
+  for (list<bufferptr>::iterator it = m->get_payload().buffers().begin();
+       it != m->get_payload().buffers().end();
+       ++it) {
+    dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl;
+    last_req = mpi_prep_send_req();
+    ASSERT(MPI_Isend((void*)(*it).c_str(),
+                     (*it).length(),
+                     MPI_CHAR,
+                     rank,
+                     tag,
+                     MPI_COMM_WORLD,
+                     last_req) == MPI_SUCCESS);
+    i++;
+  }
+
+  // attach message to last send, so we can free it later
+  unfinished_send_message[last_req] = m;
+
+  dout(DBLVL) << "mpi_send done, attached message to Isend " << last_req << endl;
+
+  sender_lock.Unlock();
+  return 0;
+}
+
+
+
+// get the tag for this thread
+
+#ifndef FUNNEL_MPI
+// Return the MPI tag assigned to the calling thread, assigning the next
+// unused one (++nthreads) the first time a thread asks.
+//
+// The tag is stored in thread-specific data under tag_key; a stored value of
+// 0 means "not assigned yet".
+//
+// NOTE(review): `mutex` and `nthreads` are only present as commented-out
+// declarations in this file, so this block presumably does not build if
+// FUNNEL_MPI is ever undefined -- confirm before enabling it.
+static int get_thread_tag()
+{
+  // go through long so the pointer <-> int round trip does not truncate
+  // (and fail to compile) where pointers are wider than int
+  int tag = (int)(long)pthread_getspecific(tag_key);
+
+  if (tag == 0) {
+    // first time this thread has performed MPI messaging
+
+    // pthread calls report failure with a nonzero error number, never a
+    // negative value, so test against 0
+    if (pthread_mutex_lock(&mutex) != 0)
+      SYSERROR();
+
+    tag = ++nthreads;
+
+    if (pthread_mutex_unlock(&mutex) != 0)
+      SYSERROR();
+
+    if (pthread_setspecific(tag_key, (void*)(long)tag) != 0)
+      SYSERROR();
+  }
+
+  return tag;
+}
+#endif
+
+
+
+// recv event loop, for unsolicited messages.
+
+// Event loop thread body (started by mpimessenger_start).
+//
+// Each iteration: reap completed sends, push everything queued in `outgoing`
+// onto the wire, run pending timer events, then take one message -- from the
+// local `incoming` queue if it is non-empty, otherwise by blocking in
+// mpi_recv -- and dispatch it to the Messenger registered for its destination.
+//
+// The loop exits once mpi_done is set and there is nothing left queued; it
+// then waits for outstanding sends and shuts the timer down. The argument is
+// unused and the return value is always 0.
+void* mpimessenger_loop(void*)
+{
+ dout(5) << "mpimessenger_loop start pid " << getpid() << endl;
+
+ while (1) {
+
+ // outgoing
+ mpi_reap_sends();
+
+#ifdef FUNNEL_MPI
+ // check outgoing queue
+ // (out_queue_lock is held across the sends so send_message() cannot append
+ // between the iteration and the clear() below)
+ out_queue_lock.Lock();
+ if (outgoing.size()) {
+ dout(10) << outgoing.size() << " outgoing messages" << endl;
+ for (list<Message*>::iterator it = outgoing.begin();
+ it != outgoing.end();
+ it++) {
+ mpi_send(*it, TAG_UNSOLICITED);
+ }
+ }
+ outgoing.clear();
+ out_queue_lock.Unlock();
+#endif
+
+
+ // timer events?
+ if (pending_timer) {
+ dout(DBLVL) << "pending timer" << endl;
+ g_timer.execute_pending();
+ }
+
+ // done?
+ // NOTE(review): `outgoing` is read here without out_queue_lock.
+ if (mpi_done &&
+ incoming.empty() &&
+ outgoing.empty() &&
+ !pending_timer) break;
+
+
+ // incoming
+ Message *m = 0;
+
+ if (incoming.size()) {
+ dout(12) << "loop pulling message off incoming" << endl;
+ m = incoming.front();
+ incoming.pop_front();
+ }
+ else {
+ // check mpi
+ dout(12) << "loop waiting for incoming messages" << endl;
+
+ // get message
+ // (blocks; returns 0 when woken by a kick from mpimessenger_kick_loop)
+ m = mpi_recv(TAG_UNSOLICITED);
+ }
+
+ // dispatch?
+ if (m) {
+ int dest = m->get_dest();
+ if (directory.count(dest)) {
+ Messenger *who = directory[ dest ];
+
+ dout(4) << "---- '" << m->get_type_name() <<
+ "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+ " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- "
+ << m
+ << endl;
+
+ who->dispatch(m);
+ } else {
+ // no Messenger registered for this destination: abort, or leave the loop
+ // if assert() is compiled out
+ dout (1) << "---- i don't know who " << dest << " is." << endl;
+ assert(0);
+ break;
+ }
+ }
+
+ }
+
+ dout(5) << "finishing async sends" << endl;
+ mpi_finish_sends();
+
+ g_timer.shutdown();
+
+ dout(5) << "mpimessenger_loop exiting loop" << endl;
+ return 0;
+}
+
+
+// start/stop mpi receiver thread (for unsolicited messages)
+// Start the event loop thread (mpimessenger_loop) and record its id in
+// thread_id.
+//
+// Returns 0 on success, or the error number from pthread_create if the
+// thread could not be started (previously the failure was silently ignored
+// and 0 was returned regardless).
+int mpimessenger_start()
+{
+  dout(5) << "starting thread" << endl;
+
+  // start a thread
+  int r = pthread_create(&thread_id,
+                         NULL,
+                         mpimessenger_loop,
+                         0);
+  if (r != 0) {
+    dout(0) << "mpimessenger_start pthread_create failed, error " << r << endl;
+  }
+  return r;
+}
+
+
+/*
+ * kick and wake up _loop (to pick up new outgoing message, or quit)
+ */
+
+MPI_Request kick_req;
+msg_envelope_t kick_env;
+
+// Wake the event loop if it is blocked in mpi_recv, by posting a type-0
+// envelope to our own rank on TAG_UNSOLICITED; mpi_recv returns 0 for such an
+// envelope, which sends the loop around for another iteration.
+//
+// Does nothing when called from the event loop thread itself. The send is a
+// non-blocking MPI_Isend whose request is tracked like any other.
+void mpimessenger_kick_loop()
+{
+  const bool on_loop_thread = (pthread_self() == thread_id);
+  if (on_loop_thread)
+    return;   // the loop is obviously awake already
+
+  kick_env.type = 0;
+
+  sender_lock.Lock();
+  ASSERT(MPI_Isend(&kick_env,
+                   sizeof(kick_env),
+                   MPI_CHAR,
+                   mpi_rank,
+                   TAG_UNSOLICITED,
+                   MPI_COMM_WORLD,
+                   mpi_prep_send_req()) == MPI_SUCCESS);
+  sender_lock.Unlock();
+}
+
+
+// stop thread
+
+// Ask the event loop to finish and wait for its thread to exit.
+// Must be called at most once: a second call trips the assert below.
+void mpimessenger_stop()
+{
+ dout(5) << "mpimessenger_stop stopping thread" << endl;
+
+ if (mpi_done) {
+ dout(1) << "mpimessenger_stop called, but already done!" << endl;
+ assert(!mpi_done);
+ }
+
+ // set finish flag
+ // (set before the kick so the loop sees it when it wakes up)
+ mpi_done = true;
+ mpimessenger_kick_loop();
+
+ // wait for thread to stop
+ mpimessenger_wait();
+}
+
+
+// wait for thread to finish
+
+// Join the event loop thread; returns once mpimessenger_loop has exited.
+// The thread's return value is collected and discarded.
+void mpimessenger_wait()
+{
+  dout(10) << "mpimessenger_wait waiting for thread to finished." << endl;
+
+  void *loop_result;
+  pthread_join(thread_id, &loop_result);
+
+  dout(10) << "mpimessenger_wait thread finished." << endl;
+}
+
+
+
+
+/***********
+ * MPIMessenger class implementation
+ */
+
+// Context handed to g_timer by the MPIMessenger constructor (via
+// set_messenger_kicker); finishing it kicks the event loop awake.
+class C_MPIKicker : public Context {
+ // r is unused
+ void finish(int r) {
+ dout(DBLVL) << "timer kick" << endl;
+ mpimessenger_kick_loop();
+ }
+};
+
+// Create a messenger for address `myaddr`: remember the address, register
+// this object in the process-wide directory so the event loop can dispatch to
+// it, and install a C_MPIKicker so timer events wake the loop.
+MPIMessenger::MPIMessenger(msg_addr_t myaddr)
+  : Messenger(myaddr),
+    myaddr(myaddr)   // my address
+{
+  // make ourselves findable by the event loop
+  directory[this->myaddr] = this;
+
+  // have the timer kick the loop when it has events to run
+  g_timer.set_messenger_kicker(new C_MPIKicker());
+
+  // logger
+  /*
+  string name;
+  name = "m.";
+  name += MSG_ADDR_TYPE(whoami);
+  int w = MSG_ADDR_NUM(whoami);
+  if (w >= 1000) name += ('0' + ((w/1000)%10));
+  if (w >= 100) name += ('0' + ((w/100)%10));
+  if (w >= 10) name += ('0' + ((w/10)%10));
+  name += ('0' + ((w/1)%10));
+
+  logger = new Logger(name, (LogType*)&mpimsg_logtype);
+  loggers[ whoami ] = logger;
+  */
+}
+
+// Nothing to release here; the directory entry is removed in shutdown(), not
+// in the destructor.
+MPIMessenger::~MPIMessenger()
+{
+ //delete logger;
+}
+
+
+// Unregister this messenger. When it was the last one on this rank, also stop
+// the event loop: from the loop thread itself only the mpi_done flag is set,
+// from any other thread mpimessenger_stop() is called (which also joins the
+// loop thread). Always returns 0.
+int MPIMessenger::shutdown()
+{
+  // drop out of the directory and stop receiving timer kicks
+  directory.erase(myaddr);
+  g_timer.unset_messenger_kicker();
+
+  // others still registered? then the loop keeps running.
+  if (!directory.empty()) {
+    dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl;
+    return 0;
+  }
+
+  dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl;
+
+  pthread_t caller = pthread_self();
+  dout(15) << "whoami = " << caller << ", thread = " << thread_id << endl;
+
+  if (caller == thread_id) {
+    // we are running inside the loop; it will notice the flag by itself
+    dout(15) << " set mpi_done=true" << endl;
+    mpi_done = true;
+  } else {
+    // some other thread; the loop has to be told (and woken up)
+    dout(15) << " calling mpimessenger_stop()" << endl;
+    mpimessenger_stop();
+  }
+  return 0;
+}
+
+
+
+
+/***
+ * public messaging interface
+ */
+
+
+/* note: send_message _MUST_ be non-blocking */
+// Stamp the message with our address/fromport as source and dest/port as
+// destination, then hand it off for sending. Always returns 0.
+//
+// With FUNNEL_MPI the message is appended to `outgoing` under out_queue_lock
+// and the event loop is kicked so it picks the message up; otherwise it is
+// sent from the calling thread with the message's pcid as the MPI tag.
+int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+ // set envelope
+ m->set_source(myaddr, fromport);
+ m->set_dest(dest, port);
+
+#ifdef FUNNEL_MPI
+
+ // queue up
+ out_queue_lock.Lock();
+ dout(DBLVL) << "queuing outgoing message " << *m << endl;
+ outgoing.push_back(m);
+ out_queue_lock.Unlock();
+
+ // wake the loop in case it is blocked waiting for incoming traffic
+ mpimessenger_kick_loop();
+
+#else
+
+ // send in this thread
+ mpi_send(m, m->get_pcid());
+
+#endif
+ return 0;
+}
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MPIMESSENGER_H
+#define __MPIMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+
+// shorthand for the configured number of mds and osd entities
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+// Map a destination address to an MPI rank: destinations below NUMMDS+NUMOSD
+// map to the rank of the same number; every other destination is folded
+// (modulo) onto the remaining world-NUMMDS-NUMOSD ranks. `world` must be
+// greater than NUMMDS+NUMOSD, otherwise the modulus is by zero
+// (mpimessenger_init asserts this).
+#define MPI_DEST_TO_RANK(dest,world) ((dest)<(NUMMDS+NUMOSD) ? \
+ (dest) : \
+ ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD))))
+
+class Timer;
+
+// Messenger implementation that carries messages over MPI. Several instances
+// may exist in one process; the free mpimessenger_* functions declared after
+// this class manage the per-process MPI state and event loop thread.
+class MPIMessenger : public Messenger {
+ protected:
+ msg_addr_t myaddr; // my address
+ //class Logger *logger; // for logging
+
+ public:
+ MPIMessenger(msg_addr_t myaddr);
+ ~MPIMessenger();
+
+ // init, shutdown MPI and associated event loop thread.
+ virtual int shutdown();
+
+ // message interface
+ // (m is addressed from myaddr:fromport to dest:port)
+ virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+};
+
+/**
+ * these are all ONE per process.
+ */
+extern int mpimessenger_world(); // get world size
+extern int mpimessenger_init(int& argc, char**& argv); // init mpi
+extern int mpimessenger_start(); // start thread
+extern void mpimessenger_stop(); // stop thread.
+extern void mpimessenger_wait(); // wait for thread to finish.
+extern int mpimessenger_shutdown(); // finalize MPI
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <pthread.h>
+#include "mpi.h"
+
+#include "include/config.h"
+#include "include/error.h"
+#include "Messenger.h"
+#include "MTMessenger.h"
+
+// This module uses MPI to implement a blocking sendrecv function that
+// feels more like a procedure call and less like event processing.
+//
+// Threads are not independently addressable in MPI; only processes
+// are. However, MPI does include a user-defined tag in the message
+// envelope, and a reader may selectively read only messages with a
+// matching tag. This module assigns an integer to each thread to use
+// as the tag.
+//
+
+// our lock for any common data; it's okay to have only the one global mutex
+// because our common data isn't a whole lot.
+static pthread_mutex_t mutex;
+
+// the key used to fetch the tag for the current thread.
+pthread_key_t tag_key;
+
+// the number of distinct threads we've seen so far; used to generate
+// a unique tag for each thread.
+static int nthreads;
+
+// the MPI identity of this process
+static int mpi_rank;
+
+
+// get the tag for this thread
+// Return the MPI tag assigned to the calling thread, assigning the next
+// unused one (++nthreads, under `mutex`) the first time a thread asks.
+//
+// The tag is kept in thread-specific data under tag_key; a stored value of 0
+// means "not assigned yet", so the tags handed out start at 1.
+static int get_tag()
+{
+  // go through long so the pointer <-> int round trip does not truncate
+  // (and fail to compile) where pointers are wider than int
+  int tag = (int)(long)pthread_getspecific(tag_key);
+
+  if (tag == 0) {
+    // first time this thread has performed MPI messaging
+
+    // pthread calls report failure with a nonzero error number, never a
+    // negative value, so test against 0
+    if (pthread_mutex_lock(&mutex) != 0)
+      SYSERROR();
+
+    tag = ++nthreads;
+
+    if (pthread_mutex_unlock(&mutex) != 0)
+      SYSERROR();
+
+    if (pthread_setspecific(tag_key, (void*)(long)tag) != 0)
+      SYSERROR();
+  }
+
+  return tag;
+}
+
+
+// marshall a message and send it over MPI
+// Encode `m` into a byte string and send it to `rank` with the given MPI tag
+// using a blocking MPI_Send.
+static void send(Message *m, int rank, int tag)
+{
+  // marshall the message
+  crope encoded;
+  m->encode(encoded);
+
+  int nbytes = encoded.length();
+  char *bytes = (char*)encoded.c_str();
+
+  ASSERT(MPI_Send(bytes, nbytes, MPI_CHAR,
+                  rank, tag, MPI_COMM_WORLD) == MPI_SUCCESS);
+}
+
+// read a message from MPI and unmarshall it
+// Block until a message matching `tag` arrives from any rank, receive it and
+// return the decoded Message.
+//
+// The length is obtained with MPI_Get_count(); MPI_Status only guarantees the
+// MPI_SOURCE, MPI_TAG and MPI_ERROR fields, so reading a `count` member
+// directly is not portable across MPI implementations.
+static Message *receive(int tag)
+{
+  MPI_Status status;
+
+  // get message size
+  ASSERT(MPI_Probe(MPI_ANY_SOURCE,
+                   tag,
+                   MPI_COMM_WORLD,
+                   &status) == MPI_SUCCESS);
+
+  int len = 0;
+  ASSERT(MPI_Get_count(&status, MPI_CHAR, &len) == MPI_SUCCESS);
+
+  // get message; there may be multiple messages on the queue, so receive
+  // from exactly the source and tag that the probe reported to be sure we
+  // read the one the size above belongs to.
+  char *buf = new char[len];
+  ASSERT(MPI_Recv(buf,
+                  len,
+                  MPI_CHAR,
+                  status.MPI_SOURCE,
+                  status.MPI_TAG,
+                  MPI_COMM_WORLD,
+                  &status) == MPI_SUCCESS);
+
+  // unmarshall message
+  crope r(buf, len);
+  delete[] buf;
+  Message *m = decode_message(r);
+
+  return m;
+}
+
+// Initialize MPI with MPI_THREAD_MULTIPLE requested, record our rank, and set
+// up the mutex and thread-specific key used by get_tag().
+MTMessenger::MTMessenger(int& argc, char**& argv)
+{
+  // setup MPI; MPI errors will probably invoke the default MPI error
+  // handler, which aborts the program with a friendly message rather
+  // than returning from a function; just in case, we abort the
+  // program if we get an MPI error.
+
+  // NOTE(review): `provided` is never compared against MPI_THREAD_MULTIPLE,
+  // so a lower granted thread level goes unnoticed -- confirm whether that
+  // should be fatal.
+  int provided;
+  ASSERT(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided)
+         == MPI_SUCCESS);
+
+  ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) == MPI_SUCCESS);
+
+  // pthread calls report failure with a nonzero error number, never a
+  // negative value, so test against 0
+  if (pthread_mutex_init(&mutex, NULL) != 0)
+    SYSERROR();
+
+  if (pthread_key_create(&tag_key, NULL) != 0)
+    SYSERROR();
+
+  nthreads = 0;
+}
+
+// Tear down what the constructor set up: the thread-specific key, the mutex
+// and MPI itself. Return values are deliberately not checked.
+MTMessenger::~MTMessenger()
+{
+ // ignore shutdown errors
+
+ pthread_key_delete(tag_key);
+
+ pthread_mutex_destroy(&mutex);
+
+ MPI_Finalize();
+}
+
+// send a request and wait for the response
+// Send request `m` to `dest` and block until a message tagged with this
+// thread's tag comes back; that message is returned.
+Message *MTMessenger::sendrecv(Message *m, msg_addr_t dest)
+{
+  int reply_tag = get_tag();
+  int server_tag = 0; // servers listen for any tag
+
+  // fill in the message's own envelope (distinct from the MPI envelope):
+  // our rank and tag tell the receiver where to send the response
+  m->set_source(mpi_rank, reply_tag);
+  m->set_dest(dest, server_tag);
+
+  send(m, dest, server_tag);
+
+  Message *reply = receive(reply_tag);
+  return reply;
+}
+
+// receive a request from anyone
+// Block until a message with any tag arrives from any rank and return it.
+// NOTE(review): MPI_ANY_TAG also matches messages sent to a specific thread
+// tag, i.e. responses awaited by sendrecv() in this process -- presumably
+// recvreq() and sendrecv() are not meant to be used in the same process;
+// confirm.
+Message *MTMessenger::recvreq()
+{
+ return receive(MPI_ANY_TAG);
+}
+
+// forward request, masquerading as original source
+// Pass `req` on to `dest`. Only the destination in the message's envelope is
+// rewritten; the source is left untouched, so the response goes back to the
+// original requester.
+void MTMessenger::fwdreq(Message *req, int dest)
+{
+  int server_tag = 0; // servers listen for any tag
+
+  // message envelope, not the MPI one
+  req->set_dest(dest, server_tag);
+  send(req, dest, server_tag);
+}
+
+// send a response to the originator of the request
+// Send `resp` back to whoever sent `req`: the rank and tag recorded as the
+// request's source/source port become the response's destination.
+void MTMessenger::sendresp(Message *req, Message *resp)
+{
+  // where the request came from
+  int origin_rank = req->get_source();
+  int origin_tag = req->get_source_port();
+
+  // who we are
+  int local_tag = get_tag();
+
+  // message envelope, not the MPI one
+  resp->set_source(mpi_rank, local_tag);
+  resp->set_dest(origin_rank, origin_tag);
+
+  send(resp, origin_rank, origin_tag);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MTMESSENGER_H
+#define __MTMESSENGER_H
+
+#include "Message.h"
+#include "SerialMessenger.h"
+
+// Marshall and unmarshall OBFS messages, send and receive them over
+// MPI.
+
+// Blocking, procedure-call style messaging over MPI: a client thread sends a
+// request and waits for the response; a server waits for requests and sends
+// (or forwards) responses.
+class MTMessenger
+{
+public:
+  // initializes MPI, which is handed argc/argv by reference and will scan
+  // them for MPI specific flags and remove them from argc/argv.
+  MTMessenger(int &argc, char **&argv);
+
+  // tears it all down. Virtual because the class has virtual methods and
+  // may therefore be deleted through a base pointer.
+  virtual ~MTMessenger();
+
+  // send a request to a server and wait (block) for the response;
+  virtual Message *sendrecv(Message *m, msg_addr_t dest);
+
+  // wait (block) for a request from anyone
+  Message *recvreq();
+
+  // forward request, masquerading as original source
+  void fwdreq(Message *req, int dest);
+
+  // send the response to the originator of the request
+  virtual void sendresp(Message *req, Message *resp);
+
+
+}; // class MTMessenger
+
+#endif // __MTMESSENGER_H
--- /dev/null
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+#include "include/types.h"
+
+#include "Message.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+#include "messages/MMonElectionAck.h"
+#include "messages/MMonElectionPropose.h"
+#include "messages/MMonElectionVictory.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MFailure.h"
+#include "messages/MFailureAck.h"
+
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#include "messages/MClientBoot.h"
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientFileCaps.h"
+
+#include "messages/MMDSGetMap.h"
+#include "messages/MMDSMap.h"
+#include "messages/MMDSBoot.h"
+
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDirWarning.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MHashReaddir.h"
+#include "messages/MHashReaddirReply.h"
+
+#include "messages/MHashDirDiscover.h"
+#include "messages/MHashDirDiscoverAck.h"
+#include "messages/MHashDirPrep.h"
+#include "messages/MHashDirPrepAck.h"
+#include "messages/MHashDir.h"
+#include "messages/MHashDirAck.h"
+#include "messages/MHashDirNotify.h"
+
+#include "messages/MUnhashDirPrep.h"
+#include "messages/MUnhashDirPrepAck.h"
+#include "messages/MUnhashDir.h"
+#include "messages/MUnhashDirAck.h"
+#include "messages/MUnhashDirNotify.h"
+#include "messages/MUnhashDirNotifyAck.h"
+
+#include "messages/MRenameWarning.h"
+#include "messages/MRenameNotify.h"
+#include "messages/MRenameNotifyAck.h"
+#include "messages/MRename.h"
+#include "messages/MRenamePrep.h"
+#include "messages/MRenameReq.h"
+#include "messages/MRenameAck.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MHeartbeat.h"
+
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MInodeExpire.h"
+#include "messages/MDirExpire.h"
+#include "messages/MCacheExpire.h"
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MLock.h"
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL 10 // debug level of output
+
+
+
+
+
+
+
+/*
+ * Build a Message of the concrete class selected by env.type, give it the
+ * envelope and payload, and let it decode the payload.
+ *
+ * Returns the new Message (owned by the caller). An unknown type is logged
+ * and trips assert(0); if assertions are compiled out, 0 is returned instead
+ * of dereferencing a null message.
+ */
+Message *
+decode_message(msg_envelope_t& env, bufferlist& payload)
+{
+  // make message
+  Message *m = 0;
+  switch(env.type) {
+
+    // -- with payload --
+
+  case MSG_NS_CONNECT:       m = new MNSConnect(); break;
+  case MSG_NS_CONNECTACK:    m = new MNSConnectAck(); break;
+  case MSG_NS_REGISTER:      m = new MNSRegister(); break;
+  case MSG_NS_REGISTERACK:   m = new MNSRegisterAck(); break;
+  case MSG_NS_LOOKUP:        m = new MNSLookup(); break;
+  case MSG_NS_LOOKUPREPLY:   m = new MNSLookupReply(); break;
+  case MSG_NS_FAILURE:       m = new MNSFailure(); break;
+
+  case MSG_MON_ELECTION_PROPOSE: m = new MMonElectionPropose; break;
+  case MSG_MON_ELECTION_ACK:     m = new MMonElectionAck; break;
+  case MSG_MON_ELECTION_VICTORY: m = new MMonElectionVictory; break;
+
+  case MSG_PING:             m = new MPing(); break;
+  case MSG_PING_ACK:         m = new MPingAck(); break;
+  case MSG_FAILURE:          m = new MFailure(); break;
+  case MSG_FAILURE_ACK:      m = new MFailureAck(); break;
+
+  case MSG_OSD_BOOT:         m = new MOSDBoot(); break;
+  case MSG_OSD_IN:           m = new MOSDIn(); break;
+  case MSG_OSD_OUT:          m = new MOSDOut(); break;
+  case MSG_OSD_FAILURE:      m = new MOSDFailure(); break;
+  case MSG_OSD_PING:         m = new MOSDPing(); break;
+  case MSG_OSD_OP:           m = new MOSDOp(); break;
+  case MSG_OSD_OPREPLY:      m = new MOSDOpReply(); break;
+
+  case MSG_OSD_MAP:          m = new MOSDMap(); break;
+  case MSG_OSD_GETMAP:       m = new MOSDGetMap(); break;
+
+  case MSG_OSD_PG_NOTIFY:    m = new MOSDPGNotify(); break;
+  case MSG_OSD_PG_QUERY:     m = new MOSDPGQuery(); break;
+  case MSG_OSD_PG_LOG:       m = new MOSDPGLog(); break;
+  case MSG_OSD_PG_REMOVE:    m = new MOSDPGRemove(); break;
+
+    // clients
+  case MSG_CLIENT_BOOT:      m = new MClientBoot(); break;
+  case MSG_CLIENT_MOUNT:     m = new MClientMount(); break;
+  case MSG_CLIENT_MOUNTACK:  m = new MClientMountAck(); break;
+  case MSG_CLIENT_REQUEST:   m = new MClientRequest(); break;
+  case MSG_CLIENT_REPLY:     m = new MClientReply(); break;
+  case MSG_CLIENT_FILECAPS:  m = new MClientFileCaps(); break;
+
+    // mds
+  case MSG_MDS_GETMAP:       m = new MMDSGetMap(); break;
+  case MSG_MDS_MAP:          m = new MMDSMap(); break;
+  case MSG_MDS_BOOT:         m = new MMDSBoot(); break;
+
+  case MSG_MDS_DIRUPDATE:    m = new MDirUpdate(); break;
+
+  case MSG_MDS_DISCOVER:      m = new MDiscover(); break;
+  case MSG_MDS_DISCOVERREPLY: m = new MDiscoverReply(); break;
+
+  case MSG_MDS_EXPORTDIRDISCOVER:    m = new MExportDirDiscover(); break;
+  case MSG_MDS_EXPORTDIRDISCOVERACK: m = new MExportDirDiscoverAck(); break;
+  case MSG_MDS_EXPORTDIR:            m = new MExportDir(); break;
+  case MSG_MDS_EXPORTDIRFINISH:      m = new MExportDirFinish(); break;
+  case MSG_MDS_EXPORTDIRNOTIFY:      m = new MExportDirNotify(); break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:   m = new MExportDirNotifyAck(); break;
+  case MSG_MDS_EXPORTDIRPREP:        m = new MExportDirPrep(); break;
+  case MSG_MDS_EXPORTDIRPREPACK:     m = new MExportDirPrepAck(); break;
+  case MSG_MDS_EXPORTDIRWARNING:     m = new MExportDirWarning(); break;
+
+  case MSG_MDS_HASHREADDIR:      m = new MHashReaddir(); break;
+  case MSG_MDS_HASHREADDIRREPLY: m = new MHashReaddirReply(); break;
+
+  case MSG_MDS_HASHDIRDISCOVER:    m = new MHashDirDiscover(); break;
+  case MSG_MDS_HASHDIRDISCOVERACK: m = new MHashDirDiscoverAck(); break;
+  case MSG_MDS_HASHDIRPREP:        m = new MHashDirPrep(); break;
+  case MSG_MDS_HASHDIRPREPACK:     m = new MHashDirPrepAck(); break;
+  case MSG_MDS_HASHDIR:            m = new MHashDir(); break;
+  case MSG_MDS_HASHDIRACK:         m = new MHashDirAck(); break;
+  case MSG_MDS_HASHDIRNOTIFY:      m = new MHashDirNotify(); break;
+
+  case MSG_MDS_UNHASHDIRPREP:      m = new MUnhashDirPrep(); break;
+  case MSG_MDS_UNHASHDIRPREPACK:   m = new MUnhashDirPrepAck(); break;
+  case MSG_MDS_UNHASHDIR:          m = new MUnhashDir(); break;
+  case MSG_MDS_UNHASHDIRACK:       m = new MUnhashDirAck(); break;
+  case MSG_MDS_UNHASHDIRNOTIFY:    m = new MUnhashDirNotify(); break;
+  case MSG_MDS_UNHASHDIRNOTIFYACK: m = new MUnhashDirNotifyAck(); break;
+
+  case MSG_MDS_RENAMEWARNING:   m = new MRenameWarning(); break;
+  case MSG_MDS_RENAMENOTIFY:    m = new MRenameNotify(); break;
+  case MSG_MDS_RENAMENOTIFYACK: m = new MRenameNotifyAck(); break;
+  case MSG_MDS_RENAME:          m = new MRename(); break;
+  case MSG_MDS_RENAMEPREP:      m = new MRenamePrep(); break;
+  case MSG_MDS_RENAMEREQ:       m = new MRenameReq(); break;
+  case MSG_MDS_RENAMEACK:       m = new MRenameAck(); break;
+
+  case MSG_MDS_DENTRYUNLINK:    m = new MDentryUnlink(); break;
+
+  case MSG_MDS_HEARTBEAT:       m = new MHeartbeat(); break;
+
+  case MSG_MDS_CACHEEXPIRE:     m = new MCacheExpire(); break;
+
+  case MSG_MDS_ANCHORREQUEST:   m = new MAnchorRequest(); break;
+  case MSG_MDS_ANCHORREPLY:     m = new MAnchorReply(); break;
+
+  case MSG_MDS_INODELINK:       m = new MInodeLink(); break;
+  case MSG_MDS_INODELINKACK:    m = new MInodeLinkAck(); break;
+
+    /* case MSG_MDS_INODEUPDATE:
+       m = new MInodeUpdate();
+       break;
+    */
+
+  case MSG_MDS_INODEEXPIRE:     m = new MInodeExpire(); break;
+
+  case MSG_MDS_INODEFILECAPS:   m = new MInodeFileCaps(); break;
+
+  case MSG_MDS_DIREXPIRE:       m = new MDirExpire(); break;
+
+  case MSG_MDS_LOCK:            m = new MLock(); break;
+
+
+    // -- simple messages without payload --
+
+  case MSG_CLOSE:
+  case MSG_NS_STARTED:
+  case MSG_NS_UNREGISTER:
+  case MSG_SHUTDOWN:
+  case MSG_MDS_SHUTDOWNSTART:
+  case MSG_MDS_SHUTDOWNFINISH:
+  case MSG_CLIENT_UNMOUNT:
+  case MSG_OSD_MKFS_ACK:
+    m = new MGenericMessage(env.type);
+    break;
+
+  default:
+    dout(1) << "can't decode unknown message type " << env.type << endl;
+    assert(0);
+    // only reached when assert() is compiled out: m is still 0, so bail out
+    // rather than dereference it below.
+    return 0;
+  }
+
+  // env
+  m->set_envelope(env);
+
+  // decode
+  m->set_payload(payload);
+  m->decode_payload();
+
+  // done!
+  return m;
+}
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MESSAGE_H
+#define __MESSAGE_H
+
+#define MSG_CLOSE 0
+
+#define MSG_NS_CONNECT 1
+#define MSG_NS_CONNECTACK 2
+#define MSG_NS_REGISTER 3
+#define MSG_NS_REGISTERACK 4
+#define MSG_NS_STARTED 5
+#define MSG_NS_UNREGISTER 6
+#define MSG_NS_LOOKUP 7
+#define MSG_NS_LOOKUPREPLY 8
+#define MSG_NS_FAILURE 9
+
+
+#define MSG_PING 10
+#define MSG_PING_ACK 11
+
+#define MSG_FAILURE 12
+#define MSG_FAILURE_ACK 13
+
+#define MSG_SHUTDOWN 99999
+
+
+#define MSG_MON_ELECTION_ACK 15
+#define MSG_MON_ELECTION_PROPOSE 16
+#define MSG_MON_ELECTION_VICTORY 17
+
+#define MSG_MON_OSDMAP_INFO 20
+#define MSG_MON_OSDMAP_LEASE 21
+#define MSG_MON_OSDMAP_LEASE_ACK 22
+#define MSG_MON_OSDMAP_UPDATE_PREPARE 23
+#define MSG_MON_OSDMAP_UPDATE_ACK 24
+#define MSG_MON_OSDMAP_UPDATE_COMMIT 25
+
+#define MSG_OSD_OP 40 // delete, etc.
+#define MSG_OSD_OPREPLY 41 // delete, etc.
+#define MSG_OSD_PING 42
+
+#define MSG_OSD_GETMAP 43
+#define MSG_OSD_MAP 44
+
+#define MSG_OSD_BOOT 45
+#define MSG_OSD_MKFS_ACK 46
+
+#define MSG_OSD_FAILURE 47
+
+#define MSG_OSD_IN 48
+#define MSG_OSD_OUT 49
+
+
+
+#define MSG_OSD_PG_NOTIFY 50
+#define MSG_OSD_PG_QUERY 51
+#define MSG_OSD_PG_SUMMARY 52
+#define MSG_OSD_PG_LOG 53
+#define MSG_OSD_PG_REMOVE 54
+
+#define MSG_CLIENT_REQUEST 60
+#define MSG_CLIENT_REPLY 61
+//#define MSG_CLIENT_DONE 62
+#define MSG_CLIENT_FILECAPS 63
+#define MSG_CLIENT_INODEAUTHUPDATE 64
+
+#define MSG_CLIENT_BOOT 70
+#define MSG_CLIENT_MOUNT 71
+#define MSG_CLIENT_MOUNTACK 72
+#define MSG_CLIENT_UNMOUNT 73
+
+
+// *** MDS ***
+
+#define MSG_MDS_BOOT 100
+#define MSG_MDS_GETMAP 101
+#define MSG_MDS_MAP 102
+#define MSG_MDS_HEARTBEAT 103
+
+#define MSG_MDS_DISCOVER 110
+#define MSG_MDS_DISCOVERREPLY 111
+
+#define MSG_MDS_INODEGETREPLICA 112
+#define MSG_MDS_INODEGETREPLICAACK 113
+
+#define MSG_MDS_INODEFILECAPS 115
+
+#define MSG_MDS_INODEUPDATE 120
+#define MSG_MDS_DIRUPDATE 121
+#define MSG_MDS_INODEEXPIRE 122
+#define MSG_MDS_DIREXPIRE 123
+
+#define MSG_MDS_DIREXPIREREQ 124
+
+#define MSG_MDS_CACHEEXPIRE 125
+
+#define MSG_MDS_ANCHORREQUEST 130
+#define MSG_MDS_ANCHORREPLY 131
+
+#define MSG_MDS_INODELINK 140
+#define MSG_MDS_INODELINKACK 141
+#define MSG_MDS_INODEUNLINK 142
+#define MSG_MDS_INODEUNLINKACK 143
+
+#define MSG_MDS_EXPORTDIRDISCOVER 150
+#define MSG_MDS_EXPORTDIRDISCOVERACK 151
+#define MSG_MDS_EXPORTDIRPREP 152
+#define MSG_MDS_EXPORTDIRPREPACK 153
+#define MSG_MDS_EXPORTDIRWARNING 154
+#define MSG_MDS_EXPORTDIR 155
+#define MSG_MDS_EXPORTDIRNOTIFY 156
+#define MSG_MDS_EXPORTDIRNOTIFYACK 157
+#define MSG_MDS_EXPORTDIRFINISH 158
+
+
+#define MSG_MDS_HASHDIRDISCOVER 160
+#define MSG_MDS_HASHDIRDISCOVERACK 161
+#define MSG_MDS_HASHDIRPREP 162
+#define MSG_MDS_HASHDIRPREPACK 163
+#define MSG_MDS_HASHDIR 164
+#define MSG_MDS_HASHDIRACK 165
+#define MSG_MDS_HASHDIRNOTIFY 166
+
+#define MSG_MDS_HASHREADDIR 168
+#define MSG_MDS_HASHREADDIRREPLY 169
+
+#define MSG_MDS_UNHASHDIRPREP 170
+#define MSG_MDS_UNHASHDIRPREPACK 171
+#define MSG_MDS_UNHASHDIR 172
+#define MSG_MDS_UNHASHDIRACK 173
+#define MSG_MDS_UNHASHDIRNOTIFY 174
+#define MSG_MDS_UNHASHDIRNOTIFYACK 175
+
+#define MSG_MDS_DENTRYUNLINK 200
+
+#define MSG_MDS_RENAMEWARNING 300 // sent from src to bystanders
+#define MSG_MDS_RENAMENOTIFY 301 // sent from dest to bystanders
+#define MSG_MDS_RENAMENOTIFYACK 302 // sent back to src
+#define MSG_MDS_RENAMEACK 303 // sent from src to initiator, to xlock_finish
+
+#define MSG_MDS_RENAMEPREP 304 // sent from initiator to dest auth (if dir)
+#define MSG_MDS_RENAMEREQ 305 // sent from initiator (or dest if dir) to src auth
+#define MSG_MDS_RENAME 306 // sent from src to dest, includes inode
+
+#define MSG_MDS_LOCK 500
+
+#define MSG_MDS_SHUTDOWNSTART 900
+#define MSG_MDS_SHUTDOWNFINISH 901
+
+
+#include <stdlib.h>
+#include <cassert>
+
+#include <iostream>
+#include <list>
+using std::list;
+
+#include <ext/hash_map>
+#include <ext/rope>
+
+using __gnu_cxx::crope;
+
+#include "include/buffer.h"
+
+#include "tcp.h"
+
+
+
+
+// use fixed offsets and static entity -> logical addr mapping!
+//
+// Entity classes.  These values are stored in msg_addr_t::_type and are what
+// msg_addr_t::type_str() and the is_*() predicates compare against.
+// NOTE(review): a default-constructed msg_addr_t has _type 0 and _num 0, so
+// it compares equal to msg_addr_t(MSG_ADDR_NAMER_BASE, 0).
+#define MSG_ADDR_NAMER_BASE 0
+#define MSG_ADDR_RANK_BASE 1
+#define MSG_ADDR_MDS_BASE 2
+#define MSG_ADDR_OSD_BASE 3
+#define MSG_ADDR_MON_BASE 4
+#define MSG_ADDR_CLIENT_BASE 5
+
+// Placeholder instance number: msg_addr_t::is_new() is true when num() equals this.
+#define MSG_ADDR_NEW -1
+
+
+// new typed msg_addr_t way!
+// Typed logical address: an entity class (one of the MSG_ADDR_*_BASE
+// constants) paired with an instance number within that class.
+class msg_addr_t {
+public:
+  int _type;  // entity class (MSG_ADDR_*_BASE)
+  int _num;   // instance number; MSG_ADDR_NEW means "not assigned yet"
+
+  msg_addr_t() : _type(0), _num(0) {}
+  msg_addr_t(int t, int n) : _type(t), _num(n) {}
+
+  int num() const { return _num; }
+  int type() const { return _type; }
+
+  // Short lowercase name of the entity class, or "unknown" for any other value.
+  const char *type_str() const {
+    if (_type == MSG_ADDR_RANK_BASE) return "rank";
+    if (_type == MSG_ADDR_MDS_BASE) return "mds";
+    if (_type == MSG_ADDR_OSD_BASE) return "osd";
+    if (_type == MSG_ADDR_MON_BASE) return "mon";
+    if (_type == MSG_ADDR_CLIENT_BASE) return "client";
+    if (_type == MSG_ADDR_NAMER_BASE) return "namer";
+    return "unknown";
+  }
+
+  // True when the instance number is the MSG_ADDR_NEW placeholder.
+  bool is_new() const { return _num == MSG_ADDR_NEW; }
+
+  // Entity-class predicates.
+  bool is_client() const { return _type == MSG_ADDR_CLIENT_BASE; }
+  bool is_mds() const { return _type == MSG_ADDR_MDS_BASE; }
+  bool is_osd() const { return _type == MSG_ADDR_OSD_BASE; }
+  bool is_mon() const { return _type == MSG_ADDR_MON_BASE; }
+  bool is_namer() const { return _type == MSG_ADDR_NAMER_BASE; }
+};
+
+// msg_addr_t comparisons: equality needs both fields to match; ordering is
+// by type first and by instance number within the same type.
+inline bool operator== (const msg_addr_t& l, const msg_addr_t& r) {
+  return l._type == r._type && l._num == r._num;
+}
+inline bool operator!= (const msg_addr_t& l, const msg_addr_t& r) {
+  return !(l._type == r._type && l._num == r._num);
+}
+inline bool operator< (const msg_addr_t& l, const msg_addr_t& r) {
+  if (l._type != r._type)
+    return l._type < r._type;
+  return l._num < r._num;
+}
+
+// Print an address as "<type><num>", or as "<type>?" when the instance
+// number is negative (which covers the MSG_ADDR_NEW placeholder).
+inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) {
+  out << addr.type_str();
+  if (addr.num() < 0)
+    out << "?";
+  else
+    out << addr.num();
+  return out;
+}
+
+namespace __gnu_cxx {
+  // Hash functor so msg_addr_t can be used as a hash_map/hash_set key:
+  // hashes the XOR of the type and the instance number.
+  template<> struct hash< msg_addr_t >
+  {
+    size_t operator()( const msg_addr_t m ) const
+    {
+      const int mixed = m.type() ^ m.num();
+      return hash<int>()(mixed);
+    }
+  };
+}
+
+// Convenience constructors: build a msg_addr_t of the given entity class
+// with instance number x.
+#define MSG_ADDR_RANK(x) msg_addr_t(MSG_ADDR_RANK_BASE,x)
+#define MSG_ADDR_MDS(x) msg_addr_t(MSG_ADDR_MDS_BASE,x)
+#define MSG_ADDR_OSD(x) msg_addr_t(MSG_ADDR_OSD_BASE,x)
+#define MSG_ADDR_MON(x) msg_addr_t(MSG_ADDR_MON_BASE,x)
+#define MSG_ADDR_CLIENT(x) msg_addr_t(MSG_ADDR_CLIENT_BASE,x)
+#define MSG_ADDR_NAMER(x) msg_addr_t(MSG_ADDR_NAMER_BASE,x)
+
+// MSG_ADDR_UNDEF is a default-constructed address; MSG_ADDR_DIRECTORY is namer 0.
+// NOTE(review): both expand to type 0 / num 0, so the two compare equal.
+#define MSG_ADDR_UNDEF msg_addr_t()
+#define MSG_ADDR_DIRECTORY MSG_ADDR_NAMER(0)
+
+// Addresses whose instance number is the MSG_ADDR_NEW placeholder
+// (msg_addr_t::is_new() returns true for these).
+#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(MSG_ADDR_NEW)
+#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(MSG_ADDR_NEW)
+#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(MSG_ADDR_NEW)
+#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(MSG_ADDR_NEW)
+#define MSG_ADDR_NAMER_NEW MSG_ADDR_NAMER(MSG_ADDR_NEW)
+
+
+// A concrete endpoint instance: a TCP address plus a numeric rank.
+class entity_inst_t {
+ public:
+  tcpaddr_t addr;
+  __int64_t rank;
+
+  // Empty instance: zeroed address, rank -1.
+  entity_inst_t() : rank(-1) {
+    memset(&addr, 0, sizeof(addr));
+  }
+
+  // Instance at address a with rank r.
+  // (The address used to be wiped by a stray memset right after being
+  // copied from a, so the first argument was silently discarded.)
+  entity_inst_t(tcpaddr_t& a, int r) : addr(a), rank(r) {}
+
+  // Store a and derive rank from it: the low 32 bits are the raw s_addr
+  // value, and sin_port is OR'd in starting at bit 32.
+  void set_addr(tcpaddr_t a) {
+    addr = a;
+
+    // figure out rank
+    rank = *((unsigned*)&a.sin_addr.s_addr);
+    rank |= (__uint64_t)a.sin_port << 32;
+  }
+};
+
+// entity_inst_t comparisons.  Equality requires both rank and address to
+// match; the ordering operators look at rank only.
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.rank != b.rank)
+    return false;
+  return a.addr == b.addr;
+}
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) {
+  return !(a == b);
+}
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) {
+  return b.rank < a.rank;
+}
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) {
+  return !(a.rank < b.rank);
+}
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) {
+  return a.rank < b.rank;
+}
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) {
+  return !(b.rank < a.rank);
+}
+
+// Print an instance as its address only (the rank is not printed).
+// std::ostream is spelled out so this header does not depend on a
+// using-directive supplied by some other include.
+inline std::ostream& operator<<(std::ostream& out, const entity_inst_t &i)
+{
+  //return out << "rank" << i.rank << "_" << i.addr;
+  return out << i.addr;
+}
+
+
+// abstract Message class
+
+
+
+// Fixed-size message header.  The sender writes it to the socket as raw
+// bytes (sizeof the struct) ahead of the payload chunks, and the receiver
+// reads it back the same way.
+typedef struct {
+ int type; // message type; the receiver treats 0 as a dummy envelope
+ msg_addr_t source, dest; // logical source and destination addresses
+ entity_inst_t source_inst; // sending instance, stamped by the sender thread
+ int source_port, dest_port; // initialized to -1 by the Message constructors
+ int nchunks; // number of length-prefixed payload chunks that follow
+ __uint64_t lamport_send_stamp;
+ __uint64_t lamport_recv_stamp;
+} msg_envelope_t;
+
+#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t)
+
+
+// Abstract base class for all messages: an envelope (routing metadata)
+// plus a bufferlist payload.  Subclasses implement get_type_name() and
+// override one of the encode_payload/decode_payload pairs.
+class Message {
+ protected:
+  msg_envelope_t env;    // envelope
+  bufferlist payload;    // payload
+
+  friend class Messenger;
+
+ public:
+  // Message with no type assigned yet.  The type is set to 0 here (it used
+  // to be left uninitialized); call set_type() before sending.
+  Message() {
+    env.type = 0;
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  }
+  // Message of type t; addresses undefined, ports -1, no chunks.
+  Message(int t) {
+    env.type = t;
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  }
+  virtual ~Message() {
+  }
+
+  void set_lamport_send_stamp(__uint64_t t) { env.lamport_send_stamp = t; }
+  void set_lamport_recv_stamp(__uint64_t t) { env.lamport_recv_stamp = t; }
+  __uint64_t get_lamport_send_stamp() { return env.lamport_send_stamp; }
+  __uint64_t get_lamport_recv_stamp() { return env.lamport_recv_stamp; }
+
+
+  // for rpc-type procedural messages (pcid = procedure call id)
+  virtual long get_pcid() { return 0; }
+  virtual void set_pcid(long t) { assert(0); }  // overload me
+
+  void clear_payload() { payload.clear(); }
+  bool empty_payload() { return payload.length() == 0; }
+  bufferlist& get_payload() {
+    return payload;
+  }
+  // Hand bl over to this message via bufferlist::claim().
+  void set_payload(bufferlist& bl) {
+    payload.claim(bl);
+  }
+  msg_envelope_t& get_envelope() {
+    return env;
+  }
+  void set_envelope(msg_envelope_t& env) {
+    this->env = env;
+  }
+
+
+  // ENVELOPE ----
+
+  // type
+  int get_type() { return env.type; }
+  void set_type(int t) { env.type = t; }
+  virtual char *get_type_name() = 0;
+
+  // source/dest
+  msg_addr_t& get_dest() { return env.dest; }
+  void set_dest(msg_addr_t a, int p) { env.dest = a; env.dest_port = p; }
+  int get_dest_port() { return env.dest_port; }
+
+  msg_addr_t& get_source() { return env.source; }
+  void set_source(msg_addr_t a, int p) { env.source = a; env.source_port = p; }
+  int get_source_port() { return env.source_port; }
+
+  entity_inst_t& get_source_inst() { return env.source_inst; }
+  void set_source_inst(const entity_inst_t &i) { env.source_inst = i; }
+
+  // PAYLOAD ----
+  void reset_payload() {
+    payload.clear();
+  }
+
+  // overload either the rope versions (easier!)
+  virtual void encode_payload(crope& s) { assert(0); }
+  virtual void decode_payload(crope& s, int& off) { assert(0); }
+
+  // or the bufferlist versions (faster!)
+
+  // Default decode: flatten the payload buffers into a crope, run the rope
+  // decoder over it, and check that it consumed exactly the payload length.
+  virtual void decode_payload() {
+    // use a crope for convenience, small messages, etc.  FIXME someday.
+    crope ser;
+    for (list<bufferptr>::const_iterator it = payload.buffers().begin();
+         it != payload.buffers().end();
+         ++it)
+      ser.append((*it).c_str(), (*it).length());
+
+    int off = 0;
+    decode_payload(ser, off);
+    assert((unsigned)off == payload.length());
+  }
+  // Default encode: run the rope encoder and copy the result into the
+  // payload as a single buffer.  The payload must be empty on entry.
+  virtual void encode_payload() {
+    assert(payload.length() == 0);  // caller should reset payload
+
+    // use crope for convenience, small messages.  FIXME someday.
+    crope r;
+    encode_payload(r);
+
+    // copy payload
+    payload.push_back( buffer::copy(r.c_str(), r.length()) );
+  }
+
+  // Human-readable form; the default prints just the type name.
+  virtual void print(std::ostream& out) {
+    out << get_type_name();
+  }
+
+};
+
+extern Message *decode_message(msg_envelope_t &env, bufferlist& bl);
+// Stream a message by delegating to its virtual print().
+// std::ostream is spelled out so this header does not depend on a
+// using-directive supplied by some other include.
+inline std::ostream& operator<<(std::ostream& out, Message& m) {
+  m.print(out);
+  return out;
+}
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <ext/rope>
+#include "include/types.h"
+
+#include "Message.h"
+#include "Messenger.h"
+#include "messages/MGenericMessage.h"
+
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL 10 // debug level of output
+
+
+
+// --------
+// callbacks
+
+Mutex msgr_callback_lock;
+list<Context*> msgr_callback_queue;
+//Context* msgr_callback_kicker = 0;
+
+// Append one callback to the global callback queue (under
+// msgr_callback_lock), then call callback_kick().
+void Messenger::queue_callback(Context *c) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.insert(msgr_callback_queue.end(), c);
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+// Move every callback in ls onto the tail of the global callback queue
+// (ls is left empty), then call callback_kick().
+void Messenger::queue_callbacks(list<Context*>& ls) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.insert(msgr_callback_queue.end(), ls.begin(), ls.end());
+  ls.clear();
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+
+// Drain the global callback queue: grab everything queued so far under the
+// lock, then (outside the lock) run finish(0) on each callback and delete it.
+void Messenger::do_callbacks() {
+  // take list
+  list<Context*> pending;
+  msgr_callback_lock.Lock();
+  pending.swap(msgr_callback_queue);
+  msgr_callback_lock.Unlock();
+
+  // do them, in queue order
+  while (!pending.empty()) {
+    Context *cb = pending.front();
+    pending.pop_front();
+    dout(10) << "--- doing callback " << cb << endl;
+    cb->finish(0);
+    delete cb;
+  }
+}
+
+// ---------
+// incoming messages
+
+// Forward an incoming message to the registered dispatcher.
+// A dispatcher must have been set beforehand (asserted).
+void Messenger::dispatch(Message *m)
+{
+  Dispatcher *target = dispatcher;
+  assert(target);
+  target->dispatch(m);
+}
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MESSENGER_H
+#define __MESSENGER_H
+
+#include <map>
+using namespace std;
+
+#include "Message.h"
+#include "Dispatcher.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "include/Context.h"
+
+
+typedef __uint64_t lamport_t;
+
+
+class MDS;
+class Timer;
+
+// Abstract messaging endpoint.  Holds the endpoint's logical address and
+// instance, the Dispatcher that receives incoming messages, and the
+// interface (send_message, shutdown, ...) that concrete messengers implement.
+class Messenger {
+ private:
+ Dispatcher *dispatcher; // null until set_dispatcher() is called
+ msg_addr_t _myaddr;
+ entity_inst_t _myinst;
+
+
+ public:
+ Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w) { }
+ virtual ~Messenger() { }
+
+ const entity_inst_t &get_myinst() { return _myinst; }
+ void set_myinst(entity_inst_t& v) { _myinst = v; }
+
+ msg_addr_t get_myaddr() { return _myaddr; }
+ // Overwrites the stored address only.
+ void _set_myaddr(msg_addr_t m) { _myaddr = m; }
+
+ virtual void reset_myaddr(msg_addr_t m) = 0;
+
+
+ virtual int shutdown() = 0;
+
+ // callbacks
+ // do_callbacks() runs and deletes everything queued so far; the queue is a
+ // single global list shared by all messengers.
+ static void do_callbacks();
+
+ // Both queue_* calls append to the global queue and then call callback_kick().
+ void queue_callback(Context *c);
+ void queue_callbacks(list<Context*>& ls);
+ virtual void callback_kick() = 0;
+
+ virtual int get_dispatch_queue_len() { return 0; };
+
+ // setup
+ // set_dispatcher() stores d and then calls the ready() hook.
+ void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); }
+ Dispatcher *get_dispatcher() { return dispatcher; }
+ virtual void ready() { }
+ // True once a non-null dispatcher has been set.
+ bool is_ready() { return dispatcher != 0; }
+
+ // dispatch incoming messages
+ // The base implementation asserts a dispatcher is set and forwards m to it.
+ virtual void dispatch(Message *m);
+
+ // send message
+ virtual void prepare_dest(const entity_inst_t& inst) {}
+ //virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;
+ virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+ int port=0, int fromport=0) = 0;
+
+
+ // make a procedure call
+ //virtual Message* sendrecv(Message *m, msg_addr_t dest, int port=0);
+
+
+ // Liveness hooks; no-ops in the base class.
+ virtual void mark_down(msg_addr_t a, entity_inst_t& i) {}
+ virtual void mark_up(msg_addr_t a, entity_inst_t& i) {}
+ //virtual void reset(msg_addr_t a) { mark_down(a); mark_up(a); }
+
+};
+
+
+
+
+
+#endif
--- /dev/null
+
+#include "NewMessenger.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+//#include "messages/MFailure.h"
+
+#include <netdb.h>
+
+
+#undef dout
+#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " "
+#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " "
+
+
+
+#include "tcp.cc"
+
+
+Rank rank;
+
+
+/********************************************
+ * Namer
+ */
+
+// Construct the name server on top of msgr.  Only valid on rank 0
+// (asserted).  All id counters start at 0 except nrank, which is then set
+// to g_conf.num_mon.  Finally the namer installs itself as msgr's dispatcher.
+Rank::Namer::Namer(EntityMessenger *msgr) :
+ messenger(msgr),
+ nrank(0), nclient(0), nmds(0), nosd(0), nmon(0)
+{
+ assert(rank.my_rank == 0);
+ nrank = g_conf.num_mon;
+
+ // announce myself
+ // (disabled: used to print the listen address and write it to .ceph_ns)
+ /*
+ cerr << "ceph ns is " << rank.accepter.listen_addr << endl;
+ cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl;
+ int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT);
+ ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t));
+ ::fchmod(fd, 0755);
+ ::close(fd);
+ */
+
+ // ok
+ messenger->set_dispatcher(this);
+}
+
+// Nothing to release; removal of the .ceph_ns file is disabled, matching
+// the disabled announcement in the constructor.
+Rank::Namer::~Namer()
+{
+ //::unlink(".ceph_ns");
+}
+
+
+// Entry point for messages addressed to the namer.  Holds rank.lock for
+// the duration of the call and routes by message type; an unrecognized
+// type trips an assert.
+void Rank::Namer::dispatch(Message *m)
+{
+  rank.lock.Lock();
+
+  switch (m->get_type()) {
+  case MSG_NS_CONNECT:
+    handle_connect((class MNSConnect*)m);
+    break;
+
+  case MSG_NS_REGISTER:
+    handle_register((class MNSRegister*)m);
+    break;
+
+  case MSG_NS_STARTED:
+    handle_started(m);
+    break;
+
+  case MSG_NS_UNREGISTER:
+    handle_unregister(m);
+    break;
+
+  case MSG_NS_LOOKUP:
+    handle_lookup((class MNSLookup*)m);
+    break;
+
+  case MSG_NS_FAILURE:
+    handle_failure((class MNSFailure*)m);
+    break;
+
+  case MSG_FAILURE_ACK:
+    // nothing to do but discard it
+    delete m;
+    break;
+
+  default:
+    assert(0);
+  }
+
+  rank.lock.Unlock();
+}
+
+// A new process is connecting: allocate the next rank number, record its
+// address in the entity map, mark it unstarted, and reply with an
+// MNSConnectAck carrying the new rank.  Consumes m.
+void Rank::Namer::handle_connect(MNSConnect *m)
+{
+  const int newrank = nrank++;
+  const msg_addr_t newaddr = MSG_ADDR_RANK(newrank);
+  dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl;
+
+  entity_inst_t& inst = rank.entity_map[newaddr];
+  inst.addr = m->get_addr();
+  inst.rank = newrank;
+  rank.entity_unstarted.insert(newaddr);
+
+  messenger->send_message(new MNSConnectAck(newrank), newaddr, inst);
+  delete m;
+}
+
+// Record inst in the entity map under the rank address derived from inst.rank.
+void Rank::Namer::manual_insert_inst(const entity_inst_t &inst)
+{
+  const msg_addr_t where = MSG_ADDR_RANK(inst.rank);
+  rank.entity_map[where] = inst;
+}
+
+// Register an entity.  If the requested address is a "new" placeholder, a
+// fresh id is allocated from the per-type counter (mds, osd and client
+// only; any other type asserts).  The entity is mapped to the sender's
+// instance, marked unstarted, and an MNSRegisterAck with the final
+// address is sent back.  Consumes m.
+void Rank::Namer::handle_register(MNSRegister *m)
+{
+  dout(10) << "namer.handle_register from rank " << m->get_rank()
+           << " addr " << m->get_entity() << endl;
+
+  // pick id
+  msg_addr_t entity = m->get_entity();
+  if (entity.is_new()) {
+    // make up a new address!
+    const int kind = entity.type();
+    if (kind == MSG_ADDR_MDS_BASE) {
+      entity = MSG_ADDR_MDS(nmds++);
+    } else if (kind == MSG_ADDR_OSD_BASE) {
+      entity = MSG_ADDR_OSD(nosd++);
+    } else if (kind == MSG_ADDR_CLIENT_BASE) {
+      entity = MSG_ADDR_CLIENT(nclient++);
+    } else {
+      assert(0);
+    }
+  }
+  // otherwise the caller asked for a specific address; use it as-is.
+
+  // register
+  const bool already_known = rank.entity_map.count(entity) != 0;
+  if (already_known) {
+    dout(1) << "namer.handle_register re-registering " << entity
+            << " inst " << m->get_source_inst()
+            << " (was " << rank.entity_map[entity] << ")"
+            << endl;
+  } else {
+    dout(1) << "namer.handle_register registering " << entity
+            << " inst " << m->get_source_inst()
+            << endl;
+  }
+  entity_inst_t& slot = rank.entity_map[entity];
+  slot = m->get_source_inst();
+  rank.entity_unstarted.insert(entity);
+
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity),
+                          m->get_source(), slot);
+
+  delete m;
+}
+
+// An entity reports that it has started: remove it from the unstarted set
+// and re-dispatch any messages that were parked in `waiting` for it.
+// Consumes m (previously the message was never freed, unlike every other
+// handler in this class).
+void Rank::Namer::handle_started(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(10) << "namer.handle_started from entity " << who << endl;
+
+  assert(rank.entity_unstarted.count(who));
+  rank.entity_unstarted.erase(who);
+
+  // anybody waiting?
+  if (waiting.count(who)) {
+    // detach the list first so re-dispatched handlers can park new waiters
+    list<Message*> ls;
+    ls.swap(waiting[who]);
+    waiting.erase(who);
+
+    dout(10) << "doing waiters on " << who << endl;
+    // NOTE(review): dispatch() takes rank.lock again while this handler is
+    // running under it; that relies on the Mutex being recursive -- confirm.
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         ++it)
+      dispatch(*it);
+  }
+
+  delete m;
+}
+
+// Remove the sending entity from the entity map (it must be present).
+// When exactly two entries remain afterwards, the namer shuts down and
+// deletes its own messenger.  Consumes m.
+void Rank::Namer::handle_unregister(Message *m)
+{
+ msg_addr_t who = m->get_source();
+ dout(1) << "namer.handle_unregister entity " << who << endl;
+
+ rank.show_dir();
+
+ assert(rank.entity_map.count(who));
+ rank.entity_map.erase(who);
+
+ rank.show_dir();
+
+ // shut myself down? kick watcher.
+ // NOTE(review): the 2 presumably stands for the namer and rank entries
+ // that are still registered -- confirm.
+ if (rank.entity_map.size() == 2) {
+ dout(10) << "namer.handle_unregister stopping namer" << endl;
+ // rank.lock is dropped around shutdown() and re-taken afterwards
+ rank.lock.Unlock();
+ messenger->shutdown();
+ // NOTE(review): `messenger` is left dangling after this delete.
+ delete messenger;
+ rank.lock.Lock();
+ }
+
+ delete m;
+}
+
+
+// Answer a lookup request.  If the entity is unknown or has not reported
+// "started" yet, the request is parked in `waiting` (and kept alive);
+// otherwise an MNSLookupReply with the entity's instance is sent back and
+// m is deleted.
+void Rank::Namer::handle_lookup(MNSLookup *m)
+{
+  const msg_addr_t who = m->get_entity();
+
+  // have it, and is it up?
+  const char *why_not = 0;
+  if (rank.entity_map.count(who) == 0)
+    why_not = "' -> dne";
+  else if (rank.entity_unstarted.count(who))
+    why_not = "' -> unstarted";
+
+  if (why_not) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << who << why_not << endl;
+    waiting[who].push_back(m);
+    return;
+  }
+
+  // look it up!
+  const entity_inst_t& found = rank.entity_map[who];
+  MNSLookupReply *reply = new MNSLookupReply(m);
+  reply->entity_map[who] = found;
+
+  dout(10) << "namer " << m->get_source()
+           << " lookup '" << who
+           << "' -> " << found << endl;
+
+  messenger->send_message(reply, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+// An instance has been reported as failed: drop every entity that is
+// mapped to that instance from both the entity map and the unstarted
+// set.  Consumes m.
+void Rank::Namer::handle_failure(MNSFailure *m)
+{
+  dout(10) << "namer.handle_failure inst " << m->get_inst()
+           << endl;
+
+  // first pass: collect the entities living on this instance
+  // (collected up front so the map is not modified while iterating)
+  list<msg_addr_t> doomed;
+  for (hash_map<msg_addr_t,entity_inst_t>::iterator p = rank.entity_map.begin();
+       p != rank.entity_map.end();
+       p++) {
+    if (p->second == m->get_inst())
+      doomed.push_back(p->first);
+  }
+
+  // second pass: remove them
+  while (!doomed.empty()) {
+    const msg_addr_t gone = doomed.front();
+    doomed.pop_front();
+
+    dout(10) << "namer.handle_failure inst " << m->get_inst()
+             << ", removing " << gone << endl;
+
+    rank.entity_map.erase(gone);
+    rank.entity_unstarted.erase(gone);
+
+    /*
+    if (gone.is_osd()) {
+      // tell the monitor
+      messenger->send_message(new MFailure(gone, m->get_inst()), MSG_ADDR_MON(0));
+    }
+    */
+  }
+
+  delete m;
+}
+
+
+
+/********************************************
+ * Accepter
+ */
+
+// Create the listening socket on an ephemeral port, work out the address
+// other ranks should use to reach us (this host's first resolved address
+// plus the bound port), store it in listen_addr, and start the accept
+// thread.
+//
+// Returns 0 on success, or -1 if the local hostname cannot be resolved to
+// an IPv4 address.  Socket setup failures still trip asserts.
+int Rank::Accepter::start()
+{
+  // bind to a socket
+  dout(10) << "accepter.start binding to listen " << endl;
+
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd > 0);
+
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;   // let the kernel pick a port
+
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we got
+  socklen_t llen = sizeof(listen_addr);
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+  assert(rc >= 0);
+
+  int myport = listen_addr.sin_port;   // kept as returned (network byte order)
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  //dout(10) << "accepter.start listening on " << myport << endl;
+
+  // my address is...
+  char host[100];
+  memset(host, 0, sizeof(host));
+  gethostname(host, sizeof(host) - 1);   // last byte stays NUL even on truncation
+  //dout(10) << "accepter.start my hostname is " << host << endl;
+
+  struct sockaddr_in my_addr;
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  // gethostbyname() returns NULL on failure; also refuse anything that
+  // would not fit the IPv4 address field we copy into.
+  struct hostent *myhostname = gethostbyname( host );
+  if (!myhostname ||
+      !myhostname->h_addr_list[0] ||
+      myhostname->h_length < 0 ||
+      (size_t)myhostname->h_length > sizeof(my_addr.sin_addr.s_addr)) {
+    derr(0) << "accepter.start unable to resolve my hostname '" << host << "'" << endl;
+    ::close(listen_sd);
+    return -1;
+  }
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr,
+         myhostname->h_addr_list[0],
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  listen_addr = my_addr;
+
+  dout(10) << "accepter.start listen addr is " << listen_addr << endl;
+
+  // start thread
+  create();
+
+  return 0;
+}
+
+// Accept-thread body: accept connections until `done` is set or accept()
+// stops returning a positive descriptor.  Each accepted socket gets its own
+// Receiver thread, which is then recorded in rank.receivers under rank.lock.
+void *Rank::Accepter::entry()
+{
+  dout(10) << "accepter starting" << endl;
+
+  while (!done) {
+    // accept
+    struct sockaddr_in peer;
+    socklen_t peerlen = sizeof(peer);
+    const int sd = ::accept(listen_sd, (sockaddr*)&peer, &peerlen);
+    if (sd <= 0) {
+      dout(10) << "no incoming connection?" << endl;
+      break;
+    }
+
+    dout(10) << "accepted incoming on sd " << sd << endl;
+
+    Receiver *receiver = new Receiver(sd);
+    receiver->create();
+
+    rank.lock.Lock();
+    rank.receivers.insert(receiver);
+    rank.lock.Unlock();
+  }
+
+  return 0;
+}
+
+
+/**************************************
+ * Receiver
+ */
+
+// Receiver-thread body: read messages off the socket until `done` is set
+// or a read fails, and route each one under rank.lock:
+//   - destination marked down     -> drop it and (re)start a lookup
+//   - source instance is older than the one on record -> drop it
+//   - otherwise deliver: rank messages go to rank.dispatch(), the rest go
+//     to the single dispatch queue, a local entity's queue, or are parked
+//     in waiting_for_lookup.
+// On exit the receiver puts itself on the reap queue.
+//
+// Two fixes over the previous version: the destination is copied before a
+// dropped message is deleted (it used to be read after `delete m`), and the
+// "newer source instance" check now tests `<` (it repeated the `>` test of
+// the branch above, so it could never fire).
+void *Rank::Receiver::entry()
+{
+  while (!done) {
+    Message *m = read_message();
+    if (!m) {
+      ::close(sd);
+      break;
+    }
+
+    dout(10) << "receiver.entry got message for " << m->get_dest() << endl;
+
+    EntityMessenger *entity = 0;
+
+    rank.lock.Lock();
+    {
+      if (rank.down.count(m->get_dest())) {
+        // keep our own copy of the address; m is about to go away
+        const msg_addr_t dest = m->get_dest();
+        dout(0) << "receiver.entry dest " << dest << " down, dropping " << *m << endl;
+        delete m;
+
+        if (rank.looking_up.count(dest) == 0)
+          rank.lookup(dest);
+      }
+      else if (rank.entity_map.count(m->get_source()) &&
+               rank.entity_map[m->get_source()] > m->get_source_inst()) {
+        // sender is an older instance than the one we know about
+        derr(0) << "receiver.entry source " << m->get_source()
+                << " inst " << m->get_source_inst()
+                << " < " << rank.entity_map[m->get_source()]
+                << ", dropping " << *m << endl;
+        delete m;
+      }
+      else {
+        if (rank.entity_map.count(m->get_source()) &&
+            rank.entity_map[m->get_source()] < m->get_source_inst()) {
+          // sender is a newer instance than the one on record: adopt it
+          derr(0) << "receiver.entry source " << m->get_source()
+                  << " inst " << m->get_source_inst()
+                  << " > " << rank.entity_map[m->get_source()]
+                  << ", WATCH OUT " << *m << endl;
+          rank.entity_map[m->get_source()] = m->get_source_inst();
+        }
+
+        if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
+          // ours.
+          rank.dispatch(m);
+        } else {
+          if (g_conf.ms_single_dispatch) {
+            // submit to single dispatch queue
+            rank._submit_single_dispatch(m);
+          } else {
+            if (rank.local.count(m->get_dest())) {
+              // find entity
+              entity = rank.local[m->get_dest()];
+            } else {
+              derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+              rank.waiting_for_lookup[m->get_dest()].push_back(m);
+            }
+          }
+        }
+      }
+    }
+    rank.lock.Unlock();
+
+    // hand off to the local entity outside of rank.lock
+    if (entity)
+      entity->queue_message(m);  // queue
+  }
+
+  // add to reap queue
+  rank.lock.Lock();
+  rank.receiver_reap_queue.push_back(this);
+  rank.wait_cond.Signal();
+  rank.lock.Unlock();
+
+  return 0;
+}
+
+// Read one message from the socket: a raw msg_envelope_t followed by
+// env.nchunks chunks, each a native int length and then that many bytes.
+//
+// Returns the decoded message, or 0 if any read fails, the envelope is a
+// dummy (type 0), or a chunk length is negative.  The chunk-length read
+// used to be unchecked, so a failed read left `size` uninitialized.
+Message *Rank::Receiver::read_message()
+{
+  // envelope
+  //dout(10) << "receiver.read_message from sd " << sd << endl;
+
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+    return 0;
+
+  if (env.type == 0) {
+    dout(10) << "receiver got dummy env, bailing" << endl;
+    return 0;
+  }
+
+  dout(20) << "receiver got envelope type=" << env.type
+           << " src " << env.source << " dst " << env.dest
+           << " nchunks=" << env.nchunks
+           << endl;
+
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    int size;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) ))
+      return 0;
+    if (size < 0) {
+      derr(0) << "receiver got bad frag len " << size << ", bailing" << endl;
+      return 0;
+    }
+
+    bufferptr bp = new buffer(size);
+
+    if (!tcp_read( sd, bp.c_str(), size )) return 0;
+
+    blist.push_back(bp);
+
+    dout(20) << "receiver got frag " << i << " of " << env.nchunks
+             << " len " << bp.length() << endl;
+  }
+
+  // unmarshall message
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+
+  // NOTE(review): guarded in case decode_message() can return null for an
+  // unknown type; the caller already treats a null result as end-of-stream.
+  if (m) {
+    dout(20) << "receiver got " << s << " byte message from "
+             << m->get_source() << endl;
+  }
+
+  return m;
+}
+
+
+/**************************************
+ * Sender
+ */
+
+// Open a TCP connection to inst.addr: create the socket, bind it to any
+// local port, then connect.
+//
+// Returns 0 on success or the negative result of ::connect() on failure.
+// Socket creation and bind failures trip asserts.
+int Rank::Sender::connect()
+{
+  dout(10) << "sender(" << inst << ").connect" << endl;
+
+  // create socket?
+  sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd > 0);
+
+  // bind any port
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));   // don't hand uninitialized padding to bind()
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );
+
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!
+  int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr));
+  if (r < 0) return r;
+
+  // identify myself
+  // FIXME
+
+  return 0;
+}
+
+
+// Called as the sender thread winds down: put this sender on the rank's
+// reap queue and signal wait_cond, all under rank.lock.
+void Rank::Sender::finish()
+{
+  dout(10) << "sender(" << inst << ").finish" << endl;
+
+  // make sure i get reaped.
+  Sender *self = this;
+  rank.lock.Lock();
+  rank.sender_reap_queue.insert(rank.sender_reap_queue.end(), self);
+  rank.wait_cond.Signal();
+  rank.lock.Unlock();
+}
+
+// Handle a connection or send failure on this sender.
+//
+// `out` holds messages that were taken off the queue but not sent; they are
+// spliced back onto the front of q.  Every queued message is then either
+// resubmitted through rank.submit_message() (when
+// g_conf.ms_requeue_on_sender_fail is set) or moved to a local `lost` list.
+// Destinations still mapped to this instance are added to rank.down, the
+// sender removes itself from rank.rank_sender, and `done` is set so the
+// sender loop stops.  Lost messages are reported through the dispatcher of
+// the first message whose source is a local entity.
+//
+// Takes rank.lock and then this sender's own lock.
+void Rank::Sender::fail_and_requeue(list<Message*>& out)
+{
+ dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl;
+
+ // tell namer
+ if (!rank.messenger) {
+ derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl;
+ assert(0);
+ }
+
+ // old and unnecessary?
+ // (dead code: the condition is a constant 0)
+ if (0)
+ rank.messenger->send_message(new MNSFailure(inst),
+ MSG_ADDR_NAMER(0));
+
+
+ // FIXME: possible race before i reclaim lock here?
+
+ Dispatcher *dis = 0; // dispatcher that will be told about lost messages
+ msg_addr_t dis_dest; // destination of the message that selected `dis`
+
+ list<Message*> lost;
+
+ // requeue my messages
+ rank.lock.Lock();
+ lock.Lock();
+ {
+ // include out at front of queue
+ q.splice(q.begin(), out);
+ dout(10) << "sender(" << inst << ").fail "
+ << q.size() << " messages" << endl;
+
+ if (0) {
+ lost.swap(q);
+ } else {
+
+ while (!q.empty()) {
+ // don't keep reconnecting..
+ // (only marks the dest down while it is still mapped to this instance)
+ if (rank.entity_map.count(q.front()->get_dest()) &&
+ rank.entity_map[q.front()->get_dest()] == inst)
+ rank.down.insert(q.front()->get_dest());
+ //rank.entity_map.erase(q.front()->get_dest());
+
+ // remember the first local sender's dispatcher for the failure report
+ if (!dis &&
+ rank.local.count(q.front()->get_source())) {
+ dis_dest = q.front()->get_dest();
+ dis = rank.local[q.front()->get_source()]->get_dispatcher();
+ }
+
+ if (g_conf.ms_requeue_on_sender_fail)
+ rank.submit_message( q.front() );
+ else
+ lost.push_back( q.front() );
+ q.pop_front();
+ }
+ }
+
+ // deactivate myself
+ if (rank.rank_sender.count(inst.rank) &&
+ rank.rank_sender[inst.rank] == this)
+ rank.rank_sender.erase(inst.rank);
+
+ // stop sender loop
+ done = true;
+ }
+ lock.Unlock();
+
+
+ // send special failure msg?
+ // NOTE(review): every lost message is reported with the single dis_dest
+ // captured above, and when no dispatcher was found the lost messages are
+ // neither reported nor freed here -- confirm that is intended.
+ if (dis) {
+ for (list<Message*>::iterator p = lost.begin();
+ p != lost.end();
+ p++)
+ dis->ms_handle_failure(*p, dis_dest, inst);
+ }
+
+ rank.lock.Unlock();
+}
+
+// Sender-thread body.  Connects if there is no socket yet (sd == 0), then
+// loops: take the whole outgoing queue, drop the lock, and write each
+// message; sleep on `cond` while the queue is empty.  The loop ends once
+// `done` is set and the queue is empty.  Any connect or write failure goes
+// through fail_and_requeue().  Calls finish() before returning.
+void *Rank::Sender::entry()
+{
+ // connect
+ if (sd == 0) {
+ int rc = connect();
+ if (rc < 0) {
+ list<Message*> out; // nothing in flight yet
+ derr(0) << "error connecting to " << inst << endl;
+ fail_and_requeue(out);
+ finish();
+ return 0;
+ }
+ }
+
+ lock.Lock();
+ while (!q.empty() || !done) {
+
+ if (!q.empty()) {
+ dout(20) << "sender(" << inst << ") grabbing message(s)" << endl;
+
+ // grab outgoing list
+ list<Message*> out;
+ out.swap(q);
+
+ // drop lock while i send these
+ lock.Unlock();
+
+ while (!out.empty()) {
+ Message *m = out.front();
+ out.pop_front();
+
+ dout(20) << "sender(" << inst << ") sending " << *m << endl;
+
+ // stamp.
+ m->set_source_inst(rank.my_inst);
+
+ // marshall
+ // (only when the payload is empty; an existing payload is sent as-is)
+ if (m->empty_payload())
+ m->encode_payload();
+
+ // write_message() deletes m on success; on failure m is still ours
+ if (write_message(m) < 0) {
+ // failed!
+ derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl;
+ // put the failed message back in front of the unsent ones
+ out.push_front(m);
+ fail_and_requeue(out);
+ break;
+ }
+ }
+
+ // re-take the lock before re-checking the loop condition
+ lock.Lock();
+ continue;
+ }
+
+ // wait
+ dout(20) << "sender(" << inst << ") sleeping" << endl;
+ cond.Wait(lock);
+ }
+ lock.Unlock();
+
+ finish();
+ return 0;
+}
+
+
+// Write one message to the socket: the raw envelope first, then the payload.
+//
+// With TCP_KEEP_CHUNKS defined, each payload buffer goes out as its own
+// length-prefixed chunk and env->nchunks is the buffer count.  Otherwise
+// nchunks is 1 and a single total length is followed by the bytes of every
+// buffer in turn.
+//
+// Returns 0 on success, in which case m has been deleted.  Returns -1 on
+// any write error, in which case m is NOT deleted and the caller keeps it.
+int Rank::Sender::write_message(Message *m)
+{
+ // get envelope, buffers
+ msg_envelope_t *env = &m->get_envelope();
+ bufferlist blist;
+ // NOTE(review): claim() presumably moves the buffers out of the message,
+ // leaving its payload empty -- confirm against bufferlist.
+ blist.claim( m->get_payload() );
+
+#ifdef TCP_KEEP_CHUNKS
+ env->nchunks = blist.buffers().size();
+#else
+ env->nchunks = 1;
+#endif
+
+ dout(20)// << g_clock.now()
+ << " sending " << m << " " << *m
+ << " to " << m->get_dest()
+ << endl;
+
+ // send envelope
+ int r = tcp_write( sd, (char*)env, sizeof(*env) );
+ if (r < 0) {
+ derr(20) << "error sending envelope for " << *m
+ << " to " << m->get_dest() << endl;
+ return -1;
+ }
+
+ // payload
+#ifdef TCP_KEEP_CHUNKS
+ // send chunk-wise
+ int i = 0;
+ for (list<bufferptr>::iterator it = blist.buffers().begin();
+ it != blist.buffers().end();
+ it++) {
+ dout(10) << "tcp_sending frag " << i << " len " << (*it).length() << endl;
+ int size = (*it).length();
+ r = tcp_write( sd, (char*)&size, sizeof(size) );
+ if (r < 0) {
+ derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl;
+ return -1;
+ }
+ r = tcp_write( sd, (*it).c_str(), size );
+ if (r < 0) {
+ derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl;
+ return -1;
+ }
+ i++;
+ }
+#else
+ // one big chunk
+ // (a single length header, then each buffer's bytes back to back)
+ int size = blist.length();
+ r = tcp_write( sd, (char*)&size, sizeof(size) );
+ if (r < 0) {
+ derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl;
+ return -1;
+ }
+ for (list<bufferptr>::iterator it = blist.buffers().begin();
+ it != blist.buffers().end();
+ it++) {
+ r = tcp_write( sd, (*it).c_str(), (*it).length() );
+ if (r < 0) {
+ derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
+ return -1;
+ }
+ }
+#endif
+
+ // delete message
+ delete m;
+ return 0;
+}
+
+
+
+/********************************************
+ * Rank
+ */
+
+// Construct a rank with number r and no namer.  The single-dispatcher
+// helper is given a pointer back to this Rank.
+// NOTE(review): single_dispatch_stop and messenger are not set here; the
+// only instance visible in this file is the global `rank`.
+Rank::Rank(int r) :
+ single_dispatcher(this),
+ my_rank(r),
+ namer(0) {
+}
+// Release the namer, if one was ever created.
+Rank::~Rank()
+{
+  //FIXME
+  delete namer;  // deleting a null pointer is a no-op
+}
+
+
+// Queue m for the single dispatcher thread if its destination is a local
+// entity whose messenger is ready; otherwise park it in waiting_for_ready.
+// Caller must hold the rank lock (asserted).
+void Rank::_submit_single_dispatch(Message *m)
+{
+  assert(lock.is_locked());
+
+  const msg_addr_t dest = m->get_dest();
+  const bool deliverable = local.count(dest) && local[dest]->is_ready();
+
+  if (!deliverable) {
+    waiting_for_ready[dest].push_back(m);
+    return;
+  }
+
+  rank.single_dispatch_queue.push_back(m);
+  rank.single_dispatch_cond.Signal();
+}
+
+
+// Body of the single dispatcher thread.  Repeatedly takes the whole
+// single_dispatch_queue under the lock, drops the lock, and dispatches each
+// message (rank messages to rank.dispatch(), others to the local entity's
+// messenger).  Sleeps on single_dispatch_cond while the queue is empty, and
+// exits once single_dispatch_stop is set and the queue is drained.
+void Rank::single_dispatcher_entry()
+{
+ lock.Lock();
+ while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
+ if (!single_dispatch_queue.empty()) {
+ list<Message*> ls;
+ ls.swap(single_dispatch_queue);
+
+ // dispatch without holding the lock
+ lock.Unlock();
+ {
+ while (!ls.empty()) {
+ Message *m = ls.front();
+ ls.pop_front();
+
+ dout(1) //<< g_clock.now()
+ << "---- "
+ << m->get_source() << ':' << m->get_source_port()
+ << " to " << m->get_dest() << ':' << m->get_dest_port()
+ << " ---- " << m->get_type_name()
+ << " ---- " << m
+ << endl;
+
+ if (m->get_dest().type() == MSG_ADDR_RANK_BASE)
+ rank.dispatch(m);
+ else {
+ // NOTE(review): `local` is read here without the lock held.
+ assert(local.count(m->get_dest()));
+ local[m->get_dest()]->dispatch(m);
+ }
+ }
+ }
+ lock.Lock();
+ continue;
+ }
+ single_dispatch_cond.Wait(lock);
+ }
+ lock.Unlock();
+}
+
+
+/*
+ * reaper
+ * Join and free every Receiver and Sender thread that has put itself on
+ * a reap queue.
+ * note: assumes lock is held (asserted below)
+ */
+void Rank::reaper()
+{
+  assert(lock.is_locked());
+
+  // finished receivers
+  while (!receiver_reap_queue.empty()) {
+    Receiver *dead = receiver_reap_queue.front();
+    receiver_reap_queue.pop_front();
+    //dout(10) << "reaper reaping receiver sd " << dead->sd << endl;
+    receivers.erase(dead);
+    dead->join();
+    dout(10) << "reaper reaped receiver sd " << dead->sd << endl;
+    delete dead;
+  }
+
+  // finished senders
+  while (!sender_reap_queue.empty()) {
+    Sender *dead = sender_reap_queue.front();
+    sender_reap_queue.pop_front();
+    //dout(10) << "reaper reaping sender rank " << dead->dest_rank << " at " << dead->tcpaddr << endl;
+
+    // only unregister it if it is still the sender on record for that rank
+    if (rank_sender.count(dead->inst.rank) &&
+        rank_sender[dead->inst.rank] == dead)
+      rank_sender.erase(dead->inst.rank);
+
+    dead->join();
+    dout(10) << "reaper reaped sender " << dead->inst << endl;
+    delete dead;
+  }
+}
+
+
+// Bring this rank up: start the accepter, optionally start the single
+// dispatcher thread, and create the rank's own EntityMessenger.
+//
+// If my_rank is negative, a rank number is first requested from namer0 (an
+// MNSConnect carrying our listen address) and the call blocks on
+// waiting_for_rank until my_rank becomes non-negative.  Otherwise my_inst is
+// filled in from the listen address and the preset rank.
+//
+// Returns 0 on success, -1 if the accepter fails to start.
+int Rank::start_rank()
+{
+ dout(10) << "start_rank" << endl;
+
+ // bind to a socket
+ if (accepter.start() < 0)
+ return -1;
+
+ // start single thread dispatcher?
+ if (g_conf.ms_single_dispatch) {
+ single_dispatch_stop = false;
+ single_dispatcher.create();
+ }
+
+ lock.Lock();
+
+ if (my_rank < 0) {
+ dout(10) << "start_rank connecting to namer0" << endl;
+
+ // connect to namer
+ // (the namer's address must already be in entity_map, e.g. via set_namer())
+ assert(entity_map.count(MSG_ADDR_NAMER(0)));
+ Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]);
+
+ // send
+ Message *m = new MNSConnect(accepter.listen_addr);
+ m->set_dest(MSG_ADDR_NAMER(0), 0);
+ sender->send(m);
+
+ // wait
+ // NOTE(review): my_rank is presumably assigned by the handler for the
+ // namer's MNSConnectAck, which is not visible here -- confirm.
+ while (my_rank < 0)
+ waiting_for_rank.Wait(lock);
+ assert(my_rank >= 0);
+
+ dout(10) << "start_rank got rank " << my_rank << endl;
+
+ // create rank entity
+ entity_map[MSG_ADDR_RANK(my_rank)] = my_inst;
+ local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank));
+ messenger->set_dispatcher(this);
+ } else {
+ // my_inst
+ my_inst.addr = accepter.listen_addr;
+ my_inst.rank = my_rank;
+
+ // create my rank
+ msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+ entity_map[raddr] = my_inst;
+ entity_unstarted.insert(raddr);
+ local[raddr] = messenger = new EntityMessenger(raddr);
+ messenger->set_dispatcher(this);
+
+ dout(1) << "start_rank " << my_rank << " at " << my_inst << endl;
+ }
+
+ lock.Unlock();
+ return 0;
+}
+
+// Create namer0 on this rank: map its address to our own instance, give it
+// a local EntityMessenger, and build the Namer on top of that messenger.
+void Rank::start_namer()
+{
+  // create namer0
+  const msg_addr_t naddr = MSG_ADDR_NAMER(0);
+  entity_map[naddr] = my_inst;
+
+  EntityMessenger *msgr = new EntityMessenger(naddr);
+  local[naddr] = msgr;
+  namer = new Namer(msgr);
+}
+
+/* set_namer
+ * Record the address of namer0 in entity_map; the namer is rank 0.
+ */
+void Rank::set_namer(const tcpaddr_t& ns)
+{
+  entity_inst_t& namer_inst = entity_map[MSG_ADDR_NAMER(0)];
+  namer_inst.addr = ns;
+  namer_inst.rank = 0;
+}
+
+/* connect_rank
+ * Create a Sender for a remote instance, start its thread, and record
+ * it in rank_sender under inst.rank.  Returns the new sender.
+ * NOTE: assumes rank.lock held.  inst must not be our own instance and
+ * there must be no sender registered for inst.rank yet.
+ */
+Rank::Sender *Rank::connect_rank(const entity_inst_t& inst)
+{
+  assert(rank.lock.is_locked());
+  assert(inst != rank.my_inst);
+
+  dout(10) << "connect_rank to " << inst << endl;
+
+  // create the sender and start its thread right away
+  Sender *s = new Sender(inst);
+  s->create();
+
+  // exactly one sender per remote rank
+  assert(rank.rank_sender.count(inst.rank) == 0);
+  rank.rank_sender[inst.rank] = s;
+  return s;
+}
+
+
+
+
+
+/* show_dir
+ * Debug dump of entity_map at level 10; entries that also have a local
+ * messenger are tagged " local ".
+ */
+void Rank::show_dir()
+{
+  dout(10) << "show_dir ---" << endl;
+
+  typedef hash_map<msg_addr_t, entity_inst_t>::iterator dir_iter;
+  for (dir_iter p = entity_map.begin(); p != entity_map.end(); ++p) {
+    const char *tag = local.count(p->first) ? " local " : "";
+    dout(10) << "show_dir entity_map " << p->first << " -> " << p->second << tag << endl;
+  }
+}
+
+
+/* lookup
+ * Send an MNSLookup for addr to MSG_ADDR_DIRECTORY and remember that
+ * the request is outstanding.
+ * NOTE: assumes the rank lock is held, and that no lookup for addr is
+ * already outstanding (both are asserted).
+ */
+void Rank::lookup(msg_addr_t addr)
+{
+  dout(10) << "lookup " << addr << endl;
+  assert(lock.is_locked());
+
+  // mark the request as in flight
+  assert(looking_up.count(addr) == 0);
+  looking_up.insert(addr);
+
+  messenger->send_message(new MNSLookup(addr), MSG_ADDR_DIRECTORY);
+}
+
+
+
+/* register_entity
+ * Register an entity with the directory and create its local
+ * messenger.  Sends an MNSRegister, blocks until handle_register_ack()
+ * stores the address that was granted, then records the entity as
+ * local and flushes any messages that were queued for it.
+ * Returns the new messenger.
+ */
+Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+{
+  dout(10) << "register_entity " << addr << endl;
+  lock.Lock();
+
+  // each attempt gets its own id so the ack can be matched up
+  static long reg_attempt = 0;
+  const long tid = ++reg_attempt;
+
+  Message *req = new MNSRegister(addr, my_rank, tid);
+  req->set_source(MSG_ADDR_RANK(my_rank), 0);
+  req->set_source_inst(my_inst);
+  req->set_dest(MSG_ADDR_DIRECTORY, 0);
+
+  // the ack handler signals this condition
+  Cond ack;
+  waiting_for_register_cond[tid] = &ack;
+
+  // send the request without holding the lock
+  lock.Unlock();
+  submit_message(req);
+  lock.Lock();
+
+  // wait for the result to show up
+  while (waiting_for_register_result.count(tid) == 0)
+    ack.Wait(lock);
+
+  // the directory may have picked a different address
+  addr = waiting_for_register_result[tid];
+  dout(10) << "register_entity got " << addr << endl;
+
+  // forget the attempt; 'ack' is about to go out of scope
+  waiting_for_register_cond.erase(tid);
+  waiting_for_register_result.erase(tid);
+
+  // create messenger and add it to the directory
+  EntityMessenger *msgr = new EntityMessenger(addr);
+  entity_map[addr] = my_inst;
+  local[addr] = msgr;
+
+  // deliver anything that was waiting for this entity
+  if (waiting_for_lookup.count(addr)) {
+    submit_messages(waiting_for_lookup[addr]);
+    waiting_for_lookup.erase(addr);
+  }
+
+  lock.Unlock();
+  return msgr;
+}
+
+/* unregister_entity
+ * Remove a messenger from the local directory, tell namer0 about it
+ * (unless the entity is namer0 or rank0 itself), and wake wait() once
+ * at most two local entities remain.
+ */
+void Rank::unregister_entity(EntityMessenger *msgr)
+{
+  lock.Lock();
+
+  msg_addr_t me = msgr->get_myaddr();
+  dout(10) << "unregister_entity " << me << endl;
+
+  // remove from local directory.
+  assert(local.count(me));
+  local.erase(me);
+
+  // on rank 0 the namer removes the entity_map entry instead
+  if (my_rank > 0) {
+    assert(entity_map.count(me));
+    entity_map.erase(me);
+  }
+
+  // tell namer.
+  if (me != MSG_ADDR_NAMER(0) &&
+      me != MSG_ADDR_RANK(0)) {
+    msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+                       MSG_ADDR_NAMER(0));
+  }
+
+  // kick wait()?
+  if (local.size() <= 2)
+    wait_cond.Signal();
+
+  lock.Unlock();
+}
+
+
+/* submit_messages
+ * Submit every message in ls, in order, then empty the list.
+ */
+void Rank::submit_messages(list<Message*>& ls)
+{
+  list<Message*>::iterator p = ls.begin();
+  while (p != ls.end()) {
+    submit_message(*p);
+    ++p;
+  }
+  ls.clear();
+}
+
+
+/* prepare_dest
+ * Warm up the path to dest without sending anything: start a lookup if
+ * the address is unknown (and none is outstanding), or start a
+ * connection if it is known, remote, and has no matching sender.
+ */
+void Rank::prepare_dest(msg_addr_t dest)
+{
+  lock.Lock();
+
+  if (entity_map.count(dest) == 0) {
+    // unknown dest: look it up unless a lookup is already in flight
+    if (looking_up.count(dest) == 0)
+      lookup(dest);
+  } else {
+    // known address
+    entity_inst_t inst = entity_map[dest];
+
+    if (!(inst == my_inst)) {
+      // remote: connect unless a sender for exactly this instance exists
+      const bool connected = rank_sender.count(inst.rank) &&
+                             rank_sender[inst.rank]->inst == inst;
+      if (!connected)
+        connect_rank(inst);
+    }
+    // else: it lives in this process, nothing to prepare
+  }
+
+  lock.Unlock();
+}
+
+/* submit_message (explicit destination instance)
+ * Route m using the caller-supplied instance instead of entity_map.
+ * A destination on our own rank is queued on its local messenger (or
+ * the single dispatcher); anything else goes to the sender for
+ * dest_inst.rank, which is created on demand.  The target is chosen
+ * under the rank lock; the hand-off happens after it is released.
+ */
+void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  EntityMessenger *local_target = 0;
+  Sender *remote_target = 0;
+
+  lock.Lock();
+  if (dest_inst.rank != my_inst.rank) {
+    // remote.
+    if (rank_sender.count(dest_inst.rank) == 0) {
+      // not connected.
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+      remote_target = connect_rank(dest_inst);
+    } else {
+      // connected.
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl;
+      remote_target = rank_sender[dest_inst.rank];
+    }
+  } else if (local.count(dest) == 0) {
+    // our rank, but no such local entity (mid-register); not expected here
+    dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+    assert(0);
+    waiting_for_lookup[dest].push_back(m);
+  } else {
+    // local
+    dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+    if (g_conf.ms_single_dispatch) {
+      _submit_single_dispatch(m);
+    } else {
+      local_target = local[dest];
+    }
+  }
+  lock.Unlock();
+
+  // do it
+  if (local_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    local_target->queue_message(m);
+  } else if (remote_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    remote_target->send(m);
+  }
+}
+
+
+/* submit_message
+ * Route m by its destination address:
+ *  - marked down: the message is dropped (and a lookup is started);
+ *  - local entity: queued on its messenger or the single dispatcher;
+ *  - known address: sent via the sender for that rank, connecting if
+ *    needed, or parked if the address maps to us but isn't local yet;
+ *  - unknown address: parked in waiting_for_lookup until a lookup reply.
+ * The target is chosen under the rank lock; the hand-off happens after
+ * it is released.
+ */
+void Rank::submit_message(Message *m)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  EntityMessenger *local_target = 0;
+  Sender *remote_target = 0;
+
+  lock.Lock();
+  if (down.count(dest)) {
+    // black hole.
+    dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl;
+    delete m;
+
+    if (looking_up.count(dest) == 0)
+      lookup(dest);
+  }
+  else if (local.count(dest)) {
+    // local
+    dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+
+    if (g_conf.ms_single_dispatch) {
+      _submit_single_dispatch(m);
+    } else {
+      local_target = local[dest];
+    }
+  }
+  else if (entity_map.count(dest)) {
+    // known rank addr.
+    entity_inst_t inst = entity_map[dest];
+
+    if (inst == my_inst) {
+      dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+      waiting_for_lookup[dest].push_back(m);
+    } else {
+      const bool connected = rank_sender.count(inst.rank) &&
+                             rank_sender[inst.rank]->inst == inst;
+      if (connected) {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl;
+        remote_target = rank_sender[inst.rank];
+      } else {
+        dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl;
+        remote_target = connect_rank(inst);
+      }
+    }
+  }
+  else {
+    // unknown dest rank or rank addr.
+    if (looking_up.count(dest)) {
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
+    } else {
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
+      lookup(dest);
+    }
+    waiting_for_lookup[dest].push_back(m);
+  }
+  lock.Unlock();
+
+  // do it (both targets are null when m was dropped, parked, or handed
+  // to the single dispatcher above)
+  if (local_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    local_target->queue_message(m);
+  } else if (remote_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    remote_target->send(m);
+  }
+}
+
+
+
+
+/* dispatch
+ * Dispatcher entry point for the rankN messenger: handles the name
+ * service replies (connect ack, register ack, lookup reply) under the
+ * rank lock.  Any other message type is a fatal error.
+ */
+void Rank::dispatch(Message *m)
+{
+  lock.Lock();
+
+  dout(10) << "dispatching " << *m << endl;
+
+  const int type = m->get_type();
+  switch (type) {
+  case MSG_NS_CONNECTACK:
+    handle_connect_ack((MNSConnectAck*)m);
+    break;
+  case MSG_NS_REGISTERACK:
+    handle_register_ack((MNSRegisterAck*)m);
+    break;
+  case MSG_NS_LOOKUPREPLY:
+    handle_lookup_reply((MNSLookupReply*)m);
+    break;
+  default:
+    assert(0);
+  }
+
+  lock.Unlock();
+}
+
+/* handle_connect_ack
+ * The namer assigned us a rank: record it, fill in my_inst, and wake
+ * whoever is blocked on waiting_for_rank.  Consumes m.
+ */
+void Rank::handle_connect_ack(MNSConnectAck *m)
+{
+  dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl;
+
+  my_rank = m->get_rank();
+  delete m;
+
+  my_inst.addr = accepter.listen_addr;
+  my_inst.rank = my_rank;
+
+  waiting_for_rank.SignalAll();
+
+  // logger!
+  /*dout(10) << "logger" << endl;
+  char names[100];
+  sprintf(names, "rank%d", my_rank);
+  string name = names;
+
+  if (g_conf.tcp_log) {
+    logger = new Logger(name, (LogType*)&rank_logtype);
+    rank_logtype.add_set("num");
+    rank_logtype.add_inc("in");
+    rank_logtype.add_inc("inb");
+    rank_logtype.add_inc("dis");
+    rank_logtype.add_set("inq");
+    rank_logtype.add_set("inqb");
+    rank_logtype.add_set("outq");
+    rank_logtype.add_set("outqb");
+  }
+  */
+}
+
+
+/* handle_register_ack
+ * The directory answered a registration: publish the granted address
+ * under the request's tid and wake the register_entity() call waiting
+ * on it.  An ack whose tid has no registered waiter is logged and
+ * dropped; indexing the map for it would insert a null Cond* and
+ * dereference it.  Consumes m.
+ */
+void Rank::handle_register_ack(MNSRegisterAck *m)
+{
+  dout(10) << "handle_register_ack " << m->get_entity() << endl;
+
+  long tid = m->get_tid();
+
+  map<int, Cond*>::iterator waiter = waiting_for_register_cond.find(tid);
+  if (waiter == waiting_for_register_cond.end() || !waiter->second) {
+    // nobody is waiting on this tid (duplicate or stray ack)
+    dout(0) << "handle_register_ack unexpected tid " << tid << ", ignoring" << endl;
+  } else {
+    // store the result before signalling; the waiter polls for it
+    waiting_for_register_result[tid] = m->get_entity();
+    waiter->second->Signal();
+  }
+
+  delete m;
+}
+
+/* handle_lookup_reply
+ * Apply the address -> instance mappings carried by a lookup reply.
+ * Entries for addresses marked down, or older than what we already
+ * know, are ignored.  For each accepted entry the map is updated and
+ * any messages parked in waiting_for_lookup are delivered: to the local
+ * messenger (or single dispatcher) if the entity lives on our rank, or
+ * to the sender for the remote rank, (re)connecting if necessary.
+ * Called from dispatch() with the rank lock held.  Consumes m.
+ */
+void Rank::handle_lookup_reply(MNSLookupReply *m)
+{
+  list<Message*> waiting;
+  dout(10) << "got lookup reply" << endl;
+
+  for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+       it != m->entity_map.end();
+       it++) {
+    dout(10) << "lookup got " << it->first << " at " << it->second << endl;
+    msg_addr_t addr = it->first;
+    entity_inst_t inst = it->second;
+
+    if (down.count(addr)) {
+      // ignore
+      dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl;
+      //assert(entity_map.count(addr) == 0);
+      continue;
+    }
+
+    if (entity_map.count(addr) &&
+        entity_map[addr] > inst) {
+      dout(10) << "ignoring lookup results for " << addr << ", " \
+               << entity_map[addr] << " > " << inst << endl;
+      continue;
+    }
+
+    // The request for this address has been answered.  Clearing the
+    // in-flight marker lets the retry below (and any later lookup for
+    // the same address) pass the "not already looking up" assert in
+    // lookup() instead of being treated as still outstanding.
+    looking_up.erase(addr);
+
+    // update map.
+    entity_map[addr] = inst;
+
+    if (inst.rank == my_rank) {
+      // local
+      dout(10) << "delivering lookup results locally" << endl;
+      if (local.count(addr)) {
+        if (g_conf.ms_single_dispatch) {
+          single_dispatch_queue.splice(single_dispatch_queue.end(),
+                                       waiting_for_lookup[addr]);
+        } else {
+          local[addr]->queue_messages(waiting_for_lookup[addr]);
+        }
+        waiting_for_lookup.erase(addr);
+      } else {
+        lookup(addr);  // try again!
+      }
+
+    } else {
+      // remote
+      if (rank_sender.count(inst.rank) == 0) {
+        connect_rank(inst);
+      } else if (rank_sender[inst.rank]->inst != inst) {
+        dout(0) << "lookup got rank addr change, WATCH OUT" << endl;
+        // FIXME BUG possible message loss weirdness?
+        rank_sender[inst.rank]->stop();
+        rank_sender.erase(inst.rank);
+        connect_rank(inst);
+      }
+
+      // take waiters
+      Sender *sender = rank_sender[inst.rank];
+      assert(sender);
+
+      if (waiting_for_lookup.count(addr)) {
+        sender->send(waiting_for_lookup[addr]);
+        waiting_for_lookup.erase(addr);
+      }
+    }
+  }
+
+  delete m;
+}
+
+
+// Block until every local entity has been unregistered, then tear down
+// the dispatch thread (single-dispatch mode) and all senders.
+// Phase 1 loops on wait_cond: once only one local entity is left and
+// the rank messenger has not been stopped, that messenger is shut down
+// and deleted; the loop ends when 'local' is empty.
+void Rank::wait()
+{
+ lock.Lock();
+ while (1) {
+ // reap dead senders, receivers.
+ reaper();
+
+ if (local.size() == 0) {
+ dout(10) << "wait: everything stopped" << endl;
+ break; // everything stopped.
+ }
+
+ if (local.size() == 1 &&
+ !messenger->is_stopped()) {
+ dout(10) << "wait: stopping rank" << endl;
+ // the lock is released around shutdown(); shutdown() calls
+ // unregister_entity(), which takes the lock itself
+ lock.Unlock();
+ messenger->shutdown();
+ delete messenger;
+ lock.Lock();
+ continue;
+ }
+
+ wait_cond.Wait(lock);
+ }
+ lock.Unlock();
+
+ // done! clean up.
+
+ // stop dispatch thread
+ if (g_conf.ms_single_dispatch) {
+ dout(10) << "wait: stopping dispatch thread" << endl;
+ lock.Lock();
+ single_dispatch_stop = true;
+ single_dispatch_cond.Signal();
+ lock.Unlock();
+ // join without holding the lock
+ single_dispatcher.join();
+ }
+
+ // reap senders and receivers
+ lock.Lock();
+ {
+ dout(10) << "wait: stopping senders" << endl;
+ // ask every sender to stop ...
+ for (hash_map<int,Sender*>::iterator i = rank_sender.begin();
+ i != rank_sender.end();
+ i++)
+ i->second->stop();
+ // ... then reap until rank_sender is empty
+ // NOTE(review): presumably the exiting sender threads signal
+ // wait_cond; that code is not in this part of the file -- confirm
+ while (!rank_sender.empty()) {
+ wait_cond.Wait(lock);
+ reaper();
+ }
+
+ // receiver teardown is deliberately compiled out
+ if (0) { // stop() no worky on receivers! we leak, but who cares.
+ dout(10) << "wait: stopping receivers" << endl;
+ for (set<Receiver*>::iterator i = receivers.begin();
+ i != receivers.end();
+ i++)
+ (*i)->stop();
+ while (!receivers.empty()) {
+ wait_cond.Wait(lock);
+ reaper();
+ }
+ }
+
+ }
+ lock.Unlock();
+
+ dout(10) << "wait: done." << endl;
+}
+
+
+
+/* find_ns_addr
+ * Locate the name server address.  Tries, in order:
+ *  1. the file ".ceph_ns" in the current directory, which must hold a
+ *     complete raw tcpaddr_t;
+ *  2. the CEPH_NAMESERVER environment variable, resolved with
+ *     tcp_hostlookup().  A leading "NAME=" prefix in the value is
+ *     tolerated and skipped.
+ * On success nsa is filled in and 0 is returned; otherwise -1.
+ */
+int Rank::find_ns_addr(tcpaddr_t &nsa)
+{
+  // file?
+  int fd = ::open(".ceph_ns", O_RDONLY);
+  if (fd >= 0) {   // 0 is a valid descriptor
+    long got = ::read(fd, (void*)&nsa, sizeof(nsa));
+    ::close(fd);
+    if (got == (long)sizeof(nsa)) {
+      cout << "ceph ns is " << nsa << endl;
+      return 0;
+    }
+    // short or failed read: nsa is not trustworthy, try the env var
+    cerr << ".ceph_ns is unreadable or truncated, ignoring it" << endl;
+  }
+
+  // env var?
+  char *nsaddr = getenv("CEPH_NAMESERVER");
+  if (nsaddr) {
+    // getenv() returns only the value, so there is normally no '=' to
+    // skip; stop at the terminating NUL instead of scanning past it.
+    for (char *p = nsaddr; *p; p++) {
+      if (*p == '=') {
+        nsaddr = p + 1;
+        break;
+      }
+    }
+
+    if (tcp_hostlookup(nsaddr, nsa) < 0) {
+      cout << "can't resolve " << nsaddr << endl;
+      return -1;
+    }
+
+    cout << "ceph ns is " << nsa << endl;
+    return 0;
+  }
+
+  cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
+  return -1;
+}
+
+
+
+/**********************************
+ * EntityMessenger
+ */
+
+// Construct a messenger for myaddr.  The dispatch thread object is
+// bound to this messenger here but not started here.
+Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+ Messenger(myaddr),
+ stop(false),
+ dispatch_thread(this)
+{
+}
+// Nothing to release here; the destructor does not stop or join the
+// dispatch thread.
+Rank::EntityMessenger::~EntityMessenger()
+{
+}
+
+/* dispatch_entry
+ * Body of the per-entity dispatch thread.  Until 'stop' is set it
+ * sleeps on 'cond' while the queue is empty; otherwise it takes the
+ * whole queue in one swap and delivers each message through dispatch()
+ * with the messenger lock released.
+ */
+void Rank::EntityMessenger::dispatch_entry()
+{
+  lock.Lock();
+  while (!stop) {
+    if (dispatch_queue.empty()) {
+      // nothing to do; sleep until queue_message()/shutdown() signals
+      cond.Wait(lock);
+      continue;
+    }
+
+    // grab everything that is queued so far
+    list<Message*> batch;
+    batch.swap(dispatch_queue);
+
+    // deliver without holding the lock
+    lock.Unlock();
+    for (; !batch.empty(); ) {
+      Message *m = batch.front();
+      batch.pop_front();
+      dout(1) //<< g_clock.now()
+        << "---- "
+        << m->get_source() << ':' << m->get_source_port()
+        << " to " << m->get_dest() << ':' << m->get_dest_port()
+        << " ---- " << m->get_type_name()
+        << " ---- " << m->get_source_inst()
+        << " ---- " << m
+        << endl;
+      dispatch(m);
+    }
+    lock.Lock();
+  }
+  lock.Unlock();
+}
+
+/* ready
+ * Start delivery for this entity.  In single-dispatch mode, messages
+ * parked in rank.waiting_for_ready for this address are moved to the
+ * shared dispatch queue; otherwise this entity's own dispatch thread
+ * is started.  Every entity except namer0 then sends MSG_NS_STARTED
+ * to namer0.
+ */
+void Rank::EntityMessenger::ready()
+{
+  dout(10) << "ready " << get_myaddr() << endl;
+
+  if (!g_conf.ms_single_dispatch) {
+    // start my dispatch thread
+    dispatch_thread.create();
+  } else {
+    rank.lock.Lock();
+    map<msg_addr_t, list<Message*> >::iterator parked =
+      rank.waiting_for_ready.find(get_myaddr());
+    if (parked != rank.waiting_for_ready.end()) {
+      rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
+                                        parked->second);
+      rank.waiting_for_ready.erase(parked);
+      rank.single_dispatch_cond.Signal();
+    }
+    rank.lock.Unlock();
+  }
+
+  // tell namer
+  if (get_myaddr() != MSG_ADDR_NAMER(0)) {
+    send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0));
+  }
+}
+
+
+/* shutdown
+ * Unregister this entity from the rank and stop its dispatch thread.
+ * When called from the dispatch thread itself only the stop flag is
+ * set; from any other thread the dispatcher is also woken and joined.
+ * Always returns 0.
+ */
+int Rank::EntityMessenger::shutdown()
+{
+  dout(10) << "shutdown " << get_myaddr() << endl;
+
+  // deregister
+  rank.unregister_entity(this);
+
+  // stop my dispatch thread
+  if (!dispatch_thread.am_self()) {
+    dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
+    lock.Lock();
+    stop = true;
+    cond.Signal();
+    lock.Unlock();
+    dispatch_thread.join();
+  } else {
+    // a thread cannot join itself; the loop exits on its next check
+    dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
+    stop = true;
+  }
+
+  return 0;
+}
+
+
+// Forward to Rank::prepare_dest(), which starts a lookup or a
+// connection for dest ahead of the first send.
+void Rank::EntityMessenger::prepare_send_message(msg_addr_t dest)
+{
+ rank.prepare_dest(dest);
+}
+
+/* send_message (explicit destination instance)
+ * Stamp m with this entity as source (port 0), dest as destination
+ * (port 0) and this process's instance, then hand it to the rank for
+ * delivery to 'inst'.  Always returns 0.
+ */
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  // set envelope
+  m->set_source(get_myaddr(), 0);
+  m->set_dest(dest, 0);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> " << m->get_source()
+          << " to " << m->get_dest()
+          << " ---- " << m->get_type_name()
+          << " ---- " << rank.my_inst << " --> " << inst
+          << " ---- " << m
+          << endl;
+
+  rank.submit_message(m, inst);
+  return 0;
+}
+
+
+/* send_message
+ * Stamp m with this entity as source (fromport), dest/port as
+ * destination and this process's instance, then hand it to the rank,
+ * which resolves the destination itself.  Always returns 0.
+ */
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // set envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> " << m->get_source()
+          << " to " << m->get_dest()
+          << " ---- " << m->get_type_name()
+          << " ---- " << rank.my_inst << " --> ?"
+          << " ---- " << m
+          << endl;
+
+  rank.submit_message(m);
+  return 0;
+}
+
+
+// Mark a peer entity as down at instance i; forwards to
+// Rank::mark_down().  An entity must not mark itself down.
+void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+{
+ assert(a != get_myaddr());
+ rank.mark_down(a,i);
+}
+
+/* mark_down
+ * Record that entity a (at instance inst) is down.  Ignored on rank 0,
+ * if a is already marked down, or if we already know a newer instance
+ * for a.  Otherwise a joins the 'down' set and either its pending
+ * lookup state is dropped (address unknown) or its map entry is
+ * removed and the sender for inst.rank is stopped (address known).
+ */
+void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+{
+  if (my_rank == 0) return;   // ugh.. rank0 already handles this stuff in the namer
+
+  lock.Lock();
+  if (down.count(a) == 0) {
+    const bool known = entity_map.count(a);
+
+    if (known && entity_map[a] > inst) {
+      // we already know something newer; do nothing!
+      dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+      derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    } else {
+      down.insert(a);
+
+      if (known) {
+        // know it
+        assert(entity_map[a] <= inst);
+        dout(10) << "mark_down " << a << " inst " << inst << endl;
+        derr(10) << "mark_down " << a << " inst " << inst << endl;
+
+        entity_map.erase(a);
+
+        if (rank_sender.count(inst.rank)) {
+          rank_sender[inst.rank]->stop();
+          rank_sender.erase(inst.rank);
+        }
+      } else {
+        // don't know it
+        dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+        derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+
+        waiting_for_lookup.erase(a);
+        looking_up.erase(a);
+      }
+    }
+  }
+  lock.Unlock();
+}
+
+// Mark a peer entity as up at instance i; forwards to Rank::mark_up().
+// An entity must not mark itself up.
+void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+ assert(a != get_myaddr());
+ rank.mark_up(a, i);
+}
+
+/* mark_up
+ * Record that entity a is up at instance i.  Ignored on rank 0.
+ * Clears a from the 'down' set; if i is new or newer than what we know,
+ * the map is updated and a sender for i.rank is made available.
+ *
+ * connect_rank() asserts that no sender exists for the rank, so it is
+ * only called when that holds.  An existing sender for the same
+ * instance is reused (other entities may live on that rank); one
+ * pointing at a different instance is stopped and replaced, the same
+ * way handle_lookup_reply() deals with a rank address change.
+ */
+void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  if (my_rank == 0) return;
+  lock.Lock();
+  {
+    dout(10) << "mark_up " << a << " inst " << i << endl;
+    derr(10) << "mark_up " << a << " inst " << i << endl;
+
+    down.erase(a);
+
+    assert(i.rank != my_rank);  // hrm?
+
+    if (entity_map.count(a) == 0 ||
+        entity_map[a] < i) {
+      entity_map[a] = i;
+
+      if (rank_sender.count(i.rank) == 0) {
+        connect_rank(i);
+      } else if (rank_sender[i.rank]->inst != i) {
+        // the rank moved to a new instance: replace the stale sender
+        rank_sender[i.rank]->stop();
+        rank_sender.erase(i.rank);
+        connect_rank(i);
+      }
+      // else: already connected to exactly this instance
+    } else if (entity_map[a] == i) {
+      dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+      derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+    } else {
+      dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+      derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+    }
+
+    //if (waiting_for_lookup.count(a))
+    //lookup(a);
+  }
+  lock.Unlock();
+}
+
--- /dev/null
+#ifndef __NEWMESSENGER_H
+#define __NEWMESSENGER_H
+
+
+#include <list>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "include/types.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Messenger.h"
+#include "Message.h"
+#include "tcp.h"
+
+
+
+
+/* Rank - per-process
+ *
+ * Owns the listening socket (Accepter), one Sender per remote rank,
+ * the incoming Receivers, and the EntityMessengers of every entity
+ * registered in this process.  Also keeps the address directory
+ * (entity_map) used to route outgoing messages.  A single global
+ * instance, 'rank', is declared at the bottom of this header.
+ */
+class Rank : public Dispatcher {
+
+ class EntityMessenger;
+ class Sender;
+ class Receiver;
+
+ // namer
+ // Dispatcher for the name service messages (connect, register,
+ // started, lookup, unregister, failure).
+ class Namer : public Dispatcher {
+ public:
+ EntityMessenger *messenger; // namerN
+
+ // counters; presumably used to hand out fresh rank / entity ids --
+ // the handlers are defined outside this header
+ int nrank;
+ int nclient, nmds, nosd, nmon;
+
+ // messages parked per entity address
+ map<msg_addr_t, list<Message*> > waiting;
+
+ Namer(EntityMessenger *msgr);
+ ~Namer();
+
+ void handle_connect(class MNSConnect*);
+ void handle_register(class MNSRegister *m);
+ void handle_started(Message *m);
+ void handle_lookup(class MNSLookup *m);
+ void handle_unregister(Message *m);
+ void handle_failure(class MNSFailure *m);
+
+ void dispatch(Message *m);
+
+ void manual_insert_inst(const entity_inst_t &inst);
+
+ };
+
+ // incoming
+ // Thread that owns the listening socket.
+ class Accepter : public Thread {
+ public:
+ bool done;
+
+ tcpaddr_t listen_addr;
+ int listen_sd;
+
+ Accepter() : done(false) {}
+
+ void *entry();
+ // sets 'done', closes the listening socket, then joins the thread
+ void stop() {
+ done = true;
+ ::close(listen_sd);
+ join();
+ }
+ int start();
+ } accepter;
+
+
+ // Thread bound to one accepted socket.
+ class Receiver : public Thread {
+ public:
+ int sd;
+ bool done;
+
+ Receiver(int _sd) : sd(_sd), done(false) {}
+
+ void *entry();
+ // sets 'done' and closes the socket; does not join (join is commented out)
+ void stop() {
+ done = true;
+ ::close(sd);
+ //join();
+ }
+ Message *read_message();
+ };
+
+
+ // outgoing
+ // Thread with an outgoing message queue for one remote instance.
+ class Sender : public Thread {
+ public:
+ entity_inst_t inst;
+ bool done;
+ int sd;
+
+ set<msg_addr_t> entities;
+ list<Message*> q;
+
+ // lock/cond are used by stop() and send() below
+ Mutex lock;
+ Cond cond;
+
+ Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {}
+ virtual ~Sender() {}
+
+ void *entry();
+
+ int connect();
+ void fail_and_requeue(list<Message*>& ls);
+ void finish();
+
+ // set 'done' and signal 'cond'
+ void stop() {
+ lock.Lock();
+ done = true;
+ cond.Signal();
+ lock.Unlock();
+ }
+
+ // append one message to the queue and signal 'cond'
+ void send(Message *m) {
+ lock.Lock();
+ q.push_back(m);
+ cond.Signal();
+ lock.Unlock();
+ }
+ // move all of ls onto the end of the queue (ls is left empty)
+ void send(list<Message*>& ls) {
+ lock.Lock();
+ q.splice(q.end(), ls);
+ cond.Signal();
+ lock.Unlock();
+ }
+
+ int write_message(Message *m);
+ };
+
+
+
+ // messenger interface
+ // One per registered entity: a dispatch queue plus a dispatch thread.
+ class EntityMessenger : public Messenger {
+ // lock/cond are used by queue_message()/queue_messages() below
+ Mutex lock;
+ Cond cond;
+ list<Message*> dispatch_queue;
+ bool stop;
+
+ class DispatchThread : public Thread {
+ EntityMessenger *m;
+ public:
+ DispatchThread(EntityMessenger *_m) : m(_m) {}
+ void *entry() {
+ m->dispatch_entry();
+ return 0;
+ }
+ } dispatch_thread;
+ void dispatch_entry();
+
+ public:
+ // append one message to the dispatch queue and signal 'cond'
+ void queue_message(Message *m) {
+ lock.Lock();
+ dispatch_queue.push_back(m);
+ cond.Signal();
+ lock.Unlock();
+ }
+ // ls is taken by value: the copy is spliced into the queue and the
+ // caller's list is left untouched
+ void queue_messages(list<Message*> ls) {
+ lock.Lock();
+ dispatch_queue.splice(dispatch_queue.end(), ls);
+ cond.Signal();
+ lock.Unlock();
+ }
+
+ public:
+ EntityMessenger(msg_addr_t myaddr);
+ ~EntityMessenger();
+
+ void ready();
+ // reads 'stop' without taking 'lock'
+ bool is_stopped() { return stop; }
+
+ // join the dispatch thread
+ void wait() {
+ dispatch_thread.join();
+ }
+
+ virtual void callback_kick() {}
+ virtual int shutdown();
+ virtual void prepare_send_message(msg_addr_t dest);
+ virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+
+ virtual void mark_down(msg_addr_t a, entity_inst_t& i);
+ virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+ //virtual void reset(msg_addr_t a);
+ };
+
+
+ // Thread that runs Rank::single_dispatcher_entry().
+ class SingleDispatcher : public Thread {
+ Rank *rank;
+ public:
+ SingleDispatcher(Rank *r) : rank(r) {}
+ void *entry() {
+ rank->single_dispatcher_entry();
+ return 0;
+ }
+ } single_dispatcher;
+
+ // state of the shared dispatch queue
+ Cond single_dispatch_cond;
+ bool single_dispatch_stop;
+ list<Message*> single_dispatch_queue;
+
+ // messages parked per entity address
+ map<msg_addr_t, list<Message*> > waiting_for_ready;
+
+ void single_dispatcher_entry();
+ void _submit_single_dispatch(Message *m);
+
+
+ // Rank stuff
+ public:
+ Mutex lock;
+ Cond wait_cond; // for wait()
+
+ // my rank
+ int my_rank;
+ Cond waiting_for_rank;
+
+ // my instance
+ entity_inst_t my_inst;
+
+ // lookup
+ hash_map<msg_addr_t, entity_inst_t> entity_map;
+ hash_set<msg_addr_t> entity_unstarted;
+
+ map<msg_addr_t, list<Message*> > waiting_for_lookup;
+ set<msg_addr_t> looking_up;
+
+ hash_set<msg_addr_t> down;
+
+ // register
+ // both maps are keyed by an int id
+ map<int, Cond* > waiting_for_register_cond;
+ map<int, msg_addr_t > waiting_for_register_result;
+
+ // local
+ map<msg_addr_t, EntityMessenger*> local;
+
+ // remote
+ // keyed by an int
+ hash_map<int, Sender*> rank_sender;
+
+ set<Receiver*> receivers;
+
+ list<Sender*> sender_reap_queue;
+ list<Receiver*> receiver_reap_queue;
+
+ EntityMessenger *messenger; // rankN
+ Namer *namer;
+
+
+ void show_dir();
+
+ void lookup(msg_addr_t addr);
+
+ void dispatch(Message *m);
+ void handle_connect_ack(class MNSConnectAck *m);
+ void handle_register_ack(class MNSRegisterAck *m);
+ void handle_lookup_reply(class MNSLookupReply *m);
+
+ Sender *connect_rank(const entity_inst_t& inst);
+
+ void mark_down(msg_addr_t addr, entity_inst_t& i);
+ void mark_up(msg_addr_t addr, entity_inst_t& i);
+
+ tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
+
+ void reaper();
+
+
+public:
+ // r defaults to -1
+ Rank(int r=-1);
+ ~Rank();
+
+ int find_ns_addr(tcpaddr_t &tcpaddr);
+
+ void set_namer(const tcpaddr_t& ns);
+ void start_namer();
+
+ int start_rank();
+ void wait();
+
+ EntityMessenger *register_entity(msg_addr_t addr);
+ void unregister_entity(EntityMessenger *ms);
+
+ void submit_message(Message *m, const entity_inst_t& inst);
+ void prepare_dest(msg_addr_t dest);
+ void submit_message(Message *m);
+ void submit_messages(list<Message*>& ls);
+
+ // create a new messenger
+ EntityMessenger *new_entity(msg_addr_t addr);
+
+} ;
+
+extern Rank rank;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "NewerMessenger.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+//#include "messages/MFailure.h"
+
+#include <netdb.h>
+
+
+#undef dout
+#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " "
+#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " "
+
+
+
+#include "tcp.cc"
+
+
+Rank rank;
+
+
+/********************************************
+ * Namer
+ */
+
+// Construct the namer on top of the given messenger and install this
+// object as that messenger's dispatcher.  Asserts that this process is
+// rank 0.  nrank starts at g_conf.num_mon; the other counters start at 0.
+Rank::Namer::Namer(EntityMessenger *msgr) :
+ messenger(msgr),
+ nrank(0), nclient(0), nmds(0), nosd(0), nmon(0)
+{
+ assert(rank.my_rank == 0);
+ nrank = g_conf.num_mon;
+
+ // announce myself
+ // (disabled: would print the listen address and write it to .ceph_ns)
+ /*
+ cerr << "ceph ns is " << rank.accepter.listen_addr << endl;
+ cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl;
+ int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT);
+ ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t));
+ ::fchmod(fd, 0755);
+ ::close(fd);
+ */
+
+ // ok
+ messenger->set_dispatcher(this);
+}
+
+// Nothing to release; removal of .ceph_ns is disabled, matching the
+// disabled announce code in the constructor.
+Rank::Namer::~Namer()
+{
+ //::unlink(".ceph_ns");
+}
+
+
+/* Namer::dispatch
+ * Route a name service message to its handler while holding rank.lock.
+ * MSG_FAILURE_ACK is simply discarded; any unknown type is a fatal
+ * error.
+ */
+void Rank::Namer::dispatch(Message *m)
+{
+  rank.lock.Lock();
+
+  switch (m->get_type()) {
+  case MSG_NS_CONNECT:
+    handle_connect((MNSConnect*)m);
+    break;
+
+  case MSG_NS_REGISTER:
+    handle_register((MNSRegister*)m);
+    break;
+
+  case MSG_NS_STARTED:
+    handle_started(m);
+    break;
+
+  case MSG_NS_UNREGISTER:
+    handle_unregister(m);
+    break;
+
+  case MSG_NS_LOOKUP:
+    handle_lookup((MNSLookup*)m);
+    break;
+
+  case MSG_NS_FAILURE:
+    handle_failure((MNSFailure*)m);
+    break;
+
+  case MSG_FAILURE_ACK:
+    // nothing to do with these
+    delete m;
+    break;
+
+  default:
+    assert(0);
+  }
+
+  rank.lock.Unlock();
+}
+
+/* Namer::handle_connect
+ * A new process announced itself: assign it the next rank number,
+ * record its address in the directory (marked unstarted), and reply
+ * with an MNSConnectAck carrying the rank.  Consumes m.
+ */
+void Rank::Namer::handle_connect(MNSConnect *m)
+{
+  const int newrank = nrank++;
+  dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl;
+
+  // directory entry for the new rank
+  const msg_addr_t raddr = MSG_ADDR_RANK(newrank);
+  entity_inst_t& inst = rank.entity_map[raddr];
+  inst.addr = m->get_addr();
+  inst.rank = newrank;
+  rank.entity_unstarted.insert(raddr);
+
+  // the ack is addressed by instance, so the peer needs no lookup
+  messenger->send_message(new MNSConnectAck(newrank), raddr, inst);
+  delete m;
+}
+
+// Insert (or overwrite) the directory entry for the rank named by
+// inst.rank, bypassing the connect handshake.
+void Rank::Namer::manual_insert_inst(const entity_inst_t &inst)
+{
+ rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst;
+}
+
+/* Namer::handle_register
+ * Register an entity at the sender's instance.  If the requested
+ * address is "new", a fresh MDS/OSD/client address is allocated from
+ * the matching counter; otherwise the requested address is used as is
+ * (re-registration overwrites the old entry).  The entity is marked
+ * unstarted and an MNSRegisterAck with the final address is sent back.
+ * Consumes m.
+ */
+void Rank::Namer::handle_register(MNSRegister *m)
+{
+  dout(10) << "namer.handle_register from rank " << m->get_rank()
+           << " addr " << m->get_entity() << endl;
+
+  // pick id
+  msg_addr_t entity = m->get_entity();
+
+  if (entity.is_new()) {
+    // make up a new address!
+    switch (entity.type()) {
+    case MSG_ADDR_MDS_BASE:
+      entity = MSG_ADDR_MDS(nmds++);
+      break;
+    case MSG_ADDR_OSD_BASE:
+      entity = MSG_ADDR_OSD(nosd++);
+      break;
+    case MSG_ADDR_CLIENT_BASE:
+      entity = MSG_ADDR_CLIENT(nclient++);
+      break;
+    default:
+      assert(0);
+    }
+  }
+  // else: specific address, use it unchanged
+
+  // register
+  if (rank.entity_map.count(entity) == 0) {
+    dout(1) << "namer.handle_register registering " << entity
+            << " inst " << m->get_source_inst()
+            << endl;
+  } else {
+    dout(1) << "namer.handle_register re-registering " << entity
+            << " inst " << m->get_source_inst()
+            << " (was " << rank.entity_map[entity] << ")"
+            << endl;
+  }
+  rank.entity_map[entity] = m->get_source_inst();
+  rank.entity_unstarted.insert(entity);
+
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity),
+                          m->get_source(), rank.entity_map[entity]);
+
+  delete m;
+}
+
+/* Namer::handle_started
+ * An entity reported that it is ready: clear its unstarted mark and
+ * re-dispatch every message that was parked waiting for it.
+ * Consumes m; nothing else keeps a reference to it, so it is deleted
+ * here like in the other handlers.
+ */
+void Rank::Namer::handle_started(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(10) << "namer.handle_started from entity " << who << endl;
+
+  assert(rank.entity_unstarted.count(who));
+  rank.entity_unstarted.erase(who);
+
+  // anybody waiting?
+  if (waiting.count(who)) {
+    // detach the list first: re-dispatching may park messages again
+    list<Message*> ls;
+    ls.swap(waiting[who]);
+    waiting.erase(who);
+
+    dout(10) << "doing waiters on " << who << endl;
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         it++)
+      dispatch(*it);
+  }
+
+  delete m;
+}
+
+// Remove the sending entity from the directory.  When only two entries
+// remain afterwards, the namer's own messenger is shut down and
+// deleted.  Consumes m.
+void Rank::Namer::handle_unregister(Message *m)
+{
+ msg_addr_t who = m->get_source();
+ dout(1) << "namer.handle_unregister entity " << who << endl;
+
+ // dump the directory before and after, for debugging
+ rank.show_dir();
+
+ assert(rank.entity_map.count(who));
+ rank.entity_map.erase(who);
+
+ rank.show_dir();
+
+ // shut myself down? kick watcher.
+ // presumably the two remaining entries are namer0 and rank0 -- TODO confirm
+ if (rank.entity_map.size() == 2) {
+ dout(10) << "namer.handle_unregister stopping namer" << endl;
+ // rank.lock is released around shutdown(); it is held here
+ // because dispatch() took it
+ rank.lock.Unlock();
+ messenger->shutdown();
+ // NOTE(review): 'messenger' dangles after this delete, and this code
+ // may be running on that messenger's own dispatch thread -- confirm
+ // nothing touches the object afterwards
+ delete messenger;
+ rank.lock.Lock();
+ }
+
+ delete m;
+}
+
+
+/* Namer::handle_lookup
+ * Answer a lookup request.  If the entity is not registered yet, or is
+ * registered but has not reported that it started, the request is
+ * parked in 'waiting' (handle_started() re-dispatches it).  Otherwise
+ * an MNSLookupReply with the entity's instance is sent back to the
+ * requester and m is consumed.
+ */
+void Rank::Namer::handle_lookup(MNSLookup *m)
+{
+  msg_addr_t target = m->get_entity();
+
+  // have it?
+  if (rank.entity_map.count(target) == 0) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << target << "' -> dne" << endl;
+    waiting[target].push_back(m);
+    return;
+  }
+
+  // started yet?
+  if (rank.entity_unstarted.count(target)) {
+    dout(10) << "namer " << m->get_source() << " lookup '" << target << "' -> unstarted" << endl;
+    waiting[target].push_back(m);
+    return;
+  }
+
+  // look it up!
+  MNSLookupReply *reply = new MNSLookupReply(m);
+  const entity_inst_t& where = rank.entity_map[target];
+  reply->entity_map[target] = where;
+
+  dout(10) << "namer " << m->get_source()
+           << " lookup '" << target
+           << "' -> " << where << endl;
+
+  messenger->send_message(reply, m->get_source(), m->get_source_inst());
+  delete m;
+}
+
+/* Namer::handle_failure
+ * An instance was reported dead: drop every directory entry (and
+ * unstarted mark) that points at exactly that instance.  Consumes m.
+ */
+void Rank::Namer::handle_failure(MNSFailure *m)
+{
+  const entity_inst_t dead = m->get_inst();
+  dout(10) << "namer.handle_failure inst " << dead
+           << endl;
+
+  // collect first, erase afterwards, so the map is not modified while
+  // it is being iterated
+  list<msg_addr_t> victims;
+  typedef hash_map<msg_addr_t,entity_inst_t>::iterator dir_iter;
+  for (dir_iter p = rank.entity_map.begin(); p != rank.entity_map.end(); ++p) {
+    if (p->second != dead) continue;
+    victims.push_back(p->first);
+  }
+
+  for (list<msg_addr_t>::iterator v = victims.begin(); v != victims.end(); ++v) {
+    dout(10) << "namer.handle_failure inst " << dead
+             << ", removing " << *v << endl;
+
+    rank.entity_map.erase(*v);
+    rank.entity_unstarted.erase(*v);
+
+    /*
+    if ((*v).is_osd()) {
+      // tell the monitor
+      messenger->send_message(new MFailure(*v, m->get_inst()), MSG_ADDR_MON(0));
+    }
+    */
+  }
+
+  delete m;
+}
+
+
+
+/********************************************
+ * Accepter
+ */
+
+/* Accepter::start
+ * Create the listening socket on a kernel-chosen port, work out the
+ * address to advertise (the first IPv4 address of this host's name
+ * plus the bound port), store it in listen_addr, and start the accept
+ * thread.
+ * Returns 0 on success, or -1 if this host's name does not resolve to
+ * an IPv4 address (the socket is closed in that case).  Socket-level
+ * failures still assert.
+ */
+int Rank::Accepter::start()
+{
+  // bind to a socket
+  dout(10) << "accepter.start binding to listen " << endl;
+
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd >= 0);   // 0 is a valid descriptor; only -1 is failure
+
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;
+
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we were given
+  socklen_t llen = sizeof(listen_addr);
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+  assert(rc >= 0);
+
+  int myport = listen_addr.sin_port;   // already in network byte order
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  //dout(10) << "accepter.start listening on " << myport << endl;
+
+  // my address is...
+  char host[100];
+  memset(host, 0, sizeof(host));
+  gethostname(host, sizeof(host) - 1);   // leave room for the trailing NUL
+  //dout(10) << "accepter.start my hostname is " << host << endl;
+
+  // The result is copied into a 4-byte IPv4 address below, so refuse
+  // anything that is missing or not an IPv4 address.
+  struct hostent *myhostname = gethostbyname( host );
+  if (!myhostname ||
+      myhostname->h_addrtype != AF_INET ||
+      myhostname->h_length != (int)sizeof(struct in_addr) ||
+      !myhostname->h_addr_list[0]) {
+    derr(0) << "accepter.start can't resolve my hostname '" << host
+            << "' to an IPv4 address" << endl;
+    ::close(listen_sd);
+    return -1;
+  }
+
+  struct sockaddr_in my_addr;
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr,
+         myhostname->h_addr_list[0],
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  // advertise the resolved address rather than INADDR_ANY
+  listen_addr = my_addr;
+
+  dout(10) << "accepter.start listen addr is " << listen_addr << endl;
+
+  // start thread
+  create();
+
+  return 0;
+}
+
+ /*
+  * entry
+  * Accepter thread body.  Accepts connections on listen_sd until `done`
+  * is set or ::accept() fails; every accepted socket is wrapped in a new
+  * Pipe which is added to rank.pipes under rank.lock.
+  * Always returns 0.
+  */
+ void *Rank::Accepter::entry()
+ {
+   dout(10) << "accepter starting" << endl;
+
+   while (!done) {
+     // accept
+     struct sockaddr_in addr;
+     socklen_t slen = sizeof(addr);
+     int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+
+     // accept() reports failure with a negative value; descriptor 0 is legal.
+     if (sd < 0) {
+       dout(10) << "no incoming connection?" << endl;
+       break;
+     }
+
+     dout(10) << "accepted incoming on sd " << sd << endl;
+
+     rank.lock.Lock();
+     Pipe *p = new Pipe(sd);
+     rank.pipes.insert(p);
+     rank.lock.Unlock();
+   }
+
+   return 0;
+ }
+
+
+
+/**************************************
+ * Pipe
+ */
+
+ /*
+  * accept
+  * Server-side handshake on an already accepted socket (sd): send our
+  * instance, read the peer's, start the writer thread, and register this
+  * pipe in rank.rank_pipe if the peer has a rank.
+  *
+  * Returns 0 on success; on a handshake error closes sd, sets done and
+  * returns -1.
+  */
+ int Rank::Pipe::accept()
+ {
+   // sd was handed to us by whoever called ::accept().
+
+   // step 1: announce myself.
+   const int wrote = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+   if (wrote < 0) {
+     ::close(sd);
+     done = true;
+     return -1;
+   }
+
+   // step 2: learn who the peer is.
+   const int got = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst));
+   if (got < 0) {
+     dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl;
+     ::close(sd);
+     done = true;
+     return -1;
+   }
+
+   // step 3: writer thread.
+   writer_running = true;
+   writer_thread.create();
+
+   // step 4: registration; unranked peers are never registered.
+   if (peer_inst.rank < 0) {
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl;
+     return 0; // success.
+   }
+
+   rank.lock.Lock();
+   if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+     // nobody there yet: install this pipe.
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl;
+     rank.rank_pipe[peer_inst.rank] = this;
+   }
+   else if (peer_inst.rank < rank.my_inst.rank ||
+            rank.my_inst.rank < 0) {
+     // the lower rank's pipe "wins": replace the existing one with this.
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
+              << ", already had pipe, but switching to this new one" << endl;
+     rank.rank_pipe[peer_inst.rank]->close(); // close old one
+     rank.rank_pipe[peer_inst.rank] = this;
+   }
+   else {
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
+              << ", already had pipe, sticking with it" << endl;
+   }
+   rank.lock.Unlock();
+
+   return 0; // success.
+ }
+
+ /*
+  * connect
+  * Client-side setup: create a socket, connect to peer_inst.addr, exchange
+  * instance identities with the peer, register this pipe in
+  * rank.rank_pipe if no pipe is registered for that rank yet, and start
+  * the reader thread.
+  *
+  * Returns 0 on success and a negative value on connect/handshake
+  * failure.  Socket creation and bind failures assert.
+  */
+ int Rank::Pipe::connect()
+ {
+   dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl;
+
+   // create socket?
+   sd = socket(AF_INET,SOCK_STREAM,0);
+   assert(sd >= 0);   // 0 is a valid descriptor; only negative is an error
+
+   // bind any port
+   struct sockaddr_in myAddr;
+   memset(&myAddr, 0, sizeof(myAddr));   // don't hand uninitialised padding to bind()
+   myAddr.sin_family = AF_INET;
+   myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+   myAddr.sin_port = htons( 0 );
+
+   int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+   assert(rc>=0);
+
+   // connect!
+   rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr));
+   if (rc < 0) return rc;
+
+   // identify peer
+   entity_inst_t inst;
+   rc = tcp_read(sd, (char*)&inst, sizeof(inst));
+   if (rc < 0) {
+     // Previously the result was ignored and `inst` was inspected even if
+     // nothing had been read.
+     // NOTE(review): accept() tests tcp_read() with `< 0` while
+     // read_message() tests it with `!`; confirm which convention
+     // tcp_read() really follows.
+     derr(1) << "pipe(" << peer_inst << ' ' << this << ").connect couldn't read peer inst" << endl;
+     return -1;
+   }
+   if (inst.rank < 0)
+     inst = peer_inst; // i know better than they do.
+   if (peer_inst != inst && inst.rank > 0) {
+     derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl;
+     assert(0);
+     return -1;
+   }
+
+   // identify myself
+   rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+   if (rc < 0)
+     return -1;
+
+   // register pipe
+   rank.lock.Lock();
+   {
+     if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+       dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl;
+       rank.rank_pipe[peer_inst.rank] = this;
+     } else {
+       // this is normal.
+       dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl;
+     }
+   }
+   rank.lock.Unlock();
+
+   // start reader
+   reader_running = true;
+   reader_thread.create();
+
+   return 0;
+ }
+
+
+ /*
+  * close
+  * Begin closing this pipe: remove it from rank.rank_pipe (if it is the
+  * registered pipe for the peer's rank) and queue a MSG_CLOSE message for
+  * the writer.  Does nothing if a close was already queued.
+  */
+ void Rank::Pipe::close()
+ {
+   if (sent_close) {
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl;
+     return;
+   }
+   dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl;
+
+   // drop our registration, but only if the slot really points at us.
+   rank.lock.Lock();
+   hash_map<int,Pipe*>::iterator mine = rank.rank_pipe.find(peer_inst.rank);
+   if (mine != rank.rank_pipe.end() && mine->second == this) {
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl;
+     rank.rank_pipe.erase(mine);
+   }
+   rank.lock.Unlock();
+
+   // hand the writer a close message and wake it.
+   dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl;
+   Message *bye = new MGenericMessage(MSG_CLOSE);
+   lock.Lock();
+   q.push_back(bye);
+   cond.Signal();
+   sent_close = true;
+   lock.Unlock();
+ }
+
+
+/* read msgs from socket.
+ * also, server.
+ *
+ */
+ /*
+  * reader
+  * Reader thread body.  On the server side it first runs accept().  It
+  * then reads messages until `done`, routing each one to the rank itself,
+  * the single dispatch queue, or the local entity's queue.  A null message
+  * or one of type 0 ends the loop.  On exit, if the writer has already
+  * finished, the socket is closed and the pipe is queued for reaping.
+  */
+ void Rank::Pipe::reader()
+ {
+   if (server)
+     accept();
+
+   while (!done) {
+     Message *m = read_message();
+
+     // end of stream?  (read failure, or a message of type 0)
+     bool finished = false;
+     if (!m) {
+       derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl;
+       finished = true;
+     } else if (m->get_type() == 0) {
+       delete m;
+       dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl;
+       finished = true;
+     }
+     if (finished) {
+       if (!sent_close)
+         close();
+
+       done = true;
+       cond.Signal(); // wake up writer too.
+       break;
+     }
+
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl;
+
+     // set only when the message goes to a local entity's own queue
+     EntityMessenger *target = 0;
+
+     rank.lock.Lock();
+     {
+       // sanity check: sender's instance must not be older than the one we know
+       if (rank.entity_map.count(m->get_source()) &&
+           rank.entity_map[m->get_source()] > m->get_source_inst()) {
+         derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source()
+                 << " inst " << m->get_source_inst()
+                 << " > " << rank.entity_map[m->get_source()]
+                 << ", WATCH OUT " << *m << endl;
+         assert(0);
+       }
+
+       if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
+         // addressed to the rank itself.
+         rank.dispatch(m);
+       }
+       else if (g_conf.ms_single_dispatch) {
+         // shared dispatch queue
+         rank._submit_single_dispatch(m);
+       }
+       else if (rank.local.count(m->get_dest())) {
+         // a local entity; queue it once the lock is dropped
+         target = rank.local[m->get_dest()];
+       }
+       else {
+         derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+         assert(0); // FIXME do this differently
+         //rank.waiting_for_lookup[m->get_dest()].push_back(m);
+       }
+     }
+     rank.lock.Unlock();
+
+     if (target)
+       target->queue_message(m); // queue
+   }
+
+   // whichever of reader/writer finishes last queues the pipe for reaping.
+   bool reap;
+   lock.Lock();
+   reader_running = false;
+   reap = !writer_running;
+   lock.Unlock();
+
+   if (!reap)
+     return;
+
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl;
+   ::close(sd);
+   rank.lock.Lock();
+   rank.pipe_reap_queue.push_back(this);
+   rank.wait_cond.Signal();
+   rank.lock.Unlock();
+ }
+
+
+/* write msgs to socket.
+ * also, client.
+ */
+ /*
+  * writer
+  * Writer thread body.  On the client side it first runs connect(); if
+  * that fails the pipe is marked done and fail() is called.  It then
+  * drains the outgoing queue q, sending each message with
+  * write_message(), until the queue is empty and `done` is set.  A send
+  * error passes the unsent messages to fail(); sending MSG_CLOSE sets
+  * done.  On exit, if the reader has already finished, the socket is
+  * closed and the pipe is queued for reaping.
+  */
+ void Rank::Pipe::writer()
+ {
+   if (!server) {
+     if (connect() < 0) {
+       derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl;
+       done = true;
+       list<Message*> none;
+       fail(none);
+     }
+   }
+
+   lock.Lock();
+   while (!q.empty() || !done) {
+
+     if (q.empty()) {
+       // nothing to send yet
+       dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl;
+       cond.Wait(lock);
+       continue;
+     }
+
+     dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl;
+
+     // take the whole queue, then send without holding the lock
+     list<Message*> batch;
+     batch.swap(q);
+     lock.Unlock();
+
+     while (!batch.empty()) {
+       Message *msg = batch.front();
+       batch.pop_front();
+
+       dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *msg << endl;
+
+       // stamp with our instance.
+       msg->set_source_inst(rank.my_inst);
+
+       // encode if nobody has yet
+       if (msg->empty_payload())
+         msg->encode_payload();
+
+       if (write_message(msg) < 0) {
+         // failed!  hand this message and the rest of the batch to fail().
+         derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *msg << " to " << msg->get_dest() << endl;
+         batch.push_front(msg);
+         fail(batch);
+         done = true;
+         break;
+       }
+
+       // a close message is the last thing we send
+       if (msg->get_type() == MSG_CLOSE)
+         done = true;
+
+       delete msg;
+     }
+
+     lock.Lock();
+   }
+   lock.Unlock();
+
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl;
+
+   // whichever of reader/writer finishes last queues the pipe for reaping.
+   bool reap;
+   lock.Lock();
+   writer_running = false;
+   reap = !reader_running;
+   lock.Unlock();
+
+   if (!reap)
+     return;
+
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl;
+   ::close(sd);
+   rank.lock.Lock();
+   rank.pipe_reap_queue.push_back(this);
+   rank.wait_cond.Signal();
+   rank.lock.Unlock();
+ }
+
+
+ /*
+  * read_message
+  * Read one message from sd: a msg_envelope_t, followed by env.nchunks
+  * chunks, each sent as an int length and then that many payload bytes.
+  * The chunks are gathered into a bufferlist and passed to
+  * decode_message().
+  *
+  * Returns the decoded message, or 0 if a read fails, a chunk length is
+  * negative, or decoding yields no message.
+  */
+ Message *Rank::Pipe::read_message()
+ {
+   // envelope
+   //dout(10) << "receiver.read_message from sd " << sd << endl;
+
+   msg_envelope_t env;
+   if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+     return 0;
+
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type
+            << " src " << env.source << " dst " << env.dest
+            << " nchunks=" << env.nchunks
+            << endl;
+
+   // payload
+   bufferlist blist;
+   for (int i=0; i<env.nchunks; i++) {
+     int size;
+     if (!tcp_read( sd, (char*)&size, sizeof(size) )) return 0;
+
+     if (size == 0) continue;
+
+     // the length comes straight off the wire; never allocate or read
+     // with a negative size.
+     if (size < 0) {
+       derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got bad frag len " << size << endl;
+       return 0;
+     }
+
+     bufferptr bp(size);
+
+     if (!tcp_read( sd, bp.c_str(), size )) return 0;
+
+     blist.push_back(bp);
+
+     dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got frag " << i << " of " << env.nchunks
+              << " len " << bp.length() << endl;
+   }
+
+   // unmarshall message
+   size_t s = blist.length();
+   Message *m = decode_message(env, blist);
+   if (!m) {
+     // don't dereference a null result in the debug line below
+     derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader couldn't decode message" << endl;
+     return 0;
+   }
+
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got " << s << " byte message from "
+            << m->get_source() << endl;
+
+   return m;
+ }
+
+
+
+ /*
+  * write_message
+  * Send one message on sd: its envelope first, then the payload.  With
+  * TCP_KEEP_CHUNKS each buffer goes out as its own (length, data) chunk;
+  * otherwise a single total length is sent, followed by every non-empty
+  * buffer back to back.  The payload is claimed from the message.
+  *
+  * Returns 0 on success, -1 as soon as any tcp_write() fails.
+  */
+ int Rank::Pipe::write_message(Message *m)
+ {
+   // envelope and payload buffers
+   msg_envelope_t *env = &m->get_envelope();
+   bufferlist payload;
+   payload.claim( m->get_payload() );
+
+ #ifdef TCP_KEEP_CHUNKS
+   env->nchunks = payload.buffers().size();
+ #else
+   env->nchunks = 1;
+ #endif
+
+   dout(20)
+     << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m
+     << " to " << m->get_dest()
+     << endl;
+
+   // envelope goes first
+   int rc = tcp_write( sd, (char*)env, sizeof(*env) );
+   if (rc < 0) {
+     derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m
+             << " to " << m->get_dest() << endl;
+     return -1;
+   }
+
+   // then the payload
+ #ifdef TCP_KEEP_CHUNKS
+   // one (length, data) pair per buffer
+   int frag = 0;
+   for (list<bufferptr>::const_iterator bp = payload.buffers().begin();
+        bp != payload.buffers().end();
+        ++bp) {
+     dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << frag << " len " << (*bp).length() << endl;
+     int size = (*bp).length();
+     rc = tcp_write( sd, (char*)&size, sizeof(size) );
+     if (rc < 0) {
+       derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
+       return -1;
+     }
+     rc = tcp_write( sd, (*bp).c_str(), size );
+     if (rc < 0) {
+       derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl;
+       return -1;
+     }
+     frag++;
+   }
+ #else
+   // total length once, then all non-empty buffers
+   int size = payload.length();
+   rc = tcp_write( sd, (char*)&size, sizeof(size) );
+   if (rc < 0) {
+     derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
+     return -1;
+   }
+   dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << payload.buffers().size() << " buffers" << endl;
+
+   for (list<bufferptr>::const_iterator bp = payload.buffers().begin();
+        bp != payload.buffers().end();
+        ++bp) {
+     if ((*bp).length() == 0) continue; // blank buffer.
+     rc = tcp_write( sd, (char*)(*bp).c_str(), (*bp).length() );
+     if (rc < 0) {
+       derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*bp).length() << endl;
+       return -1;
+     }
+   }
+ #endif
+
+   return 0;
+ }
+
+
+ /*
+  * fail
+  * Called when the pipe cannot deliver.  Unregisters the pipe from
+  * rank.rank_pipe, prepends `out` (messages already taken off the queue)
+  * to q, empties q, and reports every undelivered message to the
+  * dispatcher of its local source entity via ms_handle_failure().
+  * MSG_CLOSE messages are simply deleted.
+  */
+ void Rank::Pipe::fail(list<Message*>& out)
+ {
+   derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl;
+
+   // we need the rank messenger to exist
+   if (!rank.messenger) {
+     derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl;
+     assert(0);
+   }
+
+   // FIXME: possible race before i reclaim lock here?
+
+   // take myself out of the rank's pipe table
+   rank.lock.Lock();
+   {
+     hash_map<int,Pipe*>::iterator reg = rank.rank_pipe.find(peer_inst.rank);
+     if (reg != rank.rank_pipe.end() && reg->second == this)
+       rank.rank_pipe.erase(reg);
+   }
+   rank.lock.Unlock();
+
+   // what do i do about reader()? FIXME
+
+   // bucket the undelivered messages by (source) dispatcher, then dest.
+   typedef map<msg_addr_t, list<Message*> > by_dest_t;
+   typedef map<Dispatcher*, by_dest_t> by_dispatcher_t;
+   by_dispatcher_t undelivered;
+
+   lock.Lock();
+   {
+     // `out` goes back to the front of the queue
+     q.splice(q.begin(), out);
+
+     while (!q.empty()) {
+       Message *msg = q.front();
+       q.pop_front();
+
+       if (msg->get_type() == MSG_CLOSE) {
+         delete msg;
+         continue;
+       }
+       if (rank.local.count(msg->get_source())) {
+         Dispatcher *dis = rank.local[msg->get_source()]->get_dispatcher();
+         undelivered[dis][msg->get_dest()].push_back(msg);
+         continue;
+       }
+
+       // oh well. sending entity musta just shut down?
+       assert(0);
+       delete msg;
+     }
+   }
+   lock.Unlock();
+
+   // report failure(s) to dispatcher(s)
+   for (by_dispatcher_t::iterator d = undelivered.begin();
+        d != undelivered.end();
+        ++d) {
+     by_dest_t& dests = d->second;
+     for (by_dest_t::iterator t = dests.begin(); t != dests.end(); ++t) {
+       list<Message*>& msgs = t->second;
+       for (list<Message*>::iterator k = msgs.begin(); k != msgs.end(); ++k) {
+         derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << t->first << " inst " << peer_inst << endl;
+         d->first->ms_handle_failure(*k, t->first, peer_inst);
+       }
+     }
+   }
+ }
+
+
+
+
+
+
+/********************************************
+ * Rank
+ */
+
+ /*
+  * Construct a Rank with the given rank number (start_rank() treats a
+  * negative value as "not assigned yet").  No namer exists until
+  * start_namer() is called.
+  */
+ Rank::Rank(int r) :
+   single_dispatcher(this),
+   my_rank(r),
+   namer(0) {
+   // Pipe::fail() tests rank.messenger for null, so it must not be left
+   // uninitialised.  Assigned in the body rather than the initialiser
+   // list so member declaration order does not matter.
+   messenger = 0;
+   single_dispatch_stop = false;
+ }
+ /*
+  * Destroy the Rank, releasing the namer if one was created.
+  */
+ Rank::~Rank()
+ {
+   //FIXME
+   delete namer;   // deleting a null pointer is a no-op
+ }
+
+
+ /*
+  * _submit_single_dispatch
+  * Queue m for the single dispatch thread if its destination is a local
+  * entity that is ready; otherwise park it in waiting_for_ready.
+  * Caller must hold lock.
+  */
+ void Rank::_submit_single_dispatch(Message *m)
+ {
+   assert(lock.is_locked());
+
+   const bool deliverable =
+     local.count(m->get_dest()) &&
+     local[m->get_dest()]->is_ready();
+
+   if (!deliverable) {
+     waiting_for_ready[m->get_dest()].push_back(m);
+     return;
+   }
+
+   rank.single_dispatch_queue.push_back(m);
+   rank.single_dispatch_cond.Signal();
+ }
+
+
+ /*
+  * single_dispatcher_entry
+  * Body of the single dispatch thread.  Repeatedly takes the whole
+  * single_dispatch_queue and delivers each message (to the rank itself or
+  * to the local entity named as destination) with the lock released.
+  * Runs until single_dispatch_stop is set and the queue is empty.
+  */
+ void Rank::single_dispatcher_entry()
+ {
+   lock.Lock();
+   while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
+     if (single_dispatch_queue.empty()) {
+       // nothing to do; sleep until signalled
+       single_dispatch_cond.Wait(lock);
+       continue;
+     }
+
+     // grab everything queued so far
+     list<Message*> batch;
+     batch.swap(single_dispatch_queue);
+
+     lock.Unlock();
+     for (; !batch.empty(); batch.pop_front()) {
+       Message *m = batch.front();
+
+       dout(1)
+         << "---- "
+         << m->get_source()
+         << " to " << m->get_dest()
+         << " ---- " << m->get_type_name()
+         << " ---- " << m
+         << endl;
+
+       if (m->get_dest().type() == MSG_ADDR_RANK_BASE) {
+         rank.dispatch(m);
+       } else {
+         assert(local.count(m->get_dest()));
+         local[m->get_dest()]->dispatch(m);
+       }
+     }
+     lock.Lock();
+   }
+   lock.Unlock();
+ }
+
+
+/*
+ * note: assumes lock is held
+ */
+ /*
+  * reaper
+  * Join and delete every pipe on pipe_reap_queue, removing each from
+  * `pipes`.  Caller must hold lock.
+  */
+ void Rank::reaper()
+ {
+   dout(10) << "reaper" << endl;
+   assert(lock.is_locked());
+
+   while (!pipe_reap_queue.empty()) {
+     Pipe *dead = pipe_reap_queue.front();
+     dout(10) << "reaper reaping pipe " << dead->get_peer_inst() << endl;
+     pipe_reap_queue.pop_front();
+
+     // forget it ...
+     assert(pipes.count(dead));
+     pipes.erase(dead);
+
+     // ... wait for its threads ...
+     dead->join();
+     dout(10) << "reaper reaped pipe " << dead->get_peer_inst() << endl;
+
+     // ... and free it.
+     delete dead;
+   }
+ }
+
+
+ /*
+  * start_rank
+  * Bring this rank up: start the accepter (and the single dispatch thread
+  * if configured), fill in my_inst, and create the rank's own
+  * EntityMessenger.  If my_rank is negative, a rank is first requested
+  * from namer0 with MNSConnect and this call blocks until it is assigned.
+  *
+  * Returns 0 on success, -1 if the accepter could not be started.
+  */
+ int Rank::start_rank()
+ {
+   dout(10) << "start_rank" << endl;
+
+   // listening socket first
+   if (accepter.start() < 0)
+     return -1;
+
+   // optional single dispatch thread
+   if (g_conf.ms_single_dispatch) {
+     single_dispatch_stop = false;
+     single_dispatcher.create();
+   }
+
+   lock.Lock();
+
+   // who am i?
+   my_inst.addr = accepter.listen_addr;
+   my_inst.rank = my_rank;
+
+   if (my_rank >= 0) {
+     // rank is already known: just create the rank entity.
+     msg_addr_t raddr = MSG_ADDR_RANK(my_rank);
+     entity_map[raddr] = my_inst;
+     entity_unstarted.insert(raddr);
+     messenger = new EntityMessenger(raddr);
+     local[raddr] = messenger;
+     messenger->set_dispatcher(this);
+
+     dout(1) << "start_rank " << my_rank << " at " << my_inst << endl;
+   } else {
+     dout(10) << "start_rank connecting to namer0" << endl;
+
+     // open a pipe to the namer
+     assert(entity_map.count(MSG_ADDR_NAMER(0)));
+     Pipe *to_namer = connect_rank(entity_map[MSG_ADDR_NAMER(0)]);
+
+     // ask for a rank
+     Message *hello = new MNSConnect(accepter.listen_addr);
+     hello->set_dest(MSG_ADDR_NAMER(0), 0);
+     to_namer->send(hello);
+
+     // block until handle_connect_ack() assigns one
+     while (my_rank < 0)
+       waiting_for_rank.Wait(lock);
+     assert(my_rank >= 0);
+
+     dout(10) << "start_rank got rank " << my_rank << endl;
+
+     // create rank entity
+     entity_map[MSG_ADDR_RANK(my_rank)] = my_inst;
+     messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank));
+     local[MSG_ADDR_RANK(my_rank)] = messenger;
+     messenger->set_dispatcher(this);
+   }
+
+   lock.Unlock();
+   return 0;
+ }
+
+ /*
+  * start_namer
+  * Host namer0 on this rank: record it in entity_map at my_inst, create
+  * its EntityMessenger and the Namer that uses it, and remember my_inst
+  * as namer_inst.
+  */
+ void Rank::start_namer()
+ {
+   const msg_addr_t namer0 = MSG_ADDR_NAMER(0);
+
+   entity_map[namer0] = my_inst;
+   local[namer0] = new EntityMessenger(namer0);
+
+   namer = new Namer(local[namer0]);
+   namer_inst = my_inst;
+ }
+
+ /*
+  * set_namer
+  * Record ns as the address of namer0 (rank 0), both in entity_map and in
+  * namer_inst.
+  */
+ void Rank::set_namer(const tcpaddr_t& ns)
+ {
+   entity_inst_t& known = entity_map[MSG_ADDR_NAMER(0)];
+
+   known.addr = ns;
+   namer_inst.addr = known.addr;
+
+   known.rank = 0;
+   namer_inst.rank = known.rank;
+ }
+
+/* connect_rank
+ * NOTE: assumes rank.lock held.
+ */
+ /*
+  * connect_rank
+  * Create a new Pipe to inst, install it as rank_pipe[inst.rank] and add
+  * it to `pipes`.  Caller must hold rank.lock; inst must not be our own
+  * instance.  Returns the new pipe.
+  */
+ Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst)
+ {
+   assert(rank.lock.is_locked());
+   assert(inst != rank.my_inst);
+
+   dout(10) << "connect_rank to " << inst << endl;
+
+   Pipe *fresh = new Pipe(inst);
+
+   // register it
+   rank.rank_pipe[inst.rank] = fresh;
+   pipes.insert(fresh);
+
+   return fresh;
+ }
+
+
+
+
+
+ /*
+  * show_dir
+  * Debug dump of entity_map at level 10; entries that are also in `local`
+  * are tagged " local ".
+  */
+ void Rank::show_dir()
+ {
+   dout(10) << "show_dir ---" << endl;
+
+   hash_map<msg_addr_t, entity_inst_t>::iterator e = entity_map.begin();
+   while (e != entity_map.end()) {
+     const char *tag = local.count(e->first) ? " local " : "";
+     dout(10) << "show_dir entity_map " << e->first << " -> " << e->second << tag << endl;
+     e++;
+   }
+ }
+
+
+ /* lookup
+ * NOTE: assumes the Rank lock (lock) is held
+ */
+ // Ask namer0 where addr lives by sending it an MNSLookup.  The address is
+ // recorded in looking_up; it must not already be there.  Caller must hold
+ // lock.
+ void Rank::lookup(msg_addr_t addr)
+ {
+   dout(10) << "lookup " << addr << endl;
+   assert(lock.is_locked());
+
+   // only one lookup at a time per address
+   assert(looking_up.count(addr) == 0);
+   looking_up.insert(addr);
+
+   messenger->send_message(new MNSLookup(addr),
+                           MSG_ADDR_NAMER(0), namer_inst);
+ }
+
+
+
+/* register_entity
+ */
+ /*
+  * register_entity
+  * Register addr with the namer (MNSRegister), block until
+  * handle_register_ack() delivers the result, then create an
+  * EntityMessenger for the address that came back, add it to entity_map
+  * and `local`, and resubmit any messages that were waiting on a lookup
+  * for it.  Returns the new messenger.
+  */
+ Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+ {
+   dout(10) << "register_entity " << addr << endl;
+   lock.Lock();
+
+   // each attempt gets its own transaction id
+   static long reg_attempt = 0;
+   const long tid = ++reg_attempt;
+
+   // build the request
+   Message *request = new MNSRegister(addr, my_rank, tid);
+   request->set_source(MSG_ADDR_RANK(my_rank), 0);
+   request->set_source_inst(my_inst);
+   request->set_dest(MSG_ADDR_DIRECTORY, 0);
+
+   // the ack handler will signal this
+   Cond acked;
+   waiting_for_register_cond[tid] = &acked;
+
+   // send with the lock dropped
+   lock.Unlock();
+   submit_message(request);
+   lock.Lock();
+
+   // wait for the result to show up
+   while (!waiting_for_register_result.count(tid))
+     acked.Wait(lock);
+
+   // take the result and clean up the bookkeeping
+   addr = waiting_for_register_result[tid];
+   dout(10) << "register_entity got " << addr << endl;
+
+   waiting_for_register_cond.erase(tid);
+   waiting_for_register_result.erase(tid);
+
+   // create the messenger and publish it locally
+   EntityMessenger *fresh = new EntityMessenger(addr);
+   entity_map[addr] = my_inst;
+   local[addr] = fresh;
+
+   // deliver anything that was parked for this address
+   if (waiting_for_lookup.count(addr)) {
+     submit_messages(waiting_for_lookup[addr]);
+     waiting_for_lookup.erase(addr);
+   }
+
+   lock.Unlock();
+   return fresh;
+ }
+
+ /*
+  * unregister_entity
+  * Remove msgr's address from `local` (and from entity_map when
+  * my_rank > 0), send MSG_NS_UNREGISTER to the namer unless the entity is
+  * namer0 or rank0, and signal wait_cond once at most two local entities
+  * remain.
+  */
+ void Rank::unregister_entity(EntityMessenger *msgr)
+ {
+   lock.Lock();
+   dout(10) << "unregister_entity " << msgr->get_myaddr() << endl;
+
+   // remove from local directory.
+   assert(local.count(msgr->get_myaddr()));
+   local.erase(msgr->get_myaddr());
+
+   // on rank 0 the namer takes care of entity_map itself.
+   if (my_rank > 0) {
+     assert(entity_map.count(msgr->get_myaddr()));
+     entity_map.erase(msgr->get_myaddr());
+   }
+
+   // tell namer (namer0 and rank0 don't report themselves).
+   const bool is_namer0 = !(msgr->get_myaddr() != MSG_ADDR_NAMER(0));
+   const bool is_rank0 = !is_namer0 && !(msgr->get_myaddr() != MSG_ADDR_RANK(0));
+   if (!is_namer0 && !is_rank0) {
+     msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+                        MSG_ADDR_NAMER(0), namer_inst);
+   }
+
+   // kick wait()?
+   if (local.size() <= 2)
+     wait_cond.Signal();
+
+   lock.Unlock();
+ }
+
+
+ /*
+  * submit_messages
+  * Submit every message in ls via submit_message(), in order, then empty
+  * the list.
+  */
+ void Rank::submit_messages(list<Message*>& ls)
+ {
+   list<Message*>::iterator cur = ls.begin();
+   while (cur != ls.end()) {
+     submit_message(*cur);
+     cur++;
+   }
+   ls.clear();
+ }
+
+
+
+ /*
+  * submit_message (explicit destination instance)
+  * Route m toward dest_inst.  If dest_inst is our own rank, the message
+  * goes to the local entity (or the single dispatch queue); otherwise it
+  * is sent on the pipe registered for that rank, creating one with
+  * connect_rank() if there is none.
+  */
+ void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
+ {
+   const msg_addr_t dest = m->get_dest();
+
+   // at most one of these is set while the lock is held
+   EntityMessenger *target = 0;
+   Pipe *via = 0;
+
+   lock.Lock();
+   if (dest_inst.rank != my_inst.rank) {
+     // remote.
+     if (rank_pipe.count( dest_inst.rank )) {
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl;
+       via = rank_pipe[ dest_inst.rank ];
+     } else {
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+       via = connect_rank( dest_inst );
+     }
+   }
+   else if (local.count(dest)) {
+     // local
+     dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+     if (g_conf.ms_single_dispatch)
+       _submit_single_dispatch(m);
+     else
+       target = local[dest];
+   }
+   else {
+     // our rank, but the entity isn't registered (yet)
+     dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl;
+     assert(0); // hmpf
+     waiting_for_lookup[dest].push_back(m);
+   }
+   lock.Unlock();
+
+   // hand the message over with the lock released
+   if (target) {
+     dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+     target->queue_message(m);
+   }
+   else if (via) {
+     dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+     via->send(m);
+   }
+ }
+
+
+ /*
+  * submit_message (destination resolved here)
+  * Route m by its destination address alone: to a local entity, to the
+  * pipe for the instance entity_map gives for it, or -- when the address
+  * is unknown -- onto waiting_for_lookup, starting a lookup if one is not
+  * already in progress.
+  */
+ void Rank::submit_message(Message *m)
+ {
+   const msg_addr_t dest = m->get_dest();
+
+   // at most one of these is set while the lock is held
+   EntityMessenger *target = 0;
+   Pipe *via = 0;
+
+   lock.Lock();
+   if (local.count(dest)) {
+     // local
+     dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+     if (g_conf.ms_single_dispatch)
+       _submit_single_dispatch(m);
+     else
+       target = local[dest];
+   }
+   else if (entity_map.count( dest )) {
+     // known address, not (yet) a local entity.
+     entity_inst_t where = entity_map[dest];
+
+     if (where == my_inst) {
+       dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl;
+       waiting_for_lookup[dest].push_back(m);
+     }
+     else if (rank_pipe.count( where.rank ) &&
+              rank_pipe[where.rank]->get_peer_inst() == where) {
+       // a pipe to exactly that instance already exists.
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << where << ", connected." << endl;
+       via = rank_pipe[ where.rank ];
+     }
+     else {
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << where << ", connecting." << endl;
+       via = connect_rank( where );
+     }
+   }
+   else {
+     // unknown address: park the message until a lookup reply arrives.
+     if (looking_up.count(dest) != 0) {
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl;
+     } else {
+       dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl;
+       lookup(dest);
+     }
+     waiting_for_lookup[dest].push_back(m);
+   }
+   lock.Unlock();
+
+   // hand the message over with the lock released
+   if (target) {
+     dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+     target->queue_message(m);
+   }
+   else if (via) {
+     dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+     via->send(m);
+   }
+ }
+
+
+
+
+ /*
+  * dispatch
+  * Handle a message addressed to the rank itself.  Only the three namer
+  * replies are understood; any other type asserts.  Runs under lock.
+  */
+ void Rank::dispatch(Message *m)
+ {
+   lock.Lock();
+
+   dout(10) << "dispatching " << *m << endl;
+
+   const int type = m->get_type();
+   if (type == MSG_NS_CONNECTACK) {
+     handle_connect_ack((MNSConnectAck*)m);
+   }
+   else if (type == MSG_NS_REGISTERACK) {
+     handle_register_ack((MNSRegisterAck*)m);
+   }
+   else if (type == MSG_NS_LOOKUPREPLY) {
+     handle_lookup_reply((MNSLookupReply*)m);
+   }
+   else {
+     assert(0);
+   }
+
+   lock.Unlock();
+ }
+
+ /*
+  * handle_connect_ack
+  * The namer assigned us a rank: adopt it, refresh my_inst, free the
+  * message and wake everyone blocked on waiting_for_rank.
+  */
+ void Rank::handle_connect_ack(MNSConnectAck *m)
+ {
+   dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl;
+
+   // adopt the rank; the message is not needed after this.
+   my_rank = m->get_rank();
+   delete m;
+
+   my_inst.rank = my_rank;
+   my_inst.addr = accepter.listen_addr;
+
+   waiting_for_rank.SignalAll();
+
+   // logger! (currently disabled)
+   /*dout(10) << "logger" << endl;
+   char names[100];
+   sprintf(names, "rank%d", my_rank);
+   string name = names;
+
+   if (g_conf.tcp_log) {
+     logger = new Logger(name, (LogType*)&rank_logtype);
+     rank_logtype.add_set("num");
+     rank_logtype.add_inc("in");
+     rank_logtype.add_inc("inb");
+     rank_logtype.add_inc("dis");
+     rank_logtype.add_set("inq");
+     rank_logtype.add_set("inqb");
+     rank_logtype.add_set("outq");
+     rank_logtype.add_set("outqb");
+   }
+   */
+ }
+
+
+ /*
+  * handle_register_ack
+  * Deliver the namer's answer to the register_entity() call waiting on
+  * the ack's tid: store the granted address in
+  * waiting_for_register_result and signal that caller's Cond.
+  * An ack whose tid nobody is waiting on is logged and dropped.
+  * Frees the message.
+  */
+ void Rank::handle_register_ack(MNSRegisterAck *m)
+ {
+   dout(10) << "handle_register_ack " << m->get_entity() << endl;
+
+   long tid = m->get_tid();
+
+   // operator[] on an unknown tid would insert a null Cond* and then
+   // dereference it, so check first.
+   if (waiting_for_register_cond.count(tid) == 0) {
+     derr(0) << "handle_register_ack " << m->get_entity()
+             << " has unknown tid " << tid << ", dropping" << endl;
+     delete m;
+     return;
+   }
+
+   waiting_for_register_result[tid] = m->get_entity();
+   waiting_for_register_cond[tid]->Signal();
+   delete m;
+ }
+
+ /*
+  * handle_lookup_reply
+  * Apply the (address -> instance) pairs in a namer lookup reply.
+  * For each pair that is not older than what entity_map already holds:
+  *  - entity_map is updated;
+  *  - if the instance is our own rank, messages waiting on the address
+  *    are delivered to the local entity, or, if the entity is not
+  *    registered here yet, the lookup is issued again;
+  *  - otherwise a pipe to that instance is ensured (replacing a pipe
+  *    that points at a different instance of the same rank) and the
+  *    waiting messages are sent on it.
+  * Frees the message.  Called from dispatch(), i.e. with lock held.
+  */
+ void Rank::handle_lookup_reply(MNSLookupReply *m)
+ {
+   dout(10) << "got lookup reply" << endl;
+
+   for (map<msg_addr_t,entity_inst_t>::iterator it = m->entity_map.begin();
+        it != m->entity_map.end();
+        it++) {
+     dout(10) << "lookup got " << it->first << " at " << it->second << endl;
+     msg_addr_t addr = it->first;
+     entity_inst_t inst = it->second;
+
+     // This lookup has been answered.  lookup() asserts that the address
+     // is not already in looking_up, so without this the "try again"
+     // below would trip that assert, and a later submit_message() for the
+     // address would believe a lookup was still in flight.
+     looking_up.erase(addr);
+
+     if (entity_map.count(addr) &&
+         entity_map[addr] > inst) {
+       dout(10) << "ignoring lookup results for " << addr << ", " \
+                << entity_map[addr] << " > " << inst << endl;
+       continue;
+     }
+
+     // update map.
+     entity_map[addr] = inst;
+
+     if (inst.rank == my_rank) {
+       // local
+       dout(10) << "delivering lookup results locally" << endl;
+       if (local.count(addr)) {
+         if (g_conf.ms_single_dispatch) {
+           single_dispatch_queue.splice(single_dispatch_queue.end(),
+                                        waiting_for_lookup[addr]);
+           // wake the dispatch thread in case it is asleep on an
+           // (until now) empty queue.
+           single_dispatch_cond.Signal();
+         } else {
+           local[addr]->queue_messages(waiting_for_lookup[addr]);
+         }
+         waiting_for_lookup.erase(addr);
+       } else
+         lookup(addr); // try again!
+
+     } else {
+       // remote
+       if (rank_pipe.count(inst.rank) == 0)
+         connect_rank(inst);
+       else if (rank_pipe[inst.rank]->get_peer_inst() != inst) {
+         dout(0) << "lookup got rank addr change, WATCH OUT" << endl;
+         // FIXME BUG possible message loss weirdness?
+         rank_pipe[inst.rank]->close();
+         rank_pipe.erase(inst.rank);
+         connect_rank(inst);
+       }
+
+       // take waiters
+       Pipe *pipe = rank_pipe[inst.rank];
+       assert(pipe);
+
+       if (waiting_for_lookup.count(addr)) {
+         pipe->send(waiting_for_lookup[addr]);
+         waiting_for_lookup.erase(addr);
+       }
+     }
+   }
+
+   delete m;
+ }
+
+
+ /*
+  * wait
+  * Block until every local entity has gone away, then tear down: shut
+  * down and delete the rank messenger once it is the last local entity,
+  * stop the single dispatch thread (if used), close every registered
+  * pipe, and wait until all pipes have been reaped.
+  */
+ void Rank::wait()
+ {
+   lock.Lock();
+   for (;;) {
+     // reap dead pipes
+     reaper();
+
+     if (local.size() == 0) {
+       dout(10) << "wait: everything stopped" << endl;
+       break; // everything stopped.
+     }
+
+     const bool only_rank_left =
+       local.size() == 1 && !messenger->is_stopped();
+     if (only_rank_left) {
+       dout(10) << "wait: stopping rank" << endl;
+       lock.Unlock();
+       messenger->shutdown();
+       delete messenger;
+       lock.Lock();
+       continue;
+     }
+
+     wait_cond.Wait(lock);
+   }
+   lock.Unlock();
+
+   // done! clean up.
+
+   // stop dispatch thread
+   if (g_conf.ms_single_dispatch) {
+     dout(10) << "wait: stopping dispatch thread" << endl;
+     lock.Lock();
+     single_dispatch_stop = true;
+     single_dispatch_cond.Signal();
+     lock.Unlock();
+     single_dispatcher.join();
+   }
+
+   // close and reap pipes
+   lock.Lock();
+   dout(10) << "wait: closing pipes" << endl;
+
+   // snapshot first: close() removes entries from rank_pipe.
+   list<Pipe*> open_pipes;
+   for (hash_map<int,Pipe*>::iterator rp = rank_pipe.begin();
+        rp != rank_pipe.end();
+        ++rp)
+     open_pipes.push_back(rp->second);
+
+   for (list<Pipe*>::iterator p = open_pipes.begin();
+        p != open_pipes.end();
+        ++p)
+     (*p)->close();
+
+   dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl;
+   while (!pipes.empty()) {
+     wait_cond.Wait(lock);
+     reaper();
+   }
+   lock.Unlock();
+
+   dout(10) << "wait: done." << endl;
+ }
+
+
+
+ /*
+  * find_ns_addr
+  * Locate the name server address.  Tried in order:
+  *  1. the file ".ceph_ns" in the current directory, read as a raw
+  *     tcpaddr_t;
+  *  2. the CEPH_NAMESERVER environment variable, resolved with
+  *     tcp_hostlookup().  A value of the form "NAME=host" is accepted
+  *     too; everything up to and including the first '=' is skipped.
+  *
+  * Returns 0 with nsa filled in, or -1 if neither source yields an
+  * address.
+  */
+ int Rank::find_ns_addr(tcpaddr_t &nsa)
+ {
+   // file?
+   int fd = ::open(".ceph_ns",O_RDONLY);
+   if (fd >= 0) {   // 0 is a valid descriptor; only negative is an error
+     int got = ::read(fd, (void*)&nsa, sizeof(nsa));
+     ::close(fd);
+     if (got == (int)sizeof(nsa)) {
+       cout << "ceph ns is " << nsa << endl;
+       return 0;
+     }
+     // short or failed read: nsa is not trustworthy, try the environment.
+     cerr << "couldn't read a full address from .ceph_ns" << endl;
+   }
+
+   // env var?
+   char *nsaddr = getenv("CEPH_NAMESERVER");
+   if (nsaddr) {
+     // getenv() returns only the value, so there is normally no '=' to
+     // skip.  Scan for one, but stop at the terminating NUL instead of
+     // running off the end of the string.
+     char *p = nsaddr;
+     while (*p && *p != '=') p++;
+     if (*p == '=')
+       nsaddr = p + 1;
+
+     if (tcp_hostlookup(nsaddr, nsa) < 0) {
+       cout << "can't resolve " << nsaddr << endl;
+       return -1;
+     }
+
+     cout << "ceph ns is " << nsa << endl;
+     return 0;
+   }
+
+   cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
+   return -1;
+ }
+
+
+
+/**********************************
+ * EntityMessenger
+ */
+
+ /*
+  * Create a messenger for the entity at myaddr.  The dispatch thread
+  * object is bound to this messenger here; ready() is what starts it.
+  */
+ Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr)
+   : Messenger(myaddr),
+     stop(false),
+     dispatch_thread(this)
+ {
+   // nothing else to set up
+ }
+ /*
+  * Nothing to release here.
+  */
+ Rank::EntityMessenger::~EntityMessenger()
+ {
+   // intentionally empty
+ }
+
+ /*
+  * dispatch_entry
+  * Body of this entity's dispatch thread.  Until `stop` is set, takes the
+  * whole dispatch_queue and hands each message to dispatch() with the
+  * lock released, sleeping on cond while the queue is empty.
+  */
+ void Rank::EntityMessenger::dispatch_entry()
+ {
+   lock.Lock();
+   while (!stop) {
+     if (dispatch_queue.empty()) {
+       // idle
+       cond.Wait(lock);
+       continue;
+     }
+
+     // grab everything queued so far
+     list<Message*> batch;
+     batch.swap(dispatch_queue);
+
+     lock.Unlock();
+     for (; !batch.empty(); ) {
+       Message *m = batch.front();
+       batch.pop_front();
+       dout(1)
+         << "---- "
+         << m->get_source()
+         << " to " << m->get_dest()
+         << " ---- " << m->get_type_name()
+         << " ---- " << m->get_source_inst()
+         << " ---- " << m
+         << endl;
+       dispatch(m);
+     }
+     lock.Lock();
+   }
+   lock.Unlock();
+ }
+
+ /*
+  * ready
+  * Mark this entity as able to receive.  In single-dispatch mode any
+  * messages parked in rank.waiting_for_ready for this address are moved
+  * to the single dispatch queue; otherwise the entity's own dispatch
+  * thread is started.  Every entity except namer0 and rank0 then sends
+  * MSG_NS_STARTED to the namer.
+  */
+ void Rank::EntityMessenger::ready()
+ {
+   dout(10) << "ready " << get_myaddr() << endl;
+
+   if (!g_conf.ms_single_dispatch) {
+     // start my dispatch thread
+     dispatch_thread.create();
+   } else {
+     rank.lock.Lock();
+     if (rank.waiting_for_ready.count(get_myaddr())) {
+       // release whatever arrived before we were ready
+       rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
+                                         rank.waiting_for_ready[get_myaddr()]);
+       rank.waiting_for_ready.erase(get_myaddr());
+       rank.single_dispatch_cond.Signal();
+     }
+     rank.lock.Unlock();
+   }
+
+   // tell namer
+   if (get_myaddr() != MSG_ADDR_NAMER(0) &&
+       get_myaddr() != MSG_ADDR_RANK(0)) {
+     Message *started = new MGenericMessage(MSG_NS_STARTED);
+     send_message(started, MSG_ADDR_NAMER(0), rank.namer_inst);
+   }
+ }
+
+
+ /*
+  * shutdown
+  * Unregister this entity from the rank and stop its dispatch thread.
+  * When called from the dispatch thread itself only the stop flag is set;
+  * otherwise the thread is woken and joined.  Always returns 0.
+  */
+ int Rank::EntityMessenger::shutdown()
+ {
+   dout(10) << "shutdown " << get_myaddr() << endl;
+
+   // deregister
+   rank.unregister_entity(this);
+
+   // stop my dispatch thread
+   if (!dispatch_thread.am_self()) {
+     dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
+     lock.Lock();
+     stop = true;
+     cond.Signal();
+     lock.Unlock();
+     dispatch_thread.join();
+     return 0;
+   }
+
+   // a thread can't join itself; it will notice the flag on its own.
+   dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
+   stop = true;
+   return 0;
+ }
+
+
+ /*
+  * prepare_dest
+  * Make sure a pipe is registered for inst's rank, creating one with
+  * connect_rank() if there is none.
+  */
+ void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst)
+ {
+   rank.lock.Lock();
+   const bool have_pipe = rank.rank_pipe.count(inst.rank) != 0;
+   if (!have_pipe)
+     rank.connect_rank(inst);
+   rank.lock.Unlock();
+ }
+
+ /*
+  * send_message (explicit destination instance)
+  * Fill in m's envelope (source = this entity, dest, our instance), log
+  * it, and hand it to the rank for delivery to inst.  Always returns 0.
+  */
+ int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+                                         int port, int fromport)
+ {
+   // address the envelope
+   m->set_dest(dest, port);
+   m->set_source(get_myaddr(), fromport);
+   m->set_source_inst(rank.my_inst);
+
+   dout(1) << "--> "
+           << m->get_source()
+           << " to " << m->get_dest()
+           << " ---- " << m->get_type_name()
+           << " ---- " << rank.my_inst << " --> " << inst
+           << " ---- " << m
+           << endl;
+
+   rank.submit_message(m, inst);
+   return 0;
+ }
+
+
+/*
+ * send_message (no destination instance; logged as DEPRECATED)
+ * Stamps the envelope of m with this entity as source and dest/port as
+ * destination, logs the send, and hands the message to the rank, which
+ * must work out where dest lives.  Always returns 0.
+ */
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // envelope: who it is from, who it is for
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << "--> " << m->get_source()
+          << " to " << m->get_dest()
+          << " ---- " << m->get_type_name()
+          << " ---- " << rank.my_inst << " --> ? (DEPRECATED)"
+          << " ---- " << m
+          << endl;
+
+  rank.submit_message(m);
+  return 0;
+}
+
+
+/*
+ * mark_down
+ * Forwards to Rank::mark_down for peer a at instance i.  An entity may
+ * not mark itself down.
+ */
+void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+
+  rank.mark_down(a, i);
+}
+
+/*
+ * mark_down
+ * Records that address a, at instance inst, is down.
+ *  - a unknown here: drop any pending lookup state for it.
+ *  - a known with a newer instance than inst: stale report, do nothing.
+ *  - otherwise: forget a, close and unregister the pipe to inst.rank, and
+ *    (on rank 0 only) forget the rank's own address as well.
+ */
+void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+{
+  //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer
+  lock.Lock();
+
+  if (entity_map.count(a) == 0) {
+    // don't know it
+    dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+
+    waiting_for_lookup.erase(a);
+    looking_up.erase(a);
+  }
+  else if (entity_map[a] > inst) {
+    // we already know a newer instance; do nothing!
+    dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+  }
+  else {
+    // know it
+    assert(entity_map[a] <= inst);
+    dout(10) << "mark_down " << a << " inst " << inst << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << endl;
+
+    entity_map.erase(a);
+
+    const bool have_pipe = rank_pipe.count(inst.rank) != 0;
+    if (have_pipe) {
+      rank_pipe[inst.rank]->close();
+      rank_pipe.erase(inst.rank);
+    }
+
+    // kill rank# too?  only if i'm the namer.
+    if (my_rank == 0)
+      entity_map.erase(MSG_ADDR_RANK(inst.rank));
+  }
+
+  lock.Unlock();
+}
+
+/*
+ * mark_up
+ * Forwards to Rank::mark_up for peer a at instance i.  An entity may not
+ * mark itself up.
+ */
+void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+
+  rank.mark_up(a, i);
+}
+
+/*
+ * mark_up
+ * Records that address a is up at instance i.  Does nothing on rank 0.
+ * If a is unknown, or known at an older instance, the entry is replaced
+ * and a connection to i is started; an identical or older report is only
+ * logged.
+ */
+void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  if (my_rank != 0) {
+    lock.Lock();
+
+    dout(10) << "mark_up " << a << " inst " << i << endl;
+    derr(10) << "mark_up " << a << " inst " << i << endl;
+
+    assert(i.rank != my_rank);  // hrm?
+
+    const bool known = entity_map.count(a) != 0;
+    if (!known || entity_map[a] < i) {
+      // new, or newer than what we had
+      entity_map[a] = i;
+      connect_rank(i);
+    }
+    else if (entity_map[a] == i) {
+      dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+      derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+    }
+    else {
+      // older than what we already have
+      dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+      derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+    }
+
+    //if (waiting_for_lookup.count(a))
+    //lookup(a);
+
+    lock.Unlock();
+  }
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __NEWMESSENGER_H
+#define __NEWMESSENGER_H
+
+
+#include <list>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "include/types.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Messenger.h"
+#include "Message.h"
+#include "tcp.h"
+
+
+
+
+/* Rank - per-process
+ */
+/* Rank - per-process
+ *
+ * Declares the per-process messenger state: a TCP accepter thread, one
+ * Pipe (reader + writer thread) per peer connection, one EntityMessenger
+ * per locally registered address, an optional single dispatch thread, and
+ * the entity/rank lookup tables that tie them together.
+ */
+class Rank : public Dispatcher {
+
+ class EntityMessenger;
+ class Pipe;
+
+ // namer
+ // Dispatcher that handles the MNS* name-service messages declared below.
+ class Namer : public Dispatcher {
+ public:
+ EntityMessenger *messenger; // namerN
+
+ int nrank;
+ int nclient, nmds, nosd, nmon;
+
+ // messages held per address
+ map<msg_addr_t, list<Message*> > waiting;
+
+ Namer(EntityMessenger *msgr);
+ ~Namer();
+
+ void handle_connect(class MNSConnect*);
+ void handle_register(class MNSRegister *m);
+ void handle_started(Message *m);
+ void handle_lookup(class MNSLookup *m);
+ void handle_unregister(Message *m);
+ void handle_failure(class MNSFailure *m);
+
+ void dispatch(Message *m);
+
+ void manual_insert_inst(const entity_inst_t &inst);
+
+ };
+
+ // incoming
+ // Thread that owns the listening socket.
+ class Accepter : public Thread {
+ public:
+ bool done;
+
+ tcpaddr_t listen_addr;
+ int listen_sd;
+
+ Accepter() : done(false) {}
+
+ void *entry();
+ // Sets done, closes the listening socket, then joins the thread.
+ void stop() {
+ done = true;
+ ::close(listen_sd);
+ join();
+ }
+ int start();
+ } accepter;
+
+
+
+ // One connection to a peer: a socket, an outgoing queue guarded by
+ // lock/cond, and a reader and a writer thread.
+ class Pipe {
+ protected:
+ int sd;
+ bool done;
+ entity_inst_t peer_inst;
+ bool server; // true when constructed from an accepted socket
+ bool sent_close;
+
+ bool reader_running;
+ bool writer_running;
+
+ list<Message*> q; // outgoing messages; guarded by lock
+ Mutex lock;
+ Cond cond;
+
+ int accept(); // server handshake
+ int connect(); // client handshake
+ void reader();
+ void writer();
+
+ Message *read_message();
+ int write_message(Message *m);
+ void fail(list<Message*>& ls);
+
+ // threads
+ // Each thread object just runs the matching Pipe member function.
+ class Reader : public Thread {
+ Pipe *pipe;
+ public:
+ Reader(Pipe *p) : pipe(p) {}
+ void *entry() { pipe->reader(); return 0; }
+ } reader_thread;
+ friend class Reader;
+
+ class Writer : public Thread {
+ Pipe *pipe;
+ public:
+ Writer(Pipe *p) : pipe(p) {}
+ void *entry() { pipe->writer(); return 0; }
+ } writer_thread;
+ friend class Writer;
+
+ public:
+ // Server side: wraps socket s and starts the reader thread immediately.
+ Pipe(int s) : sd(s),
+ done(false), server(true),
+ sent_close(false),
+ reader_running(false), writer_running(false),
+ reader_thread(this), writer_thread(this) {
+ // server
+ reader_running = true;
+ reader_thread.create();
+ }
+ // Client side: remembers the peer and starts the writer thread
+ // immediately; sd starts out as 0.
+ Pipe(const entity_inst_t &pi) : sd(0),
+ done(false), peer_inst(pi), server(false),
+ sent_close(false),
+ reader_running(false), writer_running(false),
+ reader_thread(this), writer_thread(this) {
+ // client
+ writer_running = true;
+ writer_thread.create();
+ }
+
+ // public constructors
+ static const Pipe& Server(int s);
+ static const Pipe& Client(const entity_inst_t& pi);
+
+ entity_inst_t& get_peer_inst() { return peer_inst; }
+
+ void close();
+ // Joins the writer thread, then the reader thread.
+ void join() {
+ writer_thread.join();
+ reader_thread.join();
+ }
+
+ // Appends m to the outgoing queue under lock and signals cond.
+ void send(Message *m) {
+ lock.Lock();
+ q.push_back(m);
+ cond.Signal();
+ lock.Unlock();
+ }
+ // Splices all of ls onto the outgoing queue (ls is left empty) under
+ // lock and signals cond.
+ void send(list<Message*>& ls) {
+ lock.Lock();
+ q.splice(q.end(), ls);
+ cond.Signal();
+ lock.Unlock();
+ }
+ };
+
+
+
+ // messenger interface
+ // Messenger for one local address: an incoming queue guarded by
+ // lock/cond plus a dispatch thread that runs dispatch_entry().
+ class EntityMessenger : public Messenger {
+ Mutex lock;
+ Cond cond;
+ list<Message*> dispatch_queue;
+ bool stop;
+
+ class DispatchThread : public Thread {
+ EntityMessenger *m;
+ public:
+ DispatchThread(EntityMessenger *_m) : m(_m) {}
+ void *entry() {
+ m->dispatch_entry();
+ return 0;
+ }
+ } dispatch_thread;
+ void dispatch_entry();
+
+ public:
+ // Appends m to the dispatch queue under lock and signals cond.
+ void queue_message(Message *m) {
+ lock.Lock();
+ dispatch_queue.push_back(m);
+ cond.Signal();
+ lock.Unlock();
+ }
+ // Splices ls onto the dispatch queue.  ls is taken by value, so the
+ // caller's list is copied and left untouched.
+ void queue_messages(list<Message*> ls) {
+ lock.Lock();
+ dispatch_queue.splice(dispatch_queue.end(), ls);
+ cond.Signal();
+ lock.Unlock();
+ }
+
+ public:
+ EntityMessenger(msg_addr_t myaddr);
+ ~EntityMessenger();
+
+ void ready();
+ bool is_stopped() { return stop; }
+
+ // Blocks until the dispatch thread has exited.
+ void wait() {
+ dispatch_thread.join();
+ }
+
+ virtual void callback_kick() {}
+ virtual int shutdown();
+ virtual void prepare_dest(const entity_inst_t& inst);
+ virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+ virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst,
+ int port=0, int fromport=0);
+
+ virtual void mark_down(msg_addr_t a, entity_inst_t& i);
+ virtual void mark_up(msg_addr_t a, entity_inst_t& i);
+ //virtual void reset(msg_addr_t a);
+ };
+
+
+ // Thread that runs Rank::single_dispatcher_entry().
+ class SingleDispatcher : public Thread {
+ Rank *rank;
+ public:
+ SingleDispatcher(Rank *r) : rank(r) {}
+ void *entry() {
+ rank->single_dispatcher_entry();
+ return 0;
+ }
+ } single_dispatcher;
+
+ Cond single_dispatch_cond;
+ bool single_dispatch_stop;
+ list<Message*> single_dispatch_queue;
+
+ // messages held per destination address until that entity calls ready()
+ map<msg_addr_t, list<Message*> > waiting_for_ready;
+
+ void single_dispatcher_entry();
+ void _submit_single_dispatch(Message *m);
+
+
+ // Rank stuff
+ public:
+ Mutex lock;
+ Cond wait_cond; // for wait()
+
+ // my rank
+ int my_rank;
+ Cond waiting_for_rank;
+
+ // my instance
+ entity_inst_t my_inst;
+
+ // lookup
+ hash_map<msg_addr_t, entity_inst_t> entity_map;
+ hash_set<msg_addr_t> entity_unstarted;
+
+ map<msg_addr_t, list<Message*> > waiting_for_lookup;
+ set<msg_addr_t> looking_up;
+
+ // register
+ map<int, Cond* > waiting_for_register_cond;
+ map<int, msg_addr_t > waiting_for_register_result;
+
+ // local
+ map<msg_addr_t, EntityMessenger*> local;
+
+ // remote
+ hash_map<int, Pipe*> rank_pipe; // rank number -> registered pipe
+
+ set<Pipe*> pipes;
+ list<Pipe*> pipe_reap_queue;
+
+ EntityMessenger *messenger; // rankN
+ Namer *namer;
+
+ entity_inst_t namer_inst;
+
+ void show_dir();
+
+ void lookup(msg_addr_t addr);
+
+ void dispatch(Message *m);
+ void handle_connect_ack(class MNSConnectAck *m);
+ void handle_register_ack(class MNSRegisterAck *m);
+ void handle_lookup_reply(class MNSLookupReply *m);
+
+ Pipe *connect_rank(const entity_inst_t& inst);
+
+ void mark_down(msg_addr_t addr, entity_inst_t& i);
+ void mark_up(msg_addr_t addr, entity_inst_t& i);
+
+ tcpaddr_t get_listen_addr() { return accepter.listen_addr; }
+
+ void reaper();
+
+
+public:
+ Rank(int r=-1);
+ ~Rank();
+
+ int find_ns_addr(tcpaddr_t &tcpaddr);
+
+ void set_namer(const tcpaddr_t& ns);
+ void start_namer();
+
+ int start_rank();
+ void wait();
+
+ EntityMessenger *register_entity(msg_addr_t addr);
+ void unregister_entity(EntityMessenger *ms);
+
+ void submit_message(Message *m, const entity_inst_t& inst);
+ void prepare_dest(const entity_inst_t& inst);
+ void submit_message(Message *m);
+ void submit_messages(list<Message*>& ls);
+
+ // create a new messenger
+ EntityMessenger *new_entity(msg_addr_t addr);
+
+} ;
+
+
+
+extern Rank rank;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef _RWLock_Posix_
+#define _RWLock_Posix_
+
+#include <pthread.h>
+
+/*
+ * RWLock
+ * Thin wrapper around a POSIX read-write lock.
+ *
+ *   get_read()  / put_read()   acquire / release a shared (reader) hold
+ *   get_write() / put_write()  acquire / release the exclusive (writer) hold
+ *   unlock()                   release whichever hold the caller has
+ *
+ * The lock must not be held by anyone when the object is destroyed.
+ * Return codes of the pthread calls are not checked.
+ */
+class RWLock
+{
+  mutable pthread_rwlock_t L;
+
+ public:
+
+  RWLock() {
+    pthread_rwlock_init(&L, NULL);
+  }
+
+  virtual ~RWLock() {
+    // Do not unlock here: calling pthread_rwlock_unlock() on a lock the
+    // caller does not hold is undefined, and a lock that is still held
+    // must not be destroyed in the first place.
+    pthread_rwlock_destroy(&L);
+  }
+
+  // release a hold previously taken with get_read() or get_write()
+  void unlock() {
+    pthread_rwlock_unlock(&L);
+  }
+
+  // block until a shared (reader) hold is obtained
+  void get_read() {
+    pthread_rwlock_rdlock(&L);
+  }
+  void put_read() { unlock(); }
+
+  // block until the exclusive (writer) hold is obtained
+  void get_write() {
+    pthread_rwlock_wrlock(&L);
+  }
+  void put_write() { unlock(); }
+};
+
+#endif // !_RWLock_Posix_
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __SERIAL_MESSENGER_H
+#define __SERIAL_MESSENGER_H
+
+#include "Dispatcher.h"
+#include "Message.h"
+
+/*
+ * SerialMessenger
+ * Abstract interface combining message receipt (dispatch) with a
+ * non-blocking send and a blocking send-and-wait-for-reply.  All three
+ * operations are pure virtual.
+ */
+class SerialMessenger : public Dispatcher {
+ public:
+ virtual void dispatch(Message *m) = 0; // i receive my messages here
+ virtual void send(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // doesn't block
+ virtual Message *sendrecv(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SimpleMessenger.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+//#include "messages/MFailure.h"
+
+#include <netdb.h>
+
+
+// Debug output for this file: dout(l) writes to cout and derr(l) to cerr,
+// but only when level l is <= g_conf.debug_ms.  Each line is prefixed
+// with the current time and this rank's address.
+// Both expand to a bare `if (...) stream`, so a following `else` would
+// bind to the macro's `if`; use braces around them in if/else bodies.
+#undef dout
+#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_inst.addr << " "
+#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_inst.addr << " "
+
+
+
+#include "tcp.cc"
+
+
+Rank rank;
+
+
+
+/********************************************
+ * Accepter
+ */
+
+/*
+ * start
+ * Creates the listening socket, binds it to rank.listen_addr, starts
+ * listening, rewrites rank.listen_addr to this host's own address plus
+ * the port actually bound, and starts the accepter thread.
+ *
+ * Returns 0 on success, or -1 if this host's name cannot be resolved to
+ * an address that fits in sockaddr_in.  Socket, bind and listen failures
+ * still assert, as before.
+ */
+int Rank::Accepter::start()
+{
+  // bind to a socket
+  dout(10) << "accepter.start binding to listen " << endl;
+
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd >= 0);   // 0 is a valid descriptor; only negative means failure
+
+  /* bind to port */
+  int rc = bind(listen_sd, (struct sockaddr *) &rank.listen_addr, sizeof(rank.listen_addr));
+  if (rc < 0) {
+    derr(0) << "accepter.start unable to bind to " << rank.listen_addr << endl;
+  }
+  assert(rc >= 0);
+
+  // find out which port we actually got
+  socklen_t llen = sizeof(rank.listen_addr);
+  getsockname(listen_sd, (sockaddr*)&rank.listen_addr, &llen);
+
+  int myport = rank.listen_addr.sin_port;
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  // my address is...
+  char host[100];
+  bzero(host, sizeof(host));
+  gethostname(host, sizeof(host) - 1);   // keep the trailing NUL even if truncated
+
+  struct sockaddr_in my_addr;
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  // gethostbyname() returns NULL when the name does not resolve; also make
+  // sure the address it hands back fits where we are about to copy it.
+  struct hostent *myhostname = gethostbyname( host );
+  if (!myhostname ||
+      myhostname->h_length > (int)sizeof(my_addr.sin_addr.s_addr)) {
+    derr(0) << "accepter.start unable to resolve my hostname " << host << endl;
+    ::close(listen_sd);
+    return -1;
+  }
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr,
+         myhostname->h_addr_list[0],
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  rank.listen_addr = my_addr;
+
+  dout(10) << "accepter.start listen addr is " << rank.listen_addr << endl;
+
+  // start thread
+  create();
+
+  return 0;
+}
+
+/*
+ * entry
+ * Accepter thread body.  Accepts connections on listen_sd until done is
+ * set or accept() fails; each accepted socket is wrapped in a server-side
+ * Pipe and recorded in rank.pipes under rank.lock.  Always returns 0.
+ */
+void *Rank::Accepter::entry()
+{
+  dout(10) << "accepter starting" << endl;
+
+  while (!done) {
+    // accept
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+
+    // accept() reports failure with -1; any non-negative value, 0
+    // included, is a live descriptor.
+    if (sd < 0) {
+      dout(10) << "no incoming connection?" << endl;
+      break;
+    }
+
+    dout(10) << "accepted incoming on sd " << sd << endl;
+
+    rank.lock.Lock();
+    Pipe *p = new Pipe(sd);
+    rank.pipes.insert(p);
+    rank.lock.Unlock();
+  }
+
+  return 0;
+}
+
+
+
+/**************************************
+ * Pipe
+ */
+
+/*
+ * accept
+ * Server-side handshake on a socket handed over by the Accepter: send my
+ * instance, read the peer's, start the writer thread, and register this
+ * pipe under the peer's rank.  If a pipe for that rank already exists,
+ * this one replaces it only when the peer's rank is lower than mine or I
+ * am still unranked.
+ *
+ * Returns 0 on success.  On a handshake failure the socket is closed, sd
+ * is set to -1, done is set, and -1 is returned.
+ */
+int Rank::Pipe::accept()
+{
+  // my creater gave me sd via accept()
+
+  // announce myself.
+  int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+  if (rc < 0) {
+    ::close(sd);
+    sd = -1;      // reader()'s reap path closes sd again; make that a no-op
+    done = true;
+    return -1;
+  }
+
+  // identify peer
+  // read_message() treats a zero/false tcp_read() result as failure, so
+  // test for <= 0 here rather than only < 0.
+  rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst));
+  if (rc <= 0) {
+    dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl;
+    ::close(sd);
+    sd = -1;      // see above
+    done = true;
+    return -1;
+  }
+
+  // create writer thread.
+  writer_running = true;
+  writer_thread.create();
+
+  // register pipe.
+  if (peer_inst.rank >= 0) {
+    rank.lock.Lock();
+    {
+      if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+        // install a pipe!
+        dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl;
+        rank.rank_pipe[peer_inst.rank] = this;
+      } else {
+        // low ranks' Pipes "win"
+        if (peer_inst.rank < rank.my_inst.rank ||
+            rank.my_inst.rank < 0) {
+          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
+                   << ", already had pipe, but switching to this new one" << endl;
+          // switch to this new Pipe
+          // NOTE(review): close() takes rank.lock itself while we hold it
+          // here; this presumably relies on Mutex being recursive -- confirm.
+          rank.rank_pipe[peer_inst.rank]->close();  // close old one
+          rank.rank_pipe[peer_inst.rank] = this;
+        } else {
+          dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst
+                   << ", already had pipe, sticking with it" << endl;
+        }
+      }
+    }
+    rank.lock.Unlock();
+  } else {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl;
+  }
+
+  return 0; // success.
+}
+
+/*
+ * connect
+ * Client-side handshake: create a socket, connect to peer_inst.addr, read
+ * the peer's instance, send mine, register this pipe under the peer's
+ * rank if none is registered yet, and start the reader thread.
+ *
+ * Returns 0 on success and a negative value on any connect or handshake
+ * failure.  The socket is not closed here on failure; writer() closes sd
+ * on its reap path.
+ */
+int Rank::Pipe::connect()
+{
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl;
+
+  // create socket?
+  sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd >= 0);   // 0 is a valid descriptor; only negative means failure
+
+  // bind any port
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));   // don't hand uninitialized padding to bind()
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );
+
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!  (length is that of the address actually being passed)
+  rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(peer_inst.addr));
+  if (rc < 0) return rc;
+
+  // identify peer
+  // read_message() treats a zero/false tcp_read() result as failure; bail
+  // out on that here too instead of inspecting an unfilled inst.
+  entity_inst_t inst;
+  rc = tcp_read(sd, (char*)&inst, sizeof(inst));
+  if (rc <= 0) {
+    derr(1) << "pipe(" << peer_inst << ' ' << this << ").connect couldn't read peer inst" << endl;
+    return -1;
+  }
+  if (inst.rank < 0)
+    inst = peer_inst;   // i know better than they do.
+  if (peer_inst != inst && inst.rank > 0) {
+    derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl;
+    assert(0);
+    return -1;
+  }
+
+  // identify myself
+  rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst));
+  if (rc < 0)
+    return -1;
+
+  // register pipe
+  rank.lock.Lock();
+  {
+    if (rank.rank_pipe.count(peer_inst.rank) == 0) {
+      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl;
+      rank.rank_pipe[peer_inst.rank] = this;
+    } else {
+      // this is normal.
+      dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl;
+    }
+  }
+  rank.lock.Unlock();
+
+  // start reader
+  reader_running = true;
+  reader_thread.create();
+
+  return 0;
+}
+
+
+/*
+ * close
+ * Starts an orderly shutdown of this pipe.  Does nothing if a close has
+ * already been queued.  Otherwise the pipe is removed from
+ * rank.rank_pipe (if it is the registered pipe for its peer's rank) and,
+ * unless a socket error has been recorded, a MSG_CLOSE message is queued
+ * for the writer and sent_close is set.
+ */
+void Rank::Pipe::close()
+{
+  if (sent_close) {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl;
+    return;
+  }
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl;
+
+  // unreg ourselves
+  rank.lock.Lock();
+  if (rank.rank_pipe.count(peer_inst.rank)) {
+    if (rank.rank_pipe[peer_inst.rank] == this) {
+      dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl;
+      rank.rank_pipe.erase(peer_inst.rank);
+    }
+  }
+  rank.lock.Unlock();
+
+  // nothing to say to a peer we can no longer reach
+  if (socket_error) {
+    dout(10) << "pipe(" << peer_inst << ' ' << this << ").close not queueing MSG_CLOSE, socket error" << endl;
+    return;
+  }
+
+  // queue close message.
+  dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl;
+  lock.Lock();
+  q.push_back(new MGenericMessage(MSG_CLOSE));
+  cond.Signal();
+  sent_close = true;
+  lock.Unlock();
+}
+
+
+/* read msgs from socket.
+ * also, server.
+ *
+ * Reader thread body.  On a server-side pipe it first runs the accept()
+ * handshake.  It then reads messages until done is set, handing each one
+ * either to the single dispatch queue or to the local EntityMessenger it
+ * is addressed to.  A null message or a message of type 0 ends the loop.
+ * On exit, if the writer is no longer running, the socket is closed and
+ * the pipe is queued for reaping.
+ */
+void Rank::Pipe::reader()
+{
+ // accept()'s return value is not checked; on failure it sets done, so
+ // the loop below is skipped.
+ if (server)
+ accept();
+
+ // loop.
+ while (!done) {
+ Message *m = read_message();
+ // type 0 is handled as a close message (the log line calls it
+ // MSG_CLOSE; presumably MSG_CLOSE == 0 -- confirm)
+ if (!m || m->get_type() == 0) {
+ if (m) {
+ delete m;
+ dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl;
+ } else {
+ derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl;
+ }
+
+ if (!sent_close)
+ close();
+
+ // NOTE(review): done is set and cond signalled without holding lock
+ done = true;
+ cond.Signal(); // wake up writer too.
+ break;
+ }
+
+ dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl;
+
+ EntityMessenger *entity = 0;
+
+ rank.lock.Lock();
+ {
+ // sanity check: abort if we already know a newer instance of the
+ // source than the one stamped on this message
+ if (rank.entity_map.count(m->get_source()) &&
+ rank.entity_map[m->get_source()] > m->get_source_inst()) {
+ derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source()
+ << " inst " << m->get_source_inst()
+ << " > " << rank.entity_map[m->get_source()]
+ << ", WATCH OUT " << *m << endl;
+ assert(0);
+ }
+
+ if (g_conf.ms_single_dispatch) {
+ // submit to single dispatch queue
+ rank._submit_single_dispatch(m);
+ } else {
+ if (rank.local.count(m->get_dest())) {
+ // find entity
+ entity = rank.local[m->get_dest()];
+ } else {
+ // no exact match: fall back to an unnamed local entity of the
+ // same type
+ entity = rank.find_unnamed(m->get_dest());
+ if (!entity) {
+ derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl;
+ assert(0); // FIXME do this differently
+ }
+ }
+ }
+ }
+ rank.lock.Unlock();
+
+ // queue outside rank.lock; entity stays null in single-dispatch mode
+ if (entity)
+ entity->queue_message(m); // queue
+ }
+
+
+ // reap?
+ // whichever of reader/writer finishes last closes the socket and
+ // queues the pipe for the reaper
+ bool reap = false;
+ lock.Lock();
+ {
+ reader_running = false;
+ if (!writer_running) reap = true;
+ }
+ lock.Unlock();
+
+ if (reap) {
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl;
+ ::close(sd);
+ rank.lock.Lock();
+ {
+ rank.pipe_reap_queue.push_back(this);
+ rank.wait_cond.Signal();
+ }
+ rank.lock.Unlock();
+ }
+}
+
+
+/* write msgs to socket.
+ * also, client.
+ *
+ * Writer thread body.  On a client-side pipe it first runs the connect()
+ * handshake.  It then drains the outgoing queue until done is set and the
+ * queue is empty, sleeping on cond in between.  A failed write or a sent
+ * MSG_CLOSE sets done.  On exit, if the reader is no longer running, the
+ * socket is closed and the pipe is queued for reaping.
+ */
+void Rank::Pipe::writer()
+{
+ if (!server) {
+ int rc = connect();
+ if (rc < 0) {
+ derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl;
+ done = true;
+ // out is empty here; fail() also processes whatever is already in q
+ list<Message*> out;
+ fail(out);
+ }
+ }
+
+ // loop.
+ lock.Lock();
+ while (!q.empty() || !done) {
+
+ if (!q.empty()) {
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl;
+
+ // grab outgoing list
+ list<Message*> out;
+ out.swap(q);
+
+ // drop lock while i send these
+ lock.Unlock();
+
+ while (!out.empty()) {
+ Message *m = out.front();
+ out.pop_front();
+
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl;
+
+ // stamp.
+ m->set_source_inst(rank.my_inst);
+
+ // marshall
+ if (m->empty_payload())
+ m->encode_payload();
+
+ if (write_message(m) < 0) {
+ // failed!
+ derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl;
+ // put the failed message back in front so fail() sees it together
+ // with everything still unsent
+ out.push_front(m);
+ fail(out);
+ done = true;
+ break;
+ }
+
+ // did i just send a close?
+ if (m->get_type() == MSG_CLOSE)
+ done = true;
+
+ // clean up
+ delete m;
+ }
+
+ lock.Lock();
+ continue;
+ }
+
+ // wait
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl;
+ cond.Wait(lock);
+ }
+ lock.Unlock();
+
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl;
+
+ // reap?
+ // whichever of reader/writer finishes last closes the socket and
+ // queues the pipe for the reaper
+ bool reap = false;
+ lock.Lock();
+ {
+ writer_running = false;
+ if (!reader_running) reap = true;
+ }
+ lock.Unlock();
+
+ if (reap) {
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl;
+ ::close(sd);
+ rank.lock.Lock();
+ {
+ rank.pipe_reap_queue.push_back(this);
+ rank.wait_cond.Signal();
+ }
+ rank.lock.Unlock();
+ }
+}
+
+
+/*
+ * read_message
+ * Reads one message from the socket: a fixed-size envelope followed by
+ * env.nchunks payload fragments, each sent as an int length and then
+ * that many bytes.  Zero-length fragments are skipped.
+ *
+ * Returns the decoded message (whatever decode_message() returns).  If a
+ * read fails, or a fragment length is negative, socket_error is set and
+ * 0 is returned.
+ */
+Message *Rank::Pipe::read_message()
+{
+  // envelope
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) )) {
+    socket_error = true;
+    return 0;
+  }
+
+  dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type
+           << " src " << env.source << " dst " << env.dest
+           << " nchunks=" << env.nchunks
+           << endl;
+
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    int size;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) )) {
+      socket_error = true;
+      return 0;
+    }
+
+    // the length comes straight off the wire; never allocate or read
+    // with a negative one
+    if (size < 0) {
+      derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got bad frag len " << size << endl;
+      socket_error = true;
+      return 0;
+    }
+
+    if (size == 0) continue;
+
+    bufferptr bp(size);
+
+    if (!tcp_read( sd, bp.c_str(), size )) {
+      socket_error = true;
+      return 0;
+    }
+
+    blist.push_back(bp);
+
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got frag " << i << " of " << env.nchunks
+             << " len " << bp.length() << endl;
+  }
+
+  // unmarshall message
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+
+  // reader() copes with a null message, so only touch m when we have one
+  if (m) {
+    dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got " << s << " byte message from "
+             << m->get_source() << endl;
+  }
+
+  return m;
+}
+
+
+
+/*
+ * write_message
+ * Writes one message to the socket: the envelope first, then the payload.
+ * With TCP_KEEP_CHUNKS defined each buffer goes out as its own
+ * (length, data) fragment; otherwise a single total length is sent,
+ * followed by the data of every non-empty buffer.
+ *
+ * Returns 0 on success.  On any write failure socket_error is set and -1
+ * is returned.
+ */
+int Rank::Pipe::write_message(Message *m)
+{
+ // get envelope, buffers
+ msg_envelope_t *env = &m->get_envelope();
+ bufferlist blist;
+ // NOTE(review): claim() presumably moves the buffers out of m's payload
+ // rather than copying them -- confirm against bufferlist
+ blist.claim( m->get_payload() );
+
+#ifdef TCP_KEEP_CHUNKS
+ env->nchunks = blist.buffers().size();
+#else
+ env->nchunks = 1;
+#endif
+
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m
+ << " to " << m->get_dest()
+ << endl;
+
+ // send envelope
+ int r = tcp_write( sd, (char*)env, sizeof(*env) );
+ if (r < 0) {
+ derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m
+ << " to " << m->get_dest() << endl;
+ socket_error = true;
+ return -1;
+ }
+
+ // payload
+#ifdef TCP_KEEP_CHUNKS
+ // send chunk-wise
+ int i = 0;
+ for (list<bufferptr>::const_iterator it = blist.buffers().begin();
+ it != blist.buffers().end();
+ it++) {
+ dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl;
+ int size = (*it).length();
+ r = tcp_write( sd, (char*)&size, sizeof(size) );
+ if (r < 0) {
+ derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl;
+ socket_error = true;
+ return -1;
+ }
+ r = tcp_write( sd, (*it).c_str(), size );
+ if (r < 0) {
+ derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl;
+ socket_error = true;
+ return -1;
+ }
+ i++;
+ }
+#else
+ // one big chunk
+ // a single length covers all buffers; the receiver sees one fragment
+ int size = blist.length();
+ r = tcp_write( sd, (char*)&size, sizeof(size) );
+ if (r < 0) {
+ derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl;
+ socket_error = true;
+ return -1;
+ }
+ dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl;
+
+ for (list<bufferptr>::const_iterator it = blist.buffers().begin();
+ it != blist.buffers().end();
+ it++) {
+ if ((*it).length() == 0) continue; // blank buffer.
+ r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() );
+ if (r < 0) {
+ derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl;
+ socket_error = true;
+ return -1;
+ }
+ }
+#endif
+
+ return 0;
+}
+
+
+/*
+ * fail
+ * Gives up on this pipe.  The pipe is removed from rank.rank_pipe (if it
+ * is the registered pipe for its peer's rank); then every undelivered
+ * message -- those in out, followed by those still in q -- is either
+ * deleted (MSG_CLOSE, or a sender whose messenger has stopped) or
+ * reported to the sending entity's dispatcher through
+ * ms_handle_failure().  out is left empty.
+ */
+void Rank::Pipe::fail(list<Message*>& out)
+{
+  derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl;
+
+  // FIXME: possible race before i reclaim lock here?
+
+  // deactivate myself
+  rank.lock.Lock();
+  if (rank.rank_pipe.count(peer_inst.rank)) {
+    if (rank.rank_pipe[peer_inst.rank] == this)
+      rank.rank_pipe.erase(peer_inst.rank);
+  }
+  rank.lock.Unlock();
+
+  // what do i do about reader()? FIXME
+
+  // sort my messages by (source) dispatcher, dest.
+  typedef map<msg_addr_t, list<Message*> > by_dest_t;
+  typedef map<Dispatcher*, by_dest_t> by_dis_t;
+  by_dis_t by_dis;
+
+  lock.Lock();
+
+  // include out at front of queue
+  q.splice(q.begin(), out);
+
+  while (!q.empty()) {
+    Message *m = q.front();
+    q.pop_front();
+
+    if (m->get_type() == MSG_CLOSE) {
+      delete m;
+      continue;
+    }
+
+    if (rank.local.count(m->get_source()) == 0) {
+      // oh well.  sending entity musta just shut down?
+      assert(0);
+      delete m;
+      continue;
+    }
+
+    EntityMessenger *mgr = rank.local[m->get_source()];
+    Dispatcher *dis = mgr->get_dispatcher();
+    if (mgr->is_stopped()) {
+      // ignore.
+      dout(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << *m << ", dispatcher stopping, ignoring." << endl;
+      delete m;
+    } else {
+      by_dis[dis][m->get_dest()].push_back(m);
+    }
+  }
+
+  lock.Unlock();
+
+  // report failure(s) to dispatcher(s)
+  for (by_dis_t::iterator d = by_dis.begin(); d != by_dis.end(); ++d) {
+    by_dest_t& dests = d->second;
+    for (by_dest_t::iterator t = dests.begin(); t != dests.end(); ++t) {
+      list<Message*>& msgs = t->second;
+      for (list<Message*>::iterator k = msgs.begin(); k != msgs.end(); ++k) {
+        derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << t->first << " inst " << peer_inst << endl;
+        d->first->ms_handle_failure(*k, t->first, peer_inst);
+      }
+    }
+  }
+}
+
+
+
+
+
+
+/********************************************
+ * Rank
+ */
+
+/*
+ * Rank
+ * Hooks the single dispatcher thread object up to this Rank and defaults
+ * listen_addr to "any address, any port" (AF_INET, INADDR_ANY, port 0).
+ */
+Rank::Rank() :
+  single_dispatcher(this)
+{
+  // start from an all-zero address, then fill in the wildcard fields
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_port = 0;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_family = AF_INET;
+}
+// Destructor: performs no explicit teardown of its own.
+Rank::~Rank()
+{
+}
+
+/*
+ * set_listen_addr
+ * Overrides the IPv4 address and port this rank will listen on with
+ * those of a.  The other fields of listen_addr are left as they were.
+ */
+void Rank::set_listen_addr(tcpaddr_t& a)
+{
+  dout(10) << "set_listen_addr " << a << endl;
+
+  // s_addr is a 4-byte value; plain assignment copies the same bytes
+  listen_addr.sin_addr.s_addr = a.sin_addr.s_addr;
+  listen_addr.sin_port = a.sin_port;
+}
+
+
+/*
+ * _submit_single_dispatch
+ * Queues m for the single dispatch thread if its destination is a local
+ * entity that reports is_ready(); otherwise parks it in
+ * waiting_for_ready under the destination address.
+ * Caller must hold lock.
+ */
+void Rank::_submit_single_dispatch(Message *m)
+{
+  assert(lock.is_locked());
+
+  map<msg_addr_t, EntityMessenger*>::iterator p = local.find(m->get_dest());
+  if (p == local.end() || !p->second->is_ready()) {
+    // destination not registered or not ready yet: hold on to it
+    waiting_for_ready[m->get_dest()].push_back(m);
+    return;
+  }
+
+  rank.single_dispatch_queue.push_back(m);
+  rank.single_dispatch_cond.Signal();
+}
+
+
+/*
+ * single_dispatcher_entry
+ * Body of the single dispatch thread.  Repeatedly swaps out the whole
+ * single_dispatch_queue under lock, then dispatches each message to the
+ * local entity it is addressed to with the lock dropped.  Sleeps on
+ * single_dispatch_cond while the queue is empty, and exits once
+ * single_dispatch_stop is set and the queue has drained.
+ */
+void Rank::single_dispatcher_entry()
+{
+ lock.Lock();
+ while (!single_dispatch_stop || !single_dispatch_queue.empty()) {
+ if (!single_dispatch_queue.empty()) {
+ // take the whole batch so the lock is not held while dispatching
+ list<Message*> ls;
+ ls.swap(single_dispatch_queue);
+
+ lock.Unlock();
+ {
+ while (!ls.empty()) {
+ Message *m = ls.front();
+ ls.pop_front();
+
+ dout(1) << m->get_dest()
+ << " <-- " << m->get_source() << " " << m->get_source_inst()
+ << " ---- " << *m
+ << " -- " << m
+ << endl;
+
+ // NOTE(review): local is read here without lock held, while
+ // register_entity()/unregister_entity() modify it under lock
+ assert(local.count(m->get_dest()));
+ local[m->get_dest()]->dispatch(m);
+ }
+ }
+ lock.Lock();
+ continue;
+ }
+ single_dispatch_cond.Wait(lock);
+ }
+ lock.Unlock();
+}
+
+
+/*
+ * reaper
+ * Drains pipe_reap_queue: each queued pipe is removed from pipes, its
+ * threads are joined, and the pipe is deleted.
+ * note: assumes lock is held
+ */
+void Rank::reaper()
+{
+  dout(10) << "reaper" << endl;
+  assert(lock.is_locked());
+
+  for (;;) {
+    if (pipe_reap_queue.empty())
+      break;
+
+    Pipe *victim = pipe_reap_queue.front();
+    dout(10) << "reaper reaping pipe " << victim->get_peer_inst() << endl;
+    pipe_reap_queue.pop_front();
+
+    // forget it, wait for its threads, then free it
+    assert(pipes.count(victim));
+    pipes.erase(victim);
+    victim->join();
+    dout(10) << "reaper reaped pipe " << victim->get_peer_inst() << endl;
+    delete victim;
+  }
+}
+
+
+/*
+ * start_rank
+ * Brings this rank up: starts the accepter (which binds the listening
+ * socket), starts the single dispatch thread when ms_single_dispatch is
+ * configured, and records listen_addr in my_inst.
+ * Returns 0 on success, -1 if the accepter could not be started.
+ */
+int Rank::start_rank()
+{
+  dout(10) << "start_rank" << endl;
+
+  // bind to a socket
+  const int r = accepter.start();
+  if (r < 0)
+    return -1;
+
+  // start single thread dispatcher?
+  if (g_conf.ms_single_dispatch) {
+    single_dispatch_stop = false;
+    single_dispatcher.create();
+  }
+
+  // my_inst
+  lock.Lock();
+  my_inst.set_addr( listen_addr );
+  dout(1) << "start_rank at " << my_inst << endl;
+  lock.Unlock();
+
+  return 0;
+}
+
+
+
+/* connect_rank
+ * Creates a client-side Pipe to inst, records it in pipes and registers
+ * it as the pipe for inst.rank (replacing any previous entry).  Returns
+ * the new pipe.  inst must not be this rank's own instance.
+ * NOTE: assumes rank.lock held.
+ */
+Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst)
+{
+  assert(rank.lock.is_locked());
+  assert(inst != rank.my_inst);
+
+  dout(10) << "connect_rank to " << inst << endl;
+
+  // create pipe
+  Pipe *fresh = new Pipe(inst);
+  pipes.insert(fresh);
+  rank.rank_pipe[inst.rank] = fresh;
+  return fresh;
+}
+
+
+
+
+
+/*
+ * show_dir
+ * Debug dump (level 10) of entity_map, one line per entry; entries that
+ * are also present in local are tagged " local ".
+ */
+void Rank::show_dir()
+{
+  dout(10) << "show_dir ---" << endl;
+
+  hash_map<msg_addr_t, entity_inst_t>::iterator e = entity_map.begin();
+  while (e != entity_map.end()) {
+    const char *tag = local.count(e->first) ? " local " : "";
+    dout(10) << "show_dir entity_map " << e->first << " -> " << e->second << tag << endl;
+    e++;
+  }
+}
+
+/*
+ * find_unnamed
+ * Returns the first local entity whose address has the same type() as a
+ * and reports is_new(), or 0 if there is none.
+ */
+Rank::EntityMessenger *Rank::find_unnamed(msg_addr_t a)
+{
+  // find an unnamed local entity of the right type
+  map<msg_addr_t, EntityMessenger*>::iterator cur = local.begin();
+  while (cur != local.end()) {
+    const bool same_type = cur->first.type() == a.type();
+    if (same_type && cur->first.is_new())
+      return cur->second;
+    ++cur;
+  }
+  return 0;
+}
+
+
+
+
+/* register_entity
+ * Creates an EntityMessenger for addr, records addr -> my_inst in
+ * entity_map and addr -> messenger in local, and returns the messenger.
+ */
+Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr)
+{
+  dout(10) << "register_entity " << addr << endl;
+
+  lock.Lock();
+
+  // create messenger
+  EntityMessenger *fresh = new EntityMessenger(addr);
+
+  // add to directory
+  entity_map[addr] = my_inst;
+  local[addr] = fresh;
+
+  lock.Unlock();
+
+  return fresh;
+}
+
+
+/*
+ * unregister_entity
+ * Removes msgr's address from both local and entity_map (it must be
+ * present in each) and signals wait_cond so that wait() can re-check
+ * whether any local entities remain.
+ */
+void Rank::unregister_entity(EntityMessenger *msgr)
+{
+  lock.Lock();
+  dout(10) << "unregister_entity " << msgr->get_myaddr() << endl;
+
+  const msg_addr_t addr = msgr->get_myaddr();
+
+  // remove from local directory.
+  assert(local.count(addr));
+  local.erase(addr);
+
+  assert(entity_map.count(addr));
+  entity_map.erase(addr);
+
+  wait_cond.Signal();
+  lock.Unlock();
+}
+
+
+/* submit_message
+ *
+ * Route `m` toward `dest_inst`.  Under the rank lock we only decide
+ * where the message goes; the actual queueing/sending happens after
+ * the lock has been released.
+ *
+ *  - same rank as us: the destination must be in the local map.  In
+ *    single-dispatch mode the message is handed to
+ *    _submit_single_dispatch(); otherwise it is queued on the local
+ *    entity's messenger.
+ *  - different rank: reuse the existing pipe for that rank, or create
+ *    one with connect_rank().
+ */
+void Rank::submit_message(Message *m, const entity_inst_t& dest_inst)
+{
+  const msg_addr_t dest = m->get_dest();
+
+  EntityMessenger *local_target = 0;
+  Pipe *remote_pipe = 0;
+
+  lock.Lock();
+  if (dest_inst.rank != my_inst.rank) {
+    // remote.
+    if (rank_pipe.count( dest_inst.rank ) == 0) {
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl;
+      remote_pipe = connect_rank( dest_inst );
+    } else {
+      dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl;
+      remote_pipe = rank_pipe[ dest_inst.rank ];
+    }
+  }
+  else if (local.count(dest) == 0) {
+    // addressed to this rank, but nobody here by that name.
+    derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but not in local map?" << endl;
+    assert(0);  // hmpf
+  }
+  else {
+    // local.
+    dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl;
+    if (g_conf.ms_single_dispatch) {
+      _submit_single_dispatch(m);
+    } else {
+      local_target = local[dest];
+    }
+  }
+  lock.Unlock();
+
+  // do it (outside the rank lock)
+  if (local_target) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl;
+    local_target->queue_message(m);
+    return;
+  }
+  if (remote_pipe) {
+    dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl;
+    remote_pipe->send(m);
+  }
+}
+
+
+
+
+
+/* wait
+ *
+ * Block the caller until every local entity has unregistered, then tear
+ * the rank down:
+ *   1. loop on wait_cond until the local map is empty, reaping dead
+ *      pipes on every pass;
+ *   2. in single-dispatch mode, flag the dispatcher to stop and join it;
+ *   3. close every pipe listed in rank_pipe and wait until the `pipes`
+ *      set has drained.
+ */
+void Rank::wait()
+{
+ lock.Lock();
+ while (1) {
+ // reap dead pipes
+ reaper();
+
+ if (local.empty()) {
+ dout(10) << "wait: everything stopped" << endl;
+ break; // everything stopped.
+ }
+
+ // unregister_entity() signals wait_cond after removing an entity.
+ wait_cond.Wait(lock);
+ }
+ lock.Unlock();
+
+ // done! clean up.
+
+ // stop dispatch thread
+ if (g_conf.ms_single_dispatch) {
+ dout(10) << "wait: stopping dispatch thread" << endl;
+ // set the stop flag and signal under the lock, but join without it.
+ lock.Lock();
+ single_dispatch_stop = true;
+ single_dispatch_cond.Signal();
+ lock.Unlock();
+ single_dispatcher.join();
+ }
+
+ // reap pipes
+ lock.Lock();
+ {
+ dout(10) << "wait: closing pipes" << endl;
+ // collect the pipe pointers first, then close them in a second pass.
+ // NOTE(review): presumably because close() may modify rank_pipe -- confirm.
+ list<Pipe*> toclose;
+ for (hash_map<__int64_t,Pipe*>::iterator i = rank_pipe.begin();
+ i != rank_pipe.end();
+ i++)
+ toclose.push_back(i->second);
+ for (list<Pipe*>::iterator i = toclose.begin();
+ i != toclose.end();
+ i++)
+ (*i)->close();
+
+ dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl;
+ // each wakeup runs the reaper; loop until the pipe set is empty.
+ while (!pipes.empty()) {
+ wait_cond.Wait(lock);
+ reaper();
+ }
+ }
+ lock.Unlock();
+
+ dout(10) << "wait: done." << endl;
+}
+
+
+
+
+
+
+/**********************************
+ * EntityMessenger
+ */
+
+/*
+ * Build a messenger for `myaddr`: not stopped, with a dispatch thread
+ * object bound to this messenger (the thread itself is only started by
+ * ready()), and with the rank's instance recorded as our own.
+ */
+Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr)
+  : Messenger(myaddr),
+    stop(false),
+    dispatch_thread(this)
+{
+  set_myinst(rank.my_inst);
+}
+
+/* nothing to release here */
+Rank::EntityMessenger::~EntityMessenger()
+{
+}
+
+/* dispatch_entry
+ *
+ * Body of the per-entity dispatch thread.  Until `stop` is set, it
+ * swaps the whole dispatch_queue into a private list, drops the lock,
+ * and hands the messages to dispatch() one at a time in queue order.
+ * When the queue is empty it sleeps on `cond`.
+ */
+void Rank::EntityMessenger::dispatch_entry()
+{
+ lock.Lock();
+ while (!stop) {
+ if (!dispatch_queue.empty()) {
+ // take the whole batch so queue_message() can keep appending
+ // while we deliver without holding the lock.
+ list<Message*> ls;
+ ls.swap(dispatch_queue);
+
+ lock.Unlock();
+ {
+ // deliver
+ while (!ls.empty()) {
+ Message *m = ls.front();
+ ls.pop_front();
+ dout(1) << m->get_dest()
+ << " <-- " << m->get_source() << " " << m->get_source_inst()
+ << " ---- " << *m
+ << " -- " << m
+ << endl;
+ dispatch(m);
+ }
+ }
+ lock.Lock();
+ // re-check stop and the queue before sleeping; more may have arrived.
+ continue;
+ }
+ cond.Wait(lock);
+ }
+ lock.Unlock();
+}
+
+/* ready
+ *
+ * Called once the entity can accept messages.  In per-entity mode this
+ * starts the dispatch thread.  In single-dispatch mode any messages
+ * parked for this address in rank.waiting_for_ready are moved onto the
+ * shared dispatch queue and the single dispatcher is signalled.
+ */
+void Rank::EntityMessenger::ready()
+{
+  dout(10) << "ready " << get_myaddr() << endl;
+
+  if (!g_conf.ms_single_dispatch) {
+    // start my dispatch thread
+    dispatch_thread.create();
+    return;
+  }
+
+  rank.lock.Lock();
+  if (rank.waiting_for_ready.count(get_myaddr())) {
+    list<Message*>& parked = rank.waiting_for_ready[get_myaddr()];
+    rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), parked);
+    rank.waiting_for_ready.erase(get_myaddr());
+    rank.single_dispatch_cond.Signal();
+  }
+  rank.lock.Unlock();
+}
+
+
+/* shutdown
+ *
+ * Unregister from the rank and stop the dispatch thread.  When called
+ * from the dispatch thread itself we only set the stop flag (a thread
+ * cannot join itself); otherwise the flag is set under the lock, the
+ * thread is signalled, and we join it.  Always returns 0.
+ */
+int Rank::EntityMessenger::shutdown()
+{
+  dout(10) << "shutdown " << get_myaddr() << endl;
+
+  // deregister
+  rank.unregister_entity(this);
+
+  const bool on_dispatch_thread = dispatch_thread.am_self();
+  if (on_dispatch_thread) {
+    dout(1) << "shutdown i am dispatch, setting stop flag" << endl;
+    stop = true;
+    return 0;
+  }
+
+  dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl;
+  lock.Lock();
+  stop = true;
+  cond.Signal();
+  lock.Unlock();
+  dispatch_thread.join();
+
+  return 0;
+}
+
+
+/* prepare_dest
+ * Make sure a pipe to inst.rank exists, creating one (under the rank
+ * lock) if there is none yet.
+ */
+void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst)
+{
+  rank.lock.Lock();
+  const bool connected = rank.rank_pipe.count(inst.rank) != 0;
+  if (!connected) {
+    rank.connect_rank(inst);
+  }
+  rank.lock.Unlock();
+}
+
+/* send_message
+ *
+ * Stamp the envelope of `m` (source address/port, destination
+ * address/port, source instance), log it, and hand it to the rank for
+ * routing to `inst`.  Always returns 0.
+ */
+int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+                                        int port, int fromport)
+{
+  // set envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_source_inst(rank.my_inst);
+
+  dout(1) << m->get_source()
+          << " --> " << m->get_dest() << " " << inst
+          << " -- " << *m
+          << " -- " << m
+          << endl;
+
+  rank.submit_message(m, inst);
+  return 0;
+}
+
+
+/* reset_myaddr
+ *
+ * Re-home this messenger from its current address to `newaddr`: both
+ * the rank's entity directory and its local map are updated, then the
+ * messenger's own address is changed.
+ *
+ * NOTE(review): rank.lock is not taken here, unlike the other code that
+ * touches rank.entity_map / rank.local -- confirm callers serialize.
+ */
+void Rank::EntityMessenger::reset_myaddr(msg_addr_t newaddr)
+{
+  const msg_addr_t previous = get_myaddr();
+  dout(10) << "set_myaddr " << previous << " to " << newaddr << endl;
+
+  // directory entry
+  rank.entity_map.erase(previous);
+  rank.entity_map[newaddr] = rank.my_inst;
+
+  // local delivery entry
+  rank.local.erase(previous);
+  rank.local[newaddr] = this;
+
+  _set_myaddr(newaddr);
+}
+
+
+
+
+/* mark_down
+ * Forward to Rank::mark_down(); marking ourselves down is not allowed.
+ */
+void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+
+  rank.mark_down(a, i);
+}
+
+/* mark_down
+ *
+ * Note that entity `a`, as instance `inst`, is down.
+ *  - unknown entity: only logged.
+ *  - we already know a newer instance (entity_map[a] > inst): only
+ *    logged, nothing is changed.
+ *  - otherwise: forget the entity, and close and forget the pipe to
+ *    inst.rank if there is one.
+ */
+void Rank::mark_down(msg_addr_t a, entity_inst_t& inst)
+{
+  //if (my_rank == 0) return;   // ugh.. rank0 already handles this stuff in the namer
+  lock.Lock();
+  if (entity_map.count(a) == 0) {
+    // don't know it
+    dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl;
+  }
+  else if (entity_map[a] > inst) {
+    // stale report about an older instance: do nothing!
+    dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl;
+  }
+  else {
+    // know it
+    assert(entity_map[a] <= inst);
+    dout(10) << "mark_down " << a << " inst " << inst << endl;
+    derr(10) << "mark_down " << a << " inst " << inst << endl;
+
+    entity_map.erase(a);
+
+    if (rank_pipe.count(inst.rank)) {
+      rank_pipe[inst.rank]->close();
+      rank_pipe.erase(inst.rank);
+    }
+  }
+  lock.Unlock();
+}
+
+/* mark_up
+ * Forward to Rank::mark_up(); marking ourselves up is not allowed.
+ */
+void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  assert(a != get_myaddr());
+
+  rank.mark_up(a, i);
+}
+
+/* mark_up
+ *
+ * Note that entity `a` is up as instance `i`.
+ *  - unknown entity, or `i` is newer than what we have: record it and
+ *    make sure a pipe to i.rank exists.
+ *  - same instance we already know: only logged.
+ *  - older than what we know: only logged.
+ *
+ * A pipe is created only when none exists for i.rank yet.  Several
+ * entities can live on one rank, and calling connect_rank() again
+ * would overwrite the rank_pipe entry while the previous pipe stayed in
+ * `pipes` without ever being closed, which would keep Rank::wait()
+ * from finishing.
+ */
+void Rank::mark_up(msg_addr_t a, entity_inst_t& i)
+{
+  lock.Lock();
+  {
+    dout(10) << "mark_up " << a << " inst " << i << endl;
+    derr(10) << "mark_up " << a << " inst " << i << endl;
+
+    if (entity_map.count(a) == 0 ||
+        entity_map[a] < i) {
+      entity_map[a] = i;
+      // same guard as prepare_dest() / submit_message()
+      if (rank_pipe.count(i.rank) == 0) {
+        connect_rank(i);
+      }
+    } else if (entity_map[a] == i) {
+      dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+      derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl;
+    } else {
+      dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+      derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl;
+    }
+
+    //if (waiting_for_lookup.count(a))
+    //lookup(a);
+  }
+  lock.Unlock();
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __SIMPLEMESSENGER_H
+#define __SIMPLEMESSENGER_H
+
+
+#include <list>
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+
+#include "include/types.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Messenger.h"
+#include "Message.h"
+#include "tcp.h"
+
+
+
+
+/* Rank - per-process
+ *
+ * One Rank exists per process (see the `rank` global).  It owns the
+ * accepter thread for incoming connections, one Pipe per remote rank,
+ * the directory of known entities, and the EntityMessengers of the
+ * entities living in this process.
+ */
+class Rank {
+
+  class EntityMessenger;
+  class Pipe;
+
+  // incoming: thread that owns the listening socket
+  class Accepter : public Thread {
+  public:
+    bool done;
+
+    int listen_sd;
+
+    Accepter() : done(false) {}
+
+    void *entry();
+    // flag the thread as done, close the listening socket, and join.
+    void stop() {
+      done = true;
+      ::close(listen_sd);
+      join();
+    }
+    int start();
+  } accepter;
+
+
+  // pipe: one connection to a peer, with a reader and a writer thread
+  class Pipe {
+  protected:
+    int sd;
+    bool done;
+    entity_inst_t peer_inst;
+    bool server;
+    bool sent_close;
+    bool socket_error;
+
+    bool reader_running;
+    bool writer_running;
+
+    list<Message*> q;      // outgoing messages, guarded by lock
+    Mutex lock;
+    Cond cond;
+
+    int accept();   // server handshake
+    int connect();  // client handshake
+    void reader();
+    void writer();
+
+    Message *read_message();
+    int write_message(Message *m);
+    void fail(list<Message*>& ls);
+
+    // threads
+    class Reader : public Thread {
+      Pipe *pipe;
+    public:
+      Reader(Pipe *p) : pipe(p) {}
+      void *entry() { pipe->reader(); return 0; }
+    } reader_thread;
+    friend class Reader;
+
+    class Writer : public Thread {
+      Pipe *pipe;
+    public:
+      Writer(Pipe *p) : pipe(p) {}
+      void *entry() { pipe->writer(); return 0; }
+    } writer_thread;
+    friend class Writer;
+
+  public:
+    // server side: wrap an accepted socket and start the reader thread.
+    Pipe(int s) : sd(s),
+      done(false), server(true),
+      sent_close(false), socket_error(false),
+      reader_running(false), writer_running(false),
+      reader_thread(this), writer_thread(this) {
+      // server
+      reader_running = true;
+      reader_thread.create();
+    }
+    // client side: remember the peer and start the writer thread.
+    // socket_error is initialized here too; it used to be left
+    // indeterminate on this path.
+    Pipe(const entity_inst_t &pi) : sd(0),
+      done(false), peer_inst(pi), server(false),
+      sent_close(false), socket_error(false),
+      reader_running(false), writer_running(false),
+      reader_thread(this), writer_thread(this) {
+      // client
+      writer_running = true;
+      writer_thread.create();
+    }
+
+    // public constructors
+    static const Pipe& Server(int s);
+    static const Pipe& Client(const entity_inst_t& pi);
+
+    entity_inst_t& get_peer_inst() { return peer_inst; }
+
+    void close();
+    void join() {
+      writer_thread.join();
+      reader_thread.join();
+    }
+
+    // queue one message and signal cond.
+    void send(Message *m) {
+      lock.Lock();
+      q.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }
+    // queue a batch; `ls` is left empty (its nodes are spliced away).
+    void send(list<Message*>& ls) {
+      lock.Lock();
+      q.splice(q.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+  };
+
+
+
+  // messenger interface: one per local entity
+  class EntityMessenger : public Messenger {
+    Mutex lock;
+    Cond cond;
+    list<Message*> dispatch_queue;   // guarded by lock
+    bool stop;
+
+    class DispatchThread : public Thread {
+      EntityMessenger *m;
+    public:
+      DispatchThread(EntityMessenger *_m) : m(_m) {}
+      void *entry() {
+        m->dispatch_entry();
+        return 0;
+      }
+    } dispatch_thread;
+    void dispatch_entry();
+
+  public:
+    // append one incoming message and signal cond.
+    void queue_message(Message *m) {
+      lock.Lock();
+      dispatch_queue.push_back(m);
+      cond.Signal();
+      lock.Unlock();
+    }
+    // append a batch; `ls` is taken by value, so the caller's list is
+    // left untouched.
+    void queue_messages(list<Message*> ls) {
+      lock.Lock();
+      dispatch_queue.splice(dispatch_queue.end(), ls);
+      cond.Signal();
+      lock.Unlock();
+    }
+
+  public:
+    EntityMessenger(msg_addr_t myaddr);
+    ~EntityMessenger();
+
+    void ready();
+    bool is_stopped() { return stop; }
+
+    // block until the dispatch thread has exited.
+    void wait() {
+      dispatch_thread.join();
+    }
+
+    void reset_myaddr(msg_addr_t m);
+
+    void callback_kick() {}
+    int shutdown();
+    void prepare_dest(const entity_inst_t& inst);
+    int send_message(Message *m, msg_addr_t dest, entity_inst_t inst,
+                     int port=0, int fromport=0);
+
+    void mark_down(msg_addr_t a, entity_inst_t& i);
+    void mark_up(msg_addr_t a, entity_inst_t& i);
+  };
+
+
+  // optional single thread that dispatches for every local entity
+  class SingleDispatcher : public Thread {
+    Rank *rank;
+  public:
+    SingleDispatcher(Rank *r) : rank(r) {}
+    void *entry() {
+      rank->single_dispatcher_entry();
+      return 0;
+    }
+  } single_dispatcher;
+
+  Cond single_dispatch_cond;
+  bool single_dispatch_stop;
+  list<Message*> single_dispatch_queue;
+
+  // messages parked per address until that entity calls ready()
+  map<msg_addr_t, list<Message*> > waiting_for_ready;
+
+  void single_dispatcher_entry();
+  void _submit_single_dispatch(Message *m);
+
+
+  // Rank stuff
+ public:
+  Mutex lock;
+  Cond wait_cond;  // for wait()
+
+  // where i listen
+  tcpaddr_t listen_addr;
+
+  // my instance
+  entity_inst_t my_inst;
+
+  // lookup
+  hash_map<msg_addr_t, entity_inst_t> entity_map;
+  hash_set<msg_addr_t> entity_unstarted;
+
+  // local
+  map<msg_addr_t, EntityMessenger*> local;
+
+  // remote
+  hash_map<__int64_t, Pipe*> rank_pipe;
+
+  set<Pipe*> pipes;
+  list<Pipe*> pipe_reap_queue;
+
+  void show_dir();
+
+  Pipe *connect_rank(const entity_inst_t& inst);
+
+  void mark_down(msg_addr_t addr, entity_inst_t& i);
+  void mark_up(msg_addr_t addr, entity_inst_t& i);
+
+  tcpaddr_t get_listen_addr() { return listen_addr; }
+
+  void reaper();
+
+  EntityMessenger *find_unnamed(msg_addr_t a);
+
+public:
+  Rank();
+  ~Rank();
+
+  void set_listen_addr(tcpaddr_t& a);
+
+  int start_rank();
+  void wait();
+
+  EntityMessenger *register_entity(msg_addr_t addr);
+  void rename_entity(EntityMessenger *ms, msg_addr_t newaddr);
+  void unregister_entity(EntityMessenger *ms);
+
+  void submit_message(Message *m, const entity_inst_t& inst);
+  void prepare_dest(const entity_inst_t& inst);
+
+  // create a new messenger
+  EntityMessenger *new_entity(msg_addr_t addr);
+
+} ;
+
+
+
+extern Rank rank;
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "TCPDirectory.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+//#include "messages/MNSUnregister.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: "
+
+void tcp_open(int rank);
+
+
+/*
+ * handle_connect
+ * A new process connected: hand out the next rank number, record the
+ * rank entity and its address both in the directory and in our own
+ * messenger, and acknowledge with the assigned rank.  Consumes `m`.
+ */
+void TCPDirectory::handle_connect(MNSConnect *m)
+{
+  const int assigned = nrank++;
+  dout(2) << "connect from new rank " << assigned << " " << m->get_addr() << endl;
+
+  // entity -> rank
+  dir[MSG_ADDR_RANK(assigned)] = assigned;
+  messenger->map_entity_rank(MSG_ADDR_RANK(assigned), assigned);
+
+  // rank -> address
+  rank_addr[assigned] = m->get_addr();
+  messenger->map_rank_addr(assigned, m->get_addr());
+
+  messenger->send_message(new MNSConnectAck(assigned),
+                          MSG_ADDR_RANK(assigned));
+  delete m;
+}
+
+
+
+/*
+ * handle_register
+ * Register an entity living on the sender's rank.  A "new" address is
+ * replaced by a freshly allocated one of the same type; a specific
+ * address must not be registered yet.  The entity is put on hold, the
+ * update is logged under a new version, and the final address is sent
+ * back to the sender's rank.  Consumes `m`.
+ */
+void TCPDirectory::handle_register(MNSRegister *m)
+{
+  dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl;
+
+  const int from_rank = m->get_rank();
+  msg_addr_t entity = m->get_entity();
+
+  if (!entity.is_new()) {
+    // specific address!  make sure it doesn't exist yet.
+    assert(dir.count(entity) == 0);
+  } else {
+    // make up a new address!
+    switch (entity.type()) {
+    case MSG_ADDR_RANK_BASE:   // stupid client should be able to figure this out
+      entity = MSG_ADDR_RANK(from_rank);
+      break;
+    case MSG_ADDR_MDS_BASE:
+      entity = MSG_ADDR_MDS(nmds++);
+      break;
+    case MSG_ADDR_OSD_BASE:
+      entity = MSG_ADDR_OSD(nosd++);
+      break;
+    case MSG_ADDR_CLIENT_BASE:
+      entity = MSG_ADDR_CLIENT(nclient++);
+      break;
+    default:
+      assert(0);
+    }
+  }
+
+  dout(2) << "registered " << MSG_ADDR_NICE(entity) << endl;
+
+  // register
+  dir[entity] = from_rank;
+
+  // a rank entity is mapped right away so we can reply to it;
+  // everything else waits until its STARTED msg arrives.
+  if (entity == MSG_ADDR_RANK(from_rank)) {
+    messenger->map_entity_rank(entity, from_rank);
+  }
+
+  hold.insert(entity);
+
+  ++version;
+  update_log[version] = entity;
+
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity),
+                          MSG_ADDR_RANK(from_rank));
+  delete m;
+}
+
+/*
+ * handle_started
+ * The sending entity announces that it is up: take it off hold, map it
+ * to its rank in our messenger, and re-dispatch any messages that were
+ * parked waiting for it.
+ *
+ * Consumes `m`, as handle_connect() and handle_register() do (and as
+ * handle_lookup() does once it has answered); previously the message
+ * was never freed.
+ */
+void TCPDirectory::handle_started(Message *m)
+{
+  msg_addr_t entity = m->get_source();
+
+  dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl;
+  hold.erase(entity);
+  messenger->map_entity_rank(entity, dir[entity]);
+
+  // `entity` is a copy, so the message is no longer needed.
+  delete m;
+
+  // waiters?
+  if (waiting.count(entity)) {
+    // move the list out before dispatching.
+    // NOTE(review): presumably so a re-parked message lands on a fresh
+    // list rather than the one being iterated -- confirm.
+    list<Message*> ls;
+    ls.splice(ls.begin(), waiting[entity]);
+    waiting.erase(entity);
+
+    dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl;
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         it++) {
+      dispatch(*it);
+    }
+  }
+}
+
+/*
+ * handle_unregister
+ * Drop the sending entity from the directory (it must be present).
+ * When at most two entries remain, the nameserver shuts itself down.
+ *
+ * Consumes `m`, as handle_connect() and handle_register() do (and as
+ * handle_lookup() does once it has answered); previously the message
+ * was never freed.
+ */
+void TCPDirectory::handle_unregister(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl;
+
+  // `who` is a copy; free the message before any shutdown work.
+  delete m;
+
+  assert(dir.count(who));
+  dir.erase(who);
+
+  // shutdown?
+  if (dir.size() <= 2) {
+    dout(2) << "dir is empty except for me, shutting down" << endl;
+    tcpmessenger_stop_nameserver();
+  }
+  else {
+    if (0) {
+      // disabled debug dump of the remaining directory
+      dout(10) << "dir size now " << dir.size() << endl;
+      for (hash_map<msg_addr_t, int>::iterator it = dir.begin();
+           it != dir.end();
+           it++) {
+        dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
+      }
+    }
+  }
+
+}
+
+
+/*
+ * handle_lookup
+ * Answer "which rank hosts entity X?".  If X is unknown or still on
+ * hold the request is parked in `waiting` (and kept alive); otherwise a
+ * reply carrying the entity's rank and that rank's address is sent to
+ * the requester and the request is deleted.
+ */
+void TCPDirectory::handle_lookup(MNSLookup *m)
+{
+  // have it?
+  const bool known = dir.count(m->get_entity()) != 0;
+  if (!known || hold.count(m->get_entity())) {
+    dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> dne or on hold" << endl;
+    waiting[m->get_entity()].push_back(m);
+    return;
+  }
+
+  // look it up!
+  MNSLookupReply *reply = new MNSLookupReply(m);
+
+  const int where = dir[m->get_entity()];
+  reply->entity_map[m->get_entity()] = where;
+  reply->rank_addr[where] = rank_addr[where];
+
+  dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> rank " << where << endl;
+
+  messenger->send_message(reply,
+                          m->get_source(), m->get_source_port());
+  delete m;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __TCPDIRECTORY_H
+#define __TCPDIRECTORY_H
+
+/*
+ * rank -- a process (listening on some host:port)
+ * entity -- a logical entity (osd123, mds3, client3245, etc.)
+ *
+ * multiple entities can coexist on a single rank.
+ */
+
+#include "Dispatcher.h"
+#include "TCPMessenger.h"
+
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <sys/types.h>
+//#include <sys/stat.h>
+#include <fcntl.h>
+
+/*
+ * TCPDirectory -- the name server.
+ *
+ * Keeps the entity -> rank and rank -> address tables and answers the
+ * MSG_NS_* messages delivered to it through dispatch().  On
+ * construction it announces its own address on stdout and in the file
+ * ".ceph_ns"; the file is removed again on destruction.
+ */
+class TCPDirectory : public Dispatcher {
+ protected:
+  // how i communicate
+  TCPMessenger *messenger;
+
+  // directory
+  hash_map<msg_addr_t, int>   dir;        // entity -> rank
+  hash_map<int, tcpaddr_t>    rank_addr;  // rank -> ADDR (e.g. host:port)
+
+  __uint64_t version;                     // bumped on every registration
+  map<__uint64_t, msg_addr_t> update_log; // version -> entity registered
+
+  int nrank;                              // next rank number to hand out
+  int nclient, nmds, nosd;                // next id per entity type
+
+  set<msg_addr_t> hold;                        // registered, not yet started
+  map<msg_addr_t, list<Message*> > waiting;    // lookups parked per entity
+
+  // messages
+  void handle_connect(class MNSConnect*);
+  void handle_register(class MNSRegister *m);
+  void handle_started(Message *m);
+  void handle_lookup(class MNSLookup *m);
+  void handle_unregister(Message *m);
+
+ public:
+  TCPDirectory(TCPMessenger *m) :
+    messenger(m),
+    version(0),
+    nrank(0), nclient(0), nmds(0), nosd(0) {
+    messenger->set_dispatcher(this);
+
+    // take a copy so we have an object whose address can be written out.
+    tcpaddr_t myaddr = m->get_tcpaddr();
+
+    // i am rank 0!
+    dir[MSG_ADDR_DIRECTORY] = 0;
+    rank_addr[0] = myaddr;
+    ++nrank;
+
+    // announce nameserver
+    cout << "export CEPH_NAMESERVER=" << myaddr << endl;
+
+    // O_CREAT requires the mode argument; without it the permissions of
+    // a newly created file are undefined.
+    int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT, 0755);
+    if (fd >= 0) {
+      if (::write(fd, (void*)&myaddr, sizeof(tcpaddr_t)) != (ssize_t)sizeof(tcpaddr_t))
+        cerr << "nameserver: short write to .ceph_ns" << endl;
+      ::fchmod(fd, 0755);   // force the mode regardless of umask
+      ::close(fd);
+    } else {
+      // best effort: clients can still use CEPH_NAMESERVER.
+      cerr << "nameserver: unable to open .ceph_ns for writing" << endl;
+    }
+  }
+  ~TCPDirectory() {
+    ::unlink(".ceph_ns");
+  }
+
+  // route an incoming message to its handler by type; unknown types
+  // are a programming error.
+  void dispatch(Message *m) {
+    switch (m->get_type()) {
+    case MSG_NS_CONNECT:
+      handle_connect((class MNSConnect*)m);
+      break;
+    case MSG_NS_REGISTER:
+      handle_register((class MNSRegister*)m);
+      break;
+    case MSG_NS_STARTED:
+      handle_started(m);
+      break;
+    case MSG_NS_UNREGISTER:
+      handle_unregister(m);
+      break;
+    case MSG_NS_LOOKUP:
+      handle_lookup((class MNSLookup*)m);
+      break;
+
+    default:
+      assert(0);
+    }
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "config.h"
+#include "include/error.h"
+
+#include "common/Timer.h"
+#include "common/Mutex.h"
+
+#include "TCPMessenger.h"
+#include "Message.h"
+
+#include <iostream>
+#include <cassert>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <errno.h>
+# include <netdb.h>
+# include <sys/socket.h>
+# include <netinet/in.h>
+# include <arpa/inet.h>
+#include <sys/select.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#include <unistd.h>
+
+#include "messages/MGenericMessage.h"
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+
+#include "TCPDirectory.h"
+
+#include "common/Logger.h"
+
+#define DBL 18
+
+//#define TCP_SERIALMARSHALL // do NOT turn this off until you check messages/* encode_payload methods
+//#define TCP_SERIALOUT // be paranoid/annoying and send messages in same thread
+
+
+TCPMessenger *rankmessenger = 0; // created by tcpmessenger_start_rankserver()
+
+TCPDirectory *nameserver = 0; // only defined on rank 0
+TCPMessenger *nsmessenger = 0; // messenger of the nameserver; see tcpmessenger_start_nameserver()
+
+
+/***************************/
+// optional logging; the logger is only created when g_conf.tcp_log is set
+LogType rank_logtype;
+Logger *logger;
+
+// queue statistics counters
+int stat_num = 0;
+off_t stat_inq = 0, stat_inqb = 0;
+off_t stat_disq = 0, stat_disqb = 0;
+off_t stat_outq = 0, stat_outqb = 0;
+/***************************/
+
+
+// local directory
+hash_map<msg_addr_t, TCPMessenger*> directory; // local
+hash_set<msg_addr_t> directory_ready;
+Mutex directory_lock;
+
+// connecting
+struct sockaddr_in listen_addr; // my listen addr
+int listen_sd = 0;
+int my_rank = -1; // -1 until a rank has been assigned
+Cond waiting_for_rank;
+
+// register
+long regid = 0;
+map<int, Cond* > waiting_for_register_cond;
+map<int, msg_addr_t > waiting_for_register_result;
+
+// incoming messages
+list<Message*> incoming;
+Mutex incoming_lock;
+Cond incoming_cond;
+
+// outgoing messages
+/*
+list<Message*> outgoing;
+Mutex outgoing_lock;
+Cond outgoing_cond;
+*/
+
+/*
+ * OutThread -- a thread with its own outgoing message queue.
+ * send() appends to the queue and stop() requests termination; both
+ * signal `cond` while holding `lock`.  entry() is defined elsewhere.
+ */
+class OutThread : public Thread {
+public:
+  Mutex lock;
+  Cond cond;
+  list<Message*> q;
+  bool done;
+
+  OutThread() : done(false) { }
+  virtual ~OutThread() { }
+
+  void *entry();
+
+  // queue a message for this thread.
+  void send(Message *m) {
+    lock.Lock();
+    q.push_back(m);
+    cond.Signal();
+    lock.Unlock();
+  }
+
+  // set the done flag, signal, and wait for the thread to exit.
+  void stop() {
+    lock.Lock();
+    done = true;
+    cond.Signal();
+    lock.Unlock();
+    join();
+  }
+} single_out_thread;
+
+Mutex lookup_lock; // held by RankServer::dispatch() while it handles a message
+hash_map<msg_addr_t, int> entity_rank; // entity -> rank
+hash_map<int, int> rank_sd; // outgoing sockets, rank -> sd
+hash_map<int, OutThread*> rank_out;
+hash_map<int, tcpaddr_t> rank_addr; // rank -> tcpaddr
+map<msg_addr_t, list<Message*> > waiting_for_lookup; // messages held until the entity's rank is known
+
+
+/* this process */
+bool tcp_done = false; // set this flag to stop the event loop
+
+
+// threads
+pthread_t dispatch_thread_id = 0; // thread id of the event loop. init value == nobody
+pthread_t out_thread_id = 0; // thread id of the event loop. init value == nobody
+pthread_t listen_thread_id = 0;
+map<int, pthread_t> in_threads; // sd -> threadid
+
+//bool pending_timer = false;
+
+// per-rank fun
+
+
+// debug
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */ << "] "
+
+
+#include "tcp.cc"
+
+// some declarations
+void tcp_open(int rank);
+int tcp_send(Message *m);
+void tcpmessenger_kick_dispatch_loop();
+OutThread *tcp_lookup(Message *m);
+
+/* Return this process's rank (the current value of my_rank). */
+int tcpmessenger_get_rank()
+{
+  const int current = my_rank;
+  return current;
+}
+
+
+/*
+ * tcpmessenger_findns
+ *
+ * Locate the name server and store its address in `nsa`.
+ *   1. If the CEPH_NAMESERVER environment variable is set, its value
+ *      ("host:port") is resolved with tcpmessenger_lookup().
+ *   2. Otherwise the binary address is read from the file ".ceph_ns".
+ *
+ * Returns 0 on success, -1 if no address could be found or resolved.
+ *
+ * Changes from the previous version:
+ *  - getenv() already returns just the value, so unconditionally
+ *    scanning for '=' walked off the end of the string.  An '=' is now
+ *    skipped only if the value really contains one.
+ *  - fd 0 is a valid descriptor (test is now fd >= 0).
+ *  - the file only counts as found if a complete address was read.
+ *  - the descriptive text is kept in a const char* rather than
+ *    assigning a string literal to a writable char*.
+ */
+int tcpmessenger_findns(tcpaddr_t &nsa)
+{
+  char *nsaddr = 0;          // writable "host:port" text from the environment
+  const char *nsdesc = 0;    // what to show in the log line
+  bool have_nsa = false;     // nsa already filled in from the file
+
+  // env var?
+  nsaddr = getenv("CEPH_NAMESERVER");
+  if (nsaddr) {
+    // tolerate a legacy "NAME=value" form, but never run past the NUL.
+    for (char *p = nsaddr; *p; ++p) {
+      if (*p == '=') {
+        nsaddr = p + 1;
+        break;
+      }
+    }
+    nsdesc = nsaddr;
+  }
+  else {
+    // file?
+    int fd = ::open(".ceph_ns", O_RDONLY);
+    if (fd >= 0) {
+      const ssize_t got = ::read(fd, (void*)&nsa, sizeof(nsa));
+      ::close(fd);
+      if (got == (ssize_t)sizeof(nsa)) {
+        have_nsa = true;
+        nsdesc = "from .ceph_ns";
+      }
+    }
+  }
+
+  if (!nsaddr && !have_nsa) {
+    cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl;
+    return -1;
+    //exit(-1);
+  }
+
+  // look up nsaddr?  (tcpmessenger_lookup splits the string in place)
+  if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) {
+    return -1;
+  }
+
+  dout(2) << "ceph ns is " << nsdesc << " or " << nsa << endl;
+  return 0;
+}
+
+
+
+/** rankserver
+ *
+ * one per rank. handles entity->rank lookup replies.
+ */
+
+/*
+ * RankServer -- dispatcher for the replies the name server sends back
+ * to this rank (connect ack, register ack, lookup reply).  All handling
+ * runs with lookup_lock held.  A single instance, `rankserver`, is
+ * defined together with the class.
+ */
+class RankServer : public Dispatcher {
+public:
+ // entry point: take lookup_lock, route by message type, release.
+ // unknown message types are a programming error.
+ void dispatch(Message *m) {
+ lookup_lock.Lock();
+
+ dout(DBL) << "rankserver dispatching " << *m << endl;
+
+ switch (m->get_type()) {
+ case MSG_NS_CONNECTACK:
+ handle_connect_ack((MNSConnectAck*)m);
+ break;
+
+ case MSG_NS_REGISTERACK:
+ handle_register_ack((MNSRegisterAck*)m);
+ break;
+
+ case MSG_NS_LOOKUPREPLY:
+ handle_lookup_reply((MNSLookupReply*)m);
+ break;
+
+ default:
+ assert(0);
+ }
+
+ lookup_lock.Unlock();
+ }
+
+ // the name server told us our rank: record it, map our own rank
+ // entity and listen address, wake everyone waiting for the rank, and
+ // set up the per-rank logger if tcp logging is enabled. consumes m.
+ void handle_connect_ack(MNSConnectAck *m) {
+ dout(DBL) << "my rank is " << m->get_rank();
+ my_rank = m->get_rank();
+
+ // now that i know my rank,
+ entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank;
+ rank_addr[my_rank] = listen_addr;
+
+ waiting_for_rank.SignalAll();
+
+ delete m;
+
+ // logger!
+ dout(DBL) << "logger" << endl;
+ char names[100];
+ sprintf(names, "rank%d", my_rank);
+ string name = names;
+
+ if (g_conf.tcp_log) {
+ logger = new Logger(name, (LogType*)&rank_logtype);
+ rank_logtype.add_set("num");
+ rank_logtype.add_inc("in");
+ rank_logtype.add_inc("inb");
+ rank_logtype.add_inc("dis");
+ rank_logtype.add_set("inq");
+ rank_logtype.add_set("inqb");
+ rank_logtype.add_set("outq");
+ rank_logtype.add_set("outqb");
+ }
+
+ }
+
+ // a registration completed: store the assigned entity address under
+ // the request's tid and signal the condition registered for that tid.
+ // NOTE(review): if no Cond was registered for this tid, operator[]
+ // yields a null pointer that is then dereferenced -- confirm that
+ // callers always insert the Cond before sending the request.
+ void handle_register_ack(MNSRegisterAck *m) {
+ long tid = m->get_tid();
+ waiting_for_register_result[tid] = m->get_entity();
+ waiting_for_register_cond[tid]->Signal();
+ delete m;
+ }
+
+ // a lookup was answered: learn the entity->rank and rank->addr
+ // mappings, open sockets to ranks we have none for, and release the
+ // messages that were waiting for the lookup. consumes m.
+ // NOTE(review): this reads m->entity_rank, while
+ // TCPDirectory::handle_lookup fills reply->entity_map -- confirm
+ // which member MNSLookupReply actually declares.
+ void handle_lookup_reply(MNSLookupReply *m) {
+ list<Message*> waiting;
+ dout(DBL) << "got lookup reply" << endl;
+
+ for (map<msg_addr_t, int>::iterator it = m->entity_rank.begin();
+ it != m->entity_rank.end();
+ it++) {
+ dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
+ entity_rank[it->first] = it->second;
+
+ if (it->second == my_rank) {
+ // deliver locally
+ dout(-DBL) << "delivering lookup results locally" << endl;
+ // incoming_lock is taken while lookup_lock is already held.
+ incoming_lock.Lock();
+
+ for (list<Message*>::iterator i = waiting_for_lookup[it->first].begin();
+ i != waiting_for_lookup[it->first].end();
+ i++) {
+ stat_inq++;
+ stat_inqb += (*i)->get_payload().length();
+ (*i)->decode_payload();
+ incoming.push_back(*i);
+ }
+ incoming_cond.Signal();
+ incoming_lock.Unlock();
+ } else {
+ // take waiters
+ waiting.splice(waiting.begin(), waiting_for_lookup[it->first]);
+ }
+ waiting_for_lookup.erase(it->first);
+
+ }
+
+ for (map<int,tcpaddr_t>::iterator it = m->rank_addr.begin();
+ it != m->rank_addr.end();
+ it++) {
+ dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl;
+ rank_addr[it->first] = it->second;
+
+ // open it now
+ if (rank_sd.count(it->first) == 0)
+ tcp_open(it->first);
+ }
+
+ // send waiting messages
+ // (with TCP_SERIALOUT they are sent from this thread; otherwise they
+ // are handed to the OutThread that tcp_lookup() returns.)
+#ifdef TCP_SERIALOUT
+ for (list<Message*>::iterator it = waiting.begin();
+ it != waiting.end();
+ it++) {
+ OutThread *outt = tcp_lookup(*it);
+ assert(outt);
+ tcp_send(*it);
+ }
+#else
+ for (list<Message*>::iterator it = waiting.begin();
+ it != waiting.end();
+ it++) {
+ OutThread *outt = tcp_lookup(*it);
+ assert(outt);
+ outt->send(*it);
+// dout(0) << "lookup done, splicing in " << *it << endl;
+ }
+#endif
+
+ delete m;
+ }
+
+} rankserver;
+
+
+/* Context whose completion just kicks the dispatch loop. */
+class C_TCPKicker : public Context {
+  void finish(int r)
+  {
+    dout(DBL) << "timer kick" << endl;
+    tcpmessenger_kick_dispatch_loop();
+  }
+};
+
+/* Messenger hook: forwards to the process-wide dispatch loop kick. */
+void TCPMessenger::callback_kick()
+{
+  tcpmessenger_kick_dispatch_loop();
+  return;
+}
+
+
+/*
+ * tcpmessenger_lookup
+ *
+ * Parse "host:port" from `str` and resolve it into `ta`.
+ * `str` is modified in place: the first ':' is overwritten with a NUL
+ * so that the host and port parts become separate C strings.
+ * Returns 0 on success, -1 if there is no ':' or the host is unknown.
+ *
+ * NOTE(review): the port is stored in sin_port exactly as parsed, with
+ * no htons() -- confirm this matches how tcpaddr_t is printed and used
+ * elsewhere.
+ */
+extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta)
+{
+  char *host = str;
+  char *port = 0;
+
+  // split at the first ':'
+  for (char *cur = str; *cur; ++cur) {
+    if (*cur != ':')
+      continue;
+    port = cur + 1;
+    *cur = 0;
+    break;
+  }
+  if (!port) {
+    cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl;
+    return -1;
+  }
+  //cout << "host '" << host << "' port '" << port << "'" << endl;
+
+  const int iport = atoi(port);
+
+  struct hostent *resolved = gethostbyname( host );
+  if (!resolved) {
+    cerr << "host " << host << " not found" << endl;
+    return -1;
+  }
+
+  //cout << "addrtype " << resolved->h_addrtype << " len " << resolved->h_length << endl;
+
+  // fill in the result from the first resolved address
+  memset(&ta, 0, sizeof(ta));
+  ta.sin_family = resolved->h_addrtype;
+  memcpy((char *)&ta.sin_addr,
+         resolved->h_addr,
+         resolved->h_length);
+  ta.sin_port = iport;
+
+  cout << "lookup '" << host << ":" << port << "' -> " << ta << endl;
+
+  return 0;
+}
+
+
+
+/*****
+ * global methods for process-wide startup, shutdown.
+ */
+
+/*
+ * tcpmessenger_init
+ *
+ * Process-wide startup: create the listening socket, bind it to any
+ * address on a kernel-chosen port, start listening, and work out the
+ * address other ranks should use to reach us (this host's first
+ * resolved address plus the bound port).  The result is stored in the
+ * global listen_addr.
+ *
+ * Returns 0 on success.  Returns -1 (after closing the socket) if this
+ * host's own name cannot be resolved; previously that case dereferenced
+ * a null pointer.  socket/bind/listen failures still assert.
+ */
+int tcpmessenger_init()
+{
+  // LISTEN
+  dout(DBL) << "binding to listen " << endl;
+
+  /* socket creation */
+  listen_sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(listen_sd > 0);
+
+  /* bind to port */
+  memset((char*)&listen_addr, 0, sizeof(listen_addr));
+  listen_addr.sin_family = AF_INET;
+  listen_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+  listen_addr.sin_port = 0;     // let the kernel pick a port
+
+  int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr));
+  assert(rc >= 0);
+
+  // find out which port we got
+  socklen_t llen = sizeof(listen_addr);
+  getsockname(listen_sd, (sockaddr*)&listen_addr, &llen);
+
+  // copied as-is from the sockaddr, and stored back the same way below
+  int myport = listen_addr.sin_port;
+
+  // listen!
+  rc = ::listen(listen_sd, 1000);
+  assert(rc >= 0);
+
+  dout(DBL) << "listening on " << myport << endl;
+
+  // my address is...
+  char host[100];
+  gethostname(host, 100);
+  host[sizeof(host) - 1] = 0;   // gethostname may not terminate on truncation
+  dout(DBL) << "my hostname is " << host << endl;
+
+  struct hostent *myhostname = gethostbyname( host );
+  if (!myhostname) {
+    cerr << "tcpmessenger_init: unable to resolve my hostname " << host << endl;
+    ::close(listen_sd);
+    listen_sd = 0;
+    return -1;
+  }
+
+  struct sockaddr_in my_addr;
+  memset(&my_addr, 0, sizeof(my_addr));
+
+  my_addr.sin_family = myhostname->h_addrtype;
+  memcpy((char *) &my_addr.sin_addr.s_addr,
+         myhostname->h_addr_list[0],
+         myhostname->h_length);
+  my_addr.sin_port = myport;
+
+  listen_addr = my_addr;
+
+  dout(DBL) << "listen addr is " << listen_addr << endl;
+
+  // register to execute timer events
+  //g_timer.set_messenger_kicker(new C_TCPKicker());
+
+
+  dout(DBL) << "init done" << endl;
+  return 0;
+}
+
+
+// on first rank only
+/*
+ * tcpmessenger_start_nameserver  (on first rank only)
+ * Create the nameserver's messenger and the directory on top of it,
+ * declare this process rank 0, and report our listen address through
+ * `diraddr`.
+ */
+void tcpmessenger_start_nameserver(tcpaddr_t& diraddr)
+{
+  dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl;
+
+  // i am rank 0.
+  nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY);
+
+  // start name server
+  nameserver = new TCPDirectory(nsmessenger);
+
+  // diraddr is my addr!
+  rank_addr[0] = listen_addr;
+  diraddr = listen_addr;
+
+  my_rank = 0;
+  entity_rank[MSG_ADDR_DIRECTORY] = 0;
+}
+/*
+ * tcpmessenger_stop_nameserver
+ * Shut down and free the nameserver's messenger, if there is one.  The
+ * global pointer is cleared before shutdown() is called.
+ */
+void tcpmessenger_stop_nameserver()
+{
+  if (!nsmessenger)
+    return;
+
+  dout(DBL) << "shutting down nsmessenger" << endl;
+  TCPMessenger *doomed = nsmessenger;
+  nsmessenger = 0;
+  doomed->shutdown();
+  delete doomed;
+}
+
+// on all ranks
+/*
+ * tcpmessenger_start_rankserver  (on all ranks)
+ * Record the nameserver as rank 0 at address `ns`, open a connection to
+ * it, and create this rank's messenger -- under our rank address if the
+ * rank is already known, otherwise under the "new rank" address.
+ */
+void tcpmessenger_start_rankserver(tcpaddr_t& ns)
+{
+  // connect to nameserver
+  entity_rank[MSG_ADDR_DIRECTORY] = 0;
+  rank_addr[0] = ns;
+  tcp_open(0);
+
+  if (my_rank < 0) {
+    // start rank messenger, and discover my rank.
+    rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW);
+    return;
+  }
+
+  // i know my rank
+  rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank));
+}
+/*
+ * tcpmessenger_stop_rankserver
+ * Shut down and free this rank's messenger, if there is one, and clear
+ * the global pointer afterwards.
+ */
+void tcpmessenger_stop_rankserver()
+{
+  if (!rankmessenger)
+    return;
+
+  dout(DBL) << "shutting down rankmessenger" << endl;
+  rankmessenger->shutdown();
+  delete rankmessenger;
+  rankmessenger = 0;
+}
+
+
+
+
+
+
+// Close every outgoing socket recorded in rank_sd.  Always returns 0.
+int tcpmessenger_shutdown()
+{
+  dout(DBL) << "tcpmessenger_shutdown barrier" << endl;
+  dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl;
+
+  // rank_sd maps rank -> socket descriptor
+  hash_map<int,int>::iterator p = rank_sd.begin();
+  while (p != rank_sd.end()) {
+    ::close(p->second);
+    ++p;
+  }
+
+  return 0;
+}
+
+
+
+
+/***
+ * internal send/recv
+ */
+
+
+
+
+/*
+ * recv a Message*
+ */
+
+
+
+/*
+ * Read one message from socket sd: an envelope, then env.nchunks
+ * length-prefixed payload chunks, which are handed to decode_message().
+ *
+ * Returns 0 if any read fails (tcp_read reports closed socket or error),
+ * if the envelope has type 0 ("dummy" envelope), or if a chunk announces
+ * a negative length.
+ */
+Message *tcp_recv(int sd)
+{
+  dout(DBL) << "tcp_recv receiving message from sd " << sd << endl;
+
+  // envelope
+  msg_envelope_t env;
+  if (!tcp_read( sd, (char*)&env, sizeof(env) ))
+    return 0;
+
+  if (env.type == 0) {
+    dout(DBL) << "got dummy env, bailing" << endl;
+    return 0;
+  }
+
+  dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl;
+
+  // payload
+  bufferlist blist;
+  for (int i=0; i<env.nchunks; i++) {
+    // chunk length; a failed read must not fall through with 'size' unset
+    int size;
+    if (!tcp_read( sd, (char*)&size, sizeof(size) ))
+      return 0;
+    if (size < 0) {
+      dout(1) << "tcp_recv got bad frag len " << size << " on sd " << sd << ", bailing" << endl;
+      return 0;
+    }
+
+    bufferptr bp = new buffer(size);
+
+    // tcp_read returns true without reading when size is 0
+    if (!tcp_read( sd, bp.c_str(), size ))
+      return 0;
+
+    blist.push_back(bp);
+
+    dout(DBL) << "tcp_recv got frag " << i << " of " << env.nchunks << " len " << bp.length() << endl;
+  }
+
+  // unmarshall message
+  size_t s = blist.length();
+  Message *m = decode_message(env, blist);
+
+  if (logger) {
+    logger->inc("in");
+    logger->inc("inb", s+sizeof(env));
+  }
+
+  dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl;
+
+  return m;
+}
+
+
+
+
+/*
+ * Open an outgoing TCP connection to the given rank (address taken from
+ * rank_addr[rank]), record the socket in rank_sd, and attach an OutThread
+ * to the rank in rank_out: a dedicated one when g_conf.tcp_multi_out is
+ * set, otherwise the shared single_out_thread (started here if needed).
+ *
+ * Socket, bind and connect failures are fatal (assert), as is opening a
+ * rank that already has a socket.
+ */
+void tcp_open(int rank)
+{
+  dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl;
+
+  // create socket.  sd 0 is treated as invalid elsewhere (tcp_send asserts
+  // on it), so it is rejected here too.
+  int sd = socket(AF_INET,SOCK_STREAM,0);
+  assert(sd > 0);
+
+  // bind any port.  zero the whole struct first so no uninitialized
+  // padding (sin_zero) is handed to bind().
+  struct sockaddr_in myAddr;
+  memset(&myAddr, 0, sizeof(myAddr));
+  myAddr.sin_family = AF_INET;
+  myAddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  myAddr.sin_port = htons( 0 );
+
+  int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr));
+  assert(rc>=0);
+
+  // connect!
+  int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr));
+  assert(r >= 0);
+
+  // at most one socket per rank
+  assert(rank_sd.count(rank) == 0);
+  rank_sd[rank] = sd;
+
+  if (g_conf.tcp_multi_out) {
+    // one sender thread per peer rank
+    rank_out[rank] = new OutThread();
+    rank_out[rank]->create();
+  } else {
+    // everyone shares one sender thread
+    rank_out[rank] = &single_out_thread;
+    if (!single_out_thread.is_started())
+      single_out_thread.create();
+  }
+}
+
+
+// Encode m's payload unless it already has one.
+void tcp_marshall(Message *m)
+{
+  if (!m->empty_payload())
+    return;  // payload already present
+
+  m->encode_payload();
+}
+
+/*
+ * Find the OutThread that should send m, based on m's destination.
+ *
+ * If the destination's rank is known, make sure a socket to that rank is
+ * open, stamp the socket on the message, and return the rank's OutThread.
+ * Otherwise queue m on waiting_for_lookup (sending a lookup request to the
+ * directory unless one is already pending for that address) and return 0.
+ */
+OutThread *tcp_lookup(Message *m)
+{
+  msg_addr_t addr = m->get_dest();
+
+  if (entity_rank.count(addr)) {
+    // known destination
+    int rank = entity_rank[addr];
+
+    if (!rank_sd.count(rank))   // should only happen on rank0?
+      tcp_open(rank);
+    assert(rank_sd.count(rank));
+
+    m->set_tcp_sd( rank_sd[rank] );
+    return rank_out[rank];
+  }
+
+  // unknown destination: ask the directory, unless we already did
+  if (!waiting_for_lookup.count(addr)) {
+    dout(DBL) << "lookup on " << MSG_ADDR_NICE(addr) << " for " << m << endl;
+    MNSLookup *req = new MNSLookup(addr);
+    rankmessenger->send_message(req, MSG_ADDR_DIRECTORY);
+  } else {
+    dout(DBL) << "already looking up " << MSG_ADDR_NICE(addr) << endl;
+  }
+
+  // park the message until the answer comes back
+  waiting_for_lookup[addr].push_back(m);
+  return 0;
+}
+
+
+/*
+ * send a Message* over the wire. ** do not block **.
+ */
+/*
+ * Write message m to the socket stored in it (m->get_tcp_sd()): first the
+ * envelope, then the payload as length-prefixed chunk(s).
+ *
+ * With TCP_KEEP_CHUNKS every buffer of the payload is sent as its own
+ * chunk; otherwise a single length is sent followed by all buffers back
+ * to back.  Any write error is fatal (assert).  The message is deleted
+ * before returning.  Always returns 0.
+ */
+int tcp_send(Message *m)
+{
+  int sd = m->get_tcp_sd();
+  assert(sd);
+
+  // get envelope, buffers
+  msg_envelope_t *env = &m->get_envelope();
+  bufferlist blist;
+  blist.claim( m->get_payload() );
+
+#ifdef TCP_KEEP_CHUNKS
+  env->nchunks = blist.buffers().size();
+#else
+  env->nchunks = 1;
+#endif
+
+  dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest())
+            << " sd " << sd << endl;
+
+  // send envelope
+  int r = tcp_write( sd, (char*)env, sizeof(*env) );
+  if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+
+  // payload
+#ifdef TCP_KEEP_CHUNKS
+  // send chunk-wise
+  int i = 0;
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl;
+    int size = (*it).length();
+    r = tcp_write( sd, (char*)&size, sizeof(size) );
+    if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+    // tcp_write asserts len > 0, so an empty chunk sends its length only
+    if (size > 0) {
+      r = tcp_write( sd, (*it).c_str(), size );
+      if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+    }
+    i++;
+  }
+#else
+  // one big chunk
+  int size = blist.length();
+  r = tcp_write( sd, (char*)&size, sizeof(size) );
+  if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); }
+  for (list<bufferptr>::iterator it = blist.buffers().begin();
+       it != blist.buffers().end();
+       it++) {
+    // tcp_write asserts len > 0; empty buffers contribute nothing anyway
+    if ((*it).length() == 0)
+      continue;
+    r = tcp_write( sd, (*it).c_str(), (*it).length() );
+    if (r < 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); }
+  }
+#endif
+
+  // hose message
+  delete m;
+  return 0;
+}
+
+
+
+
+
+/** tcp_outthread
+ * this thread watches the outgoing queue, and encodes+sends any queued messages
+ */
+
+// Sender thread main loop: drain q and send each message, sleeping on cond
+// while q is empty.  Exits once q is empty and done is set.  Returns 0.
+void* OutThread::entry()
+{
+  lock.Lock();
+  for (;;) {
+    if (q.empty()) {
+      if (done)
+        break;
+
+      // nothing to do yet
+      dout(DBL) << "outthread sleeping" << endl;
+      cond.Wait(lock);
+      continue;
+    }
+
+    dout(DBL) << "outthread grabbing message(s)" << endl;
+
+    // take the whole queue in one go
+    list<Message*> batch;
+    batch.splice(batch.begin(), q);
+
+    // send without holding the lock
+    lock.Unlock();
+
+    while (!batch.empty()) {
+      Message *msg = batch.front();
+      batch.pop_front();
+
+      dout(DBL) << "outthread sending " << msg << endl;
+
+      // encode here unless the sender already did it
+      if (!g_conf.tcp_serial_marshall)
+        tcp_marshall(msg);
+
+      tcp_send(msg);
+    }
+
+    lock.Lock();
+  }
+  dout(DBL) << "outthread done" << endl;
+
+  lock.Unlock();
+  return 0;
+}
+
+
+
+/** tcp_inthread
+ * read incoming messages from a given peer.
+ * give received and decoded messages to dispatch loop.
+ */
+/*
+ * Reader thread body.  The thread argument carries the socket descriptor
+ * (as passed by tcp_acceptthread).  Messages are read with tcp_recv until
+ * it returns 0 or tcp_done is set, and each one is queued for dispatch:
+ * on the destination messenger's own queue when g_conf.tcp_multi_dispatch
+ * is set, otherwise on the global incoming queue.  Returns 0.
+ */
+void *tcp_inthread(void *r)
+{
+  // go through long so the pointer -> int narrowing is explicit; a direct
+  // (int) cast of a pointer is rejected by the compiler on LP64 targets
+  int sd = (int)(long)r;
+
+  dout(DBL) << "tcp_inthread reading on sd " << sd << endl;
+
+  while (!tcp_done) {
+    Message *m = tcp_recv(sd);
+    if (!m) break;
+    msg_addr_t who = m->get_source();
+
+    dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl;
+
+    // give to dispatch loop
+    size_t sz = m->get_payload().length();
+
+    if (g_conf.tcp_multi_dispatch) {
+      const msg_addr_t dest = m->get_dest();
+
+      // use find(): operator[] would insert a null entry for an unknown
+      // destination and leave it behind in the directory
+      TCPMessenger *messenger = 0;
+      directory_lock.Lock();
+      hash_map<msg_addr_t,TCPMessenger*>::iterator p = directory.find(dest);
+      if (p != directory.end())
+        messenger = p->second;
+      directory_lock.Unlock();
+
+      if (messenger) {
+        messenger->dispatch_queue(m);
+      } else {
+        dout(0) << "dest " << dest << " dne" << endl;
+        delete m;  // nobody will ever see this message; don't leak it
+      }
+
+    } else {
+      // single dispatch queue
+      incoming_lock.Lock();
+      {
+        incoming.push_back(m);
+        incoming_cond.Signal();
+
+        // queue statistics (message count, payload bytes)
+        stat_inq++;
+        stat_inqb += sz;
+      }
+      incoming_lock.Unlock();
+    }
+  }
+
+  dout(DBL) << "tcp_inthread closing " << sd << endl;
+
+  // the socket is not closed here
+  //::close(sd);
+  return 0;
+}
+
+/** tcp_acceptthread
+ * accept incoming connections from peers.
+ * start a tcp_inthread for each.
+ */
+/*
+ * Accept loop: accept connections on listen_sd and start one tcp_inthread
+ * per accepted socket, remembering the thread in in_threads[sd].
+ * The loop ends when tcp_done is set or accept() does not return a
+ * positive descriptor.  Returns 0.
+ */
+void *tcp_acceptthread(void *)
+{
+  dout(DBL) << "tcp_acceptthread starting" << endl;
+
+  while (!tcp_done) {
+    struct sockaddr_in addr;
+    socklen_t slen = sizeof(addr);
+    int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen);
+    if (sd <= 0) {
+      dout(DBL) << "no incoming connection?" << endl;
+      break;
+    }
+
+    dout(DBL) << "accepted incoming on sd " << sd << endl;
+
+    // the descriptor travels in the thread argument; widen via long so the
+    // int -> pointer conversion is explicit
+    pthread_t th;
+    int rc = pthread_create(&th,
+                            NULL,
+                            tcp_inthread,
+                            (void*)(long)sd);
+    if (rc != 0) {
+      // no reader was started: don't record an indeterminate thread id,
+      // and don't leave the socket open with nobody reading it
+      dout(0) << "tcp_acceptthread pthread_create failed (" << rc << "), dropping sd " << sd << endl;
+      ::close(sd);
+      continue;
+    }
+    in_threads[sd] = th;
+  }
+  return 0;
+}
+
+
+
+
+/** tcp_dispatchthread
+ * wait for pending timers, incoming messages. dispatch them.
+ */
+/*
+ * Per-messenger dispatch loop (run by dispatch_thread).  Repeatedly grabs
+ * everything on the incoming queue and dispatches it with the lock
+ * dropped, sleeping on incoming_cond while the queue is empty.  Exits once
+ * the queue is empty and incoming_stop is set.
+ *
+ * NOTE(review): the stat_* counters used here are also updated by the
+ * single-queue paths elsewhere in this file, and dispatch_queue() does not
+ * touch them; whether the accounting is right in multi-dispatch mode
+ * should be confirmed.
+ */
+void TCPMessenger::dispatch_entry()
+{
+  incoming_lock.Lock();
+  while (!incoming.empty() || !incoming_stop) {
+    if (!incoming.empty()) {
+      // grab incoming messages
+      list<Message*> in;
+      in.splice(in.begin(), incoming);
+
+      assert(stat_disq == 0);
+      stat_disq = stat_inq;
+      stat_disqb = stat_inqb;
+      stat_inq = 0;
+      stat_inqb = 0;
+
+      // drop lock while we deliver
+      incoming_lock.Unlock();
+
+      // dispatch!
+      while (!in.empty()) {
+        Message *m = in.front();
+        in.pop_front();
+
+        stat_disq--;
+        stat_disqb -= m->get_payload().length();
+        if (logger) {
+          logger->set("inq", stat_inq+stat_disq);
+          logger->set("inqb", stat_inqb+stat_disqb);  // bytes, not message count
+          logger->inc("dis");
+        }
+
+        dout(4) << g_clock.now() << " ---- '" << m->get_type_name() <<
+          "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+          " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- "
+                << m
+                << endl;
+
+        dispatch(m);
+      }
+
+      // retake the lock before looping: the loop condition, the Wait()
+      // below and the final Unlock() all require it to be held
+      incoming_lock.Lock();
+      continue;
+    }
+
+    // sleep
+    dout(DBL) << "dispatch: waiting for incoming messages" << endl;
+    incoming_cond.Wait(incoming_lock);
+    dout(DBL) << "dispatch: woke up" << endl;
+  }
+  incoming_lock.Unlock();
+}
+
+
+/*
+ * Global dispatch thread (single dispatch queue mode).  Loops until
+ * tcp_done is set and the incoming queue is empty: waits for messages,
+ * grabs the whole queue, and delivers each message with the lock dropped.
+ * Messages of type MSG_NS_CONNECTACK or addressed to this rank go to
+ * rankserver; everything else goes to the messenger registered for the
+ * destination in 'directory' (unknown destinations are fatal).
+ * Shuts down g_timer on the way out.  Returns 0.
+ */
+void* tcp_dispatchthread(void*)
+{
+  dout(5) << "tcp_dispatchthread start pid " << getpid() << endl;
+
+  while (1) {
+    // inq?
+    incoming_lock.Lock();
+
+    // done?
+    if (tcp_done && incoming.empty()) {
+      incoming_lock.Unlock();
+      break;
+    }
+
+    // wait?  (a wakeup with nothing queued just goes around the loop again)
+    if (incoming.empty()) {
+      dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl;
+      incoming_cond.Wait(incoming_lock);
+      dout(DBL) << "dispatch: woke up" << endl;
+    }
+
+    // grab incoming messages
+    list<Message*> in;
+    in.splice(in.begin(), incoming);
+
+    // move the "queued" counters over to the "being dispatched" counters
+    assert(stat_disq == 0);
+    stat_disq = stat_inq;
+    stat_disqb = stat_inqb;
+    stat_inq = 0;
+    stat_inqb = 0;
+
+    // drop lock while we deliver
+    incoming_lock.Unlock();
+
+    // dispatch!
+    while (!in.empty()) {
+      Message *m = in.front();
+      in.pop_front();
+
+      stat_disq--;
+      stat_disqb -= m->get_payload().length();
+      if (logger) {
+        logger->set("inq", stat_inq+stat_disq);
+        logger->set("inqb", stat_inqb+stat_disqb);  // bytes, not message count
+        logger->inc("dis");
+      }
+
+      dout(DBL) << "dispatch doing " << *m << endl;
+
+      // for rankserver?
+      if (m->get_type() == MSG_NS_CONNECTACK ||        // i just connected
+          m->get_dest() == MSG_ADDR_RANK(my_rank)) {
+        dout(DBL) << " giving to rankserver" << endl;
+        rankserver.dispatch(m);
+        continue;
+      }
+
+      // ok
+      msg_addr_t dest = m->get_dest();
+      directory_lock.Lock();
+      if (directory.count(dest)) {
+        Messenger *who = directory[ dest ];
+        directory_lock.Unlock();
+
+        dout(4) << g_clock.now() << " ---- '" << m->get_type_name() <<
+          "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() <<
+          " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- "
+                << *m
+                << endl;
+
+        who->dispatch(m);
+      } else {
+        directory_lock.Unlock();
+        dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is." << endl;
+        assert(0);
+      }
+    }
+    assert(stat_disq == 0);
+
+  }
+
+
+  g_timer.shutdown();
+
+  dout(5) << "tcp_dispatchthread exiting loop" << endl;
+  return 0;
+}
+
+
+// start the accept and dispatch threads (and the shared outgoing thread, if used)
+/*
+ * Start the background threads: the accept thread, the global dispatch
+ * thread and, unless g_conf.tcp_multi_out is set, the shared outgoing
+ * thread.  Always returns 0.
+ */
+int tcpmessenger_start()
+{
+  dout(5) << "starting accept thread" << endl;
+  pthread_create(&listen_thread_id,
+                 NULL,
+                 tcp_acceptthread,
+                 0);
+
+  dout(5) << "starting dispatch thread" << endl;
+  pthread_create(&dispatch_thread_id,
+                 NULL,
+                 tcp_dispatchthread,
+                 0);
+
+  // tcp_open() also starts the shared out thread on demand, guarded by
+  // is_started(); use the same guard here so it is never created twice
+  if (!g_conf.tcp_multi_out &&
+      !single_out_thread.is_started())
+    single_out_thread.create();
+  return 0;
+}
+
+
+/*
+ * kick and wake up _loop (to pick up new outgoing message, or quit)
+ */
+
+// Wake the global dispatch thread by signalling incoming_cond.
+// Not implemented for multi-dispatch mode (asserts).
+void tcpmessenger_kick_dispatch_loop()
+{
+  if (!g_conf.tcp_multi_dispatch) {
+    // just one dispatch loop to wake
+    dout(DBL) << "kicking" << endl;
+    incoming_lock.Lock();
+    dout(DBL) << "prekick" << endl;
+    incoming_cond.Signal();
+    incoming_lock.Unlock();
+    dout(DBL) << "kicked" << endl;
+    return;
+  }
+
+  // multi-dispatch: kicking every messenger's loop is not implemented
+  assert(0);
+}
+
+/*
+void tcpmessenger_kick_outgoing_loop()
+{
+ outgoing_lock.Lock();
+ outgoing_cond.Signal();
+ outgoing_lock.Unlock();
+}
+*/
+
+
+// wait for thread to finish
+
+// Block until messaging has finished: in multi-dispatch mode wait for
+// tcp_done on incoming_cond, otherwise kick and join the dispatch thread.
+void tcpmessenger_wait()
+{
+  if (!g_conf.tcp_multi_dispatch) {
+    // old way: wake the dispatch loop and wait for its thread to exit
+    dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl;
+    tcpmessenger_kick_dispatch_loop();
+
+    dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl;
+    void *thread_result;
+    pthread_join(dispatch_thread_id, &thread_result);
+    dout(10) << "tcpmessenger_wait thread finished." << endl;
+    return;
+  }
+
+  // new way: sleep until somebody sets tcp_done and signals
+  incoming_lock.Lock();
+  while (!tcp_done) {
+    incoming_cond.Wait(incoming_lock);
+  }
+  incoming_lock.Unlock();
+}
+
+
+
+
+/*
+ * Register addr with the name server and return the address handed back.
+ *
+ * If this process does not know its rank yet (my_rank < 0) it first sends
+ * an MNSConnect and sleeps until my_rank has been filled in.  It then
+ * sends an MNSRegister tagged with a fresh id and sleeps until a result
+ * for that id shows up in waiting_for_register_result.
+ * lookup_lock is held throughout, except while waiting.
+ */
+msg_addr_t register_entity(msg_addr_t addr)
+{
+  lookup_lock.Lock();
+
+  // fresh request id, plus the condition we will sleep on for the reply
+  long id = ++regid;
+  Cond reply_cond;
+  waiting_for_register_cond[id] = &reply_cond;
+
+  if (my_rank < 0) {
+    dout(DBL) << "register_entity don't know my rank, connecting" << endl;
+
+    // connect to nameserver; discover my rank.
+    Message *hello = new MNSConnect(listen_addr);
+    hello->set_dest(MSG_ADDR_DIRECTORY, 0);
+    tcp_marshall(hello);
+    OutThread *hello_out = tcp_lookup(hello);
+    assert(hello_out);
+    tcp_send(hello);
+
+    // wait for reply
+    while (my_rank < 0) {
+      waiting_for_rank.Wait(lookup_lock);
+    }
+    assert(my_rank > 0);
+  }
+
+  // send the registration request
+  dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl;
+  Message *req = new MNSRegister(addr, my_rank, id);
+  req->set_dest(MSG_ADDR_DIRECTORY, 0);
+  tcp_marshall(req);
+  OutThread *req_out = tcp_lookup(req);
+  assert(req_out);
+  tcp_send(req);
+
+  // sleep until our result has been posted
+  while (waiting_for_register_result.count(id) == 0) {
+    reply_cond.Wait(lookup_lock);
+  }
+
+  // collect the result and forget the request
+  msg_addr_t entity = waiting_for_register_result[id];
+  waiting_for_register_cond.erase(id);
+  waiting_for_register_result.erase(id);
+
+  dout(DBL) << "register_entity got " << MSG_ADDR_NICE(entity) << endl;
+
+  lookup_lock.Unlock();
+
+  // ok!
+  return entity;
+}
+
+
+
+/***********
+ * Tcpmessenger class implementation
+ */
+
+
+/*
+ * Create a messenger for myaddr.  Every address except MSG_ADDR_DIRECTORY
+ * is first registered with the name server (register_entity), and the
+ * address that comes back is the one actually used.  The messenger is then
+ * entered into the process-wide 'directory'.
+ */
+TCPMessenger::TCPMessenger(msg_addr_t myaddr) :
+  Messenger(myaddr),
+  incoming_stop(false),   // otherwise unset until dispatch_start()/dispatch_stop()
+  dispatch_thread(this)
+{
+  if (myaddr != MSG_ADDR_DIRECTORY) {
+    // register!  (blocks until the name server answers)
+    myaddr = register_entity(myaddr);
+  }
+
+  // my address
+  set_myaddr( myaddr );
+
+  // register myself in the messenger directory
+  directory_lock.Lock();
+  {
+    directory[myaddr] = this;
+
+    stat_num++;
+    if (logger) logger->set("num", stat_num);
+  }
+  directory_lock.Unlock();
+}
+
+
+// Mark this messenger as ready in directory_ready and, unless it is the
+// directory itself, send MSG_NS_STARTED to the name server.
+void TCPMessenger::ready()
+{
+  directory_lock.Lock();
+  directory_ready.insert(get_myaddr());
+  directory_lock.Unlock();
+
+  if (get_myaddr() == MSG_ADDR_DIRECTORY)
+    return;   // the directory does not announce itself
+
+  // started! tell namer we are up and running.
+  lookup_lock.Lock();
+  Message *started = new MGenericMessage(MSG_NS_STARTED);
+  started->set_source(get_myaddr(), 0);
+  started->set_dest(MSG_ADDR_DIRECTORY, 0);
+  tcp_marshall(started);
+  OutThread *out = tcp_lookup(started);
+  assert(out);
+  tcp_send(started);
+  lookup_lock.Unlock();
+}
+
+
+// Nothing to release here; the logger is not deleted.
+TCPMessenger::~TCPMessenger()
+{
+}
+
+// Returns a reference to the global listen_addr (not a per-messenger value).
+tcpaddr_t& TCPMessenger::get_tcpaddr()
+{
+  tcpaddr_t& mine = listen_addr;
+  return mine;
+}
+
+// Record, under lookup_lock, that entity e lives on rank r.
+void TCPMessenger::map_entity_rank(msg_addr_t e, int r)
+{
+  lookup_lock.Lock();
+  {
+    entity_rank[e] = r;
+  }
+  lookup_lock.Unlock();
+}
+
+// Record, under lookup_lock, that rank r is reachable at address a.
+void TCPMessenger::map_rank_addr(int r, tcpaddr_t a)
+{
+  lookup_lock.Lock();
+  {
+    rank_addr[r] = a;
+  }
+  lookup_lock.Unlock();
+}
+
+
+// Messages queued (stat_inq) plus messages grabbed for dispatch but not yet
+// delivered (stat_disq).  Read without taking any lock.
+int TCPMessenger::get_dispatch_queue_len()
+{
+  int pending = stat_inq + stat_disq;
+  return pending;
+}
+
+
+// Shut this messenger down: optionally tell the name server we are leaving,
+// remove ourselves from the process-wide directory, and -- if that left the
+// directory empty -- tear down the shared machinery (incoming sockets,
+// dispatch loop, outgoing threads).  Always returns 0.
+int TCPMessenger::shutdown()
+{
+ dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl;
+
+ // don't send an unregister from the nsmessenger's own shutdown!
+ // (only sent when we have a positive rank or a name server exists locally)
+ if (this != nsmessenger &&
+ (my_rank > 0 || nsmessenger)) {
+ dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl;
+ send_message(new MGenericMessage(MSG_NS_UNREGISTER),
+ MSG_ADDR_DIRECTORY);
+ }
+
+ // remove me from the directory
+ directory_lock.Lock();
+ directory.erase(get_myaddr());
+
+ // last one?  (decided now, while the directory lock is held)
+ bool lastone = directory.empty();
+ //dout(1) << "lastone = " << lastone << " .. " << directory.size() << endl;
+
+
+ // or almost last one?  if a rankmessenger exists and exactly one entry is
+ // left, stop the rank server too.  the lock is dropped around the call
+ // because tcpmessenger_stop_rankserver() calls shutdown() on the
+ // rankmessenger, which takes directory_lock itself.
+ if (rankmessenger && directory.size() == 1) {
+ directory_lock.Unlock();
+ tcpmessenger_stop_rankserver();
+ directory_lock.Lock();
+ }
+
+ stat_num--;
+ if (logger) logger->set("num", stat_num);
+
+ directory_lock.Unlock();
+
+ // last one?
+ if (lastone) {
+ dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl;
+ //pthread_t whoami = pthread_self();
+
+ // no more timer events
+ //g_timer.unset_messenger();
+
+ // close incoming sockets (the reader threads are not joined)
+ //void *r;
+ for (map<int,pthread_t>::iterator it = in_threads.begin();
+ it != in_threads.end();
+ it++) {
+ dout(DBL) << "closing reader on sd " << it->first << endl;
+ ::close(it->first);
+ //pthread_join(it->second, &r);
+ }
+
+ if (g_conf.tcp_multi_dispatch) {
+ // kill off dispatch threads
+ // NOTE(review): lastone was computed from directory.empty() above, so
+ // this loop looks like it has nothing left to visit -- confirm.
+ dout(DBL) << "killing dispatch threads" << endl;
+ for (hash_map<msg_addr_t,TCPMessenger*>::iterator it = directory.begin();
+ it != directory.end();
+ it++)
+ it->second->dispatch_stop();
+ }
+
+ dout(DBL) << "setting tcp_done" << endl;
+
+ // kick/kill incoming thread: set the flag and signal under the lock so a
+ // waiter on incoming_cond sees it
+ incoming_lock.Lock();
+ tcp_done = true;
+ incoming_cond.Signal();
+ incoming_lock.Unlock();
+
+ // finish off outgoing thread(s)
+ dout(10) << "waiting for outgoing to finish" << endl;
+ if (g_conf.tcp_multi_out) {
+ // one OutThread per rank: stop and free each.
+ // the freed pointers are left in rank_out.
+ for (hash_map<int,OutThread*>::iterator it = rank_out.begin();
+ it != rank_out.end();
+ it++) {
+ it->second->stop();
+ delete it->second;
+ }
+ } else {
+ // single shared OutThread
+ single_out_thread.stop();
+ }
+
+
+ /*
+
+ dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl;
+ if (whoami == thread_id) {
+ // i am the event loop thread, just set flag!
+ dout(15) << " set tcp_done=true" << endl;
+ tcp_done = true;
+ }
+ */
+ }
+ return 0;
+}
+
+
+
+
+/***
+ * public messaging interface
+ */
+
+
+/* note: send_message _MUST_ be non-blocking */
+/*
+ * Send m to dest:port, stamped as coming from this messenger on fromport.
+ *
+ * If dest is a ready messenger in this process, m is appended to the
+ * global incoming queue.  Otherwise it goes out over TCP: marshalled here
+ * when g_conf.tcp_serial_marshall is set, and either sent from this thread
+ * (g_conf.tcp_serial_out) or handed to the OutThread found by tcp_lookup.
+ * When tcp_lookup returns 0 the message has been parked for a pending
+ * lookup and nothing more is done here.  Always returns 0.
+ */
+int TCPMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport)
+{
+  // fill in the envelope
+  m->set_source(get_myaddr(), fromport);
+  m->set_dest(dest, port);
+  m->set_lamport_send_stamp( get_lamport() );
+
+  dout(4) << "--> " << m->get_type_name()
+          << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port()
+          << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port()
+          << " ---- " << m
+          << endl;
+
+  // is the destination a registered *and* ready messenger in this process?
+  TCPMessenger *entity = 0;
+  directory_lock.Lock();
+  if (directory.count(dest)) {
+    if (directory_ready.count(dest))
+      entity = directory[dest];
+  }
+  directory_lock.Unlock();
+
+  if (!entity) {
+    // remote!
+    if (g_conf.tcp_serial_marshall)
+      tcp_marshall(m);
+
+    if (!g_conf.tcp_serial_out) {
+      // hand off to the sender thread for the destination
+      lookup_lock.Lock();
+      OutThread *sender = tcp_lookup(m);
+      lookup_lock.Unlock();
+
+      if (sender) sender->send(m);
+    } else {
+      // send in this thread, still holding lookup_lock
+      lookup_lock.Lock();
+      if (tcp_lookup(m))
+        tcp_send(m);
+      lookup_lock.Unlock();
+    }
+    return 0;
+  }
+
+  // local!
+  ::incoming_lock.Lock();
+  {
+    dout(20) << " queueing locally for " << dest << " " << m << endl;
+    ::incoming.push_back(m);
+    ::incoming_cond.Signal();
+    stat_inq++;
+    stat_inqb += m->get_payload().length();
+  }
+  ::incoming_lock.Unlock();
+
+  return 0;
+}
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __TCPMESSENGER_H
+#define __TCPMESSENGER_H
+
+#include "Messenger.h"
+#include "Dispatcher.h"
+#include "common/Thread.h"
+
+#include "tcp.h"
+
+class Timer;
+
+
+// Messenger implementation that moves messages over TCP sockets.
+// Besides the Messenger interface it carries an optional per-messenger
+// incoming queue with its own dispatch thread (dispatch_start/stop/queue).
+class TCPMessenger : public Messenger {
+ protected:
+
+ //class Logger *logger; // for logging
+
+ // per-messenger incoming queue; incoming_lock guards incoming and
+ // incoming_stop, incoming_cond is signalled when either changes
+ bool incoming_stop; // assigned by dispatch_start() / dispatch_stop()
+ Mutex incoming_lock;
+ list<Message*> incoming;
+ Cond incoming_cond;
+
+ // thread wrapper that runs dispatch_entry() on the owning messenger
+ class DispatchThread : public Thread {
+ TCPMessenger *m;
+ public:
+ DispatchThread(TCPMessenger *_m) : m(_m) {}
+ void *entry() {
+ m->dispatch_entry();
+ return 0;
+ }
+ } dispatch_thread;
+
+ // body of the dispatch thread
+ void dispatch_entry();
+
+public:
+ // clear the stop flag and start the dispatch thread
+ void dispatch_start() {
+ incoming_stop = false;
+ dispatch_thread.create();
+ }
+ /* void dispatch_kick() {
+ incoming_lock.Lock();
+ incoming_cond.Signal();
+ incoming_lock.Unlock();
+ }*/
+ // set the stop flag, wake the dispatch thread, and wait for it to exit
+ void dispatch_stop() {
+ incoming_lock.Lock();
+ incoming_stop = true;
+ incoming_cond.Signal();
+ incoming_lock.Unlock();
+ dispatch_thread.join();
+ }
+ // append m to this messenger's incoming queue and wake the dispatch thread
+ void dispatch_queue(Message *m) {
+ incoming_lock.Lock();
+ incoming.push_back(m);
+ incoming_cond.Signal();
+ incoming_lock.Unlock();
+ }
+
+ public:
+ TCPMessenger(msg_addr_t myaddr);
+ ~TCPMessenger();
+
+ void ready();
+
+ tcpaddr_t& get_tcpaddr();
+ void map_entity_rank(msg_addr_t e, int r);
+ void map_rank_addr(int r, tcpaddr_t a);
+
+ int get_dispatch_queue_len();
+
+ void callback_kick();
+
+ // shut this messenger down
+ virtual int shutdown();
+
+ // message interface
+ virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0);
+};
+
+/**
+ * these are all ONE per process.
+ */
+
+extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta);
+
+extern int tcpmessenger_findns(tcpaddr_t &nsa);
+
+extern int tcpmessenger_init();
+extern int tcpmessenger_start(); // start thread
+extern void tcpmessenger_wait(); // wait for thread to finish.
+extern int tcpmessenger_shutdown(); // finalize MPI
+
+extern void tcpmessenger_start_nameserver(tcpaddr_t& ta); // on rank 0
+extern void tcpmessenger_stop_nameserver(); // on rank 0
+extern void tcpmessenger_start_rankserver(tcpaddr_t& ta); // on all ranks
+extern void tcpmessenger_stop_rankserver(); // on all ranks
+
+extern int tcpmessenger_get_rank();
+
+
+#endif
--- /dev/null
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "include/error.h"
+
+#define EXIT_USAGE_ERROR -1 /* error codes for program exit */
+#define EXIT_SYSTEM_ERROR -2
+#define EXIT_GENERIC_ERROR -3
+#define MSGSIZ 1024 /* maximum error message length */
+
+/*
+ * Print a usage error to stderr and exit with EXIT_USAGE_ERROR.
+ * The output is the printf-style message (fmt, ...), a newline, then the
+ * usage text 'use' printed verbatim.  The whole thing is truncated to
+ * fit in MSGSIZ bytes.  Does not return.
+ */
+void userror(const char *use, const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+  va_list ap;
+
+  /* format at the start of the buffer ('len' has no value yet) */
+  va_start(ap, fmt);
+  len = vsnprintf(msg, MSGSIZ, fmt, ap);
+  va_end(ap);
+
+  /* vsnprintf returns the untruncated length, or <0 on error: clamp it so
+     msg+len / MSGSIZ-len below stay inside the buffer */
+  if (len < 0) {
+    msg[0] = '\0';
+    len = 0;
+  } else if (len >= MSGSIZ) {
+    len = MSGSIZ - 1;
+  }
+
+  len += snprintf(msg+len, MSGSIZ-len, "\n");
+  if (len >= MSGSIZ)
+    len = MSGSIZ - 1;
+
+  /* 'use' is data, not a format string */
+  snprintf(msg+len, MSGSIZ-len, "%s", use);
+
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_USAGE_ERROR);
+}
+
+/*
+ * Print the printf-style message (fmt, ...) followed by ": " and the
+ * strerror() text for errno as it was on entry, then exit with
+ * EXIT_SYSTEM_ERROR.  Output is truncated to fit in MSGSIZ bytes.
+ * Does not return.
+ */
+void syserror(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+  int saved_errno = errno;   /* formatting below may change errno */
+  va_list ap;
+
+  /* format at the start of the buffer ('len' has no value yet) */
+  va_start(ap, fmt);
+  len = vsnprintf(msg, MSGSIZ, fmt, ap);
+  va_end(ap);
+
+  /* clamp: vsnprintf returns the untruncated length, or <0 on error */
+  if (len < 0) {
+    msg[0] = '\0';
+    len = 0;
+  } else if (len >= MSGSIZ) {
+    len = MSGSIZ - 1;
+  }
+
+  snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(saved_errno));
+  fprintf(stderr, "%s", msg);
+  exit(EXIT_SYSTEM_ERROR);
+}
+
+/*
+ * Print the printf-style message (fmt, ...) plus a newline to stderr and
+ * exit with EXIT_GENERIC_ERROR.  The message is truncated to fit in
+ * MSGSIZ bytes.  Does not return.
+ */
+void exiterror(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+  va_list ap;
+
+  /* format at the start of the buffer ('len' has no value yet) */
+  va_start(ap, fmt);
+  len = vsnprintf(msg, MSGSIZ, fmt, ap);
+  va_end(ap);
+
+  if (len < 0)
+    msg[0] = '\0';   /* formatting failed; print an empty message */
+
+  fprintf(stderr, "%s\n", msg);
+  exit(EXIT_GENERIC_ERROR);
+}
+
+/*
+ * Print the printf-style message (fmt, ...) plus a newline to stderr.
+ * The message is truncated to fit in MSGSIZ bytes.
+ */
+void error(const char *fmt, ...)
+{
+  char msg[MSGSIZ];
+  int len;
+  va_list ap;
+
+  /* format at the start of the buffer ('len' has no value yet) */
+  va_start(ap, fmt);
+  len = vsnprintf(msg, MSGSIZ, fmt, ap);
+  va_end(ap);
+
+  if (len < 0)
+    msg[0] = '\0';   /* formatting failed; print an empty message */
+
+  fprintf(stderr, "%s\n", msg);
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <mpi.h>
+
+#include "TCPMessenger.h"
+
+/*
+ * start up TCPMessenger via MPI.
+ */
+
+/*
+ * Bring up the TCP messenger on every MPI process.  MPI is used only to
+ * learn this process's rank and the world size and to broadcast the name
+ * server's address from rank 0; it is finalized before returning.
+ * Returns (mpi rank, mpi world size).
+ */
+pair<int,int> mpi_bootstrap_tcp(int& argc, char**& argv)
+{
+  tcpmessenger_init();
+  tcpmessenger_start();
+
+  // exchange addresses with other nodes
+  MPI_Init(&argc, &argv);
+
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  // rank 0 hosts the directory; everyone else starts with a blank address
+  tcpaddr_t nsaddr;
+  if (mpi_rank != 0) {
+    memset(&nsaddr, 0, sizeof(nsaddr));
+  } else {
+    dout(30) << "i am rank 0, starting ns directory" << endl;
+    tcpmessenger_start_nameserver(nsaddr);
+  }
+
+  // distribute the name server address (raw bytes) from rank 0
+  int bcast_rc = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR,
+                           0, MPI_COMM_WORLD);
+
+  dout(30) << "r = " << bcast_rc << " ns tcpaddr is " << nsaddr << endl;
+  tcpmessenger_start_rankserver(nsaddr);
+
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+
+  pair<int,int> whoami(mpi_rank, mpi_world);
+  return whoami;
+}
+
+
--- /dev/null
+#include <mpi.h>
+#include "NewMessenger.h"
+
+/*
+ * start up NewMessenger via MPI.
+ */
+
+/*
+ * Bring up NewMessenger's 'rank' on every MPI process.  Rank 0 starts
+ * first and its listen address is broadcast to the others, which then
+ * start with that address.  MPI is finalized before returning.
+ * Returns (mpi rank, mpi world size).
+ */
+pair<int,int> mpi_bootstrap_new(int& argc, char**& argv)
+{
+  MPI_Init(&argc, &argv);
+
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  const bool is_root = (mpi_rank == 0);
+
+  tcpaddr_t nsaddr;
+  memset(&nsaddr, 0, sizeof(nsaddr));
+
+  if (is_root) {
+    // i am root: start first, then publish where i am listening
+    rank.my_rank = 0;
+    rank.start_rank(nsaddr);
+    nsaddr = rank.get_listen_addr();
+  }
+
+  // ship root's address (raw bytes) to everyone
+  int bcast_rc = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR,
+                           0, MPI_COMM_WORLD);
+
+  dout(30) << "r = " << bcast_rc << " ns tcpaddr is " << nsaddr << endl;
+
+  if (!is_root) {
+    rank.start_rank(nsaddr);
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  MPI_Finalize();
+
+  pair<int,int> whoami(mpi_rank, mpi_world);
+  return whoami;
+}
--- /dev/null
+
+#include "tcp.h"
+
+/******************
+ * tcp crap
+ */
+
+/*
+ * Read exactly len bytes from socket sd into buf.
+ * Returns true once all bytes have arrived (immediately if len <= 0),
+ * false if the peer closed the connection or recv() failed.
+ * A recv() interrupted by a signal (EINTR) is retried.
+ */
+bool tcp_read(int sd, char *buf, int len)
+{
+  while (len > 0) {
+    int got = ::recv( sd, buf, len, 0 );
+    if (got < 0 && errno == EINTR)
+      continue;   // interrupted before any data arrived; not a socket error
+    if (got == 0) {
+      dout(18) << "tcp_read socket " << sd << " closed" << endl;
+      return false;
+    }
+    if (got < 0) {
+      dout(18) << "tcp_read bailing with " << got << endl;
+      return false;
+    }
+    len -= got;
+    buf += got;
+  }
+  return true;
+}
+
+/*
+ * Write exactly len bytes from buf to socket sd.  len must be > 0.
+ * Returns 0 on success, or the negative value returned by send() on
+ * failure (errno is left as send() set it).
+ * A send() interrupted by a signal (EINTR) is retried.
+ */
+int tcp_write(int sd, char *buf, int len)
+{
+  assert(len > 0);
+  while (len > 0) {
+    int did = ::send( sd, buf, len, 0 );
+    if (did < 0) {
+      if (errno == EINTR)
+        continue;   // interrupted before anything was sent; try again
+      dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl;
+      return did;
+    }
+    len -= did;
+    buf += did;
+  }
+  return 0;
+}
+
+
+/*
+ * Parse "host:port" from str and fill in ta.
+ * str is modified in place: the first ':' is overwritten with a NUL.
+ * The port number is stored in ta.sin_port exactly as atoi() returns it
+ * (no byte-order conversion is applied here).
+ * Returns 0 on success, -1 if str has no ':' or the host cannot be resolved.
+ */
+int tcp_hostlookup(char *str, tcpaddr_t& ta)
+{
+  // split at the first ':'
+  char *colon = strchr(str, ':');
+  if (!colon) {
+    cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl;
+    return -1;
+  }
+  *colon = 0;
+  char *host = str;
+  char *port = colon + 1;
+
+  int portnum = atoi(port);
+
+  struct hostent *he = gethostbyname( host );
+  if (he == 0) {
+    cerr << "host " << host << " not found" << endl;
+    return -1;
+  }
+
+  // build the address from the first entry of the lookup result
+  memset(&ta, 0, sizeof(ta));
+  ta.sin_family = he->h_addrtype;
+  memcpy((char *)&ta.sin_addr,
+         he->h_addr,
+         he->h_length);
+  ta.sin_port = portnum;
+
+  cout << "lookup '" << host << ":" << port << "' -> " << ta << endl;
+
+  return 0;
+}
--- /dev/null
+#ifndef __TCP_H
+#define __TCP_H
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+typedef struct sockaddr_in tcpaddr_t;
+
+using std::ostream;
+
+// Print a as "b0.b1.b2.b3:port": the four bytes of sin_addr.s_addr in
+// memory order, then sin_port as a plain integer (no byte-order conversion).
+inline ostream& operator<<(ostream& out, const tcpaddr_t &a)
+{
+  unsigned char octet[4];
+  memcpy((char*)octet, (char*)&a.sin_addr.s_addr, 4);
+
+  for (int i = 0; i < 4; i++) {
+    out << (unsigned)octet[i];
+    out << (i < 3 ? "." : ":");
+  }
+  out << (int)a.sin_port;
+  return out;
+}
+
+extern bool tcp_read(int sd, char *buf, int len);
+extern int tcp_write(int sd, char *buf, int len);
+extern int tcp_hostlookup(char *str, tcpaddr_t& ta);
+
+// Two addresses are equal when family, port and IPv4 address all match.
+// The fields are compared directly: strncmp() stops at the first zero byte
+// (sockaddr_in routinely has one within its first few bytes), so it would
+// call most distinct addresses equal; padding bytes are ignored on purpose.
+inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) {
+  return a.sin_family == b.sin_family &&
+         a.sin_port == b.sin_port &&
+         a.sin_addr.s_addr == b.sin_addr.s_addr;
+}
+inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) {
+  return !(a == b);
+}
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include <fcntl.h>
+
+#include "config.h"
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+#include "client/SyntheticClient.h"
+
+#include "msg/SimpleMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Test callback: prints the result code it was finished with to stdout.
+class C_Test : public Context {
+public:
+  void finish(int r) {
+    cout << "C_Test->finish(" << r << ")";
+    cout << endl;
+  }
+};
+
+
+/*
+ * start up NewMessenger via MPI.
+ */
+#include <mpi.h>
+
+/*
+ * Bring up 'rank' on every MPI process and give every process the same
+ * monitor map.
+ *
+ * Every process starts its rank (bind and listen).  The first
+ * g_conf.num_mon processes are the monitors: their listen addresses are
+ * gathered on rank 0 and stored in monmap->mon_inst.  Rank 0 then encodes
+ * the map, writes it to ".ceph_monmap", and broadcasts it; the other
+ * ranks decode it into *monmap.
+ * Returns (mpi rank, mpi world size).  MPI is left initialized.
+ */
+pair<int,int> mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap)
+{
+  MPI_Init(&argc, &argv);
+
+  int mpi_world;
+  int mpi_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+  // first, synchronize clocks.
+  MPI_Barrier(MPI_COMM_WORLD);
+  g_clock.tare();
+
+  rank.start_rank(); // bind and listen
+
+  // what this process contributes to the gather.  kept in its own object
+  // because MPI does not allow the send buffer to overlap the receive
+  // buffer (on rank 0 it used to be the first slot of the receive array).
+  entity_inst_t myinst;
+  if (mpi_rank < g_conf.num_mon) {
+    myinst.set_addr( rank.get_listen_addr() );
+  }
+
+  // one slot per process; only the first g_conf.num_mon are used, and the
+  // contents are only meaningful on rank 0
+  vector<entity_inst_t> moninst(mpi_world);
+
+  MPI_Gather( &myinst, sizeof(entity_inst_t), MPI_CHAR,
+              &moninst[0], sizeof(entity_inst_t), MPI_CHAR,
+              0, MPI_COMM_WORLD);
+
+  if (mpi_rank == 0) {
+    for (int i=0; i<g_conf.num_mon; i++) {
+      cerr << "mon" << i << " is at " << moninst[i] << endl;
+      monmap->mon_inst[i] = moninst[i];
+    }
+  }
+
+  // distribute monmap.  the encoded length goes first so that every rank
+  // passes the same count to the data broadcast and has a buffer of
+  // exactly the right size.
+  bufferlist bl;
+  int len = 0;
+  if (mpi_rank == 0) {
+    monmap->encode(bl);
+    monmap->write(".ceph_monmap");
+    len = bl.length();
+  }
+
+  MPI_Bcast(&len, 1, MPI_INT,
+            0, MPI_COMM_WORLD);
+
+  if (mpi_rank > 0) {
+    bufferptr bp(len);
+    bl.append(bp);
+  }
+
+  MPI_Bcast(bl.c_str(), len, MPI_CHAR,
+            0, MPI_COMM_WORLD);
+
+  if (mpi_rank > 0) {
+    monmap->decode(bl);
+  }
+
+  // wait for everyone!
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  return pair<int,int>(mpi_rank, mpi_world);
+}
+
+utime_t tick_start;      // reference time that every tick deadline is computed from
+int tick_count = 0;      // total offset (added to the seconds field) of the next tick
+
+// Self-rearming timer event: logs the elapsed time, then schedules the next
+// tick at tick_start + tick_count so that scheduling error does not accumulate.
+class C_Tick : public Context {
+public:
+  void finish(int)
+  {
+    utime_t elapsed = g_clock.now() - tick_start;
+    dout(0) << "tick +" << g_conf.tick << " -> " << elapsed << " (" << tick_count << ")" << endl;
+
+    tick_count += g_conf.tick;
+
+    utime_t deadline = tick_start;
+    deadline.sec_ref() += tick_count;
+    g_timer.add_event_at(deadline, new C_Tick);
+  }
+};
+
+// Timer event that terminates the whole process with exit status 1.
+class C_Die : public Context {
+public:
+  void finish(int)
+  {
+    cerr << "die" << endl;
+    exit(1);
+  }
+};
+
+// Timer event that swaps in the "debug_after" settings: every field of g_conf
+// from `debug` up to (not including) `debug_after` is overwritten with the
+// corresponding bytes of g_debug_after_conf.
+// NOTE(review): assumes both structs share the same layout -- confirm in config.h.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    // Measure the span in BYTES.  Subtracting the typed member pointers
+    // directly would give a count of elements, which memcpy would then
+    // misread as a (much smaller) byte count.
+    int size = (int)((char*)&g_conf.debug_after - (char*)&g_conf.debug);
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+/*
+ * newsyn entry point: every MPI rank runs this and, depending on its rank,
+ * hosts some combination of monitor, mds, osd and synthetic clients.
+ *
+ * Layout of ranks (after the optional skipped rank 0): mds first, then osds
+ * (one per rank, or striped several per rank), then clients (or overlaid on
+ * the osd ranks when ms_overlay_clients is set).
+ */
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  // pull out "--kill_osd_after <osd> <delay>"; everything else is passed on.
+  map<int,int> kill_osd_after;
+  {
+    vector<char*> nargs;
+    for (unsigned i=0; i<args.size(); i++) {
+      if (strcmp(args[i],"--kill_osd_after") == 0) {
+        // both operands must be present, or we would read past the end of args.
+        if (i + 2 >= args.size()) {
+          cerr << "--kill_osd_after requires two arguments: <osd> <delay>" << endl;
+          exit(1);
+        }
+        int o = atoi(args[++i]);
+        int w = atoi(args[++i]);
+        kill_osd_after[o] = w;
+      }
+      else {
+        nargs.push_back( args[i] );
+      }
+    }
+    args.swap(nargs);
+  }
+
+  parse_config_options(args);
+  parse_syn_options(args);
+
+  if (g_conf.kill_after)
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after)
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+  if (g_conf.tick) {
+    tick_start = g_clock.now();
+    g_timer.add_event_after(g_conf.tick, new C_Tick);
+  }
+
+  // anything the option parsers did not consume is an error.
+  for (unsigned i=0; i<args.size(); i++)
+    cerr << "stray arg " << args[i] << endl;
+  assert(args.empty());
+
+
+  // start up messenger via MPI
+  MonMap *monmap = new MonMap(g_conf.num_mon);
+  pair<int,int> mpiwho = mpi_bootstrap_new(argc, argv, monmap);
+  int myrank = mpiwho.first;
+  int world = mpiwho.second;
+
+  // how many ranks does this configuration need?
+  int need = 0;
+  if (g_conf.ms_skip_rank0) need++;
+  need += NUMMDS;
+  if (g_conf.ms_stripe_osds)
+    need++;
+  else
+    need += NUMOSD;
+  if (NUMCLIENT) {
+    if (!g_conf.ms_overlay_clients)
+      need += 1;
+  }
+  assert(need <= world);
+
+  if (myrank == 0)
+    cerr << "nummds " << NUMMDS << "  numosd " << NUMOSD << "  numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl;
+
+
+  char hostname[100];
+  gethostname(hostname,100);
+  hostname[sizeof(hostname)-1] = 0;   // gethostname may not terminate a truncated name
+  int pid = getpid();
+
+  int started = 0;
+
+  //if (myrank == 0) g_conf.debug = 20;
+
+  // create mon
+  if (myrank < g_conf.num_mon) {
+    Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap);
+    mon->init();
+  }
+
+
+  // wait for monitors to start.
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  // okay, home free!
+  MPI_Finalize();
+
+
+  // create mds
+  map<int,MDS*> mds;
+  map<int,OSD*> mdsosd;
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != g_conf.ms_skip_rank0+i) continue;
+    Messenger *m = rank.register_entity(MSG_ADDR_MDS(i));
+    cerr << "mds" << i << " at " << rank.my_inst << " " << hostname << "." << pid << endl;
+    mds[i] = new MDS(i, m, monmap);
+    mds[i]->init();
+    started++;
+
+    if (g_conf.mds_local_osd) {
+      mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap);
+      mdsosd[i]->init();
+    }
+  }
+
+  // create osd
+  map<int,OSD*> osd;
+  int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0;  // assumes 0 clients, if we stripe.
+  // guard the division: with no ranks left for osds the value is unused anyway.
+  int osds_per_node = (max_osd_nodes > 0) ? (NUMOSD-1)/max_osd_nodes + 1 : 1;
+  for (int i=0; i<NUMOSD; i++) {
+    if (g_conf.ms_stripe_osds) {
+      if (myrank != g_conf.ms_skip_rank0+NUMMDS + i / osds_per_node) continue;
+    } else {
+      if (myrank != g_conf.ms_skip_rank0+NUMMDS + i) continue;
+    }
+
+    if (kill_osd_after.count(i))
+      g_timer.add_event_after(kill_osd_after[i], new C_Die);
+
+    Messenger *m = rank.register_entity(MSG_ADDR_OSD(i));
+    cerr << "osd" << i << " at " << rank.my_inst <<  " " << hostname << "." << pid << endl;
+    osd[i] = new OSD(i, m, monmap);
+    osd[i]->init();
+    started++;
+  }
+
+  if (g_conf.ms_overlay_clients) sleep(5);
+
+  // create client
+  int skip_osd = NUMOSD;
+  if (g_conf.ms_overlay_clients)
+    skip_osd = 0;        // put clients with osds too!
+  int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0;
+  int clients_per_node = 1;
+  if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1;
+  set<int> clientlist;
+  map<int,Client *> client;//[NUMCLIENT];
+  map<int,SyntheticClient *> syn;//[NUMCLIENT];
+  for (int i=0; i<NUMCLIENT; i++) {
+    //if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
+    if (myrank != g_conf.ms_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
+    clientlist.insert(i);
+    client[i] = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap);
+
+    // logger?  (one shared logger per node, created by the first client)
+    if (client_logger == 0) {
+      char s[80];
+      sprintf(s,"clnode.%d", myrank);
+      client_logger = new Logger(s, &client_logtype);
+
+      client_logtype.add_inc("lsum");
+      client_logtype.add_inc("lnum");
+      client_logtype.add_inc("lwsum");
+      client_logtype.add_inc("lwnum");
+      client_logtype.add_inc("lrsum");
+      client_logtype.add_inc("lrnum");
+      client_logtype.add_inc("trsum");
+      client_logtype.add_inc("trnum");
+      client_logtype.add_inc("wrlsum");
+      client_logtype.add_inc("wrlnum");
+      client_logtype.add_inc("lstatsum");
+      client_logtype.add_inc("lstatnum");
+      client_logtype.add_inc("ldirsum");
+      client_logtype.add_inc("ldirnum");
+      client_logtype.add_inc("readdir");
+      client_logtype.add_inc("stat");
+    }
+
+    client[i]->init();
+    started++;
+
+    syn[i] = new SyntheticClient(client[i]);
+  }
+
+  if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
+
+  // mount every local client and kick off its workload thread.
+  int nclients = 0;
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
+    client[i]->mount();
+    syn[i]->start_thread();
+
+    nclients++;
+  }
+  if (nclients) {
+    cerr << nclients << " clients at " << rank.my_inst << " " << hostname << "." << pid << endl;
+  }
+
+  // wait for the workloads to finish, then tear the clients down.
+  for (set<int>::iterator it = clientlist.begin();
+       it != clientlist.end();
+       it++) {
+    int i = *it;
+
+    //      cout << "waiting for synthetic client" << i << " to finish" << endl;
+    syn[i]->join_thread();
+    delete syn[i];
+
+    client[i]->unmount();
+    //cout << "client" << i << " unmounted" << endl;
+    client[i]->shutdown();
+
+    delete client[i];
+  }
+
+
+  if (myrank && !started) {
+    //dout(1) << "IDLE" << endl;
+    cerr << "idle at " << rank.my_inst << " " << hostname << "." << pid << endl;
+    //rank.stop_rank();
+  }
+
+  // wait for everything to finish
+  rank.wait();
+
+  if (started) cerr << "newsyn finishing" << endl;
+
+  return 0;  // whatever, cleanup hangs sometimes (stopping ebofs threads?).
+
+
+  // cleanup (intentionally unreachable, see the return above).  clients and
+  // synthetic clients were already deleted in the join loop.
+  for (map<int,MDS*>::iterator i = mds.begin(); i != mds.end(); i++)
+    delete i->second;
+  for (map<int,OSD*>::iterator i = mdsosd.begin(); i != mdsosd.end(); i++)
+    delete i->second;
+  for (map<int,OSD*>::iterator i = osd.begin(); i != osd.end(); i++)
+    delete i->second;
+
+  return 0;
+}
+
--- /dev/null
+
+#include "include/types.h"
+
+#include "Ager.h"
+#include "ObjectStore.h"
+
+#include "config.h"
+#include "common/Clock.h"
+
+// ick
+#include "ebofs/Ebofs.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef DARWIN
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif // DARWIN
+
+
+// Reproducible "random" source: reseeds the C generator with an incrementing
+// counter (0, 1, 2, ...) on every call and returns the first value of that
+// sequence, so a run always yields the same series of numbers.
+int myrand()
+{
+  static int seed = 0;
+  srand(seed++);
+  return rand();
+}
+
+
+// Hand out an object id: recycle the oldest freed id when one is available,
+// otherwise return the current id and advance its block number.
+object_t Ager::age_get_oid() {
+  if (age_free_oids.empty()) {
+    object_t fresh = age_cur_oid;
+    ++age_cur_oid.bno;
+    return fresh;
+  }
+
+  object_t recycled = age_free_oids.front();
+  age_free_oids.pop_front();
+  return recycled;
+}
+
+// Pick an object size in bytes: sample the size distribution (scaled by 1024)
+// and return a value between half of it and roughly all of it, plus one.
+ssize_t Ager::age_pick_size() {
+  ssize_t ceiling = file_size_distn.sample() * 1024;
+  int pct = myrand() % 100;
+  return ceiling/2 + pct * ceiling/200 + 1;
+}
+
+bool start_debug = false;
+
+// Write zero-filled objects of random size until the store's used fraction
+// reaches `pc` or the clock passes `until`.  Each object is filed into one of
+// ten random buckets so age_empty() can later remove it.
+// Returns the amount written, counted in 4096-byte units and multiplied by 4.
+__uint64_t Ager::age_fill(float pc, utime_t until) {
+  int chunk = 1024*1024;
+  bufferptr zeros(chunk);
+  zeros.zero();
+  bufferlist bl;
+  bl.push_back(zeros);
+
+  __uint64_t wrote = 0;
+  for (;;) {
+    if (g_clock.now() > until) break;
+
+    // fraction of blocks that are not free / not available to us
+    struct statfs st;
+    store->statfs(&st);
+    float used = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);
+    float unavail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
+    if (used >= pc) {
+      dout(2) << "age_fill at " << used << " / " << unavail << " / " << " / " << pc << " stopping" << endl;
+      break;
+    }
+
+    // make sure we can write to it..
+    if (unavail > .98 || unavail - used > .02)
+      store->sync();
+
+    object_t oid = age_get_oid();
+
+    int bucket = myrand() % 10;
+    age_objects[bucket].push_back(oid);
+
+    ssize_t s = age_pick_size();
+    wrote += (s + 4095) / 4096;
+
+    dout(2) << "age_fill at " << used << " / " << unavail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl;
+
+    // debugging hook, permanently disabled by the leading `false`.
+    if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) {
+      g_conf.debug_ebofs = 30;
+      g_conf.ebofs_verify = true;
+    }
+
+    // write the object out in pieces of at most `chunk` bytes.
+    off_t off = 0;
+    while (s) {
+      ssize_t t = MIN(s, chunk);
+      bufferlist piece;
+      piece.substr_of(bl, 0, t);
+      store->write(oid, off, t, piece, false);
+      off += t;
+      s -= t;
+    }
+    oid.bno++;
+  }
+
+  return wrote*4; // KB
+}
+
+// Remove previously written objects, picked from random buckets, until the
+// store's used fraction drops to `pc`.  Removed ids go onto the free list for
+// reuse.  Turns g_conf.ebofs_verify off on the way out.
+void Ager::age_empty(float pc) {
+  int batch = 20;
+  int countdown = batch;
+
+  //g_conf.ebofs_verify = true;
+
+  for (;;) {
+    struct statfs st;
+    store->statfs(&st);
+    float used = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks);
+    float unavail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks);  // to write to
+    dout(2) << "age_empty at " << used << " / " << unavail << " / " << pc << endl;//" stopping" << endl;
+    if (used <= pc) {
+      dout(2) << "age_empty at " << used << " / " << unavail << " / " << pc << " stopping" << endl;
+      break;
+    }
+
+    int bucket = myrand() % 10;
+    // every `batch` iterations, or whenever the chosen bucket is empty,
+    // skip this round (the actual sync calls are disabled).
+    if (--countdown == 0 || age_objects[bucket].empty()) {
+      dout(2) << "age_empty sync" << endl;
+      //sync();
+      //sync();
+      countdown = batch;
+      continue;
+    }
+
+    object_t victim = age_objects[bucket].front();
+    age_objects[bucket].pop_front();
+
+    dout(2) << "age_empty at " << used << " / " << unavail << " / " << pc << " removing " << hex << victim << dec << endl;
+
+    store->remove(victim);
+    age_free_oids.push_back(victim);
+  }
+
+  g_conf.ebofs_verify = false;
+}
+
+// Print one tab-separated fragmentation report (header line, then a data line)
+// to stdout.  After the summary columns, emits the count/sum pair for extent
+// buckets 1..30, stopping early once all st.num_extent extents are accounted for.
+void pfrag(__uint64_t written, ObjectStore::FragmentationStat &st)
+{
+  cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..."
+       << endl;
+
+  cout << written;
+  cout << "\t" << st.total;
+  cout << "\t" << st.num_extent;
+  cout << "\t" << st.avg_extent;
+  cout << "\t" << st.avg_extent_per_object;
+  cout << "\t" << st.avg_extent_jump;
+  cout << "\t" << st.total_free;
+  cout << "\t" << st.num_free_extent;
+  cout << "\t" << st.avg_free_extent;
+
+  int remaining = st.num_extent;
+  for (__uint64_t bucket=1; bucket <= 30; bucket += 1) {
+    cout << "\t" << st.extent_dist[bucket]
+         << "\t" << st.extent_dist_sum[bucket];
+    remaining -= st.extent_dist[bucket];
+    if (remaining == 0) break;
+  }
+  cout << endl;
+}
+
+
+// Age the store: repeatedly fill it to `high_water` and empty it back down,
+// printing fragmentation statistics after every cycle.  Stops after `count`
+// cycles or once `time` seconds have elapsed, whichever comes first.  The
+// last cycle empties to `final_water` instead of `low_water`.
+//
+// If fake_size_mb is nonzero the three water marks are rescaled by
+// (fake_size_mb * 256) / f_blocks.
+// NOTE(review): the factor 256 presumably assumes 4 KB blocks -- confirm.
+//
+// NOTE: this function currently never returns; it calls exit(0) after saving
+// the final freelist (see the "hack" below).
+void Ager::age(int time,
+ float high_water, // fill to this %
+ float low_water, // then empty to this %
+ int count, // this many times
+ float final_water, // and end here ( <= low_water)
+ int fake_size_mb) {
+
+ store->_fake_writes(true);
+ // fixed seed, so every run starts from the same C random sequence
+ srand(0);
+
+ // overall deadline: `time` is added to the seconds field of the start time
+ utime_t start = g_clock.now();
+ utime_t until = start;
+ until.sec_ref() += time;
+
+ // a numbered freelist snapshot is saved once at least freelist_inc seconds
+ // have passed since the previous one (checked only at the end of a cycle)
+ int elapsed = 0;
+ int freelist_inc = 60;
+ utime_t nextfl = start;
+ nextfl.sec_ref() += freelist_inc;
+
+ // make sure the ten object buckets used by age_fill/age_empty exist
+ while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );
+
+ if (fake_size_mb) {
+ int fake_bl = fake_size_mb * 256;
+ struct statfs st;
+ store->statfs(&st);
+ float f = (float)fake_bl / (float)st.f_blocks;
+ high_water = (float)high_water * f;
+ low_water = (float)low_water * f;
+ final_water = (float)final_water * f;
+ dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl;
+ }
+
+ // init size distn (once)
+ if (!did_distn) {
+ did_distn = true;
+ age_cur_oid = object_t(0,1);
+ file_size_distn.add(1, 19.0758125+0.65434375);
+ file_size_distn.add(512, 35.6566);
+ file_size_distn.add(1024, 27.7271875);
+ file_size_distn.add(2*1024, 16.63503125);
+ //file_size_distn.add(4*1024, 106.82384375);
+ //file_size_distn.add(8*1024, 81.493375);
+ //file_size_distn.add(16*1024, 14.13553125);
+ //file_size_distn.add(32*1024, 2.176);
+ //file_size_distn.add(256*1024, 0.655938);
+ //file_size_distn.add(512*1024, 0.1480625);
+ //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+ file_size_distn.normalize();
+ }
+
+ // clear
+ for (int i=0; i<10; i++)
+ age_objects[i].clear();
+
+ ObjectStore::FragmentationStat st;
+
+ // running total of what age_fill reports having written
+ __uint64_t wrote = 0;
+
+ for (int c=1; c<=count; c++) {
+ if (g_clock.now() > until) break;
+
+ //if (c == 7) start_debug = true;
+
+ dout(1) << "#age " << c << "/" << count << " filling to " << high_water << endl;
+ __uint64_t w = age_fill(high_water, until);
+ //dout(1) << "age wrote " << w << endl;
+ wrote += w;
+ //store->sync();
+ //store->_get_frag_stat(st);
+ //pfrag(st);
+
+
+ // the last cycle drains to final_water, all others to low_water
+ if (c == count) {
+ dout(1) << "#age final empty to " << final_water << endl;
+ age_empty(final_water);
+ } else {
+ dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl;
+ age_empty(low_water);
+ }
+ //store->sync();
+ //store->sync();
+
+ // show frag state
+ store->_get_frag_stat(st);
+ pfrag(wrote / (1024ULL*1024ULL) , // GB
+ st);
+
+ // dump freelist?
+ if (g_clock.now() > nextfl) {
+ elapsed += freelist_inc;
+ save_freelist(elapsed);
+ nextfl.sec_ref() += freelist_inc;
+ }
+ }
+
+ // dump the freelist
+ save_freelist(0);
+ // everything below this exit() is unreachable
+ exit(0); // hack
+
+ // ok!
+ store->_fake_writes(false);
+ store->sync();
+ store->sync();
+ dout(1) << "age finished" << endl;
+}
+
+
+// Read the file "ebofs.freelist" from the current directory and import it
+// into the store (which must be an Ebofs), then sync twice.
+// A missing or unreadable file is a fatal error (assert); with asserts
+// compiled out the function logs and returns without importing anything.
+void Ager::load_freelist()
+{
+  dout(1) << "load_freelist" << endl;
+
+  int fd = ::open("ebofs.freelist", O_RDONLY);
+  if (fd < 0) {
+    dout(0) << "load_freelist: couldn't open ebofs.freelist" << endl;
+    assert(fd >= 0);
+    return;
+  }
+
+  // size the buffer from the descriptor we actually opened.
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r != 0) {
+    dout(0) << "load_freelist: couldn't stat ebofs.freelist" << endl;
+    ::close(fd);
+    assert(r == 0);
+    return;
+  }
+
+  bufferptr bp(st.st_size);
+  bufferlist bl;
+  bl.push_back(bp);
+
+  // read() may return fewer bytes than asked for; keep going until done.
+  char *p = bl.c_str();
+  off_t left = st.st_size;
+  while (left > 0) {
+    ssize_t got = ::read(fd, p, left);
+    if (got <= 0) {
+      dout(0) << "load_freelist: read failed or hit early EOF with " << left << " bytes missing" << endl;
+      ::close(fd);
+      assert(0);
+      return;
+    }
+    p += got;
+    left -= got;
+  }
+  ::close(fd);
+
+  ((Ebofs*)store)->_import_freelist(bl);
+  store->sync();
+  store->sync();
+}
+
+// Export the store's freelist (the store must be an Ebofs) and write it to
+// "ebofs.freelist.<el>" in the current directory, replacing any existing file.
+void Ager::save_freelist(int el)
+{
+  dout(1) << "save_freelist " << el << endl;
+  char s[100];
+  snprintf(s, sizeof(s), "ebofs.freelist.%d", el);
+  bufferlist bl;
+  ((Ebofs*)store)->_export_freelist(bl);
+
+  ::unlink(s);
+  // O_CREAT requires the mode argument; omitting it passes garbage to open().
+  int fd = ::open(s, O_CREAT|O_WRONLY, 0644);
+  if (fd < 0) {
+    dout(0) << "save_freelist: couldn't create " << s << endl;
+    return;
+  }
+  ::fchmod(fd, 0644);   // force the mode regardless of the umask
+
+  // write() may be short; loop until the whole buffer is out.
+  const char *p = bl.c_str();
+  size_t left = bl.length();
+  while (left > 0) {
+    ssize_t did = ::write(fd, p, left);
+    if (did <= 0) {
+      dout(0) << "save_freelist: write to " << s << " failed with " << left << " bytes left" << endl;
+      break;
+    }
+    p += did;
+    left -= did;
+  }
+  ::close(fd);
+}
--- /dev/null
+#ifndef __AGER_H
+#define __AGER_H
+
+#include "include/types.h"
+#include "include/Distribution.h"
+#include "ObjectStore.h"
+#include "common/Clock.h"
+
+#include <list>
+#include <vector>
+using namespace std;
+
+// Drives an ObjectStore through fill/empty cycles to "age" it (fragment its
+// free space), and can save/load the resulting freelist.
+class Ager {
+  ObjectStore *store;                    // store being aged (not owned here)
+
+ private:
+  list<object_t>          age_free_oids; // ids of removed objects, available for reuse
+  object_t                age_cur_oid;   // next never-used id; set on the first age() call
+  vector< list<object_t> > age_objects;  // live objects, spread over buckets
+  Distribution            file_size_distn; //kb
+  bool                    did_distn;     // true once file_size_distn has been set up
+
+  void       age_empty(float pc);
+  __uint64_t age_fill(float pc, utime_t until);
+  ssize_t    age_pick_size();
+  object_t   age_get_oid();
+
+ public:
+  Ager(ObjectStore *s)
+    : store(s),
+      did_distn(false)
+  {}
+
+  void age(int time,
+           float high_water,    // fill to this %
+           float low_water,     // then empty to this %
+           int count,           // this many times
+           float final_water,   // and end here ( <= low_water)
+           int fake_size_mb=0);
+
+  void save_freelist(int);
+  void load_freelist();
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BERKELEYDB_H
+#define __BERKELEYDB_H
+
+#include <db.h>
+#include <unistd.h>
+
+#include <list>
+using namespace std;
+
+
+/*
+ * Thin map-like wrapper around a Berkeley DB btree file.
+ *
+ * Keys and values are stored as the raw bytes of K and D, so both must be
+ * fixed-size, trivially copyable types.  Methods other than open/close/remove
+ * require the map to be open.
+ */
+template<typename K, typename D>
+class BDBMap {
+ private:
+  DB *dbp;
+
+ public:
+  BDBMap() : dbp(0) {}
+  ~BDBMap() {
+    close();
+  }
+
+  bool is_open() { return dbp ? true:false; }
+
+  // open/close
+
+  // Open (creating if needed) the database file fn.  Any handle that is
+  // already open is closed first.  Failure trips an assert; with asserts
+  // compiled out the BDB error code is returned and the map stays closed.
+  int open(const char *fn) {
+    //cout << "open " << fn << endl;
+    close();   // don't leak a previously opened handle
+
+    int r;
+    if ((r = db_create(&dbp, NULL, 0)) != 0) {
+      cerr << "db_create: " << db_strerror(r) << endl;
+      dbp = 0;
+      assert(0);
+      return r;
+    }
+
+    dbp->set_errfile(dbp, stderr);
+    dbp->set_errpfx(dbp, "bdbmap");
+
+    r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644);
+    if (r != 0) {
+      dbp->err(dbp, r, "%s", fn);
+      // a handle whose open failed must still be discarded with close.
+      dbp->close(dbp, 0);
+      dbp = 0;
+    }
+    assert(r == 0);
+    return r;
+  }
+  void close() {
+    if (dbp) {
+      dbp->close(dbp,0);
+      dbp = 0;
+    }
+  }
+
+  // Delete the database file fn.  DB->remove may only be called on a handle
+  // that has never been opened, so our own handle is closed and a fresh one
+  // is created just for the removal (DB->remove consumes that handle).
+  void remove(const char *fn) {
+    close();
+    DB *tmp = 0;
+    if (db_create(&tmp, NULL, 0) == 0 && tmp) {
+      tmp->remove(tmp, fn, 0, 0);
+    } else {
+      ::unlink(fn);
+    }
+  }
+
+  // accessors
+
+  // Store data under key, replacing any existing record.  Returns the BDB result.
+  int put(K key,
+          D data) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(K);
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    d.data = &data;
+    d.size = sizeof(data);
+    return dbp->put(dbp, NULL, &k, &d, 0);
+  }
+
+  // Look up key; on success (return 0) the record is copied into data.
+  // Returns the BDB result (e.g. DB_NOTFOUND) otherwise, leaving data untouched.
+  int get(K key,
+          D& data) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(key);
+    // With no DBT flags set, BDB returns a pointer to its own buffer in
+    // d.data instead of filling a caller buffer, so copy the record out.
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    int r = dbp->get(dbp, NULL, &k, &d, 0);
+    if (r == 0) {
+      assert(d.size == sizeof(data));
+      memcpy(&data, d.data, sizeof(data));
+    }
+    return r;
+  }
+
+  // Delete the record stored under key.  Returns the BDB result.
+  int del(K key) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(key);
+    return dbp->del(dbp, NULL, &k, 0);
+  }
+
+  // Append every key in the database to ls, in cursor order.
+  int list_keys(list<K>& ls) {
+    DBC *cursor = 0;
+    int r = dbp->cursor(dbp, NULL, &cursor, 0);
+    assert(r == 0);
+
+    DBT k,d;
+    memset(&k, 0, sizeof(k));
+    memset(&d, 0, sizeof(d));
+
+    while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) {
+      K key;
+      assert(k.size == sizeof(key));
+      memcpy(&key, k.data, k.size);
+      ls.push_back(key);
+    }
+    if (r != DB_NOTFOUND) {
+      dbp->err(dbp, r, "DBcursor->get");
+      assert(r == DB_NOTFOUND);
+    }
+
+    cursor->c_close(cursor);
+    return 0;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FAKE_H
+#define __FAKE_H
+
+#include "include/types.h"
+
+#include <list>
+#include <set>
+#include <ext/hash_map>
+using namespace std;
+using namespace __gnu_cxx;
+
+// In-memory emulation of object collections for stores that lack native
+// support.  All state lives in a hash_map guarded by faker_lock; when a
+// caller passes an onsafe context it is handed to store->sync().
+class FakeStoreCollections {
+ private:
+  Mutex faker_lock;
+  ObjectStore *store;
+  hash_map<coll_t, set<object_t> > fakecollections;
+
+ public:
+  FakeStoreCollections(ObjectStore *s) : store(s) {}
+
+  // faked collections
+
+  // Append all collection ids to ls; returns how many there are.
+  int list_collections(list<coll_t>& ls) {
+    faker_lock.Lock();
+    int r = 0;
+    for (hash_map< coll_t, set<object_t> >::iterator p = fakecollections.begin();
+         p != fakecollections.end();
+         p++) {
+      r++;
+      ls.push_back(p->first);
+    }
+    faker_lock.Unlock();
+    return r;
+  }
+
+  // Create collection c if it does not exist yet.  Always returns 0.
+  int create_collection(coll_t c,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c];   // operator[] default-inserts an empty set
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  // Destroy collection c.  Returns 0, or -1 if it did not exist (in which
+  // case onsafe is not scheduled).
+  int destroy_collection(coll_t c,
+                         Context *onsafe=0) {
+    int r = 0;
+    faker_lock.Lock();
+    if (fakecollections.count(c)) {
+      fakecollections.erase(c);
+      //fakecattr.erase(c);
+      if (onsafe) store->sync(onsafe);
+    } else
+      r = -1;
+    faker_lock.Unlock();
+    return r;
+  }
+
+  // Returns 0 if c exists, -1 otherwise.  *st is not filled in.
+  int collection_stat(coll_t c, struct stat *st) {
+    return collection_exists(c) ? 0:-1;
+  }
+
+  bool collection_exists(coll_t c) {
+    faker_lock.Lock();
+    int r = fakecollections.count(c);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  // Add o to collection c, creating c if necessary.  Always returns 0.
+  int collection_add(coll_t c, object_t o,
+                     Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c].insert(o);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  // Remove o from collection c.  Always returns 0.  Uses find() so that
+  // removing from a nonexistent collection does not create it.
+  int collection_remove(coll_t c, object_t o,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    hash_map< coll_t, set<object_t> >::iterator p = fakecollections.find(c);
+    if (p != fakecollections.end())
+      p->second.erase(o);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  // Append the members of collection c to o; returns how many were added
+  // (0 for a nonexistent collection, which is NOT created by listing it).
+  int collection_list(coll_t c, list<object_t>& o) {
+    faker_lock.Lock();
+    int r = 0;
+    hash_map< coll_t, set<object_t> >::iterator coll = fakecollections.find(c);
+    if (coll != fakecollections.end()) {
+      for (set<object_t>::iterator p = coll->second.begin();
+           p != coll->second.end();
+           p++) {
+        o.push_back(*p);
+        r++;
+      }
+    }
+    faker_lock.Unlock();
+    return r;
+  }
+
+};
+
+// In-memory emulation of object and collection attributes for stores that
+// lack native support.  All state is guarded by faker_lock; when a caller
+// passes an onsafe context it is handed to store->sync().
+//
+// Lookups use find() rather than operator[] so that reading or removing
+// attributes of an unknown object/collection does not create an empty entry.
+class FakeStoreAttrs {
+ private:
+
+  // The attributes of a single object or collection: name -> value.
+  class FakeAttrSet {
+  public:
+    map<string, bufferptr> attrs;
+
+    // Copy up to size bytes of attribute `name` into value.
+    // Returns the number of bytes copied, or -1 if there is no such attribute.
+    int getattr(const char *name, void *value, size_t size) {
+      map<string, bufferptr>::iterator p = attrs.find(name);
+      if (p == attrs.end())
+        return -1;
+      size_t l = MIN( p->second.length(), size );
+      bufferlist bl;
+      bl.append(p->second);
+      bl.copy(0, l, (char*)value);
+      return l;
+    }
+    int getattrs(map<string,bufferptr>& aset) {
+      aset = attrs;
+      return 0;
+    }
+    int setattrs(map<string,bufferptr>& aset) {
+      attrs = aset;
+      return 0;
+    }
+
+    // Set attribute `name` to a private copy of the size bytes at value.
+    int setattr(const char *name, const void *value, size_t size) {
+      string n = name;
+      bufferptr bp = buffer::copy((char*)value, size);
+      attrs[n] = bp;
+      return 0;
+    }
+
+    // not implemented
+    int listattr(char *attrs, size_t size) {
+      assert(0);
+      return 0;
+    }
+
+    int rmattr(const char *name) {
+      string n = name;
+      attrs.erase(n);
+      return 0;
+    }
+
+    bool empty() { return attrs.empty(); }
+  };
+
+  Mutex faker_lock;
+  ObjectStore *store;
+  hash_map<object_t, FakeAttrSet> fakeoattrs;
+  hash_map<coll_t, FakeAttrSet> fakecattrs;
+
+ public:
+  FakeStoreAttrs(ObjectStore *s) : store(s) {}
+
+  // ---- object attributes ----
+
+  int setattr(object_t oid, const char *name,
+              const void *value, size_t size,
+              Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  int setattrs(object_t oid, map<string,bufferptr>& aset) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].setattrs(aset);
+    faker_lock.Unlock();
+    return r;
+  }
+  // Returns bytes copied, or -1 if the object or the attribute is unknown.
+  int getattr(object_t oid, const char *name,
+              void *value, size_t size) {
+    faker_lock.Lock();
+    int r = -1;
+    hash_map<object_t, FakeAttrSet>::iterator p = fakeoattrs.find(oid);
+    if (p != fakeoattrs.end())
+      r = p->second.getattr(name, value, size);
+    faker_lock.Unlock();
+    return r;
+  }
+  // Replace aset with the object's attributes (empty if it has none).
+  int getattrs(object_t oid, map<string,bufferptr>& aset) {
+    faker_lock.Lock();
+    int r = 0;
+    hash_map<object_t, FakeAttrSet>::iterator p = fakeoattrs.find(oid);
+    if (p != fakeoattrs.end())
+      r = p->second.getattrs(aset);
+    else
+      aset.clear();
+    faker_lock.Unlock();
+    return r;
+  }
+  // Remove one attribute; the object's entry is dropped once it is empty.
+  int rmattr(object_t oid, const char *name,
+             Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = 0;
+    hash_map<object_t, FakeAttrSet>::iterator p = fakeoattrs.find(oid);
+    if (p != fakeoattrs.end()) {
+      r = p->second.rmattr(name);
+      if (p->second.empty())
+        fakeoattrs.erase(p);
+    }
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  // not implemented (FakeAttrSet::listattr asserts)
+  int listattr(object_t oid, char *attrs, size_t size) {
+    faker_lock.Lock();
+    int r = fakeoattrs[oid].listattr(attrs,size);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  // ---- collection attributes ----
+
+  int collection_setattr(coll_t c, const char *name,
+                         void *value, size_t size,
+                         Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  // Remove one attribute; the collection's entry is dropped once it is empty.
+  int collection_rmattr(coll_t c, const char *name,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    int r = 0;
+    hash_map<coll_t, FakeAttrSet>::iterator p = fakecattrs.find(c);
+    if (p != fakecattrs.end()) {
+      r = p->second.rmattr(name);
+      if (p->second.empty())
+        fakecattrs.erase(p);
+    }
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return r;
+  }
+  // Returns bytes copied, or -1 if the collection or the attribute is unknown.
+  int collection_getattr(coll_t c, const char *name,
+                         void *value, size_t size) {
+    faker_lock.Lock();
+    int r = -1;
+    hash_map<coll_t, FakeAttrSet>::iterator p = fakecattrs.find(c);
+    if (p != fakecattrs.end())
+      r = p->second.getattr(name, value, size);
+    faker_lock.Unlock();
+    return r;
+  }
+  // not implemented (FakeAttrSet::listattr asserts)
+  int collection_listattr(coll_t c, char *attrs, size_t size) {
+    faker_lock.Lock();
+    int r = fakecattrs[c].listattr(attrs,size);
+    faker_lock.Unlock();
+    return r;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "FakeStore.h"
+#include "include/types.h"
+
+#include "common/Timer.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+//#include <sys/xattr.h>
+//#include <sys/vfs.h>
+
+#ifdef DARWIN
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif // DARWIN
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".fakestore "
+
+#include "include/buffer.h"
+
+#include <map>
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+// crap-a-crap hash
+#define HASH_DIRS 0x80
+#define HASH_MASK 0x7f
+// end crap hash
+
+
+
+
+
+
+
+// Mount the backing device (if g_conf.fakestore_dev is set) and verify that
+// the global base directory exists.  Returns 0 on success, otherwise the
+// nonzero result of stat() on basedir.
+int FakeStore::mount()
+{
+  if (g_conf.fakestore_dev) {
+    dout(0) << "mounting" << endl;
+    // build the command in a string: a fixed char[100] overflows on long paths.
+    string cmd = "mount ";
+    cmd += g_conf.fakestore_dev;
+    system(cmd.c_str());
+  }
+
+  string mydir;
+  get_dir(mydir);
+
+  dout(5) << "init with basedir " << mydir << endl;
+
+  // make sure global base dir exists
+  struct stat st;
+  int r = ::stat(basedir.c_str(), &st);
+  if (r != 0) {
+    dout(1) << "unable to stat basedir " << basedir << ", r = " << r << endl;
+    return r;
+  }
+
+  // all okay.
+  return 0;
+}
+
+// Unmount the backing device if g_conf.fakestore_dev is set.  Always returns 0.
+int FakeStore::umount()
+{
+  dout(5) << "finalize" << endl;
+
+  if (g_conf.fakestore_dev) {
+    dout(0) << "umounting" << endl;
+    // build the command in a string: a fixed char[100] overflows on long paths.
+    string cmd = "umount ";
+    cmd += g_conf.fakestore_dev;
+    system(cmd.c_str());
+  }
+
+  // nothing
+  return 0;
+}
+
+
+// Report filesystem statistics for this store's own directory.
+int FakeStore::statfs(struct statfs *buf)
+{
+  string path;
+  get_dir(path);
+  const char *p = path.c_str();
+  return ::statfs(p, buf);
+}
+
+
+
+
+// Set dir to this store's directory: "<basedir>/<whoami>".
+void FakeStore::get_dir(string& dir) {
+  char num[30];
+  sprintf(num, "%d", whoami);
+
+  string path = basedir;
+  path += "/";
+  path += num;
+  dir = path;
+}
+// Set fn to the path of the file backing object oid:
+//   "<basedir>/<whoami>/<hash bucket, 2 hex digits>/<ino>.<bno>.<rev>"
+// where the bucket is the object's hash masked with HASH_MASK.
+void FakeStore::get_oname(object_t oid, string& fn) {
+  char s[100];
+  static hash<object_t> H;
+  // each argument is cast to exactly the type its conversion expects; the
+  // hash in particular is a size_t, which must not be passed for %x as-is.
+  snprintf(s, sizeof(s), "%d/%02x/%016llx.%08x.%d",
+           whoami,
+           (unsigned)(H(oid) & HASH_MASK),
+           (unsigned long long)oid.ino,
+           (unsigned)oid.bno,
+           (int)oid.rev);
+  fn = basedir + "/" + s;
+  //  dout(1) << "oname is " << fn << endl;
+}
+
+
+
+// Unlink every entry of mydir whose name does not start with '.'.
+// Failures of individual unlink() calls are ignored.
+void FakeStore::wipe_dir(string mydir)
+{
+  DIR *dir = ::opendir(mydir.c_str());
+  if (!dir) {
+    dout(1) << "mkfs couldn't read dir " << mydir << endl;
+    return;
+  }
+
+  dout(10) << "wiping " << mydir << endl;
+
+  for (struct dirent *ent = ::readdir(dir); ent != 0; ent = ::readdir(dir)) {
+    if (ent->d_name[0] == '.') continue;
+    dout(25) << "mkfs unlinking " << ent->d_name << endl;
+    string victim = mydir + "/" + ent->d_name;
+    ::unlink(victim.c_str());
+  }
+
+  ::closedir(dir);
+}
+
+// Prepare an empty store: create (or wipe) this store's directory and each of
+// its HASH_DIRS hashed subdirectories.  If g_conf.fakestore_dev is set the
+// device is mounted for the duration and unmounted again before returning,
+// including when directory creation fails.
+// Returns 0 on success, otherwise the nonzero stat() result for the directory
+// that could not be created.
+int FakeStore::mkfs()
+{
+  if (g_conf.fakestore_dev) {
+    dout(0) << "mounting" << endl;
+    // build the command in a string: a fixed char[100] overflows on long paths.
+    string cmd = "mount ";
+    cmd += g_conf.fakestore_dev;
+    system(cmd.c_str());
+  }
+
+
+  int r = 0;
+  struct stat st;
+  string mydir;
+  get_dir(mydir);
+
+  dout(1) << "mkfs in " << mydir << endl;
+
+
+  // make sure my dir exists
+  r = ::stat(mydir.c_str(), &st);
+  if (r != 0) {
+    dout(10) << "creating " << mydir << endl;
+    mkdir(mydir.c_str(), 0755);
+    r = ::stat(mydir.c_str(), &st);   // re-stat to learn whether mkdir worked
+    if (r != 0) {
+      dout(1) << "couldnt create dir, r = " << r << endl;
+    }
+  }
+  else wipe_dir(mydir);
+
+  // hashed bits too (skipped, or stopped early, once something has failed)
+  for (int i=0; r == 0 && i<HASH_DIRS; i++) {
+    char s[4];
+    snprintf(s, sizeof(s), "%02x", i);
+    string subdir = mydir + "/" + s;
+    r = ::stat(subdir.c_str(), &st);
+    if (r != 0) {
+      dout(2) << " creating " << subdir << endl;
+      ::mkdir(subdir.c_str(), 0755);
+      r = ::stat(subdir.c_str(), &st);
+      if (r != 0) {
+        dout(1) << "couldnt create subdir, r = " << r << endl;
+      }
+    }
+    else
+      wipe_dir( subdir );
+  }
+
+  // always undo the mount above, even on failure.
+  if (g_conf.fakestore_dev) {
+    dout(0) << "umounting" << endl;
+    string cmd = "umount ";
+    cmd += g_conf.fakestore_dev;
+    system(cmd.c_str());
+  }
+
+  if (r == 0) {
+    dout(1) << "mkfs done in " << mydir << endl;
+  }
+
+  return r;
+}
+
+
+
+// True if the file backing oid can be stat()ed.
+bool FakeStore::exists(object_t oid)
+{
+  struct stat ignored;
+  return stat(oid, &ignored) == 0;
+}
+
+
+// stat() the file backing oid into *st; returns the result of ::stat().
+int FakeStore::stat(object_t oid,
+                    struct stat *st)
+{
+  dout(20) << "stat " << oid << endl;
+
+  string path;
+  get_oname(oid, path);
+  return ::stat(path.c_str(), st);
+}
+
+
+
+// Unlink the file backing oid and, if onsafe is given, schedule it via sync().
+// Returns the result of ::unlink().
+int FakeStore::remove(object_t oid, Context *onsafe)
+{
+  dout(20) << "remove " << oid << endl;
+
+  string path;
+  get_oname(oid, path);
+
+  int result = ::unlink(path.c_str());
+  if (onsafe)
+    sync(onsafe);
+  return result;
+}
+
+// Truncate the file backing oid to size bytes and, if onsafe is given,
+// schedule it via sync().  Returns the result of ::truncate().
+int FakeStore::truncate(object_t oid, off_t size, Context *onsafe)
+{
+  dout(20) << "truncate " << oid << " size " << size << endl;
+
+  string path;
+  get_oname(oid, path);
+
+  int result = ::truncate(path.c_str(), size);
+  if (onsafe)
+    sync(onsafe);
+  return result;
+}
+
+// Read up to len bytes at offset from the file backing oid and append them to
+// bl.  len == 0 means "as many bytes as the file is long".
+// Returns the number of bytes read (0 at EOF or if the seek did not land on
+// offset), or a negative value if the file cannot be opened or read.
+int FakeStore::read(object_t oid,
+                    off_t offset, size_t len,
+                    bufferlist& bl) {
+  dout(20) << "read " << oid << " len " << len << " off " << offset << endl;
+
+  string fn;
+  get_oname(oid,fn);
+
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    dout(10) << "read couldn't open " << fn.c_str() << " errno " << errno << " " << strerror(errno) << endl;
+    return fd;
+  }
+  ::flock(fd, LOCK_EX);    // lock for safety
+
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  // signed, so that a failed read() (-1) is not mistaken for a huge length.
+  ssize_t got = 0;
+
+  if (len == 0) {
+    struct stat st;
+    if (fstat(fd, &st) == 0)
+      len = st.st_size;
+  }
+
+  if (actual == offset) {
+    bufferptr bptr(len);  // prealloc space for entire read
+    got = ::read(fd, bptr.c_str(), len);
+    if (got > 0) {
+      bptr.set_length(got);   // properly size the buffer
+      bl.push_back( bptr );   // put it in the target bufferlist
+    } else if (got < 0) {
+      dout(10) << "read failed on " << fn.c_str() << " errno " << errno << " " << strerror(errno) << endl;
+    }
+  }
+  ::flock(fd, LOCK_UN);
+  ::close(fd);
+  return got;
+}
+
+
+// Write the contents of bl to the file backing oid, starting at offset,
+// creating the file if needed.  len is only used for logging.
+// If onsafe is given it is scheduled via sync() whether or not the write
+// succeeded.
+// Returns the number of bytes written; a negative value if the file cannot be
+// opened or if an error occurred before anything was written.
+int FakeStore::write(object_t oid,
+                     off_t offset, size_t len,
+                     bufferlist& bl,
+                     Context *onsafe)
+{
+  dout(20) << "write " << oid << " len " << len << " off " << offset << endl;
+
+  string fn;
+  get_oname(oid,fn);
+
+  ::mknod(fn.c_str(), 0644, 0);  // in case it doesn't exist yet.
+
+  int flags = O_WRONLY;//|O_CREAT;
+  int fd = ::open(fn.c_str(), flags);
+  if (fd < 0) {
+    dout(1) << "write couldn't open " << fn.c_str() << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl;
+    return fd;
+  }
+  ::flock(fd, LOCK_EX);    // lock for safety
+  //::fchmod(fd, 0664);
+
+  // seek
+  off_t actual = lseek(fd, offset, SEEK_SET);
+  int did = 0;
+  assert(actual == offset);
+
+  // write buffers.  a short write is continued from where it stopped; after a
+  // real error we stop entirely, since later buffers would otherwise land at
+  // the wrong file offset.
+  bool failed = false;
+  for (list<bufferptr>::const_iterator it = bl.buffers().begin();
+       it != bl.buffers().end() && !failed;
+       it++) {
+    const char *p = (char*)(*it).c_str();
+    size_t left = (*it).length();
+    while (left > 0) {
+      ssize_t r = ::write(fd, p, left);
+      if (r < 0 && errno == EINTR)
+        continue;   // interrupted before writing anything; retry
+      if (r <= 0) {
+        dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl;
+        failed = true;
+        break;
+      }
+      did += r;
+      p += r;
+      left -= r;
+    }
+  }
+
+  ::flock(fd, LOCK_UN);
+
+  // schedule sync
+  if (onsafe) sync(onsafe);
+
+  ::close(fd);
+
+  if (failed && did == 0)
+    return -1;
+  return did;
+}
+
+
+// Wraps a caller's completion context and keeps a counter of outstanding
+// fake syncs: *n is incremented on construction and decremented once the
+// wrapped context has been finished.
+// NOTE(review): the wrapped context is finished but never deleted here --
+// confirm who owns it.
+class C_FakeSync : public Context {
+public:
+  Context *c;
+  int *n;
+
+  C_FakeSync(Context *c_, int *n_)
+    : c(c_), n(n_)
+  {
+    *n += 1;
+  }
+
+  void finish(int r)
+  {
+    c->finish(r);
+    *n -= 1;
+    //cout << "sync, " << *n << " still unsync" << endl;
+  }
+};
+
+ // Schedule onsafe to fire after g_conf.fakestore_fake_sync seconds, wrapped
+ // in a C_FakeSync that tracks the outstanding count in `unsync`.  Only the
+ // fake (timer based) sync is implemented.
+ void FakeStore::sync(Context *onsafe)
+ {
+   if (!g_conf.fakestore_fake_sync) {
+     assert(0);  // der..no implemented anymore
+     return;
+   }
+
+   Context *wrapped = new C_FakeSync(onsafe, &unsync);
+   g_timer.add_event_after((float)g_conf.fakestore_fake_sync, wrapped);
+ }
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FAKESTORE_H
+#define __FAKESTORE_H
+
+#include "ObjectStore.h"
+#include "common/ThreadPool.h"
+#include "common/Mutex.h"
+
+#include "Fake.h"
+//#include "FakeStoreBDBCollections.h"
+
+
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+// fake attributes in memory, if we need to.
+
+
+ // ObjectStore implementation declared on top of the FakeStoreAttrs and
+ // FakeStoreCollections mix-ins.  Method bodies live in the .cc file.
+ class FakeStore : public ObjectStore,
+                   public FakeStoreAttrs,
+                   public FakeStoreCollections {
+   string basedir;   // base path passed to the constructor
+   int whoami;       // id passed to the constructor
+
+   int unsync;       // counter handed to C_FakeSync by sync()
+
+   Mutex lock;
+
+   // path helpers
+   void get_dir(string& dir);
+   void get_oname(object_t oid, string& fn);
+   void wipe_dir(string mydir);
+
+  public:
+   FakeStore(char *base, int whoami)
+     : FakeStoreAttrs(this),
+       FakeStoreCollections(this),
+       basedir(base),
+       whoami(whoami),
+       unsync(0)
+   {
+   }
+
+   // ------------------
+   // store lifecycle
+   int mount();
+   int umount();
+   int mkfs();
+
+   int statfs(struct statfs *buf);
+
+   // ------------------
+   // objects
+   int pick_object_revision_lt(object_t& oid) {
+     return 0;
+   }
+   bool exists(object_t oid);
+   int stat(object_t oid, struct stat *st);
+   int remove(object_t oid, Context *onsafe);
+   int truncate(object_t oid, off_t size, Context *onsafe);
+   int read(object_t oid,
+            off_t offset, size_t len,
+            bufferlist& bl);
+   int write(object_t oid,
+             off_t offset, size_t len,
+             bufferlist& bl,
+             Context *onsafe);
+
+   void sync(Context *onsafe);
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FAKESTOREBDBCOLLECTIONS_H
+#define __FAKESTOREBDBCOLLECTIONS_H
+
+#include "BDBMap.h"
+#include "ObjectStore.h"
+#include "common/Mutex.h"
+
+#define BDBHASH_DIRS 128LL
+#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS)
+
+ // Collection bookkeeping backed by BDBMap files: one "collections" map that
+ // lists the known collections, plus one BDBMap per collection holding its
+ // member objects.  All public entry points serialize on bdblock.
+ class FakeStoreBDBCollections {
+  private:
+   int whoami;
+   string basedir;
+
+   Mutex bdblock;
+
+   // collection dbs
+   BDBMap<coll_t, int> collections;
+   map<coll_t, BDBMap<object_t, int>*> collection_map;   // owns the BDBMap objects
+
+   // dirs
+   // <basedir>/<whoami>
+   void get_dir(string& dir) {
+     char s[30];
+     sprintf(s, "%d", whoami);
+     dir = basedir + "/" + s;
+   }
+   // <basedir>/<whoami>/<hash bucket>/<collection id>.co
+   void get_collfn(coll_t c, string &fn) {
+     char s[100];
+     sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c);
+     fn = basedir + "/" + s;
+   }
+
+   void open_collections() {
+     string cfn;
+     get_dir(cfn);
+     cfn += "/collections";
+     collections.open(cfn.c_str());
+     // NOTE(review): the listing result is unused; kept in case list_keys()
+     // has a side effect the original author relied on -- confirm and drop.
+     list<coll_t> ls;
+     collections.list_keys(ls);
+   }
+   void close_collections() {
+     if (collections.is_open())
+       collections.close();
+
+     for (map<coll_t, BDBMap<object_t, int>*>::iterator it = collection_map.begin();
+          it != collection_map.end();
+          ++it) {
+       it->second->close();
+       delete it->second;   // the map owns these; previously leaked
+     }
+     collection_map.clear();
+   }
+
+   // Make sure the per-collection db for c is open and registered in
+   // collection_map.  Returns 0 on success (or if already open), otherwise
+   // the error from BDBMap::open(); on failure nothing is registered.
+   int open_collection(coll_t c) {
+     if (collection_map.count(c))
+       return 0;  // already open.
+
+     string fn;
+     get_collfn(c,fn);
+     BDBMap<object_t,int> *db = new BDBMap<object_t,int>;
+     int r = db->open(fn.c_str());
+     if (r != 0) {
+       delete db;   // failed; don't leak and don't leave a stale entry
+       return r;
+     }
+     collection_map[c] = db;
+     return 0;
+   }
+
+  public:
+   FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {}
+   ~FakeStoreBDBCollections() {
+     close_collections();
+   }
+
+   int list_collections(list<coll_t>& ls) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     ls.clear();
+     collections.list_keys(ls);
+     bdblock.Unlock();
+     return 0;
+   }
+   int create_collection(coll_t c) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     collections.put(c, 1);
+     open_collection(c);
+     bdblock.Unlock();
+     return 0;
+   }
+   // Remove c from the collection list and delete its backing db file.
+   // Returns 0 on success, or the open error if the db could not be opened.
+   int destroy_collection(coll_t c) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     collections.del(c);
+
+     int r = open_collection(c);
+     if (r == 0) {
+       BDBMap<object_t,int> *db = collection_map[c];
+       db->close();
+
+       string fn;
+       get_collfn(c,fn);
+       db->remove(fn.c_str());
+       delete db;
+       collection_map.erase(c);
+     }
+     bdblock.Unlock();
+     return r;
+   }
+   int collection_stat(coll_t c, struct stat *st) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     string fn;
+     get_collfn(c,fn);
+     int r = ::stat(fn.c_str(), st);
+     bdblock.Unlock();
+     return r;
+   }
+   bool collection_exists(coll_t c) {
+     // collection_stat() takes bdblock itself; taking it here as well would
+     // self-deadlock unless Mutex happens to be recursive.
+     struct stat st;
+     return collection_stat(c, &st) == 0;
+   }
+   // Add o to collection c.  Returns 0, or the open error for c's db.
+   int collection_add(coll_t c, object_t o) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     int r = open_collection(c);
+     if (r == 0)
+       collection_map[c]->put(o,1);
+     bdblock.Unlock();
+     return r;
+   }
+   // Remove o from collection c.  Returns 0, or the open error for c's db.
+   int collection_remove(coll_t c, object_t o) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     int r = open_collection(c);
+     if (r == 0)
+       collection_map[c]->del(o);
+     bdblock.Unlock();
+     return r;
+   }
+   // Append the members of collection c to o.  Returns 0, or the open error.
+   int collection_list(coll_t c, list<object_t>& o) {
+     bdblock.Lock();
+     if (!collections.is_open()) open_collections();
+
+     int r = open_collection(c);
+     if (r == 0)
+       collection_map[c]->list_keys(o);
+     bdblock.Unlock();
+     return r;
+   }
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "OBFSStore.h"
+
+extern "C" {
+#include "../../uofs/uofs.h"
+}
+
+#include "common/Timer.h"
+
+#include "include/types.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs "
+
+ // Record the osd id and the (optional) parameter-file and device paths.
+ // Nothing is opened here; mounted and bdev_id start out as -1.
+ // Paths longer than the fixed-size member buffers are truncated.
+ OBFSStore::OBFSStore(int whoami, char *param, char *dev)
+ {
+   this->whoami = whoami;
+   this->mounted = -1;
+   this->bdev_id = -1;
+   this->param[0] = 0;
+   this->dev[0] = 0;
+   if (dev) {
+     // bounded copy: this->dev is a fixed-size array
+     strncpy(this->dev, dev, sizeof(this->dev) - 1);
+     this->dev[sizeof(this->dev) - 1] = 0;
+   }
+   if (param) {
+     strncpy(this->param, param, sizeof(this->param) - 1);
+     this->param[sizeof(this->param) - 1] = 0;
+   }
+ }
+
+ // Open the block device, run mkfs() (which formats unless already mounted)
+ // and mount the uofs instance.  A uofs_mount() result of -1 or -2 triggers a
+ // reformat followed by one more mount attempt.
+ // Returns 0 if the store ended up mounted, -1 otherwise.
+ int OBFSStore::mount(void)
+ {
+   dout(0) << "OBFS init!" << endl;
+   if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) {
+     dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl;
+     return -1;
+   }
+
+   this->mkfs();
+   this->mounted = uofs_mount(this->bdev_id,
+                              g_conf.uofs_cache_size,
+                              g_conf.uofs_min_flush_pages,
+                              this->whoami);
+   switch (this->mounted) {
+   case -1:
+     this->mkfs();
+     //retry to mount
+     dout(0) << "remount the OBFS" << endl;
+     this->mounted = uofs_mount(this->bdev_id,
+                                g_conf.uofs_cache_size,
+                                g_conf.uofs_min_flush_pages,
+                                this->whoami);
+     assert(this->mounted >= 0);
+     break;
+   case -2:
+     //fsck
+     dout(0) << "Need fsck! Simply formatted for now!" << endl;
+     this->mkfs();
+     this->mounted = uofs_mount(this->bdev_id,
+                                g_conf.uofs_cache_size,
+                                g_conf.uofs_min_flush_pages,
+                                this->whoami);
+     assert(this->mounted >= 0);
+     break;
+   case 0:
+     //success
+     break;
+   default:
+     break;
+   }
+
+   // dout() expands to a bare `if`, so the branches need braces: without them
+   // the `else` attaches to the macro's `if` and the error is never printed.
+   if (this->mounted >= 0) {
+     dout(0) << "successfully mounted!" << endl;
+   } else {
+     dout(0) << "error in mounting obfsstore!" << endl;
+   }
+
+   // report failure instead of always claiming success
+   return (this->mounted >= 0) ? 0 : -1;
+ }
+
+ // Format the device with the uofs_* settings from g_conf.  Does nothing
+ // (and returns 0) if the store is already mounted; opens the device first
+ // when bdev_id is not a positive descriptor.  Returns -1 if that open fails.
+ int OBFSStore::mkfs(void)
+ {
+   /* disabled: locals for the parameter-file reader further down
+   int donode_size_byte = 1024,
+     bd_ratio = 10,
+     reg_size_mb = 256,
+     sb_size_kb = 4,
+     lb_size_kb = 1024,
+     nr_hash_table_buckets = 1023,
+     delay_allocation = 1,
+     flush_interval = 5;
+   FILE *param;
+   */
+
+   const bool already_mounted = (this->mounted >= 0);
+   if (already_mounted)
+     return 0;
+
+   dout(0) << "OBFS.mkfs!" << endl;
+   /* disabled: read the format parameters from the file named by this->param
+   if (strlen(this->param) > 0) {
+     param = fopen(this->param, "r");
+     if (param) {
+       //fscanf(param, "Block Device: %s\n", this->dev);
+       fscanf(param, "Donode Size: %d\n", &donode_size_byte);
+       fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio);
+       fscanf(param, "Region Size: %d MB\n", &reg_size_mb);
+       fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb);
+       fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb);
+       fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets);
+       fscanf(param, "Delayed Allocation: %d\n", &delay_allocation);
+     } else {
+       dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl;
+       dout(0) << "use default parameters" << endl;
+     }
+   } else
+     dout(0) << "use default parameters" << endl;
+   */
+
+   if (this->bdev_id <= 0) {
+     this->bdev_id = device_open(this->dev, O_RDWR);
+     if (this->bdev_id < 0) {
+       dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl;
+       return -1;
+     }
+   }
+
+   dout(0) << "start formating!" << endl;
+
+   uofs_format(this->bdev_id,
+               g_conf.uofs_onode_size,
+               g_conf.uofs_block_meta_ratio,
+               g_conf.uofs_segment_size,
+               g_conf.uofs_small_block_size,
+               g_conf.uofs_large_block_size,
+               g_conf.uofs_nr_hash_buckets,
+               g_conf.uofs_delay_allocation,
+               0,//g_conf.uofs_dev_force_size,
+               g_conf.uofs_flush_interval,
+               0);
+
+   dout(0) << "formatting complete!" << endl;
+   return 0;
+ }
+
+ // Shut uofs down and close the device descriptor.  Always returns 0;
+ // mounted and bdev_id are left untouched.
+ int OBFSStore::umount(void)
+ {
+   uofs_shutdown();
+   close(bdev_id);
+   return 0;
+ }
+
+ // Stub: reports success without filling in *sfs.
+ int OBFSStore::statfs(struct statfs *sfs)
+ {
+   const int ok = 0;
+   return ok;
+ }
+
+ // True if uofs reports the object as existing.
+ bool OBFSStore::exists(object_t oid)
+ {
+   bool present = uofs_exist(oid);
+   return present;
+ }
+
+ // Existence check only: returns 0 if the object exists and -1 otherwise.
+ // *st is not filled in.
+ int OBFSStore::stat(object_t oid, struct stat *st)
+ {
+   dout(0) << "calling function stat!" << endl;
+   return uofs_exist(oid) ? 0 : -1;
+ }
+
+ // Delete the object; the result of uofs_del() is passed through.
+ int OBFSStore::remove(object_t oid)
+ {
+   dout(0) << "calling remove function!" << endl;
+   int r = uofs_del(oid);
+   return r;
+ }
+
+ // Not implemented: logs the call and always fails with -1.
+ int OBFSStore::truncate(object_t oid, off_t size)
+ {
+   dout(0) << "calling truncate function!" << endl;
+   // uofs_truncate(oid, size) is not wired up
+   const int unsupported = -1;
+   return unsupported;
+ }
+
+ // Append a fresh len-byte buffer to bl and let uofs_read() fill it from
+ // the object at the given offset.  Returns whatever uofs_read() returns.
+ int OBFSStore::read(object_t oid, size_t len,
+                     off_t offset, bufferlist &bl)
+ {
+   // FIXME: page-align this and we can avoid a memcpy...
+   bl.push_back(new buffer(len));
+
+   int r = uofs_read(oid, bl.c_str(), offset, len);
+   return r;
+ }
+
+ // Write up to len bytes from bl to the object, starting at offset, and
+ // optionally sync the object afterwards.
+ //
+ // Each buffer fragment is written at its own position with its own length
+ // (capped by what is left of len), so a multi-fragment bufferlist no longer
+ // overwrites itself at `offset` or reads past the end of a fragment.
+ //
+ // Returns the sum of the uofs_write() results (plus the uofs_sync() result
+ // when fsync is set).
+ int OBFSStore::write(object_t oid, size_t len,
+                      off_t offset, bufferlist& bl, bool fsync)
+ {
+   int ret = 0;
+   off_t pos = offset;   // where the next fragment goes
+   size_t left = len;    // bytes of the request not yet written
+
+   for (list<bufferptr>::iterator p = bl.buffers().begin();
+        p != bl.buffers().end();
+        ++p) {
+     size_t n = (*p).length();
+     if (n > left)
+       n = left;
+     ret += uofs_write(oid, (*p).c_str(), pos, n, 0);
+     pos += n;
+     left -= n;
+     if (left == 0)
+       break;
+   }
+
+   if (fsync)
+     ret += uofs_sync(oid);
+
+   return ret;
+ }
+
+
+ // Write without syncing, then schedule onflush to fire after
+ // g_conf.uofs_fake_sync seconds.  A null onflush is allowed and simply
+ // means nobody wants the callback (same convention as FakeStore::write).
+ int OBFSStore::write(object_t oid, size_t len,
+                      off_t offset, bufferlist& bl, Context *onflush)
+ {
+   int r = write(oid, len, offset, bl, false);
+   if (onflush)
+     g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush);
+   return r;
+ }
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef _OBFSSTORE_H_
+#define _OBFSSTORE_H_
+
+#include "ObjectStore.h"
+#include "Fake.h"
+
+ // ObjectStore implementation on top of the uofs (OBFS) library.
+ // NOTE(review): read()/write() take (len, offset) while FakeStore takes
+ // (offset, len) -- confirm which order the ObjectStore interface expects.
+ class OBFSStore : public ObjectStore,
+                   public FakeStoreAttrs,
+                   public FakeStoreCollections {
+   int whoami;        // osd id
+   int bdev_id;       // descriptor returned by device_open(), -1 until opened
+   int mounted;       // last uofs_mount() result, -1 until mounted
+   char dev[128];     // block device path
+   char param[128];   // parameter file path (currently unused by mkfs)
+
+  public:
+   OBFSStore(int whoami, char *param, char *dev);
+
+   // lifecycle
+   int mount();
+   int umount();
+   int mkfs();
+
+   int statfs(struct statfs *);
+
+   // objects
+   bool exists(object_t oid);
+   int stat(object_t oid, struct stat *st);
+
+   int remove(object_t oid);
+   int truncate(object_t oid, off_t size);
+
+   int read(object_t oid, size_t len,
+            off_t offset, bufferlist& bl);
+   int write(object_t oid, size_t len,
+             off_t offset, bufferlist& bl,
+             bool fsync);
+   int write(object_t oid, size_t len,
+             off_t offset, bufferlist& bl,
+             Context *onflush);
+
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "include/types.h"
+
+#include "OSD.h"
+#include "OSDMap.h"
+
+#ifdef USE_OBFS
+# include "OBFSStore.h"
+#else
+# include "FakeStore.h"
+#endif
+
+#include "ebofs/Ebofs.h"
+
+#include "Ager.h"
+
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#include "common/Logger.h"
+#include "common/LogType.h"
+#include "common/Timer.h"
+#include "common/ThreadPool.h"
+
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <sys/stat.h>
+
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
+
+char *osd_base_path = "./osddata";
+char *ebofs_base_path = "./dev";
+
+
+object_t SUPERBLOCK_OBJECT(0,0);
+
+
+// <hack> force remount hack for performance testing FakeStore
+ // Timer callback that asks the given OSD to unmount and remount its store.
+ class C_Remount : public Context {
+   OSD *osd;
+
+ public:
+   C_Remount(OSD *target)
+     : osd(target)
+   {
+   }
+
+   void finish(int) {
+     osd->force_remount();
+   }
+ };
+
+ // Unmount and immediately remount the object store while holding osd_lock.
+ void OSD::force_remount()
+ {
+   dout(0) << "forcing remount" << endl;
+
+   osd_lock.Lock();
+   store->umount();
+   store->mount();
+   osd_lock.Unlock();
+
+   dout(0) << "finished remount" << endl;
+ }
+// </hack>
+
+
+// cons/des
+
+LogType osd_logtype;
+
+ // Set up in-memory state and pick the object store backend.
+ //
+ // If no device path is given, dev_path is chosen by probing, in order:
+ //   <ebofs_base_path>/osd<id>, <ebofs_base_path>/osd.<hostname>,
+ //   <ebofs_base_path>/osd.all
+ // The store is only constructed here; mkfs/mount happen in init().
+ OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev)
+ {
+   whoami = id;
+   messenger = m;
+   monmap = mm;
+
+   osdmap = 0;
+   boot_epoch = 0;
+
+   // These are only created in init(), but ~OSD(), shutdown() and
+   // heartbeat() test or delete them, so they must start out null.
+   threadpool = 0;
+   logger = 0;
+   next_heartbeat = 0;
+
+   last_tid = 0;
+   num_pulling = 0;
+
+   state = STATE_BOOTING;
+
+   hb_stat_ops = 0;
+   hb_stat_qlen = 0;
+
+   pending_ops = 0;
+   waiting_for_no_ops = false;
+
+   if (g_conf.osd_remount_at)
+     g_timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this));
+
+
+
+   // init object store
+   // try in this order:
+   // dev/osd$num
+   // dev/osd.$hostname
+   // dev/osd.all
+
+   if (dev) {
+     strcpy(dev_path,dev);
+   } else {
+     char hostname[100];
+     hostname[0] = 0;
+     gethostname(hostname,100);
+
+     sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami);
+
+     struct stat sta;
+     if (::lstat(dev_path, &sta) != 0)
+       sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname);
+
+     if (::lstat(dev_path, &sta) != 0)
+       sprintf(dev_path, "%s/osd.all", ebofs_base_path);
+   }
+
+   if (g_conf.ebofs) {
+     store = new Ebofs(dev_path);
+     //store->_fake_writes(true);
+   }
+ #ifdef USE_OBFS
+   else if (g_conf.uofs) {
+     store = new OBFSStore(whoami, NULL, dev_path);
+   }
+ #endif
+   else {
+     store = new FakeStore(osd_base_path, whoami);
+   }
+
+ }
+
+ // Release everything the OSD owns.  delete on a null pointer is a no-op,
+ // so no per-pointer checks are needed.
+ OSD::~OSD()
+ {
+   delete threadpool;  threadpool = 0;
+   delete osdmap;      osdmap = 0;
+   //delete monitor;   monitor = 0;
+   delete messenger;   messenger = 0;
+   delete logger;      logger = 0;
+   delete store;       store = 0;
+ }
+
+ // Bring the OSD up: optionally mkfs, mount the store, then either age the
+ // fresh store or load the superblock and PGs.  Afterwards create the logger
+ // and request thread pool, register with the messenger, announce ourselves
+ // to a monitor and schedule the first heartbeat.
+ // Everything runs under osd_lock.  Always returns 0 (a failed mount trips
+ // the assert instead).
+ int OSD::init()
+ {
+ osd_lock.Lock();
+ {
+ // mkfs?
+ if (g_conf.osd_mkfs) {
+ dout(2) << "mkfs" << endl;
+ store->mkfs();
+
+ // make up a superblock
+ //superblock.fsid = ???;
+ superblock.whoami = whoami;
+ }
+
+ // mount.
+ dout(2) << "mounting " << dev_path << endl;
+ int r = store->mount();
+ assert(r>=0);
+
+ if (g_conf.osd_mkfs) {
+ // age?
+ // osd_age_time < 0 only loads the ager's freelist; > 0 runs the ager.
+ if (g_conf.osd_age_time != 0) {
+ dout(2) << "age" << endl;
+ Ager ager(store);
+ if (g_conf.osd_age_time < 0)
+ ager.load_freelist();
+ else
+ ager.age(g_conf.osd_age_time,
+ g_conf.osd_age,
+ g_conf.osd_age - .05,
+ 50000,
+ g_conf.osd_age - .05);
+ }
+ }
+ else {
+ dout(2) << "boot" << endl;
+
+ // read superblock
+ // NOTE(review): the return value of read_superblock() is ignored here.
+ read_superblock();
+
+ // load up pgs (as they previously existed)
+ load_pgs();
+
+ dout(2) << "superblock: i am osd" << superblock.whoami << endl;
+ assert(whoami == superblock.whoami);
+ }
+
+
+ // log
+ char name[80];
+ sprintf(name, "osd%02d", whoami);
+ logger = new Logger(name, (LogType*)&osd_logtype);
+ // register the keys this OSD reports on the shared osd_logtype
+ osd_logtype.add_set("opq");
+ osd_logtype.add_inc("op");
+ osd_logtype.add_inc("c_rd");
+ osd_logtype.add_inc("c_rdb");
+ osd_logtype.add_inc("c_wr");
+ osd_logtype.add_inc("c_wrb");
+
+ osd_logtype.add_inc("r_push");
+ osd_logtype.add_inc("r_pushb");
+ osd_logtype.add_inc("r_wr");
+ osd_logtype.add_inc("r_wrb");
+
+ osd_logtype.add_inc("rlnum");
+
+ osd_logtype.add_set("numpg");
+ osd_logtype.add_set("pingset");
+
+ osd_logtype.add_set("buf");
+
+ osd_logtype.add_inc("map");
+ osd_logtype.add_inc("mapi");
+ osd_logtype.add_inc("mapidup");
+ osd_logtype.add_inc("mapf");
+ osd_logtype.add_inc("mapfdup");
+
+ // request thread pool
+ {
+ // this `name` shadows the logger name above; it is scoped to this block
+ char name[80];
+ sprintf(name,"osd%d.threadpool", whoami);
+ threadpool = new ThreadPool<OSD*, pg_t>(name, g_conf.osd_maxthreads,
+ static_dequeueop,
+ this);
+ }
+
+ // i'm ready!
+ messenger->set_dispatcher(this);
+
+ // announce to monitor i exist and have booted.
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MOSDBoot(superblock), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+
+ // start the heart
+ next_heartbeat = new C_Heartbeat(this);
+ g_timer.add_event_after(g_conf.osd_heartbeat_interval, next_heartbeat);
+ }
+ osd_lock.Unlock();
+
+ //dout(0) << "osd_rep " << g_conf.osd_rep << endl;
+
+ return 0;
+ }
+
+ // Stop the OSD: cancel the pending heartbeat, drain in-flight ops, tear down
+ // the thread pool and all PGs, shut the messenger down and unmount the store.
+ // osd_lock is released around the unmount and re-taken before returning, so
+ // the caller is expected to hold it.  Returns the result of store->umount().
+ int OSD::shutdown()
+ {
+   dout(1) << "shutdown, timer has " << g_timer.num_event << endl;
+
+   if (next_heartbeat)
+     g_timer.cancel_event(next_heartbeat);
+
+   state = STATE_STOPPING;
+
+   // let outstanding ops finish before anything is torn down
+   wait_for_no_ops();
+
+   // worker threads
+   delete threadpool;
+   threadpool = 0;
+
+   // placement groups
+   hash_map<pg_t, PG*>::iterator cur = pg_map.begin();
+   while (cur != pg_map.end()) {
+     delete cur->second;
+     ++cur;
+   }
+   pg_map.clear();
+
+   // everything else
+   //monitor->shutdown();
+   messenger->shutdown();
+
+   osd_lock.Unlock();
+   const int result = store->umount();
+   osd_lock.Lock();
+   return result;
+ }
+
+
+
+ // Queue a write of the raw in-memory superblock struct to SUPERBLOCK_OBJECT
+ // (offset 0) on the given transaction.  Nothing is applied here.
+ void OSD::write_superblock(ObjectStore::Transaction& t)
+ {
+   dout(10) << "write_superblock " << superblock << endl;
+
+   bufferlist raw;
+   raw.append((char*)&superblock, sizeof(superblock));
+
+   t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), raw);
+ }
+
+ // Load the superblock from SUPERBLOCK_OBJECT, then decode the osdmap for
+ // superblock.current_epoch into a freshly allocated OSDMap.
+ // Returns -1 if the stored superblock does not have the expected size,
+ // 0 otherwise.
+ int OSD::read_superblock()
+ {
+   bufferlist bl;
+   int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
+
+   const bool complete = (bl.length() == sizeof(superblock));
+   if (!complete) {
+     dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl;
+     return -1;
+   }
+
+   bl.copy(0, sizeof(superblock), (char*)&superblock);
+   dout(10) << "read_superblock " << superblock << endl;
+
+   // load up "current" osdmap; there must not be one yet
+   assert(!osdmap);
+   osdmap = new OSDMap;
+
+   bl.clear();
+   get_map_bl(superblock.current_epoch, bl);
+   osdmap->decode(bl);
+
+   assert(whoami == superblock.whoami);  // fixme!
+   return 0;
+ }
+
+
+// object locks
+
+ // Locked wrapper: take osd_lock, acquire the PG lock via _lock_pg(), release.
+ PG *OSD::lock_pg(pg_t pgid)
+ {
+   osd_lock.Lock();
+   PG *locked = _lock_pg(pgid);
+   osd_lock.Unlock();
+
+   return locked;
+ }
+
+ // Acquire the per-PG lock for pgid and return its PG.
+ // The PG must exist in pg_map.  If the PG is already locked, the caller
+ // queues a stack-allocated Cond in pg_lock_waiters[pgid] and waits on it
+ // (releasing osd_lock via c.Wait) until the lock is free AND its Cond is at
+ // the head of the queue, i.e. waiters are served in FIFO order.
+ // osd_lock is passed to c.Wait(), so the caller must already hold it
+ // (lock_pg() takes it before calling in).
+ PG *OSD::_lock_pg(pg_t pgid)
+ {
+ assert(pg_map.count(pgid));
+
+ if (pg_lock.count(pgid)) {
+ Cond c;
+ dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl;
+ //cerr << "lock_pg " << pgid << " waiting as " << &c << endl;
+
+ list<Cond*>& ls = pg_lock_waiters[pgid]; // this is commit, right?
+ ls.push_back(&c);
+
+ // wait until the lock is released and it is our turn
+ while (pg_lock.count(pgid) ||
+ ls.front() != &c)
+ c.Wait(osd_lock);
+
+ assert(ls.front() == &c);
+ ls.pop_front();
+ // drop the (now empty) waiter list so _unlock_pg() sees "nobody waiting"
+ if (ls.empty())
+ pg_lock_waiters.erase(pgid);
+ }
+
+ dout(15) << "lock_pg " << pgid << endl;
+ pg_lock.insert(pgid);
+
+ return pg_map[pgid];
+ }
+
+ // Locked wrapper around _unlock_pg(): releases the PG lock under osd_lock.
+ void OSD::unlock_pg(pg_t pgid)
+ {
+   osd_lock.Lock();
+
+   _unlock_pg(pgid);
+
+   osd_lock.Unlock();
+ }
+
+ // Release the per-PG lock for pgid (which must currently be held) and, if
+ // anyone is queued in pg_lock_waiters, signal the waiter at the head.
+ void OSD::_unlock_pg(pg_t pgid)
+ {
+   assert(pg_lock.count(pgid));
+   pg_lock.erase(pgid);
+
+   if (!pg_lock_waiters.count(pgid)) {
+     // nobody waiting
+     dout(15) << "unlock_pg " << pgid << endl;
+     return;
+   }
+
+   // someone is in line: wake the first one
+   Cond *next = pg_lock_waiters[pgid].front();
+   assert(next);
+   dout(15) << "unlock_pg " << pgid << " waking up next guy " << next << endl;
+   next->Signal();
+ }
+
+ // Delete a PG: in one transaction remove every object listed in its
+ // collection, the collection itself and the PG's log object, then drop the
+ // in-memory PG from pg_map.
+ void OSD::_remove_pg(pg_t pgid)
+ {
+   dout(10) << "_remove_pg " << pgid << endl;
+
+   // collect the objects that belong to this pg
+   list<object_t> members;
+   store->collection_list(pgid, members);
+
+   // remove from store
+   ObjectStore::Transaction t;
+   list<object_t>::iterator obj = members.begin();
+   while (obj != members.end()) {
+     t.remove(*obj);
+     ++obj;
+   }
+   t.remove_collection(pgid);
+   t.remove(object_t(1,pgid));  // log too
+   store->apply_transaction(t);
+
+   // hose from memory
+   delete pg_map[pgid];
+   pg_map.erase(pgid);
+ }
+
+
+ // Activate pgid if it exists, is crashed and in replay, we hold role 0 for
+ // it, and its primary has not changed since `epoch`.  Afterwards any
+ // messages queued on `finished` are re-dispatched with osd_lock released.
+ void OSD::activate_pg(pg_t pgid, epoch_t epoch)
+ {
+   osd_lock.Lock();
+
+   if (pg_map.count(pgid)) {
+     PG *pg = _lock_pg(pgid);
+
+     const bool should_activate =
+       pg->is_crashed() &&
+       pg->is_replay() &&
+       pg->get_role() == 0 &&
+       pg->info.history.same_primary_since <= epoch;
+     if (should_activate) {
+       ObjectStore::Transaction t;
+       pg->activate(t);
+       store->apply_transaction(t);
+     }
+
+     _unlock_pg(pgid);
+   }
+
+   // finishers?  grab them under the lock (a no-op when there are none),
+   // dispatch them outside of it.
+   list<Message*> waiting;
+   waiting.splice(waiting.begin(), finished);
+
+   osd_lock.Unlock();
+
+   list<Message*>::iterator it = waiting.begin();
+   while (it != waiting.end()) {
+     dispatch(*it);
+     ++it;
+   }
+ }
+
+
+// -------------------------------------
+
+ // Periodic heartbeat, run under osd_lock:
+ //  - log and reset the op / queue-length stats gathered since the last beat
+ //  - for every PG where our role is > 0, ping acting[0] if that PG has not
+ //    been pinged within the last heartbeat interval
+ //  - optionally send a fake MOSDIn to a monitor (g_conf.fake_osdmap_updates)
+ //  - schedule the next heartbeat after a randomized delay
+ void OSD::heartbeat()
+ {
+ osd_lock.Lock();
+
+ utime_t now = g_clock.now();
+ // PGs whose last_heartbeat is older than `since` are due for a ping
+ utime_t since = now;
+ since.sec_ref() -= g_conf.osd_heartbeat_interval;
+
+ // calc my stats
+ float avg_qlen = 0;
+ if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops;
+
+ dout(5) << "heartbeat " << now
+ << ": ops " << hb_stat_ops
+ << ", avg qlen " << avg_qlen
+ << endl;
+
+ // reset until next time around
+ hb_stat_ops = 0;
+ hb_stat_qlen = 0;
+
+ // send pings
+ // collect targets in a set so each osd is pinged at most once per beat
+ set<int> pingset;
+ for (hash_map<pg_t, PG*>::iterator i = pg_map.begin();
+ i != pg_map.end();
+ i++) {
+ PG *pg = i->second;
+
+ // we want to ping the primary.
+ if (pg->get_role() <= 0) continue;
+ if (pg->acting.size() < 1) continue;
+
+ if (pg->last_heartbeat < since) {
+ pg->last_heartbeat = now;
+ pingset.insert(pg->acting[0]);
+ }
+ }
+ // NOTE(review): osdmap is dereferenced here without the null check used
+ // further down -- presumably PGs only exist once a map is loaded; confirm.
+ for (set<int>::iterator i = pingset.begin();
+ i != pingset.end();
+ i++) {
+ _share_map_outgoing( MSG_ADDR_OSD(*i), osdmap->get_inst(*i) );
+ messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen),
+ MSG_ADDR_OSD(*i), osdmap->get_inst(*i));
+ }
+
+ if (logger) logger->set("pingset", pingset.size());
+
+ // hack: fake reorg?
+ // with probability 1/fake_osdmap_updates, send an MOSDIn to a monitor
+ if (osdmap && g_conf.fake_osdmap_updates) {
+ int mon = monmap->pick_mon();
+ if ((rand() % g_conf.fake_osdmap_updates) == 0) {
+ //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) {
+ messenger->send_message(new MOSDIn(osdmap->get_epoch()),
+ MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ }
+ /*
+ if (osdmap->is_out(whoami)) {
+ messenger->send_message(new MOSDIn(osdmap->get_epoch()),
+ MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ }
+ else if ((rand() % g_conf.fake_osdmap_updates) == 0) {
+ //messenger->send_message(new MOSDOut(osdmap->get_epoch()),
+ //MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ }
+ }
+ */
+ }
+
+ // schedule next! randomly.
+ // delay = 0.5 + f * osd_heartbeat_interval, with f in {0.0, 0.1, ..., 0.9}
+ next_heartbeat = new C_Heartbeat(this);
+ float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval;
+ g_timer.add_event_after(wait, next_heartbeat);
+
+ osd_lock.Unlock();
+ }
+
+
+
+// --------------------------------------
+// dispatch
+
+ // Called with the map epoch a sender reported.  If the sender (client or
+ // osd) is behind our osdmap, send it an incremental map starting at `epoch`.
+ // For osds the newest epoch seen per peer is tracked in peer_map_epoch.
+ // Returns true if a map was sent.
+ bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch)
+ {
+   bool sent = false;
+
+   // client with an old map?
+   if (who.is_client() && epoch < osdmap->get_epoch()) {
+     dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+     send_incremental_map(epoch, who, inst, true);
+     sent = true;
+   }
+
+   // peer osd with an old map?
+   if (who.is_osd()) {
+     // remember the newest epoch this peer has told us about
+     if (peer_map_epoch[who] < epoch)
+       peer_map_epoch[who] = epoch;
+
+     const bool peer_is_behind = peer_map_epoch[who] < osdmap->get_epoch();
+     if (peer_is_behind) {
+       dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+       send_incremental_map(epoch, who, inst, true);
+       peer_map_epoch[who] = osdmap->get_epoch();  // so we don't send it again.
+       sent = true;
+     }
+   }
+
+   return sent;
+ }
+
+
+ // Before talking to a peer osd: if we have a recorded map epoch for it and
+ // that epoch is older than ours, send an incremental map and record that the
+ // peer is now up to date.  Peers we know nothing about get nothing.
+ void OSD::_share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst)
+ {
+   assert(dest.is_osd());
+   if (!dest.is_osd())
+     return;
+
+   if (!peer_map_epoch.count(dest)) {
+     // no idea about peer's epoch.
+     // ??? send recent ???
+     // do nothing.
+     return;
+   }
+
+   // send map?
+   epoch_t peer_epoch = peer_map_epoch[dest];
+   if (peer_epoch < osdmap->get_epoch()) {
+     send_incremental_map(peer_epoch, dest, inst, true);
+     peer_map_epoch[dest] = osdmap->get_epoch();
+   }
+ }
+
+
+
+ // Messenger entry point: route an incoming message to its handler.
+ // Runs under osd_lock.  Messages other than PING / OSD_MAP / SHUTDOWN need
+ // an osdmap: without one they are parked on waiting_for_osdmap, and if this
+ // osd is marked down in the map they are dropped.  Handlers (or the cases
+ // below) own `m` and are responsible for deleting it.
+ // After handling, anything queued on `finished` is re-dispatched with
+ // osd_lock released.
+ void OSD::dispatch(Message *m)
+ {
+ // lock!
+ osd_lock.Lock();
+
+ switch (m->get_type()) {
+
+ // -- don't need lock --
+ case MSG_PING:
+ dout(10) << "ping from " << m->get_source() << endl;
+ delete m;
+ break;
+
+ // -- don't need OSDMap --
+
+ /*
+ // host monitor
+ case MSG_PING_ACK:
+ case MSG_FAILURE_ACK:
+ monitor->proc_message(m);
+ break;
+ */
+
+ // map and replication
+ case MSG_OSD_MAP:
+ handle_osd_map((MOSDMap*)m);
+ break;
+
+ // osd
+ case MSG_SHUTDOWN:
+ shutdown();
+ delete m;
+ break;
+
+
+
+ // -- need OSDMap --
+
+ default:
+ {
+ // no map? starting up?
+ if (!osdmap) {
+ dout(7) << "no OSDMap, not booted" << endl;
+ waiting_for_osdmap.push_back(m);
+ break;
+ }
+
+ // down?
+ if (osdmap->is_down(whoami)) {
+ dout(7) << "i am marked down, dropping " << *m << endl;
+ delete m;
+ break;
+ }
+
+
+
+
+ // need OSDMap
+ switch (m->get_type()) {
+
+ case MSG_OSD_PING:
+ // take note.
+ handle_osd_ping((MOSDPing*)m);
+ break;
+
+ case MSG_OSD_PG_NOTIFY:
+ handle_pg_notify((MOSDPGNotify*)m);
+ break;
+ case MSG_OSD_PG_QUERY:
+ handle_pg_query((MOSDPGQuery*)m);
+ break;
+ case MSG_OSD_PG_LOG:
+ handle_pg_log((MOSDPGLog*)m);
+ break;
+ case MSG_OSD_PG_REMOVE:
+ handle_pg_remove((MOSDPGRemove*)m);
+ break;
+
+ case MSG_OSD_OP:
+ handle_op((MOSDOp*)m);
+ break;
+
+ // for replication etc.
+ case MSG_OSD_OPREPLY:
+ handle_op_reply((MOSDOpReply*)m);
+ break;
+
+
+ default:
+ // unknown types are treated as a programming error
+ dout(1) << " got unknown message " << m->get_type() << endl;
+ assert(0);
+ }
+ }
+ }
+
+ // finishers?
+ // move the queue to a local list under the lock, then recurse into
+ // dispatch() for each entry once the lock is dropped.
+ if (!finished.empty()) {
+ list<Message*> waiting;
+ waiting.splice(waiting.begin(), finished);
+
+ osd_lock.Unlock();
+
+ for (list<Message*>::iterator it = waiting.begin();
+ it != waiting.end();
+ it++) {
+ dispatch(*it);
+ }
+ return;
+ }
+
+ osd_lock.Unlock();
+ }
+
+
+ // Messenger callback for a message that could not be delivered.
+ //  - g_conf.ms_die_on_failure: exit the process
+ //  - dest is an osd: drop the message and report the failure to a monitor
+ //  - dest is a monitor: resend the same message to another monitor
+ //  - anything else: drop the message
+ void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+ {
+   if (g_conf.ms_die_on_failure)
+     exit(0);
+
+   if (dest.is_osd()) {
+     // failed osd. drop message, report to mon.
+     int mon = monmap->pick_mon();
+     dout(0) << "ms_handle_failure " << dest << " inst " << inst
+             << ", dropping and reporting to mon" << mon
+             << endl;
+     messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()),
+                             MSG_ADDR_MON(mon), monmap->get_inst(mon));
+     delete m;
+     return;
+   }
+
+   if (dest.is_mon()) {
+     // resend to a different monitor; m is handed back to the messenger.
+     int mon = monmap->pick_mon(true);
+     dout(0) << "ms_handle_failure " << dest << " inst " << inst
+             << ", resending to mon" << mon
+             << endl;
+     messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+     return;
+   }
+
+   // client?
+   dout(0) << "ms_handle_failure " << dest << " inst " << inst
+           << ", dropping" << endl;
+   delete m;
+ }
+
+ // Resolve an osd address to its instance via the osdmap.  Only osd
+ // addresses are supported; anything else asserts (and yields false).
+ bool OSD::ms_lookup(msg_addr_t dest, entity_inst_t& inst)
+ {
+   if (!dest.is_osd()) {
+     assert(0);
+     return false;
+   }
+
+   assert(osdmap);
+   return osdmap->get_inst(dest.num(), inst);
+ }
+
+
+
+
+ // Handle a ping from a peer osd: share our map if the peer's epoch is old,
+ // record the peer's reported average queue length, and free the message.
+ void OSD::handle_osd_ping(MOSDPing *m)
+ {
+   dout(20) << "osdping from " << m->get_source() << endl;
+   _share_map_incoming(m->get_source(), m->get_source_inst(), m->map_epoch);
+
+   const int peer = m->get_source().num();
+   peer_qlen[peer] = m->avg_qlen;
+
+   //if (!m->ack)
+   //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true),
+   //m->get_source());
+
+   delete m;
+ }
+
+
+
+
+// =====================================================
+// MAP
+
+ // Park m until a newer osdmap arrives.  Only the first waiter triggers an
+ // MOSDGetMap request to a monitor; later ones just join the queue.
+ void OSD::wait_for_new_map(Message *m)
+ {
+   const bool first_waiter = waiting_for_osdmap.empty();
+
+   // ask
+   if (first_waiter) {
+     int mon = monmap->pick_mon();
+     messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
+                             MSG_ADDR_MON(mon), monmap->get_inst(mon));
+   }
+
+   waiting_for_osdmap.push_back(m);
+ }
+
+
+/** update_map
+ * assimilate new OSDMap(s). scan pgs, etc.
+ */
+void OSD::handle_osd_map(MOSDMap *m)
+{
+ // Flow: (1) write any full/incremental maps carried by m that the store
+ // does not already have, (2) advance superblock.current_epoch one epoch at
+ // a time toward superblock.newest_map, (3) if we caught up, activate the
+ // map and requeue waiters, (4) record pg info + superblock and apply t.
+ // m is deleted before returning.
+ wait_for_no_ops();
+
+ assert(osd_lock.is_locked());
+
+ ObjectStore::Transaction t;
+
+ if (osdmap) {
+ dout(3) << "handle_osd_map epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "], i have " << osdmap->get_epoch()
+ << endl;
+ } else {
+ // no map in memory yet: start from an empty OSDMap and remember the
+ // last epoch in this message as our boot epoch
+ dout(3) << "handle_osd_map epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "], i have none"
+ << endl;
+ osdmap = new OSDMap;
+ boot_epoch = m->get_last(); // hrm...?
+ }
+
+ logger->inc("mapmsg");
+
+ // store them?
+ // full maps: written straight to the store (not through transaction t),
+ // and the superblock's oldest/newest map bounds are widened to include them
+ for (map<epoch_t,bufferlist>::iterator p = m->maps.begin();
+ p != m->maps.end();
+ p++) {
+ object_t oid = get_osdmap_object_name(p->first);
+ if (store->exists(oid)) {
+ dout(10) << "handle_osd_map already had full map epoch " << p->first << endl;
+ logger->inc("mapfdup");
+ bufferlist bl;
+ get_map_bl(p->first, bl);
+ dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+ continue;
+ }
+
+ dout(10) << "handle_osd_map got full map epoch " << p->first << endl;
+ //t.write(oid, 0, p->second.length(), p->second);
+ store->write(oid, 0, p->second.length(), p->second, 0);
+
+ if (p->first > superblock.newest_map)
+ superblock.newest_map = p->first;
+ if (p->first < superblock.oldest_map ||
+ superblock.oldest_map == 0)
+ superblock.oldest_map = p->first;
+
+ logger->inc("mapf");
+ }
+ // incremental maps: same treatment as the full maps above
+ for (map<epoch_t,bufferlist>::iterator p = m->incremental_maps.begin();
+ p != m->incremental_maps.end();
+ p++) {
+ object_t oid = get_inc_osdmap_object_name(p->first);
+ if (store->exists(oid)) {
+ dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl;
+ logger->inc("mapidup");
+ bufferlist bl;
+ get_inc_map_bl(p->first, bl);
+ dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+ continue;
+ }
+
+ dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl;
+ //t.write(oid, 0, p->second.length(), p->second);
+ store->write(oid, 0, p->second.length(), p->second, 0);
+
+ if (p->first > superblock.newest_map)
+ superblock.newest_map = p->first;
+ if (p->first < superblock.oldest_map ||
+ superblock.oldest_map == 0)
+ superblock.oldest_map = p->first;
+
+ logger->inc("mapi");
+ }
+
+ // advance if we can
+ bool advanced = false;
+
+ // a map from a monitor while we are booting forces the activation step
+ // below even if no epoch is actually stepped
+ if (m->get_source().is_mon() && is_booting())
+ advanced = true;
+
+ // step one epoch at a time: prefer the incremental for cur+1 (from the
+ // message, else from the store), fall back to a full map for cur+1,
+ // otherwise ask a monitor for maps and stop
+ epoch_t cur = superblock.current_epoch;
+ while (cur < superblock.newest_map) {
+ // NOTE(review): this bl is never used; each branch below declares its own
+ bufferlist bl;
+ if (m->incremental_maps.count(cur+1) ||
+ store->exists(get_inc_osdmap_object_name(cur+1))) {
+ dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl;
+
+ bufferlist bl;
+ if (m->incremental_maps.count(cur+1))
+ bl = m->incremental_maps[cur+1];
+ else
+ get_inc_map_bl(cur+1, bl);
+
+ OSDMap::Incremental inc;
+ int off = 0;
+ inc.decode(bl, off);
+
+ osdmap->apply_incremental(inc);
+
+ // archive the full map
+ // (bl is reused: cleared, then filled with the re-encoded full map)
+ bl.clear();
+ osdmap->encode(bl);
+ t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl);
+
+ // notify messenger
+ for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+ i != inc.new_down.end();
+ i++) {
+ int osd = i->first;
+ if (osd == whoami) continue;
+ messenger->mark_down(MSG_ADDR_OSD(osd), i->second);
+ peer_map_epoch.erase(MSG_ADDR_OSD(osd));
+
+ // kick any replica ops
+ // for every pg, call repop_ack() on each gather still waiting on
+ // an ack or commit from the osd that just went down
+ for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
+ it != pg_map.end();
+ it++) {
+ PG *pg = it->second;
+
+ _lock_pg(pg->info.pgid);
+ {
+ list<PG::RepOpGather*> ls; // do async; repop_ack() may modify pg->repop_gather
+ for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
+ p != pg->repop_gather.end();
+ p++) {
+ //dout(-1) << "checking repop tid " << p->first << endl;
+ if (p->second->waitfor_ack.count(osd) ||
+ p->second->waitfor_commit.count(osd))
+ ls.push_back(p->second);
+ }
+ for (list<PG::RepOpGather*>::iterator p = ls.begin();
+ p != ls.end();
+ p++)
+ repop_ack(pg, *p, -1, true, osd);
+ }
+ _unlock_pg(pg->info.pgid);
+ }
+ }
+ for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+ i != inc.new_up.end();
+ i++) {
+ if (i->first == whoami) continue;
+ messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
+ peer_map_epoch.erase(MSG_ADDR_OSD(i->first));
+ }
+ }
+ else if (m->maps.count(cur+1) ||
+ store->exists(get_osdmap_object_name(cur+1))) {
+ dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl;
+ bufferlist bl;
+ if (m->maps.count(cur+1))
+ bl = m->maps[cur+1];
+ else
+ get_map_bl(cur+1, bl);
+ osdmap->decode(bl);
+
+ // FIXME BUG: need to notify messenger of ups/downs!!
+ }
+ else {
+ // neither an incremental nor a full map for cur+1 is available:
+ // request maps from a monitor (passing our current epoch) and stop
+ dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MOSDGetMap(cur), MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ break;
+ }
+
+ cur++;
+ superblock.current_epoch = cur;
+ advance_map(t);
+ advanced = true;
+ }
+
+ // all the way?
+ if (advanced && cur == superblock.newest_map) {
+ // yay!
+ activate_map(t);
+
+ // process waiters
+ take_waiters(waiting_for_osdmap);
+ }
+
+ // write updated pg state to store
+ // (the raw bytes of every pg's info struct go into the "info" attr)
+ for (hash_map<pg_t,PG*>::iterator i = pg_map.begin();
+ i != pg_map.end();
+ i++) {
+ pg_t pgid = i->first;
+ PG *pg = i->second;
+ t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info));
+ }
+
+ // superblock and commit
+ write_superblock(t);
+ store->apply_transaction(t);
+
+ //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah
+
+ delete m;
+}
+
+
+/**
+ * scan placement groups, initiate any replication
+ * activities.
+ */
+void OSD::advance_map(ObjectStore::Transaction& t)
+{
+ // Bring local PG state in line with the current osdmap epoch.
+ // - mkfs map: create (and immediately activate) every PG this osd has a
+ // role in, plus its localized PGs.
+ // - otherwise: recompute each existing PG's acting set; where it changed,
+ // update role/history, deactivate, and hand queued work back.
+ // t collects the store operations issued by create_pg() / pg->activate().
+ dout(7) << "advance_map epoch " << osdmap->get_epoch()
+ << " " << pg_map.size() << " pgs"
+ << endl;
+
+ if (osdmap->is_mkfs()) {
+ ps_t maxps = 1ULL << osdmap->get_pg_bits();
+ ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
+ dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
+ assert(osdmap->get_epoch() == 1);
+
+ //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl;
+ logger->set_start( osdmap->get_ctime() );
+
+ // create PGs
+ for (int nrep = 1;
+ nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh
+ nrep++) {
+ for (ps_t ps = 0; ps < maxps; ++ps) {
+ vector<int> acting;
+ pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep);
+ // NOTE: this inner nrep shadows the loop variable for the rest of
+ // the loop body; the pgid above was built from the outer one
+ int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ int role = osdmap->calc_pg_role(whoami, acting, nrep);
+ if (role < 0) continue;
+
+ PG *pg = create_pg(pgid, t);
+ pg->set_role(role);
+ pg->acting.swap(acting);
+ pg->last_epoch_started_any =
+ pg->info.last_epoch_started =
+ pg->info.history.same_since =
+ pg->info.history.same_primary_since =
+ pg->info.history.same_acker_since = osdmap->get_epoch();
+ pg->activate(t);
+
+ dout(7) << "created " << *pg << endl;
+ }
+
+ for (ps_t ps = 0; ps < maxlps; ++ps) {
+ // local PG too
+ // (unlike the loop above, there is no role < 0 check here)
+ vector<int> acting;
+ pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep);
+ // NOTE: shadows the outer loop variable, as in the loop above
+ int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ int role = osdmap->calc_pg_role(whoami, acting, nrep);
+
+ PG *pg = create_pg(pgid, t);
+ pg->acting.swap(acting);
+ pg->set_role(role);
+ pg->last_epoch_started_any =
+ pg->info.last_epoch_started =
+ pg->info.history.same_primary_since =
+ pg->info.history.same_acker_since =
+ pg->info.history.same_since = osdmap->get_epoch();
+ pg->activate(t);
+
+ dout(7) << "created " << *pg << endl;
+ }
+ }
+
+ dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;
+
+ } else {
+ // scan existing pg's
+ for (hash_map<pg_t,PG*>::iterator it = pg_map.begin();
+ it != pg_map.end();
+ it++) {
+ pg_t pgid = it->first;
+ PG *pg = it->second;
+
+ // did i finish this epoch?
+ if (pg->is_active()) {
+ pg->info.last_epoch_finished = osdmap->get_epoch()-1;
+ }
+
+ // get new acting set
+ vector<int> tacting;
+ int nrep = osdmap->pg_to_acting_osds(pgid, tacting);
+ int role = osdmap->calc_pg_role(whoami, tacting, nrep);
+
+ // no change?
+ // (checked before the pg lock is taken)
+ if (tacting == pg->acting)
+ continue;
+
+ // -- there was a change! --
+ _lock_pg(pgid);
+
+ // snapshot the old view so it can be compared after the update
+ int oldrole = pg->get_role();
+ int oldprimary = pg->get_primary();
+ int oldacker = pg->get_acker();
+ vector<int> oldacting = pg->acting;
+
+ // update PG
+ pg->acting.swap(tacting);
+ pg->set_role(role);
+
+ // did primary|acker change?
+ pg->info.history.same_since = osdmap->get_epoch();
+ if (oldprimary != pg->get_primary()) {
+ pg->info.history.same_primary_since = osdmap->get_epoch();
+ pg->cancel_recovery();
+ }
+ if (oldacker != pg->get_acker()) {
+ pg->info.history.same_acker_since = osdmap->get_epoch();
+ }
+
+ // deactivate.
+ pg->state_clear(PG::STATE_ACTIVE);
+
+ // reset primary state?
+ if (oldrole == 0 || pg->get_role() == 0)
+ pg->clear_primary_state();
+
+ // apply any repops in progress.
+ if (oldacker == whoami) {
+ // apply repops
+ // each gather is applied if it has not been yet, then it and its op
+ // message are freed
+ for (map<tid_t,PG::RepOpGather*>::iterator p = pg->repop_gather.begin();
+ p != pg->repop_gather.end();
+ p++) {
+ if (!p->second->applied)
+ apply_repop(pg, p->second);
+ delete p->second->op;
+ delete p->second;
+ }
+ pg->repop_gather.clear();
+
+ // and repop waiters
+ // (these queued messages are deleted, not requeued)
+ for (map<tid_t, list<Message*> >::iterator p = pg->waiting_for_repop.begin();
+ p != pg->waiting_for_repop.end();
+ p++)
+ for (list<Message*>::iterator pm = p->second.begin();
+ pm != p->second.end();
+ pm++)
+ delete *pm;
+ pg->waiting_for_repop.clear();
+ }
+
+ if (role != oldrole) {
+ // old primary?
+ if (oldrole == 0) {
+ pg->state_clear(PG::STATE_CLEAN);
+
+ // take replay queue waiters
+ list<Message*> ls;
+ for (map<eversion_t,MOSDOp*>::iterator it = pg->replay_queue.begin();
+ it != pg->replay_queue.end();
+ it++)
+ ls.push_back(it->second);
+ pg->replay_queue.clear();
+ take_waiters(ls);
+
+ // take active waiters
+ take_waiters(pg->waiting_for_active);
+
+ // take object waiters
+ for (hash_map<object_t, list<Message*> >::iterator it = pg->waiting_for_missing_object.begin();
+ it != pg->waiting_for_missing_object.end();
+ it++)
+ take_waiters(it->second);
+ pg->waiting_for_missing_object.clear();
+ }
+
+ // new primary?
+ if (role == 0) {
+ // i am new primary
+ pg->state_clear(PG::STATE_STRAY);
+ } else {
+ // i am now replica|stray. we need to send a notify.
+ pg->state_set(PG::STATE_STRAY);
+
+ // an empty acting set (nrep == 0) marks the pg as crashed
+ if (nrep == 0) {
+ pg->state_set(PG::STATE_CRASHED);
+ dout(1) << *pg << " is crashed" << endl;
+ }
+ }
+
+ // my role changed.
+ dout(10) << *pg << " " << oldacting << " -> " << pg->acting
+ << ", role " << oldrole << " -> " << role << endl;
+
+ } else {
+ // no role change.
+ // did primary change?
+ if (pg->get_primary() != oldprimary) {
+ // we need to announce
+ pg->state_set(PG::STATE_STRAY);
+
+ dout(10) << *pg << " " << oldacting << " -> " << pg->acting
+ << ", acting primary "
+ << oldprimary << " -> " << pg->get_primary()
+ << endl;
+ } else {
+ // primary is the same.
+ if (role == 0) {
+ // i am (still) primary. but my replica set changed.
+ pg->state_clear(PG::STATE_CLEAN);
+ pg->state_clear(PG::STATE_REPLAY);
+
+ dout(10) << *pg << " " << oldacting << " -> " << pg->acting
+ << ", replicas changed" << endl;
+ }
+ }
+ }
+
+
+ _unlock_pg(pgid);
+ }
+ }
+}
+
+void OSD::activate_map(ObjectStore::Transaction& t)
+{
+  // Called once the osdmap is fully caught up: let primaries start peering
+  // and collect the notifies/queries to send to other osds afterwards.
+  dout(7) << "activate_map version " << osdmap->get_epoch() << endl;
+
+  map< int, list<PG::Info> > notify_list;        // primary -> infos to announce
+  map< int, map<pg_t,PG::Query> > query_map;     // peer -> pg -> query
+
+  for (hash_map<pg_t,PG*>::iterator cur = pg_map.begin();
+       cur != pg_map.end();
+       ++cur) {
+    PG *pg = cur->second;
+
+    // active pgs just record the epoch they were (re)started in
+    if (pg->is_active()) {
+      pg->info.last_epoch_started = osdmap->get_epoch();
+      continue;
+    }
+
+    // inactive primary: work out prior set and peer
+    if (pg->get_role() == 0 && !pg->is_active()) {
+      pg->build_prior();
+      pg->peer(t, query_map);
+      continue;
+    }
+
+    // stray with a known primary: tell that primary what we hold
+    if (pg->is_stray() &&
+        pg->get_primary() >= 0)
+      notify_list[pg->get_primary()].push_back(pg->info);
+  }
+
+  // hack: a mkfs map sends no notifies/queries (and skips the stat below)
+  if (!osdmap->is_mkfs()) {
+    do_notifies(notify_list);
+    do_queries(query_map);
+
+    logger->set("numpg", pg_map.size());
+  }
+}
+
+
+void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full)
+{
+  // Build one MOSDMap holding the maps for epochs (since, current], walking
+  // from the newest epoch downwards, and send it to dest.
+  dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
+           << " to " << dest << endl;
+
+  MOSDMap *reply = new MOSDMap;
+
+  epoch_t e = osdmap->get_epoch();
+  while (e > since) {
+    bufferlist bl;
+    if (get_inc_map_bl(e, bl)) {
+      reply->incremental_maps[e].claim(bl);
+    }
+    else if (get_map_bl(e, bl)) {
+      reply->maps[e].claim(bl);
+      // unless the caller asked for everything, stop at the first full map
+      if (!full)
+        break;
+    }
+    else {
+      assert(0);  // we should have all maps.
+    }
+    e--;
+  }
+
+  messenger->send_message(reply, dest, inst);
+}
+
+bool OSD::get_map_bl(epoch_t e, bufferlist& bl)
+{
+  // Read the stored full map for epoch e into bl; a negative result from
+  // the store read is reported as false.
+  object_t oid = get_osdmap_object_name(e);
+  return !(store->read(oid, 0, 0, bl) < 0);
+}
+
+bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl)
+{
+  // Read the stored incremental map for epoch e into bl; a negative result
+  // from the store read is reported as false.
+  object_t oid = get_inc_osdmap_object_name(e);
+  return !(store->read(oid, 0, 0, bl) < 0);
+}
+
+void OSD::get_map(epoch_t epoch, OSDMap &m)
+{
+  // Reconstruct the map for `epoch` into m: walk backwards collecting
+  // incrementals until a stored full map is found, decode that, then replay
+  // the collected incrementals oldest-first.
+  list<OSDMap::Incremental> pending;
+  epoch_t e = epoch;
+  while (e > 0) {
+    bufferlist bl;
+    if (get_map_bl(e, bl)) {
+      m.decode(bl);
+      break;
+    }
+
+    // no full map at e; there must be an incremental
+    OSDMap::Incremental inc;
+    bool got = get_inc_map(e, inc);
+    assert(got);
+    pending.push_front(inc);
+    e--;
+  }
+  // e is now the epoch of the full map we decoded
+  assert(e > 0);
+
+  // roll forward to the requested epoch
+  while (++e <= epoch) {
+    m.apply_incremental( pending.front() );
+    pending.pop_front();
+  }
+}
+
+
+bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc)
+{
+  // Load and decode the stored incremental for epoch e.
+  // Returns false (leaving inc untouched) if it cannot be read.
+  bufferlist bl;
+  if (get_inc_map_bl(e, bl)) {
+    int off = 0;
+    inc.decode(bl, off);
+    return true;
+  }
+  return false;
+}
+
+
+
+
+
+bool OSD::require_current_map(Message *m, epoch_t ep)
+{
+  // Returns true only when ep matches our osdmap epoch exactly.
+  // On false the message has been consumed: queued if it is from a newer
+  // epoch, deleted if it is from an older one.
+
+  // newer map?
+  if (ep > osdmap->get_epoch()) {
+    dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl;
+    wait_for_new_map(m);
+    return false;
+  }
+
+  // older map?
+  if (ep < osdmap->get_epoch()) {
+    dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl;
+    delete m;   // discard and ignore.
+    return false;
+  }
+
+  assert(ep == osdmap->get_epoch());
+  return true;
+}
+
+
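+/*
+ * require that we have the same (or a newer) map than the sender.
+ * a message from a newer epoch is queued until we catch up; a message
+ * from before our boot epoch is discarded.
+ * (note: no check that the source is the pg primary is done here.)
+ */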
+bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
+{
+  // Returns true when the message's epoch is not ahead of ours and not from
+  // before we booted.  On false the message has been consumed (queued for a
+  // newer map, or deleted).
+  dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl;
+
+  bool usable = true;
+
+  if (epoch > osdmap->get_epoch()) {
+    // sender is ahead of us
+    dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl;
+    wait_for_new_map(m);
+    usable = false;
+  }
+  else if (epoch < boot_epoch) {
+    // predates our boot
+    dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl;
+    delete m;
+    usable = false;
+  }
+
+  return usable;
+}
+
+
+
+
+// ======================================================
+// REPLICATION
+
+// PG
+
+bool OSD::pg_exists(pg_t pgid)
+{
+  // a pg is considered to exist if the store has a collection for it
+  bool present = store->collection_exists(pgid);
+  return present;
+}
+
+PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t)
+{
+  // Register a brand new PG in pg_map and queue creation of its collection
+  // on t.  The pg must not already exist in memory or in the store.
+  hash_map<pg_t,PG*>::iterator existing = pg_map.find(pgid);
+  if (existing != pg_map.end()) {
+    dout(0) << "create_pg on " << pgid << ", already have " << *existing->second << endl;
+  }
+  assert(existing == pg_map.end());
+  assert(!pg_exists(pgid));
+
+  PG *fresh = new PG(this, pgid);
+  pg_map[pgid] = fresh;
+
+  t.create_collection(pgid);
+
+  return fresh;
+}
+
+
+
+
+PG *OSD::get_pg(pg_t pgid)
+{
+  // look up an in-memory PG; null if we do not have it
+  hash_map<pg_t,PG*>::iterator p = pg_map.find(pgid);
+  if (p == pg_map.end())
+    return 0;
+  return p->second;
+}
+
+void OSD::load_pgs()
+{
+  // Rebuild pg_map from the collections present in the store: for each one,
+  // read back its info attr and log, then derive acting set and role from
+  // the current osdmap.
+  dout(10) << "load_pgs" << endl;
+  assert(pg_map.empty());
+
+  list<coll_t> colls;
+  store->list_collections(colls);
+
+  list<coll_t>::iterator c = colls.begin();
+  for ( ; c != colls.end(); ++c) {
+    pg_t pgid = *c;
+
+    PG *pg = new PG(this, pgid);
+    pg_map[pgid] = pg;
+
+    // persisted state: info struct, then the log
+    store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info));
+    pg->read_log(store);
+
+    // state derived from the current mapping
+    int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting);
+    pg->set_role( osdmap->calc_pg_role(whoami, pg->acting, nrep) );
+
+    dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl;
+  }
+}
+
+/**
+ * check epochs starting from start to verify the pg acting set hasn't changed
+ * up until now
+ */
+void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from)
+{
+ // Walk epochs from (current-1) down to `from`, comparing each epoch's
+ // acting set for pgid against the current map's, and raise h.same_since /
+ // h.same_primary_since / h.same_acker_since to e+1 where a difference is
+ // seen.  h is updated in place.
+ dout(15) << "project_pg_history " << pgid
+ << " from " << from << " to " << osdmap->get_epoch()
+ << ", start " << h
+ << endl;
+
+ // `last` is the acting set under the current map; it is never reassigned
+ // inside the loop, so every older epoch is compared against the current set
+ vector<int> last;
+ osdmap->pg_to_acting_osds(pgid, last);
+
+ // NOTE(review): if epoch_t is unsigned, `e >= from` cannot fail for
+ // from == 0 and `e--` would wrap -- confirm callers never pass 0
+ for (epoch_t e = osdmap->get_epoch()-1;
+ e >= from;
+ e--) {
+ // verify during intermediate epoch
+ OSDMap oldmap;
+ get_map(e, oldmap);
+
+ vector<int> acting;
+ oldmap.pg_to_acting_osds(pgid, acting);
+
+ // acting set change?
+ if (acting != last &&
+ e <= h.same_since) {
+ dout(15) << "project_pg_history " << pgid << " changed in " << e+1
+ << " from " << acting << " -> " << last << endl;
+ h.same_since = e+1;
+ }
+
+ // primary change?
+ // (counts as changed unless both sets are non-empty with equal first osd)
+ if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
+ e <= h.same_primary_since) {
+ dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl;
+ h.same_primary_since = e+1;
+
+ // with OSD_REP_PRIMARY the acker epoch simply follows the primary epoch
+ if (g_conf.osd_rep == OSD_REP_PRIMARY)
+ h.same_acker_since = h.same_primary_since;
+ }
+
+ // acker change?
+ // (for the other replication modes the last osd of each set is compared)
+ if (g_conf.osd_rep != OSD_REP_PRIMARY) {
+ if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) &&
+ e <= h.same_acker_since) {
+ dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl;
+ h.same_acker_since = e+1;
+ }
+ }
+
+ // all three markers are already newer than e: older epochs cannot
+ // change them any further, so stop early
+ if (h.same_since > e &&
+ h.same_primary_since > e &&
+ h.same_acker_since > e) break;
+ }
+
+ dout(15) << "project_pg_history end " << h << endl;
+}
+
+
+/** do_notifies
+ * Send an MOSDPGNotify to a primary, with a list of PGs that I have
+ * content for, and they are primary for.
+ */
+
+void OSD::do_notifies(map< int, list<PG::Info> >& notify_list)
+{
+  // one MOSDPGNotify per target osd, skipping ourselves
+  map< int, list<PG::Info> >::iterator target = notify_list.begin();
+  for ( ; target != notify_list.end(); ++target) {
+    const int osd = target->first;
+
+    if (osd == whoami) {
+      dout(7) << "do_notify osd" << osd << " is self, skipping" << endl;
+      continue;
+    }
+
+    dout(7) << "do_notify osd" << osd << " on " << target->second.size() << " PGs" << endl;
+    MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), target->second);
+
+    // make sure the peer has our map before it sees the notify
+    _share_map_outgoing(MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+    messenger->send_message(m, MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+  }
+}
+
+
+/** do_queries
+ * send out pending queries for info | summaries
+ */
+void OSD::do_queries(map< int, map<pg_t,PG::Query> >& query_map)
+{
+  // one MOSDPGQuery per peer, carrying every pg query destined for it
+  map< int, map<pg_t,PG::Query> >::iterator peer = query_map.begin();
+  while (peer != query_map.end()) {
+    int who = peer->first;
+    dout(7) << "do_queries querying osd" << who
+            << " on " << peer->second.size() << " PGs" << endl;
+
+    MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(),
+                                     peer->second);
+
+    // share our map with the peer first, then send the query itself
+    _share_map_outgoing(MSG_ADDR_OSD(who), osdmap->get_inst(who));
+    messenger->send_message(m, MSG_ADDR_OSD(who), osdmap->get_inst(who));
+
+    ++peer;
+  }
+}
+
+
+
+
+/** PGNotify
+ * from non-primary to primary
+ * includes PG::Info.
+ * NOTE: called with opqueue active.
+ */
+void OSD::handle_pg_notify(MOSDPGNotify *m)
+{
+ // For each PG::Info in the message: create the pg if we do not have it
+ // (and the primary has not changed since the message's epoch), record the
+ // sender's info, and either update clean/stray bookkeeping or peer.
+ // Store changes accumulate in t and are applied once at the end; queries
+ // generated by peering are sent afterwards.  m is deleted on completion.
+ dout(7) << "handle_pg_notify from " << m->get_source() << endl;
+ int from = m->get_source().num();
+
+ // on false the helper has already queued or deleted m
+ if (!require_same_or_newer_map(m, m->get_epoch())) return;
+
+ ObjectStore::Transaction t;
+
+ // look for unknown PGs i'm primary for
+ map< int, map<pg_t,PG::Query> > query_map;
+
+ for (list<PG::Info>::iterator it = m->get_pg_list().begin();
+ it != m->get_pg_list().end();
+ it++) {
+ pg_t pgid = it->pgid;
+ PG *pg;
+
+ if (pg_map.count(pgid) == 0) {
+ // same primary?
+ // (bring the sender's history forward from the message epoch to now)
+ PG::Info::History history = it->history;
+ project_pg_history(pgid, history, m->get_epoch());
+
+ if (m->get_epoch() < history.same_primary_since) {
+ dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in "
+ << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
+ continue;
+ }
+
+ // ok, create PG!
+ // (role is set to 0 here without consulting calc_pg_role)
+ pg = create_pg(pgid, t);
+ osdmap->pg_to_acting_osds(pgid, pg->acting);
+ pg->set_role(0);
+ pg->info.history = history;
+
+ pg->last_epoch_started_any = it->last_epoch_started;
+ pg->build_prior();
+
+ t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
+
+ dout(10) << *pg << " is new" << endl;
+
+ // kick any waiters
+ if (waiting_for_pg.count(pgid)) {
+ take_waiters(waiting_for_pg[pgid]);
+ waiting_for_pg.erase(pgid);
+ }
+
+ // lock the new pg so both branches reach the common code below locked
+ _lock_pg(pgid);
+ } else {
+ // already had it. am i (still) the primary?
+ pg = _lock_pg(pgid);
+ if (m->get_epoch() < pg->info.history.same_primary_since) {
+ dout(10) << *pg << " handle_pg_notify primary changed in "
+ << pg->info.history.same_primary_since
+ << " (msg from " << m->get_epoch() << ")" << endl;
+ _unlock_pg(pgid);
+ continue;
+ }
+ }
+
+ // ok!
+
+ // stray?
+ // (sender is outside the acting set but reports having started an epoch)
+ bool acting = pg->is_acting(from);
+ if (!acting && (*it).last_epoch_started > 0) {
+ dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl;
+ pg->stray_set.insert(from);
+ pg->state_clear(PG::STATE_CLEAN);
+ }
+
+ // save info.
+ // (`had` must be sampled before the assignment inserts the entry)
+ bool had = pg->peer_info.count(from);
+ pg->peer_info[from] = *it;
+
+ if (had) {
+ if (pg->is_active() &&
+ (*it).is_clean() && acting) {
+ pg->clean_set.insert(from);
+ dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set
+ << "): " << *it << endl;
+ if (pg->is_all_clean()) {
+ dout(-10) << *pg << " now clean on all replicas" << endl;
+ pg->state_set(PG::STATE_CLEAN);
+ pg->clean_replicas();
+ }
+ } else {
+ // hmm, maybe keep an eye out for cases where we see this, but peer should happen.
+ dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl;
+ }
+ } else {
+ // adjust prior?
+ if (it->last_epoch_started > pg->last_epoch_started_any)
+ pg->adjust_prior();
+
+ // peer
+ pg->peer(t, query_map);
+ }
+
+ _unlock_pg(pgid);
+ }
+
+ unsigned tr = store->apply_transaction(t);
+ assert(tr == 0);
+
+ do_queries(query_map);
+
+ delete m;
+}
+
+
+
+/** PGLog
+ * from non-primary to primary
+ * includes log and info
+ * from primary to non-primary
+ * includes log for use in recovery
+ * NOTE: called with opqueue active.
+ */
+
+void OSD::handle_pg_log(MOSDPGLog *m)
+{
+  // Process a pg log message.
+  //  - as primary: feed the replica's log/missing into peering.
+  //  - as replica: merge the primary's log, then activate.
+  // m is consumed on every path: deleted here, or already queued/deleted by
+  // require_same_or_newer_map() when that returns false.
+  int from = m->get_source().num();
+  const pg_t pgid = m->get_pgid();
+
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+  if (pg_map.count(pgid) == 0) {
+    dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl;
+    assert(m->get_epoch() < osdmap->get_epoch());
+    delete m;
+    return;
+  }
+
+  PG *pg = _lock_pg(pgid);
+  assert(pg);
+
+  // message predates the pg's current acting set: ignore it
+  if (m->get_epoch() < pg->info.history.same_since) {
+    dout(10) << "handle_pg_log " << *pg
+             << " from " << m->get_source()
+             << " is old, discarding"
+             << endl;
+    // release the pg lock taken above; this early return previously left
+    // the pg locked
+    _unlock_pg(pgid);
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_pg_log " << *pg
+          << " got " << m->log << " " << m->missing
+          << " from " << m->get_source() << endl;
+
+  //m->log.print(cout);
+
+  ObjectStore::Transaction t;
+
+  if (pg->is_primary()) {
+    // i am PRIMARY
+    // we must have asked this peer for a log or summary
+    assert(pg->peer_log_requested.count(from) ||
+           pg->peer_summary_requested.count(from));
+
+    pg->proc_replica_log(m->log, m->missing, from);
+
+    // peer
+    map< int, map<pg_t,PG::Query> > query_map;
+    pg->peer(t, query_map);
+    do_queries(query_map);
+
+  } else {
+    // i am REPLICA
+    dout(10) << *pg << " got " << m->log << " " << m->missing << endl;
+
+    // merge log
+    pg->merge_log(m->log, m->missing, from);
+    pg->proc_missing(m->log, m->missing, from);
+    assert(pg->missing.num_lost() == 0);
+
+    // ok activate!
+    pg->activate(t);
+  }
+
+  unsigned tr = store->apply_transaction(t);
+  assert(tr == 0);
+
+  _unlock_pg(pgid);
+
+  delete m;
+}
+
+
+/** PGQuery
+ * from primary to replica | stray
+ * NOTE: called with opqueue active.
+ */
+void OSD::handle_pg_query(MOSDPGQuery *m)
+{
+  // Answer a batch of per-pg queries from `from`.
+  //  - INFO queries are answered together in one notify at the end.
+  //  - LOG / BACKLOG / FULLLOG queries each get their own MOSDPGLog reply.
+  // A pg we do not have is created when we hold a replica role in it;
+  // otherwise an empty PG::Info is returned for it.
+  // m is consumed: deleted here, or already queued/deleted by
+  // require_same_or_newer_map() when that returns false.
+  dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl;
+  int from = m->get_source().num();
+
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+
+  map< int, list<PG::Info> > notify_list;
+
+  for (map<pg_t,PG::Query>::iterator it = m->pg_list.begin();
+       it != m->pg_list.end();
+       it++) {
+    pg_t pgid = it->first;
+    PG *pg = 0;
+
+    if (pg_map.count(pgid) == 0) {
+      // same primary?
+      // (bring the querier's history forward from the message epoch to now)
+      PG::Info::History history = it->second.history;
+      project_pg_history(pgid, history, m->get_epoch());
+
+      if (m->get_epoch() < history.same_since) {
+        // report the epoch that was actually tested (same_since)
+        dout(10) << " pg " << pgid << " dne, and pg has changed in "
+                 << history.same_since << " (msg from " << m->get_epoch() << ")" << endl;
+        continue;
+      }
+
+      // get active rush mapping
+      vector<int> acting;
+      int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+      int role = osdmap->calc_pg_role(whoami, acting, nrep);
+
+      if (role < 0) {
+        dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl;
+        PG::Info empty(pgid);
+        notify_list[from].push_back(empty);
+        continue;
+      }
+      assert(role > 0);
+
+      // create the pg right away, in its own transaction
+      ObjectStore::Transaction t;
+      pg = create_pg(pgid, t);
+      pg->acting.swap( acting );
+      pg->set_role(role);
+      pg->info.history = history;
+
+      t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
+      store->apply_transaction(t);
+
+      dout(10) << *pg << " dne (before), but i am role " << role << endl;
+      _lock_pg(pgid);
+    } else {
+      pg = _lock_pg(pgid);
+
+      // same primary?
+      if (m->get_epoch() < pg->info.history.same_since) {
+        dout(10) << *pg << " handle_pg_query primary changed in "
+                 << pg->info.history.same_since
+                 << " (msg from " << m->get_epoch() << ")" << endl;
+        _unlock_pg(pgid);
+        continue;
+      }
+    }
+
+    // ok, process query!
+    // (the querier must be the first osd of the acting set)
+    assert(!pg->acting.empty());
+    assert(from == pg->acting[0]);
+
+    if (it->second.type == PG::Query::INFO) {
+      // info
+      dout(10) << *pg << " sending info" << endl;
+      notify_list[from].push_back(pg->info);
+    } else {
+      // named `reply` so it does not shadow the incoming query message m
+      MOSDPGLog *reply = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid());
+      reply->info = pg->info;
+      reply->missing = pg->missing;
+
+      if (it->second.type == PG::Query::LOG) {
+        dout(10) << *pg << " sending info+missing+log since split " << it->second.split
+                 << " from floor " << it->second.floor
+                 << endl;
+        // a divergent log cannot be sent as a suffix: upgrade the query to
+        // BACKLOG so the branch below handles it
+        if (!reply->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) {
+          dout(10) << *pg << " divergent, sending backlog" << endl;
+          it->second.type = PG::Query::BACKLOG;
+        }
+      }
+
+      if (it->second.type == PG::Query::BACKLOG) {
+        dout(10) << *pg << " sending info+missing+backlog" << endl;
+        if (pg->log.backlog) {
+          reply->log = pg->log;
+        } else {
+          // build the backlog just long enough to copy it
+          pg->generate_backlog();
+          reply->log = pg->log;
+          pg->drop_backlog();
+        }
+      }
+      else if (it->second.type == PG::Query::FULLLOG) {
+        dout(10) << *pg << " sending info+missing+full log" << endl;
+        reply->log.copy_non_backlog(pg->log);
+      }
+
+      dout(10) << *pg << " sending " << reply->log << " " << reply->missing << endl;
+      //reply->log.print(cout);
+
+      _share_map_outgoing(MSG_ADDR_OSD(from), osdmap->get_inst(from));
+      messenger->send_message(reply, MSG_ADDR_OSD(from), osdmap->get_inst(from));
+    }
+
+    _unlock_pg(pgid);
+  }
+
+  do_notifies(notify_list);
+
+  delete m;
+}
+
+
+void OSD::handle_pg_remove(MOSDPGRemove *m)
+{
+  // Remove every listed pg that we still hold; unknown pgs are skipped.
+  // m is deleted when done (or was consumed by the map check).
+  dout(7) << "handle_pg_remove from " << m->get_source() << endl;
+
+  if (!require_same_or_newer_map(m, m->get_epoch())) return;
+
+  set<pg_t>::iterator victim = m->pg_list.begin();
+  for ( ; victim != m->pg_list.end(); ++victim) {
+    pg_t pgid = *victim;
+
+    if (pg_map.count(pgid) == 0) {
+      dout(10) << " don't have pg " << pgid << endl;
+      continue;
+    }
+
+    PG *pg = _lock_pg(pgid);
+
+    dout(10) << *pg << " removing." << endl;
+    // only a pg we have no role in may be removed
+    assert(pg->get_role() == -1);
+
+    _remove_pg(pgid);
+
+    // unlock.  there shouldn't be any waiters, since we're a stray, and pg is presumably clean.
+    assert(pg_lock_waiters.count(pgid) == 0);
+    _unlock_pg(pgid);
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+/*** RECOVERY ***/
+
+/** pull - request object from a peer
+ */
+void OSD::pull(PG *pg, object_t oid)
+{
+  // Ask the osd recorded in pg->missing.loc for the version of oid we lack,
+  // and note that a pull is now outstanding.
+  assert(pg->missing.loc.count(oid));
+  eversion_t need = pg->missing.missing[oid];
+  int source = pg->missing.loc[oid];
+
+  dout(7) << *pg << " pull " << oid
+          << " v " << need
+          << " from osd" << source
+          << endl;
+
+  // build and send the pull request
+  tid_t tid = ++last_tid;
+  MOSDOp *req = new MOSDOp(tid, messenger->get_myaddr(),
+                           oid, pg->get_pgid(),
+                           osdmap->get_epoch(),
+                           OSD_OP_PULL);
+  req->set_version(need);
+  messenger->send_message(req, MSG_ADDR_OSD(source), osdmap->get_inst(source));
+
+  // bookkeeping: an object may only be pulled once at a time
+  assert(pg->objects_pulling.count(oid) == 0);
+  num_pulling++;
+  pg->objects_pulling[oid] = need;
+}
+
+
+/** push - send object to a peer
+ */
+void OSD::push(PG *pg, object_t oid, int dest)
+{
+  // Read an object (data, "version" attr, full attr set) from the store and
+  // send it to osd `dest` as an OSD_OP_PUSH.
+  bufferlist data;
+  eversion_t version;
+  int version_len = sizeof(version);
+  map<string,bufferptr> attrs;
+
+  // all three reads go through a single transaction
+  ObjectStore::Transaction rd;
+  rd.read(oid, 0, 0, &data);
+  rd.getattr(oid, "version", &version, &version_len);
+  rd.getattrs(oid, attrs);
+  unsigned result = store->apply_transaction(rd);
+
+  assert(result == 0);  // !!!
+
+  dout(7) << *pg << " push " << oid << " v " << version
+          << " size " << data.length()
+          << " to osd" << dest
+          << endl;
+
+  logger->inc("r_push");
+  logger->inc("r_pushb", data.length());
+
+  // assemble the push op
+  MOSDOp *msg = new MOSDOp(++last_tid, MSG_ADDR_OSD(whoami),
+                           oid, pg->info.pgid, osdmap->get_epoch(),
+                           OSD_OP_PUSH);
+  msg->set_offset(0);
+  msg->set_length(data.length());
+  msg->set_data(data);   // note: claims data, so the length was set above
+  msg->set_version(version);
+  msg->set_attrset(attrs);
+
+  messenger->send_message(msg, MSG_ADDR_OSD(dest), osdmap->get_inst(dest));
+}
+
+
+/** op_pull
+ * process request to pull an entire object.
+ * NOTE: called from opqueue.
+ */
+void OSD::op_pull(MOSDOp *op, PG *pg)
+{
+  // Serve a peer's request for a whole object by pushing it back.
+  // op is consumed on every path: deleted here, or handed to
+  // waitfor_missing_object() when we are primary and still missing the
+  // object ourselves.
+  const object_t oid = op->get_oid();
+  const eversion_t v = op->get_version();
+  int from = op->get_source().num();
+
+  dout(7) << *pg << " op_pull " << oid << " v " << v
+          << " from " << op->get_source()
+          << endl;
+
+  // is a replica asking?  are they missing it?
+  if (pg->is_primary()) {
+    // primary
+    assert(pg->peer_missing.count(from));  // we had better know this, from the peering process.
+
+    if (!pg->peer_missing[from].is_missing(oid)) {
+      dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl;
+      delete op;
+      return;
+    }
+
+    // do we have it yet?
+    if (waitfor_missing_object(op, pg))
+      return;
+  } else {
+    // non-primary
+    if (pg->missing.is_missing(oid)) {
+      dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl;
+      delete op;
+      return;
+    }
+  }
+
+  // push it back!
+  push(pg, oid, from);
+
+  // the request is finished; free it as the early-return paths above do
+  // (it was previously leaked on this path)
+  delete op;
+}
+
+
+/** op_push
+ * NOTE: called from opqueue.
+ */
+void OSD::op_push(MOSDOp *op, PG *pg)
+{
+  // Accept an object pushed by a peer: write it and its attrs, mark it
+  // recovered, advance last_complete, forward it to replicas that still
+  // miss it (when primary), and resume recovery.
+  // op is deleted on every path out of this function.
+  object_t oid = op->get_oid();
+  eversion_t v = op->get_version();
+
+  if (!pg->missing.is_missing(oid)) {
+    dout(7) << *pg << " op_push not missing " << oid << endl;
+    // free the message here too; this early return previously leaked it
+    delete op;
+    return;
+  }
+
+  dout(7) << *pg << " op_push "
+          << oid
+          << " v " << v
+          << " size " << op->get_length() << " " << op->get_data().length()
+          << endl;
+
+  assert(op->get_data().length() == op->get_length());
+
+  // write object and add it to the PG
+  ObjectStore::Transaction t;
+  t.remove(oid);  // in case old version exists
+  t.write(oid, 0, op->get_length(), op->get_data());
+  t.setattrs(oid, op->get_attrset());
+  t.collection_add(pg->info.pgid, oid);
+
+  // close out pull op?
+  // NOTE(review): num_pulling is decremented even when oid is not in
+  // objects_pulling (i.e. we never pulled it) -- confirm this is intended
+  num_pulling--;
+  if (pg->objects_pulling.count(oid))
+    pg->objects_pulling.erase(oid);
+  pg->missing.got(oid, v);
+
+
+  // raise last_complete?
+  // walk complete_to forward over log entries whose objects are no longer
+  // missing, stopping at the first one that still is
+  assert(pg->log.complete_to != pg->log.log.end());
+  while (pg->log.complete_to != pg->log.log.end()) {
+    if (pg->missing.missing.count(pg->log.complete_to->oid)) break;
+    if (pg->info.last_complete < pg->log.complete_to->version)
+      pg->info.last_complete = pg->log.complete_to->version;
+    pg->log.complete_to++;
+  }
+  dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl;
+
+
+  // apply to disk!
+  t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info));
+  unsigned r = store->apply_transaction(t);
+  assert(r == 0);
+
+
+
+  // am i primary?  are others missing this too?
+  // (acting[0] is skipped; only the remaining acting osds are checked)
+  if (pg->is_primary()) {
+    for (unsigned i=1; i<pg->acting.size(); i++) {
+      int peer = pg->acting[i];
+      assert(pg->peer_missing.count(peer));
+      if (pg->peer_missing[peer].is_missing(oid)) {
+        // ok, push it, and they (will) have it now.
+        pg->peer_missing[peer].got(oid, v);
+        push(pg, oid, peer);
+      }
+    }
+  }
+
+  // continue recovery
+  pg->do_recovery();
+
+  // kick waiters
+  if (pg->waiting_for_missing_object.count(oid))
+    take_waiters(pg->waiting_for_missing_object[oid]);
+
+  delete op;
+}
+
+
+
+
+// op_rep_modify
+
+// commit (to disk) callback
+class C_OSD_RepModifyCommit : public Context {
+  // Commit callback for a replicated modify.  finish() does not forward the
+  // commit to the OSD until ack() has been called; if it runs first, it
+  // waits on `cond` for ack() to signal.
+public:
+  OSD *osd;
+  MOSDOp *op;
+  int destosd;
+
+  eversion_t pg_last_complete;
+
+  Mutex lock;
+  Cond cond;
+  bool acked;     // set once by ack()
+  bool waiting;   // set by finish() when it has to wait for ack()
+
+  C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) :
+    osd(o), op(oo), destosd(dosd), pg_last_complete(lc),
+    acked(false), waiting(false) { }
+
+  void finish(int r) {
+    lock.Lock();
+    assert(!waiting);
+    for (;;) {
+      if (acked)
+        break;
+      waiting = true;
+      cond.Wait(lock);
+    }
+    assert(acked);
+    lock.Unlock();
+
+    osd->op_rep_modify_commit(op, destosd, pg_last_complete);
+  }
+
+  void ack() {
+    lock.Lock();
+    assert(!acked);
+    acked = true;
+    if (waiting)
+      cond.Signal();
+
+    // discard my reference to buffer
+    op->get_data().clear();
+
+    lock.Unlock();
+  }
+};
+
+void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete)
+{
+  // Tell the acker osd that this replicated op has committed, then free the op.
+  dout(10) << "rep_modify_commit on op " << *op
+           << ", sending commit to osd" << ackerosd
+           << endl;
+
+  MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
+  reply->set_pg_complete_thru(last_complete);
+  messenger->send_message(reply, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+
+  delete op;
+}
+
+// process a modification operation
+
+class C_OSD_WriteCommit : public Context {
+  // Commit callback for a modify: forwards the stored pg id, rep tid and
+  // last_complete to OSD::op_modify_commit().
+public:
+  OSD *osd;
+  pg_t pgid;
+  tid_t rep_tid;
+  eversion_t pg_last_complete;
+
+  C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc)
+    : osd(o),
+      pgid(p),
+      rep_tid(rt),
+      pg_last_complete(lc)
+  { }
+
+  void finish(int r) {
+    osd->op_modify_commit(pgid, rep_tid, pg_last_complete);
+  }
+};
+
+
+/** op_rep_modify
+ * process a replicated modify.
+ * NOTE: called from opqueue.
+ *
+ * Handles a modify op whose source is another osd (do_op routes such ops
+ * here). The version to apply is taken from the op; none is assigned here.
+ *
+ * Two paths, selected by whether a RepOpGather is in play:
+ *  - acker (chain or splay replication, and pg->is_acker()): the op is
+ *    tied to a gather (existing one for this rep_tid, or a new one), the
+ *    sender's ack is inferred, the log/op changes are prepared into
+ *    repop->t, and our own local ack is recorded. put_repop_gather() is
+ *    what eventually applies the transaction and replies.
+ *  - middle/replica (everything else): the log/op changes are prepared
+ *    into a local transaction and applied right away with a
+ *    C_OSD_RepModifyCommit as commit callback; unless we are doing chain
+ *    replication an ack is sent to the acker; finally the callback is
+ *    ack()ed so that it may report the commit.
+ *
+ * In chain replication a non-acker also forwards the op to the next osd
+ * in pg->acting before doing its own work.
+ *
+ * op: on the replica path it is owned by the commit callback, which
+ *     deletes it via op_rep_modify_commit(). On the acker path a newly
+ *     created gather keeps it.
+ *     NOTE(review): when an existing gather is found for the rep_tid,
+ *     nothing here visibly takes ownership of op -- confirm who frees it.
+ * pg: the op's placement group; must not be missing the object.
+ */
+void OSD::op_rep_modify(MOSDOp *op, PG *pg)
+{
+ object_t oid = op->get_oid();
+ eversion_t nv = op->get_version(); // version chosen by the sender
+
+ const char *opname = MOSDOp::get_opname(op->get_op());
+
+ // check crev
+ objectrev_t crev = 0;
+ store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); // result ignored; crev keeps its 0 initializer unless getattr fills it in
+
+ dout(10) << "op_rep_modify " << opname
+ << " " << oid
+ << " v " << nv
+ << " " << op->get_offset() << "~" << op->get_length()
+ << " in " << *pg
+ << endl;
+
+ // we better not be missing this.
+ assert(!pg->missing.is_missing(oid));
+
+ // prepare our transaction (only used on the middle|replica path below)
+ ObjectStore::Transaction t;
+
+ // am i acker?
+ PG::RepOpGather *repop = 0; // stays 0 unless we turn out to be the acker
+ int ackerosd = pg->acting[0]; // default; replaced by get_acker() for chain/splay
+
+ if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) {
+ ackerosd = pg->get_acker();
+
+ if (pg->is_acker()) {
+ // i am tail acker.
+ if (pg->repop_gather.count(op->get_rep_tid())) {
+ repop = pg->repop_gather[ op->get_rep_tid() ];
+ } else {
+ repop = new_repop_gather(pg, op);
+ }
+
+ // infer ack from source
+ int fromosd = op->get_source().num();
+ get_repop_gather(repop);
+ {
+ //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice.
+ repop->waitfor_ack.erase(fromosd);
+ }
+ put_repop_gather(pg, repop); // may send replies if this completed a wait set
+
+ // prepare dest socket
+ //messenger->prepare_send_message(op->get_client());
+ }
+
+ // chain? forward?
+ if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) {
+ // chain rep, not at the tail yet.
+ int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
+ int next = myrank+1;
+ if (next == (int)pg->acting.size()) // ran off the end of acting:
+ next = 1; // wrap to index 1 (index 0 is skipped)
+ issue_repop(pg, op, pg->acting[next]);
+ }
+ }
+
+ // do op?
+ C_OSD_RepModifyCommit *oncommit = 0; // only created on the middle|replica path
+
+ logger->inc("r_wr");
+ logger->inc("r_wrb", op->get_length());
+
+ if (repop) {
+ // acker. we'll apply later.
+ if (op->get_op() != OSD_OP_WRNOOP) {
+ prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
+ prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
+ }
+ } else {
+ // middle|replica.
+ if (op->get_op() != OSD_OP_WRNOOP) {
+ prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to());
+ prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
+ }
+
+ oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete);
+
+ // apply log update. and possibly update itself.
+ unsigned tr = store->apply_transaction(t, oncommit);
+ if (tr != 0 && // no errors
+ tr != 2) { // or error on collection_add
+ cerr << "error applying transaction: r = " << tr << endl;
+ assert(tr == 0); // always fails here: any other result is treated as fatal
+ }
+ }
+
+ // ack?
+ if (repop) {
+ // (logical) local ack. this may induce the actual update.
+ get_repop_gather(repop);
+ {
+ assert(repop->waitfor_ack.count(whoami));
+ repop->waitfor_ack.erase(whoami);
+ }
+ put_repop_gather(pg, repop);
+ }
+ else {
+ // send ack to acker? (not for chain replication)
+ if (g_conf.osd_rep != OSD_REP_CHAIN) {
+ MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false);
+ messenger->send_message(ack, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd));
+ }
+
+ // ack myself. this lets the commit callback proceed once the commit arrives.
+ assert(oncommit);
+ oncommit->ack();
+ }
+}
+
+
+// =========================================================
+// OPS
+/** handle_op
+ * entry point for an incoming MOSDOp: validate it, then run or queue it.
+ *
+ * At the end the op is either executed immediately through do_op()
+ * (g_conf.osd_maxthreads < 1, with the pg locked around the call) or
+ * handed to enqueue_op() for the worker threads. Before that the op may be
+ *  - given up on when require_same_or_newer_map() returns false,
+ *  - parked on a wait list (waiting_for_pg, pg->replay_queue,
+ *    pg->waiting_for_active, pg->waiting_for_missing_object),
+ *  - forwarded to another osd of the acting set (read balancing), or
+ *  - deleted (map epoch older than the relevant pg history marker, or a
+ *    replication op for a pg that does not exist here).
+ *
+ * Ops whose source is not an osd take the "regular" path; ops sent by
+ * another osd take the replication path.
+ *
+ * NOTE(review): enqueue_op() waits on osd_lock, so this presumably runs
+ * with osd_lock held -- confirm against the caller.
+ */
+void OSD::handle_op(MOSDOp *op)
+{
+ const pg_t pgid = op->get_pg();
+ PG *pg = get_pg(pgid); // may be null; each path below checks before use
+
+
+ logger->set("buf", buffer_total_alloc);
+
+ // update qlen stats
+ hb_stat_ops++;
+ hb_stat_qlen += pending_ops;
+
+
+ // require same or newer map
+ if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
+
+ // share our map with sender, if they're old
+ _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+
+ // what kind of op?
+ bool read = op->get_op() < 10; // read, stat. but not pull.
+
+ if (!op->get_source().is_osd()) {
+ // REGULAR OP (non-replication)
+
+ // note original source
+ op->set_client_inst( op->get_source_inst() );
+ op->clear_payload(); // and hose encoded payload (in case we forward)
+
+ // have pg?
+ if (!pg) {
+ dout(7) << "hit non-existent pg "
+ << pgid
+ << ", waiting" << endl;
+ waiting_for_pg[pgid].push_back(op);
+ return;
+ }
+
+ if (read) {
+ // read. am i the (same) acker?
+ if (//pg->get_acker() != whoami ||   (check disabled: only the epoch is compared)
+ op->get_map_epoch() < pg->info.history.same_acker_since) {
+ dout(7) << "acting acker is osd" << pg->get_acker()
+ << " since " << pg->info.history.same_acker_since
+ << ", dropping" << endl;
+ assert(op->get_map_epoch() < osdmap->get_epoch());
+ delete op;
+ return;
+ }
+ } else {
+ // write. am i the (same) primary?
+ if (pg->get_primary() != whoami ||
+ op->get_map_epoch() < pg->info.history.same_primary_since) {
+ dout(7) << "acting primary is osd" << pg->get_primary()
+ << " since " << pg->info.history.same_primary_since
+ << ", dropping" << endl;
+ assert(op->get_map_epoch() < osdmap->get_epoch());
+ delete op;
+ return;
+ }
+ }
+
+ // must be active.
+ if (!pg->is_active()) {
+ // replay? (the op carries a nonzero version)
+ if (op->get_version().version > 0) {
+ if (op->get_version() > pg->info.last_update) {
+ dout(7) << *pg << " queueing replay at " << op->get_version()
+ << " for " << *op << endl;
+ pg->replay_queue[op->get_version()] = op;
+ return;
+ } else {
+ dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update
+ << " for " << *op
+ << ", will queue for WRNOOP" << endl;
+ }
+ }
+
+ dout(7) << *pg << " not active (yet)" << endl;
+ pg->waiting_for_active.push_back(op);
+ return;
+ }
+
+ // missing object?
+ if (read && op->get_oid().rev > 0) {
+ // versioned read. hrm.
+ // are we missing a revision that we might need?
+ object_t moid = op->get_oid();
+ if (pick_missing_object_rev(moid, pg)) { // on success moid is the missing revision
+ // is there a local revision we might use instead?
+ object_t loid = op->get_oid();
+ if (store->pick_object_revision_lt(loid) &&
+ moid <= loid) {
+ // we need moid. pull it.
+ dout(10) << "handle_op read on " << op->get_oid()
+ << ", have " << loid
+ << ", but need missing " << moid
+ << ", pulling" << endl;
+ pull(pg, moid);
+ pg->waiting_for_missing_object[moid].push_back(op);
+ return;
+ }
+
+ dout(10) << "handle_op read on " << op->get_oid()
+ << ", have " << loid
+ << ", don't need missing " << moid
+ << endl;
+ }
+ } else {
+ // live revision. easy.
+ if (op->get_op() != OSD_OP_PUSH &&
+ waitfor_missing_object(op, pg)) return; // op was queued behind a pull
+ }
+
+ dout(7) << "handle_op " << *op << " in " << *pg << endl;
+
+
+ // balance reads?
+ if (read &&
+ g_conf.osd_balance_reads &&
+ pg->get_acker() == whoami) {
+ // test (compiled in but never taken)
+ if (false) {
+ if (pg->acting.size() > 1) {
+ int peer = pg->acting[1];
+ dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl;
+ messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+ return;
+ }
+ }
+
+ // am i above my average?
+ float my_avg = hb_stat_qlen / hb_stat_ops; // NOTE(review): truncating division if both counters are integral -- confirm their types
+ if (pending_ops > my_avg) {
+ // is there a peer who is below my average?
+ for (unsigned i=1; i<pg->acting.size(); ++i) {
+ int peer = pg->acting[i];
+ if (peer_qlen.count(peer) &&
+ peer_qlen[peer] < my_avg) {
+ // calculate a probability that we should redirect
+ float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb.
+
+ if (drand48() <= p) {
+ // take the first one
+ dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg
+ << ", p=" << p
+ << ", fwd to peer w/ qlen " << peer_qlen[peer]
+ << " osd" << peer
+ << endl;
+ messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer));
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ } else {
+ // REPLICATION OP (it's from another OSD)
+
+ // have pg?
+ if (!pg) {
+ derr(-7) << "handle_rep_op " << *op
+ << " pgid " << pgid << " dne" << endl;
+ delete op;
+ //assert(0); // wtf, shouldn't happen.
+ return;
+ }
+
+ // check osd map: same set, or primary+acker?
+ if (g_conf.osd_rep == OSD_REP_CHAIN &&
+ op->get_map_epoch() < pg->info.history.same_since) {
+ dout(10) << "handle_rep_op pg changed " << pg->info.history
+ << " after " << op->get_map_epoch()
+ << ", dropping" << endl;
+ delete op;
+ return;
+ }
+ if (g_conf.osd_rep != OSD_REP_CHAIN &&
+ (op->get_map_epoch() < pg->info.history.same_primary_since ||
+ op->get_map_epoch() < pg->info.history.same_acker_since)) {
+ dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history
+ << " after " << op->get_map_epoch()
+ << ", dropping" << endl;
+ delete op;
+ return;
+ }
+
+ assert(pg->get_role() >= 0);
+ dout(7) << "handle_rep_op " << op << " in " << *pg << endl; // NOTE(review): streams the pointer, not *op as the other messages do
+ }
+
+ if (g_conf.osd_maxthreads < 1) {
+ _lock_pg(pgid);
+ do_op(op, pg); // do it now
+ _unlock_pg(pgid);
+ } else {
+ // queue for worker threads
+ if (read)
+ enqueue_op(0, op); // no locking needed for reads
+ else
+ enqueue_op(pgid, op);
+ }
+}
+
+/** handle_op_reply
+ * entry point for an MOSDOpReply sent by another osd (a replication
+ * ack or commit).
+ *
+ * Replies from before this osd booted, and replies for a pg that does
+ * not exist here, are deleted.  Anything else is executed through
+ * do_op(), either right away (g_conf.osd_maxthreads < 1, pg locked
+ * around the call) or via enqueue_op().
+ *
+ * If require_same_or_newer_map() returns false the reply is left to it
+ * and nothing more happens here.
+ */
+void OSD::handle_op_reply(MOSDOpReply *op)
+{
+  if (op->get_map_epoch() < boot_epoch) {
+    dout(3) << "replica op reply from before boot" << endl;
+    delete op;
+    return;
+  }
+
+  // must be a rep op.
+  assert(op->get_source().is_osd());
+
+  // make sure we have the pg
+  const pg_t pgid = op->get_pg();
+  PG *pg = get_pg(pgid);
+
+  // require same or newer map
+  if (!require_same_or_newer_map(op, op->get_map_epoch())) return;
+
+  // share our map with sender, if they're old
+  _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch());
+
+  if (!pg) {
+    // hmm.  nothing to apply the reply to.  drop it and stop here:
+    // carrying on would pass a freed message and a null pg to do_op().
+    dout(7) << "handle_op_reply pg " << pgid << " dne, dropping" << endl;
+    delete op;
+    return;
+  }
+
+  if (g_conf.osd_maxthreads < 1) {
+    _lock_pg(pgid);
+    do_op(op, pg); // do it now
+    _unlock_pg(pgid);
+  } else {
+    enqueue_op(pgid, op);     // queue for worker threads
+  }
+}
+
+
+/*
+ * enqueue_op - append op to the queue for pgid and notify the thread pool.
+ *
+ * called with osd_lock held.  blocks on op_queue_cond (releasing
+ * osd_lock while waiting) for as long as pending_ops exceeds
+ * g_conf.osd_max_opq.
+ */
+void OSD::enqueue_op(pg_t pgid, Message *op)
+{
+  // throttle
+  for (;;) {
+    if (!(pending_ops > g_conf.osd_max_opq))
+      break;
+    dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl;
+    op_queue_cond.Wait(osd_lock);
+  }
+
+  // queue it
+  op_queue[pgid].push_back(op);
+  ++pending_ops;
+  logger->set("opq", pending_ops);
+
+  // wake a worker
+  threadpool->put_op(pgid);
+}
+
+/*
+ * dequeue_op - take the next queued op for pgid and execute it.
+ *
+ * NOTE: dequeue called in worker thread, without osd_lock.  osd_lock is
+ * taken here for the queue bookkeeping before and after the op, and is
+ * not held while do_op() runs.
+ *
+ * For a nonzero pgid the pg is locked (_lock_pg) before the op runs and
+ * unlocked afterwards; for pgid 0 no pg is locked and do_op() gets a
+ * null pg.
+ *
+ * After the op, pending_ops is decremented; a throttled enqueue_op() is
+ * signalled if the count was above g_conf.osd_max_opq, and
+ * wait_for_no_ops() is signalled when the count reaches zero.
+ */
+void OSD::dequeue_op(pg_t pgid)
+{
+  Message *op = 0;
+  PG *pg = 0;
+
+  osd_lock.Lock();
+  {
+    if (pgid) {
+      // lock pg
+      pg = _lock_pg(pgid);
+    }
+
+    // get pending op
+    list<Message*> &ls = op_queue[pgid];
+    assert(!ls.empty());
+    op = ls.front();
+    ls.pop_front();
+
+    if (pgid) {
+      // separate pgid from the queue length so the two don't run together
+      dout(10) << "dequeue_op " << op << " write pg " << pgid << " "
+               << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+    } else {
+      dout(10) << "dequeue_op " << op << " read "
+               << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+    }
+
+    // don't leave empty per-pg queues behind
+    if (ls.empty())
+      op_queue.erase(pgid);
+  }
+  osd_lock.Unlock();
+
+  // do it
+  do_op(op, pg);
+
+  // finish
+  osd_lock.Lock();
+  {
+    if (pgid) {
+      // unlock pg
+      _unlock_pg(pgid);
+    }
+
+    dout(10) << "dequeue_op " << op << " finish" << endl;
+    assert(pending_ops > 0);
+
+    // tested before the decrement: enqueue_op() only waits while
+    // pending_ops > osd_max_opq.
+    if (pending_ops > g_conf.osd_max_opq)
+      op_queue_cond.Signal();
+
+    pending_ops--;
+    logger->set("opq", pending_ops);
+    if (pending_ops == 0 && waiting_for_no_ops)
+      no_pending_ops.Signal();
+  }
+  osd_lock.Unlock();
+}
+
+
+
+/** do_op - do an op
+ * object lock will be held (if multithreaded)
+ * osd_lock NOT held.
+ *
+ * MSG_OSD_OP messages are dispatched on their op code to the read, pull/push
+ * or modify handlers; MSG_OSD_OPREPLY messages are fed to the matching
+ * repop gather, or parked in pg->waiting_for_repop if there is none
+ * yet.  any other message type trips an assert.
+ */
+void OSD::do_op(Message *m, PG *pg)
+{
+  //dout(15) << "do_op " << *m << endl;
+
+  if (m->get_type() == MSG_OSD_OP) {
+    MOSDOp *op = (MOSDOp*)m;
+
+    logger->inc("op");
+
+    switch (op->get_op()) {
+      // reads
+    case OSD_OP_READ:
+      op_read(op);//, pg);
+      break;
+
+    case OSD_OP_STAT:
+      op_stat(op);//, pg);
+      break;
+
+      // rep stuff
+    case OSD_OP_PULL:
+      op_pull(op, pg);
+      break;
+
+    case OSD_OP_PUSH:
+      op_push(op, pg);
+      break;
+
+      // writes
+    case OSD_OP_WRNOOP:
+    case OSD_OP_WRITE:
+    case OSD_OP_ZERO:
+    case OSD_OP_DELETE:
+    case OSD_OP_TRUNCATE:
+    case OSD_OP_WRLOCK:
+    case OSD_OP_WRUNLOCK:
+    case OSD_OP_RDLOCK:
+    case OSD_OP_RDUNLOCK:
+    case OSD_OP_UPLOCK:
+    case OSD_OP_DNLOCK:
+      // anything sent by an osd is a replicated modify
+      if (!op->get_source().is_osd())
+        op_modify(op, pg);
+      else
+        op_rep_modify(op, pg);
+      break;
+
+    default:
+      assert(0);
+    }
+    return;
+  }
+
+  if (m->get_type() != MSG_OSD_OPREPLY) {
+    assert(0);
+    return;
+  }
+
+  // must be replication.
+  MOSDOpReply *reply = (MOSDOpReply*)m;
+  tid_t rep_tid = reply->get_rep_tid();
+
+  if (!pg->repop_gather.count(rep_tid)) {
+    // early ack.
+    pg->waiting_for_repop[rep_tid].push_back(reply);
+    return;
+  }
+
+  // oh, good.
+  int fromosd = reply->get_source().num();
+  repop_ack(pg, pg->repop_gather[rep_tid],
+            reply->get_result(), reply->get_commit(),
+            fromosd,
+            reply->get_pg_complete_thru());
+  delete m;
+}
+
+
+
+/*
+ * Block until pending_ops has dropped to zero.  Waits on no_pending_ops
+ * with osd_lock, which dequeue_op() signals once the count hits zero
+ * while waiting_for_no_ops is set.
+ */
+void OSD::wait_for_no_ops()
+{
+  if (pending_ops > 0) {
+    dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl;
+
+    waiting_for_no_ops = true;
+    do {
+      no_pending_ops.Wait(osd_lock);
+    } while (pending_ops > 0);
+    waiting_for_no_ops = false;
+
+    assert(pending_ops == 0);
+  }
+
+  dout(7) << "wait_for_no_ops - none" << endl;
+}
+
+
+// ==============================
+// Object locking
+
+//
+// block_if_wrlocked
+// If the object targeted by op carries a "wrlock" attribute naming a
+// different client than op's, op is appended to
+// waiting_for_wr_unlock[oid] and true is returned.
+// Otherwise (no usable attribute, or the lock is held by op's own client)
+// nothing is queued and false is returned.
+//
+bool OSD::block_if_wrlocked(MOSDOp* op)
+{
+  object_t oid = op->get_oid();
+
+  // who (if anyone) holds the write lock?
+  msg_addr_t holder;
+  int len = store->getattr(oid, "wrlock", &holder, sizeof(msg_addr_t));
+  //cout << "getattr returns " << len << " on " << oid << endl;
+
+  if (len == sizeof(holder)) {
+    if (holder != op->get_client()) {
+      // locked for writing by someone else -- park the op
+      waiting_for_wr_unlock[oid].push_back(op);
+      return true;
+    }
+  }
+
+  // not locked against this client; the op can go ahead right away
+  return false;
+}
+
+
+
+// ===============================
+// OPS
+
+/*
+int OSD::list_missing_revs(object_t oid, set<object_t>& revs, PG *pg)
+{
+ int c = 0;
+ oid.rev = 0;
+
+ map<object_t,eversion_t>::iterator p = pg->missing.missing.lower_bound(oid);
+ if (p == pg->missing.missing.end())
+ return 0; // clearly not
+
+ while (p->first.ino == oid.ino &&
+ p->first.bno == oid.bno) {
+ revs.insert(p->first);
+ c++;
+ }
+ return c;
+}*/
+
+/*
+ * Look at the first entry of pg->missing.missing that sorts after oid.
+ * If it names the same object (same ino and bno), oid is replaced by
+ * that entry's key and true is returned; otherwise oid is left alone
+ * and false is returned.
+ */
+bool OSD::pick_missing_object_rev(object_t& oid, PG *pg)
+{
+  map<object_t,eversion_t>::iterator next = pg->missing.missing.upper_bound(oid);
+
+  // clearly no candidate
+  if (next == pg->missing.missing.end())
+    return false;
+
+  // a different object altogether?
+  const bool same_object = (next->first.ino == oid.ino &&
+                            next->first.bno == oid.bno);
+  if (!same_object)
+    return false;
+
+  // yes! it's an upper bound revision for me.
+  oid = next->first;
+  return true;
+}
+
+/*
+ * Ask the store for a local revision of oid (pick_object_revision_lt on
+ * a copy) and accept it if its "crev" attribute is <= oid.rev.
+ * On success oid is replaced by the chosen revision and true is
+ * returned; otherwise oid is untouched and false is returned.
+ */
+bool OSD::pick_object_rev(object_t& oid)
+{
+  object_t candidate = oid;
+
+  // we have no revisions of this object!
+  if (!store->pick_object_revision_lt(candidate))
+    return false;
+
+  objectrev_t crev;
+  int r = store->getattr(candidate, "crev", &crev, sizeof(crev));
+  assert(r >= 0);
+
+  if (!(crev <= oid.rev))
+    return false;
+
+  dout(10) << "pick_object_rev choosing " << candidate << " crev " << crev << " for " << oid << endl;
+  oid = candidate;
+  return true;
+}
+
+/*
+ * If the object op refers to is listed in pg->missing.missing, start a
+ * pull for it (unless one is already recorded in pg->objects_pulling),
+ * queue op on pg->waiting_for_missing_object[oid] and return true.
+ * Returns false, without touching op, when the object is not missing.
+ */
+bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg)
+{
+  const object_t oid = op->get_oid();
+
+  // are we missing the object?
+  map<object_t,eversion_t>::iterator m = pg->missing.missing.find(oid);
+  if (m == pg->missing.missing.end())
+    return false;
+
+  // we don't have it (yet).
+  eversion_t v = m->second;
+  const bool in_flight = pg->objects_pulling.count(oid);
+
+  dout(7) << "missing "
+          << oid
+          << " v " << v
+          << " in " << *pg
+          << (in_flight ? ", already pulling" : ", pulling")
+          << endl;
+
+  if (!in_flight)
+    pull(pg, oid);
+
+  pg->waiting_for_missing_object[oid].push_back(op);
+  return true;
+}
+
+
+
+
+// READ OPS
+
+/** op_read
+ * client read op
+ * NOTE: called from opqueue.
+ *
+ * Unless the op gets parked behind a write lock, a reply is always sent
+ * to the client and op is deleted.  On success the reply has result 0,
+ * the data read and its length; on failure it has the error code and
+ * length 0.
+ */
+void OSD::op_read(MOSDOp *op)//, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  // locked for writing by another client?  then the op waits until the
+  // object unlocks.  (applies to _any_ op type -- eg only the locker can unlock!)
+  if (block_if_wrlocked(op))
+    return;
+
+  dout(10) << "op_read " << oid
+           << " " << op->get_offset() << "~" << op->get_length()
+           //<< " in " << *pg
+           << endl;
+
+  bufferlist bl;
+
+  // a versioned oid must first be resolved to a revision we hold.
+  const bool resolved = !oid.rev || pick_object_rev(oid);
+
+  long r = 0;
+  if (resolved) {
+    // read into a buffer
+    r = store->read(oid,
+                    op->get_offset(), op->get_length(),
+                    bl);
+  } else {
+    // we have no revision for this request.
+    r = -EEXIST;
+  }
+
+  // set up reply
+  MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
+  if (r < 0) {
+    reply->set_result(r);   // error
+    reply->set_length(0);
+  } else {
+    reply->set_result(0);
+    reply->set_data(bl);
+    reply->set_length(r);
+
+    logger->inc("c_rd");
+    logger->inc("c_rdb", r);
+  }
+
+  dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl;
+
+  logger->inc("rd");
+  if (r >= 0)
+    logger->inc("rdb", r);
+
+  // send it
+  messenger->send_message(reply, op->get_client(), op->get_client_inst());
+
+  delete op;
+}
+
+
+/** op_stat
+ * client stat
+ * NOTE: called from opqueue
+ *
+ * Unless the op gets parked behind a write lock, a reply carrying the
+ * stat result code and the object size is sent to the client and op is
+ * deleted.  When the stat fails (or no revision matches a versioned
+ * oid) the reported size comes from the zeroed stat buffer.
+ */
+void OSD::op_stat(MOSDOp *op)//, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  // if the target object is locked for writing by another client, put 'op' to the waiting queue
+  if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks
+
+  // zero the buffer first: st.st_size is logged and sent back even when
+  // nothing fills it in.  (memset takes the fill value before the length.)
+  struct stat st;
+  memset(&st, 0, sizeof(st));
+  int r = 0;
+
+  if (oid.rev && !pick_object_rev(oid)) {
+    // we have no revision for this request.
+    r = -EEXIST;
+  } else {
+    r = store->stat(oid, &st);
+  }
+
+  dout(3) << "op_stat on " << oid
+          << " r = " << r
+          << " size = " << st.st_size
+          //<< " in " << *pg
+          << endl;
+
+  MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true);
+  reply->set_object_size(st.st_size);
+  messenger->send_message(reply, op->get_client(), op->get_client_inst());
+
+  logger->inc("stat");
+
+  delete op;
+}
+
+
+
+/*********
+ * new repops
+ */
+
+/*
+ * Counterpart of put_repop_gather().  Currently this only logs; the
+ * per-repop locking is commented out.
+ */
+void OSD::get_repop_gather(PG::RepOpGather *repop)
+{
+  //repop->lock.Lock();
+  dout(10) << "get_repop " << *repop << endl;
+}
+
+/*
+ * Apply the gather's prepared transaction to the store (at most once per
+ * repop).  The commit is reported later through a C_OSD_WriteCommit,
+ * which calls op_modify_commit().  A nonzero result from the store is
+ * only logged.
+ */
+void OSD::apply_repop(PG *pg, PG::RepOpGather *repop)
+{
+  dout(10) << "apply_repop applying update on " << *repop << endl;
+  assert(!repop->applied);
+
+  Context *oncommit = new C_OSD_WriteCommit(this,
+                                            pg->info.pgid,
+                                            repop->rep_tid,
+                                            repop->pg_local_last_complete);
+  unsigned result = store->apply_transaction(repop->t, oncommit);
+  if (result != 0) {
+    dout(-10) << "apply_repop apply transaction return " << result << " on " << *repop << endl;
+  }
+
+  // the op's data buffer is not needed any more; drop our reference
+  repop->op->get_data().clear();
+
+  repop->applied = true;
+}
+
+/*
+ * Re-evaluate a gather after its wait sets changed.
+ *
+ * In order:
+ *  - if a commit can be sent and the client wants one, send it;
+ *  - else if an ack can be sent and the client wants one, apply the
+ *    update locally (apply_repop), send the ack and record the latency;
+ *  - finally, if the gather can be deleted, raise pg->peers_complete_thru
+ *    where possible, drop the gather from pg->repop_gather and free both
+ *    the gather and its op.
+ *
+ * repop (and repop->op) must not be used by the caller afterwards, as
+ * they may have been deleted.
+ */
+void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop)
+{
+  dout(10) << "put_repop " << *repop << endl;
+
+  if (repop->can_send_commit() &&
+      repop->op->wants_commit()) {
+    // commit: tell the client.
+    MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true);
+    dout(10) << "put_repop  sending commit on " << *repop << " " << reply << endl;
+    messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+    repop->sent_commit = true;
+  }
+  else if (repop->can_send_ack() &&
+           repop->op->wants_ack()) {
+    // ack: apply first ...
+    apply_repop(pg, repop);
+
+    // ... then tell the client.
+    MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false);
+    dout(10) << "put_repop  sending ack on " << *repop << " " << reply << endl;
+    messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst());
+    repop->sent_ack = true;
+
+    // latency stats: time elapsed since the gather was created
+    utime_t elapsed = g_clock.now();
+    elapsed -= repop->start;
+    logger->finc("rlsum", elapsed);
+    logger->inc("rlnum", 1);
+  }
+
+  // done?
+  if (!repop->can_delete()) {
+    //repop->lock.Unlock();
+    return;
+  }
+
+  // adjust peers_complete_thru
+  if (!repop->pg_complete_thru.empty()) {
+    eversion_t lowest = pg->info.last_complete;  // hrm....
+    for (unsigned i = 0; i < pg->acting.size(); ++i) {
+      // note: if we haven't heard from an osd, its entry will be zero,
+      // which is what we want.
+      eversion_t thru = repop->pg_complete_thru[pg->acting[i]];
+      if (thru < lowest)
+        lowest = thru;
+    }
+
+    if (lowest > pg->peers_complete_thru) {
+      dout(10) << "put_repop  peers_complete_thru " << pg->peers_complete_thru << " -> " << lowest << " in " << *pg << endl;
+      pg->peers_complete_thru = lowest;
+    }
+  }
+
+  dout(10) << "put_repop  deleting " << *repop << endl;
+  //repop->lock.Unlock();
+
+  assert(pg->repop_gather.count(repop->rep_tid));
+  pg->repop_gather.erase(repop->rep_tid);
+
+  delete repop->op;
+  delete repop;
+}
+
+
+/*
+ * Send a copy of a modify op to one replica.
+ *
+ * The copy carries the original's tid, client, object, op code, data,
+ * offset/length, version and rep_tid, is stamped with the current map
+ * epoch, and tells the replica how far it may trim
+ * (pg->peers_complete_thru).  The original op is not consumed.
+ */
+void OSD::issue_repop(PG *pg, MOSDOp *op, int osd)
+{
+  object_t oid = op->get_oid();
+
+  dout(7) << " issue_repop rep_tid " << op->get_rep_tid()
+          << " in " << *pg
+          << " o " << oid
+          << " to osd" << osd
+          << endl;
+
+  // forward the write/update/whatever
+  MOSDOp *fwd = new MOSDOp(op->get_tid(),
+                           op->get_client(),
+                           oid,
+                           pg->get_pgid(),
+                           osdmap->get_epoch(),
+                           op->get_op());
+
+  // payload and extent (the bufferlist is _copied_)
+  fwd->get_data() = op->get_data();
+  fwd->set_length(op->get_length());
+  fwd->set_offset(op->get_offset());
+
+  // replication bookkeeping
+  fwd->set_version(op->get_version());
+  fwd->set_rep_tid(op->get_rep_tid());
+  fwd->set_pg_trim_to(pg->peers_complete_thru);
+
+  messenger->send_message(fwd, MSG_ADDR_OSD(osd), osdmap->get_inst(osd));
+}
+
+/*
+ * Create the gather that tracks acks and commits for op, and register it
+ * in pg->repop_gather under the op's rep_tid.
+ *
+ * Commits are expected from every osd in pg->acting.  Acks are expected
+ *  - chain replication: from ourselves and from the osd ranked just
+ *    before us in pg->acting (if there is one);
+ *  - otherwise: from every osd in pg->acting.
+ *
+ * Replies that arrived before the op (pg->waiting_for_repop) are
+ * requeued through take_waiters().
+ */
+PG::RepOpGather *OSD::new_repop_gather(PG *pg,
+                                       MOSDOp *op)
+{
+  dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl;
+
+  PG::RepOpGather *gather = new PG::RepOpGather(op, op->get_rep_tid(),
+                                                op->get_version(),
+                                                pg->info.last_complete);
+
+  const bool chain = (g_conf.osd_rep == OSD_REP_CHAIN);
+
+  // osds.  commits all come to me; for primary/splay so do all acks.
+  for (unsigned i = 0; i < pg->acting.size(); ++i) {
+    const int member = pg->acting[i];
+    gather->osds.insert(member);
+    gather->waitfor_commit.insert(member);
+    if (!chain)
+      gather->waitfor_ack.insert(member);
+  }
+
+  if (chain) {
+    // chain rep.
+    // there's my local ack...
+    gather->osds.insert(whoami);
+    gather->waitfor_ack.insert(whoami);
+    gather->waitfor_commit.insert(whoami);
+
+    // also, the previous guy will ack to me
+    const int myrank = osdmap->calc_pg_rank(whoami, pg->acting);
+    if (myrank > 0) {
+      const int prev = pg->acting[ myrank-1 ];
+      gather->osds.insert(prev);
+      gather->waitfor_ack.insert(prev);
+      gather->waitfor_commit.insert(prev);
+    }
+  }
+
+  gather->start = g_clock.now();
+
+  pg->repop_gather[ gather->rep_tid ] = gather;
+
+  // anyone waiting?  (acks that got here before the op did)
+  if (pg->waiting_for_repop.count(gather->rep_tid)) {
+    take_waiters(pg->waiting_for_repop[gather->rep_tid]);
+    pg->waiting_for_repop.erase(gather->rep_tid);
+  }
+
+  return gather;
+}
+
+
+/*
+ * Record an ack or commit from osd 'fromosd' on a gather, then let
+ * put_repop_gather() act on the new state.
+ *
+ * A commit also counts as an ack and records the sender's
+ * pg_complete_thru.  repop may be deleted by the time this returns.
+ */
+void OSD::repop_ack(PG *pg, PG::RepOpGather *repop,
+                    int result, bool commit,
+                    int fromosd, eversion_t pg_complete_thru)
+{
+  dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *repop->op
+          << " result " << result << " commit " << commit << " from osd" << fromosd
+          << " in " << *pg
+          << endl;
+
+  get_repop_gather(repop);
+  {
+    if (commit) {
+      // commit
+      assert(repop->waitfor_commit.count(fromosd));
+      repop->waitfor_commit.erase(fromosd);
+      repop->pg_complete_thru[fromosd] = pg_complete_thru;
+    }
+
+    // ack (explicit, or implied by the commit)
+    repop->waitfor_ack.erase(fromosd);
+  }
+  put_repop_gather(pg, repop);
+}
+
+
+
+
+
+/** op_modify_commit
+ * transaction commit on the acker.
+ *
+ * Called through C_OSD_WriteCommit.  Locks the pg, records our own commit
+ * (and pg_complete_thru) on the gather for rep_tid and lets
+ * put_repop_gather() act on it.  If the pg or the gather no longer
+ * exists this only logs.
+ */
+void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru)
+{
+  PG *pg = lock_pg(pgid);
+  if (!pg) {
+    dout(10) << "op_modify_commit pg " << pgid << " dne" << endl;
+    return;
+  }
+
+  if (!pg->repop_gather.count(rep_tid)) {
+    dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl;
+    unlock_pg(pgid);
+    return;
+  }
+
+  PG::RepOpGather *repop = pg->repop_gather[rep_tid];
+
+  dout(10) << "op_modify_commit " << *repop->op << endl;
+  get_repop_gather(repop);
+  {
+    assert(repop->waitfor_commit.count(whoami));
+    repop->waitfor_commit.erase(whoami);
+    repop->pg_complete_thru[whoami] = pg_complete_thru;
+  }
+  put_repop_gather(pg, repop);
+
+  // only the pointer value is logged; the gather may be gone by now.
+  dout(10) << "op_modify_commit done on " << repop << endl;
+
+  unlock_pg(pgid);
+}
+
+
+/** op_modify
+ * process client modify op
+ * NOTE: called from opqueue.
+ *
+ * Steps:
+ *  1. push the object to any replica recorded as missing it;
+ *  2. turn a request already present in the pg log into a WRNOOP;
+ *  3. park the op if the object is write-locked by another client
+ *     (not for WRNOOP);
+ *  4. assign the new version (an extra version is reserved when the
+ *     write will clone; a replayed op may raise it to the version the
+ *     op carries) and store it in the op;
+ *  5. share the osd map with the replicas and assign a rep_tid;
+ *  6. replicate, depending on g_conf.osd_rep:
+ *      - chain (not alone): forward to a single osd and apply locally;
+ *      - splay (not alone): forward to all replicas and apply locally;
+ *      - primary, or alone: create a gather, forward to all replicas,
+ *        and let put_repop_gather() apply the update.
+ *
+ * A WRNOOP never prepares a log entry or store changes, on any path: its
+ * version equals pg->log.top, which prepare_log_transaction() rejects.
+ *
+ * op: owned by the gather (primary/alone) or by the commit callback
+ *     (chain/splay) once this returns, unless it was parked in step 3.
+ */
+void OSD::op_modify(MOSDOp *op, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  const char *opname = MOSDOp::get_opname(op->get_op());
+
+  // are any peers missing this?
+  for (unsigned i=1; i<pg->acting.size(); i++) {
+    int peer = pg->acting[i];
+    if (pg->peer_missing.count(peer) &&
+        pg->peer_missing[peer].is_missing(oid)) {
+      // push it before this update.
+      // FIXME, this is probably extra much work (eg if we're about to overwrite)
+      pg->peer_missing[peer].got(oid);
+      push(pg, oid, peer);
+    }
+  }
+
+  // dup op?
+  reqid_t reqid(op->get_client(), op->get_tid());
+  if (pg->log.logged_req(reqid)) {
+    dout(-3) << "op_modify " << opname << " dup op " << reqid
+             << ", doing WRNOOP" << endl;
+    op->set_op(OSD_OP_WRNOOP);
+    opname = MOSDOp::get_opname(op->get_op());
+  }
+
+  // locked by someone else?
+  // for _any_ op type -- eg only the locker can unlock!
+  if (op->get_op() != OSD_OP_WRNOOP &&  // except WRNOOP; we just want to flush
+      block_if_wrlocked(op))
+    return; // op will be handled later, after the object unlocks
+
+
+  // check crev (result ignored; crev stays 0 unless getattr fills it in)
+  objectrev_t crev = 0;
+  store->getattr(oid, "crev", (char*)&crev, sizeof(crev));
+
+  // assign version
+  eversion_t nv = pg->log.top;
+  if (op->get_op() != OSD_OP_WRNOOP) {
+    nv.epoch = osdmap->get_epoch();
+    nv.version++;
+    assert(nv > pg->info.last_update);
+    assert(nv > pg->log.top);
+
+    // will clone?  the clone's log entry takes the version just below ours.
+    if (crev && op->get_rev() && op->get_rev() > crev)
+      nv.version++;
+
+    // replay!  never go below the version the op already carries.
+    if (op->get_version().version &&
+        nv.version < op->get_version().version)
+      nv.version = op->get_version().version;
+  }
+
+  // set version in op, for benefit of client and our eventual reply
+  op->set_version(nv);
+
+  dout(10) << "op_modify " << opname
+           << " " << oid
+           << " v " << nv
+           << " crev " << crev
+           << " rev " << op->get_rev()
+           << " " << op->get_offset() << "~" << op->get_length()
+           << endl;
+
+  if (op->get_op() == OSD_OP_WRITE) {
+    logger->inc("c_wr");
+    logger->inc("c_wrb", op->get_length());
+  }
+
+  // share latest osd map?
+  osd_lock.Lock();
+  {
+    for (unsigned i=1; i<pg->acting.size(); i++) {
+      int osd = pg->acting[i];
+      _share_map_outgoing( MSG_ADDR_OSD(osd), osdmap->get_inst(osd) );
+    }
+  }
+  osd_lock.Unlock();
+
+  // issue replica writes
+  PG::RepOpGather *repop = 0;
+  bool alone = (pg->acting.size() == 1);
+  tid_t rep_tid = ++last_tid;
+  op->set_rep_tid(rep_tid);
+
+  if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) {
+    // chain rep.  send to #2 only.
+    int next = pg->acting[1];
+    if (pg->acting.size() > 2)
+      next = pg->acting[2];
+    issue_repop(pg, op, next);
+  }
+  else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) {
+    // splay rep.  send to rest.
+    for (unsigned i=1; i<pg->acting.size(); ++i)
+    //for (unsigned i=pg->acting.size()-1; i>=1; --i)
+      issue_repop(pg, op, pg->acting[i]);
+  } else {
+    // primary rep, or alone.
+    repop = new_repop_gather(pg, op);
+
+    // send to rest.
+    if (!alone)
+      for (unsigned i=1; i<pg->acting.size(); i++)
+        issue_repop(pg, op, pg->acting[i]);
+  }
+
+  if (repop) {
+    // we are acker.
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      // log and update later.
+      prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
+      prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg);
+    }
+
+    // (logical) local ack.
+    // (if alone, this will apply the update.)
+    get_repop_gather(repop);
+    {
+      assert(repop->waitfor_ack.count(whoami));
+      repop->waitfor_ack.erase(whoami);
+    }
+    put_repop_gather(pg, repop);
+
+  } else {
+    // chain or splay.  apply.
+    ObjectStore::Transaction t;
+    if (op->get_op() != OSD_OP_WRNOOP) {
+      // a WRNOOP keeps nv == pg->log.top and must not reach
+      // prepare_log_transaction(), which asserts version > pg->log.top.
+      // the (empty) transaction is still applied so the commit callback fires.
+      prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru);
+      prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg);
+    }
+
+    C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(),
+                                                                pg->info.last_complete);
+    unsigned r = store->apply_transaction(t, oncommit);
+    if (r != 0 &&   // no errors
+        r != 2) {   // or error on collection_add
+      cerr << "error applying transaction: r = " << r << endl;
+      assert(r == 0);
+    }
+
+    // let the commit callback proceed once the commit arrives.
+    oncommit->ack();
+  }
+}
+
+
+
+/*
+ * Record an op in the pg log, in memory and (through transaction t) on disk.
+ *
+ * If the op will clone the object (crev and rev both nonzero, rev > crev)
+ * a CLONE entry at version-1 is added to the in-memory log first.  The
+ * op itself is logged as DELETE for OSD_OP_DELETE and MODIFY otherwise,
+ * at 'version', which must be newer than the current log top.
+ *
+ * trim_to is handed to pg->append_log() together with the op's entry.
+ */
+void OSD::prepare_log_transaction(ObjectStore::Transaction& t,
+                                  MOSDOp *op, eversion_t& version,
+                                  objectrev_t crev, objectrev_t rev,
+                                  PG *pg,
+                                  eversion_t trim_to)
+{
+  const object_t oid = op->get_oid();
+
+  // clone entry?
+  const bool will_clone = crev && rev && rev > crev;
+  if (will_clone) {
+    eversion_t clonev = version;
+    clonev.version--;
+
+    PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, clonev,
+                              op->get_client(), op->get_tid());
+    pg->log.add(cloneentry);
+
+    dout(10) << "prepare_log_transaction " << op->get_op()
+             << " " << cloneentry
+             << " in " << *pg << endl;
+  }
+
+  // actual op
+  const int opcode = (op->get_op() == OSD_OP_DELETE)
+    ? PG::Log::Entry::DELETE
+    : PG::Log::Entry::MODIFY;
+  PG::Log::Entry logentry(opcode, oid, version,
+                          op->get_client(), op->get_tid());
+
+  dout(10) << "prepare_log_transaction " << op->get_op()
+           << " " << logentry
+           << " in " << *pg << endl;
+
+  // append to the in-memory log
+  assert(version > pg->log.top);
+  pg->log.add(logentry);
+  assert(pg->log.top == version);
+  dout(10) << "prepare_log_transaction appended to " << *pg << endl;
+
+  // and to the pg log on disk
+  pg->append_log(t, logentry, trim_to);
+}
+
+
+/** prepare_op_transaction
+ * apply an op to the store wrapped in a transaction.
+ *
+ * Nothing is written to the store here; the changes are appended to t
+ * and take effect when the caller applies t. The in-memory pg info
+ * (last_update, possibly last_complete) IS updated here, immediately.
+ *
+ * What gets queued, in order:
+ *  - the updated pg info as collection attribute "info";
+ *  - a clone of the object to revision 'rev' when crev and rev are both
+ *    nonzero and rev > crev;
+ *  - the op itself (wrlock/wrunlock attribute changes, write, truncate,
+ *    delete);
+ *  - collection membership: removal for a delete; otherwise add, plus
+ *    the "version" attribute and, for a new object or after a clone,
+ *    the "crev" attribute.
+ *
+ * OSD_OP_WRNOOP returns before doing any of this. OSD_OP_ZERO and any
+ * op code without a case in the switch hit assert(0).
+ *
+ * t:       transaction to append to.
+ * op:      the modify op. for OSD_OP_WRITE its data buffers are moved
+ *          into the transaction (bl.claim).
+ * version: version being written; must be newer than pg->info.last_update.
+ * crev:    value read from the object's "crev" attribute by the caller
+ *          (0 is treated as "new object").
+ * rev:     revision carried by the op.
+ * pg:      placement group whose info is updated.
+ */
+void OSD::prepare_op_transaction(ObjectStore::Transaction& t,
+ MOSDOp *op, eversion_t& version,
+ objectrev_t crev, objectrev_t rev,
+ PG *pg)
+{
+ const object_t oid = op->get_oid();
+ const pg_t pgid = op->get_pg();
+
+ bool did_clone = false;
+
+ dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() )
+ << " " << oid
+ << " v " << version
+ << " crev " << crev
+ << " rev " << rev
+ << " in " << *pg << endl;
+
+ // WRNOOP does nothing.
+ if (op->get_op() == OSD_OP_WRNOOP)
+ return;
+
+ // raise last_complete? (only while it is level with last_update)
+ if (pg->info.last_complete == pg->info.last_update)
+ pg->info.last_complete = version;
+
+ // raise last_update.
+ assert(version > pg->info.last_update);
+ pg->info.last_update = version;
+
+ // write pg info
+ t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info));
+
+ // clone?
+ if (crev && rev && rev > crev) {
+ object_t noid = oid;
+ noid.rev = rev;
+ dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl;
+ t.clone(oid, noid);
+ did_clone = true;
+ }
+
+ // apply the op
+ switch (op->get_op()) {
+ case OSD_OP_WRLOCK:
+ { // lock object: remember the locking client in the "wrlock" attribute
+ //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit);
+ t.setattr(oid, "wrlock", &op->get_client(), sizeof(msg_addr_t));
+ }
+ break;
+
+ case OSD_OP_WRUNLOCK:
+ { // unlock objects
+ //r = store->rmattr(oid, "wrlock", oncommit);
+ t.rmattr(oid, "wrlock");
+
+ // unblock all operations that were waiting for this object to become unlocked
+ if (waiting_for_wr_unlock.count(oid)) {
+ take_waiters(waiting_for_wr_unlock[oid]);
+ waiting_for_wr_unlock.erase(oid);
+ }
+ }
+ break;
+
+ case OSD_OP_WRITE:
+ { // write
+ assert(op->get_data().length() == op->get_length());
+ bufferlist bl;
+ bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time!
+
+ //if (oid < 100000000000000ULL) // hack hack-- don't write client data
+ t.write( oid, op->get_offset(), op->get_length(), bl );
+ }
+ break;
+
+ case OSD_OP_ZERO:
+ {
+ assert(0); // are you sure this is what you want?
+ // zero, remove, or truncate? (only reachable when asserts are compiled out)
+ struct stat st;
+ int r = store->stat(oid, &st);
+ if (r >= 0) {
+ if (op->get_offset() + op->get_length() >= st.st_size) {
+ if (op->get_offset())
+ t.truncate(oid, op->get_length() + op->get_offset());
+ else
+ t.remove(oid);
+ } else {
+ // zero. the dumb way. FIXME.
+ bufferptr bp(op->get_length());
+ bp.zero();
+ bufferlist bl;
+ bl.push_back(bp);
+ t.write(oid, op->get_offset(), op->get_length(), bl);
+ }
+ } else {
+ // noop?
+ dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << endl;
+ }
+ }
+ break;
+
+ case OSD_OP_TRUNCATE:
+ { // truncate
+ //r = store->truncate(oid, op->get_offset());
+ t.truncate(oid, op->get_length() ); // the new size is taken from the op's length field
+ }
+ break;
+
+ case OSD_OP_DELETE:
+ { // delete
+ //r = store->remove(oid);
+ t.remove(oid);
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+
+ // object collection, version
+ if (op->get_op() == OSD_OP_DELETE) {
+ // remove object from c
+ t.collection_remove(pgid, oid);
+ } else {
+ // add object to c
+ t.collection_add(pgid, oid);
+
+ // object version
+ t.setattr(oid, "version", &version, sizeof(version));
+
+ // set object crev
+ if (crev == 0 || // new object
+ did_clone) // we cloned
+ t.setattr(oid, "crev", &rev, sizeof(rev));
+ }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __OSD_H
+#define __OSD_H
+
+#include "msg/Dispatcher.h"
+
+#include "common/Mutex.h"
+#include "common/ThreadPool.h"
+
+#include "mon/MonMap.h"
+
+#include "ObjectStore.h"
+#include "PG.h"
+
+#include <map>
+using namespace std;
+#include <ext/hash_map>
+#include <ext/hash_set>
+using namespace __gnu_cxx;
+
+#include "messages/MOSDOp.h"
+
+class Messenger;
+class Message;
+
+
+
+
+ /** OSD
+ * Object storage daemon.  A Dispatcher that owns the local ObjectStore,
+ * the current OSDMap, the set of PGs held in memory (pg_map), and the
+ * per-PG operation queues.  Only declarations live here; see OSD.cc for
+ * the implementation.
+ */
+ class OSD : public Dispatcher {
+ public:
+
+ /** superblock
+ */
+ OSDSuperblock superblock;
+ epoch_t boot_epoch;
+
+ // Object names for stored maps: the second object_t field is
+ // (epoch << 1) for a full map and (epoch << 1) + 1 for an incremental,
+ // so the two kinds never collide for the same epoch.
+ object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); }
+ object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); }
+
+ void write_superblock();
+ void write_superblock(ObjectStore::Transaction& t);
+ int read_superblock();
+
+
+ /** OSD **/
+ protected:
+ Messenger *messenger;
+ int whoami; // presumably this osd's id (cf. the 'id' ctor argument) -- confirm in OSD.cc
+
+ // lifecycle states; 'state' holds exactly one of these
+ static const int STATE_BOOTING = 1;
+ static const int STATE_ACTIVE = 2;
+ static const int STATE_STOPPING = 3;
+
+ int state;
+
+ bool is_booting() { return state == STATE_BOOTING; }
+ bool is_active() { return state == STATE_ACTIVE; }
+ bool is_stopping() { return state == STATE_STOPPING; }
+
+
+ MonMap *monmap;
+
+ class Logger *logger;
+
+ // local store
+ char dev_path[100]; // fixed-size buffer; NOTE(review): verify the ctor bounds the copy from 'dev'
+ class ObjectStore *store;
+
+ // heartbeat
+ void heartbeat();
+
+ // Context whose finish() simply calls back into OSD::heartbeat().
+ // next_heartbeat points at the currently pending instance (if any).
+ class C_Heartbeat : public Context {
+ OSD *osd;
+ public:
+ C_Heartbeat(OSD *o) : osd(o) {}
+ void finish(int r) {
+ osd->heartbeat();
+ }
+ } *next_heartbeat;
+
+ // global lock
+ Mutex osd_lock;
+
+ // -- stats --
+ int hb_stat_ops; // ops since last heartbeat
+ int hb_stat_qlen; // cumulative queue length since last hb
+
+ hash_map<int, float> peer_qlen;
+
+ // per-pg locking (serializing)
+ hash_set<pg_t> pg_lock;
+ hash_map<pg_t, list<Cond*> > pg_lock_waiters;
+ PG *lock_pg(pg_t pgid);
+ PG *_lock_pg(pg_t pgid);
+ void unlock_pg(pg_t pgid);
+ void _unlock_pg(pg_t pgid);
+
+ // finished waiting messages, that will go at tail of dispatch()
+ list<class Message*> finished;
+ // Moves every message in ls onto the end of 'finished'; ls is left empty
+ // (list::splice transfers the nodes rather than copying them).
+ void take_waiters(list<class Message*>& ls) {
+ finished.splice(finished.end(), ls);
+ }
+
+ // object locking
+ hash_map<object_t, list<Message*> > waiting_for_wr_unlock; /** list of operations for each object waiting for 'wrunlock' */
+
+ bool block_if_wrlocked(MOSDOp* op);
+
+ // -- ops --
+ class ThreadPool<class OSD*, pg_t> *threadpool;
+ hash_map<pg_t, list<Message*> > op_queue; // queued ops, keyed by pg
+ int pending_ops;
+ bool waiting_for_no_ops;
+ Cond no_pending_ops;
+ Cond op_queue_cond;
+
+ void wait_for_no_ops();
+
+ void enqueue_op(pg_t pgid, Message *op);
+ void dequeue_op(pg_t pgid);
+ // static trampoline that forwards to o->dequeue_op(pgid);
+ // presumably the callback handed to the ThreadPool -- confirm in OSD.cc
+ static void static_dequeueop(OSD *o, pg_t pgid) {
+ o->dequeue_op(pgid);
+ };
+
+ void do_op(Message *m, PG *pg); // actually do it
+
+ // build the store transaction pieces for a modify op
+ void prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version,
+ objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to);
+ void prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version,
+ objectrev_t crev, objectrev_t rev, PG *pg);
+
+ bool waitfor_missing_object(MOSDOp *op, PG *pg);
+ bool pick_missing_object_rev(object_t& oid, PG *pg);
+ bool pick_object_rev(object_t& oid);
+
+
+
+ friend class PG;
+
+ protected:
+
+ // -- osd map --
+ class OSDMap *osdmap;
+ list<class Message*> waiting_for_osdmap;
+
+ hash_map<msg_addr_t, epoch_t> peer_map_epoch;
+ bool _share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch);
+ void _share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst);
+
+ void wait_for_new_map(Message *m);
+ void handle_osd_map(class MOSDMap *m);
+
+ void advance_map(ObjectStore::Transaction& t);
+ void activate_map(ObjectStore::Transaction& t);
+
+ void get_map(epoch_t e, OSDMap &m);
+ bool get_map_bl(epoch_t e, bufferlist& bl);
+ bool get_inc_map_bl(epoch_t e, bufferlist& bl);
+ bool get_inc_map(epoch_t e, OSDMap::Incremental &inc);
+
+ void send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full);
+
+
+
+ // -- replication --
+
+ // PG
+ hash_map<pg_t, PG*> pg_map;
+ void load_pgs();
+ bool pg_exists(pg_t pg);
+ PG *create_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG
+ PG *get_pg(pg_t pg); // return existing PG, or null
+ void _remove_pg(pg_t pg); // remove from store and memory
+
+ void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from);
+
+ void activate_pg(pg_t pgid, epoch_t epoch);
+
+ // Context that captures (pgid, epoch) and calls activate_pg() with them
+ // when finished.
+ class C_Activate : public Context {
+ OSD *osd;
+ pg_t pgid;
+ epoch_t epoch;
+ public:
+ C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {}
+ void finish(int r) {
+ osd->activate_pg(pgid, epoch);
+ }
+ };
+
+
+ tid_t last_tid;
+ int num_pulling;
+
+ hash_map<pg_t, list<Message*> > waiting_for_pg;
+
+ // replica ops
+ void get_repop_gather(PG::RepOpGather*);
+ void apply_repop(PG *pg, PG::RepOpGather *repop);
+ void put_repop_gather(PG *pg, PG::RepOpGather*);
+ void issue_repop(PG *pg, MOSDOp *op, int osd);
+ PG::RepOpGather *new_repop_gather(PG *pg, MOSDOp *op);
+ void repop_ack(PG *pg, PG::RepOpGather *repop,
+ int result, bool commit,
+ int fromosd, eversion_t pg_complete_thru=0);
+
+ void handle_rep_op_ack(MOSDOpReply *m);
+
+ // recovery
+ void do_notifies(map< int, list<PG::Info> >& notify_list);
+ void do_queries(map< int, map<pg_t,PG::Query> >& query_map);
+ void repeer(PG *pg, map< int, map<pg_t,PG::Query> >& query_map);
+
+ void pull(PG *pg, object_t oid);
+ void push(PG *pg, object_t oid, int dest);
+
+ bool require_current_map(Message *m, epoch_t v);
+ bool require_same_or_newer_map(Message *m, epoch_t e);
+
+ void handle_pg_query(class MOSDPGQuery *m);
+ void handle_pg_notify(class MOSDPGNotify *m);
+ void handle_pg_log(class MOSDPGLog *m);
+ void handle_pg_remove(class MOSDPGRemove *m);
+
+ void op_pull(class MOSDOp *op, PG *pg);
+ void op_push(class MOSDOp *op, PG *pg);
+
+ void op_rep_modify(class MOSDOp *op, PG *pg); // write, truncate, delete
+ void op_rep_modify_commit(class MOSDOp *op, int ackerosd,
+ eversion_t last_complete);
+ friend class C_OSD_RepModifyCommit;
+
+
+ public:
+ OSD(int id, Messenger *m, MonMap *mm, char *dev = 0);
+ ~OSD();
+
+ // startup/shutdown
+ int init();
+ int shutdown();
+
+ // messages (Dispatcher interface)
+ virtual void dispatch(Message *m);
+ virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+ virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst);
+
+ void handle_osd_ping(class MOSDPing *m);
+ void handle_op(class MOSDOp *m);
+
+ void op_read(class MOSDOp *m);//, PG *pg);
+ void op_stat(class MOSDOp *m);//, PG *pg);
+ void op_modify(class MOSDOp *m, PG *pg);
+ void op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru);
+
+ // for replication
+ void handle_op_reply(class MOSDOpReply *m);
+
+ void force_remount();
+ };
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __OSDMAP_H
+#define __OSDMAP_H
+
+/*
+ * describe properties of the OSD cluster.
+ * disks, disk groups, total # osds,
+ *
+ */
+#include "config.h"
+#include "include/types.h"
+#include "msg/Message.h"
+#include "common/Mutex.h"
+#include "common/Clock.h"
+
+#include "crush/crush.h"
+using namespace crush;
+
+#include <vector>
+#include <list>
+#include <set>
+#include <map>
+using namespace std;
+
+
+/*
+ * some system constants
+ */
+
+ // bit widths of the fields packed into a pg id, from LSB to MSB,
+ #define PG_PS_BITS 16 // max bits for placement seed/group portion of PG
+ #define PG_REP_BITS 6 // up to 64 replicas
+ #define PG_TYPE_BITS 2
+ #define PG_PS_MASK ((1LL<<PG_PS_BITS)-1) // low PG_PS_BITS bits set; used to clamp the placement seed
+
+ // pg types
+ #define PG_TYPE_RAND 1 // default: distribution randomly
+ #define PG_TYPE_STARTOSD 2 // place primary on a specific OSD (named by the pg_bits)
+
+ // pg roles (as returned by OSDMap::calc_pg_role: derived from an osd's rank in the pg's osd list)
+ #define PG_ROLE_STRAY -1
+ #define PG_ROLE_HEAD 0
+ #define PG_ROLE_ACKER 1
+ #define PG_ROLE_MIDDLE 2 // der.. misnomer
+ //#define PG_ROLE_TAIL 2
+
+
+
+ /** OSDMap
+  *
+  * One epoch of the OSD cluster description: the set of osds, which of
+  * them are down / out / overloaded, their addresses, and the crush map.
+  * Also hosts the mapping functions object -> pg -> list of osds.
+  */
+ class OSDMap {
+
+ public:
+   /** Incremental
+    * A diff that advances an OSDMap from epoch-1 to epoch; applied with
+    * OSDMap::apply_incremental().
+    */
+   class Incremental {
+   public:
+     epoch_t epoch;   // new epoch; we are a diff from epoch-1 to epoch
+     epoch_t mon_epoch;  // monitor epoch (election iteration)
+     utime_t ctime;
+     map<int,entity_inst_t> new_up;
+     map<int,entity_inst_t> new_down;
+     list<int> new_in;
+     list<int> new_out;
+     map<int,float> new_overload;  // updated overload value
+     list<int> old_overload;       // no longer overload
+
+     // Append the serialized form of this incremental to bl.
+     // Field order here must match decode() exactly.
+     void encode(bufferlist& bl) {
+       bl.append((char*)&epoch, sizeof(epoch));
+       bl.append((char*)&mon_epoch, sizeof(mon_epoch));
+       bl.append((char*)&ctime, sizeof(ctime));
+       ::_encode(new_up, bl);
+       ::_encode(new_down, bl);
+       ::_encode(new_in, bl);
+       ::_encode(new_out, bl);
+       ::_encode(new_overload, bl);
+       // old_overload is consumed by apply_incremental(), so it has to
+       // travel with the rest of the diff.  NOTE: this extends the encoded
+       // layout; encoders and decoders must be upgraded together.
+       ::_encode(old_overload, bl);
+     }
+     // Read this incremental from bl starting at offset off; off is
+     // advanced past the consumed bytes.
+     void decode(bufferlist& bl, int& off) {
+       bl.copy(off, sizeof(epoch), (char*)&epoch);
+       off += sizeof(epoch);
+       bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
+       off += sizeof(mon_epoch);
+       bl.copy(off, sizeof(ctime), (char*)&ctime);
+       off += sizeof(ctime);
+       ::_decode(new_up, bl, off);
+       ::_decode(new_down, bl, off);
+       ::_decode(new_in, bl, off);
+       ::_decode(new_out, bl, off);
+       ::_decode(new_overload, bl, off);
+       ::_decode(old_overload, bl, off);
+     }
+
+     Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {}
+   };
+
+ private:
+   epoch_t epoch;       // what epoch of the osd cluster descriptor is this
+   epoch_t mon_epoch;  // monitor epoch (election iteration)
+   utime_t ctime;       // epoch start time
+   int pg_bits;     // placement group bits
+   int localized_pg_bits;  // bits for localized pgs
+
+   set<int> osds;        // all osds
+   set<int> down_osds;   // list of down disks
+   set<int> out_osds;    // list of unmapped disks
+   map<int,float> overload_osds;
+   map<int,entity_inst_t> osd_inst;  // address of each up osd (see apply_incremental)
+
+  public:
+   Crush crush;       // hierarchical map
+
+   friend class OSDMonitor;
+   friend class MDS;
+
+  public:
+   OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {}
+
+   // map info
+   epoch_t get_epoch() const { return epoch; }
+   void inc_epoch() { epoch++; }
+
+   int get_pg_bits() const { return pg_bits; }
+   void set_pg_bits(int b) { pg_bits = b; }
+   int get_localized_pg_bits() const { return localized_pg_bits; }
+
+   const utime_t& get_ctime() const { return ctime; }
+
+   bool is_mkfs() const { return epoch == 1; }
+   //void set_mkfs() { assert(epoch == 1); }
+
+   /***** cluster state *****/
+   int num_osds() { return osds.size(); }
+   void get_all_osds(set<int>& ls) { ls = osds; }
+
+   const set<int>& get_osds() { return osds; }
+   const set<int>& get_down_osds() { return down_osds; }
+   const set<int>& get_out_osds() { return out_osds; }
+   const map<int,float>& get_overload_osds() { return overload_osds; }
+
+   // an osd is "up" unless listed in down_osds, and "in" unless listed in out_osds
+   bool is_down(int osd) { return down_osds.count(osd); }
+   bool is_up(int osd) { return !is_down(osd); }
+   bool is_out(int osd) { return out_osds.count(osd); }
+   bool is_in(int osd) { return !is_out(osd); }
+
+   // address of an osd; the osd must have an entry (asserted)
+   const entity_inst_t& get_inst(int osd) {
+     assert(osd_inst.count(osd));
+     return osd_inst[osd];
+   }
+   // non-asserting variant: returns false and leaves inst untouched if unknown
+   bool get_inst(int osd, entity_inst_t& inst) {
+     if (osd_inst.count(osd)) {
+       inst = osd_inst[osd];
+       return true;
+     }
+     return false;
+   }
+
+   void mark_down(int o) { down_osds.insert(o); }
+   void mark_up(int o) { down_osds.erase(o); }
+   void mark_out(int o) { out_osds.insert(o); }
+   void mark_in(int o) { out_osds.erase(o); }
+
+
+   // Advance this map by exactly one epoch using inc.  Every transition is
+   // asserted to be a real change (e.g. an osd marked up must currently be down).
+   void apply_incremental(Incremental &inc) {
+     assert(inc.epoch == epoch+1);
+     epoch++;
+     mon_epoch = inc.mon_epoch;
+     ctime = inc.ctime;
+
+     for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+          i != inc.new_up.end();
+          i++) {
+       assert(down_osds.count(i->first));
+       down_osds.erase(i->first);
+       assert(osd_inst.count(i->first) == 0);
+       osd_inst[i->first] = i->second;
+       //cout << "epoch " << epoch << " up osd" << i->first << endl;
+     }
+     for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+          i != inc.new_down.end();
+          i++) {
+       assert(down_osds.count(i->first) == 0);
+       down_osds.insert(i->first);
+       assert(osd_inst.count(i->first) == 0 ||
+              osd_inst[i->first] == i->second);
+       osd_inst.erase(i->first);
+       //cout << "epoch " << epoch << " down osd" << i->first << endl;
+     }
+     for (list<int>::iterator i = inc.new_in.begin();
+          i != inc.new_in.end();
+          i++) {
+       assert(out_osds.count(*i));
+       out_osds.erase(*i);
+       //cout << "epoch " << epoch << " in osd" << *i << endl;
+     }
+     for (list<int>::iterator i = inc.new_out.begin();
+          i != inc.new_out.end();
+          i++) {
+       assert(out_osds.count(*i) == 0);
+       out_osds.insert(*i);
+       //cout << "epoch " << epoch << " out osd" << *i << endl;
+     }
+     for (map<int,float>::iterator i = inc.new_overload.begin();
+          i != inc.new_overload.end();
+          i++) {
+       overload_osds[i->first] = i->second;
+     }
+     for (list<int>::iterator i = inc.old_overload.begin();
+          i != inc.old_overload.end();
+          i++) {
+       assert(overload_osds.count(*i));
+       overload_osds.erase(*i);
+     }
+   }
+
+   // serialize, unserialize
+   // NOTE(review): localized_pg_bits is not part of the encoded form; a
+   // decoded map keeps whatever value the receiving object already had.
+   void encode(bufferlist& blist) {
+     blist.append((char*)&epoch, sizeof(epoch));
+     blist.append((char*)&mon_epoch, sizeof(mon_epoch));
+     blist.append((char*)&ctime, sizeof(ctime));
+     blist.append((char*)&pg_bits, sizeof(pg_bits));
+
+     _encode(osds, blist);
+     _encode(down_osds, blist);
+     _encode(out_osds, blist);
+     _encode(overload_osds, blist);
+     _encode(osd_inst, blist);
+
+     crush._encode(blist);
+   }
+
+   // inverse of encode(); reads from the start of blist
+   void decode(bufferlist& blist) {
+     int off = 0;
+     blist.copy(off, sizeof(epoch), (char*)&epoch);
+     off += sizeof(epoch);
+     blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch);
+     off += sizeof(mon_epoch);
+     blist.copy(off, sizeof(ctime), (char*)&ctime);
+     off += sizeof(ctime);
+     blist.copy(off, sizeof(pg_bits), (char*)&pg_bits);
+     off += sizeof(pg_bits);
+
+     _decode(osds, blist, off);
+     _decode(down_osds, blist, off);
+     _decode(out_osds, blist, off);
+     _decode(overload_osds, blist, off);
+     _decode(osd_inst, blist, off);
+
+     crush._decode(blist, off);
+   }
+
+
+
+
+   /**** mapping facilities ****/
+
+   // oid -> pg
+   // The placement seed is derived from the object id according to the
+   // layout policy (layout.object_layout, or g_conf.osd_object_layout when
+   // that is 0) and, except for STARTOSD, clamped to pg_bits bits.
+   pg_t object_to_pg(object_t oid, FileLayout& layout) {
+     static crush::Hash H(777);
+
+     int policy = layout.object_layout;
+     if (policy == 0)
+       policy = g_conf.osd_object_layout;
+
+     // 'type' only fed the commented-out pg construction below; kept for reference
+     int type = PG_TYPE_RAND;
+     ps_t ps = 0;
+
+     switch (policy) {
+     case OBJECT_LAYOUT_LINEAR:
+       {
+         //const object_t ono = oid.bno;
+         //const inodeno_t ino = oid >> OID_ONO_BITS;
+         ps = (oid.bno + oid.ino) & PG_PS_MASK;
+         ps &= ((1ULL<<pg_bits)-1ULL);
+       }
+       break;
+
+     case OBJECT_LAYOUT_HASHINO:
+       {
+         //const object_t ono = oid & ((1ULL << OID_ONO_BITS)-1ULL);
+         //const inodeno_t ino = oid >> OID_ONO_BITS;
+         ps = (oid.bno + H(oid.ino)) & PG_PS_MASK;
+         ps &= ((1ULL<<pg_bits)-1ULL);
+       }
+       break;
+
+     case OBJECT_LAYOUT_HASH:
+       {
+         ps = H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ) & PG_PS_MASK;
+         ps &= ((1ULL<<pg_bits)-1ULL);
+       }
+       break;
+
+     case OBJECT_LAYOUT_STARTOSD:
+       {
+         ps = layout.osd;
+         type = PG_TYPE_STARTOSD;
+       }
+       break;
+
+     default:
+       // unknown policy: fail loudly (as pg_to_osds does for an unknown
+       // layout) instead of building a pg from an indeterminate seed
+       assert(0);
+     }
+
+     // construct final PG
+     /*pg_t pg = type;
+     pg = (pg << PG_REP_BITS) | (pg_t)layout.num_rep;
+     pg = (pg << PG_PS_BITS) | ps;
+     */
+     //cout << "pg " << hex << pg << dec << endl;
+     return pg_t(ps, 0, layout.num_rep);
+   }
+
+   // (ps, nrep) -> pg
+   pg_t ps_nrep_to_pg(ps_t ps, int nrep) {
+     /*return ((pg_t)ps & ((1ULL<<pg_bits)-1ULL))
+       | ((pg_t)nrep << PG_PS_BITS)
+       | ((pg_t)PG_TYPE_RAND << (PG_PS_BITS+PG_REP_BITS));
+     */
+     return pg_t(ps, 0, nrep, 0);
+   }
+   // (ps, preferred osd, nrep) -> pg.  The osd is stored as osd+1 so that
+   // a 'preferred' field of 0 can mean "none" (see pg_to_osds).
+   pg_t ps_osd_nrep_to_pg(ps_t ps, int osd, int nrep) {
+     /*return ((pg_t)osd)
+       | ((pg_t)nrep << PG_PS_BITS)
+       | ((pg_t)PG_TYPE_STARTOSD << (PG_PS_BITS+PG_REP_BITS));
+     */
+     return pg_t(ps, osd+1, nrep, 0);
+   }
+
+   // pg -> nrep
+   int pg_to_nrep(pg_t pg) {
+     return pg.u.fields.nrep;
+     //return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1);
+   }
+
+   // pg -> ps
+   int pg_to_ps(pg_t pg) {
+     //return pg & PG_PS_MASK;
+     return pg.u.fields.ps;
+   }
+
+   // pg -> (osd list)
+   // Appends the pg's osds to 'osds' (the vector is not cleared first) and
+   // returns its resulting size.  Down osds are NOT filtered out here;
+   // see pg_to_acting_osds() for that.
+   int pg_to_osds(pg_t pg,
+                  vector<int>& osds) {       // list of osd addr's
+     // NOTE(review): the seed is held in a pg_t although pg_to_ps() returns int
+     pg_t ps = pg_to_ps(pg);
+     int num_rep = pg_to_nrep(pg);
+     assert(num_rep > 0);
+
+     // map to osds[]
+     switch (g_conf.osd_pg_layout) {
+     case PG_LAYOUT_CRUSH:
+       {
+         // a preferred osd (stored as osd+1) is force-fed to crush unless it is out
+         int forcefeed = -1;
+         if (pg.u.fields.preferred > 0 &&
+             out_osds.count(pg.u.fields.preferred-1) == 0)
+           forcefeed = pg.u.fields.preferred-1;
+         crush.do_rule(crush.rules[num_rep],     // FIXME rule thing.
+                       ps,
+                       osds,
+                       out_osds, overload_osds,
+                       forcefeed);
+       }
+       break;
+
+     case PG_LAYOUT_LINEAR:
+       for (int i=0; i<num_rep; i++)
+         osds.push_back( (i + ps*num_rep) % g_conf.num_osd );
+       break;
+
+     case PG_LAYOUT_HYBRID:
+       {
+         static crush::Hash H(777);
+         int h = H(ps);
+         for (int i=0; i<num_rep; i++)
+           osds.push_back( (h+i) % g_conf.num_osd );
+       }
+       break;
+
+     case PG_LAYOUT_HASH:
+       {
+         static crush::Hash H(777);
+         for (int i=0; i<num_rep; i++) {
+           int t = 1;
+           int osd = 0;
+           // rehash with increasing t until we land on an osd not already chosen
+           while (t++) {
+             osd = H(i, ps, t) % g_conf.num_osd;
+             int j = 0;
+             for (; j<i; j++)
+               if (osds[j] == osd) break;
+             if (j == i) break;
+           }
+           osds.push_back(osd);
+         }
+       }
+       break;
+
+     default:
+       assert(0);
+     }
+
+     // for the non-crush layouts, put the preferred osd (if any) in slot 0
+     if (pg.u.fields.preferred > 0 &&
+         g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) {
+       int osd = pg.u.fields.preferred-1;
+
+       // already in there?
+       if (osds.empty()) {
+         osds.push_back(osd);
+       } else {
+         assert(num_rep > 0);
+         for (int i=1; i<num_rep; i++)
+           if (osds[i] == osd) {
+             // swap with position 0
+             osds[i] = osds[0];
+           }
+         osds[0] = osd;
+       }
+
+       if (is_out(osd))
+         osds.erase(osds.begin());  // oops, but it's down!
+     }
+
+     return osds.size();
+   }
+
+   // pg -> (up osd list)
+   // 'osds' is cleared and refilled with the raw mapping minus any down osds.
+   int pg_to_acting_osds(pg_t pg,
+                         vector<int>& osds) {         // list of osd addr's
+     // get raw osd list
+     vector<int> raw;
+     pg_to_osds(pg, raw);
+
+     osds.clear();
+     for (unsigned i=0; i<raw.size(); i++) {
+       if (is_down(raw[i])) continue;
+       osds.push_back( raw[i] );
+     }
+     return osds.size();
+   }
+
+
+
+   // pg -> primary osd
+   int get_pg_primary(pg_t pg) {
+     vector<int> group;
+     int nrep = pg_to_osds(pg, group);
+     if (nrep)
+       return group[0];
+     return -1;  // we fail!
+   }
+
+   // pg -> acting primary osd
+   int get_pg_acting_primary(pg_t pg) {
+     vector<int> group;
+     int nrep = pg_to_acting_osds(pg, group);
+     if (nrep > 0)
+       return group[0];
+     return -1;  // we fail!
+   }
+   // pg -> last osd in the acting list
+   int get_pg_acting_tail(pg_t pg) {
+     vector<int> group;
+     int nrep = pg_to_acting_osds(pg, group);
+     if (nrep > 0)
+       return group[group.size()-1];
+     return -1;  // we fail!
+   }
+
+
+   /* what replica # is a given osd? 0 primary, -1 for none. */
+   // nrep == 0 means "use acting.size()"; a non-zero nrep must not exceed it.
+   int calc_pg_rank(int osd, vector<int>& acting, int nrep=0) {
+     if (!nrep) nrep = acting.size();
+     for (int i=0; i<nrep; i++)
+       if (acting[i] == osd) return i;
+     return -1;
+   }
+   // rank -> PG_ROLE_*: <0 stray, 0 head, 1 acker, anything else middle
+   int calc_pg_role(int osd, vector<int>& acting, int nrep=0) {
+     if (!nrep) nrep = acting.size();
+     int rank = calc_pg_rank(osd, acting, nrep);
+
+     if (rank < 0) return PG_ROLE_STRAY;
+     else if (rank == 0) return PG_ROLE_HEAD;
+     else if (rank == 1) return PG_ROLE_ACKER;
+     else return PG_ROLE_MIDDLE;
+   }
+
+   // role of osd within the raw (unfiltered) mapping of pg
+   int get_pg_role(pg_t pg, int osd) {
+     vector<int> group;
+     int nrep = pg_to_osds(pg, group);
+     return calc_pg_role(osd, group, nrep);
+   }
+
+   /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
+   int get_pg_acting_rank(pg_t pg, int osd) {
+     vector<int> group;
+     int nrep = pg_to_acting_osds(pg, group);
+     return calc_pg_rank(osd, group, nrep);
+   }
+   /* role is -1 (stray), 0 (primary), 1 (replica) */
+   int get_pg_acting_role(pg_t pg, int osd) {
+     vector<int> group;
+     int nrep = pg_to_acting_osds(pg, group);
+     return calc_pg_role(osd, group, nrep);
+   }
+
+
+
+
+ };
+
+
+#endif
--- /dev/null
+
+#include "ObjectStore.h"
+
+#include "config.h"
+#include "common/Clock.h"
+
+
+// Hand out an object id for the aging workload: recycle one that
+// age_empty() freed if any is available, otherwise mint the next new id.
+object_t ObjectStore::age_get_oid() {
+   if (age_free_oids.empty())
+     return age_cur_oid++;
+
+   object_t recycled = age_free_oids.front();
+   age_free_oids.pop_front();
+   return recycled;
+ }
+
+ // Pick a random object size: sample the file size distribution (scaled
+ // by 1024) as an upper bound, then return roughly between half of it and
+ // all of it, plus 1 so the result is never zero.
+ ssize_t ObjectStore::age_pick_size() {
+   const ssize_t upper = file_size_distn.sample() * 1024;
+   const ssize_t jitter = (rand() % 100) * upper / 200;
+   return upper/2 + jitter + 1;
+ }
+
+ // Create objects of random size until the fraction of used blocks
+ // reported by statfs() reaches pc, or until the clock passes 'until'.
+ // Each created oid is remembered in a random age_objects bucket so that
+ // age_empty() can remove it later.
+ void ObjectStore::age_fill(float pc, utime_t until) {
+   // one shared 1MB zero-filled buffer is the payload for every write
+   bufferptr bp(1024*1024);
+   bp.zero();
+   bufferlist bl;
+   bl.push_back(bp);
+   while (1) {
+     if (g_clock.now() > until) break;
+
+     struct statfs st;
+     statfs(&st);
+     float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+     if (a >= pc) {
+       dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl;
+       break;
+     }
+
+     object_t oid = age_get_oid();
+
+     int b = rand() % 10;
+     age_objects[b].push_back(oid);
+
+     ssize_t s = age_pick_size();
+
+     dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl;
+
+     // write the object in chunks of at most 1MB
+     off_t off = 0;
+     while (s) {
+       ssize_t t = MIN(s, 1024*1024);
+       // argument order is (oid, offset, length, ...), matching the
+       // write() call in apply_transaction(); the two were swapped here
+       write(oid, off, t, bl, false);
+       off += t;
+       s -= t;
+     }
+   }
+ }
+
+ // Remove previously created objects (from random age_objects buckets)
+ // until the fraction of used blocks reported by statfs() drops to pc.
+ // A sync() is issued every 'nper' iterations and whenever the chosen
+ // bucket is empty.  If every bucket is empty there is nothing left to
+ // remove, so we stop rather than spin on sync() forever.
+ void ObjectStore::age_empty(float pc) {
+   int nper = 20;
+   int n = nper;
+   while (1) {
+     struct statfs st;
+     statfs(&st);
+     float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+     if (a <= pc) {
+       dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl;
+       break;
+     }
+
+     int b = rand() % 10;
+     n--;
+     if (n == 0 || age_objects[b].empty()) {
+       dout(10) << "age_empty sync" << endl;
+       //sync();
+       sync();
+       n = nper;
+
+       // anything left to delete in any bucket?
+       bool have_objects = false;
+       for (unsigned i = 0; i < age_objects.size(); i++) {
+         if (!age_objects[i].empty()) {
+           have_objects = true;
+           break;
+         }
+       }
+       if (!have_objects) {
+         dout(10) << "age_empty at " << a << " / " << pc << " no objects left, stopping" << endl;
+         break;
+       }
+       continue;
+     }
+     object_t oid = age_objects[b].front();
+     age_objects[b].pop_front();
+
+     dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl;
+
+     remove(oid);
+     age_free_oids.push_back(oid);   // make the id available to age_get_oid()
+   }
+ }
+
+
+ // Age the store: alternate fill / empty passes between the given water
+ // marks, 'count' times, finishing at final_water.  The whole run is
+ // abandoned once 'time' seconds have elapsed.
+ //
+ // If fake_size_mb is non-zero the water marks are rescaled so that they
+ // refer to a pretend device of that size rather than the real one
+ // (256 blocks per MB -- presumably assumes 4KB blocks; confirm).
+ void ObjectStore::age(int time,
+                       float high_water,    // fill to this %
+                       float low_water,     // then empty to this %
+                       int count,         // this many times
+                       float final_water,   // and end here ( <= low_water)
+                       int fake_size_mb) {
+   utime_t deadline = g_clock.now();
+   deadline.sec_ref() += time;
+
+   // make sure all ten buckets exist
+   while (age_objects.size() < 10)
+     age_objects.push_back( list<object_t>() );
+
+   if (fake_size_mb != 0) {
+     int fake_blocks = fake_size_mb * 256;
+     struct statfs st;
+     statfs(&st);
+     float scale = (float)fake_blocks / (float)st.f_blocks;
+     high_water *= scale;
+     low_water *= scale;
+     final_water *= scale;
+     dout(10) << "fake " << fake_blocks << " / " << st.f_blocks << " is " << scale << ", high " << high_water << " low " << low_water << " final " << final_water << endl;
+   }
+
+   // init size distn (once)
+   if (!did_distn) {
+     did_distn = true;
+     age_cur_oid = 1;
+     file_size_distn.add(1, 19.0758125+0.65434375);
+     file_size_distn.add(512, 35.6566);
+     file_size_distn.add(1024, 27.7271875);
+     file_size_distn.add(2*1024, 16.63503125);
+     //file_size_distn.add(4*1024, 106.82384375);
+     //file_size_distn.add(8*1024, 81.493375);
+     //file_size_distn.add(16*1024, 14.13553125);
+     //file_size_distn.add(32*1024, 2.176);
+     //file_size_distn.add(256*1024, 0.655938);
+     //file_size_distn.add(512*1024, 0.1480625);
+     //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+     file_size_distn.normalize();
+   }
+
+   // clear
+   for (int bucket = 0; bucket < 10; ++bucket)
+     age_objects[bucket].clear();
+
+   for (int round = 1; round <= count; ++round) {
+     if (g_clock.now() > deadline)
+       break;
+
+     dout(1) << "age " << round << "/" << count << " filling to " << high_water << endl;
+     age_fill(high_water, deadline);
+
+     const bool last_round = (round == count);
+     if (last_round) {
+       dout(1) << "age final empty to " << final_water << endl;
+       age_empty(final_water);
+     } else {
+       dout(1) << "age " << round << "/" << count << " emptying to " << low_water << endl;
+       age_empty(low_water);
+     }
+   }
+   dout(1) << "age finished" << endl;
+ }
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __OBJECTSTORE_H
+#define __OBJECTSTORE_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "include/buffer.h"
+
+#include "include/Distribution.h"
+
+#include <sys/stat.h>
+
+#ifdef DARWIN
+#include <sys/statvfs.h>
+#else
+#include <sys/vfs.h> /* or <sys/statfs.h> */
+#endif /* DARWIN */
+
+#include <list>
+using namespace std;
+
+// Smaller of a and b.  Defined only if no MIN already exists.
+// As a macro it evaluates the chosen argument twice, so avoid passing
+// expressions with side effects.
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a):(b))
+#endif
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+
+
+class ObjectStore {
+public:
+
+
+ // Plain aggregate of fragmentation statistics: one group of counters
+ // for allocated extents and a parallel group for free extents.
+ // Nothing in this header fills it in; see the concrete stores for how
+ // the values are computed.
+ class FragmentationStat {
+ public:
+ int total;
+ int num_extent;
+ int avg_extent;
+ map<int,int> extent_dist; // powers of two
+ map<int,int> extent_dist_sum; // powers of two
+
+ float avg_extent_per_object;
+ int avg_extent_jump; // avg distance between consecutive extents
+
+ int total_free;
+ int num_free_extent;
+ int avg_free_extent;
+ map<int,int> free_extent_dist; // powers of two
+ map<int,int> free_extent_dist_sum; // powers of two
+ };
+
+
+
+ /*********************************
+ * transaction
+ */
+ /*
+  * A Transaction is a recorded sequence of store operations.  Each call
+  * below appends one opcode to 'ops' and its arguments to the per-type
+  * argument lists; apply_transaction() later replays 'ops' in order,
+  * popping the arguments back off the front of those lists.
+  *
+  * Opcodes are copied into a local before being pushed (presumably so the
+  * in-class static const members are never bound by reference, which
+  * would require an out-of-class definition -- keep that pattern).
+  *
+  * Pointer arguments (attr names, output buffers, attr sets) are stored
+  * as-is, so they must stay valid until the transaction has been applied.
+  */
+ class Transaction {
+ public:
+   // read-side ops
+   static const int OP_READ = 1; // oid, offset, len, pbl
+   static const int OP_STAT = 2; // oid, pstat
+   static const int OP_GETATTR = 3; // oid, attrname, pattrval
+   static const int OP_GETATTRS = 4; // oid, pattrset
+
+   // object mutations
+   static const int OP_WRITE = 10; // oid, offset, len, bl
+   static const int OP_TRUNCATE = 11; // oid, len
+   static const int OP_REMOVE = 13; // oid
+   static const int OP_SETATTR = 14; // oid, attrname, attrval
+   static const int OP_SETATTRS = 15; // oid, attrset
+   static const int OP_RMATTR = 16; // oid, attrname
+   static const int OP_CLONE = 17; // oid, newoid
+
+   // collection ops
+   static const int OP_MKCOLL = 20; // cid
+   static const int OP_RMCOLL = 21; // cid
+   static const int OP_COLL_ADD = 22; // cid, oid
+   static const int OP_COLL_REMOVE = 23; // cid, oid
+   static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval
+   static const int OP_COLL_RMATTR = 25; // cid, attrname
+
+   // opcode stream plus one argument queue per argument type
+   list<int> ops;
+   list<bufferlist> bls;
+   list<object_t> oids;
+   list<coll_t> cids;
+   list<off_t> offsets;
+   list<size_t> lengths;
+   list<const char*> attrnames;
+   //list< pair<const void*,int> > attrvals;
+   list<bufferlist> attrbls;
+
+   // caller-owned destinations / sources
+   list<bufferlist*> pbls;
+   list<struct stat*> psts;
+   list< pair<void*,int*> > pattrvals;
+   list< map<string,bufferptr>* > pattrsets;
+
+   // -- reads --
+   void read(object_t oid, off_t off, size_t len, bufferlist *pbl) {
+     const int opcode = OP_READ;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     offsets.push_back(off);
+     lengths.push_back(len);
+     pbls.push_back(pbl);
+   }
+   void stat(object_t oid, struct stat *st) {
+     const int opcode = OP_STAT;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     psts.push_back(st);
+   }
+   void getattr(object_t oid, const char* name, void* val, int *plen) {
+     const int opcode = OP_GETATTR;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+     pattrvals.push_back(pair<void*,int*>(val,plen));
+   }
+   void getattrs(object_t oid, map<string,bufferptr>& aset) {
+     const int opcode = OP_GETATTRS;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     pattrsets.push_back(&aset);
+   }
+
+   // -- object mutations --
+   void write(object_t oid, off_t off, size_t len, bufferlist& bl) {
+     const int opcode = OP_WRITE;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     offsets.push_back(off);
+     lengths.push_back(len);
+     bls.push_back(bl);
+   }
+   // the new length travels in the 'offsets' queue
+   void truncate(object_t oid, off_t off) {
+     const int opcode = OP_TRUNCATE;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     offsets.push_back(off);
+   }
+   void remove(object_t oid) {
+     const int opcode = OP_REMOVE;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+   }
+   // the attribute value is copied into a bufferlist at call time
+   void setattr(object_t oid, const char* name, const void* val, int len) {
+     const int opcode = OP_SETATTR;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+     //attrvals.push_back(pair<const void*,int>(val,len));
+     bufferlist valbl;
+     valbl.append((char*)val,len);
+     attrbls.push_back(valbl);
+   }
+   // only a pointer to attrset is recorded (shares the pattrsets queue with getattrs)
+   void setattrs(object_t oid, map<string,bufferptr>& attrset) {
+     const int opcode = OP_SETATTRS;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     pattrsets.push_back(&attrset);
+   }
+   void rmattr(object_t oid, const char* name) {
+     const int opcode = OP_RMATTR;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+   }
+   // queues two oids: source first, then the clone's oid
+   void clone(object_t oid, object_t noid) {
+     const int opcode = OP_CLONE;
+     ops.push_back(opcode);
+     oids.push_back(oid);
+     oids.push_back(noid);
+   }
+
+   // -- collections --
+   void create_collection(coll_t cid) {
+     const int opcode = OP_MKCOLL;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+   }
+   void remove_collection(coll_t cid) {
+     const int opcode = OP_RMCOLL;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+   }
+   void collection_add(coll_t cid, object_t oid) {
+     const int opcode = OP_COLL_ADD;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+     oids.push_back(oid);
+   }
+   void collection_remove(coll_t cid, object_t oid) {
+     const int opcode = OP_COLL_REMOVE;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+     oids.push_back(oid);
+   }
+   // as with setattr, the value bytes are copied at call time
+   void collection_setattr(coll_t cid, const char* name, const void* val, int len) {
+     const int opcode = OP_COLL_SETATTR;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+     attrnames.push_back(name);
+     //attrvals.push_back(pair<const void*,int>(val,len));
+     bufferlist valbl;
+     valbl.append((char*)val, len);
+     attrbls.push_back(valbl);
+   }
+   void collection_rmattr(coll_t cid, const char* name) {
+     const int opcode = OP_COLL_RMATTR;
+     ops.push_back(opcode);
+     cids.push_back(cid);
+     attrnames.push_back(name);
+   }
+
+   // etc.
+ };
+
+
+
+ /*
+  * Generic, NON-atomic transaction executor.  It exists only for simple
+  * ObjectStores that have no native atomic transactions.
+  *
+  * Walks t.ops in order.  For every opcode the arguments are popped off
+  * the front of the matching argument lists in `t` (oids, cids, offsets,
+  * lengths, ...) and the corresponding single-operation method is called.
+  * The argument lists of `t` are therefore consumed by this call.
+  *
+  * If onsafe is non-null it is handed to sync() after the last op.
+  * Per-op return codes are not collected; the function always returns 0.
+  * An unknown opcode is reported on cerr and trips assert(0).
+  */
+ virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) {
+   list<int>::iterator cur = t.ops.begin();
+   while (cur != t.ops.end()) {
+     switch (*cur) {
+
+     // ---- read-side ops: results land in caller-supplied buffers ----
+     case Transaction::OP_READ: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       off_t start = t.offsets.front();      t.offsets.pop_front();
+       size_t count = t.lengths.front();     t.lengths.pop_front();
+       bufferlist *dest = t.pbls.front();    t.pbls.pop_front();
+       read(obj, start, count, *dest);
+       break;
+     }
+     case Transaction::OP_STAT: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       struct stat *dest = t.psts.front();   t.psts.pop_front();
+       stat(obj, dest);
+       break;
+     }
+     case Transaction::OP_GETATTR: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       const char *key = t.attrnames.front(); t.attrnames.pop_front();
+       // first = destination buffer, second = in: buffer size / out: getattr() result
+       pair<void*,int*> dest = t.pattrvals.front(); t.pattrvals.pop_front();
+       *dest.second = getattr(obj, key, dest.first, *dest.second);
+       break;
+     }
+     case Transaction::OP_GETATTRS: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       map<string,bufferptr> *dest = t.pattrsets.front(); t.pattrsets.pop_front();
+       getattrs(obj, *dest);
+       break;
+     }
+
+     // ---- object mutations ----
+     case Transaction::OP_WRITE: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       off_t start = t.offsets.front();      t.offsets.pop_front();
+       size_t count = t.lengths.front();     t.lengths.pop_front();
+       bufferlist data = t.bls.front();      t.bls.pop_front();
+       write(obj, start, count, data, 0);
+       break;
+     }
+     case Transaction::OP_TRUNCATE: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       off_t newsize = t.offsets.front();    t.offsets.pop_front();   // size travels in the offsets list
+       truncate(obj, newsize, 0);
+       break;
+     }
+     case Transaction::OP_REMOVE: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       remove(obj, 0);
+       break;
+     }
+     case Transaction::OP_SETATTR: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       const char *key = t.attrnames.front(); t.attrnames.pop_front();
+       bufferlist val;
+       val.claim( t.attrbls.front() );
+       t.attrbls.pop_front();
+       setattr(obj, key, val.c_str(), val.length(), 0);
+       break;
+     }
+     case Transaction::OP_SETATTRS: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       map<string,bufferptr> *attrs = t.pattrsets.front(); t.pattrsets.pop_front();
+       setattrs(obj, *attrs, 0);
+       break;
+     }
+     case Transaction::OP_RMATTR: {
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       const char *key = t.attrnames.front(); t.attrnames.pop_front();
+       rmattr(obj, key, 0);
+       break;
+     }
+     case Transaction::OP_CLONE: {
+       // source oid is queued first, then the new oid
+       object_t src = t.oids.front();        t.oids.pop_front();
+       object_t dst = t.oids.front();        t.oids.pop_front();
+       clone(src, dst);
+       break;
+     }
+
+     // ---- collection ops ----
+     case Transaction::OP_MKCOLL: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       create_collection(coll, 0);
+       break;
+     }
+     case Transaction::OP_RMCOLL: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       destroy_collection(coll, 0);
+       break;
+     }
+     case Transaction::OP_COLL_ADD: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       collection_add(coll, obj, 0);
+       break;
+     }
+     case Transaction::OP_COLL_REMOVE: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       object_t obj = t.oids.front();        t.oids.pop_front();
+       collection_remove(coll, obj, 0);
+       break;
+     }
+     case Transaction::OP_COLL_SETATTR: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       const char *key = t.attrnames.front(); t.attrnames.pop_front();
+       bufferlist val;
+       val.claim( t.attrbls.front() );
+       t.attrbls.pop_front();
+       collection_setattr(coll, key, val.c_str(), val.length(), 0);
+       break;
+     }
+     case Transaction::OP_COLL_RMATTR: {
+       coll_t coll = t.cids.front();         t.cids.pop_front();
+       const char *key = t.attrnames.front(); t.attrnames.pop_front();
+       collection_rmattr(coll, key, 0);
+       break;
+     }
+
+     default:
+       cerr << "bad op " << *cur << endl;
+       assert(0);
+     }
+     ++cur;
+   }
+
+   if (onsafe) {
+     sync(onsafe);
+   }
+
+   return 0; // FIXME count errors
+ }
+
+ /*********************************************/
+
+
+
+ public:
+ ObjectStore() {}
+ virtual ~ObjectStore() {}
+
+ // mgmt -- pure virtual: every backend must implement these
+ virtual int mount() = 0;
+ virtual int umount() = 0;
+ virtual int mkfs() = 0; // wipe
+
+ virtual int statfs(struct statfs *buf) = 0;
+
+ // objects -- core object operations; all pure virtual
+ virtual int pick_object_revision_lt(object_t& oid) = 0; // NOTE(review): oid is taken by non-const reference, presumably updated in place -- confirm in implementations
+
+ virtual bool exists(object_t oid) = 0; // useful?
+ virtual int stat(object_t oid, struct stat *st) = 0; // struct stat?
+
+ virtual int remove(object_t oid,
+ Context *onsafe=0) = 0;
+
+ virtual int truncate(object_t oid, off_t size,
+ Context *onsafe=0) = 0;
+
+ virtual int read(object_t oid,
+ off_t offset, size_t len,
+ bufferlist& bl) = 0; // bl receives the data read
+
+ /*virtual int write(object_t oid,
+ off_t offset, size_t len,
+ bufferlist& bl,
+ bool fsync=true) = 0;
+ */
+ virtual int write(object_t oid,
+ off_t offset, size_t len,
+ bufferlist& bl,
+ Context *onsafe) = 0;//{ return -1; }  (note: unlike the other mutators, onsafe has no default here)
+
+ // object attributes -- NOT pure virtual: the defaults below do nothing and return 0
+ virtual int setattr(object_t oid, const char *name,
+ const void *value, size_t size,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int setattrs(object_t oid, map<string,bufferptr>& aset,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int getattr(object_t oid, const char *name,
+ void *value, size_t size) {return 0;} //= 0;  (default leaves *value untouched)
+ virtual int getattrs(object_t oid, map<string,bufferptr>& aset) {return 0;};
+
+ virtual int rmattr(object_t oid, const char *name,
+ Context *onsafe=0) {return 0;}
+
+ virtual int clone(object_t oid, object_t noid) { // default: not supported
+ return -1;
+ }
+
+ virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0;
+
+ // collections -- defaults are no-ops returning 0 (false for collection_exists)
+ virtual int list_collections(list<coll_t>& ls) {return 0;}//= 0;
+ virtual int create_collection(coll_t c,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual int destroy_collection(coll_t c,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual bool collection_exists(coll_t c) {return 0;}
+ virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0;
+ virtual int collection_add(coll_t c, object_t o,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual int collection_remove(coll_t c, object_t o,
+ Context *onsafe=0) {return 0;}// = 0;
+ virtual int collection_list(coll_t c, list<object_t>& o) {return 0;}//= 0;  (default leaves o untouched)
+
+ virtual int collection_setattr(coll_t cid, const char *name,
+ const void *value, size_t size,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int collection_rmattr(coll_t cid, const char *name,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int collection_getattr(coll_t cid, const char *name,
+ void *value, size_t size) {return 0;} //= 0;
+ virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0;
+
+ virtual void sync(Context *onsync) {}; // default is empty: onsync is neither invoked nor freed here -- NOTE(review): apply_transaction passes its onsafe to this; confirm backends override
+ virtual void sync() {}; // default is empty
+
+
+ virtual void _fake_writes(bool b) {}; // default is empty
+
+ virtual void _get_frag_stat(FragmentationStat& st) {}; // default is empty; st is left untouched
+
+};
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "PG.h"
+#include "config.h"
+#include "OSD.h"
+
+#include "common/Timer.h"
+
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
+
+
+/******* PGLog ********/
+
+/*
+ * copy_after
+ *  Fill this log with the entries of `other` that are newer than v,
+ *  i.e. the range (v, other.top].  Requires v >= other.bottom.
+ *  Walks other's log newest-first and stops at the entry whose version
+ *  equals v; every entry seen before that must be strictly newer than v.
+ */
+void PG::Log::copy_after(const Log &other, eversion_t v)
+{
+  assert(v >= other.bottom);
+  bottom = other.top;
+  top = bottom;
+
+  list<Entry>::const_reverse_iterator it = other.log.rbegin();
+  while (it != other.log.rend() && !(it->version == v)) {
+    assert(it->version > v);
+    log.push_front(*it);   // push_front keeps oldest-first order
+    ++it;
+  }
+
+  bottom = v;
+}
+
+/*
+ * copy_after_unless_divergent
+ *  Like copy_after(other, floor), but gives up if `other` holds an entry
+ *  with the same version number as `split` from a later epoch.  In that
+ *  case this log is clear()ed and false is returned.  Otherwise the
+ *  entries (floor, other.top] are copied, bottom is set to floor and
+ *  true is returned.
+ *
+ *  Requires other.bottom <= floor <= split.
+ */
+bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor)
+{
+  assert(split >= other.bottom);
+  assert(floor >= other.bottom);
+  assert(floor <= split);
+  bottom = other.top;
+  top = bottom;
+
+  /*
+   * runs on replica.  split is primary's log.top.  floor is how much they want.
+   * split tells us if the primary is divergent.. e.g.:
+   *   -> i am A, B is primary, split is 2'6, floor is 2'2.
+   *
+   *      A        B      C
+   *      2'2      2'2
+   *      2'3      2'3    2'3
+   *      2'4      2'4    2'4
+   *      3'5 |    2'5    2'5
+   *      3'6 |    2'6
+   *      3'7 |
+   *      3'8 |
+   *      3'9 |
+   *
+   *   -> i return full backlog.
+   */
+  list<Entry>::const_reverse_iterator it = other.log.rbegin();
+  while (it != other.log.rend()) {
+    // is primary divergent?  e.g. my 3'6 vs their 2'6 split
+    const bool same_seq = (it->version.version == split.version);
+    if (same_seq && it->version.epoch > split.epoch) {
+      clear();
+      return false;   // divergent!
+    }
+
+    if (it->version == floor)
+      break;          // reached the requested floor
+    assert(it->version > floor);
+
+    // e.g. my 2'23 > '12
+    log.push_front(*it);
+    ++it;
+  }
+
+  bottom = floor;
+  return true;
+}
+
+/*
+ * copy_non_backlog
+ *  Copy `other` without its backlog portion.  If other has no backlog
+ *  the whole Log is simply assigned.  Otherwise top/bottom are copied
+ *  and only the entries newer than bottom are taken (scanning
+ *  newest-first and stopping at the first entry that is not newer).
+ */
+void PG::Log::copy_non_backlog(const Log &other)
+{
+  if (!other.backlog) {
+    *this = other;
+    return;
+  }
+
+  top = other.top;
+  bottom = other.bottom;
+
+  list<Entry>::const_reverse_iterator it = other.log.rbegin();
+  while (it != other.log.rend() && it->version > bottom) {
+    log.push_front(*it);
+    ++it;
+  }
+}
+
+
+
+/*
+ * trim
+ *  Drop all entries with version <= s from the front (oldest end) of the
+ *  log, removing each from the index as well.  If the log has a backlog
+ *  and s is below bottom, s is first raised to bottom, so the backlog is
+ *  always dropped in full.  Afterwards the backlog flag is cleared and
+ *  bottom is raised to s if it was lower.
+ *
+ *  The transaction argument is not used by this implementation.
+ */
+void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s)
+{
+  if (backlog && s < bottom)
+    s = bottom;
+
+  for (;;) {
+    if (log.empty())
+      break;
+
+    Entry &oldest = log.front();
+    if (oldest.version > s)
+      break;
+
+    // we must never trim past the recovery cursors
+    assert(complete_to != log.begin());
+    assert(requested_to != log.begin());
+
+    unindex(oldest);    // remove from index,
+    log.pop_front();    // then from the log itself
+  }
+
+  // raise bottom?
+  backlog = false;
+  if (bottom < s)
+    bottom = s;
+}
+
+
+/*
+ * trim_write_ahead
+ *  Discard entries newer than last_update from the tail (newest end) of
+ *  the log, un-indexing each one before it is removed.
+ */
+void PG::IndexedLog::trim_write_ahead(eversion_t last_update)
+{
+  for (;;) {
+    if (log.empty())
+      return;
+
+    Entry &newest = log.back();
+    if (!(newest.version > last_update))
+      return;
+
+    unindex(newest);   // remove from index
+    log.pop_back();    // remove from log
+  }
+}
+
+/*
+ * trim_write_ahead
+ *  If the log extends beyond info.last_update, trim it back to
+ *  last_update.  Otherwise last_update must equal log.top.
+ */
+void PG::trim_write_ahead()
+{
+  if (!(info.last_update < log.top)) {
+    // nothing written ahead; the two must agree exactly
+    assert(info.last_update == log.top);
+    dout(10) << "trim_write_ahead last_update=top=" << info.last_update << endl;
+    return;
+  }
+
+  dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << endl;
+  log.trim_write_ahead(info.last_update);
+}
+
+/*
+ * proc_replica_log
+ *  Process the log + missing set received from peer osd `from` while we
+ *  are still peering (asserts that the pg is not active).
+ *
+ *  - No master log yet: remember the peer's missing set, merge its log
+ *    into ours (merge_log) and update object locations (proc_missing).
+ *
+ *  - Master log already built: remember the peer's missing set, then walk
+ *    the peer's log newest-first until we hit an entry whose version
+ *    matches what our index has for that object.  Entries walked before
+ *    that point are treated as divergent:
+ *      * object unknown to us even after generating a backlog -> ignored
+ *      * our indexed version is newer                          -> ignored
+ *      * otherwise                    -> added to the peer's missing set
+ *    The peer's effective last_update (and oldest_update) is lowered to
+ *    the point where the walk stopped.
+ *
+ *  The backlog is generated at most once: generate_backlog() asserts that
+ *  no backlog exists yet, so it is only called while log.backlog is unset.
+ */
+void PG::proc_replica_log(Log &olog, Missing& omissing, int from)
+{
+  dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << endl;
+  assert(!is_active());
+
+  if (!have_master_log) {
+    // i'm building master log.
+    // note peer's missing.
+    peer_missing[from] = omissing;
+
+    // merge log into our own log
+    merge_log(olog, omissing, from);
+    proc_missing(olog, omissing, from);
+  } else {
+    // i'm just building missing lists.
+    peer_missing[from] = omissing;
+
+    // iterate over peer log. in reverse.
+    list<Log::Entry>::reverse_iterator pp = olog.log.rbegin();
+    eversion_t lu = peer_info[from].last_update;
+    while (pp != olog.log.rend()) {
+      // Not in our index: the object may simply predate our log bottom, so
+      // extend the index with a backlog -- but only if we don't already have
+      // one, since generate_backlog() asserts !log.backlog.
+      if (!log.objects.count(pp->oid) && !log.backlog) {
+        dout(10) << " divergent " << *pp << " not in our log, generating backlog" << endl;
+        generate_backlog();
+      }
+
+      if (!log.objects.count(pp->oid)) {
+        dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << endl;
+        ++pp;
+        continue;
+      }
+
+      if (log.objects[pp->oid]->version == pp->version) {
+        break;  // we're no longer divergent.
+      }
+
+      if (log.objects[pp->oid]->version > pp->version) {
+        // print the superseding entry itself, not the pointer to it
+        dout(10) << " divergent " << *pp
+                 << " superceded by " << *log.objects[pp->oid]
+                 << ", ignoring" << endl;
+      } else {
+        dout(10) << " divergent " << *pp << ", adding to missing" << endl;
+        peer_missing[from].add(pp->oid, pp->version);
+      }
+
+      // step to the next older entry; the peer is only trustworthy up to there
+      ++pp;
+      if (pp != olog.log.rend())
+        lu = pp->version;
+      else
+        lu = olog.bottom;
+    }
+
+    if (lu < peer_info[from].last_update) {
+      dout(10) << " peer osd" << from << " last_update now " << lu << endl;
+      peer_info[from].last_update = lu;
+      if (lu < oldest_update) {
+        dout(10) << " oldest_update now " << lu << endl;
+        oldest_update = lu;
+      }
+    }
+
+    proc_missing(olog, peer_missing[from], from);
+  }
+}
+
+void PG::merge_log(Log &olog, Missing &omissing, int fromosd) // merge peer log `olog` (from osd `fromosd`) into our log, updating `missing` as we go
+{ // NOTE: olog is modified (entries are swapped/spliced out of it); omissing is not read in this function
+ dout(10) << "merge_log " << olog << " from osd" << fromosd
+ << " into " << log << endl;
+
+ //cout << "log" << endl;
+ //log.print(cout);
+ //cout << "olog" << endl;
+ //olog.print(cout);
+
+ if (log.empty() ||
+ (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog)
+ // case 1: our log is empty, or there is a gap between the logs and the peer has a backlog: adopt the peer's log wholesale.
+ // swap and index
+ log.log.swap(olog.log); // after this, olog.log holds OUR old entries
+ log.index();
+
+ // find split point (old log.top) in new log
+ // add new items to missing along the way.
+ for (list<Log::Entry>::reverse_iterator p = log.log.rbegin();
+ p != log.log.rend();
+ p++) {
+ if (p->version <= log.top) { // log.top is still our OLD top here; it is only updated after the loop
+ // ok, p is at split point.
+
+ // was our old log divergent?
+ if (log.top > p->version) {
+ dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << endl;
+ if (p->version < oldest_update)
+ oldest_update = p->version;
+
+ while (!olog.log.empty() &&
+ olog.log.rbegin()->version > p->version) { // walk our old entries newer than the split point
+ Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent)
+ if (log.objects.count(oe.oid)) {
+ if (log.objects[oe.oid]->version < oe.version) {
+ dout(10) << "merge_log divergent entry " << oe
+ << " not superceded by " << *log.objects[oe.oid]
+ << ", adding to missing" << endl;
+ missing.add(oe.oid, oe.version);
+ } else {
+ dout(10) << "merge_log divergent entry " << oe
+ << " superceded by " << *log.objects[oe.oid]
+ << ", ignoring" << endl;
+ }
+ } else {
+ dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << endl;
+ missing.add(oe.oid, oe.version);
+ }
+ olog.log.pop_back(); // discard divergent entry
+ }
+ }
+ break;
+ }
+
+ if (p->is_delete()) { // entry newer than our old top: a delete means nothing to fetch, anything else must be pulled
+ dout(10) << "merge_log merging " << *p << ", not missing" << endl;
+ missing.rm(p->oid, p->version);
+ } else {
+ dout(10) << "merge_log merging " << *p << ", now missing" << endl;
+ missing.add(p->oid, p->version);
+ }
+ }
+
+ info.last_update = log.top = olog.top; // take over the peer's bounds and backlog flag
+ info.log_bottom = log.bottom = olog.bottom;
+ info.log_backlog = log.backlog = olog.backlog;
+ }
+
+ else {
+ // i can merge the two logs!
+
+ // extend on bottom?
+ // FIXME: what if we have backlog, but they have lower bottom?
+ if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) {
+ dout(10) << "merge_log extending bottom to " << olog.bottom
+ << (olog.backlog ? " +backlog":"")
+ << endl;
+
+ // ok
+ list<Log::Entry>::iterator from = olog.log.begin();
+ list<Log::Entry>::iterator to;
+ for (to = from;
+ to != olog.log.end();
+ to++) { // [from,to) becomes the run of peer entries with version <= our bottom
+ if (to->version > log.bottom) break;
+
+ // update our index while we're here
+ log.index(*to);
+
+ dout(15) << *to << endl;
+
+ // new missing object?
+ if (to->version > info.last_complete) {
+ if (to->is_update())
+ missing.add(to->oid, to->version);
+ else
+ missing.rm(to->oid, to->version);
+ }
+ }
+ assert(to != olog.log.end()); // the peer log is expected to reach past our bottom
+
+ // splice into our log.
+ log.log.splice(log.log.begin(),
+ olog.log, from, to); // moves (does not copy) the entries out of olog
+
+ info.log_bottom = log.bottom = olog.bottom;
+ info.log_backlog = log.backlog = olog.backlog;
+ }
+
+ // extend on top?
+ if (olog.top > log.top &&
+ olog.bottom <= log.top) {
+ dout(10) << "merge_log extending top to " << olog.top << endl;
+
+ list<Log::Entry>::iterator to = olog.log.end();
+ list<Log::Entry>::iterator from = olog.log.end();
+ while (1) { // walk the peer log backwards; entries with version >= our top are indexed and taken
+ if (from == olog.log.begin()) break;
+ from--;
+ //dout(0) << "? " << *from << endl;
+ if (from->version < log.top) {
+ from++; // step back onto the first entry we want
+ break;
+ }
+
+ log.index(*from);
+ dout(10) << "merge_log " << *from << endl;
+
+ // add to missing
+ if (from->is_update()) {
+ missing.add(from->oid, from->version);
+ } else
+ missing.rm(from->oid, from->version);
+ }
+
+ // remove divergent items
+ while (1) { // pop our own tail entries until our last version number is exactly one below *from
+ Log::Entry *oldtail = &(*log.log.rbegin()); // NOTE(review): dereferences rbegin() without an emptiness check -- presumably log is non-empty here, confirm
+ if (oldtail->version.version+1 == from->version.version) break;
+
+ // divergent!
+ assert(oldtail->version.version >= from->version.version);
+
+ if (log.objects[oldtail->oid]->version == oldtail->version) {
+ // and significant.
+ dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << endl;
+ //missing.add(oldtail->oid);
+ assert(0); // this case is not handled: the code aborts here
+ } else {
+ dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << endl;
+ assert(missing.is_missing(oldtail->oid));
+ }
+ log.log.pop_back();
+ }
+
+ // splice
+ log.log.splice(log.log.end(),
+ olog.log, from, to); // append [from,to) of the peer log to ours
+
+ info.last_update = log.top = olog.top;
+ }
+ }
+
+ dout(10) << "merge_log result " << log << " " << missing << endl;
+ //log.print(cout);
+
+}
+
+/*
+ * proc_missing
+ *  For each object in our missing set, work out whether the peer that
+ *  sent (olog, omissing) can tell us where to get it:
+ *   - the peer is missing it too: copy the peer's known location, if any
+ *   - the peer is not missing it and the version is within the peer's
+ *     log (<= olog.top): the peer itself (fromosd) is the source
+ *   - otherwise: nothing learned
+ */
+void PG::proc_missing(Log &olog, Missing &omissing, int fromosd)
+{
+  // found items?
+  map<object_t,eversion_t>::iterator m = missing.missing.begin();
+  for ( ; m != missing.missing.end(); ++m) {
+    const object_t &oid = m->first;
+    const eversion_t &need = m->second;
+
+    if (omissing.is_missing(oid)) {
+      // peer lacks it as well; it must be lacking the same version
+      assert(omissing.is_missing(oid, need));
+      if (omissing.loc.count(oid) == 0) {
+        dout(10) << "proc_missing missing " << oid << " " << need
+                 << " also LOST on source, osd" << fromosd << endl;
+      } else {
+        dout(10) << "proc_missing missing " << oid << " " << need
+                 << " on osd" << omissing.loc[oid] << endl;
+        missing.loc[oid] = omissing.loc[oid];
+      }
+      continue;
+    }
+
+    if (need <= olog.top) {
+      dout(10) << "proc_missing missing " << oid << " " << need
+               << " on source, osd" << fromosd << endl;
+      missing.loc[oid] = fromosd;
+      continue;
+    }
+
+    dout(10) << "proc_missing " << oid << " " << need
+             << " > olog.top " << olog.top << ", not found...."
+             << endl;
+  }
+
+  dout(10) << "proc_missing missing " << missing.missing << endl;
+}
+
+
+
+/*
+ * generate_backlog
+ *  Extend the log with a "backlog": one synthetic MODIFY entry for every
+ *  object stored in this pg's collection that the log does not already
+ *  mention.  Each entry's version is read from the object's "version"
+ *  attribute.  The new entries are prepended to the log in ascending
+ *  version order and added to the index.
+ *
+ *  Must only be called when no backlog exists yet (asserted).
+ */
+void PG::generate_backlog()
+{
+  dout(10) << "generate_backlog to " << log << endl;
+  assert(!log.backlog);
+  log.backlog = true;
+
+  // everything the store holds for this pg
+  list<object_t> stored;
+  osd->store->collection_list(info.pgid, stored);
+
+  // collect unlogged objects, keyed (and thereby sorted) by version
+  map<eversion_t,Log::Entry> add;
+  int local = 0;
+  for (list<object_t>::iterator obj = stored.begin();
+       obj != stored.end();
+       ++obj, ++local) {
+    if (log.logged_object(*obj))
+      continue;  // already have it logged.
+
+    Log::Entry e;
+    e.op = Log::Entry::MODIFY;  // FIXME when we do smarter op codes!
+    e.oid = *obj;
+    osd->store->getattr(*obj, "version",
+                        &e.version, sizeof(e.version));
+    add[e.version] = e;
+    dout(10) << "generate_backlog found " << e << endl;
+  }
+
+  // newest first + push_front => oldest ends up at the very front
+  map<eversion_t,Log::Entry>::reverse_iterator r = add.rbegin();
+  while (r != add.rend()) {
+    log.log.push_front(r->second);
+    log.index( log.log.front() );  // index the copy that lives in the log
+    ++r;
+  }
+
+  dout(10) << local << " local objects, "
+           << add.size() << " objects added to backlog, "
+           << log.objects.size() << " in pg" << endl;
+
+  //log.print(cout);
+}
+
+/*
+ * drop_backlog
+ *  Undo generate_backlog(): clear the backlog flag and remove every
+ *  entry at or below log.bottom from the front of the log (and from the
+ *  index).  A backlog must exist (asserted).
+ */
+void PG::drop_backlog()
+{
+  dout(10) << "drop_backlog for " << log << endl;
+  //log.print(cout);
+
+  assert(log.backlog);
+  log.backlog = false;
+
+  for (;;) {
+    if (log.log.empty())
+      break;
+
+    Log::Entry &oldest = log.log.front();
+    if (oldest.version > log.bottom)
+      break;  // reached the real log
+
+    dout(15) << "drop_backlog trimming " << oldest.version << endl;
+    log.unindex(oldest);
+    log.log.pop_front();
+  }
+}
+
+
+
+
+
+/*
+ * print
+ *  Dump the log summary followed by every entry, one per line, oldest
+ *  first.  Returns the stream to allow chaining.
+ */
+ostream& PG::Log::print(ostream& out) const
+{
+  out << *this << endl;
+
+  list<Entry>::const_iterator e = log.begin();
+  while (e != log.end()) {
+    out << *e << endl;
+    ++e;
+  }
+
+  return out;
+}
+
+
+
+
+
+/******* PG ***********/
+/*
+ * build_prior
+ *  Rebuild prior_set: the other members of the current acting set
+ *  (everyone but acting[0]), plus every osd that was in this pg's acting
+ *  set in any map epoch from max(1, last_epoch_started_any) up to (but
+ *  excluding) the current epoch, provided that osd is up in the current
+ *  map and is not this osd.
+ */
+void PG::build_prior()
+{
+  prior_set.clear();
+
+  // current replicas (index 0 is skipped)
+  for (unsigned r = 1; r < acting.size(); ++r)
+    prior_set.insert(acting[r]);
+
+  // and prior map(s), if OSDs are still up
+  const epoch_t current = osd->osdmap->get_epoch();
+  epoch_t e = MAX(1, last_epoch_started_any);
+  while (e < current) {
+    OSDMap past_map;
+    osd->get_map(e, past_map);
+
+    vector<int> past_acting;
+    past_map.pg_to_acting_osds(get_pgid(), past_acting);
+
+    for (unsigned r = 0; r < past_acting.size(); ++r) {
+      const int o = past_acting[r];
+      if (!osd->osdmap->is_up(o))   // must be up now
+        continue;
+      if (o == osd->whoami)         // and must not be me
+        continue;
+      prior_set.insert(o);
+    }
+
+    ++e;
+  }
+
+  dout(10) << "build_prior built " << prior_set << endl;
+}
+
+/*
+ * adjust_prior
+ *  Raise last_epoch_started_any to the largest last_epoch_started
+ *  reported in peer_info, then rebuild the prior set from that epoch.
+ *  The new value must be strictly greater than the old one (asserted),
+ *  and the prior set must not be empty on entry (asserted).
+ */
+void PG::adjust_prior()
+{
+  assert(!prior_set.empty());
+
+  // find the newest last_epoch_started among the peers we heard from
+  epoch_t newest = 0;
+  map<int,Info>::iterator pi = peer_info.begin();
+  while (pi != peer_info.end()) {
+    const epoch_t les = pi->second.last_epoch_started;
+    if (les > newest)
+      newest = les;
+    ++pi;
+  }
+
+  dout(10) << "adjust_prior last_epoch_started_any "
+           << last_epoch_started_any << " -> " << newest << endl;
+  assert(newest > last_epoch_started_any);
+  last_epoch_started_any = newest;
+
+  // rebuild prior set
+  build_prior();
+}
+
+
+/*
+ * clear_primary_state
+ *  Forget everything learned during peering so that peer() starts from
+ *  scratch: the master-log flag, the prior/stray/clean sets, which peers
+ *  were asked for info, logs or summaries/backlogs, and the info and
+ *  missing sets received from them.  last_epoch_started_any is reset to
+ *  our own last_epoch_started.
+ *
+ *  peer_summary_requested is cleared together with the other request
+ *  sets.  peer() skips (and returns early on) any osd still listed there,
+ *  so a stale entry would keep a BACKLOG query from ever being re-issued.
+ */
+void PG::clear_primary_state()
+{
+  dout(10) << "clear_primary_state" << endl;
+
+  // clear peering state
+  have_master_log = false;
+  prior_set.clear();
+  stray_set.clear();
+  clean_set.clear();
+  peer_info_requested.clear();
+  peer_log_requested.clear();
+  peer_summary_requested.clear();
+  peer_info.clear();
+  peer_missing.clear();
+
+  last_epoch_started_any = info.last_epoch_started;
+}
+
+void PG::peer(ObjectStore::Transaction& t,
+ map< int, map<pg_t,Query> >& query_map) // out: queries to issue, keyed by target osd then pgid; this function only fills the map
+{ // one pass of primary-side peering; returns early whenever it still waits for replies. t is only used if we reach activate().
+ dout(10) << "peer. acting is " << acting
+ << ", prior_set is " << prior_set << endl;
+
+
+ /** GET ALL PG::Info *********/
+
+ // -- query info from everyone in prior_set.
+ bool missing_info = false;
+ for (set<int>::iterator it = prior_set.begin();
+ it != prior_set.end();
+ it++) {
+ if (peer_info.count(*it)) {
+ dout(10) << " have info from osd" << *it
+ << ": " << peer_info[*it]
+ << endl;
+ continue;
+ }
+ missing_info = true;
+
+ if (peer_info_requested.count(*it)) { // already asked; don't queue a second query
+ dout(10) << " waiting for osd" << *it << endl;
+ continue;
+ }
+
+ dout(10) << " querying info from osd" << *it << endl;
+ query_map[*it][info.pgid] = Query(Query::INFO, info.history);
+ peer_info_requested.insert(*it);
+ }
+ if (missing_info) return; // wait until every prior osd has reported
+
+
+ // -- ok, we have all (prior_set) info. (and maybe others.)
+
+ // did we crash?
+ dout(10) << " last_epoch_started_any " << last_epoch_started_any << endl;
+ if (last_epoch_started_any) {
+ OSDMap omap;
+ osd->get_map(last_epoch_started_any, omap);
+
+ // start with the last active set of replicas
+ set<int> last_started;
+ vector<int> acting; // local; shadows the member `acting` inside this block
+ omap.pg_to_acting_osds(get_pgid(), acting);
+ for (unsigned i=0; i<acting.size(); i++)
+ last_started.insert(acting[i]);
+
+ // make sure at least one of them is still up
+ for (epoch_t e = last_epoch_started_any+1;
+ e <= osd->osdmap->get_epoch();
+ e++) { // an osd survives only if it is up in every map since last_epoch_started_any
+ OSDMap omap;
+ osd->get_map(e, omap);
+
+ set<int> still_up;
+
+ for (set<int>::iterator i = last_started.begin();
+ i != last_started.end();
+ i++) {
+ //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << endl;
+ if (omap.is_up(*i))
+ still_up.insert(*i);
+ }
+
+ last_started.swap(still_up);
+ //dout(10) << " still active as of epoch " << e << ": " << last_started << endl;
+ }
+
+ if (last_started.empty()) { // nobody from the last started set stayed up throughout
+ dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+ state_set(STATE_CRASHED);
+ } else {
+ dout(10) << " still active from last started: " << last_started << endl;
+ }
+ } else if (osd->osdmap->get_epoch() > 1) { // never started anywhere, yet we are past epoch 1: also treated as crashed
+ dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+ state_set(STATE_CRASHED);
+ }
+
+ dout(10) << " peers_complete_thru " << peers_complete_thru << endl; // value from before the recomputation below
+
+
+
+
+ /** CREATE THE MASTER PG::Log *********/
+
+ // who (of all priors and active) has the latest PG version?
+ eversion_t newest_update = info.last_update;
+ int newest_update_osd = osd->whoami;
+
+ oldest_update = info.last_update; // only of acting (current) osd set.
+ peers_complete_thru = info.last_complete;
+
+ for (map<int,Info>::iterator it = peer_info.begin();
+ it != peer_info.end();
+ it++) {
+ if (it->second.last_update > newest_update) {
+ newest_update = it->second.last_update;
+ newest_update_osd = it->first;
+ }
+ if (is_acting(it->first)) { // minima are taken over acting osds only
+ if (it->second.last_update < oldest_update)
+ oldest_update = it->second.last_update;
+ if (it->second.last_complete < peers_complete_thru)
+ peers_complete_thru = it->second.last_complete;
+ }
+ }
+
+ // gather log(+missing) from that person!
+ if (newest_update_osd != osd->whoami) {
+ if (peer_log_requested.count(newest_update_osd) ||
+ peer_summary_requested.count(newest_update_osd)) {
+ dout(10) << " newest update on osd" << newest_update_osd
+ << " v " << newest_update
+ << ", already queried"
+ << endl;
+ } else {
+ // we'd like it back to oldest_update, but will settle for log_bottom
+ eversion_t since = MAX(peer_info[newest_update_osd].log_bottom,
+ oldest_update);
+ if (peer_info[newest_update_osd].log_bottom < log.top) { // their log overlaps ours: an incremental log is enough
+ dout(10) << " newest update on osd" << newest_update_osd
+ << " v " << newest_update
+ << ", querying since " << since
+ << endl;
+ query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history);
+ peer_log_requested.insert(newest_update_osd);
+ } else { // no overlap with our log: ask for the whole summary/backlog
+ dout(10) << " newest update on osd" << newest_update_osd
+ << " v " << newest_update
+ << ", querying entire summary/backlog"
+ << endl;
+ assert((peer_info[newest_update_osd].last_complete >=
+ peer_info[newest_update_osd].log_bottom) ||
+ peer_info[newest_update_osd].log_backlog); // or else we're in trouble.
+ query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history);
+ peer_summary_requested.insert(newest_update_osd);
+ }
+ }
+ return; // someone else has the newest log; wait for it before going on
+ } else {
+ dout(10) << " newest_update " << info.last_update << " (me)" << endl;
+ }
+
+ dout(10) << " oldest_update " << oldest_update << endl;
+
+ have_master_log = true; // from here on our own log is the authoritative one
+
+
+ // -- do i need to generate backlog for any of my peers?
+ if (oldest_update < log.bottom && !log.backlog) {
+ dout(10) << "generating backlog for some peers, bottom "
+ << log.bottom << " > " << oldest_update
+ << endl;
+ generate_backlog();
+ }
+
+
+ /** COLLECT MISSING+LOG FROM PEERS **********/
+ /*
+ we also detect divergent replicas here by pulling the full log
+ from everyone.
+ */
+
+ // gather missing from peers
+ for (unsigned i=1; i<acting.size(); i++) { // index 0 is skipped
+ int peer = acting[i];
+ if (peer_info[peer].is_empty()) continue;
+ if (peer_log_requested.count(peer) ||
+ peer_summary_requested.count(peer)) continue;
+
+ dout(10) << " pulling log+missing from osd" << peer
+ << endl;
+ query_map[peer][info.pgid] = Query(Query::FULLLOG, info.history);
+ peer_log_requested.insert(peer);
+ }
+
+ // did we get them all?
+ bool have_missing = true;
+ for (unsigned i=1; i<acting.size(); i++) {
+ int peer = acting[i];
+ if (peer_info[peer].is_empty()) continue;
+ if (peer_missing.count(peer)) continue;
+
+ dout(10) << " waiting for log+missing from osd" << peer << endl;
+ have_missing = false;
+ }
+ if (!have_missing) return;
+
+ dout(10) << " peers_complete_thru " << peers_complete_thru << endl;
+
+
+ // -- ok. and have i located all pg contents?
+ if (missing.num_lost() > 0) {
+ dout(10) << "there are still " << missing.num_lost() << " lost objects" << endl;
+
+ // *****
+ // FIXME: i don't think this actually accomplishes anything!
+ // *****
+
+ // ok, let's get more summaries!
+ bool waiting = false;
+ for (map<int,Info>::iterator it = peer_info.begin();
+ it != peer_info.end();
+ it++) {
+ int peer = it->first;
+
+ if (peer_summary_requested.count(peer)) {
+ dout(10) << " already requested summary/backlog from osd" << peer << endl;
+ waiting = true;
+ continue;
+ }
+
+ dout(10) << " requesting summary/backlog from osd" << peer << endl;
+ query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history);
+ peer_summary_requested.insert(peer);
+ waiting = true;
+ }
+
+ if (!waiting) { // only reachable when peer_info is empty
+ dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << endl;
+ }
+ return; // cannot activate while objects are lost
+ }
+
+ // sanity check
+ assert(missing.num_lost() == 0);
+ assert(info.last_complete >= log.bottom || log.backlog);
+
+
+ // -- crash recovery?
+ if (is_crashed()) { // don't activate right away: enter REPLAY and schedule a C_Activate after osd_replay_window
+ dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl;
+ state_set(STATE_REPLAY);
+ g_timer.add_event_after(g_conf.osd_replay_window,
+ new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
+ }
+ else if (!is_active()) {
+ // -- ok, activate!
+ activate(t);
+ }
+}
+
+
+/*
+ * activate
+ *  Bring this PG into the active state (asserts it is not active yet):
+ *   - set ACTIVE, clear STRAY, and clear CRASHED/REPLAY if we got here
+ *     after a replay window
+ *   - record the current map epoch as info.last_epoch_started
+ *   - queue the pg info attribute, the log and local cleanup on
+ *     transaction t
+ *   - position the log's complete_to / requested_to cursors and start
+ *     recovery if we are not complete
+ *   - if we are primary (role 0) and past epoch 1: send every replica an
+ *     MOSDPGLog with the part of the log it lacks, update our view of its
+ *     missing set, and note which replicas are already clean
+ *   - requeue ops collected for replay (ahead of other waiters), then
+ *     everything that was waiting for the pg to become active
+ */
+void PG::activate(ObjectStore::Transaction& t)
+{
+  assert(!is_active());
+
+  // twiddle pg state
+  state_set(STATE_ACTIVE);
+  state_clear(STATE_STRAY);
+  if (is_crashed()) {
+    assert(is_replay());
+    state_clear(STATE_CRASHED);
+    state_clear(STATE_REPLAY);
+  }
+  info.last_epoch_started = osd->osdmap->get_epoch();
+
+  if (role == 0) {    // primary state
+    peers_complete_thru = 0;  // we don't know (yet)!
+  }
+
+  assert(info.last_complete >= log.bottom || log.backlog);
+
+  // write pg info
+  t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info));
+
+  // write log
+  write_log(t);
+
+  // clean up stray objects
+  clean_up_local(t);
+
+  // init complete pointer
+  if (info.last_complete == info.last_update) {
+    dout(10) << "activate - complete" << endl;
+    // Both cursors go to the end of the log.  complete_to is assigned here;
+    // a '==' in its place would compare and throw the result away, leaving
+    // the cursor wherever it was.
+    log.complete_to = log.log.end();
+    log.requested_to = log.log.end();
+  }
+  //else if (is_primary()) {
+  else if (true) {
+    // (the restriction to the primary is disabled; everyone recovers)
+    dout(10) << "activate - not complete, " << missing << ", starting recovery" << endl;
+
+    // init complete_to: first entry at or beyond last_complete
+    log.complete_to = log.log.begin();
+    while (log.complete_to->version < info.last_complete) {
+      log.complete_to++;
+      assert(log.complete_to != log.log.end());
+    }
+
+    // start recovery
+    log.requested_to = log.complete_to;
+    do_recovery();
+  } else {
+    dout(10) << "activate - not complete, " << missing << endl;
+  }
+
+
+  // if primary..
+  if (role == 0 &&
+      osd->osdmap->get_epoch() > 1) {
+    // who is clean?
+    clean_set.clear();
+    if (info.is_clean())
+      clean_set.insert(osd->whoami);
+
+    // start up replicas
+    for (unsigned i=1; i<acting.size(); i++) {
+      int peer = acting[i];
+      assert(peer_info.count(peer));
+
+      MOSDPGLog *m = new MOSDPGLog(osd->osdmap->get_epoch(),
+                                   info.pgid);
+      m->info = info;
+
+      if (peer_info[peer].last_update == info.last_update) {
+        // empty log
+      }
+      else if (peer_info[peer].last_update < log.bottom) {
+        // summary/backlog
+        assert(log.backlog);
+        m->log = log;
+      }
+      else {
+        // incremental log
+        assert(peer_info[peer].last_update < info.last_update);
+        m->log.copy_after(log, peer_info[peer].last_update);
+      }
+
+      // update local version of peer's missing list!
+      {
+        eversion_t plu = peer_info[peer].last_update;
+        Missing& pm = peer_missing[peer];
+        for (list<Log::Entry>::iterator p = m->log.log.begin();
+             p != m->log.log.end();
+             p++)
+          if (p->version > plu)
+            pm.add(p->oid, p->version);
+      }
+
+      dout(10) << "activate sending " << m->log << " " << m->missing
+               << " to osd" << peer << endl;
+      //m->log.print(cout);
+      osd->messenger->send_message(m, MSG_ADDR_OSD(peer), osd->osdmap->get_inst(peer));
+
+      // update our missing
+      if (peer_missing[peer].num_missing() == 0) {
+        dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << endl;
+        assert(peer_info[peer].last_complete == info.last_update);
+        clean_set.insert(peer);
+      } else {
+        dout(10) << "activate peer osd" << peer << " " << peer_info[peer]
+                 << " missing " << peer_missing[peer] << endl;
+      }
+
+    }
+
+    // discard unneeded peering state
+    //peer_log.clear(); // actually, do this carefully, in case peer() is called again.
+
+    // all clean?
+    if (is_all_clean()) {
+      state_set(STATE_CLEAN);
+      dout(10) << "activate all replicas clean" << endl;
+      clean_replicas();
+    }
+  }
+
+
+  // replay (queue them _before_ other waiting ops!)
+  if (!replay_queue.empty()) {
+    eversion_t c = info.last_update;   // version of the last op accounted for
+    list<Message*> replay;
+    for (map<eversion_t,MOSDOp*>::iterator p = replay_queue.begin();
+         p != replay_queue.end();
+         p++) {
+      if (p->first <= info.last_update) {
+        // already applied; still requeued
+        dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << endl;
+        replay.push_back(p->second);
+        continue;
+      }
+      if (p->first.version != c.version+1) {
+        // gap in the sequence: number of skipped ops is actual - expected
+        dout(10) << "activate replay " << p->first
+                 << " skipping " << p->first.version - (c.version+1)
+                 << " ops"
+                 << endl;
+      }
+      dout(10) << "activate replay " << p->first << " " << *p->second << endl;
+      replay.push_back(p->second);
+      c = p->first;
+    }
+    replay_queue.clear();
+    osd->take_waiters(replay);
+  }
+
+  // waiters
+  osd->take_waiters(waiting_for_active);
+}
+
+/** clean_up_local
+ * queue removal (on transaction t) of objects we are storing but should
+ * not be, as determined by the log.
+ *
+ * For each object only its newest log entry is considered.  With a
+ * backlog the log is taken to describe the whole pg, so besides logged
+ * deletions, any stored object the log never mentions is removed as a
+ * stray.  Without a backlog only logged deletions are removed.
+ */
+void PG::clean_up_local(ObjectStore::Transaction& t)
+{
+  dout(10) << "clean_up_local" << endl;
+
+  assert(info.last_update >= log.bottom);  // otherwise we need some help!
+
+  set<object_t> seen;   // objects whose newest entry was already handled
+
+  if (!log.backlog) {
+    // just scan the log.
+    list<Log::Entry>::reverse_iterator e = log.log.rbegin();
+    for ( ; e != log.log.rend(); ++e) {
+      if (!seen.insert(e->oid).second)
+        continue;   // an even newer entry for this object came first
+
+      if (e->is_delete()) {
+        dout(10) << " deleting " << e->oid
+                 << " when " << e->version << endl;
+        t.remove(e->oid);
+      }
+      // else: keep old(+missing) objects, just for kicks.
+    }
+    return;
+  }
+
+  // be thorough: start from everything the store holds for this pg
+  list<object_t> ls;
+  osd->store->collection_list(info.pgid, ls);
+  set<object_t> unaccounted(ls.begin(), ls.end());
+
+  list<Log::Entry>::reverse_iterator e = log.log.rbegin();
+  for ( ; e != log.log.rend(); ++e) {
+    if (!seen.insert(e->oid).second)
+      continue;
+
+    const bool deleted = e->is_delete();
+    if (deleted && unaccounted.count(e->oid)) {
+      dout(10) << " deleting " << e->oid
+               << " when " << e->version << endl;
+      t.remove(e->oid);
+    }
+    // deleted or not, the log accounts for this object
+    // (live old objects are left alone.. they're missing or whatever)
+    unaccounted.erase(e->oid);
+  }
+
+  // whatever the log never mentioned is a stray
+  set<object_t>::iterator s = unaccounted.begin();
+  for ( ; s != unaccounted.end(); ++s) {
+    dout(10) << " deleting stray " << *s << endl;
+    t.remove(*s);
+  }
+}
+
+
+
+ /** cancel_recovery
+  * abandon all in-progress pulls: give their slots back to the osd-wide
+  * pull counter and forget where we thought missing objects could be found.
+  */
+ void PG::cancel_recovery()
+ {
+   // release our share of the osd-wide pull count before dropping the set
+   osd->num_pulling -= objects_pulling.size();
+   objects_pulling.clear();
+
+   // forget about where missing items are
+   missing.loc.clear();
+ }
+
+/**
+ * do one recovery op.
+ * return true if done, false if nothing left to do.
+ */
+bool PG::do_recovery()
+{
+ dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, "
+ << osd->num_pulling << "/" << g_conf.osd_max_pull << " total"
+ << endl;
+ dout(10) << "do_recovery " << missing << endl;
+
+ // can we slow down on this PG?
+ if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) {
+ dout(-10) << "do_recovery already pulling max, waiting" << endl;
+ return true;
+ }
+
+ // look at log!
+ Log::Entry *latest = 0;
+
+ while (log.requested_to != log.log.end()) {
+ assert(log.objects.count(log.requested_to->oid));
+ latest = log.objects[log.requested_to->oid];
+ assert(latest);
+
+ dout(10) << "do_recovery "
+ << *log.requested_to
+ << (objects_pulling.count(latest->oid) ? " (pulling)":"")
+ << endl;
+
+ if (latest->is_update() &&
+ !objects_pulling.count(latest->oid) &&
+ missing.is_missing(latest->oid)) {
+ osd->pull(this, latest->oid);
+ return true;
+ }
+
+ log.requested_to++;
+ }
+
+ if (!objects_pulling.empty()) {
+ dout(7) << "do_recovery requested everything, still waiting" << endl;
+ return false;
+ }
+
+ // done?
+ assert(missing.num_missing() == 0);
+ assert(info.last_complete == info.last_update);
+
+ if (is_primary()) {
+ // i am primary
+ dout(7) << "do_recovery complete, cleaning strays" << endl;
+ clean_set.insert(osd->whoami);
+ if (is_all_clean()) {
+ state_set(PG::STATE_CLEAN);
+ clean_replicas();
+ }
+ } else {
+ // tell primary
+ dout(7) << "do_recovery complete, telling primary" << endl;
+ list<PG::Info> ls;
+ ls.push_back(info);
+ osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(),
+ ls),
+ MSG_ADDR_OSD(get_primary()), osd->osdmap->get_inst(get_primary()));
+ }
+
+ return false;
+}
+
+ /** do_peer_recovery
+  * push one object to the replicas that lack it.
+  *
+  * finds the first member of the acting set with a non-empty missing set,
+  * picks that peer's oldest missing object (by version), pushes it to that
+  * peer and to every later acting peer that is missing it too, then
+  * returns.  does nothing if no acting peer is missing anything.
+  */
+ void PG::do_peer_recovery()
+ {
+   dout(10) << "do_peer_recovery" << endl;
+
+   for (unsigned i = 0; i < acting.size(); i++) {
+     const int peer = acting[i];
+     if (peer_missing.count(peer) == 0 ||
+         peer_missing[peer].num_missing() == 0)
+       continue;
+
+     // oldest first!
+     // rmissing is keyed by version only, so objects sharing a version
+     // share one slot and it can run dry while missing is still
+     // non-empty; fall back to any missing object in that case rather
+     // than dereferencing end().
+     Missing& pm = peer_missing[peer];
+     const object_t oid = pm.rmissing.empty() ?
+       pm.missing.begin()->first :
+       pm.rmissing.begin()->second;
+
+     osd->push(this, oid, peer);
+
+     // do other peers need it too?
+     for (unsigned j = i + 1; j < acting.size(); j++) {
+       const int other = acting[j];
+       if (peer_missing.count(other) &&
+           peer_missing[other].is_missing(oid))
+         osd->push(this, oid, other);
+     }
+
+     return;   // one object per call
+   }
+
+   // nothing to do!
+ }
+
+
+
+ /** clean_replicas
+  * send an MOSDPGRemove naming this pg to every osd in stray_set,
+  * then empty stray_set.
+  */
+ void PG::clean_replicas()
+ {
+   dout(10) << "clean_replicas. strays are " << stray_set << endl;
+
+   set<int>::iterator stray = stray_set.begin();
+   while (stray != stray_set.end()) {
+     dout(10) << "sending PGRemove to osd" << *stray << endl;
+
+     // the message carries a set of pgids; ours is the only member.
+     set<pg_t> pgs;
+     pgs.insert(info.pgid);
+     MOSDPGRemove *rm = new MOSDPGRemove(osd->osdmap->get_epoch(), pgs);
+     osd->messenger->send_message(rm, MSG_ADDR_OSD(*stray), osd->osdmap->get_inst(*stray));
+
+     ++stray;
+   }
+
+   stray_set.clear();
+ }
+
+
+
+ /** write_log
+  * queue (in t) a full rewrite of the on-disk log from the in-memory log.
+  *
+  * entries are appended back to back as raw structs, starting at offset 0.
+  * block_map is rebuilt along the way: whenever an entry begins on a 4096
+  * byte boundary, its version is recorded for that offset.  the log bounds
+  * and the pg info are stored as collection attributes.
+  */
+ void PG::write_log(ObjectStore::Transaction& t)
+ {
+   bufferlist bl;
+
+   // start over from an empty on-disk layout
+   ondisklog.block_map.clear();
+   ondisklog.bottom = 0;
+
+   // serialize every entry, noting which ones begin a 4k block
+   list<Log::Entry>::iterator p = log.log.begin();
+   while (p != log.log.end()) {
+     if (bl.length() % 4096 == 0)
+       ondisklog.block_map[bl.length()] = p->version;
+     bl.append((char*)&(*p), sizeof(*p));
+     ++p;
+   }
+   ondisklog.top = bl.length();
+
+   // replace the log object wholesale
+   t.remove( object_t(1,info.pgid) );
+   t.write( object_t(1,info.pgid) , 0, bl.length(), bl);
+
+   // persist bounds and pg info alongside it
+   t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+   t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+   t.collection_setattr(info.pgid, "info", &info, sizeof(info));
+ }
+
+ /** trim_ondisklog_to
+  * advance the on-disk log's bottom offset, in whole blocks, toward v.
+  *
+  * block_map maps a block's byte offset to the first version logged in
+  * that block.  we pick the last block whose successor (if any) starts
+  * at a version <= v, move ondisklog.bottom to that block's offset, drop
+  * the map entries before it, and queue the updated bounds in t.
+  * nothing is changed if block_map is empty or the chosen block is the
+  * first one.
+  */
+ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v)
+ {
+   dout(15) << " trim_ondisk_log_to v " << v << endl;
+
+   // with no blocks mapped there is nothing to trim (and no valid
+   // iterator to inspect below).
+   if (ondisklog.block_map.empty())
+     return;
+
+   // walk forward while the next block still starts at or before v
+   map<off_t,eversion_t>::iterator p = ondisklog.block_map.begin();
+   for (;;) {
+     dout(15) << " " << p->first << " -> " << p->second << endl;
+     map<off_t,eversion_t>::iterator next = p;
+     ++next;
+     if (next == ondisklog.block_map.end() ||
+         next->second > v)   // too far!
+       break;
+     p = next;
+   }
+   dout(15) << " * " << p->first << " -> " << p->second << endl;
+   if (p == ondisklog.block_map.begin())
+     return;  // can't trim anything!
+
+   // we can trim!
+   ondisklog.bottom = p->first;
+   dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+
+   // adjust block_map: forget every block before the new bottom
+   ondisklog.block_map.erase(ondisklog.block_map.begin(), p);
+
+   t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+   t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+ }
+
+
+ /** append_log
+  * queue (in t) one new entry at the end of the on-disk log, then trim
+  * the in-memory and on-disk logs if trim_to is beyond the current
+  * log bottom.
+  *
+  * the entry is written as a raw struct at offset ondisklog.top; when
+  * g_conf.osd_pad_pg_log is set it is padded out to 4096 bytes.
+  * NOTE(review): read_log() steps through the object sizeof(Entry) bytes
+  * at a time, which does not look compatible with padded entries -- confirm.
+  */
+ void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry,
+                     eversion_t trim_to)
+ {
+   // does this entry start a new 4k block?  note it before top moves.
+   if (ondisklog.top % 4096 == 0)
+     ondisklog.block_map[ondisklog.top] = logentry.version;
+
+   // build the on-disk image of the entry
+   bufferlist bl;
+   bl.append( (char*)&logentry, sizeof(logentry) );
+   if (g_conf.osd_pad_pg_log) {  // pad to 4k, until i fix ebofs reallocation crap.  FIXME.
+     bufferptr bp(4096 - sizeof(logentry));
+     bl.push_back(bp);
+   }
+
+   // write it at the current end of the log, then advance the end
+   t.write( object_t(1,info.pgid), ondisklog.top, bl.length(), bl );
+   ondisklog.top += bl.length();
+   t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+
+   // trim?
+   const bool should_trim = trim_to > log.bottom;
+   if (should_trim) {
+     dout(10) << " trimming " << log << " to " << trim_to << endl;
+     log.trim(t, trim_to);
+     info.log_bottom = log.bottom;
+     info.log_backlog = log.backlog;
+     trim_ondisklog_to(t, trim_to);
+   }
+   dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+ }
+
+ /** read_log
+  * load the pg log from the store and rebuild in-memory state from it.
+  *
+  * - reads the on-disk bounds from collection attributes
+  * - reads raw entries in [bottom,top), keeping those above log.bottom
+  *   (or everything, if we have a backlog), and rebuilds block_map
+  * - indexes the log
+  * - rebuilds the missing set: for the newest non-delete entry of each
+  *   object past info.last_complete, the object is missing if its stored
+  *   "version" attribute is absent or older than the logged version.
+  *
+  * NOTE(review): the final step queries osd->store rather than the store
+  * argument -- presumably the same object; confirm.
+  */
+ void PG::read_log(ObjectStore *store)
+ {
+   // load bounds
+   ondisklog.top = 0;
+   ondisklog.bottom = 0;
+   store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+   store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+
+   log.bottom = info.log_bottom;
+   log.backlog = info.log_backlog;
+
+   if (ondisklog.top > 0) {
+     // read the whole live extent in one go
+     bufferlist bl;
+     store->read(object_t(1,info.pgid), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
+
+     PG::Log::Entry e;
+     for (off_t pos = ondisklog.bottom; pos < ondisklog.top; pos += sizeof(e)) {
+       bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e);
+
+       // ignore items below log.bottom, unless we keep a backlog
+       if (!log.backlog && !(e.version > log.bottom))
+         continue;
+
+       if (pos % 4096 == 0)
+         ondisklog.block_map[pos] = e.version;
+       log.log.push_back(e);
+     }
+   }
+   log.top = info.last_update;
+   log.index();
+
+   // build missing: newest entries first, stop once we reach what is
+   // already known complete.
+   set<object_t> did;
+   list<Log::Entry>::reverse_iterator i = log.log.rbegin();
+   for (; i != log.log.rend(); i++) {
+     if (i->version <= info.last_complete)
+       break;
+     if (!did.insert(i->oid).second)
+       continue;              // already handled a newer entry for this oid
+     if (i->is_delete())
+       continue;
+
+     eversion_t v;
+     int r = osd->store->getattr(i->oid, "version", &v, sizeof(v));
+     if (r < 0 || v < i->version)
+       missing.add(i->oid, i->version);
+   }
+ }
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __PG_H
+#define __PG_H
+
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "OSDMap.h"
+#include "ObjectStore.h"
+#include "msg/Messenger.h"
+#include "messages/MOSDOpReply.h"
+
+#include "include/types.h"
+
+#include <list>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+class OSD;
+
+ /* reqid_t - identifies one client request: the caller's address plus the
+  * caller-assigned transaction id.
+  */
+ class reqid_t {
+ public:
+   msg_addr_t addr;   // who issued the request
+   tid_t      tid;    // the caller's id for it
+
+   // default: tid 0, default-constructed address
+   reqid_t()
+     : tid(0)
+   {}
+
+   reqid_t(const msg_addr_t& caller, tid_t id)
+     : addr(caller),
+       tid(id)
+   {}
+ };
+
+ // print a request id as "<addr>.<tid>"
+ inline ostream& operator<<(ostream& out, const reqid_t& r) {
+   out << r.addr;
+   out << ".";
+   out << r.tid;
+   return out;
+ }
+ // two request ids are equal when both the address and the tid match
+ inline bool operator==(const reqid_t& l, const reqid_t& r) {
+   if (!(l.addr == r.addr))
+     return false;
+   return l.tid == r.tid;
+ }
+ // two request ids differ when either the address or the tid differs
+ inline bool operator!=(const reqid_t& l, const reqid_t& r) {
+   if (l.addr != r.addr)
+     return true;
+   return l.tid != r.tid;
+ }
+
+ namespace __gnu_cxx {
+   // hash support so reqid_t can be used as a hash_set/hash_map key:
+   // combines a hash of the address (type xor num) with a hash of the tid.
+   template<> struct hash<reqid_t> {
+     size_t operator()(const reqid_t &r) const {
+       static hash<unsigned long> addr_hasher;
+       static hash<__uint64_t> tid_hasher;
+       const size_t addr_part = addr_hasher(r.addr.type() ^ r.addr.num());
+       const size_t tid_part = tid_hasher(r.tid);
+       return addr_part ^ tid_part;
+     }
+   };
+ }
+
+/** PG - Replica Placement Group
+ *
+ */
+
+class PG {
+public:
+
+ /*
+ * PG::Info - summary of PG statistics.
+ *
+ * some notes:
+ * - last_complete implies we have all objects that existed as of that
+ * stamp, OR a newer object, OR have already applied a later delete.
+ * - if last_complete >= log.bottom, then we know pg contents thru log.top.
+ * otherwise, we have no idea what the pg is supposed to contain.
+ *
+ * note: this struct is stored as raw bytes (sizeof(info)) in the "info"
+ * collection attribute by PG::write_log, so its layout is effectively
+ * part of the on-disk format.
+ */
+ struct Info {
+ pg_t pgid;
+ eversion_t last_update; // last object version applied to store.
+ eversion_t last_complete; // last version pg was complete through.
+
+ eversion_t log_bottom; // oldest log entry.
+ bool log_backlog; // do we store a complete log?
+
+ epoch_t last_epoch_started; // last epoch started.
+ epoch_t last_epoch_finished; // last epoch finished.
+
+ // epochs since which parts of the pg mapping have been unchanged.
+ struct History {
+ epoch_t same_since; // same acting set since
+ epoch_t same_primary_since; // same primary at least back through this epoch.
+ epoch_t same_acker_since; // same acker at least back through this epoch.
+ History() : same_since(0), same_primary_since(0), same_acker_since(0) {}
+ } history;
+
+ // the eversion_t members are left to their default constructors.
+ Info(pg_t p=0) : pgid(p),
+ log_backlog(false),
+ last_epoch_started(0), last_epoch_finished(0) {}
+ // clean: everything applied (last_update) is also complete.
+ bool is_clean() const { return last_update == last_complete; }
+ // empty: no update has ever been applied.
+ bool is_empty() const { return last_update.version == 0; }
+ };
+
+
+ /**
+  * Query - used to ask a peer for information about a pg.
+  *
+  * note: if version=0, type=LOG, then we just provide our full log.
+  * only if type=BACKLOG do we generate a backlog and provide that too.
+  *
+  * the constructors enforce that split/floor are supplied for LOG
+  * queries and only for LOG queries.
+  */
+ struct Query {
+   // query types
+   static const int INFO = 0;
+   static const int LOG = 1;
+   static const int BACKLOG = 2;
+   static const int FULLLOG = 3;
+
+   int type;               // one of the constants above (-1 if unset)
+   eversion_t split, floor;   // only meaningful for LOG queries
+   Info::History history;
+
+   Query()
+     : type(-1)
+   {}
+
+   // any query type except LOG
+   Query(int qtype, Info::History& hist)
+     : type(qtype), history(hist)
+   {
+     assert(qtype != LOG);
+   }
+
+   // LOG queries only
+   Query(int qtype, eversion_t split_at, eversion_t floor_at, Info::History& hist)
+     : type(qtype), split(split_at), floor(floor_at), history(hist)
+   {
+     assert(qtype == LOG);
+   }
+ };
+
+
+ /*
+  * Missing - summary of missing objects.
+  * kept in memory, as a supplement to Log.
+  * also used to pass missing info in messages.
+  *
+  * missing maps each missing object to the version we need; rmissing is
+  * the reverse index (version -> object) used to find the oldest missing
+  * item.  rmissing is keyed by version alone, so two objects recorded at
+  * the same version share a single slot.
+  */
+ class Missing {
+ public:
+   map<object_t, eversion_t> missing;   // oid -> v
+   map<eversion_t, object_t> rmissing;  // v -> oid
+
+   map<object_t, int> loc;              // where i think i can get them.
+
+   // missing objects for which we have no known location
+   int num_lost() const { return missing.size() - loc.size(); }
+   int num_missing() const { return missing.size(); }
+
+   // is oid missing at all?
+   bool is_missing(object_t oid) const {
+     return missing.find(oid) != missing.end();
+   }
+   // is oid missing, with the needed version no newer than v?
+   bool is_missing(object_t oid, eversion_t v) const {
+     map<object_t, eversion_t>::const_iterator p = missing.find(oid);
+     return p != missing.end() && p->second <= v;
+   }
+
+   // record oid as missing at the default (zero) version
+   void add(object_t oid) {
+     eversion_t z;
+     add(oid,z);
+   }
+   // record oid as missing at version v, unless we already need a newer one
+   void add(object_t oid, eversion_t v) {
+     map<object_t, eversion_t>::iterator p = missing.find(oid);
+     if (p != missing.end()) {
+       if (p->second > v) return;   // already missing newer.
+       rmissing.erase(p->second);
+     }
+     missing[oid] = v;
+     rmissing[v] = oid;
+   }
+   // forget oid if the version we needed predates 'when'
+   void rm(object_t oid, eversion_t when) {
+     map<object_t, eversion_t>::iterator p = missing.find(oid);
+     if (p != missing.end() && p->second < when) {
+       rmissing.erase(p->second);
+       missing.erase(p);
+       loc.erase(oid);
+     }
+   }
+   // we obtained oid at version v (must satisfy what we needed)
+   void got(object_t oid, eversion_t v) {
+     assert(missing.count(oid));
+     assert(missing[oid] <= v);
+     got(oid);
+   }
+   // we obtained oid
+   void got(object_t oid) {
+     assert(missing.count(oid));
+     loc.erase(oid);
+     rmissing.erase(missing[oid]);
+     missing.erase(oid);
+   }
+
+   // rmissing is not sent; the receiver rebuilds it in _decode.
+   void _encode(bufferlist& blist) {
+     ::_encode(missing, blist);
+     ::_encode(loc, blist);
+   }
+   void _decode(bufferlist& blist, int& off) {
+     ::_decode(missing, blist, off);
+     ::_decode(loc, blist, off);
+
+     // rebuild the reverse index from scratch so that decoding into an
+     // already-populated Missing cannot leave stale entries behind.
+     rmissing.clear();
+     for (map<object_t,eversion_t>::iterator it = missing.begin();
+          it != missing.end();
+          it++)
+       rmissing[it->second] = it->first;
+   }
+ };
+
+
+ /*
+  * Log - incremental log of recent pg changes.
+  * also, serves as a recovery queue.
+  *
+  * when backlog is true,
+  * objects with versions <= bottom are in log.
+  * we do not have any deletion info before that time, however.
+  * log is a "summary" in that it contains all objects in the PG.
+  */
+ class Log {
+ public:
+   /** top, bottom
+    * top - newest entry (update|delete)
+    * bottom - entry previous to oldest (update|delete) for which we have
+    * complete negative information.
+    * i.e. we can infer pg contents for any store whose last_update >= bottom.
+    */
+   eversion_t top;      // newest entry (update|delete)
+   eversion_t bottom;   // version prior to oldest (update|delete)
+
+   /** backlog - true if log is a complete summary of pg contents.
+    * updated will include all items in pg, but deleted will not include
+    * negative entries for items deleted prior to 'bottom'.
+    */
+   bool backlog;
+
+   /** Entry
+    * one logged operation on one object.
+    * entries are written to and read from the on-disk log as raw bytes
+    * (see PG::write_log / append_log / read_log), so every member is
+    * given a defined value on construction.
+    */
+   class Entry {
+   public:
+     const static int LOST = 0;
+     const static int MODIFY = 1;
+     const static int CLONE = 2;
+     const static int DELETE = 3;
+
+     int op;               // one of LOST, MODIFY, CLONE, DELETE
+     object_t oid;
+     eversion_t version;
+     objectrev_t rev;
+
+     reqid_t reqid;        // caller+tid to uniquely identify request
+
+     // rev is value-initialized in both constructors; it was previously
+     // left indeterminate and then copied to disk as-is.
+     Entry() : op(0), rev() {}
+     Entry(int _op, object_t _oid, const eversion_t& v,
+           const msg_addr_t& a, tid_t t) :
+       op(_op), oid(_oid), version(v), rev(), reqid(a,t) {}
+
+     bool is_delete() const { return op == DELETE; }
+     bool is_clone() const { return op == CLONE; }
+     bool is_modify() const { return op == MODIFY; }
+     bool is_update() const { return is_clone() || is_modify(); }
+   };
+
+   list<Entry> log;  // the actual log.
+
+   Log() : backlog(false) {}
+
+   // reset to an empty log with zeroed bounds and no backlog
+   void clear() {
+     eversion_t z;
+     top = bottom = z;
+     backlog = false;
+     log.clear();
+   }
+   // empty is judged by top alone (zero version and epoch), not by the list
+   bool empty() const {
+     return top.version == 0 && top.epoch == 0;
+   }
+
+   // wire format: raw top, raw bottom, raw backlog flag, then the entries
+   void _encode(bufferlist& blist) const {
+     blist.append((char*)&top, sizeof(top));
+     blist.append((char*)&bottom, sizeof(bottom));
+     blist.append((char*)&backlog, sizeof(backlog));
+     ::_encode(log, blist);
+   }
+   // inverse of _encode; advances off past what was consumed
+   void _decode(bufferlist& blist, int& off) {
+     blist.copy(off, sizeof(top), (char*)&top);
+     off += sizeof(top);
+     blist.copy(off, sizeof(bottom), (char*)&bottom);
+     off += sizeof(bottom);
+     blist.copy(off, sizeof(backlog), (char*)&backlog);
+     off += sizeof(backlog);
+
+     ::_decode(log, blist, off);
+   }
+
+   void copy_after(const Log &other, eversion_t v);
+   bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor);
+   void copy_non_backlog(const Log &other);
+   ostream& print(ostream& out) const;
+ };
+
+ /**
+  * IndexedLog - adds in-memory index of the log, by oid.
+  * plus some methods to manipulate it all.
+  *
+  * objects maps each oid to its newest log entry; caller_ops holds the
+  * request ids of all logged entries.  the index stores raw pointers into
+  * the log list, so it must be kept in step with any change to the list.
+  */
+ class IndexedLog : public Log {
+ public:
+   hash_map<object_t,Entry*> objects;  // ptrs into log.  be careful!
+   hash_set<reqid_t> caller_ops;
+
+   // recovery pointers
+   list<Entry>::iterator requested_to; // not inclusive of referenced item
+   list<Entry>::iterator complete_to;  // not inclusive of referenced item
+
+   /****/
+   // start both recovery pointers at end() so they are valid to compare
+   // against log.end() (as PG::do_recovery does) before anything sets them.
+   IndexedLog() :
+     requested_to(log.end()),
+     complete_to(log.end()) {}
+
+   // deliberately disabled: asserts if ever called.
+   void clear() {
+     assert(0);
+     unindex();
+     Log::clear();
+   }
+
+   bool logged_object(object_t oid) {
+     return objects.count(oid);
+   }
+   bool logged_req(reqid_t &r) {
+     return caller_ops.count(r);
+   }
+
+   // rebuild the whole index from the log.  entries are visited oldest
+   // to newest, so each oid ends up pointing at its last entry in the list.
+   void index() {
+     objects.clear();
+     caller_ops.clear();
+     for (list<Entry>::iterator i = log.begin();
+          i != log.end();
+          i++) {
+       objects[i->oid] = &(*i);
+       caller_ops.insert(i->reqid);
+     }
+   }
+
+   // index a single entry; only replaces the oid's pointer if e is newer.
+   void index(Entry& e) {
+     if (objects.count(e.oid) == 0 ||
+         objects[e.oid]->version < e.version)
+       objects[e.oid] = &e;
+     caller_ops.insert(e.reqid);
+   }
+   void unindex() {
+     objects.clear();
+     caller_ops.clear();
+   }
+   void unindex(Entry& e) {
+     // NOTE: this only works if we remove from the _bottom_ of the log!
+     assert(objects.count(e.oid));
+     if (objects[e.oid]->version == e.version)
+       objects.erase(e.oid);
+     caller_ops.erase(e.reqid);
+   }
+
+
+   // accessors
+   // newest entry for oid if it is an update (modify|clone), else 0
+   Entry *is_updated(object_t oid) {
+     hash_map<object_t,Entry*>::iterator p = objects.find(oid);
+     if (p != objects.end() && p->second->is_update()) return p->second;
+     return 0;
+   }
+   // newest entry for oid if it is a delete, else 0
+   Entry *is_deleted(object_t oid) {
+     hash_map<object_t,Entry*>::iterator p = objects.find(oid);
+     if (p != objects.end() && p->second->is_delete()) return p->second;
+     return 0;
+   }
+
+   // actors
+   // append e (which must be newer than top) and index the stored copy.
+   void add(Entry& e) {
+     // add to log
+     log.push_back(e);
+     assert(e.version > top);
+     assert(top.version == 0 || e.version.version > top.version);
+     top = e.version;
+
+     // to our index
+     objects[e.oid] = &(log.back());
+     caller_ops.insert(e.reqid);
+   }
+
+   void trim(ObjectStore::Transaction &t, eversion_t s);
+   void trim_write_ahead(eversion_t last_update);
+ };
+
+
+ /**
+  * OndiskLog - some info about how we store the log on disk.
+  *
+  * the live log occupies bytes [bottom,top) of the log object; block_map
+  * records, per block offset, the first version logged in that block.
+  */
+ class OndiskLog {
+ public:
+   // ok
+   off_t bottom;                     // first byte of log.
+   off_t top;                        // byte following end of log.
+   map<off_t,eversion_t> block_map;  // block -> first stamp logged there
+
+   // starts out empty: both bounds at offset 0
+   OndiskLog()
+     : bottom(0),
+       top(0)
+   {}
+
+   bool trim_to(eversion_t v, ObjectStore::Transaction& t);
+ };
+
+
+ /***
+  * RepOpGather - bookkeeping for one replicated op while we wait for the
+  * other osds to ack and commit it.
+  */
+
+ class RepOpGather {
+ public:
+   class MOSDOp *op;
+   tid_t rep_tid;
+
+   ObjectStore::Transaction t;
+   bool applied;
+
+   set<int> waitfor_ack;      // osds we still need an ack from
+   set<int> waitfor_commit;   // osds we still need a commit from
+
+   utime_t start;
+
+   bool sent_ack, sent_commit;
+
+   set<int> osds;
+   eversion_t new_version;
+
+   eversion_t pg_local_last_complete;
+   map<int,eversion_t> pg_complete_thru;
+
+   RepOpGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc)
+     : op(o),
+       rep_tid(rt),
+       applied(false),
+       sent_ack(false),
+       sent_commit(false),
+       new_version(nv),
+       pg_local_last_complete(lc)
+   { }
+
+   // ack may go out once all acks are in, unless an ack or a commit
+   // was already sent.
+   bool can_send_ack() {
+     if (sent_ack || sent_commit)
+       return false;
+     return waitfor_ack.empty();
+   }
+   // commit may go out once all acks and commits are in, unless a commit
+   // was already sent.
+   bool can_send_commit() {
+     if (sent_commit)
+       return false;
+     return waitfor_ack.empty() && waitfor_commit.empty();
+   }
+   // nothing left to wait for.
+   bool can_delete() {
+     if (!waitfor_ack.empty())
+       return false;
+     return waitfor_commit.empty();
+   }
+ };
+
+
+ /*** PG ****/
+public:
+ // state bits.  these are or'ed together in 'state' and examined with
+ // state_test() / state_set() / state_clear().
+ // any
+ static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too)
+
+ // primary
+ static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas.
+ static const int STATE_CRASHED = 4; // all replicas went down.
+ static const int STATE_REPLAY = 8; // crashed, waiting for replay
+
+ // non-primary
+ static const int STATE_STRAY = 16; // i must notify the primary i exist.
+
+
+ protected:
+ OSD *osd;
+
+public:
+ // pg state
+ Info info;
+ IndexedLog log;
+ OndiskLog ondisklog;
+ Missing missing;
+ utime_t last_heartbeat; //
+
+protected:
+ int role; // 0 = primary, 1 = replica, -1=none.
+ int state; // see bit defns above
+
+ // primary state
+ public:
+ vector<int> acting;
+ epoch_t last_epoch_started_any;
+ eversion_t last_complete_commit;
+
+ // [primary only] content recovery state
+ eversion_t peers_complete_thru;
+ bool have_master_log;
+ protected:
+ set<int> prior_set; // current+prior OSDs, as defined by last_epoch_started_any.
+ set<int> stray_set; // non-acting osds that have PG data.
+ set<int> clean_set; // current OSDs that are clean
+ eversion_t oldest_update; // lowest (valid) last_update in active set
+ map<int,Info> peer_info; // info from peers (stray or prior)
+ set<int> peer_info_requested;
+ map<int, Missing> peer_missing;
+ set<int> peer_log_requested; // logs i've requested (and start stamps)
+ set<int> peer_summary_requested;
+ friend class OSD;
+
+
+ // [primary|tail]
+ // old way
+ map<tid_t, class OSDReplicaOp*> replica_ops;
+ map<int, set<tid_t> > replica_tids_by_osd; // osd -> (tid,...)
+
+ // new way
+ map<tid_t, RepOpGather*> repop_gather;
+ map<tid_t, list<class Message*> > waiting_for_repop;
+
+
+ // [primary|replica]
+ // pg waiters
+ list<class Message*> waiting_for_active;
+ hash_map<object_t,
+ list<class Message*> > waiting_for_missing_object;
+ // ops to replay, ordered by version
+ map<eversion_t,class MOSDOp*> replay_queue;
+
+ // recovery
+ // do_recovery skips objects listed here; cancel_recovery empties it.
+ map<object_t, eversion_t> objects_pulling; // which objects are currently being pulled
+
+public:
+ void clear_primary_state();
+
+ public:
+ // is the given osd a member of the acting set?
+ bool is_acting(int osd) const {
+   vector<int>::const_iterator member = acting.begin();
+   while (member != acting.end()) {
+     if (*member == osd)
+       return true;
+     ++member;
+   }
+   return false;
+ }
+ // membership tests against prior_set / stray_set
+ bool is_prior(int osd) const {
+   return prior_set.find(osd) != prior_set.end();
+ }
+ bool is_stray(int osd) const {
+   return stray_set.find(osd) != stray_set.end();
+ }
+
+ // all clean when clean_set has as many members as the acting set
+ bool is_all_clean() const {
+   return acting.size() == clean_set.size();
+ }
+
+ void build_prior();
+ void adjust_prior(); // based on new peer_info.last_epoch_started_any
+
+ /** adjust_peers_complete_thru
+  * recompute the version through which we and every other acting osd are
+  * complete (the minimum last_complete over the acting set), and raise
+  * peers_complete_thru to it if it moved forward.
+  * returns true if peers_complete_thru was advanced.
+  *
+  * peer_info is keyed by osd id, so each replica is looked up via
+  * acting[i] rather than by its position in the acting set.  a replica we
+  * have no info for yields a default Info, as before.
+  */
+ bool adjust_peers_complete_thru() {
+   eversion_t t = info.last_complete;
+   for (unsigned i=1; i<acting.size(); i++) {   // acting[0] is the primary
+     const eversion_t& peer_lc = peer_info[acting[i]].last_complete;
+     if (peer_lc < t)
+       t = peer_lc;
+   }
+   if (t > peers_complete_thru) {
+     peers_complete_thru = t;
+     return true;
+   }
+   return false;
+ }
+
+ void proc_replica_log(Log &olog, Missing& omissing, int from);
+ void merge_log(Log &olog, Missing& omissing, int from);
+ void proc_missing(Log &olog, Missing &omissing, int fromosd);
+
+ void generate_backlog();
+ void drop_backlog();
+
+ void trim_write_ahead();
+
+ void peer(ObjectStore::Transaction& t, map< int, map<pg_t,Query> >& query_map);
+
+ void activate(ObjectStore::Transaction& t);
+
+ void cancel_recovery();
+ bool do_recovery();
+ void do_peer_recovery();
+
+ void clean_replicas();
+
+ // placeholder: the log write position is always reported as 0.
+ off_t get_log_write_pos() {
+   const off_t pos = 0;
+   return pos;
+ }
+
+ public:
+ // construct a pg with id p, owned by osd o.
+ // it starts with role 0, no state bits set, zeroed epoch/version
+ // markers, and have_master_log set.  the initializers are listed in
+ // member declaration order.
+ PG(OSD *o, pg_t p) :
+ osd(o),
+ info(p),
+ role(0),
+ state(0),
+ last_epoch_started_any(0),
+ last_complete_commit(0),
+ peers_complete_thru(0),
+ have_master_log(true)
+ { }
+
+ pg_t get_pgid() const {
+   return info.pgid;
+ }
+ // replication level == size of the acting set
+ int get_nrep() const {
+   return acting.size();
+ }
+
+ // first osd in the acting set, or -1 if the set is empty
+ int get_primary() {
+   if (acting.empty())
+     return -1;
+   return acting[0];
+ }
+ //int get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; }
+ //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? get_primary():get_tail(); }
+ // the primary acks under OSD_REP_PRIMARY or when there is at most one
+ // acting osd; otherwise the second acting osd does.
+ int get_acker() {
+   const bool primary_acks =
+     g_conf.osd_rep == OSD_REP_PRIMARY ||
+     acting.size() <= 1;
+   if (primary_acks)
+     return get_primary();
+   return acting[1];
+ }
+
+ // role accessors
+ int get_role() const {
+   return role;
+ }
+ void set_role(int r) {
+   role = r;
+ }
+
+ // role predicates.  is_primary and is_head both test PG_ROLE_HEAD.
+ bool is_primary() const {
+   return role == PG_ROLE_HEAD;
+ }
+ bool is_acker() const {
+   return role == PG_ROLE_ACKER;
+ }
+ bool is_head() const {
+   return role == PG_ROLE_HEAD;
+ }
+ bool is_middle() const {
+   return role == PG_ROLE_MIDDLE;
+ }
+ bool is_residual() const {
+   return role == PG_ROLE_STRAY;
+ }
+
+ //int get_state() const { return state; }
+ // state bit helpers: test (any bit of m set), set, clear
+ bool state_test(int m) const {
+   const int hit = state & m;
+   return hit != 0;
+ }
+ void state_set(int m) {
+   state = state | m;
+ }
+ void state_clear(int m) {
+   state = state & ~m;
+ }
+
+ // complete: nothing applied beyond what we are complete through
+ bool is_complete() const {
+   return info.last_complete == info.last_update;
+ }
+
+ // per-bit state predicates
+ bool is_active() const {
+   return state_test(STATE_ACTIVE);
+ }
+ bool is_crashed() const {
+   return state_test(STATE_CRASHED);
+ }
+ bool is_replay() const {
+   return state_test(STATE_REPLAY);
+ }
+ //bool is_complete() { return state_test(STATE_COMPLETE); }
+ bool is_clean() const {
+   return state_test(STATE_CLEAN);
+ }
+ bool is_stray() const {
+   return state_test(STATE_STRAY);
+ }
+
+ bool is_empty() const {
+   return info.last_update == 0;
+ }
+
+ // number of objects currently being pulled
+ int num_active_ops() const {
+   return objects_pulling.size();
+ }
+
+
+ // pg on-disk content
+ void clean_up_local(ObjectStore::Transaction& t);
+
+ // pg on-disk state
+ void write_log(ObjectStore::Transaction& t);
+ void append_log(ObjectStore::Transaction& t,
+ PG::Log::Entry& logentry,
+ eversion_t trim_to);
+ void read_log(ObjectStore *store);
+ void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v);
+
+
+
+};
+
+
+
+ // print as "same_since/same_primary_since/same_acker_since"
+ inline ostream& operator<<(ostream& out, const PG::Info::History& h)
+ {
+   out << h.same_since;
+   out << "/" << h.same_primary_since;
+   out << "/" << h.same_acker_since;
+   return out;
+ }
+
+ // print a one-line summary: pgid (hex), then either "empty" or the
+ // update/complete versions and log range, then epochs and history.
+ inline ostream& operator<<(ostream& out, const PG::Info& pgi)
+ {
+   out << "pginfo(" << hex << pgi.pgid << dec;
+
+   if (pgi.is_empty()) {
+     out << " empty";
+   } else {
+     out << " v " << pgi.last_update << "/" << pgi.last_complete;
+     out << " (" << pgi.log_bottom << "," << pgi.last_update << "]";
+     out << (pgi.log_backlog ? "+backlog":"");
+   }
+
+   out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished;
+   out << " " << pgi.history;
+   out << ")";
+   return out;
+ }
+
+ // print " <version> <kind> <oid> by <reqid>", where kind is
+ // "-" (delete), "c" (clone), "m" (modify) or "?" (anything else).
+ inline ostream& operator<<(ostream& out, const PG::Log::Entry& e)
+ {
+   const char *kind = " ? ";
+   if (e.is_delete())
+     kind = " - ";
+   else if (e.is_clone())
+     kind = " c ";
+   else if (e.is_modify())
+     kind = " m ";
+
+   out << " " << e.version << kind << e.oid << " by " << e.reqid;
+   return out;
+ }
+
+ // print the log's bounds as "log(bottom,top]", plus "+backlog" if set
+ inline ostream& operator<<(ostream& out, const PG::Log& log)
+ {
+   out << "log(" << log.bottom << "," << log.top << "]";
+   if (log.backlog) {
+     out << "+backlog";
+   }
+   return out;
+ }
+
+ // print "missing(N)" or "missing(N, M lost)" when some are lost
+ inline ostream& operator<<(ostream& out, const PG::Missing& missing)
+ {
+   out << "missing(" << missing.num_missing();
+   if (missing.num_lost()) {
+     out << ", " << missing.num_lost() << " lost";
+   }
+   return out << ")";
+ }
+
+ // print a one-line summary of a pg: its info and role, any
+ // inconsistencies noticed between info / log bounds / log contents,
+ // its state flags, and missing/lost counts.
+ inline ostream& operator<<(ostream& out, const PG& pg)
+ {
+   out << "pg[" << pg.info
+       << " r=" << pg.get_role();
+
+   if (pg.log.bottom != pg.info.log_bottom)
+     out << " (info mismatch, " << pg.log << ")";
+
+   if (pg.log.log.empty()) {
+     // should it be?  an empty list implies top and bottom coincide.
+     if (pg.log.top.version - pg.log.bottom.version != 0) {
+       out << " (log bound mismatch, empty)";
+     }
+   } else {
+     // first entry should directly follow bottom (unless we carry a
+     // backlog), and last entry should be top.
+     const bool head_off =
+       (pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) &&
+       !pg.log.backlog;
+     const bool tail_off =
+       pg.log.log.rbegin()->version.version != pg.log.top.version;
+     if (head_off || tail_off) {
+       out << " (log bound mismatch, actual=["
+           << pg.log.log.begin()->version << ","
+           << pg.log.log.rbegin()->version << "])";
+     }
+   }
+
+   if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru;
+   if (!pg.have_master_log) out << " !hml";
+
+   // state flags
+   if (pg.is_active()) out << " active";
+   if (pg.is_crashed()) out << " crashed";
+   if (pg.is_replay()) out << " replay";
+   if (pg.is_clean()) out << " clean";
+   if (pg.is_stray()) out << " stray";
+   //out << " (" << pg.log.bottom << "," << pg.log.top << "]";
+
+   if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing();
+   if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost();
+   out << "]";
+
+   return out;
+ }
+
+
+ // print a repop: its address and rep_tid, the osds still awaited for
+ // ack/commit, the per-osd complete-thru map, and the op itself.
+ inline ostream& operator<<(ostream& out, PG::RepOpGather& repop)
+ {
+   out << "repop(" << &repop
+       << " rep_tid=" << repop.rep_tid
+       << " wfack=" << repop.waitfor_ack
+       << " wfcommit=" << repop.waitfor_commit
+       << " pct=" << repop.pg_complete_thru
+       << " op=" << *(repop.op)
+       << " repop=" << &repop
+       << ")";
+   return out;
+ }
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+//
+//
+// rush.cc
+//
+// $Id$
+//
+
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <cassert>
+ #include <vector>
+ #include "rush.h"
+
+
+ // myhash
+ // mix an integer key: xor with a fixed pattern, then multiply (mod 2^32)
+ // by 2654435761, i.e. 884811920 * 3 + 1.  the multiplier is written as
+ // an unsigned literal because the original int expression overflowed
+ // signed int.
+ static
+ unsigned int
+ myhash (unsigned int n)
+ {
+   const unsigned int multiplier = 2654435761u;
+   const unsigned int v = (n ^ 0xdead1234u) * multiplier;
+   return (v);
+ }
+
+ // start with no clusters and no servers
+ Rush::Rush ()
+ {
+   totalServers = 0;
+   nClusters = 0;
+ }
+
+//----------------------------------------------------------------------
+//
+// Rush::AddCluster
+//
+// Add a cluster. The number of servers in the cluster and
+// the weight of each server is passed. The current number of
+// clusters is returned.
+//
+//----------------------------------------------------------------------
+int
+Rush::AddCluster (int nServers, double weight)
+{
+ clusterSize[nClusters] = nServers;
+ clusterWeight[nClusters] = weight;
+ if (nClusters == 0) {
+ serversInPrevious[0] = 0;
+ totalWeightBefore[0] = 0.0;
+ } else {
+ serversInPrevious[nClusters] = serversInPrevious[nClusters-1] +
+ clusterSize[nClusters-1];
+ totalWeightBefore[nClusters] =
+ totalWeightBefore[nClusters-1] + (double)clusterSize[nClusters-1] *
+ clusterWeight[nClusters-1];
+ }
+ nClusters += 1;
+ totalServers += nServers;
+#if 0
+ for (int i = 0; i < nClusters; i++) {
+ fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n",
+ clusterSize[i], serversInPrevious[i], clusterWeight[i],
+ totalWeightBefore[i]);
+ }
+#endif
+ return (nClusters);
+}
+
+
+//----------------------------------------------------------------------
+//
+// Rush::GetServersByKey
+//
+// This function returns a list of servers on which an object
+// should be placed. The servers array must be large enough to
+// contain the list.
+//
+//----------------------------------------------------------------------
+void
+Rush::GetServersByKey (int key, int nReplicas, int servers[])
+{
+ int replicasLeft = nReplicas;
+ int cluster;
+ int mustAssign, numberAssigned;
+ int i, toDraw;
+ int *srv = servers;
+ double myWeight;
+ RushRNG rng;
+
+ // There may not be more replicas than servers!
+ assert (nReplicas <= totalServers);
+
+ for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) {
+ if (serversInPrevious[cluster] < replicasLeft) {
+ mustAssign = replicasLeft - serversInPrevious[cluster];
+ } else {
+ mustAssign = 0;
+ }
+ toDraw = replicasLeft - mustAssign;
+ if (toDraw > (clusterSize[cluster] - mustAssign)) {
+ toDraw = clusterSize[cluster] - mustAssign;
+ }
+ myWeight = (double)clusterSize[cluster] * clusterWeight[cluster];
+ rng.Seed (myhash (key)^cluster, cluster^0xb90738);
+ numberAssigned = mustAssign +
+ rng.HyperGeometricWeighted (toDraw, myWeight,
+ totalWeightBefore[cluster] + myWeight,
+ clusterWeight[cluster]);
+ if (numberAssigned > 0) {
+ rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937);
+ rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]);
+ for (i = 0; i < numberAssigned; i++) {
+ srv[i] += serversInPrevious[cluster];
+ }
+ replicasLeft -= numberAssigned;
+ srv += numberAssigned;
+ }
+ }
+}
+
+\f
+
+//----------------------------------------------------------------------
+//
+// RushRNG::HyperGeometricWeighted
+//
+// Use an iterative method to generate a hypergeometric random
+// variable. This approach guarantees that, if the number of draws
+// is reduced, the number of successes must be as well as long as
+// the seed for the RNG is the same.
+//
+//----------------------------------------------------------------------
+int
+RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted,
+ double totalWeighted, double weightOne)
+{
+ int positives = 0, i;
+ double curRand;
+
+ // If the weight is too small (or is negative), choose zero objects.
+ if (weightOne <= 1e-9 || nDraws == 0) {
+ return (0);
+ }
+
+ // Draw nDraws items from the "bag". For each positive, subtract off
+ // the weight of an object from the weight of positives remaining. For
+ // each draw, subtract off the weight of an object from the total weight
+ // remaining.
+ for (i = 0; i < nDraws; i++) {
+ curRand = RandomDouble ();
+ if (curRand < (yesWeighted / totalWeighted)) {
+ positives += 1;
+ yesWeighted -= weightOne;
+ }
+ totalWeighted -= weightOne;
+ }
+ return (positives);
+}
+
+//----------------------------------------------------------------------
+//
+// RushRNG::DrawKofN
+//
+//----------------------------------------------------------------------
+void
+RushRNG::DrawKofN (int vals[], int nToDraw, int setSize)
+{
+ int deck[setSize];
+ int i, pick;
+
+ assert(nToDraw <= setSize);
+
+ for (i = 0; i < setSize; i++) {
+ deck[i] = i;
+ }
+
+ for (i = 0; i < nToDraw; i++) {
+ pick = (int)(RandomDouble () * (double)(setSize - i));
+ if (pick >= setSize-i) pick = setSize-i-1; // in case
+ // assert(i >= 0 && i < nToDraw);
+ // assert(pick >= 0 && pick < setSize);
+ vals[i] = deck[pick];
+ deck[pick] = deck[setSize-i-1];
+ }
+}
+
+// Fallback state words, used by Seed() whenever a seed argument is zero.
+#define SEED_X 521288629
+#define SEED_Y 362436069
+
+// Construct a generator in its default state, i.e. as after Seed (0, 0).
+RushRNG::RushRNG ()
+{
+  this->Seed (0, 0);
+}
+
+// Reset both state words from the given seeds.  A seed of zero is
+// replaced by the corresponding built-in default (SEED_X / SEED_Y).
+void
+RushRNG::Seed (unsigned int seed1, unsigned int seed2)
+{
+  if (seed1 == 0) {
+    seed1 = SEED_X;
+  }
+  if (seed2 == 0) {
+    seed2 = SEED_Y;
+  }
+  state1 = seed1;
+  state2 = seed2;
+}
+
+// Advance both state words one step and return the next pseudo-random
+// unsigned value.  Each word is updated as
+//   multiplier * (low 16 bits) + (high 16 bits)
+// and the result combines state1 shifted up by 16 with the low 16 bits
+// of state2.
+unsigned int
+RushRNG::RandomInt ()
+{
+  const unsigned int mult1 = 18000;
+  const unsigned int mult2 = 18879;
+
+  const unsigned int low1 = state1 & 0xffff;
+  const unsigned int low2 = state2 & 0xffff;
+
+  state1 = mult1 * low1 + (state1 >> 16);
+  state2 = mult2 * low2 + (state2 >> 16);
+
+  return (state1 << 16) + (state2 & 0xffff);
+}
+
+// Return the next pseudo-random value as a double: RandomInt() divided
+// by 65536*65536.
+double
+RushRNG::RandomDouble ()
+{
+  const double range = 65536.0*65536.0;
+  return (double)RandomInt() / range;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+//
+//
+// rush.h
+//
+// Classes and definitions for the RUSH algorithm.
+//
+// $Id$
+//
+//
+
+#ifndef _rush_h_
+#define _rush_h_
+
+#define RUSH_MAX_CLUSTERS 100
+
+// Small seedable pseudo-random number generator (two unsigned state
+// words) together with the sampling helpers declared on top of it.
+class RushRNG {
+public:
+  RushRNG();
+
+  // (Re)seed the generator from two seed words.
+  void Seed (unsigned int a, unsigned int b);
+
+  // Next raw pseudo-random value.
+  unsigned int RandomInt ();
+  // Next pseudo-random value as a double.
+  double RandomDouble ();
+
+  // Count the successes among nDraws weighted draws without replacement.
+  int HyperGeometricWeighted (int nDraws, double yesWeighted,
+                              double totalWeighted, double weightOne);
+  // Draw nToDraw values out of a set of setSize into vals[].
+  void DrawKofN (int vals[], int nToDraw, int setSize);
+
+private:
+  unsigned int state1, state2;   // generator state
+};
+
+// Placement state for the RUSH algorithm: a list of up to
+// RUSH_MAX_CLUSTERS clusters, each with a server count and a weight.
+// NOTE(review): the member functions are defined outside this header;
+// the per-method remarks below are inferred from their signatures.
+class Rush {
+public:
+  Rush ();
+
+  // Append a cluster of nServers servers with the given weight.
+  int AddCluster (int nServers, double weight);
+  // Fill servers[] with nReplicas server ids chosen for key.
+  void GetServersByKey (int key, int nReplicas, int servers[]);
+
+  // Number of clusters currently known.
+  int Clusters () { return nClusters; }
+  // Total number of servers currently known.
+  int Servers () { return totalServers; }
+
+private:
+  int DrawKofN (int *servers, int n, int clusterSize, RushRNG *g);
+
+  int nClusters;
+  int totalServers;
+  // Per-cluster tables, indexed by cluster number.
+  int clusterSize[RUSH_MAX_CLUSTERS];
+  int serversInPrevious[RUSH_MAX_CLUSTERS];
+  double clusterWeight[RUSH_MAX_CLUSTERS];
+  double totalWeightBefore[RUSH_MAX_CLUSTERS];
+};
+
+#endif /* _rush_h_ */
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+#include "common/Mutex.h"
+#include "common/ThreadPool.h"
+// #include <thread.h>
+
+// Trivial work item for the thread pool test: wraps one integer.
+class Op {
+  int i;
+
+public:
+  // Remember the given value.
+  Op(int i) : i(i) {}
+
+  // Return the value given at construction.
+  int get() { return i; }
+};
+
+void foop(class TP *t, class Op *o);
+
+// Test harness object: it is both the handler passed to the thread pool
+// and the driver that queues the work.
+class TP {
+public:
+
+  // Handle one queued op: print the executing thread's id and the op's
+  // value, then sleep for a microsecond.
+  void foo(Op *o)
+  {
+    cout << "Thread "<< pthread_self() << ": " << o->get() << "\n";
+    usleep(1);
+
+    // sched_yield();
+  }
+
+  // Queue 100 ops on a ThreadPool constructed with (10, foop, this),
+  // wait a second, then destroy the pool.  Always returns 0.
+  // argc/argv are accepted for symmetry with ::main and are unused.
+  int main(int argc, char **argv)
+  {
+    ThreadPool<TP,Op> *t = new ThreadPool<TP,Op>(10, (void (*)(TP*, Op*))foop, this);
+
+    for(int i = 0; i < 100; i++) {
+      // NOTE(review): nothing in this file deletes the Op objects;
+      // confirm whether ThreadPool takes ownership of queued ops.
+      Op *o = new Op(i);
+      t->put_op(o);
+    }
+
+    // Crude wait for the workers to run the queued ops.
+    sleep(1);
+
+    delete(t);
+
+    return 0;
+  }
+};
+
+// Free-function trampoline handed to the ThreadPool: forwards the op to
+// TP::foo on the given handler.
+void foop(class TP *t, class Op *o) {
+  t->foo(o);
+}
+
+// Program entry point.  The second parameter must be char** for this to
+// be a valid main(); it was previously declared as char*.
+int main(int argc, char **argv) {
+  TP t;
+
+  return t.main(argc,argv);
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __BLINKER_H
+#define __BLINKER_H
+
+// Blinker: asynchronous key/value table interface keyed by inode.
+// Only the interface is declared here; results are delivered through
+// the Context callbacks passed to each call.
+class Blinker {
+
+ public:
+
+  // Base class of all table operations; 'op' holds one of the op codes.
+  // Members are public so the derived op classes (and users) can reach
+  // the constructor and the op-code constants.
+  class Op {
+  public:
+    int op;
+    static const int LOOKUP = 1;
+    static const int INSERT = 2;
+    static const int REMOVE = 3;
+    static const int CLEAR = 4;
+    Op(int o) : op(o) {}
+  };
+
+  // Look up a single key.
+  class OpLookup : public Op {
+  public:
+    bufferptr key;
+    OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {}
+  };
+
+  // Insert a key/value pair.
+  class OpInsert : public Op {
+  public:
+    bufferptr key;
+    bufferlist val;
+    OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {}
+  };
+
+  // Remove a single key.
+  class OpRemove : public Op {
+  public:
+    bufferptr key;
+    OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {}
+  };
+
+  // Clear the whole table.
+  class OpClear : public Op {
+  public:
+    OpClear() : Op(Op::CLEAR) {}
+  };
+
+
+
+private:
+  Objecter *objecter;
+
+  // in-flight operations.
+
+
+  // cache information about tree structure.
+
+
+
+public:
+  // public interface
+
+  // simple accessors
+  void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish);
+
+  // simple modifiers
+  void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe);
+  void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe);
+  void clear(inode_t& inode, Context *onack, Context *onsafe);
+
+  // these are dangerous: the table may be large.
+  void listkeys(inode_t& inode, list<bufferptr>* pkeys, Context *onfinish);
+  void listvals(inode_t& inode, list<bufferptr>* pkeys, list<bufferlist>* pvals, Context *onfinish);
+
+  // fetch *at least* key, but also anything else that is convenient.
+  // include lexical bounds for which this is a complete result.
+  // (if *start and *end are empty, it's the entire table)
+  void prefetch(inode_t& inode, bufferptr& key,
+                list<bufferptr>* pkeys, list<bufferlist>* pvals,
+                bufferptr *start, bufferptr *end,
+                Context *onfinish);
+
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <assert.h>
+
+#include "Filer.h"
+#include "osd/OSDMap.h"
+
+//#include "messages/MOSDRead.h"
+//#include "messages/MOSDReadReply.h"
+//#include "messages/MOSDWrite.h"
+//#include "messages/MOSDWriteReply.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+
+#include "msg/Messenger.h"
+
+#include "include/Context.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".filer "
+
+
+// Completion for one object stat issued by Filer::_probe.  The stat
+// result is written into 'size'; finish() then reports it to the Filer.
+class Filer::C_Probe : public Context {
+public:
+  Filer *filer;
+  Probe *probe;
+  object_t oid;
+  off_t size;
+
+  C_Probe(Filer *owner, Probe *pr, object_t obj)
+    : filer(owner), probe(pr), oid(obj), size(0) {}
+
+  // The result code is ignored; only the reported size is forwarded.
+  void finish(int r) { filer->_probed(probe, oid, size); }
+};
+
+// Start an asynchronous forward probe for the end of a file.
+//
+//   inode      - file to probe
+//   start_from - offset at which to start probing
+//   end        - filled in by the probe when it completes
+//   onfinish   - completed (and deleted) when the probe is done
+//
+// Always returns 0; the actual result arrives through onfinish/*end.
+int Filer::probe_fwd(inode_t& inode,
+                     off_t start_from,
+                     off_t *end,
+                     Context *onfinish)
+{
+  dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl;
+
+  Probe *probe = new Probe(inode, start_from, end, onfinish);
+
+  // period (bytes before we jump onto a new set of object(s))
+  off_t period = inode.layout.period();
+
+  // First round: one full period, plus whatever is needed to bring a
+  // start that falls mid-period up to the next period boundary.
+  if (start_from % period) {
+    probe->probing_len = period + (period - (start_from % period));
+  } else {
+    probe->probing_len = period;
+  }
+
+  _probe(probe);
+  return 0;
+}
+
+// Issue one round of a probe: map [from, from+probing_len) onto objects
+// and stat each of them.  Every stat completes through a C_Probe, which
+// calls _probed().
+void Filer::_probe(Probe *probe)
+{
+  dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl;
+
+  // file_to_extents() appends to its output list, so drop the extents of
+  // any earlier round first.  Otherwise every round would stat all
+  // previously probed objects again, and _probed() would analyze stale
+  // extents whose buffer offsets are relative to an older probe->from.
+  probe->probing.clear();
+
+  // map range onto objects
+  file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing);
+
+  for (list<ObjectExtent>::iterator p = probe->probing.begin();
+       p != probe->probing.end();
+       p++) {
+    dout(10) << "_probe probing " << p->oid << endl;
+    C_Probe *c = new C_Probe(this, probe, p->oid);
+    // Remember the outstanding op so _probed() can tell when all are in.
+    probe->ops[p->oid] = objecter->stat(p->oid, &c->size, c);
+  }
+}
+
+// Called once per stat reply.  Records the object's size; once every
+// outstanding stat of the round has answered, works out where the file
+// ends, or starts another round if no end was found in this range.
+// On completion stores the result in *probe->end (-1 on error),
+// completes and deletes probe->onfinish, and deletes the probe.
+void Filer::_probed(Probe *probe, object_t oid, off_t size)
+{
+  dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl;
+
+  // Record the answer and retire the matching op.
+  probe->known[oid] = size;
+  assert(probe->ops.count(oid));
+  probe->ops.erase(oid);
+
+  if (!probe->ops.empty()) {
+    return;  // waiting for more!
+  }
+
+  // analyze!  end: 0 = not found yet, <0 = error, >0 = end offset.
+  off_t end = 0;
+  list<ObjectExtent>::iterator ext = probe->probing.begin();
+  while (ext != probe->probing.end()) {
+    const off_t expected = ext->length+ext->start;
+    const off_t actual = probe->known[ext->oid];
+    dout(10) << "_probed " << probe->inode.ino << " object " << hex << ext->oid << dec
+             << " should be " << expected
+             << ", actual is " << actual
+             << endl;
+
+    if (actual < 0) {
+      end = -1;  // error!
+      break;
+    }
+
+    assert(actual <= expected);
+    if (actual == expected) {
+      ++ext;     // object is full: keep going
+      continue;
+    }
+
+    // aha, we found the end!
+    // Walk the buffer extents to turn the offset within this object
+    // into a distance from probe->from.
+    off_t remaining = actual - ext->start;
+    for (map<size_t,size_t>::iterator be = ext->buffer_extents.begin();
+         be != ext->buffer_extents.end();
+         ++be) {
+      if (remaining <= (off_t)be->second) {
+        end = probe->from + be->first + remaining;
+        dout(10) << "_probed end is in buffer_extent " << be->first << "~" << be->second << " off " << remaining
+                 << ", from was " << probe->from << ", end is " << end
+                 << endl;
+        break;
+      }
+      remaining -= be->second;
+    }
+    break;
+  }
+
+  if (end == 0) {
+    // keep probing!
+    dout(10) << "_probed didn't find end, probing further" << endl;
+    off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count;
+    probe->from += probe->probing_len;
+    probe->probing_len = period;
+    _probe(probe);
+    return;
+  }
+
+  // end is non-zero from here on.
+  const bool failed = (end < 0);
+  if (failed) {
+    dout(10) << "_probed encountered an error while probing" << endl;
+    *probe->end = -1;
+  } else {
+    // hooray!
+    dout(10) << "_probed found end at " << end << endl;
+    *probe->end = end;
+  }
+
+  // done!  finish and clean up.
+  probe->onfinish->finish(failed ? -1 : 0);
+  delete probe->onfinish;
+  delete probe;
+}
+
+
+// Map the file byte range [offset, offset+len) onto object extents
+// according to the inode's striping layout.
+//
+// Exactly one ObjectExtent is produced per object touched; since a
+// striped range can visit the same object several times, each extent
+// also records, in buffer_extents, which pieces of the caller's buffer
+// (offset relative to 'offset' -> length) it covers.  The resulting
+// extents are appended to 'extents'.
+void Filer::file_to_extents(inode_t inode,
+                            off_t offset, size_t len,
+                            list<ObjectExtent>& extents,
+                            objectrev_t rev)
+{
+  dout(10) << "file_to_extents " << offset << "~" << len
+           << " on " << hex << inode.ino << dec
+           << endl;
+
+  // Extents under construction, one per object.
+  map< object_t, ObjectExtent > per_object;
+
+  assert(inode.layout.object_size >= inode.layout.stripe_size);
+  off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size;
+  dout(20) << " stripes_per_object " << stripes_per_object << endl;
+
+  off_t pos = offset;
+  for (off_t left = len; left > 0; ) {
+    // layout into objects
+    off_t blockno = pos / inode.layout.stripe_size;
+    off_t stripeno = blockno / inode.layout.stripe_count;
+    off_t stripepos = blockno % inode.layout.stripe_count;
+    off_t objectsetno = stripeno / stripes_per_object;
+    off_t objectno = objectsetno * inode.layout.stripe_count + stripepos;
+
+    // Find this object's extent, creating and labelling it on first use.
+    object_t oid( inode.ino, objectno );
+    const bool first_visit = (object_extents_count(per_object, oid) == 0);
+    ObjectExtent *ex = &per_object[oid];
+    if (first_visit) {
+      ex->oid = oid;
+      ex->rev = rev;
+      ex->pgid = objecter->osdmap->object_to_pg( oid, inode.layout );
+    }
+
+    // map range into object
+    off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_size;
+    off_t block_off = pos % inode.layout.stripe_size;
+    off_t room = inode.layout.stripe_size - block_off;   // bytes left in this stripe unit
+
+    off_t x_offset = block_start + block_off;
+    off_t x_len = (left > room) ? room : left;
+
+    if (ex->start + (off_t)ex->length != x_offset) {
+      // new extent: it must still be untouched
+      assert(ex->length == 0);
+      assert(ex->start == 0);
+      ex->start = x_offset;
+      ex->length = x_len;
+    } else {
+      // contiguous with what we already have: grow it
+      ex->length += x_len;
+    }
+    ex->buffer_extents[pos-offset] = x_len;
+
+    dout(15) << "file_to_extents " << *ex << " in " << ex->pgid << endl;
+
+    left -= x_len;
+    pos += x_len;
+  }
+
+  // make final list
+  map<object_t, ObjectExtent>::iterator it = per_object.begin();
+  while (it != per_object.end()) {
+    extents.push_back(it->second);
+    ++it;
+  }
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FILER_H
+#define __FILER_H
+
+/*** Filer
+ *
+ * stripe file ranges onto objects.
+ * build list<ObjectExtent> for the objecter or objectcacher.
+ *
+ * also, provide convenience methods that call objecter for you.
+ *
+ * "files" are identified by ino.
+ */
+
+#include <set>
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "include/types.h"
+
+#include "osd/OSDMap.h"
+#include "Objecter.h"
+
+class Context;
+class Messenger;
+class OSDMap;
+
+
+/**** Filer interface ***/
+
+// Filer: stripes file byte ranges onto objects and forwards the
+// resulting extent lists to the Objecter.
+class Filer {
+  Objecter *objecter;
+
+  // State of one in-progress forward probe (see probe_fwd).
+  struct Probe {
+    inode_t inode;
+    off_t from;          // start of the range currently being probed
+    off_t *end;          // caller's result slot
+    Context *onfinish;   // completed when the probe is done
+
+    list<ObjectExtent> probing;   // extents of the range being probed
+    off_t probing_len;            // length of that range
+
+    map<object_t, off_t> known;   // object sizes reported so far
+    map<object_t, tid_t> ops;     // outstanding stat ops
+
+    Probe(inode_t &i, off_t f, off_t *e, Context *c) :
+      inode(i), from(f), end(e), onfinish(c), probing_len(0) {}
+  };
+
+  class C_Probe;
+  //friend class C_Probe;
+
+  void _probe(Probe *p);
+  void _probed(Probe *p, object_t oid, off_t size);
+
+ public:
+  Filer(Objecter *o) : objecter(o) {}
+  ~Filer() {}
+
+  bool is_active() {
+    return objecter->is_active(); // || (oc && oc->is_active());
+  }
+
+  /*** async file interface ***/
+
+  // Each of the four calls below maps the byte range onto object
+  // extents and hands the request to the objecter.  They return 0 if
+  // the objecter's return value is positive and -1 otherwise.
+
+  // Read [offset, offset+len) into *bl.
+  int read(inode_t& inode,
+           off_t offset,
+           size_t len,
+           bufferlist *bl,   // ptr to data
+           Context *onfinish) {
+    Objecter::OSDRead *rd = new Objecter::OSDRead(bl);
+    file_to_extents(inode, offset, len, rd->extents);
+    if (objecter->readx(rd, onfinish) > 0)
+      return 0;
+    return -1;
+  }
+
+  // Write bl to [offset, offset+len).
+  int write(inode_t& inode,
+            off_t offset,
+            size_t len,
+            bufferlist& bl,
+            int flags,
+            Context *onack,
+            Context *oncommit,
+            objectrev_t rev=0) {
+    Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl);
+    file_to_extents(inode, offset, len, wr->extents, rev);
+    if (objecter->modifyx(wr, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // Issue an OSD_OP_ZERO over [offset, offset+len).
+  int zero(inode_t& inode,
+           off_t offset,
+           size_t len,
+           Context *onack,
+           Context *oncommit) {
+    Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO);
+    file_to_extents(inode, offset, len, z->extents);
+    if (objecter->modifyx(z, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // Issue an OSD_OP_DELETE over [offset, offset+len).
+  int remove(inode_t& inode,
+             off_t offset,
+             size_t len,
+             Context *onack,
+             Context *oncommit) {
+    Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE);
+    file_to_extents(inode, offset, len, z->extents);
+    if (objecter->modifyx(z, onack, oncommit) > 0)
+      return 0;
+    return -1;
+  }
+
+  // Asynchronously probe forward from start_from for the end of the
+  // file; the result is stored in *end before onfinish is completed.
+  int probe_fwd(inode_t& inode,
+                off_t start_from,
+                off_t *end,
+                Context *onfinish);
+
+
+  /***** mapping *****/
+
+  /* map (ino, ono) to an object name
+     (to be used on any osd in the proper replica group) */
+  /*object_t file_to_object(inodeno_t ino,
+                          size_t _ono) {
+    __uint64_t ono = _ono;
+    assert(ino < (1ULL<<OID_INO_BITS));       // legal ino can't be too big
+    assert(ono < (1ULL<<OID_ONO_BITS));
+    return ono + (ino << OID_ONO_BITS);
+  }
+  */
+
+
+  /* map (ino, offset, len) to a (list of) OSDExtents
+     (byte ranges in objects on (primary) osds) */
+  void file_to_extents(inode_t inode,
+                       off_t offset,
+                       size_t len,
+                       list<ObjectExtent>& extents,
+                       objectrev_t rev=0);
+
+};
+
+
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Journaler.h"
+
+#include "include/Context.h"
+#include "common/Logger.h"
+#include "msg/Messenger.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+
+
+
+// Reset to a blank, active journal: every position pointer is set to
+// one layout period.
+void Journaler::reset()
+{
+  dout(1) << "reset to blank journal" << endl;
+  state = STATE_ACTIVE;
+
+  // Same right-to-left order as a chained assignment.
+  trimmed_pos = inode.layout.period();
+  trimming_pos = trimmed_pos;
+  expire_pos = trimming_pos;
+  received_pos = expire_pos;
+  requested_pos = received_pos;
+  read_pos = requested_pos;
+  ack_pos = read_pos;
+  flush_pos = ack_pos;
+  write_pos = flush_pos;
+}
+
+
+/***************** HEADER *******************/
+
+// Print a journal header as "loghead(trim T, expire E, read R, write W)".
+ostream& operator<<(ostream& out, Journaler::Header &h)
+{
+  out << "loghead(trim " << h.trimmed_pos;
+  out << ", expire " << h.expire_pos;
+  out << ", read " << h.read_pos;
+  out << ", write " << h.write_pos;
+  out << ")";
+  return out;
+}
+
+// Completion for the header read started by recover(): the data lands
+// in 'bl' and is passed on to _finish_read_head().
+class Journaler::C_ReadHead : public Context {
+  Journaler *ls;
+public:
+  bufferlist bl;
+
+  C_ReadHead(Journaler *journaler) : ls(journaler) {}
+
+  void finish(int r) { ls->_finish_read_head(r, bl); }
+};
+
+// Completion for the end-of-log probe: 'end' (initially -1) receives
+// the probe result and is passed on to _finish_probe_end().
+class Journaler::C_ProbeEnd : public Context {
+  Journaler *ls;
+public:
+  off_t end;
+
+  C_ProbeEnd(Journaler *journaler) : ls(journaler), end(-1) {}
+
+  void finish(int r) { ls->_finish_probe_end(r, end); }
+};
+
+// Begin recovering the journal state, or join a recovery already under
+// way.  onread, if given, is queued on waitfor_recover.  Must not be
+// called on an active journal.
+void Journaler::recover(Context *onread)
+{
+  assert(state != STATE_ACTIVE);
+
+  if (onread) {
+    waitfor_recover.push_back(onread);
+  }
+
+  if (state == STATE_UNDEF) {
+    // First caller: kick things off by reading the header.
+    dout(1) << "read_head" << endl;
+    state = STATE_READHEAD;
+    C_ReadHead *fin = new C_ReadHead(this);
+    filer.read(inode, 0, sizeof(Header), &fin->bl, fin);
+  } else {
+    dout(1) << "recover - already recoverying" << endl;
+  }
+}
+
+// Header read completed.  An empty read is treated as an empty log:
+// the journal becomes active and the recover waiters are finished.
+// Otherwise the position pointers are loaded from the header and a
+// forward probe for the real end of the log is started from the
+// header's write_pos.
+void Journaler::_finish_read_head(int r, bufferlist& bl)
+{
+  assert(state == STATE_READHEAD);
+
+  if (bl.length() == 0) {
+    dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl;
+    state = STATE_ACTIVE;
+    list<Context*> ls;
+    ls.swap(waitfor_recover);
+    finish_contexts(ls, 0);
+    return;
+  }
+
+  // unpack header
+  Header h;
+  assert(bl.length() == sizeof(h));
+  bl.copy(0, sizeof(h), (char*)&h);
+  // Log the header's write_pos: that is where the probe starts.  The
+  // write_pos member has not been loaded from the header yet.
+  dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << h.write_pos << ")..." << endl;
+
+  write_pos = flush_pos = ack_pos = h.write_pos;
+  read_pos = requested_pos = received_pos = h.read_pos;
+  expire_pos = h.expire_pos;
+  trimmed_pos = trimming_pos = h.trimmed_pos;
+
+  // probe the log
+  state = STATE_PROBING;
+  C_ProbeEnd *fin = new C_ProbeEnd(this);
+  filer.probe_fwd(inode, h.write_pos, &fin->end, fin);
+}
+
+// End-of-log probe completed: adopt the probed end as the write
+// position, mark the journal active, and finish everyone waiting on
+// recovery.
+void Journaler::_finish_probe_end(int r, off_t end)
+{
+  assert(r >= 0);
+  assert(end >= write_pos);
+  assert(state == STATE_PROBING);
+
+  dout(1) << "_finish_probe_end write_pos = " << end
+          << " (header had " << write_pos << "). recovered."
+          << endl;
+
+  write_pos = flush_pos = ack_pos = end;
+
+  // Recovery is complete.  Set the state explicitly, and before the
+  // waiters run, rather than relying on STATE_PROBING and STATE_ACTIVE
+  // happening to share a value.
+  state = STATE_ACTIVE;
+
+  // done.
+  list<Context*> ls;
+  ls.swap(waitfor_recover);
+  finish_contexts(ls, 0);
+}
+
+
+// WRITING
+
+// Completion for a header write: carries a copy of the header that was
+// written and the caller's commit callback to _finish_write_head().
+class Journaler::C_WriteHead : public Context {
+public:
+  Journaler *ls;
+  Header h;
+  Context *oncommit;
+
+  C_WriteHead(Journaler *journaler, Header& written, Context *c)
+    : ls(journaler), h(written), oncommit(c) {}
+
+  void finish(int r) { ls->_finish_write_head(h, oncommit); }
+};
+
+// Snapshot the current positions into last_written and write that
+// header at offset 0 of the journal.  oncommit (may be null) is
+// completed from _finish_write_head().
+void Journaler::write_head(Context *oncommit)
+{
+  assert(state == STATE_ACTIVE);
+
+  // Note that the recorded write position is ack_pos, not write_pos.
+  last_written.write_pos = ack_pos;  //write_pos;
+  last_written.read_pos = read_pos;
+  last_written.expire_pos = expire_pos;
+  last_written.trimmed_pos = trimmed_pos;
+  dout(10) << "write_head " << last_written << endl;
+
+  last_wrote_head = g_clock.now();
+
+  bufferlist bl;
+  bl.append((char*)&last_written, sizeof(last_written));
+
+  Context *fin = new C_WriteHead(this, last_written, oncommit);
+  filer.write(inode, 0, bl.length(), bl, 0,
+              0, fin);
+}
+
+// A header write completed: remember it as the last committed header,
+// complete and delete the caller's callback (if any), then see whether
+// anything can be trimmed.
+void Journaler::_finish_write_head(Header &wrote, Context *oncommit)
+{
+  dout(10) << "_finish_write_head " << wrote << endl;
+  last_committed = wrote;
+
+  if (oncommit != 0) {
+    oncommit->finish(0);
+    delete oncommit;
+  }
+
+  trim();  // trim?
+}
+
+
+/***************** WRITING *******************/
+
+// Completion for one flushed write; remembers the start offset of the
+// write so _finish_flush() can identify it.
+class Journaler::C_Flush : public Context {
+  Journaler *ls;
+  off_t start;
+public:
+  C_Flush(Journaler *journaler, off_t from) : ls(journaler), start(from) {}
+
+  void finish(int r) {
+    ls->_finish_flush(r, start);
+  }
+};
+
+// The flush that started at 'start' completed.  Records its latency (if
+// a logger is set), advances ack_pos to the oldest flush still pending
+// (or to flush_pos if none), and finishes every waiter whose position
+// has now been acked.
+void Journaler::_finish_flush(int r, off_t start)
+{
+  assert(r>=0);
+
+  assert(start >= ack_pos);
+  assert(start < flush_pos);
+  assert(pending_flush.count(start));
+
+  // calc latency?
+  if (logger) {
+    utime_t lat = g_clock.now();
+    lat -= pending_flush[start];
+    logger->finc("lsum", lat);
+    logger->inc("lnum");
+  }
+
+  pending_flush.erase(start);
+
+  // adjust ack_pos
+  if (!pending_flush.empty()) {
+    ack_pos = pending_flush.begin()->first;
+  } else {
+    ack_pos = flush_pos;
+  }
+
+  dout(10) << "_finish_flush from " << start
+           << ", pending_flush now " << pending_flush
+           << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos
+           << endl;
+
+  // kick waiters <= ack_pos
+  while (!waitfor_flush.empty() &&
+         !(waitfor_flush.begin()->first > ack_pos)) {
+    finish_contexts(waitfor_flush.begin()->second);
+    waitfor_flush.erase(waitfor_flush.begin());
+  }
+}
+
+
+// Append one entry (a size_t length followed by the payload) to the
+// write buffer and return the new write_pos.
+//
+// Unless g_conf.journaler_allow_split_entries is set, an entry that
+// would cross a stripe boundary is preceded by zero padding up to the
+// next boundary, and what has been buffered so far is flushed.
+// If onsync is given, a flush is started and onsync is passed to it.
+off_t Journaler::append_entry(bufferlist& bl, Context *onsync)
+{
+  size_t s = bl.length();
+
+  if (!g_conf.journaler_allow_split_entries) {
+    // will we span a stripe boundary?
+    int stripe = inode.layout.stripe_size;
+    const bool spans = (write_pos / stripe != (write_pos + bl.length() + sizeof(s)) / stripe);
+    if (spans) {
+      // yes.  move write_pos forward to the next stripe boundary.
+      off_t before = write_pos;
+      write_pos += stripe;
+      write_pos -= (write_pos % stripe);
+
+      // pad the gap with zeros.
+      bufferptr pad(write_pos - before);
+      pad.zero();
+      assert(pad.length() >= 4);
+      write_buf.push_back(pad);
+
+      // now flush.
+      flush();
+
+      dout(12) << "append_entry skipped " << (write_pos-before) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl;
+    }
+  }
+
+  dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl;
+
+  // append: length prefix, then payload
+  write_buf.append((char*)&s, sizeof(s));
+  write_buf.append(bl);
+  write_pos += sizeof(s) + s;
+
+  // flush now?
+  if (onsync) {
+    flush(onsync);
+  }
+
+  return write_pos;
+}
+
+
+// Submit everything buffered between flush_pos and write_pos as one
+// write.  onsync (may be null) is queued at the new write position and
+// goes when ack_pos reaches it; if there is nothing to flush it is
+// completed and deleted right away.  Also rewrites the header when the
+// last header write is more than 30 seconds old.
+void Journaler::flush(Context *onsync)
+{
+  if (write_pos == flush_pos) {
+    assert(write_buf.length() == 0);
+    dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+    if (onsync) {
+      onsync->finish(0);
+      delete onsync;
+    }
+    return;
+  }
+
+  unsigned len = write_pos - flush_pos;
+  assert(len == write_buf.length());
+  dout(10) << "flush flushing " << flush_pos << "~" << len << endl;
+
+  // submit write for anything pending
+  Context *onack = new C_Flush(this, flush_pos);  // flush _start_ pos to _finish_flush
+  filer.write(inode, flush_pos, len, write_buf, 0,
+              onack, 0);
+  pending_flush[flush_pos] = g_clock.now();
+
+  // adjust pointers
+  flush_pos = write_pos;
+  write_buf.clear();
+
+  dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+  // queue waiter (at _new_ write_pos; will go when reached by ack_pos)
+  if (onsync) {
+    waitfor_flush[write_pos].push_back(onsync);
+  }
+
+  // write head?
+  const bool head_is_stale = (last_wrote_head.sec() + 30 < g_clock.now().sec());
+  if (head_is_stale) {
+    write_head();
+  }
+}
+
+
+
+/***************** READING *******************/
+
+
+// Completion for a journal read issued by _issue_read().
+class Journaler::C_Read : public Context {
+  Journaler *ls;
+public:
+  C_Read(Journaler *journaler) : ls(journaler) {}
+
+  void finish(int r) {
+    ls->_finish_read(r);
+  }
+};
+
+// Queued on a flush by _issue_read() when reading is stuck at ack_pos;
+// once the flush is acked it re-runs is_readable(), which restarts the
+// read.  The result code is ignored.
+class Journaler::C_RetryRead : public Context {
+  Journaler *ls;
+public:
+  C_RetryRead(Journaler *journaler) : ls(journaler) {}
+
+  void finish(int r) {
+    ls->is_readable();  // this'll kickstart.
+  }
+};
+
+// A read completed: move the received data into read_buf and, if a
+// whole entry is now available, wake the on_readable waiter and/or
+// complete the pending read_entry().  Finally consider prefetching.
+void Journaler::_finish_read(int r)
+{
+  assert(r>=0);
+
+  dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl;
+  received_pos += reading_buf.length();
+  read_buf.claim_append(reading_buf);
+  assert(received_pos <= requested_pos);
+  dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length()
+           << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos
+           << endl;
+
+  if (!is_readable()) {  // NOTE: this check may read more
+    // prefetch?
+    _prefetch();
+    return;
+  }
+
+  // readable!
+  dout(10) << "_finish_read now readable" << endl;
+  if (on_readable) {
+    // Clear the slot before calling out.
+    Context *waiter = on_readable;
+    on_readable = 0;
+    waiter->finish(0);
+    delete waiter;
+  }
+
+  if (read_bl) {
+    bool got = try_read_entry(*read_bl);
+    assert(got);  // this should have worked.
+
+    // clear state
+    Context *done = on_read_finish;
+    on_read_finish = 0;
+    read_bl = 0;
+
+    // do callback
+    done->finish(0);
+    delete done;
+  }
+
+  // prefetch?
+  _prefetch();
+}
+
+/* _issue_read(len)
+ *  start reading up to len more bytes at requested_pos.  does nothing
+ *  if a read is already in flight.  never reads past ack_pos; if we are
+ *  already at ack_pos, a retry is queued behind a flush instead.
+ *
+ * NOTE: this could be slightly smarter... we could allow
+ * multiple reads to be in progress.  e.g., if we prefetch, but
+ * then discover we need even more for an especially large entry.
+ * i don't think that circumstance will arise particularly often.
+ */
+void Journaler::_issue_read(off_t len)
+{
+  if (_is_reading()) {
+    dout(10) << "_issue_read " << len << " waiting, already reading "
+             << received_pos << "~" << (requested_pos-received_pos) << endl;
+    return;
+  }
+  assert(requested_pos == received_pos);
+
+  // stuck at ack_pos?
+  assert(requested_pos <= ack_pos);
+  if (requested_pos == ack_pos) {
+    dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl;
+    assert(write_pos > requested_pos);
+    // Make sure a flush is in flight, then retry once it is acked.
+    if (flush_pos == ack_pos) {
+      flush();
+    }
+    assert(flush_pos > ack_pos);
+    waitfor_flush[flush_pos].push_back(new C_RetryRead(this));
+    return;
+  }
+
+  // don't read too much
+  if (requested_pos + len > ack_pos) {
+    len = ack_pos - requested_pos;
+    dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl;
+  }
+
+  // go.
+  dout(10) << "_issue_read reading " << requested_pos << "~" << len
+           << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len)
+           << endl;
+
+  Context *onread = new C_Read(this);
+  filer.read(inode, requested_pos, len, &reading_buf,
+             onread);
+  requested_pos += len;
+}
+
+// Issue a read of fetch_len bytes when the amount requested beyond
+// read_pos has dropped to prefetch_from or below, no read is in flight,
+// and there is still something between requested_pos and write_pos.
+void Journaler::_prefetch()
+{
+  // prefetch?
+  off_t left = requested_pos - read_pos;
+
+  if (!(left <= prefetch_from)) return;      // still plenty requested
+  if (_is_reading()) return;                 // a read is already in flight
+  if (!(write_pos > requested_pos)) return;  // nothing more to read
+
+  dout(10) << "_prefetch only " << left << " < " << prefetch_from
+           << ", prefetching " << endl;
+  _issue_read(fetch_len);
+}
+
+
+// Read the next entry into *bl and complete onfinish when that has
+// happened.  If the entry is already buffered this happens before
+// returning; otherwise the request is parked and completed later from
+// _finish_read().  Only one read_entry() may be outstanding.
+void Journaler::read_entry(bufferlist *bl, Context *onfinish)
+{
+  // only one read at a time!
+  assert(read_bl == 0);
+  assert(on_read_finish == 0);
+
+  if (!is_readable()) {
+    dout(10) << "read_entry at " << read_pos << ", read_buf is "
+             << read_pos << "~" << read_buf.length()
+             << ", not readable now" << endl;
+
+    bl->clear();
+
+    // park the request
+    read_bl = bl;
+    on_read_finish = onfinish;
+
+    // is_readable() will have already initiated a read (if it was possible)
+    return;
+  }
+
+  dout(10) << "read_entry at " << read_pos << ", read_buf is "
+           << read_pos << "~" << read_buf.length()
+           << ", readable now" << endl;
+
+  // nice, just do it now.
+  bool got = try_read_entry(*bl);
+  assert(got);
+
+  // callback
+  onfinish->finish(0);
+  delete onfinish;
+}
+
+
+/* is_readable()
+ *  return true if the next entry (length prefix plus payload) is fully
+ *  present in read_buf.
+ *  otherwise return false, kickstarting a read as necessary.  if all
+ *  data up to write_pos has been received and the entry is still
+ *  incomplete, the write pointers are pulled back to read_pos.
+ */
+bool Journaler::is_readable()
+{
+  // anything to read?
+  if (read_pos == write_pos) return false;
+
+  // have enough for entry size?  if so, is the payload there too?
+  size_t s = 0;
+  if (read_buf.length() >= sizeof(s)) {
+    read_buf.copy(0, sizeof(s), (char*)&s);
+    if (read_buf.length() >= sizeof(s) + s)
+      return true;  // yep, next entry is ready.
+  }
+
+  // darn it!
+
+  // partial fragment at the end?
+  if (received_pos == write_pos) {
+    dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl;
+    write_pos = flush_pos = ack_pos = read_pos;
+    assert(write_buf.length() == 0);
+
+    // truncate?
+    // FIXME: how much?
+
+    return false;
+  }
+
+  // start reading some more?
+  if (!_is_reading()) {
+    // If the entry size is known, make sure we fetch at least the rest of it.
+    if (s)
+      fetch_len = MAX(fetch_len, sizeof(s)+s-read_buf.length());
+    _issue_read(fetch_len);
+  }
+
+  return false;
+}
+
+
+/* try_read_entry(bl)
+ *  if the next entry is ready, move its payload into bl (which must be
+ *  empty), advance read_pos past it and return true.
+ *  otherwise return false and leave bl alone.  (is_readable() will
+ *  have started fetching for good measure.)
+ */
+bool Journaler::try_read_entry(bufferlist& bl)
+{
+  if (!is_readable()) {  // this may start a read.
+    dout(10) << "try_read_entry at " << read_pos << " not readable" << endl;
+    return false;
+  }
+
+  // Peek at the length prefix.
+  size_t len;
+  assert(read_buf.length() >= sizeof(len));
+  read_buf.copy(0, sizeof(len), (char*)&len);
+  assert(read_buf.length() >= sizeof(len) + len);
+
+  dout(10) << "try_read_entry at " << read_pos << " reading "
+           << read_pos << "~" << (sizeof(len)+len) << endl;
+
+  // do it: drop the prefix, then move the payload into bl.
+  assert(bl.length() == 0);
+  read_buf.splice(0, sizeof(len));
+  read_buf.splice(0, len, &bl);
+  read_pos += sizeof(len) + len;
+
+  // prefetch?
+  _prefetch();
+  return true;
+}
+
+// Register the single callback to complete once an entry becomes
+// readable.  Only valid while nothing is readable and no other waiter
+// is registered.
+void Journaler::wait_for_readable(Context *onreadable)
+{
+  dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl;
+
+  assert(!is_readable());
+  assert(on_readable == 0);
+
+  on_readable = onreadable;
+}
+
+
+
+
+/***************** TRIMMING *******************/
+
+
+// Completion for a trim (remove) request; remembers the position that
+// was being trimmed to and reports it to _trim_finish().
+class Journaler::C_Trim : public Context {
+  Journaler *ls;
+  off_t to;
+public:
+  C_Trim(Journaler *journaler, off_t target) : ls(journaler), to(target) {}
+
+  void finish(int r) { ls->_trim_finish(r, to); }
+};
+
+// Remove journal data that lies before the last committed expire_pos,
+// rounded down to a whole layout period.  Does nothing when that
+// position is 0 or is already being trimmed to.
+void Journaler::trim()
+{
+  off_t target = last_committed.expire_pos;
+  target -= target % inode.layout.period();
+  dout(10) << "trim last_commited head was " << last_committed
+           << ", can trim to " << target
+           << endl;
+
+  const bool nothing_to_do = (target == 0 || target == trimming_pos);
+  if (nothing_to_do) {
+    dout(10) << "trim already trimmed/trimming to "
+             << trimmed_pos << "/" << trimming_pos << endl;
+    return;
+  }
+
+  // trim
+  assert(target <= write_pos);
+  assert(target > trimming_pos);
+  dout(10) << "trim trimming to " << target
+           << ", trimmed/trimming/expire are "
+           << trimmed_pos << "/" << trimming_pos << "/" << expire_pos
+           << endl;
+
+  Context *oncommit = new C_Trim(this, target);
+  filer.remove(inode, trimming_pos, target-trimming_pos,
+               0, oncommit);
+  trimming_pos = target;
+}
+
+// A trim request completed: advance trimmed_pos to 'to' and finish the
+// waiters registered at or before that position.
+void Journaler::_trim_finish(int r, off_t to)
+{
+  dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos
+           << ", trimmed/trimming/expire now "
+           << to << "/" << trimming_pos << "/" << expire_pos
+           << endl;
+  assert(r >= 0);
+
+  assert(to <= trimming_pos);
+  assert(to > trimmed_pos);
+  trimmed_pos = to;
+
+  // finishers?
+  while (!waitfor_trim.empty()) {
+    if (!(waitfor_trim.begin()->first <= trimmed_pos)) break;
+    finish_contexts(waitfor_trim.begin()->second, 0);
+    waitfor_trim.erase(waitfor_trim.begin());
+  }
+}
+
+
+// eof.
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* Journaler
+ *
+ * This class stripes a serial log over objects on the store. Four logical pointers:
+ *
+ * write_pos - where we're writing new entries
+ * read_pos - where we're reading old entries
+ * expire_pos - what is deemed "old" by user
+ * trimmed_pos - where we're expiring old items
+ *
+ * trimmed_pos <= expire_pos <= read_pos <= write_pos.
+ *
+ * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined
+ * until the end of the log is discovered.
+ *
+ * A "head" struct at the beginning of the log is used to store metadata at
+ * regular intervals. The basic invariants include:
+ *
+ * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily.
+ * head.write_pos <= write_pos
+ * head.expire_pos <= expire_pos
+ * head.trimmed_pos <= trimmed_pos
+ *
+ * More significantly,
+ *
+ * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log
+ * as last recorded, before it is trimmed. trimming will
+ * block until a sufficiently current expire_pos is committed.
+ *
+ * To recover log state, we simply start at the last write_pos in the head, and probe the
+ * object sequence sizes until we read the end.
+ *
+ * Head struct is stored in the first object. Actual journal starts after layout.period() bytes.
+ *
+ */
+
+#ifndef __JOURNALER_H
+#define __JOURNALER_H
+
+#include "Objecter.h"
+#include "Filer.h"
+
+#include <list>
+#include <map>
+
+class Context;
+class Logger;
+
+class Journaler {
+
+  // this goes at the head of the log "file": a snapshot of the four
+  // logical positions.
+  struct Header {
+    off_t trimmed_pos;
+    off_t expire_pos;
+    off_t read_pos;
+    off_t write_pos;
+    Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {}
+  } last_written, last_committed;   // NOTE(review): presumably the most recently issued vs. committed head -- confirm in Journaler.cc
+
+  friend ostream& operator<<(ostream& out, Header &h);
+
+
+  // me
+  inode_t inode;       // inode (and layout) the log is striped over
+  Objecter *objecter;
+  Filer filer;         // constructed on top of objecter
+
+  Logger *logger;
+
+  // my state
+  // Every state must have its own value: is_active() compares against
+  // STATE_ACTIVE, so it must not alias STATE_PROBING (both used to be 2,
+  // which made is_active() return true while still probing).
+  static const int STATE_UNDEF = 0;
+  static const int STATE_READHEAD = 1;
+  static const int STATE_PROBING = 2;
+  static const int STATE_ACTIVE = 3;
+
+  int state;
+
+  // header
+  utime_t last_wrote_head;
+  void _finish_write_head(Header &wrote, Context *oncommit);
+  class C_WriteHead;
+  friend class C_WriteHead;
+
+  list<Context*> waitfor_recover;
+  void _finish_read_head(int r, bufferlist& bl);
+  void _finish_probe_end(int r, off_t end);
+  class C_ReadHead;
+  friend class C_ReadHead;
+  class C_ProbeEnd;
+  friend class C_ProbeEnd;
+
+
+
+  // writer
+  off_t write_pos;       // logical write position, where next entry will go
+  off_t flush_pos;       // where we will flush.  if write_pos>flush_pos, we're buffering writes.
+  off_t ack_pos;         // what has been acked.
+  bufferlist write_buf;  // write buffer.  flush_pos + write_buf.length() == write_pos.
+
+  std::map<off_t, utime_t> pending_flush;  // start offsets and times for pending flushes
+  std::map<off_t, std::list<Context*> > waitfor_flush; // when flushed through given offset
+
+  void _finish_flush(int r, off_t start);
+  class C_Flush;
+  friend class C_Flush;
+
+  // reader
+  off_t read_pos;      // logical read position, where next entry starts.
+  off_t requested_pos; // what we've requested from OSD.
+  off_t received_pos;  // what we've received from OSD.
+  bufferlist read_buf; // read buffer.  read_pos + read_buf.length() == prefetch_pos.
+                       // NOTE(review): there is no prefetch_pos member; presumably received_pos is meant -- confirm.
+  bufferlist reading_buf; // what i'm reading into
+
+  off_t fetch_len;     // how much to read at a time
+  off_t prefetch_from; // how far from end do we read next chunk
+
+  // for read_entry() in-progress read
+  bufferlist *read_bl;
+  Context    *on_read_finish;
+  // for wait_for_readable()
+  Context    *on_readable;
+
+  // true while a read has been requested but not yet fully received.
+  bool _is_reading() {
+    return requested_pos > received_pos;
+  }
+  void _finish_read(int r);     // we just read some (read completion callback)
+  void _issue_read(off_t len);  // read some more
+  void _prefetch();             // maybe read ahead
+  class C_Read;
+  friend class C_Read;
+  class C_RetryRead;
+  friend class C_RetryRead;
+
+  // trimmer
+  off_t expire_pos;    // what we're allowed to trim to
+  off_t trimming_pos;  // what we've requested to trim through
+  off_t trimmed_pos;   // what has been trimmed
+  map<off_t, list<Context*> > waitfor_trim;
+
+  void _trim_finish(int r, off_t to);
+  class C_Trim;
+  friend class C_Trim;
+
+public:
+  /* inode_: inode describing the log "file" (copied).
+   * obj:    objecter used for I/O (also handed to the Filer).
+   * l:      logger.
+   * fl:     fetch length; 0 selects object_size * stripe_count.
+   * pff:    prefetch distance; 0 selects half of the fetch length.
+   */
+  Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) :
+    inode(inode_), objecter(obj), filer(objecter), logger(l),
+    state(STATE_UNDEF),
+    write_pos(0), flush_pos(0), ack_pos(0),
+    read_pos(0), requested_pos(0), received_pos(0),
+    fetch_len(fl), prefetch_from(pff),
+    read_bl(0), on_read_finish(0), on_readable(0),
+    expire_pos(0), trimming_pos(0), trimmed_pos(0)
+  {
+    // prefetch intelligently.
+    // (watch out, this is big if you use big objects or weird striping)
+    if (!fetch_len)
+      fetch_len = inode.layout.object_size*inode.layout.stripe_count;
+    if (!prefetch_from)
+      prefetch_from = fetch_len / 2;
+  }
+
+  // me
+  //void open(Context *onopen);
+  //void claim(Context *onclaim, msg_addr_t from);
+
+  /* reset
+   *  NOTE: we assume the caller knows/has ensured that any objects
+   * in our sequence do not exist.. e.g. after a MKFS.  this is _not_
+   * an "erase" method.
+   */
+  void reset();
+  void recover(Context *onfinish);
+  void write_head(Context *onsave=0);
+
+  bool is_active() { return state == STATE_ACTIVE; }
+
+  off_t get_write_pos() const { return write_pos; }
+  off_t get_read_pos() const { return read_pos; }
+  off_t get_expire_pos() const { return expire_pos; }
+  off_t get_trimmed_pos() const { return trimmed_pos; }
+
+  // write
+  off_t append_entry(bufferlist& bl, Context *onsync = 0);
+  void flush(Context *onsync = 0);
+
+  // read
+  // Reposition the reader at p and drop any buffered data.  Only legal
+  // while no read is outstanding.
+  void set_read_pos(off_t p) {
+    assert(requested_pos == received_pos);  // we can't cope w/ in-progress read right now.
+    assert(read_bl == 0); // ...
+    read_pos = requested_pos = received_pos = p;
+    read_buf.clear();
+  }
+  bool is_readable();
+  bool try_read_entry(bufferlist& bl);
+  void wait_for_readable(Context *onfinish);
+  void read_entry(bufferlist* bl, Context *onfinish);
+
+  // trim
+  void set_expire_pos(off_t ep) { expire_pos = ep; }
+  void trim();
+  //bool is_trimmable() { return trimming_pos < expire_pos; }
+  //void trim(off_t trim_to=0, Context *c=0);
+};
+
+
+#endif
--- /dev/null
+
+#include "msg/Messenger.h"
+#include "ObjectCacher.h"
+#include "Objecter.h"
+
+
+
+/*** ObjectCacher::BufferHead ***/
+
+
+/*** ObjectCacher::Object ***/
+
+#undef dout /* Log prefix for ObjectCacher::Object methods (uses the members oc and oid).
+   Output goes to cout only when l <= g_conf.debug or l <= g_conf.debug_objectcacher.
+   NOTE: the macro expands to a bare `if`, so an `else` written right after a
+   dout statement would bind to it. */
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << oc->objecter->messenger->get_myaddr() << ".objectcacher.object(" << oid << ") "
+
+
+/*
+ * Split bufferhead bh in two at object offset off.
+ *  - bh keeps [bh->start(), off); a new bufferhead covers [off, old end)
+ *    and inherits bh's state and last_write_tid.
+ *  - cached data and read waiters at or beyond off move to the new bh.
+ * Returns the new (right-hand) bufferhead, already registered via bh_add().
+ */
+ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off)
+{
+  dout(20) << "split " << *bh << " at " << off << endl;
+
+  // split off right
+  ObjectCacher::BufferHead *right = new BufferHead(this);
+  right->last_write_tid = bh->last_write_tid;
+  right->set_state(bh->get_state());
+
+  off_t newleftlen = off - bh->start();
+  right->set_start( off );
+  right->set_length( bh->length() - newleftlen );
+
+  // shorten left (stats are keyed on length, so remove/re-add around the change)
+  oc->bh_stat_sub(bh);
+  bh->set_length( newleftlen );
+  oc->bh_stat_add(bh);
+
+  // add right
+  oc->bh_add(this, right);
+
+  // split buffers too
+  bufferlist bl;
+  bl.claim(bh->bl);
+  if (bl.length()) {
+    assert(bl.length() == (bh->length() + right->length()));
+    right->bl.substr_of(bl, bh->length(), right->length());
+    bh->bl.substr_of(bl, 0, bh->length());
+  }
+
+  // move read waiters: every entry keyed at or beyond the split point now
+  // belongs to the right half.  Walk forward from lower_bound() so that the
+  // first map entry is considered too (a backwards walk that stops at
+  // begin() would leave it behind).
+  map<off_t, list<Context*> >::iterator first_moved =
+    bh->waitfor_read.lower_bound(right->start());
+  for (map<off_t, list<Context*> >::iterator p = first_moved;
+       p != bh->waitfor_read.end();
+       ++p) {
+    dout(0) << "split  moving waiters at byte " << p->first << " to right bh" << endl;
+    right->waitfor_read[p->first].swap( p->second );
+  }
+  bh->waitfor_read.erase(first_moved, bh->waitfor_read.end());
+
+  dout(20) << "split    left is " << *bh << endl;
+  dout(20) << "split   right is " << *right << endl;
+  return right;
+}
+
+
+/*
+ * Fold `right` into `left`.  The two must be adjacent (left ends where
+ * right starts) and in the same state.  `right` is unregistered and
+ * deleted; `left` grows to cover both and takes over right's data,
+ * write versions and read waiters.
+ */
+void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
+{
+  assert(left->end() == right->start());
+  assert(left->get_state() == right->get_state());
+
+  dout(10) << "merge_left " << *left << " + " << *right << endl;
+
+  // unregister right, then grow left (stats removed/re-added around the resize)
+  oc->bh_remove(this, right);
+  oc->bh_stat_sub(left);
+  left->set_length( left->length() + right->length());
+  oc->bh_stat_add(left);
+
+  // data: right's buffers are appended to left's
+  left->bl.claim_append(right->bl);
+
+  // version
+  // note: this is sorta busted, but should only be used for dirty buffers
+  left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid );
+  left->last_write = MAX( left->last_write, right->last_write );
+
+  // waiters: right's contexts go to the front of left's list for the same offset
+  map<off_t, list<Context*> >::iterator w = right->waitfor_read.begin();
+  while (w != right->waitfor_read.end()) {
+    list<Context*>& dest = left->waitfor_read[w->first];
+    dest.splice( dest.begin(), w->second );
+    w++;
+  }
+
+  // hose right
+  delete right;
+
+  dout(10) << "merge_left result " << *left << endl;
+}
+
+/* buggy possibly, but more importantly, unnecessary.
+void ObjectCacher::Object::merge_right(BufferHead *left, BufferHead *right)
+{
+ assert(left->end() == right->start());
+ assert(left->get_state() == right->get_state());
+
+ dout(10) << "merge_right " << *left << " + " << *right << endl;
+ oc->bh_remove(this, left);
+ oc->bh_stat_sub(right);
+ data.erase(right->start());
+ right->set_start( left->start() );
+ data[right->start()] = right;
+ right->set_length( left->length() + right->length());
+ oc->bh_stat_add(right);
+
+ // data
+ bufferlist nbl;
+ nbl.claim(left->bl);
+ nbl.claim_append(right->bl);
+ right->bl.claim(nbl);
+
+ // version
+ // note: this is sorta busted, but should only be used for dirty buffers
+ right->last_write_tid = MAX( left->last_write_tid, right->last_write_tid );
+
+ // waiters
+ map<off_t,list<Context*> > old;
+ old.swap(right->waitfor_read);
+
+ // take left's waiters
+ right->waitfor_read.swap(left->waitfor_read);
+
+ // shift old waiters
+ for (map<off_t, list<Context*> >::iterator p = old.begin();
+ p != old.end();
+ p++)
+ right->waitfor_read[p->first + left->length()].swap( p->second );
+
+ // hose left
+ delete left;
+
+ dout(10) << "merge_right result " << *right << endl;
+}
+*/
+
+/*
+ * Map the byte ranges of a read onto this object's bufferheads.
+ *  - only extents of rd whose oid matches this object are considered.
+ *  - readable bufferheads (clean, dirty or tx) are reported in `hits`,
+ *    in-flight reads in `rx`, and ranges with no bufferhead get a newly
+ *    created one reported in `missing`.
+ *  - every result map is keyed by the object offset at which the
+ *    bufferhead was encountered.
+ * Always returns 0.
+ */
+int ObjectCacher::Object::map_read(Objecter::OSDRead *rd,
+                                   map<off_t, BufferHead*>& hits,
+                                   map<off_t, BufferHead*>& missing,
+                                   map<off_t, BufferHead*>& rx)
+{
+  list<ObjectExtent>::iterator ext = rd->extents.begin();
+  for ( ; ext != rd->extents.end(); ext++) {
+
+    if (ext->oid != oid)
+      continue;     // extent belongs to another object
+
+    dout(10) << "map_read " << ext->oid
+             << " " << ext->start << "~" << ext->length << endl;
+
+    off_t cur = ext->start;
+    off_t left = ext->length;
+
+    // first bufferhead starting at or after the extent...
+    map<off_t, BufferHead*>::iterator p = data.lower_bound(ext->start);
+
+    // ...unless its predecessor reaches into the extent, in which case
+    // we start from the predecessor.
+    if (p != data.begin() &&
+        (p == data.end() || p->first > cur)) {
+      map<off_t, BufferHead*>::iterator prev = p;
+      prev--;
+      if (prev->first + prev->second->length() > cur)
+        p = prev;   // overlaps.
+    }
+
+    while (left > 0) {
+      // ran off the end of the cache: everything remaining is a miss.
+      if (p == data.end()) {
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( left );
+        oc->bh_add(this, n);
+        missing[cur] = n;
+        dout(20) << "map_read miss " << left << " left, " << *n << endl;
+        cur += left;
+        left = 0;
+        assert(cur == ext->start + (off_t)ext->length);
+        break;  // no more.
+      }
+
+      if (p->first > cur) {
+        // gap before the next bufferhead: fill it with a new, missing one.
+        off_t next = p->first;
+        BufferHead *n = new BufferHead(this);
+        n->set_start( cur );
+        n->set_length( MIN(next - cur, left) );
+        oc->bh_add(this,n);
+        missing[cur] = n;
+        off_t step = MIN(left, n->length());
+        cur += step;
+        left -= step;
+        dout(20) << "map_read gap " << *n << endl;
+        continue;    // more?
+      }
+
+      // p->first <= cur: we have it (or part of it)
+      BufferHead *e = p->second;
+      const bool readable = e->is_clean() || e->is_dirty() || e->is_tx();
+      if (readable) {
+        hits[cur] = e;     // readable!
+        dout(20) << "map_read hit " << *e << endl;
+      }
+      else if (e->is_rx()) {
+        rx[cur] = e;       // missing, not readable.
+        dout(20) << "map_read rx " << *e << endl;
+      }
+      else {
+        assert(0);
+      }
+
+      off_t lenfromcur = MIN(e->end() - cur, left);
+      cur += lenfromcur;
+      left -= lenfromcur;
+      p++;
+    }
+  }
+  return(0);
+}
+
+/*
+ * map a range of extents on an object's buffer cache.
+ *  - combine any bh's we're writing into one
+ *  - break up bufferheads that don't fall completely within the range
+ *  - gaps are covered by growing (or creating) the result bufferhead
+ * Only extents of wr whose oid matches this object are considered.
+ * Returns the single bufferhead covering the written range; asserts that
+ * at least one extent matched.
+ * (Returning a bh that also includes neighbouring dirty data was tried
+ * and is deliberately disabled.)
+ */
+ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr)
+{
+  BufferHead *final = 0;
+
+  for (list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+       ex_it != wr->extents.end();
+       ex_it++) {
+
+    if (ex_it->oid != oid) continue;
+
+    dout(10) << "map_write oex " << ex_it->oid
+             << " " << ex_it->start << "~" << ex_it->length << endl;
+
+    map<off_t, BufferHead*>::iterator p = data.lower_bound(ex_it->start);
+    // p->first >= start
+
+    off_t cur = ex_it->start;
+    off_t left = ex_it->length;
+
+    if (p != data.begin() &&
+        (p == data.end() || p->first > cur)) {
+      p--;     // might overlap or butt up!
+      if (p->first + p->second->length() <= cur)
+        p++;   // doesn't overlap.
+    }
+
+    while (left > 0) {
+      off_t max = left;
+
+      // at end ?
+      if (p == data.end()) {
+        if (final == NULL) {
+          final = new BufferHead(this);
+          final->set_start( cur );
+          final->set_length( max );
+          oc->bh_add(this, final);
+          dout(10) << "map_write adding trailing bh " << *final << endl;
+        } else {
+          final->set_length( final->length() + max );
+        }
+        left -= max;
+        cur += max;
+        continue;
+      }
+
+      dout(10) << "p is " << *p->second << endl;
+
+      if (p->first <= cur) {
+        BufferHead *bh = p->second;
+        dout(10) << "map_write bh " << *bh << " intersected" << endl;
+
+        if (p->first < cur) {
+          // the write starts inside bh: cut off the part before cur.
+          assert(final == 0);
+          if (cur + max >= p->first + p->second->length()) {
+            // we want right bit (one splice)
+            final = split(bh, cur);   // just split it, take right half.
+            p++;
+            assert(p->second == final);
+          } else {
+            // we want middle bit (two splices)
+            final = split(bh, cur);
+            p++;
+            assert(p->second == final);
+            split(final, cur+max);
+          }
+        } else if (p->first == cur) {
+          if (p->second->length() <= max) {
+            // whole bufferhead, piece of cake.
+          } else {
+            // we want left bit (one splice)
+            split(bh, cur + max);        // just split
+          }
+          if (final) {
+            // merge_left() unregisters and deletes bh, which invalidates an
+            // iterator pointing at it.  Step back onto final first; the
+            // p++ below then lands on whatever followed bh.
+            // NOTE(review): merge_left() asserts that both bufferheads are
+            // in the same state -- confirm callers guarantee that here.
+            --p;
+            assert(p->second == final);
+            merge_left(final,bh);
+          } else {
+            final = bh;
+          }
+        }
+
+        // keep going.
+        off_t lenfromcur = final->end() - cur;
+        cur += lenfromcur;
+        left -= lenfromcur;
+        p++;
+        continue;
+      } else {
+        // gap!
+        off_t next = p->first;
+        off_t glen = MIN(next - cur, max);
+        dout(10) << "map_write gap " << cur << "~" << glen << endl;
+        if (final) {
+          final->set_length( final->length() + glen );
+        } else {
+          final = new BufferHead(this);
+          final->set_start( cur );
+          final->set_length( glen );
+          oc->bh_add(this, final);
+        }
+
+        cur += glen;
+        left -= glen;
+        continue;    // more?
+      }
+    }
+  }
+
+  // set version
+  assert(final);
+  dout(10) << "map_write final is " << *final << endl;
+
+  return final;
+}
+
+
+
+/*** ObjectCacher ***/
+
+#undef dout /* Log prefix for ObjectCacher methods (uses the member objecter).
+   Output goes to cout only when l <= g_conf.debug or l <= g_conf.debug_objectcacher.
+   NOTE: the macro expands to a bare `if`, so an `else` written right after a
+   dout statement would bind to it. */
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << objecter->messenger->get_myaddr() << ".objectcacher "
+
+
+/* private */
+
+// Start an object read covering exactly bufferhead bh and mark bh as rx.
+// The result is delivered through a C_ReadFinish, which also owns the
+// buffer the data is read into.
+void ObjectCacher::bh_read(BufferHead *bh)
+{
+  dout(7) << "bh_read on " << *bh << endl;
+
+  mark_rx(bh);
+
+  Object *ob = bh->ob;
+
+  // completion: remembers which object range the read was for
+  C_ReadFinish *fin = new C_ReadFinish(this, ob->get_oid(), bh->start(), bh->length());
+
+  // issue the read into the completion's own bufferlist
+  objecter->read(ob->get_oid(), bh->start(), bh->length(),
+                 &fin->bl,
+                 fin);
+}
+
+/*
+ * Completion of a read of [start, start+length) on object oid; bl holds
+ * the data, with bl offset 0 corresponding to object offset `start`.
+ *
+ * Walks the object's bufferheads over that range.  Each rx bufferhead
+ * gets its slice of bl, is marked clean, and has its read waiters
+ * completed.  Non-rx bufferheads and gaps are skipped.  Does nothing if
+ * the object is no longer cached.
+ */
+void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl)
+{
+  //lock.Lock();
+  dout(7) << "bh_read_finish "
+          << oid
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_read_finish no object cache" << endl;
+  } else {
+    Object *ob = objects[oid];
+
+    // apply to bh's!
+    off_t opos = start;
+    map<off_t, BufferHead*>::iterator p = ob->data.lower_bound(opos);
+
+    while (p != ob->data.end() &&
+           opos < start+(off_t)length) {
+      BufferHead *bh = p->second;
+
+      if (bh->start() > opos) {
+        dout(1) << "weirdness: gap when applying read results, "
+                << opos << "~" << bh->start() - opos
+                << endl;
+        opos = bh->start();
+        continue;
+      }
+
+      if (!bh->is_rx()) {
+        dout(10) << "bh_read_finish skipping non-rx " << *bh << endl;
+        opos = bh->end();
+        p++;
+        continue;
+      }
+
+      assert(opos >= bh->start());
+      assert(bh->start() == opos);   // we don't merge rx bh's... yet!
+      assert(bh->length() <= start+(off_t)length-opos);
+
+      // The slice for this bh sits at (opos - start) within bl.  Measuring
+      // from bh->start() instead would always give 0 and hand every
+      // bufferhead after the first the wrong bytes.
+      bh->bl.substr_of(bl,
+                       opos-start,
+                       bh->length());
+      mark_clean(bh);
+      dout(10) << "bh_read_finish read " << *bh << endl;
+
+      opos = bh->end();
+      p++;
+
+      // finishers?
+      // called with lock held.
+      // NOTE(review): the contexts run while p still refers into ob->data;
+      // confirm they cannot remove bufferheads from this object.
+      list<Context*> ls;
+      for (map<off_t, list<Context*> >::iterator w = bh->waitfor_read.begin();
+           w != bh->waitfor_read.end();
+           w++)
+        ls.splice(ls.end(), w->second);
+      bh->waitfor_read.clear();
+      finish_contexts(ls);
+    }
+  }
+  //lock.Unlock();
+}
+
+
+// Write bufferhead bh back to its object and mark it tx.  The transaction
+// id of the write is recorded on both completions, on the bufferhead and
+// on the owning object.
+void ObjectCacher::bh_write(BufferHead *bh)
+{
+  dout(7) << "bh_write " << *bh << endl;
+
+  Object *ob = bh->ob;
+
+  // completions for ack and commit, each tagged with the range written
+  C_WriteAck *ack_fin = new C_WriteAck(this, ob->get_oid(), bh->start(), bh->length());
+  C_WriteCommit *commit_fin = new C_WriteCommit(this, ob->get_oid(), bh->start(), bh->length());
+
+  // submit
+  tid_t tid = objecter->write(ob->get_oid(), bh->start(), bh->length(), bh->bl,
+                              ack_fin, commit_fin);
+
+  // remember which write this was
+  ack_fin->tid = tid;
+  commit_fin->tid = tid;
+  ob->last_write_tid = tid;
+  bh->last_write_tid = tid;
+
+  mark_tx(bh);
+}
+
+/*
+ * Ack for lock operation `tid` on the objects in oids.
+ * For each object: if tid is the object's latest write tid, its
+ * transitional lock state is settled and the matching lock waiters are
+ * collected; otherwise the ack is obsolete.  Waiters registered for this
+ * tid are completed either way.
+ */
+void ObjectCacher::lock_ack(list<object_t>& oids, tid_t tid)
+{
+  for (list<object_t>::iterator it = oids.begin();
+       it != oids.end();
+       it++) {
+    object_t oid = *it;
+
+    if (objects.count(oid) == 0) {
+      dout(7) << "lock_ack no object cache" << endl;
+      assert(0);
+    }
+
+    Object *ob = objects[oid];
+
+    list<Context*> to_finish;
+
+    assert(tid <= ob->last_write_tid);
+    if (ob->last_write_tid != tid) {
+      dout(10) << "lock_ack " << *ob
+               << " tid " << tid << " obsolete" << endl;
+    } else {
+      dout(10) << "lock_ack " << *ob
+               << " tid " << tid << endl;
+
+      if (ob->lock_state == Object::LOCK_RDUNLOCKING ||
+          ob->lock_state == Object::LOCK_WRUNLOCKING) {
+        // an unlock completed
+        ob->lock_state = Object::LOCK_NONE;
+      }
+      else if (ob->lock_state == Object::LOCK_RDLOCKING ||
+               ob->lock_state == Object::LOCK_DOWNGRADING) {
+        // now read-locked: wake readers
+        ob->lock_state = Object::LOCK_RDLOCK;
+        to_finish.splice(to_finish.begin(), ob->waitfor_rd);
+      }
+      else if (ob->lock_state == Object::LOCK_UPGRADING ||
+               ob->lock_state == Object::LOCK_WRLOCKING) {
+        // now write-locked: wake writers and readers (readers end up first)
+        ob->lock_state = Object::LOCK_WRLOCK;
+        to_finish.splice(to_finish.begin(), ob->waitfor_wr);
+        to_finish.splice(to_finish.begin(), ob->waitfor_rd);
+      }
+      else {
+        assert(0);
+      }
+
+      ob->last_ack_tid = tid;
+
+      if (ob->can_close())
+        close_object(ob);
+    }
+
+    // waiters on this particular tid?
+    if (ob->waitfor_ack.count(tid)) {
+      to_finish.splice(to_finish.end(), ob->waitfor_ack[tid]);
+      ob->waitfor_ack.erase(tid);
+    }
+
+    finish_contexts(to_finish);
+
+  }
+}
+
+/*
+ * Ack for write `tid` of [start, start+length) on object oid.
+ * tx bufferheads in that range whose last_write_tid equals tid are marked
+ * clean; the object's last_ack_tid is advanced and waiters on this tid
+ * are completed.  The object is expected to be cached.
+ */
+void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid)
+{
+  //lock.Lock();
+
+  dout(7) << "bh_write_ack "
+          << oid
+          << " tid " << tid
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_write_ack no object cache" << endl;
+    assert(0);
+    return;
+  }
+
+  Object *ob = objects[oid];
+  const off_t range_end = start+(off_t)length;
+
+  // apply to bh's!
+  for (map<off_t, BufferHead*>::iterator it = ob->data.lower_bound(start);
+       it != ob->data.end();
+       it++) {
+    BufferHead *bh = it->second;
+
+    if (bh->start() > range_end)
+      break;     // past the written range
+
+    if (bh->start() < start &&
+        bh->end() > range_end) {
+      dout(20) << "bh_write_ack skipping " << *bh << endl;
+      continue;
+    }
+
+    // only bufferheads that are currently being written...
+    if (!bh->is_tx()) {
+      dout(10) << "bh_write_ack skipping non-tx " << *bh << endl;
+      continue;
+    }
+
+    // ...by exactly this write (a newer write may have superseded it)
+    if (bh->last_write_tid != tid) {
+      assert(bh->last_write_tid > tid);
+      dout(10) << "bh_write_ack newer tid on " << *bh << endl;
+      continue;
+    }
+
+    // ok!  mark bh clean.
+    mark_clean(bh);
+    dout(10) << "bh_write_ack clean " << *bh << endl;
+  }
+
+  // update object last_ack.
+  assert(ob->last_ack_tid < tid);
+  ob->last_ack_tid = tid;
+
+  // waiters?
+  if (ob->waitfor_ack.count(tid)) {
+    list<Context*> woken;
+    woken.splice(woken.begin(), ob->waitfor_ack[tid]);
+    ob->waitfor_ack.erase(tid);
+    finish_contexts(woken);
+  }
+  //lock.Unlock();
+}
+
+/*
+ * Commit notification for write `tid` on object oid.  Records the tid as
+ * the object's last_commit_tid and completes anyone waiting for that
+ * commit.  A missing object cache is tolerated.
+ */
+void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid)
+{
+  //lock.Lock();
+
+  dout(7) << "bh_write_commit "
+          << oid
+          << " tid " << tid
+          << " " << start << "~" << length
+          << endl;
+
+  if (objects.count(oid) == 0) {
+    dout(7) << "bh_write_commit no object cache" << endl;
+    //assert(0);
+    return;
+  }
+
+  Object *ob = objects[oid];
+
+  // update last_commit.
+  ob->last_commit_tid = tid;
+
+  // waiters?
+  if (ob->waitfor_commit.count(tid)) {
+    list<Context*> woken;
+    woken.splice(woken.begin(), ob->waitfor_commit[tid]);
+    ob->waitfor_commit.erase(tid);
+    finish_contexts(woken);
+  }
+
+  //  lock.Unlock();
+}
+
+
+// Write back dirty bufferheads, taken from the expiring end of lru_dirty,
+// until at least `amount` bytes have been submitted (amount == 0: no
+// limit).  Stops early when there is nothing left or the next bufferhead
+// was written after this call started.
+void ObjectCacher::flush(off_t amount)
+{
+  utime_t cutoff = g_clock.now();
+  //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age;
+
+  dout(10) << "flush " << amount << endl;
+
+  for (off_t flushed = 0; amount == 0 || flushed < amount; ) {
+    BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire();
+    if (!bh || bh->last_write > cutoff)
+      break;
+
+    flushed += bh->length();
+    bh_write(bh);
+  }
+}
+
+
+// Evict clean bufferheads from lru_rest until the clean byte count is no
+// larger than max (a negative max means g_conf.client_oc_size).  Objects
+// that become closable along the way are closed.
+void ObjectCacher::trim(off_t max)
+{
+  if (max < 0)
+    max = g_conf.client_oc_size;
+
+  dout(10) << "trim  start: max " << max
+           << " clean " << get_stat_clean()
+           << endl;
+
+  for (;;) {
+    if (!(get_stat_clean() > max))
+      break;      // within budget
+
+    BufferHead *victim = (BufferHead*) lru_rest.lru_expire();
+    if (!victim)
+      break;      // nothing left to expire
+
+    dout(10) << "trim trimming " << *victim << endl;
+    assert(victim->is_clean());
+
+    // unhook from its object and free it
+    Object *owner = victim->ob;
+    bh_remove(owner, victim);
+    delete victim;
+
+    if (owner->can_close()) {
+      dout(10) << "trim trimming " << *owner << endl;
+      close_object(owner);
+    }
+  }
+
+  dout(10) << "trim finish: max " << max
+           << " clean " << get_stat_clean()
+           << endl;
+}
+
+
+
+/* public */
+
+/*
+ * Try to satisfy the read described by rd from the cache, one ObjectExtent at a time.
+ * returns # bytes read (if in cache). onfinish is untouched (caller must delete it)
+ * returns 0 if doing async read
+ *
+ * On a miss, bh_read() is issued for every missing bufferhead, and a single
+ * C_RetryRead (carrying rd, ino and onfinish) is queued on the first
+ * missing or rx bufferhead encountered.
+ * On a full hit, the cached data is assembled into rd->bl in buffer-offset
+ * order and trim() is called before returning.
+ */
+int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish)
+{
+ bool success = true; // cleared by the first missing/rx bufferhead; also ensures only one C_RetryRead is queued
+ list<BufferHead*> hit_ls;
+ map<size_t, bufferlist> stripe_map; // final buffer offset -> substring
+
+ for (list<ObjectExtent>::iterator ex_it = rd->extents.begin();
+ ex_it != rd->extents.end();
+ ex_it++) {
+ dout(10) << "readx " << *ex_it << endl;
+
+ // get Object cache
+ Object *o = get_object(ex_it->oid, ino);
+
+ // map extent into bufferheads (map_read is handed the whole rd and picks out the extents matching this object's oid)
+ map<off_t, BufferHead*> hits, missing, rx;
+ o->map_read(rd, hits, missing, rx);
+
+ if (!missing.empty() || !rx.empty()) {
+ // read missing
+ for (map<off_t, BufferHead*>::iterator bh_it = missing.begin();
+ bh_it != missing.end();
+ bh_it++) {
+ bh_read(bh_it->second);
+ if (success) {
+ dout(10) << "readx missed, waiting on " << *bh_it->second
+ << " off " << bh_it->first << endl;
+ success = false;
+ bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) );
+ }
+ }
+
+ // bump rx
+ for (map<off_t, BufferHead*>::iterator bh_it = rx.begin();
+ bh_it != rx.end();
+ bh_it++) {
+ touch_bh(bh_it->second); // bump in lru, so we don't lose it.
+ if (success) {
+ dout(10) << "readx missed, waiting on " << *bh_it->second
+ << " off " << bh_it->first << endl;
+ success = false;
+ bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) );
+ }
+ }
+ } else {
+ assert(!hits.empty());
+
+ // make a plain list
+ for (map<off_t, BufferHead*>::iterator bh_it = hits.begin();
+ bh_it != hits.end();
+ bh_it++) {
+ dout(10) << "readx hit bh " << *bh_it->second << endl;
+ hit_ls.push_back(bh_it->second);
+ }
+
+ // create reverse map of buffer offset -> object for the eventual result.
+ // this is over a single ObjectExtent, so we know that
+ // - the bh's are contiguous
+ // - the buffer frags need not be (and almost certainly aren't)
+ // opos walks the object, bhoff is the offset inside the current bh, foff the offset inside the current buffer frag.
+ off_t opos = ex_it->start;
+ map<off_t, BufferHead*>::iterator bh_it = hits.begin();
+ assert(bh_it->second->start() <= opos);
+ size_t bhoff = opos - bh_it->second->start();
+ map<size_t,size_t>::iterator f_it = ex_it->buffer_extents.begin();
+ size_t foff = 0;
+ while (1) {
+ BufferHead *bh = bh_it->second;
+ assert(opos == (off_t)(bh->start() + bhoff));
+
+ dout(10) << "readx rmap opos " << opos
+ << ": " << *bh << " +" << bhoff
+ << " frag " << f_it->first << "~" << f_it->second << " +" << foff
+ << endl;
+
+ size_t len = MIN(f_it->second - foff,
+ bh->length() - bhoff); // copy up to whichever ends first: the frag or the bh
+ stripe_map[f_it->first].substr_of(bh->bl,
+ opos - bh->start(),
+ len);
+ opos += len;
+ bhoff += len;
+ foff += len;
+ if (opos == bh->end()) {
+ bh_it++;
+ bhoff = 0;
+ }
+ if (foff == f_it->second) {
+ f_it++;
+ foff = 0;
+ }
+ if (bh_it == hits.end()) break;
+ if (f_it == ex_it->buffer_extents.end()) break;
+ }
+ assert(f_it == ex_it->buffer_extents.end());
+ assert(opos == ex_it->start + (off_t)ex_it->length);
+ }
+ }
+
+ // bump hits in lru
+ for (list<BufferHead*>::iterator bhit = hit_ls.begin();
+ bhit != hit_ls.end();
+ bhit++)
+ touch_bh(*bhit);
+
+ if (!success) return 0; // wait!
+
+ // no misses... success! do the read.
+ assert(!hit_ls.empty());
+ dout(10) << "readx has all buffers" << endl;
+
+ // ok, assemble into result buffer. stripe_map is ordered by buffer offset; the assert below checks the pieces are contiguous.
+ rd->bl->clear();
+ size_t pos = 0;
+ for (map<size_t,bufferlist>::iterator i = stripe_map.begin();
+ i != stripe_map.end();
+ i++) {
+ assert(pos == i->first);
+ dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl;
+ pos += i->second.length();
+ rd->bl->claim_append(i->second);
+ }
+ dout(10) << "readx result is " << rd->bl->length() << endl;
+
+ trim();
+
+ return pos;
+}
+
+
+/*
+ * Write the data described by wr into the cache.
+ * For every extent: the range is mapped onto one bufferhead
+ * (Object::map_write), the matching fragments of wr->bl are appended to
+ * it, and it is marked dirty and stamped with the current time.  The
+ * bufferhead is then merged with a dirty neighbour on either side, if
+ * that neighbour is directly adjacent.
+ *
+ * Takes ownership of wr (it is deleted).  Calls trim().  Always returns 0.
+ */
+int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino)
+{
+  utime_t now = g_clock.now();
+
+  for (list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+       ex_it != wr->extents.end();
+       ex_it++) {
+    // get object cache
+    Object *o = get_object(ex_it->oid, ino);
+
+    // map it all into a single bufferhead.
+    BufferHead *bh = o->map_write(wr);
+
+    // adjust buffer pointers (ie "copy" data into my cache)
+    // this is over a single ObjectExtent, so we know that
+    //  - there is one contiguous bh
+    //  - the buffer frags need not be (and almost certainly aren't)
+    // note: i assume striping is monotonic... no jumps backwards, ever!
+    off_t opos = ex_it->start;
+    for (map<size_t,size_t>::iterator f_it = ex_it->buffer_extents.begin();
+         f_it != ex_it->buffer_extents.end();
+         f_it++) {
+      dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl;
+      // how far into bh this fragment lands; the fragment must fit in
+      // what remains of bh.  (Computing bh->start() - opos instead wraps
+      // around and makes the check vacuous.)
+      size_t bhoff = opos - bh->start();
+      assert(f_it->second <= bh->length() - bhoff);
+
+      bufferlist frag;
+      frag.substr_of(wr->bl,
+                     f_it->first, f_it->second);
+
+      bh->bl.claim_append(frag);
+      opos += f_it->second;
+    }
+
+    // it's dirty.
+    mark_dirty(bh);
+    touch_bh(bh);
+    bh->last_write = now;
+
+    // recombine with left?  merge_left() requires the two bufferheads to
+    // be adjacent, so only merge a dirty neighbour that ends exactly
+    // where bh starts.
+    map<off_t,BufferHead*>::iterator p = o->data.find(bh->start());
+    if (p != o->data.begin()) {
+      p--;
+      if (p->second->is_dirty() &&
+          p->second->end() == bh->start()) {
+        o->merge_left(p->second,bh);
+        bh = p->second;    // bh itself was deleted by the merge
+      }
+    }
+    // right?  here bh is the left-hand side and the neighbour is folded
+    // into it.
+    p = o->data.find(bh->start());
+    p++;
+    if (p != o->data.end() &&
+        p->second->is_dirty() &&
+        bh->end() == p->second->start())
+      o->merge_left(bh,p->second);
+  }
+
+  delete wr;
+
+  trim();
+  return 0;
+}
+
+
+// blocking wait for write.
+// Blocks on stat_cond (using `lock`) for as long as the dirty byte count
+// exceeds g_conf.client_oc_max_dirty, nudging the flusher before each
+// wait.  len is not used.
+void ObjectCacher::wait_for_write(size_t len, Mutex& lock)
+{
+  for (;;) {
+    if (!(get_stat_dirty() > g_conf.client_oc_max_dirty))
+      break;     // enough room now
+
+    dout(10) << "wait_for_write waiting" << endl;
+    flusher_cond.Signal();
+
+    // count ourselves as a waiter for the duration of the wait
+    stat_waiter++;
+    stat_cond.Wait(lock);
+    stat_waiter--;
+
+    dout(10) << "wait_for_write woke up" << endl;
+  }
+}
+
+/*
+ * Body of the flusher: runs with `lock` held until flusher_stop is set.
+ * Each round: while more than client_oc_max_dirty bytes are dirty, the
+ * excess is flushed; otherwise dirty bufferheads older than about one
+ * second are written back.  Then it waits up to one second on
+ * flusher_cond.
+ */
+void ObjectCacher::flusher_entry()
+{
+  dout(10) << "flusher start" << endl;
+  lock.Lock();
+  while (!flusher_stop) {
+    while (!flusher_stop) {
+      off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty();
+      dout(11) << "flusher "
+               << all << " / " << g_conf.client_oc_size << ": "
+               << get_stat_tx() << " tx, "
+               << get_stat_rx() << " rx, "
+               << get_stat_clean() << " clean, "
+               << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty"
+               << endl;
+
+      if (!(get_stat_dirty() > g_conf.client_oc_max_dirty)) {
+        // under the dirty limit: just write back aged dirty items from
+        // the tail of the lru, then go back to waiting.
+        utime_t cutoff = g_clock.now();
+        cutoff.sec_ref()--;
+        for (;;) {
+          BufferHead *bh = (BufferHead*)lru_dirty.lru_get_next_expire();
+          if (bh == 0 || !(bh->last_write < cutoff))
+            break;
+          dout(10) << "flusher flushing aged dirty bh " << *bh << endl;
+          bh_write(bh);
+        }
+        break;
+      }
+
+      // over the limit: flush the excess and re-check
+      dout(10) << "flusher "
+               << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty,"
+               << " flushing some dirty bhs" << endl;
+      flush(get_stat_dirty() - g_conf.client_oc_max_dirty);
+    }
+    if (flusher_stop) break;
+    flusher_cond.WaitInterval(lock, utime_t(1,0));
+  }
+  lock.Unlock();
+  dout(10) << "flusher finish" << endl;
+}
+
+
+
+// blocking.  atomic+sync.
+/*
+ * Read rd and block (waiting on `lock`) until it is complete.
+ *  - a single extent is read straight through the objecter.
+ *  - otherwise every distinct object is rdlock()ed, the read goes through
+ *    the cache (readx), and the objects are rdunlock()ed again.
+ * Always returns 0.
+ */
+int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock)
+{
+  dout(10) << "atomic_sync_readx " << rd
+           << " in " << ino
+           << endl;
+
+  if (rd->extents.size() == 1) {
+    // single object.
+    // just read synchronously.
+    Cond cond;
+    bool done = false;
+    objecter->readx(rd, new C_SafeCond(&lock, &cond, &done));
+
+    // block
+    while (!done) cond.Wait(lock);
+  } else {
+    // spans multiple objects, or is big.
+
+    // sort by object...
+    map<object_t,ObjectExtent> by_oid;
+    for (list<ObjectExtent>::iterator ex_it = rd->extents.begin();
+         ex_it != rd->extents.end();
+         ex_it++)
+      by_oid[ex_it->oid] = *ex_it;
+
+    // lock (once per distinct object)
+    for (map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+         i != by_oid.end();
+         i++) {
+      Object *o = get_object(i->first, ino);
+      rdlock(o);
+    }
+
+    // do the read, into our cache.
+    // readx() returns non-zero when everything was already cached; in
+    // that case the completion is never fired and stays ours to delete,
+    // so only wait when the read really went asynchronous.
+    Cond cond;
+    bool done = false;
+    C_SafeCond *onfinish = new C_SafeCond(&lock, &cond, &done);
+    if (readx(rd, ino, onfinish) == 0) {
+      // block
+      while (!done) cond.Wait(lock);
+    } else {
+      delete onfinish;
+    }
+
+    // release the locks: once per distinct object, mirroring the locking
+    // loop above (by_oid is our own copy, independent of rd).
+    for (map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+         i != by_oid.end();
+         i++) {
+      assert(objects.count(i->first));
+      Object *o = objects[i->first];
+      rdunlock(o);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Write wr atomically.
+ *  - a single extent no larger than g_conf.client_oc_max_sync_write, on an
+ *    object we are not already write-locking, is written synchronously
+ *    through the objecter (blocking on `lock`).
+ *  - otherwise every distinct object is wrlock()ed, the data is written
+ *    into the cache (writex), and the objects are wrunlock()ed again.
+ * Always returns 0.
+ */
+int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock)
+{
+  dout(10) << "atomic_sync_writex " << wr
+           << " in " << ino
+           << endl;
+
+  if (wr->extents.size() == 1 &&
+      wr->extents.front().length <= g_conf.client_oc_max_sync_write) {
+    // single object.
+
+    // make sure we aren't already locking/locked...
+    object_t oid = wr->extents.front().oid;
+    Object *o = 0;
+    if (objects.count(oid)) o = get_object(oid, ino);
+    if (!o ||
+        (o->lock_state != Object::LOCK_WRLOCK &&
+         o->lock_state != Object::LOCK_WRLOCKING &&
+         o->lock_state != Object::LOCK_UPGRADING)) {
+      // just write synchronously.
+      dout(10) << "atomic_sync_writex " << wr
+               << " in " << ino
+               << " doing sync write"
+               << endl;
+
+      Cond cond;
+      bool done = false;
+      objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0);
+
+      // block
+      while (!done) cond.Wait(lock);
+      return 0;
+    }
+  }
+
+  // spans multiple objects, or is big.
+  // sort by object...
+  map<object_t,ObjectExtent> by_oid;
+  for (list<ObjectExtent>::iterator ex_it = wr->extents.begin();
+       ex_it != wr->extents.end();
+       ex_it++)
+    by_oid[ex_it->oid] = *ex_it;
+
+  // wrlock (once per distinct object)
+  for (map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+       i != by_oid.end();
+       i++) {
+    Object *o = get_object(i->first, ino);
+    wrlock(o);
+  }
+
+  // do the write, into our cache.
+  // writex deletes wr; by_oid is our own copy and stays valid.
+  writex(wr, ino);
+
+  // flush
+  // ...and release the locks?  once per distinct object, mirroring the
+  // locking loop above.
+  for (map<object_t,ObjectExtent>::iterator i = by_oid.begin();
+       i != by_oid.end();
+       i++) {
+    assert(objects.count(i->first));
+    Object *o = objects[i->first];
+
+    wrunlock(o);
+  }
+
+  return 0;
+}
+
+
+
+// locking -----------------------------
+ /* Take a read-lock reference on object o.
+  *
+  * If the object holds no lock, or an unlock is still in flight, a new
+  * OSD_OP_RDLOCK request is issued first.  The caller then blocks on the
+  * member mutex `lock` until the object is no longer in a *LOCKING state.
+  * NOTE(review): cond.Wait(lock) implies the caller already holds `lock`
+  * -- confirm against callers.
+  */
+void ObjectCacher::rdlock(Object *o)
+{
+ // lock? (only when nothing is held/requested, or an unlock is in flight)
+ if (o->lock_state == Object::LOCK_NONE ||
+ o->lock_state == Object::LOCK_RDUNLOCKING ||
+ o->lock_state == Object::LOCK_WRUNLOCKING) {
+ dout(10) << "rdlock rdlock " << *o << endl;
+
+ o->lock_state = Object::LOCK_RDLOCKING;
+
+ C_LockAck *ack = new C_LockAck(this, o->get_oid());
+ C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+
+ commit->tid = // the tid returned by objecter->lock() is stored in both
+ ack->tid = // callbacks and becomes the object's last_write_tid
+ o->last_write_tid =
+ objecter->lock(OSD_OP_RDLOCK, o->get_oid(), ack, commit);
+ }
+
+ // stake our claim.
+ o->rdlock_ref++;
+
+ // wait? (a read or write lock request is still outstanding)
+ if (o->lock_state == Object::LOCK_RDLOCKING ||
+ o->lock_state == Object::LOCK_WRLOCKING) {
+ dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl;
+ Cond cond;
+ bool done = false;
+ o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); // finished by kick_sync_readers(); presumably also by lock_ack() -- confirm
+ while (!done) cond.Wait(lock);
+ }
+ assert(o->lock_state == Object::LOCK_RDLOCK ||
+ o->lock_state == Object::LOCK_WRLOCK ||
+ o->lock_state == Object::LOCK_UPGRADING ||
+ o->lock_state == Object::LOCK_DOWNGRADING);
+}
+ /* Take a write-lock reference on object o.
+  *
+  * Unless the object is already write-locked, write-locking or upgrading,
+  * a lock request is issued: OSD_OP_UPLOCK when it currently holds a read
+  * lock, OSD_OP_WRLOCK otherwise.  The caller then blocks on the member
+  * mutex `lock` until the state is LOCK_WRLOCK (asserted on exit).
+  */
+void ObjectCacher::wrlock(Object *o)
+{
+ // lock? (skip if a write lock is already held or on its way)
+ if (o->lock_state != Object::LOCK_WRLOCK &&
+ o->lock_state != Object::LOCK_WRLOCKING &&
+ o->lock_state != Object::LOCK_UPGRADING) {
+ dout(10) << "wrlock wrlock " << *o << endl;
+
+ int op = 0;
+ if (o->lock_state == Object::LOCK_RDLOCK) {
+ o->lock_state = Object::LOCK_UPGRADING; // rd -> wr
+ op = OSD_OP_UPLOCK;
+ } else {
+ o->lock_state = Object::LOCK_WRLOCKING;
+ op = OSD_OP_WRLOCK;
+ }
+
+ C_LockAck *ack = new C_LockAck(this, o->get_oid());
+ C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+
+ commit->tid = // the tid returned by objecter->lock() is stored in both
+ ack->tid = // callbacks and becomes the object's last_write_tid
+ o->last_write_tid =
+ objecter->lock(op, o->get_oid(), ack, commit);
+ }
+
+ // stake our claim.
+ o->wrlock_ref++;
+
+ // wait? (the write lock or upgrade has not been granted yet)
+ if (o->lock_state == Object::LOCK_WRLOCKING ||
+ o->lock_state == Object::LOCK_UPGRADING) {
+ dout(10) << "wrlock waiting for wrlock on " << *o << endl;
+ Cond cond;
+ bool done = false;
+ o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); // finished by kick_sync_writers(); presumably also by lock_ack() -- confirm
+ while (!done) cond.Wait(lock);
+ }
+ assert(o->lock_state == Object::LOCK_WRLOCK);
+}
+
+ /* Drop one read-lock reference on object o.
+  *
+  * While any read or write reference remains nothing else happens.  When
+  * the last one goes, the object's clean buffers are released from the
+  * cache and an OSD_OP_RDUNLOCK request is sent; the state becomes
+  * LOCK_RDUNLOCKING.  Does not wait for the unlock to be acknowledged.
+  */
+void ObjectCacher::rdunlock(Object *o)
+{
+ dout(10) << "rdunlock " << *o << endl;
+ assert(o->lock_state == Object::LOCK_RDLOCK ||
+ o->lock_state == Object::LOCK_WRLOCK ||
+ o->lock_state == Object::LOCK_UPGRADING ||
+ o->lock_state == Object::LOCK_DOWNGRADING);
+
+ assert(o->rdlock_ref > 0);
+ o->rdlock_ref--;
+ if (o->rdlock_ref > 0 ||
+ o->wrlock_ref > 0) {
+ dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl;
+ return;
+ }
+
+ release(o); // release first (drops this object's clean buffer heads)
+
+ o->lock_state = Object::LOCK_RDUNLOCKING;
+
+ C_LockAck *lockack = new C_LockAck(this, o->get_oid());
+ C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+ commit->tid = // the tid returned by objecter->lock() is stored in both
+ lockack->tid = // callbacks and becomes the object's last_write_tid
+ o->last_write_tid =
+ objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), lockack, commit);
+}
+ /* Drop one write-lock reference on object o (which must be in LOCK_WRLOCK).
+  *
+  * When the last write reference goes, the object's dirty buffers are
+  * queued for writing and the lock is either downgraded (OSD_OP_DNLOCK,
+  * if readers remain) or released (OSD_OP_WRUNLOCK).  Does not wait for
+  * the flush or the lock operation to complete.
+  */
+void ObjectCacher::wrunlock(Object *o)
+{
+ dout(10) << "wrunlock " << *o << endl;
+ assert(o->lock_state == Object::LOCK_WRLOCK);
+
+ assert(o->wrlock_ref > 0);
+ o->wrlock_ref--;
+ if (o->wrlock_ref > 0) {
+ dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl;
+ return;
+ }
+
+ flush(o); // flush first (non-blocking; the return value is ignored)
+
+ int op = 0;
+ if (o->rdlock_ref > 0) { // readers remain: keep a read lock
+ dout(10) << "wrunlock rdlock " << *o << endl;
+ op = OSD_OP_DNLOCK;
+ o->lock_state = Object::LOCK_DOWNGRADING;
+ } else {
+ dout(10) << "wrunlock wrunlock " << *o << endl;
+ op = OSD_OP_WRUNLOCK;
+ o->lock_state = Object::LOCK_WRUNLOCKING;
+ }
+
+ C_LockAck *lockack = new C_LockAck(this, o->get_oid());
+ C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0);
+ commit->tid = // the tid returned by objecter->lock() is stored in both
+ lockack->tid = // callbacks and becomes the object's last_write_tid
+ o->last_write_tid =
+ objecter->lock(op, o->get_oid(), lockack, commit);
+}
+
+
+// -------------------------------------------------
+
+
+// True if at least one object of inode ino holds a buffer head in the cache;
+// false if the inode has no cached objects or all of them are empty.
+bool ObjectCacher::set_is_cached(inodeno_t ino)
+{
+  if (!objects_by_ino.count(ino))
+    return false;   // we know nothing about this inode
+
+  set<Object*>& members = objects_by_ino[ino];
+  set<Object*>::iterator cur = members.begin();
+  while (cur != members.end()) {
+    if (!(*cur)->is_empty())
+      return true;
+    ++cur;
+  }
+  return false;
+}
+
+// True if any buffer head of any object of inode ino is dirty or in the tx
+// state; false otherwise (including when the inode has no cached objects).
+bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino)
+{
+  if (!objects_by_ino.count(ino))
+    return false;
+
+  set<Object*>& members = objects_by_ino[ino];
+  for (set<Object*>::iterator obj = members.begin(); obj != members.end(); ++obj) {
+    map<off_t,BufferHead*>& bhs = (*obj)->data;
+    for (map<off_t,BufferHead*>::iterator cur = bhs.begin(); cur != bhs.end(); ++cur) {
+      BufferHead *bh = cur->second;
+      if (bh->is_dirty())
+        return true;
+      if (bh->is_tx())
+        return true;
+    }
+  }
+  return false;
+}
+
+
+// flush. non-blocking. no callback.
+// true if clean, already flushed.
+// false if we wrote something.
+// Start writing out every dirty buffer head of ob.  Does not block and takes
+// no callback.  Returns true when there was nothing dirty and nothing already
+// in flight (tx); false if a write was started or is still outstanding.
+bool ObjectCacher::flush(Object *ob)
+{
+  bool all_clean = true;
+  map<off_t,BufferHead*>::iterator cur = ob->data.begin();
+  while (cur != ob->data.end()) {
+    BufferHead *bh = cur->second;
+    if (bh->is_tx()) {
+      // already on its way out; nothing to submit, but not clean either
+      all_clean = false;
+    } else if (bh->is_dirty()) {
+      bh_write(bh);
+      all_clean = false;
+    }
+    cur++;
+  }
+  return all_clean;
+}
+
+// flush. non-blocking, takes callback.
+// returns true if already flushed
+// Flush every object of inode ino.  Non-blocking.
+// Returns true if everything was already flushed (onfinish is then left
+// untouched).  Otherwise returns false and, when onfinish is non-null, hooks
+// it through a C_Gather onto each unflushed object's waitfor_ack list.
+bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish)
+{
+  if (!objects_by_ino.count(ino)) {
+    dout(10) << "flush_set on " << ino << " dne" << endl;
+    return true;
+  }
+
+  dout(10) << "flush_set " << ino << endl;
+
+  bool all_flushed = true;
+  C_Gather *waiter = 0;      // created lazily, on the first unflushed object
+
+  set<Object*>& members = objects_by_ino[ino];
+  for (set<Object*>::iterator cur = members.begin(); cur != members.end(); ++cur) {
+    Object *ob = *cur;
+    if (flush(ob))
+      continue;              // this one was already clean
+
+    // we'll need to gather...
+    if (onfinish && !waiter)
+      waiter = new C_Gather(onfinish);
+    all_flushed = false;
+
+    dout(10) << "flush_set " << ino << " will wait for ack tid "
+             << ob->last_write_tid
+             << " on " << *ob
+             << endl;
+    if (waiter)
+      ob->waitfor_ack[ob->last_write_tid].push_back(waiter->new_sub());
+  }
+
+  if (!all_flushed)
+    return false;
+
+  dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl;
+  return true;
+}
+
+
+// commit. non-blocking, takes callback.
+// return true if already flushed.
+// Wait for every object of inode ino to be committed.  Non-blocking.
+// Returns true if everything is already committed (onfinish is then left
+// untouched).  Otherwise returns false after hooking onfinish, through a
+// C_Gather, onto the waitfor_commit list of each object whose last write has
+// not been committed yet.  onfinish must be non-null.
+bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish)
+{
+  assert(onfinish);  // doesn't make any sense otherwise.
+
+  if (objects_by_ino.count(ino) == 0) {
+    dout(10) << "commit_set on " << ino << " dne" << endl;
+    return true;
+  }
+
+  dout(10) << "commit_set " << ino << endl;
+
+  // make sure it's flushing.  flush_set() already walks every object of the
+  // inode, so one call up front is enough; calling it again for each object
+  // inside the loop below only repeated the same scan.
+  flush_set(ino);
+
+  C_Gather *gather = 0; // we'll need to wait for all objects to commit
+
+  set<Object*>& s = objects_by_ino[ino];
+  bool safe = true;
+  for (set<Object*>::iterator i = s.begin();
+       i != s.end();
+       ++i) {
+    Object *ob = *i;
+
+    if (ob->last_write_tid > ob->last_commit_tid) {
+      dout(10) << "commit_set " << ino << " " << *ob
+               << " will finish on commit tid " << ob->last_write_tid
+               << endl;
+      if (!gather) gather = new C_Gather(onfinish);
+      safe = false;
+      ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() );
+    }
+  }
+
+  if (safe) {
+    dout(10) << "commit_set " << ino << " all committed" << endl;
+    return true;
+  }
+  return false;
+}
+
+
+// Remove every clean buffer head from ob.
+// Returns the total length of the buffer heads that were NOT clean and so
+// stayed in the cache.
+off_t ObjectCacher::release(Object *ob)
+{
+  off_t unclean_bytes = 0;
+  list<BufferHead*> droppable;
+
+  // pass 1: classify.  bh_remove() edits ob->data, so removal has to wait
+  // until we are done iterating over it.
+  for (map<off_t,BufferHead*>::iterator cur = ob->data.begin();
+       cur != ob->data.end();
+       ++cur) {
+    BufferHead *bh = cur->second;
+    if (!bh->is_clean()) {
+      unclean_bytes += bh->length();
+      continue;
+    }
+    droppable.push_back(bh);
+  }
+
+  // pass 2: drop the clean ones, in the order they were found.
+  while (!droppable.empty()) {
+    bh_remove(ob, droppable.front());
+    droppable.pop_front();
+  }
+
+  return unclean_bytes;
+}
+
+// Release the clean buffers of every object of inode ino.
+// Returns the number of bytes that were not clean and thus not released
+// (0 if the inode has no cached objects).
+off_t ObjectCacher::release_set(inodeno_t ino)
+{
+  if (!objects_by_ino.count(ino)) {
+    dout(10) << "release_set on " << ino << " dne" << endl;
+    return 0;
+  }
+
+  dout(10) << "release_set " << ino << endl;
+
+  off_t total_left = 0;
+  set<Object*>& members = objects_by_ino[ino];
+  for (set<Object*>::iterator cur = members.begin(); cur != members.end(); ++cur) {
+    Object *ob = *cur;
+    const off_t left = release(ob);
+    total_left += left;
+
+    if (left != 0) {
+      dout(10) << "release_set " << ino << " " << *ob
+               << " has " << left << " bytes left"
+               << endl;
+    }
+  }
+
+  if (total_left != 0) {
+    dout(10) << "release_set " << ino
+             << ", " << total_left << " bytes left" << endl;
+  }
+
+  return total_left;
+}
+
+
+// Complete every context queued on waitfor_wr of any object of inode ino.
+// Does nothing (beyond logging) if the inode has no cached objects.
+void ObjectCacher::kick_sync_writers(inodeno_t ino)
+{
+  if (!objects_by_ino.count(ino)) {
+    dout(10) << "kick_sync_writers on " << ino << " dne" << endl;
+    return;
+  }
+
+  dout(10) << "kick_sync_writers on " << ino << endl;
+
+  // move each object's waiters to the front of one local list, leaving the
+  // per-object lists empty, then finish them all together.
+  list<Context*> waiters;
+  set<Object*>& members = objects_by_ino[ino];
+  for (set<Object*>::iterator cur = members.begin(); cur != members.end(); ++cur)
+    waiters.splice(waiters.begin(), (*cur)->waitfor_wr);
+
+  finish_contexts(waiters);
+}
+
+// Complete every context queued on waitfor_rd of any object of inode ino.
+// Does nothing (beyond logging) if the inode has no cached objects.
+void ObjectCacher::kick_sync_readers(inodeno_t ino)
+{
+  if (!objects_by_ino.count(ino)) {
+    dout(10) << "kick_sync_readers on " << ino << " dne" << endl;
+    return;
+  }
+
+  dout(10) << "kick_sync_readers on " << ino << endl;
+
+  // move each object's waiters to the front of one local list, leaving the
+  // per-object lists empty, then finish them all together.
+  list<Context*> waiters;
+  set<Object*>& members = objects_by_ino[ino];
+  for (set<Object*>::iterator cur = members.begin(); cur != members.end(); ++cur)
+    waiters.splice(waiters.begin(), (*cur)->waitfor_rd);
+
+  finish_contexts(waiters);
+}
+
+
+
--- /dev/null
+#ifndef __OBJECTCACHER_H_
+#define __OBJECTCACHER_H_
+
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/Context.h"
+
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "Objecter.h"
+#include "Filer.h"
+
+class Objecter;
+class Objecter::OSDRead;
+class Objecter::OSDWrite;
+
+class ObjectCacher {
+ public:
+
+ class Object;
+
+ // ******* BufferHead *********
+ // A contiguous extent of one object's data held in the cache.  Each buffer
+ // head has a state (missing/clean/dirty/rx/tx) and a pin count; while the
+ // count is non-zero the bh is pinned in its LRU.
+ class BufferHead : public LRUObject {
+ public:
+   // states
+   static const int STATE_MISSING = 0;
+   static const int STATE_CLEAN = 1;
+   static const int STATE_DIRTY = 2;
+   static const int STATE_RX = 3;
+   static const int STATE_TX = 4;
+
+ private:
+   // my fields
+   int state;
+   int ref;
+   struct {
+     off_t start, length;   // bh extent in object
+   } ex;
+
+ public:
+   Object *ob;
+   bufferlist bl;
+   tid_t last_write_tid;  // version of bh (if non-zero)
+   utime_t last_write;
+
+   map< off_t, list<Context*> > waitfor_read;
+
+ public:
+   // cons.  a new bh is MISSING, unpinned and covers the empty extent 0~0
+   // until set_start()/set_length() say otherwise.
+   BufferHead(Object *o) :
+     state(STATE_MISSING),
+     ref(0),
+     ob(o),
+     last_write_tid(0) {
+     // the extent used to be left uninitialized, so start()/length() (and
+     // anything printing or accounting a fresh bh) read garbage.
+     ex.start = 0;
+     ex.length = 0;
+   }
+
+   // extent
+   off_t start() { return ex.start; }
+   void set_start(off_t s) { ex.start = s; }
+   off_t length() { return ex.length; }
+   void set_length(off_t l) { ex.length = l; }
+   off_t end() { return ex.start + ex.length; }   // one past the last byte
+   off_t last() { return end() - 1; }             // offset of the last byte
+
+   // states.  a bh in RX or TX holds one reference on itself for as long as
+   // it stays in that state; take the new ref before dropping the old one so
+   // an RX<->TX transition never unpins in between.
+   void set_state(int s) {
+     if (s == STATE_RX || s == STATE_TX) get();
+     if (state == STATE_RX || state == STATE_TX) put();
+     state = s;
+   }
+   int get_state() { return state; }
+
+   bool is_missing() { return state == STATE_MISSING; }
+   bool is_dirty() { return state == STATE_DIRTY; }
+   bool is_clean() { return state == STATE_CLEAN; }
+   bool is_tx() { return state == STATE_TX; }
+   bool is_rx() { return state == STATE_RX; }
+
+   // reference counting.  the first ref pins the bh in its LRU, dropping the
+   // last one unpins it.  both return the new count.
+   int get() {
+     assert(ref >= 0);
+     if (ref == 0) lru_pin();
+     return ++ref;
+   }
+   int put() {
+     assert(ref > 0);
+     if (ref == 1) lru_unpin();
+     --ref;
+     return ref;
+   }
+ };
+
+
+ // ******* Object *********
+ // Cache-side view of one object: its buffer heads keyed by start offset,
+ // the tids of its latest write/ack/commit, the contexts waiting on those,
+ // and the state of its read/write lock.
+ class Object {
+ private:
+   // ObjectCacher::Object fields
+   ObjectCacher *oc;
+   object_t oid;   // this _always_ is oid.rev=0
+   inodeno_t ino;
+   objectrev_t rev; // last rev we've written
+
+ public:
+   map<off_t, BufferHead*> data;   // start offset -> bh
+
+   tid_t last_write_tid;  // version of bh (if non-zero)
+   tid_t last_ack_tid;    // last update acked.
+   tid_t last_commit_tid; // last update committed.
+
+   map< tid_t, list<Context*> > waitfor_ack;
+   map< tid_t, list<Context*> > waitfor_commit;
+   list<Context*> waitfor_rd;
+   list<Context*> waitfor_wr;
+
+   // lock
+   static const int LOCK_NONE = 0;
+   static const int LOCK_WRLOCKING = 1;
+   static const int LOCK_WRLOCK = 2;
+   static const int LOCK_WRUNLOCKING = 3;
+   static const int LOCK_RDLOCKING = 4;
+   static const int LOCK_RDLOCK = 5;
+   static const int LOCK_RDUNLOCKING = 6;
+   static const int LOCK_UPGRADING = 7;   // rd -> wr
+   static const int LOCK_DOWNGRADING = 8; // wr -> rd
+   int lock_state;
+   int wrlock_ref;  // how many ppl want or are using a WRITE lock
+   int rdlock_ref;  // how many ppl want or are using a READ lock
+
+ public:
+   // a new object is unlocked, unreferenced and has seen no writes.
+   // (rev used to be left uninitialized.)
+   Object(ObjectCacher *_oc, object_t o, inodeno_t i) :
+     oc(_oc),
+     oid(o), ino(i), rev(0),
+     last_write_tid(0), last_ack_tid(0), last_commit_tid(0),
+     lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0)
+   {}
+
+   object_t get_oid() { return oid; }
+   inodeno_t get_ino() { return ino; }
+
+   // true once nothing is cached, locked or waiting on this object.
+   bool can_close() {
+     return data.empty() && lock_state == LOCK_NONE &&
+       waitfor_ack.empty() && waitfor_commit.empty() &&
+       waitfor_rd.empty() && waitfor_wr.empty();
+   }
+
+   // bh
+   // index bh under its start offset; that offset must not be taken yet.
+   void add_bh(BufferHead *bh) {
+     // add to my map
+     assert(data.count(bh->start()) == 0);
+
+     if (0) {  // sanity check     FIXME DEBUG
+       // (disabled) verify bh overlaps neither of its neighbours
+       //cout << "add_bh " << bh->start() << "~" << bh->length() << endl;
+       map<off_t,BufferHead*>::iterator p = data.lower_bound(bh->start());
+       if (p != data.end()) {
+         //cout << " after " << *p->second << endl;
+         //cout << " after starts at " << p->first << endl;
+         assert(p->first >= bh->end());
+       }
+       if (p != data.begin()) {
+         p--;
+         //cout << " before starts at " << p->second->start()
+         //<< " and ends at " << p->second->end() << endl;
+         //cout << " before " << *p->second << endl;
+         assert(p->second->end() <= bh->start());
+       }
+     }
+
+     data[bh->start()] = bh;
+   }
+   // drop bh from the index; it is not deleted here.
+   void remove_bh(BufferHead *bh) {
+     assert(data.count(bh->start()));
+     data.erase(bh->start());
+   }
+   bool is_empty() { return data.empty(); }
+
+   // mid-level
+   BufferHead *split(BufferHead *bh, off_t off);
+   void merge_left(BufferHead *left, BufferHead *right);
+   void merge_right(BufferHead *left, BufferHead *right);
+
+   int map_read(Objecter::OSDRead *rd,
+                map<off_t, BufferHead*>& hits,
+                map<off_t, BufferHead*>& missing,
+                map<off_t, BufferHead*>& rx);
+   BufferHead *map_write(Objecter::OSDWrite *wr);
+
+ };
+
+ // ******* ObjectCacher *********
+ // ObjectCacher fields
+ public:
+ Objecter *objecter;
+ Filer filer;
+
+ private:
+ Mutex& lock;
+
+ hash_map<object_t, Object*> objects;
+ hash_map<inodeno_t, set<Object*> > objects_by_ino;
+
+ set<BufferHead*> dirty_bh;
+ LRU lru_dirty, lru_rest;
+
+ Cond flusher_cond;
+ bool flusher_stop;
+ void flusher_entry();
+ // Thread whose body is the owning cacher's flusher_entry().
+ class FlusherThread : public Thread {
+   ObjectCacher *cacher;
+ public:
+   FlusherThread(ObjectCacher *o)
+     : cacher(o)
+   {}
+
+   // thread body: run the flusher loop, then hand back a null result.
+   void *entry() {
+     cacher->flusher_entry();
+     return 0;
+   }
+ } flusher_thread;
+
+
+ // objects
+ // Look up the cache object for oid, creating and registering it (under
+ // both oid and ino) if we don't have one yet.
+ Object *get_object(object_t oid, inodeno_t ino) {
+   hash_map<object_t, Object*>::iterator known = objects.find(oid);
+   if (known != objects.end())
+     return known->second;      // have it.
+
+   // create it.
+   Object *fresh = new Object(this, oid, ino);
+   objects[oid] = fresh;
+   objects_by_ino[ino].insert(fresh);
+   return fresh;
+ }
+ // Forget and delete ob, which must be closeable (see Object::can_close).
+ // The inode's entry is dropped too once its last object is gone.
+ void close_object(Object *ob) {
+   assert(ob->can_close());
+
+   // ok!
+   objects.erase(ob->get_oid());
+
+   set<Object*>& siblings = objects_by_ino[ob->get_ino()];
+   siblings.erase(ob);
+   if (siblings.empty())
+     objects_by_ino.erase(ob->get_ino());
+
+   delete ob;
+ }
+
+ // bh stats
+ Cond stat_cond;
+ int stat_waiter;
+
+ off_t stat_clean;
+ off_t stat_dirty;
+ off_t stat_rx;
+ off_t stat_tx;
+ off_t stat_missing;
+
+ // Add bh's length to the byte counter of its current state, and wake a
+ // stat waiter if there is one.
+ void bh_stat_add(BufferHead *bh) {
+   const off_t len = bh->length();
+   switch (bh->get_state()) {
+   case BufferHead::STATE_MISSING:
+     stat_missing += len;
+     break;
+   case BufferHead::STATE_CLEAN:
+     stat_clean += len;
+     break;
+   case BufferHead::STATE_DIRTY:
+     stat_dirty += len;
+     break;
+   case BufferHead::STATE_TX:
+     stat_tx += len;
+     break;
+   case BufferHead::STATE_RX:
+     stat_rx += len;
+     break;
+   }
+   if (stat_waiter)
+     stat_cond.Signal();
+ }
+ // Subtract bh's length from the byte counter of its current state.
+ void bh_stat_sub(BufferHead *bh) {
+   const off_t len = bh->length();
+   switch (bh->get_state()) {
+   case BufferHead::STATE_MISSING:
+     stat_missing -= len;
+     break;
+   case BufferHead::STATE_CLEAN:
+     stat_clean -= len;
+     break;
+   case BufferHead::STATE_DIRTY:
+     stat_dirty -= len;
+     break;
+   case BufferHead::STATE_TX:
+     stat_tx -= len;
+     break;
+   case BufferHead::STATE_RX:
+     stat_rx -= len;
+     break;
+   }
+ }
+ off_t get_stat_tx() { return stat_tx; } // total length of bhs accounted in STATE_TX
+ off_t get_stat_rx() { return stat_rx; } // total length of bhs accounted in STATE_RX
+ off_t get_stat_dirty() { return stat_dirty; } // total length of bhs accounted in STATE_DIRTY
+ off_t get_stat_clean() { return stat_clean; } // total length of bhs accounted in STATE_CLEAN
+
+ // Touch bh in whichever LRU it lives in: lru_dirty for dirty buffers,
+ // lru_rest for everything else.
+ void touch_bh(BufferHead *bh) {
+   LRU& home = bh->is_dirty() ? lru_dirty : lru_rest;
+   home.lru_touch(bh);
+ }
+
+ // bh states
+ // Change bh's state to s, keeping the LRU lists, the dirty set and the
+ // per-state byte counters in step.
+ void bh_set_state(BufferHead *bh, int s) {
+   const bool was_dirty = (bh->get_state() == BufferHead::STATE_DIRTY);
+   const bool now_dirty = (s == BufferHead::STATE_DIRTY);
+
+   // move between lru lists?
+   if (now_dirty && !was_dirty) {
+     // becoming dirty
+     lru_rest.lru_remove(bh);
+     lru_dirty.lru_insert_top(bh);
+     dirty_bh.insert(bh);
+   } else if (was_dirty && !now_dirty) {
+     // no longer dirty
+     lru_dirty.lru_remove(bh);
+     lru_rest.lru_insert_mid(bh);
+     dirty_bh.erase(bh);
+   }
+
+   // set state; account the bh under the old state out and the new one in
+   bh_stat_sub(bh);
+   bh->set_state(s);
+   bh_stat_add(bh);
+ }
+
+ void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { // put bh2 into bh1's current state; bh1 is only read
+ bh_set_state(bh2, bh1->get_state());
+ }
+
+ void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; // shorthand state setters; all go through bh_set_state()
+ void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); };
+ void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); };
+ void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); };
+ void mark_dirty(BufferHead *bh) { // unlike the others, also touches bh in lru_dirty, even if it was already dirty
+ bh_set_state(bh, BufferHead::STATE_DIRTY);
+ lru_dirty.lru_touch(bh);
+ //bh->set_dirty_stamp(g_clock.now());
+ };
+
+ // Attach bh to ob: index it in the object, put it at the top of the LRU
+ // matching its state, and account its length.
+ void bh_add(Object *ob, BufferHead *bh) {
+   ob->add_bh(bh);
+   LRU& home = bh->is_dirty() ? lru_dirty : lru_rest;
+   home.lru_insert_top(bh);
+   bh_stat_add(bh);
+ }
+ // Detach bh from ob: unindex it, take it out of the LRU matching its
+ // state, and un-account its length.  The bh itself is not deleted here.
+ void bh_remove(Object *ob, BufferHead *bh) {
+   ob->remove_bh(bh);
+   LRU& home = bh->is_dirty() ? lru_dirty : lru_rest;
+   home.lru_remove(bh);
+   bh_stat_sub(bh);
+ }
+
+ // io
+ void bh_read(BufferHead *bh);
+ void bh_write(BufferHead *bh);
+
+ void trim(off_t max=-1);
+ void flush(off_t amount=0);
+
+ bool flush(Object *o);
+ off_t release(Object *o);
+
+ void rdlock(Object *o);
+ void rdunlock(Object *o);
+ void wrlock(Object *o);
+ void wrunlock(Object *o);
+
+ public:
+ void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl);
+ void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t);
+ void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t);
+ void lock_ack(list<object_t>& oids, tid_t tid);
+
+ // Completion for a bh read: hands the extent it was created for, plus
+ // whatever landed in bl, to bh_read_finish().  The result code is ignored.
+ class C_ReadFinish : public Context {
+   ObjectCacher *cacher;
+   object_t obj;
+   off_t off;
+   size_t len;
+ public:
+   bufferlist bl;
+   C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l)
+     : cacher(c), obj(o), off(s), len(l)
+   {}
+   void finish(int r) {
+     cacher->bh_read_finish(obj, off, len, bl);
+   }
+ };
+
+ // Completion for a write ack: reports the extent it was created for, and
+ // the tid the issuer stored in `tid`, to bh_write_ack().  The result code
+ // is ignored.
+ class C_WriteAck : public Context {
+   ObjectCacher *oc;
+   object_t oid;
+   off_t start;
+   size_t length;
+ public:
+   tid_t tid;   // filled in by the issuer once the op has a tid
+   // tid starts at 0 rather than garbage, in case finish() ever runs before
+   // the issuer has assigned it.
+   C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) :
+     oc(c), oid(o), start(s), length(l), tid(0) {}
+   void finish(int r) {
+     oc->bh_write_ack(oid, start, length, tid);
+   }
+ };
+ // Completion for a write commit: reports the extent it was created for,
+ // and the tid the issuer stored in `tid`, to bh_write_commit().  The
+ // result code is ignored.
+ class C_WriteCommit : public Context {
+   ObjectCacher *oc;
+   object_t oid;
+   off_t start;
+   size_t length;
+ public:
+   tid_t tid;   // filled in by the issuer once the op has a tid
+   // tid starts at 0 rather than garbage, in case finish() ever runs before
+   // the issuer has assigned it.
+   C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) :
+     oc(c), oid(o), start(s), length(l), tid(0) {}
+   void finish(int r) {
+     oc->bh_write_commit(oid, start, length, tid);
+   }
+ };
+
+ // Completion for a lock/unlock op: reports its oid list, and the tid the
+ // issuer stored in `tid`, to lock_ack().  The result code is ignored.
+ class C_LockAck : public Context {
+   ObjectCacher *oc;
+ public:
+   list<object_t> oids;   // starts out holding just the constructor's oid
+   tid_t tid;             // filled in by the issuer once the op has a tid
+   // tid starts at 0 rather than garbage, in case finish() ever runs before
+   // the issuer has assigned it.
+   C_LockAck(ObjectCacher *c, object_t o) : oc(c), tid(0) {
+     oids.push_back(o);
+   }
+   void finish(int r) {
+     oc->lock_ack(oids, tid);
+   }
+ };
+
+
+
+ public:
+ ObjectCacher(Objecter *o, Mutex& l) : // o: objecter used for all I/O (also handed to filer); l: mutex kept by reference as `lock`
+ objecter(o), filer(o), lock(l),
+ flusher_stop(false), flusher_thread(this),
+ stat_waiter(0),
+ stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) {
+ flusher_thread.create(); // the flusher thread is started as part of construction
+ }
+ ~ObjectCacher() { // asks the flusher thread to stop and waits for it to exit
+ //lock.Lock(); // hmm.. watch out for deadlock!
+ flusher_stop = true; // NOTE(review): set and signalled without holding `lock` (the locking is commented out) -- confirm flusher_entry() cannot miss this wakeup
+ flusher_cond.Signal();
+ //lock.Unlock();
+ flusher_thread.join();
+ }
+
+
+ // Context that re-issues a readx() when it fires.  If that readx() returns
+ // a positive value, the original completion is finished with that value
+ // and deleted here; otherwise it is left to readx().
+ class C_RetryRead : public Context {
+   ObjectCacher *cacher;
+   Objecter::OSDRead *read_op;
+   inodeno_t inode_no;
+   Context *on_done;
+ public:
+   C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c)
+     : cacher(_oc), read_op(r), inode_no(i), on_done(c)
+   {}
+   void finish(int) {
+     const int got = cacher->readx(read_op, inode_no, on_done);
+     if (got <= 0)
+       return;
+     on_done->finish(got);
+     delete on_done;
+   }
+ };
+
+ // non-blocking. async.
+ int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish);
+ int writex(Objecter::OSDWrite *wr, inodeno_t ino);
+
+ // write blocking
+ void wait_for_write(size_t len, Mutex& lock);
+
+ // blocking. atomic+sync.
+ int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock);
+ int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock);
+
+ bool set_is_cached(inodeno_t ino);
+ bool set_is_dirty_or_committing(inodeno_t ino);
+
+ bool flush_set(inodeno_t ino, Context *onfinish=0);
+ void flush_all(Context *onfinish=0);
+
+ bool commit_set(inodeno_t ino, Context *oncommit);
+ void commit_all(Context *oncommit=0);
+
+ off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean)
+
+ void kick_sync_writers(inodeno_t ino);
+ void kick_sync_readers(inodeno_t ino);
+
+
+ // file functions
+
+ /*** async+caching (non-blocking) file interface ***/
+ // Async, caching read of len bytes at offset of the file described by
+ // inode into *bl.  Maps the byte range to object extents and passes the
+ // resulting op to readx(), whose return value is returned unchanged.
+ int file_read(inode_t& inode,
+               off_t offset, size_t len,
+               bufferlist *bl,
+               Context *onfinish) {
+   Objecter::OSDRead *op = new Objecter::OSDRead(bl);
+   filer.file_to_extents(inode, offset, len, op->extents);
+   const int result = readx(op, inode.ino, onfinish);
+   return result;
+ }
+
+ // Async, caching write of len bytes from bl at offset of the file described
+ // by inode.  Maps the byte range to object extents and passes the resulting
+ // op to writex(), whose return value is returned unchanged.  rev is
+ // accepted but not used here.
+ int file_write(inode_t& inode,
+                off_t offset, size_t len,
+                bufferlist& bl,
+                objectrev_t rev=0) {
+   Objecter::OSDWrite *op = new Objecter::OSDWrite(bl);
+   filer.file_to_extents(inode, offset, len, op->extents);
+   const int result = writex(op, inode.ino);
+   return result;
+ }
+
+
+
+ /*** sync+blocking file interface ***/
+
+ // Blocking, atomic read of len bytes at offset of the file described by
+ // inode into *bl.  Maps the byte range to object extents and passes the
+ // resulting op, with lock, to atomic_sync_readx(), whose return value is
+ // returned unchanged.
+ int file_atomic_sync_read(inode_t& inode,
+                           off_t offset, size_t len,
+                           bufferlist *bl,
+                           Mutex &lock) {
+   Objecter::OSDRead *op = new Objecter::OSDRead(bl);
+   filer.file_to_extents(inode, offset, len, op->extents);
+   const int result = atomic_sync_readx(op, inode.ino, lock);
+   return result;
+ }
+
+ // Blocking, atomic write of len bytes from bl at offset of the file
+ // described by inode.  Maps the byte range to object extents and passes the
+ // resulting op, with lock, to atomic_sync_writex(), whose return value is
+ // returned unchanged.  rev is accepted but not used here.
+ int file_atomic_sync_write(inode_t& inode,
+                            off_t offset, size_t len,
+                            bufferlist& bl,
+                            Mutex &lock,
+                            objectrev_t rev=0) {
+   Objecter::OSDWrite *op = new Objecter::OSDWrite(bl);
+   filer.file_to_extents(inode, offset, len, op->extents);
+   const int result = atomic_sync_writex(op, inode.ino, lock);
+   return result;
+ }
+
+};
+
+
+// Print a buffer head as "bh[start~length (buffered bytes) v tid state]".
+// Nothing is appended for a state value that matches none of the names.
+inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh)
+{
+  out << "bh[" << bh.start() << "~" << bh.length();
+  out << " (" << bh.bl.length() << ")";
+  out << " v " << bh.last_write_tid;
+
+  switch (bh.get_state()) {
+  case ObjectCacher::BufferHead::STATE_TX:
+    out << " tx";
+    break;
+  case ObjectCacher::BufferHead::STATE_RX:
+    out << " rx";
+    break;
+  case ObjectCacher::BufferHead::STATE_DIRTY:
+    out << " dirty";
+    break;
+  case ObjectCacher::BufferHead::STATE_CLEAN:
+    out << " clean";
+    break;
+  case ObjectCacher::BufferHead::STATE_MISSING:
+    out << " missing";
+    break;
+  }
+
+  return out << "]";
+}
+
+// Print an object as "object[oid ino N wr write/ack/commit tids lockstate]",
+// with oid and ino in hex.  Nothing is printed for LOCK_NONE.
+inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob)
+{
+  out << "object["
+      << hex << ob.get_oid() << " ino " << ob.get_ino() << dec
+      << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid;
+
+  switch (ob.lock_state) {
+  case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break;
+  case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break;
+  case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break;
+  case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break;
+  case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break;
+  case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break;
+  // the two transitional states used to print nothing, which made an
+  // object that was mid-upgrade/downgrade look unlocked in the logs.
+  case ObjectCacher::Object::LOCK_UPGRADING: out << " upgrading"; break;
+  case ObjectCacher::Object::LOCK_DOWNGRADING: out << " downgrading"; break;
+  }
+
+  out << "]";
+  return out;
+}
+
+#endif
--- /dev/null
+
+#include "Objecter.h"
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+
+#include "messages/MOSDFailure.h"
+
+#include <errno.h>
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
+
+
+// messages ------------------------------
+
+// Entry point for incoming messages: hand m to the handler for its type.
+// Any type other than an OSD op reply or an OSD map is a fatal error.
+void Objecter::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+  case MSG_OSD_MAP:
+    handle_osd_map(static_cast<MOSDMap*>(m));
+    return;
+
+  case MSG_OSD_OPREPLY:
+    handle_osd_op_reply(static_cast<MOSDOpReply*>(m));
+    return;
+
+  default:
+    {
+      dout(1) << "don't know message type " << m->get_type() << endl;
+    }
+    assert(0);
+  }
+}
+ /* Apply the OSD map epochs carried by m to our osdmap, one at a time.
+  *
+  * After each applied epoch the known pgs are rescanned; once done, ops in
+  * pgs whose mapping changed significantly are resubmitted.  A message
+  * carrying nothing newer than our epoch is ignored.  m is deleted.
+  */
+void Objecter::handle_osd_map(MOSDMap *m)
+{
+ assert(osdmap);
+
+ if (m->get_last() <= osdmap->get_epoch()) {
+ dout(3) << "handle_osd_map ignoring epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "] <= " << osdmap->get_epoch() << endl;
+ }
+ else {
+ dout(3) << "handle_osd_map got epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "] > " << osdmap->get_epoch()
+ << endl;
+
+ set<pg_t> changed_pgs; // accumulated across all epochs applied below
+
+ for (epoch_t e = osdmap->get_epoch() + 1; // walk forward from the epoch after ours
+ e <= m->get_last();
+ e++) {
+ if (m->incremental_maps.count(e)) { // prefer an incremental update when the message has one
+ dout(3) << "handle_osd_map decoding incremental epoch " << e << endl;
+ OSDMap::Incremental inc;
+ int off = 0;
+ inc.decode(m->incremental_maps[e], off);
+ osdmap->apply_incremental(inc);
+
+ // notify messenger
+ for (map<int,entity_inst_t>::iterator i = inc.new_down.begin();
+ i != inc.new_down.end();
+ i++)
+ messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
+ for (map<int,entity_inst_t>::iterator i = inc.new_up.begin();
+ i != inc.new_up.end();
+ i++)
+ messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
+
+ }
+ else if (m->maps.count(e)) { // otherwise fall back to a full map for this epoch
+ dout(3) << "handle_osd_map decoding full epoch " << e << endl;
+ osdmap->decode(m->maps[e]);
+ }
+ else { // neither form present: ask a monitor and stop applying epochs
+ dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl;
+ int mon = monmap->pick_mon();
+ messenger->send_message(new MOSDGetMap(osdmap->get_epoch()),
+ MSG_ADDR_MON(mon), monmap->get_inst(mon));
+ break;
+ }
+
+ // scan pgs for changes
+ scan_pgs(changed_pgs);
+
+ assert(e == osdmap->get_epoch()); // the map must now be exactly at epoch e
+ }
+
+ // kick requests who might be timing out on the wrong osds
+ if (!changed_pgs.empty())
+ kick_requests(changed_pgs);
+ }
+
+ delete m;
+}
+ /* Recompute the acting OSD set of every pg in pg_map against the current
+  * osdmap, store the new set in the pg, and add to changed_pgs each pg
+  * whose change is significant for the configured replication mode
+  * (g_conf.osd_rep).  changed_pgs is only ever added to.
+  */
+void Objecter::scan_pgs(set<pg_t>& changed_pgs)
+{
+ dout(10) << "scan_pgs" << endl;
+
+ for (hash_map<pg_t,PG>::iterator i = pg_map.begin();
+ i != pg_map.end();
+ i++) {
+ pg_t pgid = i->first;
+ PG& pg = i->second;
+
+ // calc new.
+ vector<int> other;
+ osdmap->pg_to_acting_osds(pgid, other);
+
+ if (other == pg.acting)
+ continue; // no change.
+
+ other.swap(pg.acting); // from here on pg.acting is the NEW set and `other` the OLD one
+
+ if (g_conf.osd_rep == OSD_REP_PRIMARY) {
+ // same primary?
+ if (!other.empty() &&
+ !pg.acting.empty() &&
+ other[0] == pg.acting[0])
+ continue;
+ }
+ else if (g_conf.osd_rep == OSD_REP_SPLAY) {
+ // same primary and acker?
+ if (!other.empty() &&
+ !pg.acting.empty() &&
+ other[0] == pg.acting[0] &&
+ other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) // compares element 1, or element 0 when a set has a single member
+ continue;
+ }
+ else if (g_conf.osd_rep == OSD_REP_CHAIN) {
+ // any change is significant.
+ }
+
+ // changed significantly.
+ dout(10) << "scan_pgs pg " << pgid
+ << " (" << pg.active_tids << ")"
+ << " " << other << " -> " << pg.acting
+ << endl;
+ changed_pgs.insert(pgid);
+ }
+}
+ /* Resubmit every in-flight op of the pgs in changed_pgs.
+  *
+  * Each pg's active tids are taken over and the pg is closed.  Each tid is
+  * then looked up among the pending modifies, reads and stats, removed
+  * from that table and submitted again.  A tid found in none of the three
+  * tables is a fatal error.
+  */
+void Objecter::kick_requests(set<pg_t>& changed_pgs)
+{
+ dout(10) << "kick_requests in pgs " << changed_pgs << endl;
+
+ for (set<pg_t>::iterator i = changed_pgs.begin();
+ i != changed_pgs.end();
+ i++) {
+ pg_t pgid = *i;
+ PG& pg = pg_map[pgid];
+
+ // resubmit ops!
+ set<tid_t> tids;
+ tids.swap( pg.active_tids ); // take the tids and leave the pg with none
+ close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing
+
+ for (set<tid_t>::iterator p = tids.begin();
+ p != tids.end();
+ p++) {
+ tid_t tid = *p;
+
+ if (op_modify.count(tid)) {
+ OSDModify *wr = op_modify[tid];
+ op_modify.erase(tid);
+
+ // WRITE
+ if (wr->tid_version.count(tid)) { // a version is recorded for this tid: only the commit is outstanding
+ if (wr->op == OSD_OP_WRITE &&
+ !g_conf.objecter_buffer_uncommitted) {
+ dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; // NOTE(review): the op is dropped from op_modify and not resubmitted here
+ } else {
+ dout(0) << "kick_requests missing commit, replay write " << tid
+ << " v " << wr->tid_version[tid] << endl;
+ modifyx_submit(wr, wr->waitfor_commit[tid], tid);
+ }
+ }
+ else if (wr->waitfor_ack.count(tid)) { // not even acked yet
+ dout(0) << "kick_requests missing ack, resub write " << tid << endl;
+ modifyx_submit(wr, wr->waitfor_ack[tid], tid);
+ }
+ }
+
+ else if (op_read.count(tid)) {
+ // READ
+ OSDRead *rd = op_read[tid];
+ op_read.erase(tid);
+ dout(0) << "kick_requests resub read " << tid << endl;
+
+ // resubmit
+ readx_submit(rd, rd->ops[tid]); // registers the extent again under a new tid...
+ rd->ops.erase(tid); // ...so the entry for the old tid can go
+ }
+
+ else if (op_stat.count(tid)) {
+ OSDStat *st = op_stat[tid];
+ op_stat.erase(tid);
+
+ dout(0) << "kick_requests resub stat " << tid << endl;
+
+ // resubmit
+ stat_submit(st);
+ }
+
+ else
+ assert(0);
+ }
+ }
+}
+
+
+
+// Route an OSD op reply to the stat, read or modify reply handler according
+// to the op it answers.  An unknown op is a fatal error.
+void Objecter::handle_osd_op_reply(MOSDOpReply *m)
+{
+  switch (m->get_op()) {
+  case OSD_OP_STAT:
+    handle_osd_stat_reply(m);
+    return;
+
+  case OSD_OP_READ:
+    handle_osd_read_reply(m);
+    return;
+
+  // data modifications
+  case OSD_OP_WRITE:
+  case OSD_OP_WRNOOP:
+  case OSD_OP_ZERO:
+  case OSD_OP_DELETE:
+  // lock operations travel the modify path too
+  case OSD_OP_RDLOCK:
+  case OSD_OP_RDUNLOCK:
+  case OSD_OP_WRLOCK:
+  case OSD_OP_WRUNLOCK:
+  case OSD_OP_UPLOCK:
+  case OSD_OP_DNLOCK:
+    handle_osd_modify_reply(m);
+    return;
+
+  default:
+    assert(0);
+  }
+}
+
+
+
+// stat -----------------------------------
+
+// Start a stat of object oid at revision rev.  The size is delivered through
+// *size and onfinish is called when the reply arrives.  Returns the tid of
+// the submitted op.
+tid_t Objecter::stat(object_t oid, off_t *size, Context *onfinish,
+                     objectrev_t rev)
+{
+  // describe the target: a zero-length extent of the object
+  ObjectExtent target(oid, 0, 0);
+  target.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  target.rev = rev;
+
+  OSDStat *st = new OSDStat(size);
+  st->extents.push_back(target);
+  st->onfinish = onfinish;
+
+  return stat_submit(st);
+}
+
+// Send (or re-send) the stat op st under a fresh tid and register it as in
+// flight for its pg.  Returns the new tid.
+//
+// If the pg currently has no acker the op is only registered, not sent;
+// kick_requests() submits it again when the pg's mapping changes.
+tid_t Objecter::stat_submit(OSDStat *st)
+{
+  // find OSD
+  ObjectExtent &ex = st->extents.front();
+  PG &pg = get_pg( ex.pgid );
+
+  // send
+  last_tid++;
+  MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(),
+                         OSD_OP_STAT);
+  dout(10) << "stat_submit " << st << " tid " << last_tid
+           << " oid " << ex.oid
+           << " pg " << ex.pgid
+           << " osd" << pg.acker()
+           << endl;
+
+  if (pg.acker() >= 0) {
+    messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker()));
+  } else {
+    // nobody to send to.  a resubmit builds its own message, so this one
+    // would otherwise never be freed.
+    delete m;
+  }
+
+  // add to gather set
+  st->tid = last_tid;
+  op_stat[last_tid] = st;
+
+  pg.active_tids.insert(last_tid);
+
+  return last_tid;
+}
+ /* Handle the reply to a stat op.
+  *
+  * A reply for a tid we are not waiting on is dropped.  Otherwise the op is
+  * unregistered; on -EAGAIN it is submitted again, else the size (or -1 on
+  * error) is stored through st->size and the completion context is called
+  * with the reply's result code.  m is deleted on every path.
+  */
+void Objecter::handle_osd_stat_reply(MOSDOpReply *m)
+{
+ // get pio
+ tid_t tid = m->get_tid();
+
+ if (op_stat.count(tid) == 0) {
+ dout(7) << "handle_osd_stat_reply " << tid << " ... stray" << endl;
+ delete m;
+ return;
+ }
+
+ dout(7) << "handle_osd_stat_reply " << tid
+ << " r=" << m->get_result()
+ << " size=" << m->get_object_size()
+ << endl;
+ OSDStat *st = op_stat[ tid ];
+ op_stat.erase( tid );
+
+ // remove from osd/tid maps
+ PG& pg = get_pg( m->get_pg() );
+ assert(pg.active_tids.count(tid));
+ pg.active_tids.erase(tid);
+ if (pg.active_tids.empty()) close_pg( m->get_pg() ); // last in-flight op for this pg
+
+ // success?
+ if (m->get_result() == -EAGAIN) {
+ dout(7) << " got -EAGAIN, resubmitting" << endl;
+ stat_submit(st); // goes out again under a new tid; st stays alive
+ delete m;
+ return;
+ }
+ //assert(m->get_result() >= 0);
+
+ // ok!
+ if (m->get_result() < 0) {
+ *st->size = -1; // any other error is reported as size -1
+ } else {
+ *st->size = m->get_object_size();
+ }
+
+ // finish, clean up
+ Context *onfinish = st->onfinish; // saved because st is deleted before the callback runs
+
+ // done
+ delete st;
+ if (onfinish) {
+ onfinish->finish(m->get_result());
+ delete onfinish;
+ }
+
+ delete m;
+}
+
+
+// read -----------------------------------
+
+
+// Start a read of len bytes at off of object oid (revision rev) into *bl.
+// onfinish becomes the read's completion context.  Returns the tid of the
+// submitted op.
+tid_t Objecter::read(object_t oid, off_t off, size_t len, bufferlist *bl,
+                     Context *onfinish,
+                     objectrev_t rev)
+{
+  // describe the single extent we want
+  ObjectExtent target(oid, off, len);
+  target.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  target.rev = rev;
+
+  OSDRead *rd = new OSDRead(bl);
+  rd->extents.push_back(target);
+
+  // readx() returns last_tid once it has submitted the extent
+  return readx(rd, onfinish);
+}
+
+
+// Submit every extent of a prepared OSDRead.
+//
+// Records the completion callback on rd, issues one read op per extent via
+// readx_submit(), and returns last_tid (the tid of the last op issued).
+tid_t Objecter::readx(OSDRead *rd, Context *onfinish)
+{
+  rd->onfinish = onfinish;
+
+  // one op per object extent
+  list<ObjectExtent>::iterator cur = rd->extents.begin();
+  while (cur != rd->extents.end()) {
+    readx_submit(rd, *cur);
+    cur++;
+  }
+
+  return last_tid;
+}
+
+// Issue one read op for a single extent of rd.
+//
+// Allocates a new tid, builds an OSD_OP_READ message for the extent and
+// sends it to the pg's acker if one is known.  The op is registered in
+// rd->ops, op_read and the pg's active_tids whether or not it could be
+// sent.  Returns the new tid.
+tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex)
+{
+  // find OSD
+  PG &pg = get_pg( ex.pgid );
+
+  // send
+  last_tid++;
+  MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(),
+                         OSD_OP_READ);
+  m->set_length(ex.length);
+  m->set_offset(ex.start);
+  dout(10) << "readx_submit " << rd << " tid " << last_tid
+           << " oid " << ex.oid << " " << ex.start << "~" << ex.length
+           << " (" << ex.buffer_extents.size() << " buffer fragments)"
+           << " pg " << ex.pgid
+           << " osd" << pg.acker()
+           << endl;
+
+  if (pg.acker() >= 0)
+    messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker()));
+  else
+    delete m;  // nobody to send to; don't leak the unsent message
+
+  // add to gather set
+  rd->ops[last_tid] = ex;
+  op_read[last_tid] = rd;
+
+  pg.active_tids.insert(last_tid);
+
+  return last_tid;
+}
+
+
+// Process the reply to one read op (one extent of an OSDRead).
+//
+// The tid is removed from op_read, from the pg's active set and from the
+// read's outstanding ops.  On -EAGAIN the same extent is resubmitted under
+// a new tid.  Otherwise the returned data is either stashed in
+// rd->read_data (more ops outstanding) or, if this was the last op, all
+// fragments are assembled into rd->bl, rd is deleted and the completion is
+// called with the number of bytes read.  m is always deleted.
+void Objecter::handle_osd_read_reply(MOSDOpReply *m)
+{
+  // get pio
+  tid_t tid = m->get_tid();
+
+  if (op_read.count(tid) == 0) {
+    dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_osd_read_reply " << tid << endl;
+  OSDRead *rd = op_read[ tid ];
+  op_read.erase( tid );
+
+  // remove from osd/tid maps
+  PG& pg = get_pg( m->get_pg() );
+  assert(pg.active_tids.count(tid));
+  pg.active_tids.erase(tid);
+  if (pg.active_tids.empty()) close_pg( m->get_pg() );
+
+  // our op finished.  copy the extent out *before* erasing it: the
+  // -EAGAIN path below needs it, and indexing rd->ops[tid] after the
+  // erase would resubmit a default-constructed extent and leave a
+  // phantom entry in rd->ops that never completes.
+  ObjectExtent ex = rd->ops[tid];
+  rd->ops.erase(tid);
+
+  // success?
+  if (m->get_result() == -EAGAIN) {
+    dout(7) << " got -EAGAIN, resubmitting" << endl;
+    readx_submit(rd, ex);
+    delete m;
+    return;
+  }
+  //assert(m->get_result() >= 0);
+
+  // what buffer offset are we?
+  dout(7) << " got frag from " << m->get_oid() << " "
+          << m->get_offset() << "~" << m->get_length()
+          << ", still have " << rd->ops.size() << " more ops" << endl;
+
+  if (rd->ops.empty()) {
+    // all done
+    size_t bytes_read = 0;
+
+    if (rd->read_data.size()) {
+      dout(15) << " assembling frags" << endl;
+
+      /** FIXME This doesn't handle holes efficiently.
+       * It allocates zero buffers to fill whole buffer, and
+       * then discards trailing ones at the end.
+       *
+       * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over
+       * the heap.
+       */
+
+      // we have other fragments, assemble them all... blech!
+      rd->read_data[m->get_oid()] = new bufferlist;
+      rd->read_data[m->get_oid()]->claim( m->get_data() );
+
+      // map extents back into buffer
+      map<off_t, bufferlist*> by_off;  // buffer offset -> bufferlist
+
+      // for each object extent...
+      for (list<ObjectExtent>::iterator eit = rd->extents.begin();
+           eit != rd->extents.end();
+           eit++) {
+        bufferlist *ox_buf = rd->read_data[eit->oid];
+        unsigned ox_len = ox_buf->length();
+        unsigned ox_off = 0;
+        assert(ox_len <= eit->length);
+
+        // for each buffer extent we're mapping into...
+        for (map<size_t,size_t>::iterator bit = eit->buffer_extents.begin();
+             bit != eit->buffer_extents.end();
+             bit++) {
+          dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl;
+          by_off[bit->first] = new bufferlist;
+
+          if (ox_off + bit->second <= ox_len) {
+            // we got the whole bx
+            by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second);
+            if (bytes_read < bit->first + bit->second)
+              bytes_read = bit->first + bit->second;
+          } else if (ox_off + bit->second > ox_len && ox_off < ox_len) {
+            // we got part of this bx
+            by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off));
+            if (bytes_read < bit->first + ox_len-ox_off)
+              bytes_read = bit->first + ox_len-ox_off;
+
+            // zero end of bx
+            dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl;
+            bufferptr z(ox_off + bit->second - ox_len);
+            z.zero();
+            by_off[bit->first]->append( z );
+          } else {
+            // we got none of this bx. zero whole thing.
+            assert(ox_off >= ox_len);
+            dout(21) << " adding all zeros for this bit " << bit->second << endl;
+            bufferptr z(bit->second);
+            z.zero();
+            by_off[bit->first]->append( z );
+          }
+          ox_off += bit->second;
+        }
+        assert(ox_off == eit->length);
+      }
+
+      // sort and string bits together
+      for (map<off_t, bufferlist*>::iterator it = by_off.begin();
+           it != by_off.end();
+           it++) {
+        assert(it->second->length());
+        if (it->first < (off_t)bytes_read) {
+          dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl;
+          rd->bl->claim_append(*(it->second));
+        } else {
+          dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl;
+        }
+        delete it->second;
+      }
+
+      // trim trailing zeros?
+      if (rd->bl->length() > bytes_read) {
+        dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read
+                 << " len=" << rd->bl->length() << endl;
+        rd->bl->splice(bytes_read, rd->bl->length() - bytes_read);
+        assert(bytes_read == rd->bl->length());
+      }
+
+      // hose p->read_data bufferlist*'s
+      for (map<object_t, bufferlist*>::iterator it = rd->read_data.begin();
+           it != rd->read_data.end();
+           it++) {
+        delete it->second;
+      }
+    } else {
+      dout(15) << " only one frag" << endl;
+
+      // only one fragment, easy
+      rd->bl->claim( m->get_data() );
+      bytes_read = rd->bl->length();
+    }
+
+    // finish, clean up
+    Context *onfinish = rd->onfinish;
+
+    dout(7) << " " << bytes_read << " bytes "
+            << rd->bl->length()
+            << endl;
+
+    // done
+    delete rd;
+    if (onfinish) {
+      onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result());
+      delete onfinish;
+    }
+  } else {
+    // store my bufferlist for later assembling
+    rd->read_data[m->get_oid()] = new bufferlist;
+    rd->read_data[m->get_oid()]->claim( m->get_data() );
+  }
+
+  delete m;
+}
+
+
+
+// write ------------------------------------
+
+// Convenience write of a single object extent.
+//
+// Wraps bl in an OSDWrite with one extent whose buffer range is [0, len),
+// submits it through modifyx(), and returns last_tid.
+tid_t Objecter::write(object_t oid, off_t off, size_t len, bufferlist &bl,
+                      Context *onack, Context *oncommit,
+                      objectrev_t rev)
+{
+  OSDWrite *single = new OSDWrite(bl);
+  single->extents.push_back(ObjectExtent(oid, off, len));
+
+  // placement, buffer mapping and revision for the extent just queued
+  ObjectExtent &extent = single->extents.front();
+  extent.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  extent.buffer_extents[0] = len;
+  extent.rev = rev;
+
+  modifyx(single, onack, oncommit);
+  return last_tid;
+}
+
+
+// zero
+
+// Convenience OSD_OP_ZERO over a single object extent.
+//
+// Builds a one-extent OSDModify, submits it through modifyx(), and
+// returns last_tid.
+tid_t Objecter::zero(object_t oid, off_t off, size_t len,
+                     Context *onack, Context *oncommit,
+                     objectrev_t rev)
+{
+  OSDModify *zop = new OSDModify(OSD_OP_ZERO);
+  zop->extents.push_back(ObjectExtent(oid, off, len));
+
+  ObjectExtent &extent = zop->extents.front();
+  extent.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+  extent.rev = rev;
+
+  modifyx(zop, onack, oncommit);
+  return last_tid;
+}
+
+
+// lock ops
+
+// Submit a lock-type op (the caller supplies the op code) against a whole
+// object: a zero-length extent at offset 0 is sent through modifyx().
+// Returns last_tid.
+tid_t Objecter::lock(int op, object_t oid,
+                     Context *onack, Context *oncommit)
+{
+  OSDModify *lop = new OSDModify(op);
+  lop->extents.push_back(ObjectExtent(oid, 0, 0));
+
+  ObjectExtent &extent = lop->extents.front();
+  extent.pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout );
+
+  modifyx(lop, onack, oncommit);
+  return last_tid;
+}
+
+
+
+// generic modify -----------------------------------
+
+// Submit every extent of a prepared modify (write/zero/lock/...).
+//
+// Stores both callbacks on wr, issues one op per extent through
+// modifyx_submit(), and returns last_tid.
+tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit)
+{
+  wr->onack = onack;
+  wr->oncommit = oncommit;
+
+  // one op per object extent
+  list<ObjectExtent>::iterator cur = wr->extents.begin();
+  while (cur != wr->extents.end()) {
+    modifyx_submit(wr, *cur);
+    cur++;
+  }
+
+  return last_tid;
+}
+
+
+// Issue one modify op for a single extent of wr.
+//
+// If usetid is non-zero the op is (re)sent under that tid, otherwise a new
+// tid is allocated.  When a version was already recorded for the tid
+// (wr->tid_version) it is attached to the message, marking a replay.  For
+// OSD_OP_WRITE the extent's buffer_extents are gathered out of the write's
+// bufferlist into the message payload.  The op is registered in
+// waitfor_ack / waitfor_commit / op_modify / the pg's active_tids and the
+// unacked/uncommitted counters are bumped, whether or not a primary was
+// available to send to.  Returns the tid used.
+tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid)
+{
+  // find
+  PG &pg = get_pg( ex.pgid );
+
+  // send
+  tid_t tid;
+  if (usetid > 0)
+    tid = usetid;
+  else
+    tid = ++last_tid;
+
+  MOSDOp *m = new MOSDOp(tid, messenger->get_myaddr(),
+                         ex.oid, ex.pgid, osdmap->get_epoch(),
+                         wr->op);
+  m->set_length(ex.length);
+  m->set_offset(ex.start);
+  m->set_rev(ex.rev);
+
+  if (wr->tid_version.count(tid))
+    m->set_version(wr->tid_version[tid]);  // we're replaying this op!
+
+  // what type of op?
+  switch (wr->op) {
+  case OSD_OP_WRITE:
+    {
+      // map buffer segments into this extent
+      // (may be fragmented bc of striping)
+      bufferlist cur;
+      for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
+           bit != ex.buffer_extents.end();
+           bit++) {
+        bufferlist thisbit;
+        thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
+        cur.claim_append(thisbit);
+      }
+      assert(cur.length() == ex.length);
+      m->set_data(cur);//.claim(cur);
+    }
+    break;
+  }
+
+  // add to gather set
+  wr->waitfor_ack[tid] = ex;
+  wr->waitfor_commit[tid] = ex;
+  op_modify[tid] = wr;
+  pg.active_tids.insert(tid);
+
+  ++num_unacked;
+  ++num_uncommitted;
+
+  // send
+  dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid
+           << " oid " << ex.oid
+           << " " << ex.start << "~" << ex.length
+           << " pg " << ex.pgid
+           << " osd" << pg.primary()
+           << endl;
+  if (pg.primary() >= 0)
+    messenger->send_message(m, MSG_ADDR_OSD(pg.primary()), osdmap->get_inst(pg.primary()));
+  else
+    delete m;  // no primary to send to; don't leak the unsent message
+
+  dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl;
+
+  return tid;
+}
+
+
+
+// Process an ack or commit reply for a modify op.
+//
+// Replies from anyone other than the pg's acker are ignored.  An ack
+// removes the tid from waitfor_ack, records the version the OSD assigned,
+// and fires onack once every op of the modify has been acked.  A commit
+// retires the tid completely (op_modify, pg active set, both wait sets);
+// when the last commit arrives the modify is deleted and any remaining
+// onack plus oncommit are fired.  m is always deleted.
+void Objecter::handle_osd_modify_reply(MOSDOpReply *m)
+{
+  // get pio
+  tid_t tid = m->get_tid();
+
+  if (op_modify.count(tid) == 0) {
+    dout(7) << "handle_osd_modify_reply " << tid
+            << (m->get_commit() ? " commit":" ack")
+            << " ... stray" << endl;
+    delete m;
+    return;
+  }
+
+  dout(7) << "handle_osd_modify_reply " << tid
+          << (m->get_commit() ? " commit":" ack")
+          << " v " << m->get_version()
+          << endl;
+  OSDModify *wr = op_modify[ tid ];
+
+  Context *onack = 0;
+  Context *oncommit = 0;
+
+  PG &pg = get_pg( m->get_pg() );
+
+  // ignore?
+  if (pg.acker() != m->get_source().num()) {
+    dout(7) << " ignoring ack|commit from non-acker" << endl;
+    delete m;
+    return;
+  }
+
+  assert(m->get_result() >= 0);
+
+  // ack or commit?
+  if (m->get_commit()) {
+    //dout(15) << " handle_osd_write_reply commit on " << tid << endl;
+    assert(wr->tid_version.count(tid) == 0 ||
+           m->get_version() == wr->tid_version[tid]);
+
+    // remove from tid/osd maps
+    assert(pg.active_tids.count(tid));
+    pg.active_tids.erase(tid);
+    if (pg.active_tids.empty()) close_pg( m->get_pg() );
+
+    // commit.
+    op_modify.erase( tid );
+    // a commit also settles the ack.  if the ack itself never arrived,
+    // account for it here so num_unacked doesn't drift upward.
+    if (wr->waitfor_ack.erase(tid))
+      num_unacked--;
+    wr->waitfor_commit.erase(tid);
+
+    num_uncommitted--;
+
+    if (wr->waitfor_commit.empty()) {
+      onack = wr->onack;      // null if the ack callback already ran
+      oncommit = wr->oncommit;
+      delete wr;
+    }
+  } else {
+    // ack.
+    //dout(15) << " handle_osd_write_reply ack on " << tid << endl;
+    assert(wr->waitfor_ack.count(tid));
+    wr->waitfor_ack.erase(tid);
+
+    num_unacked--;
+
+    if (wr->tid_version.count(tid) &&
+        wr->tid_version[tid].version != m->get_version().version) {
+      dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid
+                << " did not achieve previous ordering" << endl;
+    }
+    wr->tid_version[tid] = m->get_version();
+
+    if (wr->waitfor_ack.empty()) {
+      onack = wr->onack;
+      wr->onack = 0;  // only do callback once
+
+      // buffer uncommitted?
+      if (!g_conf.objecter_buffer_uncommitted &&
+          wr->op == OSD_OP_WRITE) {
+        // discard buffer!
+        ((OSDWrite*)wr)->bl.clear();
+      }
+    }
+  }
+
+  // do callbacks
+  if (onack) {
+    onack->finish(0);
+    delete onack;
+  }
+  if (oncommit) {
+    oncommit->finish(0);
+    delete oncommit;
+  }
+
+  delete m;
+}
+
+
+
+// Messenger callback for a message that could not be delivered.
+//
+//  - destination was a monitor: resend the same message to the monitor
+//    returned by pick_mon(true);
+//  - destination was an OSD: drop the message and send an MOSDFailure
+//    report to a monitor;
+//  - anything else: drop the message.
+void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  if (dest.is_mon()) {
+    // try a new mon
+    int newmon = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst
+            << ", resending to mon" << newmon
+            << endl;
+    messenger->send_message(m, MSG_ADDR_MON(newmon), monmap->get_inst(newmon));
+    return;
+  }
+
+  if (dest.is_osd()) {
+    int reportto = monmap->pick_mon();
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst
+            << ", dropping and reporting to mon" << reportto
+            << endl;
+    messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()),
+                            MSG_ADDR_MON(reportto), monmap->get_inst(reportto));
+    delete m;
+    return;
+  }
+
+  // neither mon nor osd
+  dout(0) << "ms_handle_failure " << dest << " inst " << inst
+          << ", dropping" << endl;
+  delete m;
+}
--- /dev/null
+#ifndef __OBJECTER_H
+#define __OBJECTER_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+#include "osd/OSDMap.h"
+#include "messages/MOSDOp.h"
+
+#include <list>
+#include <map>
+#include <ext/hash_map>
+using namespace std;
+using namespace __gnu_cxx;
+
+class Context;
+class Messenger;
+class OSDMap;
+class MonMap;
+class Message;
+
+/**
+ * Objecter - client-side helper that turns object reads, writes, zeros,
+ * stats and lock ops into MOSDOp messages, tracks them by transaction id
+ * (tid) and by placement group, and runs the caller's Context callbacks
+ * when replies arrive.
+ */
+class Objecter {
+ public:
+  Messenger *messenger;
+  MonMap *monmap;
+  OSDMap *osdmap;
+
+ private:
+  tid_t last_tid;        // most recently allocated tid
+  int num_unacked;       // modify ops submitted but not yet acked
+  int num_uncommitted;   // modify ops submitted but not yet committed
+
+  /*** track pending operations ***/
+  // read
+ public:
+  // common base: the list of object extents an operation covers
+  class OSDOp {
+  public:
+    list<ObjectExtent> extents;
+    virtual ~OSDOp() {}
+  };
+
+  class OSDRead : public OSDOp {
+  public:
+    bufferlist *bl;                        // destination for the assembled data
+    Context *onfinish;
+    map<tid_t, ObjectExtent> ops;          // outstanding per-extent ops
+    map<object_t, bufferlist*> read_data;  // bits of data as they come back
+
+    // note: clears the caller's buffer
+    OSDRead(bufferlist *b) : bl(b), onfinish(0) {
+      bl->clear();
+    }
+  };
+
+  class OSDStat : public OSDOp {
+  public:
+    tid_t tid;
+    off_t *size;  // where the size goes.
+    Context *onfinish;
+    OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { }
+  };
+
+  // generic modify
+  class OSDModify : public OSDOp {
+  public:
+    int op;
+    // extents is inherited from OSDOp.  (a second 'extents' member used to
+    // be declared here; it shadowed the base member, so code holding an
+    // OSDOp* saw an empty list.)
+    Context *onack;
+    Context *oncommit;
+    map<tid_t, ObjectExtent> waitfor_ack;     // ops still waiting for an ack
+    map<tid_t, eversion_t> tid_version;       // version reported by the ack, per tid
+    map<tid_t, ObjectExtent> waitfor_commit;  // ops still waiting for a commit
+
+    OSDModify(int o) : op(o), onack(0), oncommit(0) {}
+  };
+
+  // write (includes the bufferlist)
+  class OSDWrite : public OSDModify {
+  public:
+    bufferlist bl;
+    OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {}
+  };
+
+
+
+ private:
+  // pending ops
+  hash_map<tid_t,OSDStat*> op_stat;
+  hash_map<tid_t,OSDRead*> op_read;
+  hash_map<tid_t,OSDModify*> op_modify;
+
+  /**
+   * track pending ops by pg
+   * ...so we can cope with failures, map changes
+   */
+  class PG {
+  public:
+    vector<int> acting;
+    set<tid_t> active_tids;  // active ops
+
+    PG() {}
+
+    // primary - where i write
+    int primary() {
+      if (acting.empty()) return -1;
+      return acting[0];
+    }
+    // acker - where i read, and receive acks from
+    int acker() {
+      if (acting.empty()) return -1;
+      if (g_conf.osd_rep == OSD_REP_PRIMARY)
+        return acting[0];
+      else
+        return acting[acting.size() > 1 ? 1:0];
+    }
+  };
+
+  hash_map<pg_t,PG> pg_map;
+
+
+  // look up the PG record, creating it (and filling in its acting set
+  // from the osdmap) on first use
+  PG &get_pg(pg_t pgid) {
+    if (!pg_map.count(pgid))
+      osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting);
+    return pg_map[pgid];
+  }
+  // forget a PG record; it must exist and have no active tids
+  void close_pg(pg_t pgid) {
+    assert(pg_map.count(pgid));
+    assert(pg_map[pgid].active_tids.empty());
+    pg_map.erase(pgid);
+  }
+  void scan_pgs(set<pg_t>& changed_pgs);
+  void kick_requests(set<pg_t>& changed_pgs);
+
+
+ public:
+  Objecter(Messenger *m, MonMap *mm, OSDMap *om) :
+    messenger(m), monmap(mm), osdmap(om),
+    last_tid(0),
+    num_unacked(0), num_uncommitted(0)
+  {}
+  ~Objecter() {
+    // clean up op_*
+    // ***
+  }
+
+  // messages
+ public:
+  void dispatch(Message *m);
+  void handle_osd_op_reply(class MOSDOpReply *m);
+  void handle_osd_stat_reply(class MOSDOpReply *m);
+  void handle_osd_read_reply(class MOSDOpReply *m);
+  void handle_osd_modify_reply(class MOSDOpReply *m);
+  void handle_osd_lock_reply(class MOSDOpReply *m);
+  void handle_osd_map(class MOSDMap *m);
+
+ private:
+  tid_t readx_submit(OSDRead *rd, ObjectExtent& ex);
+  tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0);
+  tid_t stat_submit(OSDStat *st);
+
+  // public interface
+ public:
+  // true while any read or modify is outstanding.
+  // NOTE(review): pending stats (op_stat) are not counted -- confirm
+  // whether that is intentional.
+  bool is_active() {
+    return !(op_read.empty() && op_modify.empty());
+  }
+
+  // med level
+  tid_t readx(OSDRead *read, Context *onfinish);
+  tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit);
+  //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit);
+
+  // even lazier
+  tid_t read(object_t oid, off_t off, size_t len, bufferlist *bl,
+             Context *onfinish,
+             objectrev_t rev=0);
+  tid_t write(object_t oid, off_t off, size_t len, bufferlist &bl,
+              Context *onack, Context *oncommit,
+              objectrev_t rev=0);
+  tid_t zero(object_t oid, off_t off, size_t len,
+             Context *onack, Context *oncommit,
+             objectrev_t rev=0);
+  tid_t stat(object_t oid, off_t *size, Context *onfinish,
+             objectrev_t rev=0);
+
+  tid_t lock(int op, object_t oid, Context *onack, Context *oncommit);
+
+
+  void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst);
+
+};
+
+#endif
--- /dev/null
+#!/usr/bin/perl
+
+# Prepend the standard Ceph license header to a source file, unless the
+# file already contains it.
+#
+# usage: <script> <file>
+#
+# The new contents are written to <file>.new and then renamed over <file>.
+
+use strict;
+my $fn = shift @ARGV;
+die "usage: $0 <file>\n" unless defined $fn;
+
+# slurp the file directly (no shell involved, so odd file names are safe)
+open(my $in, '<', $fn) or die "can't read $fn: $!\n";
+my $f = do { local $/; <$in> };
+close $in;
+$f = '' unless defined $f;
+
+my $header = '// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+';
+
+# only touch files that don't carry the header yet
+unless ($f =~ /Ceph - scalable distributed file system/) {
+  open(my $out, '>', "$fn.new") or die "can't write $fn.new: $!\n";
+  print $out $header;
+  print $out $f;
+  close $out or die "error writing $fn.new: $!\n";
+  rename("$fn.new", $fn) or die "can't rename $fn.new to $fn: $!\n";
+}
+
+
--- /dev/null
+#!/usr/bin/perl
+
+# Expand tabs in a file to spaces and rewrite its emacs "tab-width:" mode
+# line.
+#
+# usage: <script> <new-tab-width> <file>
+#
+# Tabs are expanded using the tab width declared on the file's own mode
+# line (4 if no mode line has been seen yet); the mode line itself is
+# updated to <new-tab-width>.  Output goes to <file>.new, which is then
+# renamed over <file>.
+
+my $tablen = shift @ARGV;
+my $fn = shift @ARGV;
+die "usage: $0 <tablen> <file>\n" unless defined $tablen && defined $fn;
+
+# refuse to continue if the input is unreadable; otherwise we would
+# replace the file with an empty one
+open(my $in, '<', $fn) or die "can't read $fn: $!\n";
+my $f = '';
+my $oldtab = ' ' x 4;
+while (<$in>) {
+  # accept multi-digit widths too
+  if (my ($oldlen) = /\-\*\- .*tab-width:(\d+)/) {
+    print "old length was $oldlen\n";
+    $oldtab = ' ' x $oldlen;
+    s/tab-width:\d+/tab-width:$tablen/;
+  }
+  s/\t/$oldtab/g;
+  $f .= $_;
+}
+close $in;
+
+open(my $out, '>', "$fn.new") or die "can't write $fn.new: $!\n";
+print $out $f;
+close $out or die "error writing $fn.new: $!\n";
+
+rename("$fn.new", $fn) or die "can't rename $fn.new to $fn: $!\n";
--- /dev/null
+#!/bin/sh
+
+rm osddata/*/*\.*
--- /dev/null
+#!/usr/bin/perl
+
+# Filter: for every input line containing "trace: ", print whatever
+# follows the first occurrence of that marker (including the newline).
+
+while (my $line = <>) {
+  if ($line =~ /trace: (.*)/s) {
+    print $1;
+  }
+}
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+
+# Command line:  <xaxis> <var>... - <dir>... [- <filter>...]
+#   xaxis   name of the parameter (as it appears in directory names) used
+#           as the x value
+#   vars    "<what>.<field>" items to extract
+#   dirs    result directories (arguments that are not directories are
+#           dropped)
+#   filt    optional filters; defaults to '.' when none are given
+
+my $xaxis = shift @ARGV;
+
+# everything up to the first '-' is a variable
+my @vars;
+while (@ARGV) {
+  my $arg = shift @ARGV;
+  last if $arg eq '-';
+  push @vars, $arg;
+}
+
+# everything up to the next '-' is a candidate directory
+my @dirs;
+while (@ARGV) {
+  my $arg = shift @ARGV;
+  last if $arg eq '-';
+  if (-d $arg) {
+    push @dirs, $arg;
+  }
+}
+
+# whatever is left are filters
+my @filt = @ARGV;
+@filt = ('.') unless @filt;
+
+print "#xaxis $xaxis
+#vars @vars
+#dirs @dirs
+#filt @filt
+";
+
+# Parse one summary file into a hash of hashes:
+#
+#   $s->{<row key>}->{<column name>} = <value>
+#
+# The first line names the columns (its first whitespace-separated token is
+# discarded).  Every following non-empty line, with a leading '#' removed,
+# is a row: its first token is the row key and the remaining tokens are the
+# column values in order.  When the file name contains "cl", average
+# latencies are derived for each row from the matching sum/count columns.
+#
+# Returns the hash ref; an unreadable file yields a warning and an empty
+# hash.
+sub load_sum {
+  my $fn = shift @_;
+
+  my $s = {};
+
+  my $in;
+  unless (open($in, '<', $fn)) {
+    warn "load_sum: can't open $fn: $!\n";
+    return $s;
+  }
+
+  # header line -> column names
+  my $head = <$in>;
+  $head = '' unless defined $head;
+  chomp($head);
+  my @cols = split(/\s+/, $head);
+  shift @cols;
+
+  # derived latency column => [ sum column, count column ]
+  my %latency = (
+    'wrlat'   => [ 'wrlsum',   'wrlnum'   ],
+    'rlat'    => [ 'rlsum',    'rlnum'    ],
+    'lat'     => [ 'lsum',     'lnum'     ],
+    'latw'    => [ 'lwsum',    'lwnum'    ],
+    'latr'    => [ 'lrsum',    'lrnum'    ],
+    'statlat' => [ 'lstatsum', 'lstatnum' ],
+    'dirlat'  => [ 'ldirsum',  'ldirnum'  ],
+  );
+
+  while (<$in>) {
+    chomp;
+    s/^\#//;
+    next unless $_;
+    my @l = split(/\s+/,$_);
+    my $key = shift @l;
+    for my $f (@cols) {
+      $s->{$key}->{$f} = shift @l;
+    }
+
+    # clnode latency?
+    if ($fn =~ /cl/) {
+      my $row = ($s->{$key} ||= {});
+      for my $name (keys %latency) {
+        my ($sum, $num) = @{ $latency{$name} };
+        $row->{$name} = $row->{$sum} / $row->{$num} if $row->{$num} > 0;
+      }
+    }
+  }
+  close $in;
+
+  return $s;
+}
+
+
+# Collect the requested values and print them as a tab-separated table:
+# one row per x value, one column per (filter, variable) pair.
+my %res;     # x value -> list of collected values, in the order visited
+my @key;     # column labels, in first-seen order
+my %didkey;  # labels already added to @key
+for my $f (@filt) {
+ # a filter is a comma-separated list of terms
+ my @reg = split(/,/, $f);
+ #print "reg @reg\n";
+ for my $d (@dirs) {
+ if ($f ne '.') {
+ # the last path component of the directory is itself a comma-separated
+ # list; the directory is used only if every filter term appears in it
+ my $r = (split(/\//,$d))[-1];
+ my @db = split(/,/, $r);
+ #print "db @db\n";
+ my $ok = 1;
+ for my $r (@reg) {
+
+ $ok = 0 unless grep {$_ eq $r} @db;
+ }
+ next unless $ok;
+ }
+ #next if ($f ne '.' && $d !~ /$reg/);
+ #print "$d\n";
+ # x value comes from "<xaxis>=<digits>" in the directory name
+ my ($x) = $d =~ /$xaxis=(\d+)/;
+
+ for my $v (@vars) {
+ # split "<what>.<field>" at the last dot
+ my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/;
+ #print "$what $field .. $v .. $f.$field\n";
+ my $s = &load_sum("$d/sum.$what");
+
+ #print "\t$v";
+ if ($field =~ /^sum=/) {
+ # "sum=<name>": take <name> (the text after the match) from the 'sum' row
+ #warn "SUM field $field\n";
+ push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'});
+ } else {
+ # otherwise take the field from the 'avgval' row
+ #warn "avg field $field\n";
+ push( @{$res{$x}}, $s->{'avgval'}->{$field} );
+ }
+
+ push( @key, "$f.$field" ) unless $didkey{"$f.$field"};
+ $didkey{"$f.$field"} = 1;
+
+ # deviation columns are currently disabled (note the "0 &&")
+ if (0 && exists $s->{'avgvaldevt'}) {
+ push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} );
+ push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"};
+ $didkey{"$f.$field.dev"} = 1;
+ }
+ }
+ }
+}
+
+# header line, then one line per x value in ascending numeric order
+print join("\t", "#", @key) . "\n";
+for my $x (sort {$a <=> $b} keys %res) {
+ print join("\t", $x, @{$res{$x}}) . "\n";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a log for auth_pin / auth_unpin events and report every object that
+# still has a non-zero pin count at the end, together with the log lines
+# that touched it while it was pinned.
+
+my %pin;        # object -> outstanding pin count
+my %hist;       # object -> log lines recorded since it was pinned
+my @pins;       # objects with outstanding pins
+my $lineno = 1;
+
+while (<>) {
+
+  #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1
+
+  if (/adjust_nested_auth_pins/) {
+    my ($obj) = /\[(\w+ \d+) /;
+    # only interesting for objects we are already tracking
+    if (defined $pin{$obj}) {
+      $hist{$obj} .= "$lineno: $_";
+    }
+  }
+
+  # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0
+
+  if (/auth_pin /) {
+    my ($obj) = /\[(\w+ \d+) /;
+    $pin{$obj}++;
+    $hist{$obj} .= "$lineno: $_";
+    if (!grep {$_ eq $obj} @pins) {
+      push @pins, $obj;
+    }
+  }
+
+  # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0
+
+  if (/auth_unpin/) {
+    my ($obj) = /\[(\w+ \d+) /;
+    $pin{$obj}--;
+    $hist{$obj} .= "$lineno: $_";
+    if (!$pin{$obj}) {
+      # fully unpinned: forget it
+      delete $hist{$obj};
+      delete $pin{$obj};
+      @pins = grep {$_ ne $obj} @pins;
+    }
+  }
+
+  $lineno++;
+}
+
+foreach my $obj (@pins) {
+  print "---- count $pin{$obj} on $obj
+$hist{$obj}
+";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a buffer debug log and report leaks.
+#
+# Tracks, by the first 0x... address on each line:
+#   buffer.cons / buffer.des          buffer objects
+#   bufferlist.cons / bufferlist.des  bufferlist objects
+#   buffer.malloc / buffer.free       raw data allocations
+#   buffer.get / buffer.put           reference counts
+# and prints whatever is still live at end of input.  Dies on a buffer
+# destroyed without being constructed or while still referenced, and on a
+# free without a matching malloc.
+
+use strict;
+my %buffers;
+my %bufferlists;
+my %ref;
+my %mal;
+my $l = 1;
+while (<>) {
+  #print "$l: $_";
+
+  if (/^buffer\.cons /) {
+    my ($x) = /(0x\S+)/;
+    $buffers{$x} = 1;
+  }
+  if (/^buffer\.des /) {
+    my ($x) = /(0x\S+)/;
+    die "des without cons at $l: $_" unless $buffers{$x};
+    delete $buffers{$x};
+    die "des with ref>0 at $l: $_" unless $ref{$x} == 0;
+    delete $ref{$x};
+  }
+
+  if (/^bufferlist\.cons /) {
+    my ($x) = /(0x\S+)/;
+    $bufferlists{$x} = 1;
+  }
+  if (/^bufferlist\.des /) {
+    my ($x) = /(0x\S+)/;
+    warn "des without cons at $l: $_" unless $bufferlists{$x};
+    delete $bufferlists{$x};
+  }
+
+
+  if (/^buffer\.malloc /) {
+    my ($x) = /(0x\S+)/;
+    $mal{$x} = 1;
+  }
+  if (/^buffer\.free /) {
+    my ($x) = /(0x\S+)/;
+    die "free without malloc at $l: $_" unless $mal{$x};
+    delete $mal{$x};
+  }
+
+  if (/^buffer\.get /) {
+    my ($x) = /(0x\S+)/;
+    $ref{$x}++;
+  }
+  # this test used to repeat /^buffer\.get /, so every get was cancelled
+  # immediately and the ref count was always zero.
+  # NOTE(review): assumes the log tags releases as "buffer.put " -- confirm
+  # against the buffer debug output.
+  if (/^buffer\.put /) {
+    my ($x) = /(0x\S+)/;
+    $ref{$x}--;
+  }
+
+  $l++;
+}
+
+for my $x (keys %bufferlists) {
+  print "leaked bufferlist $x\n";
+}
+
+for my $x (keys %buffers) {
+  print "leaked buffer $x ref $ref{$x}\n";
+}
+
+for my $x (keys %mal) {
+  print "leaked buffer dataptr $x ref $ref{$x}\n";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a block-device log for bios that were submitted but never finished
+# or (successfully) cancelled.  A bio is identified by the last 0x...
+# token inside its "bio(...)" description.
+
+use strict;
+my %op;   # bio -> line number of its submission
+
+my $line = 0;
+while (<>) {
+  #print $line . $_ if /0x8d4f6a0/;
+  chomp;
+  $line++;
+
+  #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0)
+  if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) {
+    $op{$bio} = $line;
+  }
+
+  # cancel
+  #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8)
+  # The FAILED test must stay separate from the capturing match: written
+  # as "my ($bio) = /.../ && !(/FAILED/)", '&&' binds tighter than '=', so
+  # $bio received the boolean result instead of the captured address and
+  # the condition was always true.
+  if (!/FAILED/) {
+    if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/) {
+      delete $op{$bio};
+    }
+  }
+
+  # finish
+  #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8)
+  if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) {
+    delete $op{$bio};
+  }
+
+}
+
+for my $bio (keys %op) {
+  print "---- lost bio $bio\n";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan an OSD log for modify ops that were started (do_op) but never
+# committed or forwarded, and list them with their originator.
+
+use strict;
+my %op;   # op token -> who sent it
+
+# op numbers that are tracked
+my @tracked = (2, 11, 12, 14, 15);
+
+# any of these retires an op:
+#osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788)
+#osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980)
+# ...or the op being forwarded to another osd
+my @retire = (
+  qr/op_modify_commit.* (\w+)\)/,
+  qr/rep_modify_commit.* (\w+)\)/,
+  qr/sending (\w+) to osd/,
+);
+
+my $line = 0;
+while (my $text = <>) {
+  #print "$line: $text";
+  $line++;
+
+  #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]]
+  if (my ($from, $opno, $oid, $op) = $text =~ /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) {
+    if (grep { $opno == $_ } @tracked) {
+      $op{$op} = $from;
+    }
+  }
+
+  foreach my $re (@retire) {
+    if (my ($op) = $text =~ $re) {
+      delete $op{$op};
+    }
+  }
+
+}
+
+foreach my $op (keys %op) {
+  print "---- lost op $op $op{$op}\n";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a client log for objecter writes that never received an ack and/or
+# a commit.  Ops are keyed by "<who>.<tid>"; the stored value is the line
+# number of the submission.
+
+use strict;
+my %ack;      # ops still waiting for any reply
+my %commit;   # ops still waiting for a commit
+
+my $line = 0;
+while (<>) {
+  #print "$line: $_";
+  $line++;
+
+  #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000
+  if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) {
+    # print "$who.$tid\n";
+    $ack{"$who.$tid"} = $line;
+    $commit{"$who.$tid"} = $line;
+  }
+
+  #client1.objecter handle_osd_write_reply 304 commit 0
+  #client1.objecter handle_osd_write_reply 777 commit 1
+  if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) {
+    # print "$who.$tid\n";
+    # any reply counts as the ack; only "commit 1" settles the commit
+    delete $ack{"$who.$tid"};
+    delete $commit{"$who.$tid"} if $commit;
+  }
+
+}
+
+for my $op (keys %commit) {
+  print "---- lost commit $op $commit{$op}\n";
+}
+for my $op (keys %ack) {
+  # report from %ack itself (this used to index %commit)
+  print "---- lost ack $op $ack{$op}\n";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a log for path_pinned / path_unpinned events and report every
+# dentry (keyed by "<dentry name> <dir number>") whose pin count is still
+# non-zero at the end, with the lines that touched it.
+
+my %pin;        # key -> outstanding pin count
+my %hist;       # key -> log lines recorded since it was pinned
+my @pins;       # keys with outstanding pins
+my $lineno = 1;
+
+while (<>) {
+
+  if (/path_pinned /) {
+    my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /;
+    $what = "$dname $dir";
+    #print "$lineno pin $what\n";
+    $pin{$what}++;
+    $hist{$what} .= "$lineno: $_";
+    if (!grep {$_ eq $what} @pins) {
+      push @pins, $what;
+    }
+  }
+
+  if (/path_unpinned/) {
+    my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /;
+    $what = "$dname $dir";
+    #print "$lineno unpin $what\n";
+    $pin{$what}--;
+    $hist{$what} .= "$lineno: $_";
+    if (!$pin{$what}) {
+      # fully unpinned: forget it
+      delete $hist{$what};
+      delete $pin{$what};
+      @pins = grep {$_ ne $what} @pins;
+    }
+  }
+
+  $lineno++;
+}
+
+foreach my $key (@pins) {
+  print "---- count $pin{$key} on $key
+$hist{$key}
+";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a log for requests that were started (request_start) but never
+# finished or forwarded, and print each one with the lines recorded for it.
+# Requests are keyed by the first 0x... token on the line.
+
+my %waiting; # context => what where what is "inode ..." or "dir ..."
+my %hist; # context => history since waited
+my @waiting; # contexts still outstanding, in first-seen order
+
+my $line = 0;
+while (<>) {
+
+ #print $line . $_ if /0x8d4f6a0/;
+ $line++;
+ if (/request_start/) {
+ my ($c) = /(0x\w+)/;
+ # $' is the text following the most recent successful match
+ my ($what) = $'; #';
+ chomp $what;
+ #print "$line add_waiter $c $what\n" if /0x8d4f6a0/;
+ # keep only the first description seen for a context
+ $waiting{$c} = $what
+ if $what && !$waiting{$c};
+ $hist{$c} .= "$line: $_";
+ unless (grep {$_ eq $c} @waiting) {
+ push( @waiting, $c );
+ }
+ }
+ #if (/finish_waiting/) {
+ # my ($c) = /(0x\w+)/;
+ # $hist{$c} .= "$line: $_";
+ #}
+ # a finish or a forward retires the request
+ if (/request_finish/ ||
+ /request_forward/) {
+ my ($c) = /(0x\w+)/;
+ #print "took\n" if /0x8d4f6a0/;
+ delete $waiting{$c};
+ delete $hist{$c};
+ @waiting = grep {$_ ne $c} @waiting;
+ }
+}
+
+# whatever is left was never retired
+for my $c (@waiting) {
+ print "---- lost request $c $waiting{$c}
+$hist{$c}
+";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Scan a log for waiters that were added (add_waiter) but never taken
+# (take_waiting ... took), and print each with its recorded history.
+# Waiters are keyed by the first 0x... token on the line.
+
+my %waiting; # context => what it waits on (text between " on " and the last ']')
+my %hist;    # context => history since waited
+my @waiting; # contexts still waiting
+
+my $lineno = 0;
+while (<>) {
+  #print $lineno . $_ if /0x8d4f6a0/;
+  $lineno++;
+
+  if (/add_waiter/) {
+    my ($ctx) = /(0x\w+)/;
+    my ($target) = / on (.*\])/;
+    #print "$lineno add_waiter $ctx $target\n" if /0x8d4f6a0/;
+    # remember only the first target seen for a context
+    if ($target && !$waiting{$ctx}) {
+      $waiting{$ctx} = $target;
+    }
+    $hist{$ctx} .= "$lineno: $_";
+    if (!grep {$_ eq $ctx} @waiting) {
+      push @waiting, $ctx;
+    }
+  }
+
+  #if (/finish_waiting/) {
+  #  my ($c) = /(0x\w+)/;
+  #  $hist{$c} .= "$line: $_";
+  #}
+
+  if (/take_waiting/) {
+    my ($ctx) = /(0x\w+)/;
+    if (/SKIPPING/) {
+      # still waiting; just note it
+      $hist{$ctx} .= "$lineno: $_";
+    }
+    elsif (/took/) {
+      delete $waiting{$ctx};
+      delete $hist{$ctx};
+      @waiting = grep {$_ ne $ctx} @waiting;
+    }
+    else {
+      die "i don't understand: $_";
+    }
+  }
+}
+
+foreach my $ctx (@waiting) {
+  print "---- lost waiter $ctx $waiting{$ctx}
+$hist{$ctx}
+";
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Print every input line that mentions an extent "start~length" covering
+# the given block number.
+#
+# usage: <script> <block> [file ...]
+
+use strict;
+
+# 'shift ARGV' (a bareword) does not compile under strict; the array is meant
+my $block = shift @ARGV;
+die "usage: $0 <block> [file ...]\n" unless defined $block && int $block;
+
+while (<>) {
+  my $yes = 0;
+  # /g so that every extent on the line is examined, not just the first
+  for my $x (/(\d+\~\d+)/g) {
+    my ($s,$l) = split(/\~/,$x);
+    $yes = 1 if ($block >= $s && $block < $s+$l);
+  }
+  print if $yes;
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Merge adjacent read/write operations in a trace.
+#
+# The input is a sequence of ops: a line consisting only of lowercase
+# letters names the op, and the lines after it (up to the next such line)
+# are its arguments.  Consecutive "read" or "write" ops are folded into one
+# when the next op has the same name and the same first argument, and its
+# third argument equals this op's third plus second argument.
+# NOTE(review): that test suggests the arguments are (id, length, offset)
+# -- confirm against the trace format.
+
+use strict;
+
+# whole input, one element per line (newlines kept)
+my @file = <>;
+
+# Remove the next op from the front of @file and return it as a list:
+# the op-name line followed by its argument lines.
+sub get_op {
+ my @op = shift @file;
+ while (@file &&
+ $file[0] !~ /^[a-z]+$/) {
+ push( @op, shift @file );
+ }
+ #print "op = ( @op )\n";
+ return @op;
+}
+
+my $n = 0; # number of merges performed
+while (@file) {
+ my ($op, @args) = &get_op;
+ while ($op eq "read\n" ||
+ $op eq "write\n") {
+ die unless scalar(@args) == 3;
+ # look ahead at the following op
+ my ($nop, @nargs) = &get_op;
+ if ($nop eq $op
+ && ($args[0] == $nargs[0] )
+ && ($args[2] + $args[1] == $nargs[2])
+ ) {
+ die unless scalar(@nargs) == 3;
+ # fold it in: add the second arguments; the numeric add drops the
+ # trailing newline, so put it back
+ $args[1] += $nargs[1];
+ $args[1] .= "\n";
+ die unless scalar(@args) == 3;
+ #print STDOUT "combining $n $op @args\n";
+ $n++;
+ } else {
+# print STDERR "not combinging\n";
+ # not mergeable: put the look-ahead op back and emit what we have
+ unshift( @file, $nop, @nargs );
+ die unless scalar(@args) == 3;
+ last;
+ }
+ }
+ print $op;
+ print join('', @args);
+}
--- /dev/null
+#!/usr/bin/perl
+
+# Launch wrapper: the process whose MPD_JRANK equals the rank given as the
+# first argument runs ./newsyn with the gprof helper preloaded; every other
+# rank runs ./newsyn.nopg.  Remaining arguments are passed through.
+
+my $rank = shift @ARGV;
+my $args = join(' ',@ARGV);
+
+$c = ($rank == $ENV{MPD_JRANK})
+  ? "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"
+  : "./newsyn.nopg $args";
+
+#print "$rank: $c\n";
+system $c;
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use Data::Dumper;
+
+=item sample input file
+
+# hi there
+{
+ # startup
+ 'n' => 30, # mpi nodes
+ 'sleep' => 10, # seconds between runs
+ 'nummds' => 1,
+ 'numosd' => 8,
+ 'numclient' => 400,#[10, 50, 100, 200, 400],
+
+ # parameters
+ 'fs' => [ 'ebofs', 'fakestore' ],
+ 'until' => 150, # --syn until $n ... when to stop clients
+ 'writefile' => 1,
+ 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ],
+ 'writefile_mb' => 1000,
+
+ 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0',
+
+ # for final summation (script/sum.pl)
+ 'start' => 30,
+ 'end' => 120,
+
+ '_psub' => 'alc.tp' # switch to psub mode!
+};
+
+=cut
+
+# ---- command line -------------------------------------------------------
+#   [--clean] [--srun] [--nobg] jobs/<job> <tag> [<fake>]
+# Each option is tested once, in this order, so they must be given in this
+# order to be recognized.
+my $usage = "script/runset.pl [--clean] jobs/some/job blah\n";
+
+my $clean;
+my $use_srun;
+my $nobg = '&'; # '&' by default; set to '' by --nobg
+my $in = shift || die $usage;
+if ($in eq '--clean') {
+ $clean = 1;
+ $in = shift || die $usage;
+}
+if ($in eq '--srun') {
+ $use_srun = 1;
+ $in = shift || die $usage;
+}
+if ($in eq '--nobg') {
+ $nobg = '';
+ $in = shift || die $usage;
+}
+my $tag = shift || die $usage;
+my $fake = shift;
+
+
+# the job file must live under jobs/; $job is the path below it and
+# $jname its last component (or all of $job if it has no '/')
+my ($job) = $in =~ /^jobs\/(.*)/;
+my ($jname) = $job =~ /\/(\w+)$/;
+$jname ||= $job;
+die "not jobs/?" unless defined $job;
+# results go under log/<job>.<tag>
+my $out = "log/$job.$tag";
+my $relout = "$job.$tag";
+
+
+my $cwd = `/bin/pwd`;
+chomp($cwd);
+
+
+
+print "# --- job $job, tag $tag ---\n";
+
+
+# get input
+# the job file is perl source that must evaluate to a reference (see the
+# sample above); on failure, run "perl -c" on it to show the syntax error
+my $raw = `cat $in`;
+my $sim = eval $raw;
+unless (ref $sim) {
+ print "bad input: $in\n";
+ system "perl -c $in";
+ exit 1;
+}
+
+# prep output
+system "mkdir -p $out" unless -d "$out";
+
+# keep a copy of the job description next to the results
+open(W, ">$out/in");
+print W $raw;
+close W;
+
+# 'comb' is pulled out of the job description so it is not treated as a
+# simulation parameter
+my $comb = $sim->{'comb'};
+delete $sim->{'comb'};
+my %filters;
+my @fulldirs;
+
+
+# Kill any running newsyn processes via mpiexec and restart mpd, after a
+# three second pause.
+sub reset {
+  print "reset: restarting mpd in 3 seconds\n";
+  my $cmd = "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)";
+  system($cmd);
+  print "reset: done\n";
+}
+
+
+# When the host name contains "alc" and --srun was not given, force the
+# '_psub' setting of the job to jobs/alc.tp.
+if (`hostname` =~ /alc/ && !$use_srun) {
+ print "# this looks like alc\n";
+ $sim->{'_psub'} = 'jobs/alc.tp';
+}
+
+
+# Expand a job description into the list of concrete parameter sets.
+#
+#   $sim  the job description (hash ref); keys starting with '_' are not
+#         parameters
+#   $fix  parameters already fixed by an outer call (hash ref, optional)
+#
+# Scalar values are copied as they are, hash values are ignored, and any
+# other reference (in practice an array of alternatives) is a parameter to
+# vary.  The first unfixed such key (in sorted order) is varied here: the
+# function recurses once per alternative with that value fixed.  When
+# nothing is left to vary, the '_dep' list, if present, is applied and the
+# finished parameter set is returned.
+#
+# Returns a list of hash refs, one per combination.
+sub iterate {
+ my $sim = shift @_;
+ my $fix = shift @_ || {};
+ my $vary;
+ my @r;
+
+ my $this;
+ for my $k (sort keys %$sim) {
+ next if $k =~ /^_/;
+ if (defined $fix->{$k}) {
+ $this->{$k} = $fix->{$k};
+ }
+ elsif (ref $sim->{$k} eq 'HASH') {
+ # nothing
+ }
+ elsif (!(ref $sim->{$k})) {
+ $this->{$k} = $sim->{$k};
+ }
+ else {
+ #print ref $sim->{$k};
+ # remember only the first parameter that still needs varying
+ if (!(defined $vary)) {
+ $vary = $k;
+ }
+ }
+ }
+
+ if ($vary) {
+ #print "vary $vary\n";
+ # the recursive call copies the fixed values into a fresh hash, so
+ # reusing $this across iterations is safe
+ for my $v (@{$sim->{$vary}}) {
+ $this->{$vary} = $v;
+ push(@r, &iterate($sim, $this));
+ }
+ } else {
+
+ # '_dep' is a flat list of (name, expression) pairs; each expression
+ # has its $foo references rewritten to $this->{'foo'} and is then
+ # eval'ed to compute the dependent parameter
+ if ($sim->{'_dep'}) {
+ my @s = @{$sim->{'_dep'}};
+ while (@s) {
+ my $dv = shift @s;
+ my $eq = shift @s;
+
+ $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg;
+ $this->{$dv} = eval $eq;
+ #print "$dv : $eq -> $this->{$dv}\n";
+ }
+ }
+
+ push(@r, $this);
+ }
+ return @r;
+}
+
+
+
+# Launch (or prepare) one run for a single parameter set.
+#  $h - hash ref of concrete parameter values
+# Reads the file-level $sim, $comb, $out, $relout, $jname, $clean, $use_srun,
+# $nobg, $fake and $cwd; appends to @fulldirs and %filters.
+# Returns undef when the run is already done, when only a psub script was
+# written, or on a dry run ($fake); otherwise the status of the launch command.
+sub run {
+ my $h = shift @_;
+
+ # Name the run after the values of the varying (ARRAY-valued) keys.
+ my @fn;
+ my @filt;
+ my @vals;
+ for my $k (sort keys %$sim) {
+ next if $k =~ /^_/;
+ next unless ref $sim->{$k} eq 'ARRAY';
+ push(@fn, "$k=$h->{$k}");
+ push(@vals, $h->{$k});
+ # the comb x key is left out of the filter string
+ next if $comb && $k eq $comb->{'x'};
+ push(@filt, "$k=$h->{$k}");
+ }
+ my $keys = join(",", @fn);
+ $keys =~ s/ /_/g;
+ my $fn = $out . '/' . $keys;
+ my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys;
+
+ push( @fulldirs, "" . $fn );
+
+
+ # filters
+ $filters{ join(',', @filt) } = 1;
+
+
+ #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post");
+ # a .done marker in the run directory means this run already completed
+ if (-e "$fn/.done") {
+ print "already done.\n";
+ return;
+ }
+ system "rm -r $fn" if $clean && -d "$fn";
+ system "mkdir $fn" unless -d "$fn";
+
+ # Assemble the newsyn command line from the parameter set.
+ my $e = './newsyn';
+ #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs';
+ my $c = "$e";
+ $c .= " --mkfs" unless $h->{'no_mkfs'};
+ $c .= " --$h->{'fs'}";
+ $c .= " --syn until $h->{'until'}" if $h->{'until'};
+
+ $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'};
+ $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'};
+ $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'};
+ $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'};
+
+ if ($h->{'ebofs_freelist'}) {
+ system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist";
+ $c .= " --osd_age_time -1";
+ }
+
+ # options that are passed straight through as "--name value" when defined
+ for my $k ('nummds', 'numclient', 'numosd', 'kill_after',
+ 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits',
+ 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife',
+ 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr',
+ 'mds_cache_size','mds_log_max_len',
+ 'mds_local_osd',
+ 'osd_age_time','osd_age',
+ 'osd_rep',
+ 'osd_pad_pg_log','ebofs_realloc',
+ 'osd_balance_reads',
+ 'tcp_multi_out',
+ 'client_cache_stat_ttl','client_cache_readdir_ttl',
+ 'client_oc',
+ 'fake_osdmap_updates',
+ 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms',
+ 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc',
+ 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep',
+ 'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep',
+ 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') {
+ $c .= " --$k $h->{$k}" if defined $h->{$k};
+ }
+
+ $c .= ' ' . $h->{'custom'} if $h->{'custom'};
+
+ $c .= " --log_name $relout/$keys";
+
+ # Post-processing script: sums the per-daemon logs over [start, end] and
+ # leaves a .post marker.  It is written to $fn/sh.post and run after the job.
+ my $post = "#!/bin/sh
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log
+script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl
+touch $fn/.post
+";
+ open(O,">$fn/sh.post");
+ print O $post;
+ close O;
+
+ # srun time limit in minutes: kill_after (seconds) rounded up past the next minute
+ my $killmin = 1 + int ($h->{'kill_after'} / 60);
+
+ # run under bash with core dumps disabled
+ $c = "bash -c \"ulimit -c 0 ; $c\"";
+ #$c = "bash -c \"$c\"";
+
+ my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest";
+ my $mpiexec = "mpiexec -l -n $h->{'n'}";
+ my $launch;
+ if ($use_srun) {
+ $launch = $srun;
+ } else {
+ $launch = $mpiexec;
+ }
+
+ if ($sim->{'_psub'}) {
+ # template!
+ # Fill in the template's $CWD/$NAME/$NUM/$OUT/$DONE/$CMD placeholders and
+ # write the result to $out/$name; nothing is launched in this mode.
+ my $tp = `cat $sim->{'_psub'}`;
+ $tp =~ s/\$CWD/$cwd/g;
+ $tp =~ s/\$NAME/$name/g;
+ $tp =~ s/\$NUM/$h->{'n'}/g;
+ $tp =~ s/\$OUT/$fn\/o/g;
+ $tp =~ s/\$DONE/$fn\/.done/g;
+ $tp =~ s/\$CMD/$c/g;
+ open(O,">$out/$name");
+ print O $tp;
+ close O;
+ print "\npsub $out/$name\n";
+ return;
+ } else {
+ # run
+ # output goes to $fn/o; .done is only touched when the command succeeds
+ my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";#
+ #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done";
+ print "$cmd $nobg\n";
+ my $r = undef;
+ unless ($fake) {
+ # optional _pre/_post commands are run through the same launcher
+ if ($sim->{'_pre'}) {
+ print "pre: $launch $sim->{'_pre'}\n";
+ system "$launch $sim->{'_pre'}";
+ }
+ $r = system $cmd;
+ if ($sim->{'_post'}) {
+ print "post: $launch $sim->{'_post'}\n";
+ system "$launch $sim->{'_post'}";
+ }
+ if ($r) {
+ print "r = $r\n";
+ #&reset;
+ }
+ system "sh $fn/sh.post";
+ }
+ return $r;
+ }
+}
+
+
+
+# Expand the job into parameter sets and run them one after another, printing
+# a progress header per run and counting the runs whose launch status was
+# nonzero.
+my @r = &iterate($sim);
+my $n = scalar(@r);
+my $c = 1;
+my %r;
+my $nfailed = 0;
+for my $h (@r) {
+  chomp(my $d = `date`);
+  $d =~ s/ P.T .*//;
+  my $progress = "# === $c/$n";
+  $progress .= " ($nfailed failed)" if $nfailed;
+  print "$progress $d: ";
+
+  my $r = &run($h);
+
+  # undef: nothing was launched, so skip both the failure count and the pause
+  if (defined $r) {
+    $nfailed++ if $r;
+    print "sleep $h->{'sleep'}\n";
+    sleep $h->{'sleep'};
+  }
+
+  $c++;
+}
+print "$nfailed failed\n";
+
+
+# When the job has a 'comb' section: list the post scripts, combine the
+# per-run results with script/comb.pl into $out/c, and emit a gnuplot script
+# with one plot per variable and one curve per filter.
+my @comb;
+if ($comb) {
+  my $x = $comb->{'x'};
+  my @vars = @{$comb->{'vars'}};
+
+  print "\n\n# post\n";
+  print "sh $_/sh.post\n" for @fulldirs;
+
+  my @filters = sort keys %filters;
+  my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c";
+  print "$cmd\n";
+  open(O,">$out/comb");
+  print O "$cmd\n";
+  close O;
+  system $cmd;
+
+  print "\n\n";
+
+  my $plot;
+  $plot .= "set data style linespoints;\n";
+  my $nvars = scalar(@vars);
+  for my $vi (0 .. $#vars) {
+    # column 1 is x; this variable's columns start at 2+$vi, one per filter,
+    # spaced $nvars apart
+    my $column = 2 + $vi;
+    my @curves;
+    for my $f (@filters) {
+      my $title = $f;
+      if ($comb->{'maptitle'}) {
+        my $map = $comb->{'maptitle'};
+        for my $from (keys %{$map}) {
+          my $to = $map->{$from};
+          $title =~ s/$from/$to/;
+        }
+      }
+      push(@curves, "\"$out/c\" u 1:$column t \"$title\"");
+      $column += $nvars;
+    }
+    $plot .= "# $vars[$vi]\nplot " . join(", ", @curves) . ";\n\n";
+  }
+  print $plot;
+  open(O,">$out/plot");
+  print O $plot;
+  close O;
+}
+
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+# Only rows with a time in [$starttime, $endtime] are used; a non-positive
+# $endtime means no upper bound.
+my $starttime = 1;
+my $endtime = -1;
+
+# -avg: print each row divided by the number of inputs seen at that time
+my $avgrows = 0;
+
+# leading -options; everything after them is an input file
+while ($ARGV[0] =~ /^-/) {
+  $_ = shift @ARGV;
+  if ($_ eq '-avg')   { $avgrows = 1; next; }
+  if ($_ eq '-start') { $starttime = shift @ARGV; next; }
+  if ($_ eq '-end')   { $endtime = shift @ARGV; next; }
+  die "i don't understand arg $_";
+}
+my @files = @ARGV;
+
+# A single argument that still contains '*' (i.e. the shell did not expand it)
+# is expanded here: the first '*' is dropped and every directory entry whose
+# name starts with what is left is used.
+if (scalar(@files) == 1 && $files[0] =~ /\*/) {
+  my ($dir, $pat) = $files[0] =~ /^(?:(.*)\/)?([^\/]+)$/;
+  @files = ();
+  if (defined $pat) {
+    # a bare pattern without a directory part refers to the current directory
+    $dir = '.' unless defined $dir;
+    $pat =~ s/\*//;
+    if (opendir(D, ($dir eq '' ? '/' : $dir))) {
+      for my $f (readdir(D)) {
+        # \Q..\E: match the prefix literally ('.' in a name is not a wildcard)
+        next unless $f =~ /^\Q$pat\E/;
+        push(@files, "$dir/$f");
+      }
+      closedir(D);
+    } else {
+      warn "can't read directory $dir: $!\n";
+    }
+  }
+}
+
+# Slurp the lines of all input files into @data.  Unreadable files are
+# reported and skipped; the 3-argument open keeps odd file names from being
+# interpreted as open modes.
+my @data;
+for my $f (@files) {
+  unless (open(I, '<', $f)) {
+    warn "can't read $f: $!\n";
+    next;
+  }
+  push( @data, <I> );
+  close I;
+}
+
+my %sum; # time -> name -> val
+my %col; # colnum -> name .. colnums start at 0 (time doesn't count)
+my %min;    # name -> smallest value seen
+my %max;    # name -> largest value seen
+my %avg;    # name -> total over all used rows
+my %tcount; # time -> number of rows seen for that time
+my $files;  # largest row count seen for any single time
+for (@data) {
+  chomp;
+  my @r = split(/\s+/,$_);
+  my $r = shift @r;    # first field: '#' on a heading line, otherwise the time
+
+  # column headings?
+  if ($r =~ /^\#/) {
+    my $num = 0;
+    while (my $name = shift @r) {
+      $col{$num} = $name;
+      $num++;
+    }
+    next;
+  }
+
+  # skip rows whose time is 0 or non-numeric, or outside the window
+  next unless int $r;
+  next if $r < $starttime;
+  next if $endtime > 0 && $r > $endtime;
+
+  $tcount{$r}++;
+  $files = $tcount{$r} if $tcount{$r} > $files;
+  #print "$r: @r\n";
+  my $i = 0;
+  while (@r) {
+    my $v = shift @r;
+    my $name = $col{$i};
+    $sum{$r}->{$name} += $v; # if $v > 0;
+
+    $min{$name} = $v
+      if (!(defined $min{$name}) || $min{$name} > $v);
+    # the defined test matters: without it a column whose values are all
+    # <= 0 never gets a maximum
+    $max{$name} = $v
+      if (!(defined $max{$name}) || $max{$name} < $v);
+
+    $avg{$name} += $v;
+    $i++;
+  }
+}
+
+## dump
+# one heading line, then one line per time: the per-column sums, or those
+# sums divided by the row count for that time when -avg was given
+my @c = sort {$a <=> $b} keys %col;
+# cols
+print join("\t", '#', @col{@c}) . "\n";
+my $n = 0;
+for my $k (sort {$a <=> $b} keys %sum) {
+  my @cells = map { $sum{$k}->{$col{$_}} } @c;
+  @cells = map { $_ / $tcount{$k} } @cells if $avgrows;
+  print join("\t", $k, @cells) . "\n";
+  $n++;
+}
+
+my $rows = $n || 1;
+#my $files = $tcount{$starttime};
+my %avgval;    # name -> total / (rows * files)
+
+## devt
+#warn "rows $rows, files $files\n";
+my %avgvalvart; # std dev of each col avg, over time
+for my $k (keys %avg) {
+  my $mean = $avg{$k} / ($rows*$files);
+  $avgval{$k} = $mean;
+
+  # sum of squared deviations of the per-time average from the overall mean
+  my $sqsum = 0.0;
+  for my $t (sort {$a <=> $b} keys %sum) {
+    my $delta = $sum{$t}->{$k} / $files - $mean;
+    $sqsum += $delta * $delta;
+  }
+
+  $avgvalvart{$k} = $sqsum / $rows;
+}
+
+
+
+
+# Summary: one '#'-prefixed line per statistic, one cell per column.
+my @names = @col{@c};
+print "\n";
+print join("\t", '#', @names) . "\n";
+print join("\t", '#minval', @min{@names}) . "\n";
+print join("\t", '#maxval', @max{@names}) . "\n";
+print join("\t", '#rows', ($rows) x @c) . "\n";
+print join("\t", '#files', ($files) x @c) . "\n";
+print join("\t", '#sum', @avg{@names}) . "\n";
+print join("\t", '#avgval', @avgval{@names}) . "\n";
+
+print join("\t", '#avgvalvart', @avgvalvart{@names}) . "\n";
+print join("\t", '#avgvaldevt', map { sqrt($_) } @avgvalvart{@names}) . "\n";
+
+print join("\t", '#avgsum', map { $_ / $rows } @avg{@names}) . "\n";
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "client/Client.h"
+#include "client/fuse.h"
+
+#include "msg/TCPMessenger.h"
+
+#include "common/Timer.h"
+
+#include <envz.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Mount a Ceph client through fuse: bring up the TCP messenger, create and
+// mount a client, hand control to fuse, then unmount and tear everything down.
+int main(int argc, char **argv, char *envp[]) {
+
+  //cerr << "tcpfuse starting " << myrank << "/" << world << endl;
+  vector<char*> opts;
+  argv_to_vec(argc, argv, opts);
+  parse_config_options(opts);
+
+  // rebuild argc/argv from the vector: these are the args for fuse
+  vec_to_argv(opts, argc, argv);
+
+  // start up tcpmessenger; give up if findns reports failure
+  tcpaddr_t nsa;
+  if (tcpmessenger_findns(nsa) < 0)
+    exit(1);
+  tcpmessenger_init();
+  tcpmessenger_start();
+  tcpmessenger_start_rankserver(nsa);
+
+  TCPMessenger *msgr = new TCPMessenger(MSG_ADDR_CLIENT_NEW);
+  Client *client = new Client(msgr);
+  client->init();
+
+  // start up fuse
+  // use my argc, argv (make sure you pass a mount point!)
+  cout << "mounting" << endl;
+  client->mount();
+
+  cerr << "starting fuse on pid " << getpid() << endl;
+  ceph_fuse_main(client, argc, argv);
+  cerr << "fuse finished on pid " << getpid() << endl;
+
+  client->unmount();
+  cout << "unmounted" << endl;
+  client->shutdown();
+
+  delete client;
+
+  // wait for the messenger to finish, then shut it down
+  tcpmessenger_wait();
+  tcpmessenger_shutdown();
+
+  return 0;
+}
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "mon/Monitor.h"
+#include "client/Client.h"
+#include "client/SyntheticClient.h"
+
+#include "msg/TCPMessenger.h"
+
+#include "common/Timer.h"
+
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// Context that just prints the result code it is finished with.
+class C_Test : public Context {
+public:
+  void finish(int result) {
+    cout << "C_Test->finish(" << result << ")" << endl;
+  }
+};
+
+
+#include "msg/mpistarter.cc"
+
+utime_t tick_start;
+int tick_count = 0;
+
+// Periodic tick: logs the time elapsed since tick_start and re-arms itself.
+// The next event is scheduled at tick_start + tick_count seconds.
+class C_Tick : public Context {
+public:
+  void finish(int) {
+    utime_t elapsed = g_clock.now() - tick_start;
+    dout(0) << "tick +" << g_conf.tick << " -> " << elapsed << " (" << tick_count << ")" << endl;
+
+    tick_count += g_conf.tick;
+
+    utime_t deadline = tick_start;
+    deadline.sec_ref() += tick_count;
+    g_timer.add_event_at(deadline, new C_Tick);
+  }
+};
+
+// Timer event that terminates the process with exit status 1 (scheduled by
+// main() when g_conf.kill_after is set).
+class C_Die : public Context {
+public:
+  void finish(int) {
+    cerr << "die" << endl;
+    exit(1);
+  }
+};
+
+// Timer event that replaces the live debug settings with the "debug after"
+// settings: everything in g_conf from the debug field up to (not including)
+// debug_after is overwritten with the same range of g_debug_after_conf.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    // memcpy needs a byte count, so measure the distance through char*.
+    // Subtracting the typed field addresses directly would give a count of
+    // elements and copy only part of the range.
+    // NOTE(review): assumes debug_after is laid out after debug in the config struct -- confirm.
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+
+// Synthetic-workload driver over the TCP messenger, bootstrapped via MPI.
+// Every MPI rank runs this main; each rank instantiates only the daemons and
+// clients assigned to it:
+//   rank 0                                         monitor
+//   ranks tcp_skip_rank0 .. +NUMMDS-1              one MDS each
+//   the next NUMOSD ranks                          one OSD each
+//   the ranks after that (or, with tcp_overlay_clients, the ranks starting
+//   right after the MDSs)                          clients, several per rank
+int main(int argc, char **argv)
+{
+ vector<char*> args;
+ argv_to_vec(argc, argv, args);
+
+ parse_config_options(args);
+
+ parse_syn_options(args);
+
+ // optional timer events: die, switch debug settings, periodic tick
+ if (g_conf.kill_after)
+ g_timer.add_event_after(g_conf.kill_after, new C_Die);
+ if (g_conf.debug_after)
+ g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+ if (g_conf.tick) {
+ tick_start = g_clock.now();
+ g_timer.add_event_after(g_conf.tick, new C_Tick);
+ }
+
+ // Whatever is still in args was not consumed by the parsers above; any
+ // leftover argument is reported and then trips the assert below.
+ vector<char*> nargs;
+ for (unsigned i=0; i<args.size(); i++) {
+ //cout << "a " << args[i] << endl;
+ // unknown arg, pass it on.
+ nargs.push_back(args[i]);
+ }
+
+ args = nargs;
+ if (!args.empty()) {
+ for (unsigned i=0; i<args.size(); i++)
+ cerr << "stray arg " << args[i] << endl;
+ }
+ assert(args.empty());
+
+
+ // start up tcp messenger via MPI
+ pair<int,int> mpiwho = mpi_bootstrap_tcp(argc, argv);
+ int myrank = mpiwho.first;
+ int world = mpiwho.second;
+
+ // number of ranks required: optional skipped rank 0, one per MDS, one per
+ // OSD, and at least one for clients unless they are overlaid on other ranks
+ int need = 0;
+ if (g_conf.tcp_skip_rank0) need++;
+ need += NUMMDS;
+ need += NUMOSD;
+ if (NUMCLIENT) {
+ if (!g_conf.tcp_overlay_clients)
+ need += 1;
+ }
+ assert(need <= world);
+
+ if (myrank == 0)
+ cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl;
+
+ MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+
+ char hostname[100];
+ gethostname(hostname,100);
+ int pid = getpid();
+
+ // number of daemons/clients started on this rank; 0 means the rank is idle
+ int started = 0;
+
+ //if (myrank == 0) g_conf.debug = 20;
+
+ // create mon
+ if (myrank == 0) {
+ Monitor *mon = new Monitor(0, new TCPMessenger(MSG_ADDR_MON(0)));
+ mon->init();
+ }
+
+ // create mds
+ // only the entries belonging to this rank are ever assigned
+ MDS *mds[NUMMDS];
+ OSD *mdsosd[NUMMDS];
+ for (int i=0; i<NUMMDS; i++) {
+ if (myrank != g_conf.tcp_skip_rank0+i) continue;
+ TCPMessenger *m = new TCPMessenger(MSG_ADDR_MDS(i));
+ cerr << "mds" << i << " on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+ mds[i] = new MDS(mdc, i, m);
+ mds[i]->init();
+ started++;
+
+ // optional OSD co-located with this MDS, numbered from 10000
+ if (g_conf.mds_local_osd) {
+ mdsosd[i] = new OSD(i+10000, new TCPMessenger(MSG_ADDR_OSD(i+10000)));
+ mdsosd[i]->init();
+ }
+ }
+
+ // create osd
+ OSD *osd[NUMOSD];
+ for (int i=0; i<NUMOSD; i++) {
+ if (myrank != g_conf.tcp_skip_rank0+NUMMDS + i) continue;
+ TCPMessenger *m = new TCPMessenger(MSG_ADDR_OSD(i));
+ cerr << "osd" << i << " on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+ osd[i] = new OSD(i, m);
+ osd[i]->init();
+ started++;
+ }
+
+ if (g_conf.tcp_overlay_clients) sleep(5);
+
+ // create client
+ int skip_osd = NUMOSD;
+ if (g_conf.tcp_overlay_clients)
+ skip_osd = 0; // put clients with osds too!
+ // ranks available for clients, and clients per rank (rounded up)
+ // NOTE(review): client_nodes can be 0 (e.g. overlay with no OSD ranks), which
+ // would divide by zero below -- confirm callers never configure that.
+ int client_nodes = world - NUMMDS - skip_osd - g_conf.tcp_skip_rank0;
+ int clients_per_node = 1;
+ if (NUMCLIENT) clients_per_node = (NUMCLIENT-1) / client_nodes + 1;
+ set<int> clientlist;
+ Client *client[NUMCLIENT];
+ SyntheticClient *syn[NUMCLIENT];
+ for (int i=0; i<NUMCLIENT; i++) {
+ //if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
+ if (myrank != g_conf.tcp_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
+ clientlist.insert(i);
+ client[i] = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));//(i)) );
+
+ // logger?
+ // created once per rank (first local client) and named after the rank
+ if (client_logger == 0) {
+ char s[80];
+ sprintf(s,"clnode.%d", myrank);
+ client_logger = new Logger(s, &client_logtype);
+
+ client_logtype.add_inc("lsum");
+ client_logtype.add_inc("lnum");
+ client_logtype.add_inc("lwsum");
+ client_logtype.add_inc("lwnum");
+ client_logtype.add_inc("lrsum");
+ client_logtype.add_inc("lrnum");
+ client_logtype.add_inc("trsum");
+ client_logtype.add_inc("trnum");
+ client_logtype.add_inc("wrlsum");
+ client_logtype.add_inc("wrlnum");
+ client_logtype.add_inc("lstatsum");
+ client_logtype.add_inc("lstatnum");
+ client_logtype.add_inc("ldirsum");
+ client_logtype.add_inc("ldirnum");
+ client_logtype.add_inc("readdir");
+ client_logtype.add_inc("stat");
+ }
+
+ client[i]->init();
+ started++;
+
+ syn[i] = new SyntheticClient(client[i]);
+ }
+
+ if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
+
+ // mount every local client and start its synthetic workload thread
+ int nclients = 0;
+ for (set<int>::iterator it = clientlist.begin();
+ it != clientlist.end();
+ it++) {
+ int i = *it;
+
+ //cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
+ client[i]->mount();
+ syn[i]->start_thread();
+
+ nclients++;
+ }
+ if (nclients) {
+ cerr << nclients << " clients on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+ }
+
+ // wait for each workload thread, then unmount and shut its client down
+ for (set<int>::iterator it = clientlist.begin();
+ it != clientlist.end();
+ it++) {
+ int i = *it;
+
+ // cout << "waiting for synthetic client" << i << " to finish" << endl;
+ syn[i]->join_thread();
+ delete syn[i];
+
+ client[i]->unmount();
+ //cout << "client" << i << " unmounted" << endl;
+ client[i]->shutdown();
+ }
+
+
+ // a non-zero rank that started nothing stops its rank server right away
+ if (myrank && !started) {
+ //dout(1) << "IDLE" << endl;
+ cerr << "idle on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
+ tcpmessenger_stop_rankserver();
+ }
+
+ // wait for everything to finish
+ tcpmessenger_wait();
+
+ if (started) cerr << "tcpsyn finishing" << endl;
+
+ tcpmessenger_shutdown();
+
+
+ // the per-daemon cleanup below is disabled; only mdc is deleted
+ /*
+ // cleanup
+ for (int i=0; i<NUMMDS; i++) {
+ if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+ delete mds[i];
+ }
+ for (int i=0; i<NUMOSD; i++) {
+ if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
+ delete osd[i];
+ }
+ for (int i=0; i<NUMCLIENT; i++) {
+ if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
+ delete client[i];
+ }
+ */
+ delete mdc;
+
+
+ return 0;
+}
+
--- /dev/null
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "fakeclient/FakeClient.h"
+
+#include "mds/MDCluster.h"
+#include "mds/MDCache.h"
+#include "mds/MDStore.h"
+
+#include "msg/FakeMessenger.h"
+
+#include "messages/MPing.h"
+
+using namespace std;
+
+__uint64_t ino = 1;
+
+
+
+#include "config.h"
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_fakeclient
+
+// this parses find output
+int play();
+
+// Single-process test: runs OSDs, MDSs and fake clients over the
+// FakeMessenger, lets the message loop run, optionally runs the MDS cache
+// shutdown check, and frees everything.
+int main(int oargc, char **oargv) {
+  cerr << "hi there" << endl;
+
+  // parse_config_options fills argc/argv from the original arguments
+  int argc;
+  char **argv;
+  parse_config_options(oargc, oargv, argc, argv);
+
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+  // local config settings
+  g_conf.num_client = g_conf.num_fakeclient;  // to fool mds, hack gross
+
+  // create osds
+  OSD *osd[NUMOSD];
+  for (int o = 0; o < NUMOSD; o++) {
+    osd[o] = new OSD(o, new FakeMessenger(MSG_ADDR_OSD(o)));
+    osd[o]->init();
+  }
+
+  // create mds
+  MDS *mds[NUMMDS];
+  for (int m = 0; m < NUMMDS; m++) {
+    mds[m] = new MDS(mdc, m, new FakeMessenger(MSG_ADDR_MDS(m)));
+    mds[m]->init();
+  }
+
+  // create clients
+  FakeClient *client[NUMCLIENT];
+  for (int c = 0; c < NUMCLIENT; c++) {
+    client[c] = new FakeClient(mdc, c, new FakeMessenger(MSG_ADDR_CLIENT(c)), g_conf.fakeclient_requests);
+    client[c]->init();
+  }
+
+  // mount clients
+  for (int c = 0; c < NUMCLIENT; c++) {
+    client[c]->mount();
+  }
+
+  // loop
+  fakemessenger_do_loop();
+
+  //mds[0]->shutdown_start();
+  //fakemessenger_do_loop();
+
+  // a first argument of "nocheck" skips the cache check
+  const bool nocheck = (argc > 1 && strcmp(argv[1], "nocheck") == 0);
+  if (nocheck) {
+    cerr << "---- nocheck" << endl;
+  } else {
+    cout << "---- check ----" << endl;
+    for (int m = 0; m < NUMMDS; m++) {
+      mds[m]->mdcache->shutdown_pass();
+    }
+  }
+
+  // cleanup
+  cout << "cleanup" << endl;
+  for (int m = 0; m < NUMMDS; m++)
+    delete mds[m];
+  for (int o = 0; o < NUMOSD; o++)
+    delete osd[o];
+  for (int c = 0; c < NUMCLIENT; c++)
+    delete client[c];
+  delete mdc;
+  cout << "done." << endl;
+  return 0;
+}
+
--- /dev/null
+/* gprof-helper.c -- preload library to profile pthread-enabled programs
+ *
+ * Authors: Sam Hocevar <sam at zoy dot org>
+ * Daniel Jönsson <danieljo at fagotten dot org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the Do What The Fuck You Want To
+ * Public License as published by Banlu Kemiyatorn. See
+ * http://sam.zoy.org/projects/COPYING.WTFPL for more details.
+ *
+ * Compilation example:
+ * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl
+ *
+ * Usage example:
+ * LD_PRELOAD=./gprof-helper.so your_program
+ */
+
+#define _GNU_SOURCE
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <pthread.h>
+
+static void * wrapper_routine(void *);
+
+/* Original pthread function */
+static int (*pthread_create_orig)(pthread_t *__restrict,
+ __const pthread_attr_t *__restrict,
+ void *(*)(void *),
+ void *__restrict) = NULL;
+
+/* Library initialization function */
+/* Library constructor: look up the next pthread_create in link order so the
+ * wrapper in this file can forward to it.  Exits the process if the lookup
+ * fails. */
+void wooinit(void) __attribute__((constructor));
+
+void wooinit(void)
+{
+    char *error;
+
+    pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create");
+    fprintf(stderr, "pthreads: using profiling hooks for gprof\n");
+    if(pthread_create_orig != NULL)
+    {
+        return;
+    }
+
+    /* report the dynamic linker's message when there is one */
+    error = dlerror();
+    if(error == NULL)
+    {
+        error = "pthread_create is NULL";
+    }
+    fprintf(stderr, "%s\n", error);
+    exit(EXIT_FAILURE);
+}
+
+/* Our data structure passed to the wrapper.  It lives on the stack of the
+ * thread calling pthread_create(), so the new thread must copy what it needs
+ * and set `done` before the creator is allowed to return. */
+typedef struct wrapper_s
+{
+    void * (*start_routine)(void *);
+    void * arg;
+
+    pthread_mutex_t lock;
+    pthread_cond_t  wait;
+    int             done;   /* protected by lock: new thread has copied its data */
+
+    struct itimerval itimer;
+
+} wrapper_t;
+
+/* The wrapper function in charge for setting the itimer value */
+static void * wrapper_routine(void * data)
+{
+    wrapper_t * w = (wrapper_t*)data;
+
+    /* Put user data in thread-local variables */
+    void * (*start_routine)(void *) = w->start_routine;
+    void * arg = w->arg;
+
+    /* Set the profile timer value */
+    setitimer(ITIMER_PROF, &w->itimer, NULL);
+
+    /* Tell the calling thread that we don't need its data anymore.
+     * `w` must not be touched after the unlock. */
+    pthread_mutex_lock(&w->lock);
+    w->done = 1;
+    pthread_cond_signal(&w->wait);
+    pthread_mutex_unlock(&w->lock);
+
+    /* Call the real function */
+    return start_routine(arg);
+}
+
+/* Our wrapper function for the real pthread_create(): starts the thread in
+ * wrapper_routine() so that it inherits the caller's ITIMER_PROF setting.
+ * Returns whatever the real pthread_create() returned. */
+int pthread_create(pthread_t *__restrict thread,
+                   __const pthread_attr_t *__restrict attr,
+                   void * (*start_routine)(void *),
+                   void *__restrict arg)
+{
+    wrapper_t wrapper_data;
+    int i_return;
+
+    /* Initialize the wrapper structure */
+    wrapper_data.start_routine = start_routine;
+    wrapper_data.arg = arg;
+    wrapper_data.done = 0;
+    getitimer(ITIMER_PROF, &wrapper_data.itimer);
+    pthread_cond_init(&wrapper_data.wait, NULL);
+    pthread_mutex_init(&wrapper_data.lock, NULL);
+    pthread_mutex_lock(&wrapper_data.lock);
+
+    /* The real pthread_create call */
+    i_return = pthread_create_orig(thread,
+                                   attr,
+                                   &wrapper_routine,
+                                   &wrapper_data);
+
+    /* If the thread was successfully spawned, wait for the data
+     * to be released.  Condition waits may wake up spuriously, so wait on
+     * the flag rather than on a single wakeup; returning early would leave
+     * the new thread reading a dead stack frame. */
+    if(i_return == 0)
+    {
+        while(!wrapper_data.done)
+        {
+            pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock);
+        }
+    }
+
+    pthread_mutex_unlock(&wrapper_data.lock);
+    pthread_mutex_destroy(&wrapper_data.lock);
+    pthread_cond_destroy(&wrapper_data.wait);
+
+    return i_return;
+}
+
--- /dev/null
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <cerrno>
+#include <cstdio>
+#include <iostream>
+#include <string>
+using namespace std;
+
+// Recursively build a test directory tree.
+//   basedir - directory to create (an already existing one is reused)
+//   dirs    - number of subdirectories "dir.N" per level
+//   files   - number of regular files "file.N" per directory
+//   depth   - number of levels below basedir
+// Returns 0 on success, -1 if the base directory could not be created, a
+// path did not fit the buffer, or anything below failed.
+int make_dirs(const char *basedir, int dirs, int files, int depth)
+{
+  //if (time_to_stop()) return 0;
+
+  // make sure base dir exists; EEXIST means it is already there
+  if (mkdir(basedir, 0755) != 0 && errno != EEXIST) {
+    cout << "can't make base dir? " << basedir << endl;
+    return -1;
+  }
+
+  int result = 0;
+
+  // children
+  char d[500];
+  cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl;
+  for (int i=0; i<files; i++) {
+    if (snprintf(d, sizeof(d), "%s/file.%d", basedir, i) >= (int)sizeof(d)) {
+      cout << "path too long under " << basedir << endl;
+      return -1;
+    }
+    // mknod takes (path, mode, dev); S_IFREG makes an empty regular file
+    if (mknod(d, S_IFREG | 0644, 0) != 0 && errno != EEXIST) {
+      cout << "can't make file? " << d << endl;
+      result = -1;
+    }
+  }
+
+  if (depth == 0) return result;
+
+  for (int i=0; i<dirs; i++) {
+    if (snprintf(d, sizeof(d), "%s/dir.%d", basedir, i) >= (int)sizeof(d)) {
+      cout << "path too long under " << basedir << endl;
+      return -1;
+    }
+    if (make_dirs(d, dirs, files, depth-1) != 0)
+      result = -1;
+  }
+
+  return result;
+}
+
+// Build a tree under "blah": 10 directories and 10 files per level, 4 levels
+// deep.  The result of make_dirs is not reflected in the exit status.
+int main()
+{
+  (void) make_dirs("blah", 10, 10, 4);
+  return 0;
+}
--- /dev/null
+
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "mds/MDCluster.h"
+#include "mds/MDS.h"
+#include "osd/OSD.h"
+#include "fakeclient/FakeClient.h"
+
+#include "mds/MDCache.h"
+#include "mds/MDStore.h"
+
+#include "msg/MPIMessenger.h"
+//#include "msg/CheesySerializer.h"
+
+#include "messages/MPing.h"
+
+
+__uint64_t ino = 1;
+
+
+
+#include "config.h"
+#define NUMMDS g_conf.num_mds
+#define NUMOSD g_conf.num_osd
+#define NUMCLIENT g_conf.num_client
+
+// this parses find output
+int play();
+
+// MPI test driver: every rank creates the OSDs, MDSs and fake clients whose
+// message address maps to it (MPI_DEST_TO_RANK), seeds one request per local
+// client, runs the MPI message loop to completion and frees its objects.
+int main(int argc, char **argv) {
+  cout << "mpitest starting" << endl;
+
+  int myrank = mpimessenger_init(argc, argv);
+  int world = mpimessenger_world();
+
+  MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
+
+  // create osds (only the slots owned by this rank are filled in)
+  OSD *osd[NUMOSD];
+  for (int o = 0; o < NUMOSD; o++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(o),world)) continue;
+    MPIMessenger *msgr = new MPIMessenger(MSG_ADDR_OSD(o));
+    osd[o] = new OSD(o, msgr);
+    osd[o]->init();
+  }
+
+  // create mds
+  MDS *mds[NUMMDS];
+  for (int m = 0; m < NUMMDS; m++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(m),world)) continue;
+    MPIMessenger *msgr = new MPIMessenger(MSG_ADDR_MDS(m));
+    mds[m] = new MDS(mdc, m, msgr);
+    mds[m]->init();
+  }
+
+  // create clients; each messenger dispatches through a CheesySerializer
+  FakeClient *client[NUMCLIENT];
+  for (int c = 0; c < NUMCLIENT; c++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(c),world)) continue;
+
+    MPIMessenger *real = new MPIMessenger(MSG_ADDR_CLIENT(c));
+    CheesySerializer *serializer = new CheesySerializer(real);
+    real->set_dispatcher(serializer);
+
+    client[c] = new FakeClient(mdc, c, real, g_conf.fakeclient_requests);
+    client[c]->init();
+  }
+
+  // seed initial requests
+  for (int c = 0; c < NUMCLIENT; c++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(c),world)) continue;
+    client[c]->issue_request();
+  }
+
+  mpimessenger_start();     // start message loop
+  mpimessenger_wait();      // wait for thread to finish
+  mpimessenger_shutdown();  // shutdown MPI
+
+  //
+  /*
+  cout << "---- check ----" << endl;
+  for (int i=0; i<NUMMDS; i++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
+    mds[i]->mdcache->shutdown_pass();
+  }
+  */
+
+  // cleanup: delete exactly the objects this rank created above
+  //cout << "cleanup" << endl;
+  for (int m = 0; m < NUMMDS; m++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(m),world)) continue;
+    delete mds[m];
+  }
+  for (int o = 0; o < NUMOSD; o++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(o),world)) continue;
+    delete osd[o];
+  }
+  for (int c = 0; c < NUMCLIENT; c++) {
+    if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(c),world)) continue;
+    delete client[c];
+  }
+  delete mdc;
+
+  //cout << "done." << endl;
+  return 0;
+}
+
--- /dev/null
+// Check that MTMessenger properly dispatches replies to the correct
+// thread.  Processes with multiple threads of clients send a
+// "request" to a server, which then sends back a "reply".  The client
+// checks that it received the correct reply for its request.  The
+// request and reply are both an MClientRequest, which we use because
+// it allows us to pass an arbitrary string in the sarg field.  In the
+// request, the sarg field contains a string "rN:tN:mN" which uniquely
+// identifies a request by rank (process), thread and message.  The
+// server sends the reply with the sarg field set to "rN:tN:mN reply",
+// and the client can then verify that it received the correct reply
+// for its request.
+
+#include <pthread.h>
+#include "mpi.h"
+
+#include "messages/MClientRequest.h"
+#include "msg/MTMessenger.h"
+#include "include/error.h"
+
+#define SARG_SIZE 64
+#define SERVER_RANK 0
+#define NTHREADS 11 // number of threads per rank
+#define NMESSAGES 31 // number of messages per thread
+
+// Server side of the test: answer every request with "<request sarg> reply"
+// and return once all expected messages have been handled.
+//   msgr       - messenger used to receive requests and send responses
+//   world_size - number of MPI processes, including the server itself
+static void server_loop(MTMessenger &msgr, int world_size)
+{
+  // we expect this many messages from clients, then we quit
+  // (world_size-1 since server is one of the processes).
+  int totmsg = NTHREADS * NMESSAGES * (world_size - 1);
+  int nmsg = 0;
+
+  char buf[SARG_SIZE];
+
+  while(nmsg < totmsg) {
+    MClientRequest *req = (MClientRequest*)msgr.recvreq();
+    ASSERT(req->get_type() == MSG_CLIENT_REQUEST);
+
+    //cout << "Server acknowledging " << req->get_sarg() << endl;
+
+    // sarg comes from the peer; bound the copy so an overlong string is
+    // truncated instead of overrunning buf
+    snprintf(buf, sizeof(buf), "%s reply", req->get_sarg().c_str());
+    MClientRequest resp(0, 0);
+    resp.set_sarg(buf);
+    msgr.sendresp(req, &resp);
+
+    delete req;
+    nmsg++;
+  }
+
+  cout << "Server successful" << endl;
+}
+
+// arguments for client thread start function (see pthread_create)
+struct client_arg
+{
+ MTMessenger *msgr; // messenger shared by all client threads of this process
+ int rank; // MPI rank of this process; the "rN" part of the request id
+ int thread; // index of the thread within the process; the "tN" part
+};
+
+// Thread body of one client: sends NMESSAGES requests tagged "rN:tN:mN" to
+// the server and checks that each reply is "rN:tN:mN reply".
+//   _carg - client_arg allocated by launch_clients with malloc(); this
+//           function takes ownership and releases it
+// Always returns NULL.
+static void *client_session(void *_carg)
+{
+  client_arg *carg = (client_arg *)_carg;
+
+  char buf[SARG_SIZE];
+
+  // repeat some number (arbitrary really) of rounds
+  for (int i = 0; i < NMESSAGES; i++) {
+
+    // send the message, receive the reply and check reply is as
+    // expected
+
+    MClientRequest request(0, 0);
+    snprintf(buf, sizeof(buf), "r%d:t%d:m%d", carg->rank, carg->thread, i);
+    request.set_sarg(buf);
+
+    //cout << "Client sending " << request.get_sarg() << endl;
+
+    MClientRequest *resp =
+      (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK);
+
+    ASSERT(resp->get_type() == MSG_CLIENT_REQUEST);
+    snprintf(buf, sizeof(buf), "r%d:t%d:m%d reply", carg->rank, carg->thread, i);
+    ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0);
+
+    //cout << "Client verified " << resp->get_sarg() << endl;
+
+    delete resp;
+  }
+
+  cout << "Client (" << carg->rank << "," << carg->thread
+       << ") successful" << endl;
+
+  // the argument block came from malloc(), so it must go back through
+  // free(); delete on malloc'd memory is undefined behaviour
+  free(carg);
+  return NULL;
+}
+
+// Start NTHREADS client threads on this process and wait for all of them.
+//   msgr - messenger shared by the threads
+//   rank - MPI rank of this process, passed on to each thread
+static void launch_clients(MTMessenger &msgr, int rank)
+{
+  pthread_t tid[NTHREADS];
+
+  // launch some number (arbitrary really) of threads
+  for (int i = 0; i < NTHREADS; i++) {
+
+    // ownership of carg passes to the new thread
+    client_arg *carg = (client_arg*)malloc(sizeof(client_arg));
+    ASSERT(carg);
+    carg->msgr = &msgr;
+    carg->rank = rank;
+    carg->thread = i;
+
+    // pthread functions return 0 on success and a positive error number
+    // on failure (never a negative value), so test against 0
+    if (pthread_create(&tid[i], NULL, client_session, carg) != 0)
+      SYSERROR();
+  }
+
+  // we must wait for all the threads to exit before returning,
+  // otherwise we shut down MPI while the threads are still
+  // chatting.
+  for (int i = 0; i < NTHREADS; i++) {
+    void *retval;
+
+    if (pthread_join(tid[i], &retval) != 0)
+      SYSERROR();
+  }
+}
+
+// The process with rank SERVER_RANK acts as the server; every other rank
+// runs the client threads.
+int main(int argc, char **argv)
+{
+  MTMessenger msgr(argc, argv);
+
+  int rank;
+  ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS);
+  int world_size;
+  ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS);
+
+  if (rank != SERVER_RANK) {
+    launch_clients(msgr, rank);
+    return 0;
+  }
+
+  server_loop(msgr, world_size);
+  return 0;
+}
--- /dev/null
+6
+8 10.0
+4 20.0
+7 30.0
+9 10.0
+8 15.0
+5 11.0
--- /dev/null
+//
+// $Id$
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "../osd/rush.h"
+
+// Rush placement exerciser.
+// Usage: prog [numKeys [numReplicas]] < clusters
+// stdin: first line is the number of clusters, then one "size weight" line
+// per cluster.  After each cluster is added, the servers for keys
+// 0..numKeys-1 are looked up (printing of the result is compiled out).
+int main (int argc, char *argv[])
+{
+  Rush rush;
+  char buf[200];
+  int i, j, k, numClusters;
+  int numKeys = 5;
+  int numReplicas = 4;
+  int curSize;
+  double curWeight;
+  int servers[1000];
+  const int maxReplicas = sizeof (servers) / sizeof (servers[0]);
+
+  if (argc > 1) {
+    numKeys = atoi (argv[1]);
+  }
+  if (argc > 2) {
+    numReplicas = atoi (argv[2]);
+  }
+  // presumably GetServersByKey stores numReplicas entries in servers[]
+  // (the disabled print loop below reads that many) -- keep it in bounds
+  if (numReplicas > maxReplicas) {
+    fprintf (stderr, "ERROR: at most %d replicas are supported\n", maxReplicas);
+    exit (-1);
+  }
+
+  if (fgets (buf, sizeof (buf) - 2, stdin) == NULL ||
+      sscanf (buf, "%d", &numClusters) != 1) {
+    fprintf (stderr, "ERROR: can't read the number of clusters\n");
+    exit (-1);
+  }
+  for (i = 0; i < numClusters; i++) {
+    if (fgets (buf, sizeof (buf) - 2, stdin) == NULL ||
+        sscanf (buf, "%d %lf", &curSize, &curWeight) != 2) {
+      fprintf (stderr, "ERROR: can't read size and weight of cluster %d\n", i);
+      exit (-1);
+    }
+    rush.AddCluster (curSize, curWeight);
+    if (rush.Servers () < numReplicas) {
+      // report the number of disks required, i.e. the replica count
+      fprintf (stderr, "ERROR: must have at least %d disks in the system!\n",
+               numReplicas);
+      exit (-1);
+    }
+    for (j = 0; j < numKeys; j++) {
+      rush.GetServersByKey (j, numReplicas, servers);
+#if 0
+      printf ("%-3d %-6d ", i, j);
+      for (k = 0; k < numReplicas; k++) {
+        printf ("%-5d ", servers[k]);
+      }
+      putchar ('\n');
+#endif
+    }
+  }
+  return 0;
+}
--- /dev/null
+//
+// $Id$
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "rush.h"
+
+// Rush placement exerciser.
+// Usage: prog [numKeys [numReplicas]] < clusters
+// stdin: first line is the number of clusters, then one "size weight" line
+// per cluster.  After each cluster is added, the servers for keys
+// 0..numKeys-1 are looked up (printing of the result is compiled out).
+int main (int argc, char *argv[])
+{
+  Rush rush;
+  char buf[200];
+  int i, j, k, numClusters;
+  int numKeys = 5;
+  int numReplicas = 4;
+  int curSize;
+  double curWeight;
+  int servers[1000];
+  const int maxReplicas = sizeof (servers) / sizeof (servers[0]);
+
+  if (argc > 1) {
+    numKeys = atoi (argv[1]);
+  }
+  if (argc > 2) {
+    numReplicas = atoi (argv[2]);
+  }
+  // presumably GetServersByKey stores numReplicas entries in servers[]
+  // (the disabled print loop below reads that many) -- keep it in bounds
+  if (numReplicas > maxReplicas) {
+    fprintf (stderr, "ERROR: at most %d replicas are supported\n", maxReplicas);
+    exit (-1);
+  }
+
+  if (fgets (buf, sizeof (buf) - 2, stdin) == NULL ||
+      sscanf (buf, "%d", &numClusters) != 1) {
+    fprintf (stderr, "ERROR: can't read the number of clusters\n");
+    exit (-1);
+  }
+  for (i = 0; i < numClusters; i++) {
+    if (fgets (buf, sizeof (buf) - 2, stdin) == NULL ||
+        sscanf (buf, "%d %lf", &curSize, &curWeight) != 2) {
+      fprintf (stderr, "ERROR: can't read size and weight of cluster %d\n", i);
+      exit (-1);
+    }
+    rush.AddCluster (curSize, curWeight);
+    if (rush.Servers () < numReplicas) {
+      // report the number of disks required, i.e. the replica count
+      fprintf (stderr, "ERROR: must have at least %d disks in the system!\n",
+               numReplicas);
+      exit (-1);
+    }
+    for (j = 0; j < numKeys; j++) {
+      rush.GetServersByKey (j, numReplicas, servers);
+#if 0
+      printf ("%-3d %-6d ", i, j);
+      for (k = 0; k < numReplicas; k++) {
+        printf ("%-5d ", servers[k]);
+      }
+      putchar ('\n');
+#endif
+    }
+  }
+  return 0;
+}
--- /dev/null
+
+
+#include "../crush/Bucket.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+// Streams v to out as "[a b c]": the elements separated by single
+// spaces and wrapped in square brackets.  Returns out.
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+  out << "[";
+  bool first = true;
+  for (vector<int>::iterator it = v.begin(); it != v.end(); ++it) {
+    if (!first)
+      out << " ";
+    first = false;
+    out << *it;
+  }
+  out << "]";
+  return out;
+}
+
+
+// Fills a MixedBucket with items 0..19 (weight 10 each), calls
+// choose_r for replicas 0..2 of every input 1..999999, and prints how
+// many times each disk was returned.
+int main()
+{
+  Hash h(73);
+
+  vector<int> disks;
+  for (int id = 0; id < 20; ++id)
+    disks.push_back(id);
+
+  /*
+  UniformBucket ub(1, 1, 0, 10, disks);
+  ub.make_primes(h);
+  cout << "primes are " << ub.primes << endl;
+  */
+
+  MixedBucket mb(2, 1);
+  for (int item = 0; item < 20; ++item)
+    mb.add_item(item, 10);
+
+  /*
+  MixedBucket b(3, 1);
+  b.add_item(1, ub.get_weight());
+  b.add_item(2, mb.get_weight());
+  */
+  MixedBucket b = mb;
+
+  // per-disk hit counters, one slot per disk id
+  vector<int> ocount(disks.size());
+  int numrep = 3;
+
+  vector<int> v(numrep);
+  for (int x = 1; x < 1000000; ++x) {
+    for (int rep = 0; rep < numrep; ++rep) {
+      const int chosen = b.choose_r(x, rep, h);
+      v[rep] = chosen;
+      ++ocount[chosen];
+    }
+  }
+
+  for (int disk = 0; disk < ocount.size(); ++disk) {
+    cout << "disk " << disk << " has " << ocount[disk] << endl;
+  }
+
+}
--- /dev/null
+
+#include <iostream>
+using namespace std;
+
+#include "include/bufferlist.h"
+
+
+// Smoke test for bufferptr/bufferlist: builds a list from three
+// pointers (two of them sharing one buffer), then prints the list
+// before and after splice() and the result of substr_of().
+int main()
+{
+  // two pointers referring to the same 6-byte buffer
+  bufferptr digits = new buffer("123456",6);
+  bufferptr digits_alias = digits;
+
+  cout << "it is '" << digits.c_str() << "'" << endl;
+
+  bufferptr letters = new buffer("abcdef",6);
+
+  cout << "p3 is " << letters << endl;
+
+  bufferlist chain;
+  chain.push_back(digits_alias);
+  chain.push_back(digits);
+  chain.push_back(letters);
+
+  cout << "bl is " << chain << endl;
+
+  cout << "len is " << chain.length() << endl;
+
+  // presumably moves 4 bytes starting at offset 10 into `removed`
+  bufferlist removed;
+  chain.splice(10, 4, &removed);
+
+  cout << "took out " << removed << "leftover is " << chain << endl;
+
+  bufferlist window;
+  window.substr_of(chain, 3, 5);
+  cout << "bl2 is " << window << endl;
+
+}
--- /dev/null
+
+
+#include "../crush/crush.h"
+using namespace crush;
+
+#include <math.h>
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+/*
+ostream& operator<<(ostream& out, vector<int>& v)
+{
+ out << "[";
+ for (int i=0; i<v.size(); i++) {
+ if (i) out << " ";
+ out << v[i];
+ }
+ out << "]";
+ return out;
+}
+*/
+
+// Replaces the contents of d with n consecutive disk ids starting at
+// no, and advances no past the ids handed out.  A non-positive n
+// leaves d empty and no unchanged (the previous `while (n)` loop never
+// terminated sanely for negative n).
+void make_disks(int n, int& no, vector<int>& d)
+{
+  d.clear();
+  for (int i = 0; i < n; i++) {
+    d.push_back(no);
+    no++;
+  }
+}
+
+
+// Recursively builds a bucket tree of height h and registers every
+// bucket with c.  At h == 0 the result is a UniformBucket over wid[0]
+// newly numbered disks; otherwise it is a MixedBucket with wid[h]
+// children of height h-1.  ndisks is incremented once per disk and
+// nbuckets is decremented once per bucket created.  Returns the new
+// bucket.
+Bucket *make_bucket(Crush& c, vector<int>& wid, int h, int& ndisks, int& nbuckets)
+{
+  if (h != 0) {
+    // interior level: mixed bucket over freshly built sub-buckets
+    MixedBucket *mixed = new MixedBucket(nbuckets--, h+1);
+    for (int child = 0; child < wid[h]; child++) {
+      Bucket *sub = make_bucket(c, wid, h-1, ndisks, nbuckets);
+      mixed->add_item(sub->get_id(), sub->get_weight());
+    }
+    c.add_bucket(mixed);
+    return mixed;
+  }
+
+  // leaf level: uniform bucket over the next wid[0] disk ids
+  Hash hash(123);
+  vector<int> leaf_disks;
+  for (int k = 0; k < wid[h]; k++) {
+    leaf_disks.push_back(ndisks);
+    ndisks++;
+  }
+  UniformBucket *leaf = new UniformBucket(nbuckets--, 1, 0, 10, leaf_disks);
+  leaf->make_primes(hash);
+  c.add_bucket(leaf);
+  return leaf;
+}
+
+// Builds a hierarchy with one level per entry of wid (wid[level] is
+// the fan-out at that level) and returns the id of its top bucket.
+int make_hierarchy(Crush& c, vector<int>& wid, int& ndisks, int& nbuckets)
+{
+  const int top_level = wid.size()-1;
+  Bucket *top = make_bucket(c, wid, top_level, ndisks, nbuckets);
+  return top->get_id();
+}
+
+
+
+// Builds a bucket hierarchy in a Crush object, runs a one-replica
+// placement rule over consecutive inputs, and prints per-pass and
+// averaged variance of the per-disk placement counts.
+int main()
+{
+  Hash h(73232313);
+
+  // crush
+  Crush c;
+
+
+  // buckets
+  vector<int> disks;
+  int root = -1;
+  int nbuckets = -1;   // next bucket id to hand out; decremented per bucket
+  int ndisks = 0;
+
+  // disabled experiment: hand-built uniform/mixed buckets
+  if (0) {
+    make_disks(12, ndisks, disks);
+    UniformBucket ub1(-1, 1, 0, 30, disks);
+    ub1.make_primes(h);
+    cout << "ub1 primes are " << ub1.primes << endl;
+    c.add_bucket(&ub1);
+
+    make_disks(17, ndisks, disks);
+    UniformBucket ub2(-2, 1, 0, 30, disks);
+    ub2.make_primes(h);
+    cout << "ub2 primes are " << ub2.primes << endl;
+    c.add_bucket(&ub2);
+
+    make_disks(4, ndisks, disks);
+    UniformBucket ub3(-3, 1, 0, 30, disks);
+    ub3.make_primes(h);
+    cout << "ub3 primes are " << ub3.primes << endl;
+    c.add_bucket(&ub3);
+
+    make_disks(20, ndisks, disks);
+    MixedBucket umb1(-4, 1);
+    for (int i=0; i<20; i++)
+      umb1.add_item(disks[i], 30);
+    c.add_bucket(&umb1);
+
+    MixedBucket b(-100, 1);
+    //b.add_item(-2, ub1.get_weight());
+    b.add_item(-4, umb1.get_weight());
+    //b.add_item(-2, ub2.get_weight());
+    //b.add_item(-3, ub3.get_weight());
+  }
+
+  // disabled experiment: three-level tree of mixed buckets
+  if (0) {
+    int bucket = -1;
+    // named rootbucket so it no longer shadows the outer int `root`
+    MixedBucket *rootbucket = new MixedBucket(bucket--, 2);
+
+    for (int i=0; i<5; i++) {
+      MixedBucket *b = new MixedBucket(bucket--, 1);
+
+      int n = 5;
+
+      if (1) {
+        // add n buckets of n disks
+        for (int j=0; j<n; j++) {
+
+          MixedBucket *d = new MixedBucket(bucket--, 1);
+
+          make_disks(n, ndisks, disks);
+          for (int k=0; k<n; k++)
+            d->add_item(disks[k], 10);
+
+          //b->add_item(disks[j], 10);
+          c.add_bucket(d);
+          b->add_item(d->get_id(), d->get_weight());
+        }
+
+        c.add_bucket(b);
+        rootbucket->add_item(b->get_id(), b->get_weight());
+      } else {
+        // add n*n disks
+        make_disks(n*n, ndisks, disks);
+        for (int k=0; k<n*n; k++)
+          b->add_item(disks[k], 10);
+
+        c.add_bucket(b);
+        rootbucket->add_item(b->get_id(), b->get_weight());
+      }
+    }
+
+    c.add_bucket(rootbucket);
+  }
+
+
+  // active configuration: five levels, fan-out 10 at each level
+  if (1) {
+    vector<int> wid;
+    for (int d=0; d<5; d++)
+      wid.push_back(10);
+    root = make_hierarchy(c, wid, ndisks, nbuckets);
+  }
+
+
+
+  // rule
+  int numrep = 1;
+
+  Rule rule;
+  if (0) {
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0));
+  }
+  if (1) {
+    /*
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+    */
+    rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0));
+    rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT));
+  }
+
+  //c.overload[10] = .1;
+
+
+  int pg_per = 100;
+  int numpg = pg_per*ndisks/numrep;
+
+  vector<int> ocount(ndisks);
+  // nbuckets started at -1 and dropped by one for every bucket created,
+  // so the number of buckets is -1 - nbuckets (the old `1-nbuckets`
+  // over-reported by two).
+  cout << ndisks << " disks, " << (-1 - nbuckets) << " buckets" << endl;
+  cout << pg_per << " pgs per disk" << endl;
+  cout << numpg << " logical pgs" << endl;
+  cout << "numrep is " << numrep << endl;
+
+
+  int place = 1000000;
+  int times = place / numpg;
+  if (!times) times = 1;
+
+  cout << "looping " << times << " times" << endl;
+
+  float tvar = 0;
+  int tvarnum = 0;
+
+  int x = 0;   // placement input; keeps increasing across passes
+  for (int t=0; t<times; t++) {
+    vector<int> v(numrep);
+
+    for (int z=0; z<ndisks; z++) ocount[z] = 0;
+
+    // NOTE(review): this runs numpg-1 placements per pass, not numpg --
+    // confirm that is intended.
+    for (int xx=1; xx<numpg; xx++) {
+      x++;
+
+      //cout << H(x) << "\t" << h(x) << endl;
+      c.do_rule(rule, x, v);
+      //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl;
+
+      // count each chosen disk and flag result sets containing duplicates
+      bool bad = false;
+      for (int i=0; i<numrep; i++) {
+        //int d = b.choose_r(x, i, h);
+        //v[i] = d;
+        ocount[v[i]]++;
+        for (int j=i+1; j<numrep; j++) {
+          if (v[i] == v[j])
+            bad = true;
+        }
+      }
+      if (bad)
+        cout << "bad set " << x << ": " << v << endl;
+
+      //cout << v << "\t" << ocount << endl;
+    }
+
+    /*
+    for (int i=0; i<ocount.size(); i++) {
+      cout << "disk " << i << " has " << ocount[i] << endl;
+    }
+    */
+
+    cout << "collisions: " << c.collisions << endl;
+    cout << "r bumps: " << c.bumps << endl;
+
+
+    // mean and population variance of the per-disk counts for this pass
+    float avg = 0.0;
+    for (size_t i=0; i<ocount.size(); i++)
+      avg += ocount[i];
+    avg /= ocount.size();
+    float var = 0.0;
+    for (size_t i=0; i<ocount.size(); i++)
+      var += (ocount[i] - avg) * (ocount[i] - avg);
+    var /= ocount.size();
+
+    cout << "avg " << avg << " var " << var << " sd " << sqrt(var) << endl;
+
+    tvar += var;
+    tvarnum++;
+  }
+
+  // average of the per-pass variances (tvarnum >= 1 because times >= 1)
+  tvar /= tvarnum;
+
+  cout << "total variance " << tvar << endl;
+
+  return 0;
+}
--- /dev/null
+
+#include "include/filepath.h"
+#include <iostream>
+using namespace std;
+
+// Converts s to a filepath and prints the string, the filepath, its
+// depth, and fp[i] for every i below that depth.  Always returns 0
+// (the function is declared int; flowing off its end without a return
+// was undefined behavior).
+int print(string s) {
+  filepath fp = s;
+  cout << "s = " << s << " filepath = " << fp << endl;
+  cout << " depth " << fp.depth() << endl;
+  for (int i=0; i<fp.depth(); i++) {
+    cout << "\t" << i << " " << fp[i] << endl;
+  }
+  return 0;
+}
+
+// Runs print() over a fixed set of sample paths: absolute, relative,
+// with a trailing slash, and containing "..".
+int main() {
+  filepath p;
+
+  const char *samples[] = {
+    "/home/sage",
+    "a/b/c",
+    "/a/b/c",
+    "/a/b/c/",
+    "/a/b/../d"
+  };
+  const int nsamples = sizeof(samples) / sizeof(samples[0]);
+
+  for (int idx = 0; idx < nsamples; ++idx)
+    print(samples[idx]);
+}
--- /dev/null
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+#include "messages/MPing.h"
+#include "common/Mutex.h"
+
+#include "msg/MPIMessenger.h"
+
+// Dispatcher that registers itself with a Messenger and deletes every
+// message delivered to it.
+class Pinger : public Dispatcher {
+public:
+  Messenger *messenger;
+
+  // Stores msgr and installs this object as its dispatcher.
+  Pinger(Messenger *msgr) : messenger(msgr) {
+    messenger->set_dispatcher(this);
+  }
+
+  // Incoming messages are freed without being inspected.
+  void dispatch(Message *m) {
+    //dout(1) << "got incoming " << m << endl;
+    delete m;
+  }
+};
+
+// Initializes the MPI messenger, sends MPing messages to randomly
+// chosen ranks other than our own, then waits and shuts MPI down.
+int main(int argc, char **argv) {
+  // number of send attempts (the unused local `num` has been dropped)
+  const int num_pings = 10000;
+
+  int myrank = mpimessenger_init(argc, argv);
+  int world = mpimessenger_world();
+
+  Pinger *p = new Pinger( new MPIMessenger(myrank) );
+
+  mpimessenger_start();
+
+  //while (1) {
+  for (int i=0; i<num_pings; i++) {
+
+    // ping random nodes; iterations that draw our own rank send nothing
+    int d = rand() % world;
+    if (d != myrank) {
+      //cout << "sending " << i << " to " << d << endl;
+      p->messenger->send_message(new MPing(), d);
+    }
+
+  }
+
+
+  //cout << "shutting down" << endl;
+  //p->messenger->shutdown();
+
+  mpimessenger_wait();
+  mpimessenger_shutdown(); // shutdown MPI
+  return 0;
+}
--- /dev/null
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+
+#include "include/newbuffer.h"
+//#include "include/bufferlist.h"
+
+#include "common/Thread.h"
+
+
+ // Thread that repeatedly takes random sub-ranges of its bufferlist
+ // member (copy-constructed from the list passed in) and appends them
+ // to scratch lists.
+ class Th : public Thread {
+ public:
+   bufferlist bl;
+   Th(bufferlist& o) : bl(o) { }
+
+   // Thread body.  Always returns 0; the original fell off the end of
+   // this non-void function, which is undefined behavior.
+   void *entry() {
+     //cout << "start" << endl;
+     // The offset/length arithmetic below needs at least two bytes:
+     // a shorter list would give a modulo by zero or an unsigned wrap.
+     if (bl.length() < 2)
+       return 0;
+
+     // thrash it a bit.
+     for (int n=0; n<10000; n++) {
+       bufferlist bl2;
+       unsigned off = rand() % (bl.length() -1);
+       unsigned len = 1 + rand() % (bl.length() - off - 1);
+       bl2.substr_of(bl, off, len);
+       bufferlist bl3;
+       bl3.append(bl);
+       bl3.append(bl2);
+       //cout << bl3 << endl;
+       bl2.clear();
+       bl3.clear();
+     }
+     //cout << "end" << endl;
+     return 0;
+   }
+ };
+
+// Exercises bufferptr/bufferlist single-threaded (copy, splice,
+// substr_of), then starts 40 Th threads that each work on a copy of
+// the list, clears the original while they run, and joins them.
+int main()
+{
+
+  bufferptr p1 = buffer::copy("123456",7);
+  //bufferptr p1 = new buffer("123456",7);
+  bufferptr p2 = p1;
+
+  cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl;
+  cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl;
+
+  bufferptr p3 = buffer::copy("abcdef",7);
+  //bufferptr p3 = new buffer("abcdef",7);
+
+  cout << "p3 is " << p3.c_str() << " " << p3 << endl;
+
+  bufferlist bl;
+  bl.push_back(p2);
+  bl.push_back(p1);
+  bl.push_back(p3);
+
+  cout << "bl is " << bl << endl;
+
+  bufferlist took;
+  bl.splice(10,4,&took);
+
+  cout << "took out " << took << ", leftover is " << bl << endl;
+  //cout << "len is " << bl.length() << endl;
+
+  bufferlist bl2;
+  bl2.substr_of(bl, 3, 5);
+  cout << "bl2 is " << bl2 << endl;
+
+
+  cout << "bl before " << bl << endl;
+
+  list<Th*> ls;
+  for (int t=0; t<40; t++) {
+    // The thread pointer used to be named `t` as well; redeclaring the
+    // loop variable in the loop body is ill-formed, so it is now `th`.
+    Th *th = new Th(bl);
+    cout << "create" << endl;
+    th->create();
+    ls.push_back(th);
+  }
+
+  // drop the original list while the threads work on their copies
+  bl.clear();
+
+  while (!ls.empty()) {
+    cout << "join" << endl;
+    ls.front()->join();
+    delete ls.front();
+    ls.pop_front();
+  }
+
+  cout << "bl after " << bl << endl;
+
+  return 0;
+}
--- /dev/null
+
+
+#include "../crush/BinaryTree.h"
+using namespace crush;
+
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// Stress test for BinaryTree: adds 30 nodes, then performs 10000
+// random add/remove operations, printing the tree after each one.
+// `nodes` tracks the ids currently expected to be in the tree.
+int main()
+{
+  BinaryTree t;
+
+  vector<int> nodes;
+
+  for (int i=0; i<30; i++) {
+    cout << "adding " << i << endl;
+    int n = t.add_node(1);
+    nodes.push_back(n);
+    //cout << t << endl;
+  }
+  cout << t << endl;
+
+  for (int k=0; k<10000; k++) {
+    if (rand() % 2) {
+      cout << "adding" << endl;
+      nodes.push_back( t.add_node(1) );
+    } else {
+      if (!nodes.empty()) {
+        // pick a random tracked node and remove it from the tree
+        int p = rand() % nodes.size();
+        int n = nodes[p];
+        assert (t.exists(n));
+        cout << "removing " << n << endl;
+        t.remove_node(n);
+
+        // Drop the id from the tracking vector.  The previous
+        // hand-rolled shift read nodes[j+1] one element past the end.
+        nodes.erase(nodes.begin() + p);
+      }
+    }
+    cout << t << endl;
+  }
+
+  return 0;
+}
--- /dev/null
+
+#include <iostream>
+using namespace std;
+
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/xattr.h>
+
+// Creates a file named "test", stores the int a in its extended
+// attribute "asdf", reads the attribute back into b, and prints each
+// call's return value together with the errno it left behind.
+int main(int argc, char**argv)
+{
+  int a = 1;
+  int b = 2;
+
+  // result deliberately ignored, as before
+  mknod("test", 0600, 0);
+
+  // errno is cleared before and captured right after each call: the
+  // stream insertions in between may modify it, and a stale value from
+  // an earlier call would otherwise be reported on success.
+  errno = 0;
+  int setres = setxattr("test", "asdf", &a, sizeof(a), 0);
+  int seterr = errno;
+  cout << "setxattr " << setres << endl;
+  cout << "errno " << seterr << " " << strerror(seterr) << endl;
+
+  errno = 0;
+  ssize_t getres = getxattr("test", "asdf", &b, sizeof(b));
+  int geterr = errno;
+  cout << "getxattr " << getres << endl;
+  cout << "errno " << geterr << " " << strerror(geterr) << endl;
+
+  cout << "a is " << a << " and b is " << b << endl;
+  return 0;
+}