git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
cmake: add submodule for Apache Arrow at v6.0.1
author: Casey Bodley <cbodley@redhat.com>
Thu, 20 Jan 2022 15:22:27 +0000 (10:22 -0500)
committer: Casey Bodley <cbodley@redhat.com>
Wed, 23 Mar 2022 20:04:00 +0000 (16:04 -0400)
Adds an arrow submodule. When WITH_RADOSGW_SELECT_PARQUET is enabled,
the submodule is built as an external project and rgw links against its
imported Arrow::Parquet target.

Signed-off-by: Casey Bodley <cbodley@redhat.com>
(cherry picked from commit 2ca6d75521541e99ebb6101f6d350f92a6797a8b)

Conflicts:
CMakeLists.txt: master has an extra option WITH_RADOSGW_MOTR

.gitmodules
CMakeLists.txt
ceph.spec.in
cmake/modules/BuildArrow.cmake [new file with mode: 0644]
cmake/modules/BuildBoost.cmake
debian/control
src/CMakeLists.txt
src/arrow [new submodule]
src/rgw/CMakeLists.txt

index 1fd459a125ef1ab9c4315b7906674242d3f9e845..5100e7d23bca3489b2c82eb1256810ecc512e876 100644 (file)
@@ -66,3 +66,6 @@
        path = src/libkmip
        url = https://github.com/ceph/libkmip
        branch = ceph-master
+[submodule "src/arrow"]
+       path = src/arrow
+       url = https://github.com/apache/arrow.git
index a8bc4d7fd4671542d507200926f23fc718155748..305ae035121af7489f9f9c8d6627a37428812b8b 100644 (file)
@@ -400,7 +400,7 @@ option(WITH_RADOSGW_AMQP_ENDPOINT "Rados Gateway's pubsub support for AMQP push
 option(WITH_RADOSGW_KAFKA_ENDPOINT "Rados Gateway's pubsub support for Kafka push endpoint" ON)
 option(WITH_RADOSGW_LUA_PACKAGES "Rados Gateway's support for dynamically adding lua packagess" ON)
 option(WITH_RADOSGW_DBSTORE "DBStore backend for Rados Gateway" ON)
-option(WITH_RADOSGW_SELECT_PARQUET "Support for s3 select on parquet objects" OFF)
+option(WITH_RADOSGW_SELECT_PARQUET "Support for s3 select on parquet objects" ON)
 
 if(WITH_RADOSGW)
   find_package(EXPAT REQUIRED)
index bb3c0553af5a3c5ad62803d7a4e5cd0af8616e11..4d74d613c96836abcfc6585986f0346b5d9bb0aa 100644 (file)
@@ -260,10 +260,12 @@ BuildRequires:    socat
 %if 0%{with zbd}
 BuildRequires:  libzbd-devel
 %endif
+BuildRequires:  thrift-devel >= 0.13.0
+BuildRequires:  re2-devel
+BuildRequires:  utf8proc-devel >= 2.2.0
 %if 0%{with jaeger}
 BuildRequires:  bison
 BuildRequires:  flex
-BuildRequires:  thrift-devel >= 0.13.0
 %if 0%{?fedora} || 0%{?rhel}
 BuildRequires:  json-devel
 %endif
diff --git a/cmake/modules/BuildArrow.cmake b/cmake/modules/BuildArrow.cmake
new file mode 100644 (file)
index 0000000..45ebb69
--- /dev/null
@@ -0,0 +1,154 @@
+# apache arrow and its parquet library are used in radosgw for s3 select
+
+# build_arrow()
+#
+# Configures the arrow/cpp sources under the caller's CMAKE_CURRENT_SOURCE_DIR
+# as the ExternalProject "arrow_ext", then defines the static imported targets
+# Arrow::Arrow and Arrow::Parquet on top of its installed libraries.
+# Takes no arguments. Reads utf8proc_ROOT, BOOST_ROOT, Boost_INCLUDE_DIR,
+# Boost_LIBRARY_DIR_RELEASE and brotli_libs from the calling scope, along
+# with the WITH_SYSTEM_*/HAVE_* switches tested below.
+function(build_arrow)
+  # only enable the parquet component
+  set(arrow_CMAKE_ARGS -DARROW_PARQUET=ON)
+
+  # only use preinstalled dependencies for arrow, don't fetch/build any
+  list(APPEND arrow_CMAKE_ARGS -DARROW_DEPENDENCY_SOURCE=SYSTEM)
+
+  # only build static version of arrow and parquet
+  list(APPEND arrow_CMAKE_ARGS -DARROW_BUILD_SHARED=OFF)
+  list(APPEND arrow_CMAKE_ARGS -DARROW_BUILD_STATIC=ON)
+
+  # arrow only supports its own bundled version of jemalloc, so can't
+  # share the version ceph is using
+  list(APPEND arrow_CMAKE_ARGS -DARROW_JEMALLOC=OFF)
+
+  # transitive dependencies
+  list(APPEND arrow_INTERFACE_LINK_LIBRARIES thrift)
+
+  if (NOT WITH_SYSTEM_UTF8PROC)
+    # forward utf8proc_ROOT from build_utf8proc()
+    list(APPEND arrow_CMAKE_ARGS -Dutf8proc_ROOT=${utf8proc_ROOT})
+    # non-system utf8proc is bundled as a static library
+    list(APPEND arrow_CMAKE_ARGS -DARROW_UTF8PROC_USE_SHARED=OFF)
+    # make sure utf8proc submodule builds first, so arrow can find its byproducts
+    list(APPEND arrow_DEPENDS utf8proc::utf8proc)
+  endif()
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_BROTLI=${HAVE_BROTLI})
+  if (HAVE_BROTLI) # optional, off by default
+    list(APPEND arrow_INTERFACE_LINK_LIBRARIES ${brotli_libs})
+  endif (HAVE_BROTLI)
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_BZ2=OFF)
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_LZ4=${HAVE_LZ4})
+  if (HAVE_LZ4) # optional, on by default
+    list(APPEND arrow_INTERFACE_LINK_LIBRARIES LZ4::LZ4)
+  endif (HAVE_LZ4)
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_SNAPPY=ON) # required
+  list(APPEND arrow_INTERFACE_LINK_LIBRARIES snappy::snappy)
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_ZLIB=ON) # required
+  list(APPEND arrow_INTERFACE_LINK_LIBRARIES ZLIB::ZLIB)
+
+  list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_ZSTD=${WITH_SYSTEM_ZSTD})
+  if (WITH_SYSTEM_ZSTD)
+    find_package(Zstd 1.4.4 REQUIRED)
+    list(APPEND arrow_INTERFACE_LINK_LIBRARIES Zstd::Zstd)
+  endif (WITH_SYSTEM_ZSTD)
+
+  # forward the boost location via the FindBoost hint variables. the library
+  # dir comes from Boost_LIBRARY_DIR_RELEASE, which build_boost() sets
+  # alongside BOOST_ROOT and Boost_INCLUDE_DIR
+  list(APPEND arrow_CMAKE_ARGS -DBOOST_ROOT=${BOOST_ROOT})
+  list(APPEND arrow_CMAKE_ARGS -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIR})
+  list(APPEND arrow_CMAKE_ARGS -DBOOST_LIBRARYDIR=${Boost_LIBRARY_DIR_RELEASE})
+
+  if (NOT WITH_SYSTEM_BOOST)
+    # make sure boost submodule builds first, so arrow can find its byproducts
+    list(APPEND arrow_DEPENDS Boost)
+  endif()
+
+  # cmake doesn't properly handle arguments containing ";", such as
+  # CMAKE_PREFIX_PATH, for which reason we'll have to use some other separator.
+  string(REPLACE ";" "!" CMAKE_PREFIX_PATH_ALT_SEP "${CMAKE_PREFIX_PATH}")
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH_ALT_SEP})
+  if(CMAKE_TOOLCHAIN_FILE)
+    list(APPEND arrow_CMAKE_ARGS
+         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE})
+  endif()
+
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER})
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER})
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
+  if(CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "None")
+    list(APPEND arrow_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
+  else()
+    list(APPEND arrow_CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release)
+  endif()
+
+  # we use an external project and copy the sources to bin directory to ensure
+  # that object files are built outside of the source tree.
+  include(ExternalProject)
+  set(arrow_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/arrow/cpp")
+  set(arrow_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow/cpp")
+
+  set(arrow_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow")
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${arrow_INSTALL_PREFIX})
+
+  set(arrow_INSTALL_LIBDIR "lib") # force lib so we don't have to guess between lib/lib64
+  list(APPEND arrow_CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${arrow_INSTALL_LIBDIR})
+  set(arrow_LIBRARY_DIR "${arrow_INSTALL_PREFIX}/${arrow_INSTALL_LIBDIR}")
+
+  set(arrow_LIBRARY "${arrow_LIBRARY_DIR}/libarrow.a")
+  set(parquet_LIBRARY "${arrow_LIBRARY_DIR}/libparquet.a")
+
+  set(arrow_INCLUDE_DIR "${arrow_INSTALL_PREFIX}/include")
+  # this include directory won't exist until the install step, but the
+  # imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES
+  file(MAKE_DIRECTORY "${arrow_INCLUDE_DIR}")
+
+  set(arrow_BYPRODUCTS ${arrow_LIBRARY})
+  list(APPEND arrow_BYPRODUCTS ${parquet_LIBRARY})
+
+  if(CMAKE_MAKE_PROGRAM MATCHES "make")
+    # try to inherit command line arguments passed by parent "make" job
+    set(make_cmd $(MAKE))
+    set(install_cmd $(MAKE) install)
+  else()
+    set(make_cmd ${CMAKE_COMMAND} --build <BINARY_DIR>)
+    set(install_cmd ${CMAKE_COMMAND} --build <BINARY_DIR> --target install)
+  endif()
+
+  # clear the DESTDIR environment variable from debian/rules,
+  # because it messes with the internal install paths of arrow's bundled deps
+  set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR)
+
+  ExternalProject_Add(arrow_ext
+    SOURCE_DIR "${arrow_SOURCE_DIR}"
+    CMAKE_ARGS ${arrow_CMAKE_ARGS}
+    BINARY_DIR "${arrow_BINARY_DIR}"
+    BUILD_COMMAND ${NO_DESTDIR_COMMAND} ${make_cmd}
+    BUILD_BYPRODUCTS "${arrow_BYPRODUCTS}"
+    INSTALL_COMMAND ${NO_DESTDIR_COMMAND} ${install_cmd}
+    INSTALL_DIR "${arrow_INSTALL_PREFIX}"
+    DEPENDS "${arrow_DEPENDS}"
+    LIST_SEPARATOR !)
+
+  add_library(Arrow::Arrow STATIC IMPORTED)
+  add_dependencies(Arrow::Arrow arrow_ext)
+  set_target_properties(Arrow::Arrow PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${arrow_INCLUDE_DIR}"
+    INTERFACE_LINK_LIBRARIES "${arrow_INTERFACE_LINK_LIBRARIES}"
+    IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+    IMPORTED_LOCATION "${arrow_LIBRARY}")
+
+  add_library(Arrow::Parquet STATIC IMPORTED)
+  add_dependencies(Arrow::Parquet arrow_ext)
+  target_link_libraries(Arrow::Parquet INTERFACE Arrow::Arrow)
+  set_target_properties(Arrow::Parquet PROPERTIES
+    IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+    IMPORTED_LOCATION "${parquet_LIBRARY}")
+endfunction()
index 36e430e9ed7dd90c37cfdbe90860ca5ecb85d6fe..bdda30f708ef83a9b5ca2ea361883d9ae984aa37 100644 (file)
@@ -1,8 +1,9 @@
-# This module builds Boost
-# executables are. It sets the following variables:
+# This module builds Boost. It sets the following variables:
 #
 #  Boost_FOUND : boolean            - system has Boost
+#  BOOST_ROOT : path
 #  Boost_LIBRARIES : list(filepath) - the libraries needed to use Boost
+#  Boost_LIBRARY_DIR_RELEASE : path - the library path
 #  Boost_INCLUDE_DIRS : list(path)  - the Boost include directories
 #
 # Following hints are respected
@@ -190,8 +191,10 @@ macro(build_boost version)
   # target, so we can collect "Boost_LIBRARIES" which is then used by
   # ExternalProject_Add(Boost ...)
   set(install_dir "${CMAKE_BINARY_DIR}/boost")
+  set(BOOST_ROOT ${install_dir})
   set(Boost_INCLUDE_DIRS ${install_dir}/include)
   set(Boost_INCLUDE_DIR ${install_dir}/include)
+  set(Boost_LIBRARY_DIR_RELEASE ${install_dir}/lib)
   set(Boost_VERSION ${version})
   # create the directory so cmake won't complain when looking at the imported
   # target
index a49e44f96d88db78fc4e2ec8bb9fa23a0becd694..e3e38af28cfa50f6d706296559cc496650477032 100644 (file)
@@ -65,9 +65,11 @@ Build-Depends: automake,
                libnl-genl-3-dev,
                libxml2-dev,
                librabbitmq-dev,
+               libre2-dev,
+               libutf8proc-dev (>= 2.2.0),
                librdkafka-dev,
                luarocks,
-               libthrift-dev (>= 0.13.0) <pkg.ceph.jaeger>,
+               libthrift-dev (>= 0.13.0),
                libyaml-cpp-dev (>= 0.6) <pkg.ceph.crimson> <pkg.ceph.jaeger>,
                libzstd-dev <pkg.ceph.check>,
                libxmlsec1 <pkg.ceph.check>,
index abace5f0b19af7e443b83cb55623677cc32d8e14..3118d2f64fa300e5d01fc8b81ee069e15ed628c4 100644 (file)
@@ -856,6 +856,12 @@ if(WITH_KVS)
 endif(WITH_KVS)
 
 if(WITH_RADOSGW)
+  if(WITH_RADOSGW_SELECT_PARQUET AND NOT WITH_SYSTEM_ARROW)
+    find_package(thrift 0.13 REQUIRED) # a dependency of arrow
+    include(BuildArrow)
+    build_arrow()
+  endif()
+
   add_subdirectory(libkmip)
   add_subdirectory(rgw)
 endif(WITH_RADOSGW)
diff --git a/src/arrow b/src/arrow
new file mode 160000 (submodule)
index 0000000..347a88f
--- /dev/null
+++ b/src/arrow
@@ -0,0 +1 @@
+Subproject commit 347a88ff9d20e2a4061eec0b455b8ea1aa8335dc
index 740d91e92fa7933b096a0daac76b186848c075ab..faa7388a43f776608f9347bc0b2516e13932cc08 100644 (file)
@@ -4,8 +4,7 @@ if(NOT GPERF)
 endif()
 
 if(WITH_RADOSGW_SELECT_PARQUET)
-  find_package(Arrow QUIET REQUIRED)
-  set(ARROW_LIBRARIES "-larrow -lparquet")
+  set(ARROW_LIBRARIES Arrow::Parquet)
   add_definitions(-D_ARROW_EXIST)
   message("-- arrow is installed, radosgw/s3select-op is able to process parquet objects")
 endif(WITH_RADOSGW_SELECT_PARQUET)