From: Casey Bodley Date: Thu, 20 Jan 2022 15:22:27 +0000 (-0500) Subject: cmake: add submodule for Apache Arrow at v6.0.1 X-Git-Tag: v17.2.0~18^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f8011f17f62fba747aa179963828c68a930ef03c;p=ceph.git cmake: add submodule for Apache Arrow at v6.0.1 adds an arrow submodule. when WITH_RADOSGW_SELECT_PARQUET is enabled, the submodule is built as an external project and rgw links against its imported Arrow::Parquet target Signed-off-by: Casey Bodley (cherry picked from commit 2ca6d75521541e99ebb6101f6d350f92a6797a8b) Conflicts: CMakeLists.txt master has an extra option WITH_RADOSGW_MOTR --- diff --git a/.gitmodules b/.gitmodules index 1fd459a125e..5100e7d23bc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -66,3 +66,6 @@ path = src/libkmip url = https://github.com/ceph/libkmip branch = ceph-master +[submodule "src/arrow"] + path = src/arrow + url = https://github.com/apache/arrow.git diff --git a/CMakeLists.txt b/CMakeLists.txt index a8bc4d7fd46..305ae035121 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -400,7 +400,7 @@ option(WITH_RADOSGW_AMQP_ENDPOINT "Rados Gateway's pubsub support for AMQP push option(WITH_RADOSGW_KAFKA_ENDPOINT "Rados Gateway's pubsub support for Kafka push endpoint" ON) option(WITH_RADOSGW_LUA_PACKAGES "Rados Gateway's support for dynamically adding lua packagess" ON) option(WITH_RADOSGW_DBSTORE "DBStore backend for Rados Gateway" ON) -option(WITH_RADOSGW_SELECT_PARQUET "Support for s3 select on parquet objects" OFF) +option(WITH_RADOSGW_SELECT_PARQUET "Support for s3 select on parquet objects" ON) if(WITH_RADOSGW) find_package(EXPAT REQUIRED) diff --git a/ceph.spec.in b/ceph.spec.in index bb3c0553af5..4d74d613c96 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -260,10 +260,12 @@ BuildRequires: socat %if 0%{with zbd} BuildRequires: libzbd-devel %endif +BuildRequires: thrift-devel >= 0.13.0 +BuildRequires: re2-devel +BuildRequires: utf8proc-devel >= 2.2.0 %if 
0%{with jaeger} BuildRequires: bison BuildRequires: flex -BuildRequires: thrift-devel >= 0.13.0 %if 0%{?fedora} || 0%{?rhel} BuildRequires: json-devel %endif diff --git a/cmake/modules/BuildArrow.cmake b/cmake/modules/BuildArrow.cmake new file mode 100644 index 00000000000..45ebb697446 --- /dev/null +++ b/cmake/modules/BuildArrow.cmake @@ -0,0 +1,143 @@ +# apache arrow and its parquet library are used in radosgw for s3 select + +function(build_arrow) + # only enable the parquet component + set(arrow_CMAKE_ARGS -DARROW_PARQUET=ON) + + # only use preinstalled dependencies for arrow, don't fetch/build any + list(APPEND arrow_CMAKE_ARGS -DARROW_DEPENDENCY_SOURCE=SYSTEM) + + # only build static version of arrow and parquet + list(APPEND arrow_CMAKE_ARGS -DARROW_BUILD_SHARED=OFF) + list(APPEND arrow_CMAKE_ARGS -DARROW_BUILD_STATIC=ON) + + # arrow only supports its own bundled version of jemalloc, so can't + # share the version ceph is using + list(APPEND arrow_CMAKE_ARGS -DARROW_JEMALLOC=OFF) + + # transitive dependencies + list(APPEND arrow_INTERFACE_LINK_LIBRARIES thrift) + + if (NOT WITH_SYSTEM_UTF8PROC) + # forward utf8proc_ROOT from build_utf8proc() + list(APPEND arrow_CMAKE_ARGS -Dutf8proc_ROOT=${utf8proc_ROOT}) + # non-system utf8proc is bundled as a static library + list(APPEND arrow_CMAKE_ARGS -DARROW_UTF8PROC_USE_SHARED=OFF) + # make sure utf8proc submodule builds first, so arrow can find its byproducts + list(APPEND arrow_DEPENDS utf8proc::utf8proc) + endif() + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_BROTLI=${HAVE_BROTLI}) + if (HAVE_BROTLI) # optional, off by default + list(APPEND arrow_INTERFACE_LINK_LIBRARIES ${brotli_libs}) + endif (HAVE_BROTLI) + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_BZ2=OFF) + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_LZ4=${HAVE_LZ4}) + if (HAVE_LZ4) # optional, on by default + list(APPEND arrow_INTERFACE_LINK_LIBRARIES LZ4::LZ4) + endif (HAVE_LZ4) + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_SNAPPY=ON) # required + 
list(APPEND arrow_INTERFACE_LINK_LIBRARIES snappy::snappy) + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_ZLIB=ON) # required + list(APPEND arrow_INTERFACE_LINK_LIBRARIES ZLIB::ZLIB) + + list(APPEND arrow_CMAKE_ARGS -DARROW_WITH_ZSTD=${WITH_SYSTEM_ZSTD}) + if (WITH_SYSTEM_ZSTD) + find_package(Zstd 1.4.4 REQUIRED) + list(APPEND arrow_INTERFACE_LINK_LIBRARIES Zstd::Zstd) + endif (WITH_SYSTEM_ZSTD) + + list(APPEND arrow_CMAKE_ARGS -DBOOST_ROOT=${BOOST_ROOT}) + list(APPEND arrow_CMAKE_ARGS -DBOOST_INCLUDEDIR=${Boost_INCLUDE_DIR}) + list(APPEND arrow_CMAKE_ARGS -DBOOST_LIBRARYDIR=${BOOST_LIBRARYDIR}) + + if (NOT WITH_SYSTEM_BOOST) + # make sure boost submodule builds first, so arrow can find its byproducts + list(APPEND arrow_DEPENDS Boost) + endif() + + # cmake doesn't properly handle arguments containing ";", such as + # CMAKE_PREFIX_PATH, for which reason we'll have to use some other separator. + string(REPLACE ";" "!" CMAKE_PREFIX_PATH_ALT_SEP "${CMAKE_PREFIX_PATH}") + list(APPEND arrow_CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH_ALT_SEP}) + if(CMAKE_TOOLCHAIN_FILE) + list(APPEND arrow_CMAKE_ARGS + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) + endif() + + list(APPEND arrow_CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}) + list(APPEND arrow_CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) + list(APPEND arrow_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR}) + if(CMAKE_BUILD_TYPE AND NOT CMAKE_BUILD_TYPE STREQUAL "None") + list(APPEND arrow_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) + else() + list(APPEND arrow_CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release) + endif() + + # we use an external project and copy the sources to bin directory to ensure + # that object files are built outside of the source tree. 
+ include(ExternalProject) + set(arrow_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/arrow/cpp") + set(arrow_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow/cpp") + + set(arrow_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow") + list(APPEND arrow_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${arrow_INSTALL_PREFIX}) + + set(arrow_INSTALL_LIBDIR "lib") # force lib so we don't have to guess between lib/lib64 + list(APPEND arrow_CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${arrow_INSTALL_LIBDIR}) + set(arrow_LIBRARY_DIR "${arrow_INSTALL_PREFIX}/${arrow_INSTALL_LIBDIR}") + + set(arrow_LIBRARY "${arrow_LIBRARY_DIR}/libarrow.a") + set(parquet_LIBRARY "${arrow_LIBRARY_DIR}/libparquet.a") + + set(arrow_INCLUDE_DIR "${arrow_INSTALL_PREFIX}/include") + # this include directory won't exist until the install step, but the + # imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES + file(MAKE_DIRECTORY "${arrow_INCLUDE_DIR}") + + set(arrow_BYPRODUCTS ${arrow_LIBRARY}) + list(APPEND arrow_BYPRODUCTS ${parquet_LIBRARY}) + + if(CMAKE_MAKE_PROGRAM MATCHES "make") + # try to inherit command line arguments passed by parent "make" job + set(make_cmd $(MAKE)) + set(install_cmd $(MAKE) install) + else() + set(make_cmd ${CMAKE_COMMAND} --build <BINARY_DIR>) + set(install_cmd ${CMAKE_COMMAND} --build <BINARY_DIR> --target install) + endif() + + # clear the DESTDIR environment variable from debian/rules, + # because it messes with the internal install paths of arrow's bundled deps + set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR) + + ExternalProject_Add(arrow_ext + SOURCE_DIR "${arrow_SOURCE_DIR}" + CMAKE_ARGS ${arrow_CMAKE_ARGS} + BINARY_DIR "${arrow_BINARY_DIR}" + BUILD_COMMAND ${NO_DESTDIR_COMMAND} ${make_cmd} + BUILD_BYPRODUCTS "${arrow_BYPRODUCTS}" + INSTALL_COMMAND ${NO_DESTDIR_COMMAND} ${install_cmd} + INSTALL_DIR "${arrow_INSTALL_PREFIX}" + DEPENDS "${arrow_DEPENDS}" + LIST_SEPARATOR !) 
+ + add_library(Arrow::Arrow STATIC IMPORTED) + add_dependencies(Arrow::Arrow arrow_ext) + set_target_properties(Arrow::Arrow PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${arrow_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${arrow_INTERFACE_LINK_LIBRARIES}" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${arrow_LIBRARY}") + + add_library(Arrow::Parquet STATIC IMPORTED) + add_dependencies(Arrow::Parquet arrow_ext) + target_link_libraries(Arrow::Parquet INTERFACE Arrow::Arrow) + set_target_properties(Arrow::Parquet PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${parquet_LIBRARY}") +endfunction() diff --git a/cmake/modules/BuildBoost.cmake b/cmake/modules/BuildBoost.cmake index 36e430e9ed7..bdda30f708e 100644 --- a/cmake/modules/BuildBoost.cmake +++ b/cmake/modules/BuildBoost.cmake @@ -1,8 +1,9 @@ -# This module builds Boost -# executables are. It sets the following variables: +# This module builds Boost. It sets the following variables: # # Boost_FOUND : boolean - system has Boost +# BOOST_ROOT : path # Boost_LIBRARIES : list(filepath) - the libraries needed to use Boost +# Boost_LIBRARY_DIR_RELEASE : path - the library path # Boost_INCLUDE_DIRS : list(path) - the Boost include directories # # Following hints are respected @@ -190,8 +191,10 @@ macro(build_boost version) # target, so we can collect "Boost_LIBRARIES" which is then used by # ExternalProject_Add(Boost ...) 
set(install_dir "${CMAKE_BINARY_DIR}/boost") + set(BOOST_ROOT ${install_dir}) set(Boost_INCLUDE_DIRS ${install_dir}/include) set(Boost_INCLUDE_DIR ${install_dir}/include) + set(Boost_LIBRARY_DIR_RELEASE ${install_dir}/lib) set(Boost_VERSION ${version}) # create the directory so cmake won't complain when looking at the imported # target diff --git a/debian/control b/debian/control index a49e44f96d8..e3e38af28cf 100644 --- a/debian/control +++ b/debian/control @@ -65,9 +65,11 @@ Build-Depends: automake, libnl-genl-3-dev, libxml2-dev, librabbitmq-dev, + libre2-dev, + libutf8proc-dev (>= 2.2.0), librdkafka-dev, luarocks, - libthrift-dev (>= 0.13.0) , + libthrift-dev (>= 0.13.0), libyaml-cpp-dev (>= 0.6) , libzstd-dev , libxmlsec1 , diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index abace5f0b19..3118d2f64fa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -856,6 +856,12 @@ if(WITH_KVS) endif(WITH_KVS) if(WITH_RADOSGW) + if(WITH_RADOSGW_SELECT_PARQUET AND NOT WITH_SYSTEM_ARROW) + find_package(thrift 0.13 REQUIRED) # a dependency of arrow + include(BuildArrow) + build_arrow() + endif() + add_subdirectory(libkmip) add_subdirectory(rgw) endif(WITH_RADOSGW) diff --git a/src/arrow b/src/arrow new file mode 160000 index 00000000000..347a88ff9d2 --- /dev/null +++ b/src/arrow @@ -0,0 +1 @@ +Subproject commit 347a88ff9d20e2a4061eec0b455b8ea1aa8335dc diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt index 740d91e92fa..faa7388a43f 100644 --- a/src/rgw/CMakeLists.txt +++ b/src/rgw/CMakeLists.txt @@ -4,8 +4,7 @@ if(NOT GPERF) endif() if(WITH_RADOSGW_SELECT_PARQUET) - find_package(Arrow QUIET REQUIRED) - set(ARROW_LIBRARIES "-larrow -lparquet") + set(ARROW_LIBRARIES Arrow::Parquet) add_definitions(-D_ARROW_EXIST) message("-- arrow is installed, radosgw/s3select-op is able to process parquet objects") endif(WITH_RADOSGW_SELECT_PARQUET)