tophat-2.0.9/LICENSE

===========================================================================
 Boost Software License, Version 1.0
===========================================================================

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

tophat-2.0.9/aclocal.m4

# generated automatically by aclocal 1.9.6 -*- Autoconf -*-

# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
# 2005  Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.

# Copyright (C) 2002, 2003, 2005  Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.

# AM_AUTOMAKE_VERSION(VERSION)
# ----------------------------
# Automake X.Y traces this macro to ensure aclocal.m4 has been
# generated from the m4 files accompanying Automake X.Y.
AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.9"])

# AM_SET_CURRENT_AUTOMAKE_VERSION
# -------------------------------
# Call AM_AUTOMAKE_VERSION so it can be traced.
# This function is AC_REQUIREd by AC_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
	 [AM_AUTOMAKE_VERSION([1.9.6])])

# AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.

# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
# $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
# # Of course, Automake must honor this variable whenever it calls a # tool from the auxiliary directory. The problem is that $srcdir (and # therefore $ac_aux_dir as well) can be either absolute or relative, # depending on how configure is run. This is pretty annoying, since # it makes $ac_aux_dir quite unusable in subdirectories: in the top # source directory, any form will work fine, but in subdirectories a # relative path needs to be adjusted first. # # $ac_aux_dir/missing # fails when called from a subdirectory if $ac_aux_dir is relative # $top_srcdir/$ac_aux_dir/missing # fails if $ac_aux_dir is absolute, # fails when called from a subdirectory in a VPATH build with # a relative $ac_aux_dir # # The reason of the latter failure is that $top_srcdir and $ac_aux_dir # are both prefixed by $srcdir. In an in-source build this is usually # harmless because $srcdir is `.', but things will broke when you # start a VPATH build or use an absolute $srcdir. # # So we could use something similar to $top_srcdir/$ac_aux_dir/missing, # iff we strip the leading $srcdir from $ac_aux_dir. That would be: # am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` # and then we would define $MISSING as # MISSING="\${SHELL} $am_aux_dir/missing" # This will work as long as MISSING is not called from configure, because # unfortunately $(top_srcdir) has no meaning in configure. # However there are other variables, like CC, which are often used in # configure, and could therefore not use this "fixed" $ac_aux_dir. # # Another solution, used here, is to always expand $ac_aux_dir to an # absolute PATH. The drawback is that using absolute paths prevent a # configured tree to be moved without reconfiguration. AC_DEFUN([AM_AUX_DIR_EXPAND], [dnl Rely on autoconf to set up CDPATH properly. AC_PREREQ([2.50])dnl # expand $ac_aux_dir to an absolute path am_aux_dir=`cd $ac_aux_dir && pwd` ]) # AM_CONDITIONAL -*- Autoconf -*- # Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 7 # AM_CONDITIONAL(NAME, SHELL-CONDITION) # ------------------------------------- # Define a conditional. AC_DEFUN([AM_CONDITIONAL], [AC_PREREQ(2.52)dnl ifelse([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl AC_SUBST([$1_TRUE]) AC_SUBST([$1_FALSE]) if $2; then $1_TRUE= $1_FALSE='#' else $1_TRUE='#' $1_FALSE= fi AC_CONFIG_COMMANDS_PRE( [if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then AC_MSG_ERROR([[conditional "$1" was never defined. Usually this means the macro was only invoked conditionally.]]) fi])]) # Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 8 # There are a few dirty hacks below to avoid letting `AC_PROG_CC' be # written in clear, in which case automake, when reading aclocal.m4, # will think it sees a *use*, and therefore will trigger all it's # C support machinery. Also note that it means that autoscan, seeing # CC etc. in the Makefile, will ask for an AC_PROG_CC use... # _AM_DEPENDENCIES(NAME) # ---------------------- # See how the compiler implements dependency checking. 
# NAME is "CC", "CXX", "GCJ", or "OBJC". # We try a few techniques and use that to set a single cache variable. # # We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was # modified to invoke _AM_DEPENDENCIES(CC); we would have a circular # dependency, and given that the user is not expected to run this macro, # just rely on AC_PROG_CC. AC_DEFUN([_AM_DEPENDENCIES], [AC_REQUIRE([AM_SET_DEPDIR])dnl AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl AC_REQUIRE([AM_MAKE_INCLUDE])dnl AC_REQUIRE([AM_DEP_TRACK])dnl ifelse([$1], CC, [depcc="$CC" am_compiler_list=], [$1], CXX, [depcc="$CXX" am_compiler_list=], [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'], [$1], GCJ, [depcc="$GCJ" am_compiler_list='gcc3 gcc'], [depcc="$$1" am_compiler_list=]) AC_CACHE_CHECK([dependency style of $depcc], [am_cv_$1_dependencies_compiler_type], [if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. cp "$am_depcomp" conftest.dir cd conftest.dir # We will build objects and dependencies in a subdirectory because # it helps to detect inapplicable dependency modes. For instance # both Tru64's cc and ICC support -MD to output dependencies as a # side effect of compilation, but ICC will put the dependencies in # the current directory while Tru64 will put them in the object # directory. mkdir sub am_cv_$1_dependencies_compiler_type=none if test "$am_compiler_list" = ""; then am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` fi for depmode in $am_compiler_list; do # Setup a source with many dependencies, because some compilers # like to wrap large dependency lists on column 80 (with \), and # we should not choose a depcomp mode which is confused by this. # # We need to recreate these files for each test, as the compiler may # overwrite some of them when testing with obscure command lines. # This happens at least with the AIX C compiler. : > sub/conftest.c for i in 1 2 3 4 5 6; do echo '#include "conftst'$i'.h"' >> sub/conftest.c # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with # Solaris 8's {/usr,}/bin/sh. touch sub/conftst$i.h done echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf case $depmode in nosideeffect) # after this tag, mechanisms are not by side-effect, so they'll # only be used when explicitly requested if test "x$enable_dependency_tracking" = xyes; then continue else break fi ;; none) break ;; esac # We check with `-c' and `-o' for the sake of the "dashmstdout" # mode. It turns out that the SunPro C++ compiler does not properly # handle `-M -o', and we need to detect this. if depmode=$depmode \ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ >/dev/null 2>conftest.err && grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && ${MAKE-make} -s -f confmf > /dev/null 2>&1; then # icc doesn't choke on unknown options, it will just issue warnings # or remarks (even with -Werror). 
So we grep stderr for any message # that says an option was ignored or not supported. # When given -MP, icc 7.0 and 7.1 complain thusly: # icc: Command line warning: ignoring option '-M'; no argument required # The diagnosis changed in icc 8.0: # icc: Command line remark: option '-MP' not supported if (grep 'ignoring option' conftest.err || grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else am_cv_$1_dependencies_compiler_type=$depmode break fi fi done cd .. rm -rf conftest.dir else am_cv_$1_dependencies_compiler_type=none fi ]) AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) AM_CONDITIONAL([am__fastdep$1], [ test "x$enable_dependency_tracking" != xno \ && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) ]) # AM_SET_DEPDIR # ------------- # Choose a directory name for dependency files. # This macro is AC_REQUIREd in _AM_DEPENDENCIES AC_DEFUN([AM_SET_DEPDIR], [AC_REQUIRE([AM_SET_LEADING_DOT])dnl AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl ]) # AM_DEP_TRACK # ------------ AC_DEFUN([AM_DEP_TRACK], [AC_ARG_ENABLE(dependency-tracking, [ --disable-dependency-tracking speeds up one-time build --enable-dependency-tracking do not reject slow dependency extractors]) if test "x$enable_dependency_tracking" != xno; then am_depcomp="$ac_aux_dir/depcomp" AMDEPBACKSLASH='\' fi AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) AC_SUBST([AMDEPBACKSLASH]) ]) # Generate code to set up dependency tracking. -*- Autoconf -*- # Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. #serial 3 # _AM_OUTPUT_DEPENDENCY_COMMANDS # ------------------------------ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], [for mf in $CONFIG_FILES; do # Strip MF so we end up with the name of the file. mf=`echo "$mf" | sed -e 's/:.*$//'` # Check whether this is an Automake generated Makefile or not. # We used to match only the files named `Makefile.in', but # some people rename them; so instead we look at the file content. # Grep'ing the first line is not enough: some people post-process # each Makefile.in and add a new line on top of each file to say so. # So let's grep whole file. if grep '^#.*generated by automake' $mf > /dev/null 2>&1; then dirpart=`AS_DIRNAME("$mf")` else continue fi # Extract the definition of DEPDIR, am__include, and am__quote # from the Makefile without running `make'. DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` test -z "$DEPDIR" && continue am__include=`sed -n 's/^am__include = //p' < "$mf"` test -z "am__include" && continue am__quote=`sed -n 's/^am__quote = //p' < "$mf"` # When using ansi2knr, U may be empty or an underscore; expand it U=`sed -n 's/^U = //p' < "$mf"` # Find all dependency output files, they are included files with # $(DEPDIR) in their names. We invoke sed twice because it is the # simplest approach to changing $(DEPDIR) to its actual value in the # expansion. for file in `sed -n " s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do # Make sure the directory exists. 
test -f "$dirpart/$file" && continue fdir=`AS_DIRNAME(["$file"])` AS_MKDIR_P([$dirpart/$fdir]) # echo "creating $dirpart/$file" echo '# dummy' > "$dirpart/$file" done done ])# _AM_OUTPUT_DEPENDENCY_COMMANDS # AM_OUTPUT_DEPENDENCY_COMMANDS # ----------------------------- # This macro should only be invoked once -- use via AC_REQUIRE. # # This code is only required when automatic dependency tracking # is enabled. FIXME. This creates each `.P' file that we will # need in order to bootstrap the dependency handling code. AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], [AC_CONFIG_COMMANDS([depfiles], [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) ]) # Do all the work for Automake. -*- Autoconf -*- # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 12 # This macro actually does too much. Some checks are only needed if # your package does certain things. But this isn't really a big deal. # AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) # AM_INIT_AUTOMAKE([OPTIONS]) # ----------------------------------------------- # The call with PACKAGE and VERSION arguments is the old style # call (pre autoconf-2.50), which is being phased out. PACKAGE # and VERSION should now be passed to AC_INIT and removed from # the call to AM_INIT_AUTOMAKE. # We support both call styles for the transition. After # the next Automake release, Autoconf can make the AC_INIT # arguments mandatory, and then we can depend on a new Autoconf # release and drop the old call support. AC_DEFUN([AM_INIT_AUTOMAKE], [AC_PREREQ([2.58])dnl dnl Autoconf wants to disallow AM_ names. We explicitly allow dnl the ones we care about. m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl AC_REQUIRE([AC_PROG_INSTALL])dnl # test to see if srcdir already configured if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) fi # test whether we have cygpath if test -z "$CYGPATH_W"; then if (cygpath --version) >/dev/null 2>/dev/null; then CYGPATH_W='cygpath -w' else CYGPATH_W=echo fi fi AC_SUBST([CYGPATH_W]) # Define the identity of the package. dnl Distinguish between old-style and new-style calls. m4_ifval([$2], [m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl AC_SUBST([PACKAGE], [$1])dnl AC_SUBST([VERSION], [$2])], [_AM_SET_OPTIONS([$1])dnl AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl _AM_IF_OPTION([no-define],, [AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package]) AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl # Some tools Automake needs. AC_REQUIRE([AM_SANITY_CHECK])dnl AC_REQUIRE([AC_ARG_PROGRAM])dnl AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version}) AM_MISSING_PROG(AUTOCONF, autoconf) AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version}) AM_MISSING_PROG(AUTOHEADER, autoheader) AM_MISSING_PROG(MAKEINFO, makeinfo) AM_PROG_INSTALL_SH AM_PROG_INSTALL_STRIP AC_REQUIRE([AM_PROG_MKDIR_P])dnl # We need awk for the "check" target. The system "awk" is bad on # some platforms. 
AC_REQUIRE([AC_PROG_AWK])dnl AC_REQUIRE([AC_PROG_MAKE_SET])dnl AC_REQUIRE([AM_SET_LEADING_DOT])dnl _AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], [_AM_PROG_TAR([v7])])]) _AM_IF_OPTION([no-dependencies],, [AC_PROVIDE_IFELSE([AC_PROG_CC], [_AM_DEPENDENCIES(CC)], [define([AC_PROG_CC], defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl AC_PROVIDE_IFELSE([AC_PROG_CXX], [_AM_DEPENDENCIES(CXX)], [define([AC_PROG_CXX], defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl ]) ]) # When config.status generates a header, we must update the stamp-h file. # This file resides in the same directory as the config header # that is generated. The stamp files are numbered to have different names. # Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the # loop where config.status creates the headers, so we can generate # our stamp files there. AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], [# Compute $1's index in $config_headers. _am_stamp_count=1 for _am_header in $config_headers :; do case $_am_header in $1 | $1:* ) break ;; * ) _am_stamp_count=`expr $_am_stamp_count + 1` ;; esac done echo "timestamp for $1" >`AS_DIRNAME([$1])`/stamp-h[]$_am_stamp_count]) # Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # AM_PROG_INSTALL_SH # ------------------ # Define $install_sh. AC_DEFUN([AM_PROG_INSTALL_SH], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl install_sh=${install_sh-"$am_aux_dir/install-sh"} AC_SUBST(install_sh)]) # Copyright (C) 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 2 # Check whether the underlying file-system supports filenames # with a leading dot. For instance MS-DOS doesn't. AC_DEFUN([AM_SET_LEADING_DOT], [rm -rf .tst 2>/dev/null mkdir .tst 2>/dev/null if test -d .tst; then am__leading_dot=. else am__leading_dot=_ fi rmdir .tst 2>/dev/null AC_SUBST([am__leading_dot])]) # Check to see how 'make' treats includes. -*- Autoconf -*- # Copyright (C) 2001, 2002, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 3 # AM_MAKE_INCLUDE() # ----------------- # Check to see how make treats includes. AC_DEFUN([AM_MAKE_INCLUDE], [am_make=${MAKE-make} cat > confinc << 'END' am__doit: @echo done .PHONY: am__doit END # If we don't find an include directive, just comment out the code. AC_MSG_CHECKING([for style of include used by $am_make]) am__include="#" am__quote= _am_result=none # First try GNU make style include. echo "include confinc" > confmf # We grep out `Entering directory' and `Leaving directory' # messages which can occur if `w' ends up in MAKEFLAGS. # In particular we don't look at `^make:' because GNU make might # be invoked under some other name (usually "gmake"), in which # case it prints its new name instead of `make'. if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then am__include=include am__quote= _am_result=GNU fi # Now try BSD make style include. 
if test "$am__include" = "#"; then echo '.include "confinc"' > confmf if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then am__include=.include am__quote="\"" _am_result=BSD fi fi AC_SUBST([am__include]) AC_SUBST([am__quote]) AC_MSG_RESULT([$_am_result]) rm -f confinc confmf ]) # Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- # Copyright (C) 1997, 1999, 2000, 2001, 2003, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 4 # AM_MISSING_PROG(NAME, PROGRAM) # ------------------------------ AC_DEFUN([AM_MISSING_PROG], [AC_REQUIRE([AM_MISSING_HAS_RUN]) $1=${$1-"${am_missing_run}$2"} AC_SUBST($1)]) # AM_MISSING_HAS_RUN # ------------------ # Define MISSING if not defined so far and test if it supports --run. # If it does, set am_missing_run to use it, otherwise, to nothing. AC_DEFUN([AM_MISSING_HAS_RUN], [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing" # Use eval to expand $SHELL if eval "$MISSING --run true"; then am_missing_run="$MISSING --run " else am_missing_run= AC_MSG_WARN([`missing' script is too old or missing]) fi ]) # Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # AM_PROG_MKDIR_P # --------------- # Check whether `mkdir -p' is supported, fallback to mkinstalldirs otherwise. # # Automake 1.8 used `mkdir -m 0755 -p --' to ensure that directories # created by `make install' are always world readable, even if the # installer happens to have an overly restrictive umask (e.g. 077). # This was a mistake. There are at least two reasons why we must not # use `-m 0755': # - it causes special bits like SGID to be ignored, # - it may be too restrictive (some setups expect 775 directories). # # Do not use -m 0755 and let people choose whatever they expect by # setting umask. # # We cannot accept any implementation of `mkdir' that recognizes `-p'. # Some implementations (such as Solaris 8's) are not thread-safe: if a # parallel make tries to run `mkdir -p a/b' and `mkdir -p a/c' # concurrently, both version can detect that a/ is missing, but only # one can create it and the other will error out. Consequently we # restrict ourselves to GNU make (using the --version option ensures # this.) AC_DEFUN([AM_PROG_MKDIR_P], [if mkdir -p --version . >/dev/null 2>&1 && test ! -d ./--version; then # We used to keeping the `.' as first argument, in order to # allow $(mkdir_p) to be used without argument. As in # $(mkdir_p) $(somedir) # where $(somedir) is conditionally defined. However this is wrong # for two reasons: # 1. if the package is installed by a user who cannot write `.' # make install will fail, # 2. the above comment should most certainly read # $(mkdir_p) $(DESTDIR)$(somedir) # so it does not work when $(somedir) is undefined and # $(DESTDIR) is not. # To support the latter case, we have to write # test -z "$(somedir)" || $(mkdir_p) $(DESTDIR)$(somedir), # so the `.' trick is pointless. mkdir_p='mkdir -p --' else # On NextStep and OpenStep, the `mkdir' command does not # recognize any option. It will interpret all options as # directories to create, and then abort because `.' already # exists. 
for d in ./-p ./--version; do test -d $d && rmdir $d done # $(mkinstalldirs) is defined by Automake if mkinstalldirs exists. if test -f "$ac_aux_dir/mkinstalldirs"; then mkdir_p='$(mkinstalldirs)' else mkdir_p='$(install_sh) -d' fi fi AC_SUBST([mkdir_p])]) # Helper functions for option handling. -*- Autoconf -*- # Copyright (C) 2001, 2002, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 3 # _AM_MANGLE_OPTION(NAME) # ----------------------- AC_DEFUN([_AM_MANGLE_OPTION], [[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) # _AM_SET_OPTION(NAME) # ------------------------------ # Set option NAME. Presently that only means defining a flag for this option. AC_DEFUN([_AM_SET_OPTION], [m4_define(_AM_MANGLE_OPTION([$1]), 1)]) # _AM_SET_OPTIONS(OPTIONS) # ---------------------------------- # OPTIONS is a space-separated list of Automake options. AC_DEFUN([_AM_SET_OPTIONS], [AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) # _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) # ------------------------------------------- # Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. AC_DEFUN([_AM_IF_OPTION], [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) # Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # AM_PATH_PYTHON([MINIMUM-VERSION], [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # --------------------------------------------------------------------------- # Adds support for distributing Python modules and packages. To # install modules, copy them to $(pythondir), using the python_PYTHON # automake variable. To install a package with the same name as the # automake package, install to $(pkgpythondir), or use the # pkgpython_PYTHON automake variable. # # The variables $(pyexecdir) and $(pkgpyexecdir) are provided as # locations to install python extension modules (shared libraries). # Another macro is required to find the appropriate flags to compile # extension modules. # # If your package is configured with a different prefix to python, # users will have to add the install directory to the PYTHONPATH # environment variable, or create a .pth file (see the python # documentation for details). # # If the MINIMUM-VERSION argument is passed, AM_PATH_PYTHON will # cause an error if the version of python installed on the system # doesn't meet the requirement. MINIMUM-VERSION should consist of # numbers and dots only. AC_DEFUN([AM_PATH_PYTHON], [ dnl Find a Python interpreter. Python versions prior to 1.5 are not dnl supported because the default installation locations changed from dnl $prefix/lib/site-python in 1.4 to $prefix/lib/python1.5/site-packages dnl in 1.5. m4_define_default([_AM_PYTHON_INTERPRETER_LIST], [python python2 python2.5 python2.4 python2.3 python2.2 dnl python2.1 python2.0 python1.6 python1.5]) m4_if([$1],[],[ dnl No version check is needed. # Find any Python interpreter. if test -z "$PYTHON"; then AC_PATH_PROGS([PYTHON], _AM_PYTHON_INTERPRETER_LIST, :) fi am_display_PYTHON=python ], [ dnl A version check is needed. if test -n "$PYTHON"; then # If the user set $PYTHON, use it and don't search something else. 
AC_MSG_CHECKING([whether $PYTHON version >= $1]) AM_PYTHON_CHECK_VERSION([$PYTHON], [$1], [AC_MSG_RESULT(yes)], [AC_MSG_ERROR(too old)]) am_display_PYTHON=$PYTHON else # Otherwise, try each interpreter until we find one that satisfies # VERSION. AC_CACHE_CHECK([for a Python interpreter with version >= $1], [am_cv_pathless_PYTHON],[ for am_cv_pathless_PYTHON in _AM_PYTHON_INTERPRETER_LIST none; do test "$am_cv_pathless_PYTHON" = none && break AM_PYTHON_CHECK_VERSION([$am_cv_pathless_PYTHON], [$1], [break]) done]) # Set $PYTHON to the absolute path of $am_cv_pathless_PYTHON. if test "$am_cv_pathless_PYTHON" = none; then PYTHON=: else AC_PATH_PROG([PYTHON], [$am_cv_pathless_PYTHON]) fi am_display_PYTHON=$am_cv_pathless_PYTHON fi ]) if test "$PYTHON" = :; then dnl Run any user-specified action, or abort. m4_default([$3], [AC_MSG_ERROR([no suitable Python interpreter found])]) else dnl Query Python for its version number. Getting [:3] seems to be dnl the best way to do this; it's what "site.py" does in the standard dnl library. AC_CACHE_CHECK([for $am_display_PYTHON version], [am_cv_python_version], [am_cv_python_version=`$PYTHON -c "import sys; print sys.version[[:3]]"`]) AC_SUBST([PYTHON_VERSION], [$am_cv_python_version]) dnl Use the values of $prefix and $exec_prefix for the corresponding dnl values of PYTHON_PREFIX and PYTHON_EXEC_PREFIX. These are made dnl distinct variables so they can be overridden if need be. However, dnl general consensus is that you shouldn't need this ability. AC_SUBST([PYTHON_PREFIX], ['${prefix}']) AC_SUBST([PYTHON_EXEC_PREFIX], ['${exec_prefix}']) dnl At times (like when building shared libraries) you may want dnl to know which OS platform Python thinks this is. AC_CACHE_CHECK([for $am_display_PYTHON platform], [am_cv_python_platform], [am_cv_python_platform=`$PYTHON -c "import sys; print sys.platform"`]) AC_SUBST([PYTHON_PLATFORM], [$am_cv_python_platform]) dnl Set up 4 directories: dnl pythondir -- where to install python scripts. This is the dnl site-packages directory, not the python standard library dnl directory like in previous automake betas. This behavior dnl is more consistent with lispdir.m4 for example. dnl Query distutils for this directory. distutils does not exist in dnl Python 1.5, so we fall back to the hardcoded directory if it dnl doesn't work. AC_CACHE_CHECK([for $am_display_PYTHON script directory], [am_cv_python_pythondir], [am_cv_python_pythondir=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_lib(0,0,prefix='$PYTHON_PREFIX')" 2>/dev/null || echo "$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages"`]) AC_SUBST([pythondir], [$am_cv_python_pythondir]) dnl pkgpythondir -- $PACKAGE directory under pythondir. Was dnl PYTHON_SITE_PACKAGE in previous betas, but this naming is dnl more consistent with the rest of automake. AC_SUBST([pkgpythondir], [\${pythondir}/$PACKAGE]) dnl pyexecdir -- directory for installing python extension modules dnl (shared libraries) dnl Query distutils for this directory. distutils does not exist in dnl Python 1.5, so we fall back to the hardcoded directory if it dnl doesn't work. 
AC_CACHE_CHECK([for $am_display_PYTHON extension module directory], [am_cv_python_pyexecdir], [am_cv_python_pyexecdir=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_lib(1,0,prefix='$PYTHON_EXEC_PREFIX')" 2>/dev/null || echo "${PYTHON_EXEC_PREFIX}/lib/python${PYTHON_VERSION}/site-packages"`]) AC_SUBST([pyexecdir], [$am_cv_python_pyexecdir]) dnl pkgpyexecdir -- $(pyexecdir)/$(PACKAGE) AC_SUBST([pkgpyexecdir], [\${pyexecdir}/$PACKAGE]) dnl Run any user-specified action. $2 fi ]) # AM_PYTHON_CHECK_VERSION(PROG, VERSION, [ACTION-IF-TRUE], [ACTION-IF-FALSE]) # --------------------------------------------------------------------------- # Run ACTION-IF-TRUE if the Python interpreter PROG has version >= VERSION. # Run ACTION-IF-FALSE otherwise. # This test uses sys.hexversion instead of the string equivalent (first # word of sys.version), in order to cope with versions such as 2.2c1. # hexversion has been introduced in Python 1.5.2; it's probably not # worth to support older versions (1.5.1 was released on October 31, 1998). AC_DEFUN([AM_PYTHON_CHECK_VERSION], [prog="import sys, string # split strings by '.' and convert to numeric. Append some zeros # because we need at least 4 digits for the hex conversion. minver = map(int, string.split('$2', '.')) + [[0, 0, 0]] minverhex = 0 for i in xrange(0, 4): minverhex = (minverhex << 8) + minver[[i]] sys.exit(sys.hexversion < minverhex)" AS_IF([AM_RUN_LOG([$1 -c "$prog"])], [$3], [$4])]) # Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # AM_RUN_LOG(COMMAND) # ------------------- # Run COMMAND, save the exit status in ac_status, and log it. # (This has been adapted from Autoconf's _AC_RUN_LOG macro.) AC_DEFUN([AM_RUN_LOG], [{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD (exit $ac_status); }]) # Check to make sure that the build environment is sane. -*- Autoconf -*- # Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005 # Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 4 # AM_SANITY_CHECK # --------------- AC_DEFUN([AM_SANITY_CHECK], [AC_MSG_CHECKING([whether build environment is sane]) # Just in case sleep 1 echo timestamp > conftest.file # Do `set' in a subshell so we don't clobber the current shell's # arguments. Must try -L first in case configure is actually a # symlink; some systems play weird games with the mod time of symlinks # (eg FreeBSD returns the mod time of the symlink's containing # directory). if ( set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null` if test "$[*]" = "X"; then # -L didn't work. set X `ls -t $srcdir/configure conftest.file` fi rm -f conftest.file if test "$[*]" != "X $srcdir/configure conftest.file" \ && test "$[*]" != "X conftest.file $srcdir/configure"; then # If neither matched, then we have a broken ls. This can happen # if, for instance, CONFIG_SHELL is bash and it inherits a # broken ls alias from the environment. This has actually # happened. Such a system could not be considered "sane". AC_MSG_ERROR([ls -t appears to fail. 
Make sure there is not a broken alias in your environment]) fi test "$[2]" = conftest.file ) then # Ok. : else AC_MSG_ERROR([newly created file is older than distributed files! Check your system clock]) fi AC_MSG_RESULT(yes)]) # Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # AM_PROG_INSTALL_STRIP # --------------------- # One issue with vendor `install' (even GNU) is that you can't # specify the program used to strip binaries. This is especially # annoying in cross-compiling environments, where the build's strip # is unlikely to handle the host's binaries. # Fortunately install-sh will honor a STRIPPROG variable, so we # always use install-sh in `make install-strip', and initialize # STRIPPROG with the value of the STRIP variable (set by the user). AC_DEFUN([AM_PROG_INSTALL_STRIP], [AC_REQUIRE([AM_PROG_INSTALL_SH])dnl # Installed binaries are usually stripped using `strip' when the user # run `make install-strip'. However `strip' might not be the right # tool to use in cross-compilation environments, therefore Automake # will honor the `STRIP' environment variable to overrule this program. dnl Don't test for $cross_compiling = yes, because it might be `maybe'. if test "$cross_compiling" != no; then AC_CHECK_TOOL([STRIP], [strip], :) fi INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s" AC_SUBST([INSTALL_STRIP_PROGRAM])]) # Check how to create a tarball. -*- Autoconf -*- # Copyright (C) 2004, 2005 Free Software Foundation, Inc. # # This file is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # serial 2 # _AM_PROG_TAR(FORMAT) # -------------------- # Check how to create a tarball in format FORMAT. # FORMAT should be one of `v7', `ustar', or `pax'. # # Substitute a variable $(am__tar) that is a command # writing to stdout a FORMAT-tarball containing the directory # $tardir. # tardir=directory && $(am__tar) > result.tar # # Substitute a variable $(am__untar) that extract such # a tarball read from stdin. # $(am__untar) < result.tar AC_DEFUN([_AM_PROG_TAR], [# Always define AMTAR for backward compatibility. AM_MISSING_PROG([AMTAR], [tar]) m4_if([$1], [v7], [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'], [m4_case([$1], [ustar],, [pax],, [m4_fatal([Unknown tar format])]) AC_MSG_CHECKING([how to create a $1 tar archive]) # Loop over all known methods to create a tar archive until one works. _am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' _am_tools=${am_cv_prog_tar_$1-$_am_tools} # Do not fold the above two line into one, because Tru64 sh and # Solaris sh will not grok spaces in the rhs of `-'. for _am_tool in $_am_tools do case $_am_tool in gnutar) for _am_tar in tar gnutar gtar; do AM_RUN_LOG([$_am_tar --version]) && break done am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' am__untar="$_am_tar -xf -" ;; plaintar) # Must skip GNU tar: if it does not support --format= it doesn't create # ustar tarball either. 
    (tar --version) >/dev/null 2>&1 && continue
    am__tar='tar chf - "$$tardir"'
    am__tar_='tar chf - "$tardir"'
    am__untar='tar xf -'
    ;;
  pax)
    am__tar='pax -L -x $1 -w "$$tardir"'
    am__tar_='pax -L -x $1 -w "$tardir"'
    am__untar='pax -r'
    ;;
  cpio)
    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
    am__untar='cpio -i -H $1 -d'
    ;;
  none)
    am__tar=false
    am__tar_=false
    am__untar=false
    ;;
  esac

  # If the value was cached, stop now.  We just wanted to have am__tar
  # and am__untar set.
  test -n "${am_cv_prog_tar_$1}" && break

  # tar/untar a dummy directory, and stop if the command works
  rm -rf conftest.dir
  mkdir conftest.dir
  echo GrepMe > conftest.dir/file
  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
  rm -rf conftest.dir
  if test -s conftest.tar; then
    AM_RUN_LOG([$am__untar <conftest.tar])
    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
  fi
done
rm -rf conftest.dir

AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
AC_MSG_RESULT([$am_cv_prog_tar_$1])])
AC_SUBST([am__tar])
AC_SUBST([am__untar])
]) # _AM_PROG_TAR

tophat-2.0.9/configure

#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.59 for tophat 2.0.9.
#
# Report bugs to <tophat.cufflinks@gmail.com>.
#
# Copyright (C) 2003 Free Software Foundation, Inc.
# This configure script is free software; the Free Software Foundation
# gives unlimited permission to copy, distribute and modify it.
## --------------------- ##
## M4sh Initialization.  ##
## --------------------- ##

# Be Bourne compatible
if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
  emulate sh
  NULLCMD=:
  # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which
  # is contrary to our usage.  Disable this feature.
  alias -g '${1+"$@"}'='"$@"'
elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then
  set -o posix
fi
DUALCASE=1; export DUALCASE # for MKS sh

# Support unset when possible.
if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
  as_unset=unset
else
  as_unset=false
fi

# Work around bugs in pre-3.0 UWIN ksh.
$as_unset ENV MAIL MAILPATH
PS1='$ '
PS2='> '
PS4='+ '

# NLS nuisances.
for as_var in \
  LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \
  LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \
  LC_TELEPHONE LC_TIME
do
  if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then
    eval $as_var=C; export $as_var
  else
    $as_unset $as_var
  fi
done

# Required to use basename.
if expr a : '\(a\)' >/dev/null 2>&1; then
  as_expr=expr
else
  as_expr=false
fi

if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then
  as_basename=basename
else
  as_basename=false
fi

# Name of the executable.
as_me=`$as_basename "$0" ||
$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
	 X"$0" : 'X\(//\)$' \| \
	 X"$0" : 'X\(/\)$' \| \
	 . : '\(.\)' 2>/dev/null ||
echo X/"$0" |
    sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; }
	 /^X\/\(\/\/\)$/{ s//\1/; q; }
	 /^X\/\(\/\).*/{ s//\1/; q; }
	 s/.*/./; q'`

# PATH needs CR, and LINENO needs CR and PATH.
# Avoid depending upon Character Ranges.
as_cr_letters='abcdefghijklmnopqrstuvwxyz'
as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
as_cr_Letters=$as_cr_letters$as_cr_LETTERS
as_cr_digits='0123456789'
as_cr_alnum=$as_cr_Letters$as_cr_digits

# The user is always right.
if test "${PATH_SEPARATOR+set}" != set; then
  echo "#!
/bin/sh" >conf$$.sh echo "exit 0" >>conf$$.sh chmod +x conf$$.sh if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then PATH_SEPARATOR=';' else PATH_SEPARATOR=: fi rm -f conf$$.sh fi as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" || { # Find who we are. Look in the path if we contain no path at all # relative or not. case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! -f "$as_myself"; then { echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2 { (exit 1); exit 1; }; } fi case $CONFIG_SHELL in '') as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for as_base in sh bash ksh sh5; do case $as_dir in /*) if ("$as_dir/$as_base" -c ' as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; } $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; } CONFIG_SHELL=$as_dir/$as_base export CONFIG_SHELL exec "$CONFIG_SHELL" "$0" ${1+"$@"} fi;; esac done done ;; esac # Create $as_me.lineno as a copy of $as_myself, but with $LINENO # uniformly replaced by the line number. The first 'sed' inserts a # line-number line before each line; the second 'sed' does the real # work. The second script uses 'N' to pair each line-number line # with the numbered line, and appends trailing '-' during # substitution so that $LINENO is not a special case at line end. # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the # second 'sed' script. Blame Lee E. McMahon for sed's syntax. :-) sed '=' <$as_myself | sed ' N s,$,-, : loop s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3, t loop s,-$,, s,^['$as_cr_digits']*\n,, ' >$as_me.lineno && chmod +x $as_me.lineno || { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 { (exit 1); exit 1; }; } # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensible to this). . ./$as_me.lineno # Exit status is that of the last command. exit } case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in *c*,-n*) ECHO_N= ECHO_C=' ' ECHO_T=' ' ;; *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;; *) ECHO_N= ECHO_C='\c' ECHO_T= ;; esac if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi rm -f conf$$ conf$$.exe conf$$.file echo >conf$$.file if ln -s conf$$.file conf$$ 2>/dev/null; then # We could just check for DJGPP; but this test a) works b) is more generic # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04). if test -f conf$$.exe; then # Don't use ln at all; we don't have any links as_ln_s='cp -p' else as_ln_s='ln -s' fi elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.file if mkdir -p . 
2>/dev/null; then
  as_mkdir_p=:
else
  test -d ./-p && rmdir ./-p
  as_mkdir_p=false
fi

as_executable_p="test -f"

# Sed expression to map a string onto a valid CPP name.
as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"

# Sed expression to map a string onto a valid variable name.
as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"

# IFS
# We need space, tab and new line, in precisely that order.
as_nl='
'
IFS=" 	$as_nl"

# CDPATH.
$as_unset CDPATH

# Name of the host.
# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
# so uname gets run too.
ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`

exec 6>&1

#
# Initializations.
#
ac_default_prefix=/usr/local
ac_config_libobj_dir=.
cross_compiling=no
subdirs=
MFLAGS=
MAKEFLAGS=
SHELL=${CONFIG_SHELL-/bin/sh}

# Maximum number of lines to put in a shell here document.
# This variable seems obsolete.  It should probably be removed, and
# only ac_max_sed_lines should be used.
: ${ac_max_here_lines=38}

# Identity of this package.
PACKAGE_NAME='tophat'
PACKAGE_TARNAME='tophat'
PACKAGE_VERSION='2.0.9'
PACKAGE_STRING='tophat 2.0.9'
PACKAGE_BUGREPORT='tophat.cufflinks@gmail.com'

ac_unique_file="config.h.in"
# Factoring default headers for most tests.
ac_includes_default="\
#include <stdio.h>
#if HAVE_SYS_TYPES_H
# include <sys/types.h>
#endif
#if HAVE_SYS_STAT_H
# include <sys/stat.h>
#endif
#if STDC_HEADERS
# include <stdlib.h>
# include <stddef.h>
#else
# if HAVE_STDLIB_H
#  include <stdlib.h>
# endif
#endif
#if HAVE_STRING_H
# if !STDC_HEADERS && HAVE_MEMORY_H
#  include <memory.h>
# endif
# include <string.h>
#endif
#if HAVE_STRINGS_H
# include <strings.h>
#endif
#if HAVE_INTTYPES_H
# include <inttypes.h>
#else
# if HAVE_STDINT_H
#  include <stdint.h>
# endif
#endif
#if HAVE_UNISTD_H
# include <unistd.h>
#endif"

ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA CYGPATH_W PACKAGE VERSION ACLOCAL AUTOCONF AUTOMAKE AUTOHEADER MAKEINFO install_sh STRIP ac_ct_STRIP INSTALL_STRIP_PROGRAM mkdir_p AWK SET_MAKE am__leading_dot AMTAR am__tar am__untar PYTHON CXX CXXFLAGS LDFLAGS CPPFLAGS ac_ct_CXX EXEEXT OBJEXT DEPDIR am__include am__quote AMDEP_TRUE AMDEP_FALSE AMDEPBACKSLASH CXXDEPMODE am__fastdepCXX_TRUE am__fastdepCXX_FALSE CC CFLAGS ac_ct_CC CCDEPMODE am__fastdepCC_TRUE am__fastdepCC_FALSE RANLIB ac_ct_RANLIB PYTHON_VERSION PYTHON_PREFIX PYTHON_EXEC_PREFIX PYTHON_PLATFORM pythondir pkgpythondir pyexecdir pkgpyexecdir BOOST_CPPFLAGS BOOST_LDFLAGS BAM_CPPFLAGS BAM_LDFLAGS BAM_LIB build build_cpu build_vendor build_os BOOST_THREAD_LIB BOOST_SYSTEM_LIB CPP EGREP LIBOBJS host host_cpu host_vendor host_os LTLIBOBJS'
ac_subst_files=''

# Initialize some variables set by options.
ac_init_help=
ac_init_version=false
# The variables have the same names as the options, with
# dashes changed to underlines.
cache_file=/dev/null
exec_prefix=NONE
no_create=
no_recursion=
prefix=NONE
program_prefix=NONE
program_suffix=NONE
program_transform_name=s,x,x,
silent=
site=
srcdir=
verbose=
x_includes=NONE
x_libraries=NONE

# Installation directory options.
# These are left unexpanded so users can "make install exec_prefix=/foo"
# and all the variables that are supposed to be based on exec_prefix
# by default will actually change.
# Use braces instead of parens because sh, perl, etc. also accept them.
bindir='${exec_prefix}/bin' sbindir='${exec_prefix}/sbin' libexecdir='${exec_prefix}/libexec' datadir='${prefix}/share' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' libdir='${exec_prefix}/lib' includedir='${prefix}/include' oldincludedir='/usr/include' infodir='${prefix}/info' mandir='${prefix}/man' ac_prev= for ac_option do # If the previous option needs an argument, assign it. if test -n "$ac_prev"; then eval "$ac_prev=\$ac_option" ac_prev= continue fi ac_optarg=`expr "x$ac_option" : 'x[^=]*=\(.*\)'` # Accept the important Cygnus configure options, so we can diagnose typos. case $ac_option in -bindir | --bindir | --bindi | --bind | --bin | --bi) ac_prev=bindir ;; -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) bindir=$ac_optarg ;; -build | --build | --buil | --bui | --bu) ac_prev=build_alias ;; -build=* | --build=* | --buil=* | --bui=* | --bu=*) build_alias=$ac_optarg ;; -cache-file | --cache-file | --cache-fil | --cache-fi \ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) ac_prev=cache_file ;; -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) cache_file=$ac_optarg ;; --config-cache | -C) cache_file=config.cache ;; -datadir | --datadir | --datadi | --datad | --data | --dat | --da) ac_prev=datadir ;; -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ | --da=*) datadir=$ac_optarg ;; -disable-* | --disable-*) ac_feature=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid feature name: $ac_feature" >&2 { (exit 1); exit 1; }; } ac_feature=`echo $ac_feature | sed 's/-/_/g'` eval "enable_$ac_feature=no" ;; -enable-* | --enable-*) ac_feature=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_feature" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid feature name: $ac_feature" >&2 { (exit 1); exit 1; }; } ac_feature=`echo $ac_feature | sed 's/-/_/g'` case $ac_option in *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;; *) ac_optarg=yes ;; esac eval "enable_$ac_feature='$ac_optarg'" ;; -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ | --exec | --exe | --ex) ac_prev=exec_prefix ;; -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ | --exec=* | --exe=* | --ex=*) exec_prefix=$ac_optarg ;; -gas | --gas | --ga | --g) # Obsolete; use --with-gas. 
with_gas=yes ;; -help | --help | --hel | --he | -h) ac_init_help=long ;; -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) ac_init_help=recursive ;; -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) ac_init_help=short ;; -host | --host | --hos | --ho) ac_prev=host_alias ;; -host=* | --host=* | --hos=* | --ho=*) host_alias=$ac_optarg ;; -includedir | --includedir | --includedi | --included | --include \ | --includ | --inclu | --incl | --inc) ac_prev=includedir ;; -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ | --includ=* | --inclu=* | --incl=* | --inc=*) includedir=$ac_optarg ;; -infodir | --infodir | --infodi | --infod | --info | --inf) ac_prev=infodir ;; -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) infodir=$ac_optarg ;; -libdir | --libdir | --libdi | --libd) ac_prev=libdir ;; -libdir=* | --libdir=* | --libdi=* | --libd=*) libdir=$ac_optarg ;; -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ | --libexe | --libex | --libe) ac_prev=libexecdir ;; -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ | --libexe=* | --libex=* | --libe=*) libexecdir=$ac_optarg ;; -localstatedir | --localstatedir | --localstatedi | --localstated \ | --localstate | --localstat | --localsta | --localst \ | --locals | --local | --loca | --loc | --lo) ac_prev=localstatedir ;; -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ | --localstate=* | --localstat=* | --localsta=* | --localst=* \ | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) localstatedir=$ac_optarg ;; -mandir | --mandir | --mandi | --mand | --man | --ma | --m) ac_prev=mandir ;; -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) mandir=$ac_optarg ;; -nfp | --nfp | --nf) # Obsolete; use --without-fp. 
with_fp=no ;; -no-create | --no-create | --no-creat | --no-crea | --no-cre \ | --no-cr | --no-c | -n) no_create=yes ;; -no-recursion | --no-recursion | --no-recursio | --no-recursi \ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) no_recursion=yes ;; -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ | --oldin | --oldi | --old | --ol | --o) ac_prev=oldincludedir ;; -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) oldincludedir=$ac_optarg ;; -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) ac_prev=prefix ;; -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) prefix=$ac_optarg ;; -program-prefix | --program-prefix | --program-prefi | --program-pref \ | --program-pre | --program-pr | --program-p) ac_prev=program_prefix ;; -program-prefix=* | --program-prefix=* | --program-prefi=* \ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) program_prefix=$ac_optarg ;; -program-suffix | --program-suffix | --program-suffi | --program-suff \ | --program-suf | --program-su | --program-s) ac_prev=program_suffix ;; -program-suffix=* | --program-suffix=* | --program-suffi=* \ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) program_suffix=$ac_optarg ;; -program-transform-name | --program-transform-name \ | --program-transform-nam | --program-transform-na \ | --program-transform-n | --program-transform- \ | --program-transform | --program-transfor \ | --program-transfo | --program-transf \ | --program-trans | --program-tran \ | --progr-tra | --program-tr | --program-t) ac_prev=program_transform_name ;; -program-transform-name=* | --program-transform-name=* \ | --program-transform-nam=* | --program-transform-na=* \ | --program-transform-n=* | --program-transform-=* \ | --program-transform=* | --program-transfor=* \ | --program-transfo=* | --program-transf=* \ | --program-trans=* | --program-tran=* \ | --progr-tra=* | --program-tr=* | --program-t=*) program_transform_name=$ac_optarg ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) silent=yes ;; -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ | --sbi=* | --sb=*) sbindir=$ac_optarg ;; -sharedstatedir | --sharedstatedir | --sharedstatedi \ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ | --sharedst | --shareds | --shared | --share | --shar \ | --sha | --sh) ac_prev=sharedstatedir ;; -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ | --sha=* | --sh=*) sharedstatedir=$ac_optarg ;; -site | --site | --sit) ac_prev=site ;; -site=* | --site=* | --sit=*) site=$ac_optarg ;; -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) ac_prev=srcdir ;; -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) srcdir=$ac_optarg ;; -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ | --syscon | --sysco | --sysc | --sys | --sy) ac_prev=sysconfdir ;; -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) sysconfdir=$ac_optarg ;; 
-target | --target | --targe | --targ | --tar | --ta | --t) ac_prev=target_alias ;; -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) target_alias=$ac_optarg ;; -v | -verbose | --verbose | --verbos | --verbo | --verb) verbose=yes ;; -version | --version | --versio | --versi | --vers | -V) ac_init_version=: ;; -with-* | --with-*) ac_package=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid package name: $ac_package" >&2 { (exit 1); exit 1; }; } ac_package=`echo $ac_package| sed 's/-/_/g'` case $ac_option in *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;; *) ac_optarg=yes ;; esac eval "with_$ac_package='$ac_optarg'" ;; -without-* | --without-*) ac_package=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_package" : ".*[^-_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid package name: $ac_package" >&2 { (exit 1); exit 1; }; } ac_package=`echo $ac_package | sed 's/-/_/g'` eval "with_$ac_package=no" ;; --x) # Obsolete; use --with-x. with_x=yes ;; -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ | --x-incl | --x-inc | --x-in | --x-i) ac_prev=x_includes ;; -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) x_includes=$ac_optarg ;; -x-libraries | --x-libraries | --x-librarie | --x-librari \ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) ac_prev=x_libraries ;; -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) x_libraries=$ac_optarg ;; -*) { echo "$as_me: error: unrecognized option: $ac_option Try \`$0 --help' for more information." >&2 { (exit 1); exit 1; }; } ;; *=*) ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` # Reject names that are not valid shell variable names. expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && { echo "$as_me: error: invalid variable name: $ac_envvar" >&2 { (exit 1); exit 1; }; } ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` eval "$ac_envvar='$ac_optarg'" export $ac_envvar ;; *) # FIXME: should be removed in autoconf 3.0. echo "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && echo "$as_me: WARNING: invalid host type: $ac_option" >&2 : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} ;; esac done if test -n "$ac_prev"; then ac_option=--`echo $ac_prev | sed 's/_/-/g'` { echo "$as_me: error: missing argument to $ac_option" >&2 { (exit 1); exit 1; }; } fi # Be sure to have absolute paths. for ac_var in exec_prefix prefix do eval ac_val=$`echo $ac_var` case $ac_val in [\\/$]* | ?:[\\/]* | NONE | '' ) ;; *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 { (exit 1); exit 1; }; };; esac done # Be sure to have absolute paths. for ac_var in bindir sbindir libexecdir datadir sysconfdir sharedstatedir \ localstatedir libdir includedir oldincludedir infodir mandir do eval ac_val=$`echo $ac_var` case $ac_val in [\\/$]* | ?:[\\/]* ) ;; *) { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 { (exit 1); exit 1; }; };; esac done # There might be people who depend on the old broken behavior: `$host' # used to hold the argument of --host etc. 
# FIXME: To remove some day. build=$build_alias host=$host_alias target=$target_alias # FIXME: To remove some day. if test "x$host_alias" != x; then if test "x$build_alias" = x; then cross_compiling=maybe echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. If a cross compiler is detected then cross compile mode will be used." >&2 elif test "x$build_alias" != "x$host_alias"; then cross_compiling=yes fi fi ac_tool_prefix= test -n "$host_alias" && ac_tool_prefix=$host_alias- test "$silent" = yes && exec 6>/dev/null # Find the source files, if location was not specified. if test -z "$srcdir"; then ac_srcdir_defaulted=yes # Try the directory containing this script, then its parent. ac_confdir=`(dirname "$0") 2>/dev/null || $as_expr X"$0" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$0" : 'X\(//\)[^/]' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$0" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` srcdir=$ac_confdir if test ! -r $srcdir/$ac_unique_file; then srcdir=.. fi else ac_srcdir_defaulted=no fi if test ! -r $srcdir/$ac_unique_file; then if test "$ac_srcdir_defaulted" = yes; then { echo "$as_me: error: cannot find sources ($ac_unique_file) in $ac_confdir or .." >&2 { (exit 1); exit 1; }; } else { echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 { (exit 1); exit 1; }; } fi fi (cd $srcdir && test -r ./$ac_unique_file) 2>/dev/null || { echo "$as_me: error: sources are in $srcdir, but \`cd $srcdir' does not work" >&2 { (exit 1); exit 1; }; } srcdir=`echo "$srcdir" | sed 's%\([^\\/]\)[\\/]*$%\1%'` ac_env_build_alias_set=${build_alias+set} ac_env_build_alias_value=$build_alias ac_cv_env_build_alias_set=${build_alias+set} ac_cv_env_build_alias_value=$build_alias ac_env_host_alias_set=${host_alias+set} ac_env_host_alias_value=$host_alias ac_cv_env_host_alias_set=${host_alias+set} ac_cv_env_host_alias_value=$host_alias ac_env_target_alias_set=${target_alias+set} ac_env_target_alias_value=$target_alias ac_cv_env_target_alias_set=${target_alias+set} ac_cv_env_target_alias_value=$target_alias ac_env_PYTHON_set=${PYTHON+set} ac_env_PYTHON_value=$PYTHON ac_cv_env_PYTHON_set=${PYTHON+set} ac_cv_env_PYTHON_value=$PYTHON ac_env_CXX_set=${CXX+set} ac_env_CXX_value=$CXX ac_cv_env_CXX_set=${CXX+set} ac_cv_env_CXX_value=$CXX ac_env_CXXFLAGS_set=${CXXFLAGS+set} ac_env_CXXFLAGS_value=$CXXFLAGS ac_cv_env_CXXFLAGS_set=${CXXFLAGS+set} ac_cv_env_CXXFLAGS_value=$CXXFLAGS ac_env_LDFLAGS_set=${LDFLAGS+set} ac_env_LDFLAGS_value=$LDFLAGS ac_cv_env_LDFLAGS_set=${LDFLAGS+set} ac_cv_env_LDFLAGS_value=$LDFLAGS ac_env_CPPFLAGS_set=${CPPFLAGS+set} ac_env_CPPFLAGS_value=$CPPFLAGS ac_cv_env_CPPFLAGS_set=${CPPFLAGS+set} ac_cv_env_CPPFLAGS_value=$CPPFLAGS ac_env_CC_set=${CC+set} ac_env_CC_value=$CC ac_cv_env_CC_set=${CC+set} ac_cv_env_CC_value=$CC ac_env_CFLAGS_set=${CFLAGS+set} ac_env_CFLAGS_value=$CFLAGS ac_cv_env_CFLAGS_set=${CFLAGS+set} ac_cv_env_CFLAGS_value=$CFLAGS ac_env_CPP_set=${CPP+set} ac_env_CPP_value=$CPP ac_cv_env_CPP_set=${CPP+set} ac_cv_env_CPP_value=$CPP # # Report the --help message. # if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF \`configure' configures tophat 2.0.9 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
To assign environment variables (e.g., CC, CFLAGS...), specify them as VAR=VALUE. See below for descriptions of some of the useful variables. Defaults for the options are specified in brackets. Configuration: -h, --help display this help and exit --help=short display options specific to this package --help=recursive display the short help of all the included packages -V, --version display version information and exit -q, --quiet, --silent do not print \`checking...' messages --cache-file=FILE cache test results in FILE [disabled] -C, --config-cache alias for \`--cache-file=config.cache' -n, --no-create do not create output files --srcdir=DIR find the sources in DIR [configure dir or \`..'] _ACEOF cat <<_ACEOF Installation directories: --prefix=PREFIX install architecture-independent files in PREFIX [$ac_default_prefix] --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX [PREFIX] By default, \`make install' will install all the files in \`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify an installation prefix other than \`$ac_default_prefix' using \`--prefix', for instance \`--prefix=\$HOME'. For better control, use the options below. Fine tuning of the installation directories: --bindir=DIR user executables [EPREFIX/bin] --sbindir=DIR system admin executables [EPREFIX/sbin] --libexecdir=DIR program executables [EPREFIX/libexec] --datadir=DIR read-only architecture-independent data [PREFIX/share] --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] --infodir=DIR info documentation [PREFIX/info] --mandir=DIR man documentation [PREFIX/man] _ACEOF cat <<\_ACEOF Program names: --program-prefix=PREFIX prepend PREFIX to installed program names --program-suffix=SUFFIX append SUFFIX to installed program names --program-transform-name=PROGRAM run sed PROGRAM on installed program names System types: --build=BUILD configure for building on BUILD [guessed] --host=HOST cross-compile to build programs to run on HOST [BUILD] _ACEOF fi if test -n "$ac_init_help"; then case $ac_init_help in short | recursive ) echo "Configuration of tophat 2.0.9:";; esac cat <<\_ACEOF Optional Features: --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) --enable-FEATURE[=ARG] include FEATURE [ARG=yes] --disable-dependency-tracking speeds up one-time build --enable-dependency-tracking do not reject slow dependency extractors --disable-largefile omit support for large files --enable-intel64 optimize for Intel64 CPU such as Xeon and Core2 --enable-debug enable debugging info (default is no) --enable-optim[=0|1|2|3] set optimization level (default is 3) Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) --with-boost[=DIR] use boost (default is yes) - it is possible to specify the root directory for boost (optional) --with-boost-libdir=LIB_DIR Force given directory for boost libraries. Note that this will overwrite library path detection, so use this parameter only if default library detection fails and you know exactly where your boost libraries are located. 
--with-bam[=DIR] use BAM libraries (default is yes) - it is possible to specify the root directory for BAM (optional) --with-bam-libdir=LIB_DIR Force given directory for bam libraries. Note that this will overwrite library path detection, so use this parameter only if default library detection fails and you know exactly where your bam libraries are located. --with-boost-thread[=special-lib] use the Thread library from boost - it is possible to specify a certain library for the linker e.g. --with-boost-thread=boost_thread-gcc-mt Some influential environment variables: PYTHON python program CXX C++ compiler command CXXFLAGS C++ compiler flags LDFLAGS linker flags, e.g. -L if you have libraries in a nonstandard directory CPPFLAGS C/C++ preprocessor flags, e.g. -I if you have headers in a nonstandard directory CC C compiler command CFLAGS C compiler flags CPP C preprocessor Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. Report bugs to . _ACEOF fi if test "$ac_init_help" = "recursive"; then # If there are subdirs, report their specific --help. ac_popdir=`pwd` for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue test -d $ac_dir || continue ac_builddir=. if test "$ac_dir" != .; then ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` # A "../" for each directory in $ac_dir_suffix. ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'` else ac_dir_suffix= ac_top_builddir= fi case $srcdir in .) # No --srcdir option. We are building in place. ac_srcdir=. if test -z "$ac_top_builddir"; then ac_top_srcdir=. else ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'` fi ;; [\\/]* | ?:[\\/]* ) # Absolute path. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ;; *) # Relative path. ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_builddir$srcdir ;; esac # Do not use `cd foo && pwd` to compute absolute paths, because # the directories may not exist. case `pwd` in .) ac_abs_builddir="$ac_dir";; *) case "$ac_dir" in .) ac_abs_builddir=`pwd`;; [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";; *) ac_abs_builddir=`pwd`/"$ac_dir";; esac;; esac case $ac_abs_builddir in .) ac_abs_top_builddir=${ac_top_builddir}.;; *) case ${ac_top_builddir}. in .) ac_abs_top_builddir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;; *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;; esac;; esac case $ac_abs_builddir in .) ac_abs_srcdir=$ac_srcdir;; *) case $ac_srcdir in .) ac_abs_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;; *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;; esac;; esac case $ac_abs_builddir in .) ac_abs_top_srcdir=$ac_top_srcdir;; *) case $ac_top_srcdir in .) ac_abs_top_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;; *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;; esac;; esac cd $ac_dir # Check for guested configure; otherwise get Cygnus style configure. 
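# As a sketch of how the options documented in the help text above are
# typically combined (all paths here are hypothetical):
#   ./configure --prefix=$HOME/tophat \
#               --with-boost=/opt/boost --with-bam=/opt/samtools \
#               CXX=g++ CXXFLAGS='-O3'
# followed by `make' and `make install'.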
if test -f $ac_srcdir/configure.gnu; then echo $SHELL $ac_srcdir/configure.gnu --help=recursive elif test -f $ac_srcdir/configure; then echo $SHELL $ac_srcdir/configure --help=recursive elif test -f $ac_srcdir/configure.ac || test -f $ac_srcdir/configure.in; then echo $ac_configure --help else echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 fi cd $ac_popdir done fi test -n "$ac_init_help" && exit 0 if $ac_init_version; then cat <<\_ACEOF tophat configure 2.0.9 generated by GNU Autoconf 2.59 Copyright (C) 2003 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF exit 0 fi exec 5>config.log cat >&5 <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by tophat $as_me 2.0.9, which was generated by GNU Autoconf 2.59. Invocation command line was $ $0 $@ _ACEOF { cat <<_ASUNAME ## --------- ## ## Platform. ## ## --------- ## hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` /bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` /bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` hostinfo = `(hostinfo) 2>/dev/null || echo unknown` /bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` /bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` _ASUNAME as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. echo "PATH: $as_dir" done } >&5 cat >&5 <<_ACEOF ## ----------- ## ## Core tests. ## ## ----------- ## _ACEOF # Keep a trace of the command line. # Strip out --no-create and --no-recursion so they do not pile up. # Strip out --silent because we don't want to record it for future runs. # Also quote any args containing shell meta-characters. # Make two passes to allow for proper duplicate-argument suppression. ac_configure_args= ac_configure_args0= ac_configure_args1= ac_sep= ac_must_keep_next=false for ac_pass in 1 2 do for ac_arg do case $ac_arg in -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil) continue ;; *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*) ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; 2) ac_configure_args1="$ac_configure_args1 '$ac_arg'" if test $ac_must_keep_next = true; then ac_must_keep_next=false # Got value, back to normal. 
else case $ac_arg in *=* | --config-cache | -C | -disable-* | --disable-* \ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ | -with-* | --with-* | -without-* | --without-* | --x) case "$ac_configure_args0 " in "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; esac ;; -* ) ac_must_keep_next=true ;; esac fi ac_configure_args="$ac_configure_args$ac_sep'$ac_arg'" # Get rid of the leading space. ac_sep=" " ;; esac done done $as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } $as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } # When interrupted or exit'd, cleanup temporary files, and complete # config.log. We remove comments because anyway the quotes in there # would cause problems or look ugly. # WARNING: Be sure not to use single quotes in there, as some shells, # such as our DU 5.0 friend, will then `close' the trap. trap 'exit_status=$? # Save into config.log some information that might help in debugging. { echo cat <<\_ASBOX ## ---------------- ## ## Cache variables. ## ## ---------------- ## _ASBOX echo # The following way of writing the cache mishandles newlines in values, { (set) 2>&1 | case `(ac_space='"'"' '"'"'; set | grep ac_space) 2>&1` in *ac_space=\ *) sed -n \ "s/'"'"'/'"'"'\\\\'"'"''"'"'/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='"'"'\\2'"'"'/p" ;; *) sed -n \ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p" ;; esac; } echo cat <<\_ASBOX ## ----------------- ## ## Output variables. ## ## ----------------- ## _ASBOX echo for ac_var in $ac_subst_vars do eval ac_val=$`echo $ac_var` echo "$ac_var='"'"'$ac_val'"'"'" done | sort echo if test -n "$ac_subst_files"; then cat <<\_ASBOX ## ------------- ## ## Output files. ## ## ------------- ## _ASBOX echo for ac_var in $ac_subst_files do eval ac_val=$`echo $ac_var` echo "$ac_var='"'"'$ac_val'"'"'" done | sort echo fi if test -s confdefs.h; then cat <<\_ASBOX ## ----------- ## ## confdefs.h. ## ## ----------- ## _ASBOX echo sed "/^$/d" confdefs.h | sort echo fi test "$ac_signal" != 0 && echo "$as_me: caught signal $ac_signal" echo "$as_me: exit $exit_status" } >&5 rm -f core *.core && rm -rf conftest* confdefs* conf$$* $ac_clean_files && exit $exit_status ' 0 for ac_signal in 1 2 13 15; do trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal done ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -rf conftest* confdefs.h # AIX cpp loses on an empty file, so make sure it contains at least a newline. echo >confdefs.h # Predefined preprocessor variables. cat >>confdefs.h <<_ACEOF #define PACKAGE_NAME "$PACKAGE_NAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_TARNAME "$PACKAGE_TARNAME" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_VERSION "$PACKAGE_VERSION" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_STRING "$PACKAGE_STRING" _ACEOF cat >>confdefs.h <<_ACEOF #define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" _ACEOF # Let the site file select an alternate cache file if it wants to. # Prefer explicitly selected file to automatically selected ones. 
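# A config.site file found through CONFIG_SITE below is simply sourced, so
# it can preset defaults for a whole machine; an illustrative (hypothetical)
# site file might contain nothing more than
#   CPPFLAGS=-I/opt/local/include
#   LDFLAGS=-L/opt/local/lib
# and every configure run that reads it starts from those values.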
if test -z "$CONFIG_SITE"; then if test "x$prefix" != xNONE; then CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" else CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" fi fi for ac_site_file in $CONFIG_SITE; do if test -r "$ac_site_file"; then { echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 echo "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 . "$ac_site_file" fi done if test -r "$cache_file"; then # Some versions of bash will fail to source /dev/null (special # files actually), so we avoid doing that. if test -f "$cache_file"; then { echo "$as_me:$LINENO: loading cache $cache_file" >&5 echo "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . $cache_file;; *) . ./$cache_file;; esac fi else { echo "$as_me:$LINENO: creating cache $cache_file" >&5 echo "$as_me: creating cache $cache_file" >&6;} >$cache_file fi # Check that the precious variables saved in the cache have kept the same # value. ac_cache_corrupted=false for ac_var in `(set) 2>&1 | sed -n 's/^ac_env_\([a-zA-Z_0-9]*\)_set=.*/\1/p'`; do eval ac_old_set=\$ac_cv_env_${ac_var}_set eval ac_new_set=\$ac_env_${ac_var}_set eval ac_old_val="\$ac_cv_env_${ac_var}_value" eval ac_new_val="\$ac_env_${ac_var}_value" case $ac_old_set,$ac_new_set in set,) { echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} ac_cache_corrupted=: ;; ,set) { echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} ac_cache_corrupted=: ;; ,);; *) if test "x$ac_old_val" != "x$ac_new_val"; then { echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} { echo "$as_me:$LINENO: former value: $ac_old_val" >&5 echo "$as_me: former value: $ac_old_val" >&2;} { echo "$as_me:$LINENO: current value: $ac_new_val" >&5 echo "$as_me: current value: $ac_new_val" >&2;} ac_cache_corrupted=: fi;; esac # Pass precious variables to config.status. if test "$ac_new_set" = set; then case $ac_new_val in *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*) ac_arg=$ac_var=`echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; *) ac_arg=$ac_var=$ac_new_val ;; esac case " $ac_configure_args " in *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. 
*) ac_configure_args="$ac_configure_args '$ac_arg'" ;; esac fi done if $ac_cache_corrupted; then { echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 echo "$as_me: error: changes in the environment can compromise the build" >&2;} { { echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} { (exit 1); exit 1; }; } fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu cat >>confdefs.h <<\_ACEOF #define SVN_REVISION "4112M" _ACEOF ac_config_headers="$ac_config_headers config.h" ac_aux_dir= for ac_dir in build-aux $srcdir/build-aux; do if test -f $ac_dir/install-sh; then ac_aux_dir=$ac_dir ac_install_sh="$ac_aux_dir/install-sh -c" break elif test -f $ac_dir/install.sh; then ac_aux_dir=$ac_dir ac_install_sh="$ac_aux_dir/install.sh -c" break elif test -f $ac_dir/shtool; then ac_aux_dir=$ac_dir ac_install_sh="$ac_aux_dir/shtool install -c" break fi done if test -z "$ac_aux_dir"; then { { echo "$as_me:$LINENO: error: cannot find install-sh or install.sh in build-aux $srcdir/build-aux" >&5 echo "$as_me: error: cannot find install-sh or install.sh in build-aux $srcdir/build-aux" >&2;} { (exit 1); exit 1; }; } fi ac_config_guess="$SHELL $ac_aux_dir/config.guess" ac_config_sub="$SHELL $ac_aux_dir/config.sub" ac_configure="$SHELL $ac_aux_dir/configure" # This should be Cygnus configure. am__api_version="1.9" # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or # incompatible versions: # SysV /etc/install, /usr/sbin/install # SunOS /usr/etc/install # IRIX /sbin/install # AIX /bin/install # AmigaOS /C/install, which installs bootblocks on floppy discs # AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag # AFS /usr/afsws/bin/install, which mishandles nonexistent args # SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" # OS/2's system install, which has a completely different semantic # ./install, which can be erroneously created by make from ./install.sh. echo "$as_me:$LINENO: checking for a BSD-compatible install" >&5 echo $ECHO_N "checking for a BSD-compatible install... $ECHO_C" >&6 if test -z "$INSTALL"; then if test "${ac_cv_path_install+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. # Account for people who put trailing slashes in PATH elements. case $as_dir/ in ./ | .// | /cC/* | \ /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ ?:\\/os2\\/install\\/* | ?:\\/OS2\\/INSTALL\\/* | \ /usr/ucb/* ) ;; *) # OSF1 and SCO ODT 3.0 have their own names for install. # Don't use installbsd from OSF since it installs stuff as root # by default. for ac_prog in ginstall scoinst install; do for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then if test $ac_prog = install && grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # AIX install. It has an incompatible calling convention. : elif test $ac_prog = install && grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # program-specific install script used by HP pwplus--don't use. 
: else ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" break 3 fi fi done done ;; esac done fi if test "${ac_cv_path_install+set}" = set; then INSTALL=$ac_cv_path_install else # As a last resort, use the slow shell script. We don't cache a # path for INSTALL within a source directory, because that will # break other packages using the cache if that directory is # removed, or if the path is relative. INSTALL=$ac_install_sh fi fi echo "$as_me:$LINENO: result: $INSTALL" >&5 echo "${ECHO_T}$INSTALL" >&6 # Use test -z because SunOS4 sh mishandles braces in ${var-val}. # It thinks the first close brace ends the variable substitution. test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' echo "$as_me:$LINENO: checking whether build environment is sane" >&5 echo $ECHO_N "checking whether build environment is sane... $ECHO_C" >&6 # Just in case sleep 1 echo timestamp > conftest.file # Do `set' in a subshell so we don't clobber the current shell's # arguments. Must try -L first in case configure is actually a # symlink; some systems play weird games with the mod time of symlinks # (eg FreeBSD returns the mod time of the symlink's containing # directory). if ( set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null` if test "$*" = "X"; then # -L didn't work. set X `ls -t $srcdir/configure conftest.file` fi rm -f conftest.file if test "$*" != "X $srcdir/configure conftest.file" \ && test "$*" != "X conftest.file $srcdir/configure"; then # If neither matched, then we have a broken ls. This can happen # if, for instance, CONFIG_SHELL is bash and it inherits a # broken ls alias from the environment. This has actually # happened. Such a system could not be considered "sane". { { echo "$as_me:$LINENO: error: ls -t appears to fail. Make sure there is not a broken alias in your environment" >&5 echo "$as_me: error: ls -t appears to fail. Make sure there is not a broken alias in your environment" >&2;} { (exit 1); exit 1; }; } fi test "$2" = conftest.file ) then # Ok. : else { { echo "$as_me:$LINENO: error: newly created file is older than distributed files! Check your system clock" >&5 echo "$as_me: error: newly created file is older than distributed files! Check your system clock" >&2;} { (exit 1); exit 1; }; } fi echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 test "$program_prefix" != NONE && program_transform_name="s,^,$program_prefix,;$program_transform_name" # Use a double $ so make ignores it. test "$program_suffix" != NONE && program_transform_name="s,\$,$program_suffix,;$program_transform_name" # Double any \ or $. echo might interpret backslashes. # By default was `s,x,x', remove it if useless. cat <<\_ACEOF >conftest.sed s/[\\$]/&&/g;s/;s,x,x,$// _ACEOF program_transform_name=`echo $program_transform_name | sed -f conftest.sed` rm conftest.sed # expand $ac_aux_dir to an absolute path am_aux_dir=`cd $ac_aux_dir && pwd` test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing" # Use eval to expand $SHELL if eval "$MISSING --run true"; then am_missing_run="$MISSING --run " else am_missing_run= { echo "$as_me:$LINENO: WARNING: \`missing' script is too old or missing" >&5 echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;} fi if mkdir -p --version . >/dev/null 2>&1 && test ! -d ./--version; then # We used to keeping the `.' as first argument, in order to # allow $(mkdir_p) to be used without argument. 
As in # $(mkdir_p) $(somedir) # where $(somedir) is conditionally defined. However this is wrong # for two reasons: # 1. if the package is installed by a user who cannot write `.' # make install will fail, # 2. the above comment should most certainly read # $(mkdir_p) $(DESTDIR)$(somedir) # so it does not work when $(somedir) is undefined and # $(DESTDIR) is not. # To support the latter case, we have to write # test -z "$(somedir)" || $(mkdir_p) $(DESTDIR)$(somedir), # so the `.' trick is pointless. mkdir_p='mkdir -p --' else # On NextStep and OpenStep, the `mkdir' command does not # recognize any option. It will interpret all options as # directories to create, and then abort because `.' already # exists. for d in ./-p ./--version; do test -d $d && rmdir $d done # $(mkinstalldirs) is defined by Automake if mkinstalldirs exists. if test -f "$ac_aux_dir/mkinstalldirs"; then mkdir_p='$(mkinstalldirs)' else mkdir_p='$(install_sh) -d' fi fi for ac_prog in gawk mawk nawk awk do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_AWK+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$AWK"; then ac_cv_prog_AWK="$AWK" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_AWK="$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi AWK=$ac_cv_prog_AWK if test -n "$AWK"; then echo "$as_me:$LINENO: result: $AWK" >&5 echo "${ECHO_T}$AWK" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$AWK" && break done echo "$as_me:$LINENO: checking whether ${MAKE-make} sets \$(MAKE)" >&5 echo $ECHO_N "checking whether ${MAKE-make} sets \$(MAKE)... $ECHO_C" >&6 set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y,:./+-,___p_,'` if eval "test \"\${ac_cv_prog_make_${ac_make}_set+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.make <<\_ACEOF all: @echo 'ac_maketemp="$(MAKE)"' _ACEOF # GNU make sometimes prints "make[1]: Entering...", which would confuse us. eval `${MAKE-make} -f conftest.make 2>/dev/null | grep temp=` if test -n "$ac_maketemp"; then eval ac_cv_prog_make_${ac_make}_set=yes else eval ac_cv_prog_make_${ac_make}_set=no fi rm -f conftest.make fi if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 SET_MAKE= else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 SET_MAKE="MAKE=${MAKE-make}" fi rm -rf .tst 2>/dev/null mkdir .tst 2>/dev/null if test -d .tst; then am__leading_dot=. else am__leading_dot=_ fi rmdir .tst 2>/dev/null # test to see if srcdir already configured if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then { { echo "$as_me:$LINENO: error: source directory already configured; run \"make distclean\" there first" >&5 echo "$as_me: error: source directory already configured; run \"make distclean\" there first" >&2;} { (exit 1); exit 1; }; } fi # test whether we have cygpath if test -z "$CYGPATH_W"; then if (cygpath --version) >/dev/null 2>/dev/null; then CYGPATH_W='cygpath -w' else CYGPATH_W=echo fi fi # Define the identity of the package. 
PACKAGE='tophat' VERSION='2.0.9' cat >>confdefs.h <<_ACEOF #define PACKAGE "$PACKAGE" _ACEOF cat >>confdefs.h <<_ACEOF #define VERSION "$VERSION" _ACEOF # Some tools Automake needs. ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"} AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"} AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"} AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"} MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} install_sh=${install_sh-"$am_aux_dir/install-sh"} # Installed binaries are usually stripped using `strip' when the user # run `make install-strip'. However `strip' might not be the right # tool to use in cross-compilation environments, therefore Automake # will honor the `STRIP' environment variable to overrule this program. if test "$cross_compiling" != no; then if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. set dummy ${ac_tool_prefix}strip; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_STRIP+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$STRIP"; then ac_cv_prog_STRIP="$STRIP" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_STRIP="${ac_tool_prefix}strip" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi STRIP=$ac_cv_prog_STRIP if test -n "$STRIP"; then echo "$as_me:$LINENO: result: $STRIP" >&5 echo "${ECHO_T}$STRIP" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$ac_cv_prog_STRIP"; then ac_ct_STRIP=$STRIP # Extract the first word of "strip", so it can be a program name with args. set dummy strip; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_STRIP+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_STRIP"; then ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_STRIP="strip" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done test -z "$ac_cv_prog_ac_ct_STRIP" && ac_cv_prog_ac_ct_STRIP=":" fi fi ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP if test -n "$ac_ct_STRIP"; then echo "$as_me:$LINENO: result: $ac_ct_STRIP" >&5 echo "${ECHO_T}$ac_ct_STRIP" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi STRIP=$ac_ct_STRIP else STRIP="$ac_cv_prog_STRIP" fi fi INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s" # We need awk for the "check" target. The system "awk" is bad on # some platforms. # Always define AMTAR for backward compatibility. AMTAR=${AMTAR-"${am_missing_run}tar"} am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -' #AM_PATH_CPPUNIT(1.10.2) # Make sure CXXFLAGS is defined so that AC_PROG_CXX doesn't set it. 
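# The two assignments that follow force CXXFLAGS and CFLAGS to count as
# "set", so the -g -O2 default that the compiler checks would otherwise
# install is never injected and command-line flags win; for example
# (flags are illustrative):
#   ./configure CXXFLAGS='-O3 -march=native'
# configures the build with exactly those flags.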
CXXFLAGS="$CXXFLAGS" CFLAGS="$CFLAGS" ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu # Checks for programs. for ac_prog in gawk mawk nawk awk do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_AWK+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$AWK"; then ac_cv_prog_AWK="$AWK" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_AWK="$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi AWK=$ac_cv_prog_AWK if test -n "$AWK"; then echo "$as_me:$LINENO: result: $AWK" >&5 echo "${ECHO_T}$AWK" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$AWK" && break done ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu if test -n "$ac_tool_prefix"; then for ac_prog in $CCC g++ c++ gpp aCC CC cxx cc++ cl FCC KCC RCC xlC_r xlC do # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. set dummy $ac_tool_prefix$ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_CXX+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$CXX"; then ac_cv_prog_CXX="$CXX" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi CXX=$ac_cv_prog_CXX if test -n "$CXX"; then echo "$as_me:$LINENO: result: $CXX" >&5 echo "${ECHO_T}$CXX" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$CXX" && break done fi if test -z "$CXX"; then ac_ct_CXX=$CXX for ac_prog in $CCC g++ c++ gpp aCC CC cxx cc++ cl FCC KCC RCC xlC_r xlC do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_CXX+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_CXX"; then ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CXX="$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi ac_ct_CXX=$ac_cv_prog_ac_ct_CXX if test -n "$ac_ct_CXX"; then echo "$as_me:$LINENO: result: $ac_ct_CXX" >&5 echo "${ECHO_T}$ac_ct_CXX" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$ac_ct_CXX" && break done test -n "$ac_ct_CXX" || ac_ct_CXX="g++" CXX=$ac_ct_CXX fi # Provide some information about the compiler. echo "$as_me:$LINENO:" \ "checking for C++ compiler version" >&5 ac_compiler=`set X $ac_compile; echo $2` { (eval echo "$as_me:$LINENO: \"$ac_compiler --version &5\"") >&5 (eval $ac_compiler --version &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } { (eval echo "$as_me:$LINENO: \"$ac_compiler -v &5\"") >&5 (eval $ac_compiler -v &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } { (eval echo "$as_me:$LINENO: \"$ac_compiler -V &5\"") >&5 (eval $ac_compiler -V &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { ; return 0; } _ACEOF ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files a.out a.exe b.out" # Try to create an executable without -o first, disregard a.out. # It will help us diagnose broken compilers, and finding out an intuition # of exeext. echo "$as_me:$LINENO: checking for C++ compiler default output file name" >&5 echo $ECHO_N "checking for C++ compiler default output file name... $ECHO_C" >&6 ac_link_default=`echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` if { (eval echo "$as_me:$LINENO: \"$ac_link_default\"") >&5 (eval $ac_link_default) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; then # Find the output, starting from the most likely. This scheme is # not robust to junk in `.', hence go to wildcards (a.*) only as a last # resort. # Be careful to initialize this variable, since it used to be cached. # Otherwise an old cache value of `no' led to `EXEEXT = no' in a Makefile. ac_cv_exeext= # b.out is created by i960 compilers. for ac_file in a_out.exe a.exe conftest.exe a.out conftest a.* conftest.* b.out do test -f "$ac_file" || continue case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.o | *.obj ) ;; conftest.$ac_ext ) # This is the source file. ;; [ab].out ) # We found the default executable, but exeext='' is most # certainly right. break;; *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` # FIXME: I believe we export ac_cv_exeext for Libtool, # but it would be cool to find out if it's true. Does anybody # maintain Libtool? --akim. export ac_cv_exeext break;; * ) break;; esac done else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 { { echo "$as_me:$LINENO: error: C++ compiler cannot create executables See \`config.log' for more details." >&5 echo "$as_me: error: C++ compiler cannot create executables See \`config.log' for more details." >&2;} { (exit 77); exit 77; }; } fi ac_exeext=$ac_cv_exeext echo "$as_me:$LINENO: result: $ac_file" >&5 echo "${ECHO_T}$ac_file" >&6 # Check the compiler produces executables we can run. If not, either # the compiler is broken, or we cross compile. 
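# In concrete terms (toolchain names are only an example): on a build
# machine that cannot execute the produced binary, say
#   ./configure --host=arm-linux-gnueabihf CXX=arm-linux-gnueabihf-g++
# the run test below fails, and because --host was given (cross_compiling
# was already "maybe"), configure settles on cross_compiling=yes instead
# of aborting.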
echo "$as_me:$LINENO: checking whether the C++ compiler works" >&5 echo $ECHO_N "checking whether the C++ compiler works... $ECHO_C" >&6 # FIXME: These cross compiler hacks should be removed for Autoconf 3.0 # If not cross compiling, check that we can run a simple program. if test "$cross_compiling" != yes; then if { ac_try='./$ac_file' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then cross_compiling=no else if test "$cross_compiling" = maybe; then cross_compiling=yes else { { echo "$as_me:$LINENO: error: cannot run C++ compiled programs. If you meant to cross compile, use \`--host'. See \`config.log' for more details." >&5 echo "$as_me: error: cannot run C++ compiled programs. If you meant to cross compile, use \`--host'. See \`config.log' for more details." >&2;} { (exit 1); exit 1; }; } fi fi fi echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 rm -f a.out a.exe conftest$ac_cv_exeext b.out ac_clean_files=$ac_clean_files_save # Check the compiler produces executables we can run. If not, either # the compiler is broken, or we cross compile. echo "$as_me:$LINENO: checking whether we are cross compiling" >&5 echo $ECHO_N "checking whether we are cross compiling... $ECHO_C" >&6 echo "$as_me:$LINENO: result: $cross_compiling" >&5 echo "${ECHO_T}$cross_compiling" >&6 echo "$as_me:$LINENO: checking for suffix of executables" >&5 echo $ECHO_N "checking for suffix of executables... $ECHO_C" >&6 if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; then # If both `conftest.exe' and `conftest' are `present' (well, observable) # catch `conftest.exe'. For instance with Cygwin, `ls conftest' will # work properly (i.e., refer to `conftest.exe'), while it won't with # `rm'. for ac_file in conftest.exe conftest conftest.*; do test -f "$ac_file" || continue case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.o | *.obj ) ;; *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` export ac_cv_exeext break;; * ) break;; esac done else { { echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link See \`config.log' for more details." >&5 echo "$as_me: error: cannot compute suffix of executables: cannot compile and link See \`config.log' for more details." >&2;} { (exit 1); exit 1; }; } fi rm -f conftest$ac_cv_exeext echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5 echo "${ECHO_T}$ac_cv_exeext" >&6 rm -f conftest.$ac_ext EXEEXT=$ac_cv_exeext ac_exeext=$EXEEXT echo "$as_me:$LINENO: checking for suffix of object files" >&5 echo $ECHO_N "checking for suffix of object files... $ECHO_C" >&6 if test "${ac_cv_objext+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { ; return 0; } _ACEOF rm -f conftest.o conftest.obj if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; then for ac_file in `(ls conftest.o conftest.obj; ls conftest.*) 2>/dev/null`; do case $ac_file in *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg ) ;; *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` break;; esac done else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 { { echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile See \`config.log' for more details." >&5 echo "$as_me: error: cannot compute suffix of object files: cannot compile See \`config.log' for more details." >&2;} { (exit 1); exit 1; }; } fi rm -f conftest.$ac_cv_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_objext" >&5 echo "${ECHO_T}$ac_cv_objext" >&6 OBJEXT=$ac_cv_objext ac_objext=$OBJEXT echo "$as_me:$LINENO: checking whether we are using the GNU C++ compiler" >&5 echo $ECHO_N "checking whether we are using the GNU C++ compiler... $ECHO_C" >&6 if test "${ac_cv_cxx_compiler_gnu+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { #ifndef __GNUC__ choke me #endif ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_compiler_gnu=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_compiler_gnu=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_cv_cxx_compiler_gnu=$ac_compiler_gnu fi echo "$as_me:$LINENO: result: $ac_cv_cxx_compiler_gnu" >&5 echo "${ECHO_T}$ac_cv_cxx_compiler_gnu" >&6 GXX=`test $ac_compiler_gnu = yes && echo yes` ac_test_CXXFLAGS=${CXXFLAGS+set} ac_save_CXXFLAGS=$CXXFLAGS CXXFLAGS="-g" echo "$as_me:$LINENO: checking whether $CXX accepts -g" >&5 echo $ECHO_N "checking whether $CXX accepts -g... $ECHO_C" >&6 if test "${ac_cv_prog_cxx_g+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_prog_cxx_g=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_prog_cxx_g=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_prog_cxx_g" >&5 echo "${ECHO_T}$ac_cv_prog_cxx_g" >&6 if test "$ac_test_CXXFLAGS" = set; then CXXFLAGS=$ac_save_CXXFLAGS elif test $ac_cv_prog_cxx_g = yes; then if test "$GXX" = yes; then CXXFLAGS="-g -O2" else CXXFLAGS="-g" fi else if test "$GXX" = yes; then CXXFLAGS="-O2" else CXXFLAGS= fi fi for ac_declaration in \ '' \ 'extern "C" void std::exit (int) throw (); using std::exit;' \ 'extern "C" void std::exit (int); using std::exit;' \ 'extern "C" void exit (int) throw ();' \ 'extern "C" void exit (int);' \ 'void exit (int);' do cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_declaration #include int main () { exit (42); ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then : else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 continue fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_declaration int main () { exit (42); ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext done rm -f conftest* if test -n "$ac_declaration"; then echo '#ifdef __cplusplus' >>confdefs.h echo $ac_declaration >>confdefs.h echo '#endif' >>confdefs.h fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu DEPDIR="${am__leading_dot}deps" ac_config_commands="$ac_config_commands depfiles" am_make=${MAKE-make} cat > confinc << 'END' am__doit: @echo done .PHONY: am__doit END # If we don't find an include directive, just comment out the code. 
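# The am__include / am__quote values probed just below end up in the
# generated Makefiles as the directive that pulls in per-object dependency
# files, roughly (foo.Po is a placeholder name)
#   include ./$(DEPDIR)/foo.Po
# under GNU make, or a quoted .include under BSD make.  When that machinery
# is not wanted for a one-off build, the --disable-dependency-tracking
# option documented in the help text above skips it.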
echo "$as_me:$LINENO: checking for style of include used by $am_make" >&5 echo $ECHO_N "checking for style of include used by $am_make... $ECHO_C" >&6 am__include="#" am__quote= _am_result=none # First try GNU make style include. echo "include confinc" > confmf # We grep out `Entering directory' and `Leaving directory' # messages which can occur if `w' ends up in MAKEFLAGS. # In particular we don't look at `^make:' because GNU make might # be invoked under some other name (usually "gmake"), in which # case it prints its new name instead of `make'. if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then am__include=include am__quote= _am_result=GNU fi # Now try BSD make style include. if test "$am__include" = "#"; then echo '.include "confinc"' > confmf if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then am__include=.include am__quote="\"" _am_result=BSD fi fi echo "$as_me:$LINENO: result: $_am_result" >&5 echo "${ECHO_T}$_am_result" >&6 rm -f confinc confmf # Check whether --enable-dependency-tracking or --disable-dependency-tracking was given. if test "${enable_dependency_tracking+set}" = set; then enableval="$enable_dependency_tracking" fi; if test "x$enable_dependency_tracking" != xno; then am_depcomp="$ac_aux_dir/depcomp" AMDEPBACKSLASH='\' fi if test "x$enable_dependency_tracking" != xno; then AMDEP_TRUE= AMDEP_FALSE='#' else AMDEP_TRUE='#' AMDEP_FALSE= fi depcc="$CXX" am_compiler_list= echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6 if test "${am_cv_CXX_dependencies_compiler_type+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. cp "$am_depcomp" conftest.dir cd conftest.dir # We will build objects and dependencies in a subdirectory because # it helps to detect inapplicable dependency modes. For instance # both Tru64's cc and ICC support -MD to output dependencies as a # side effect of compilation, but ICC will put the dependencies in # the current directory while Tru64 will put them in the object # directory. mkdir sub am_cv_CXX_dependencies_compiler_type=none if test "$am_compiler_list" = ""; then am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` fi for depmode in $am_compiler_list; do # Setup a source with many dependencies, because some compilers # like to wrap large dependency lists on column 80 (with \), and # we should not choose a depcomp mode which is confused by this. # # We need to recreate these files for each test, as the compiler may # overwrite some of them when testing with obscure command lines. # This happens at least with the AIX C compiler. : > sub/conftest.c for i in 1 2 3 4 5 6; do echo '#include "conftst'$i'.h"' >> sub/conftest.c # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with # Solaris 8's {/usr,}/bin/sh. 
touch sub/conftst$i.h done echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf case $depmode in nosideeffect) # after this tag, mechanisms are not by side-effect, so they'll # only be used when explicitly requested if test "x$enable_dependency_tracking" = xyes; then continue else break fi ;; none) break ;; esac # We check with `-c' and `-o' for the sake of the "dashmstdout" # mode. It turns out that the SunPro C++ compiler does not properly # handle `-M -o', and we need to detect this. if depmode=$depmode \ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ >/dev/null 2>conftest.err && grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && ${MAKE-make} -s -f confmf > /dev/null 2>&1; then # icc doesn't choke on unknown options, it will just issue warnings # or remarks (even with -Werror). So we grep stderr for any message # that says an option was ignored or not supported. # When given -MP, icc 7.0 and 7.1 complain thusly: # icc: Command line warning: ignoring option '-M'; no argument required # The diagnosis changed in icc 8.0: # icc: Command line remark: option '-MP' not supported if (grep 'ignoring option' conftest.err || grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else am_cv_CXX_dependencies_compiler_type=$depmode break fi fi done cd .. rm -rf conftest.dir else am_cv_CXX_dependencies_compiler_type=none fi fi echo "$as_me:$LINENO: result: $am_cv_CXX_dependencies_compiler_type" >&5 echo "${ECHO_T}$am_cv_CXX_dependencies_compiler_type" >&6 CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type if test "x$enable_dependency_tracking" != xno \ && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then am__fastdepCXX_TRUE= am__fastdepCXX_FALSE='#' else am__fastdepCXX_TRUE='#' am__fastdepCXX_FALSE= fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. set dummy ${ac_tool_prefix}gcc; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}gcc" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then echo "$as_me:$LINENO: result: $CC" >&5 echo "${ECHO_T}$CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$ac_cv_prog_CC"; then ac_ct_CC=$CC # Extract the first word of "gcc", so it can be a program name with args. set dummy gcc; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... 
$ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_CC"; then ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="gcc" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi ac_ct_CC=$ac_cv_prog_ac_ct_CC if test -n "$ac_ct_CC"; then echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 echo "${ECHO_T}$ac_ct_CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi CC=$ac_ct_CC else CC="$ac_cv_prog_CC" fi if test -z "$CC"; then if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. set dummy ${ac_tool_prefix}cc; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="${ac_tool_prefix}cc" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then echo "$as_me:$LINENO: result: $CC" >&5 echo "${ECHO_T}$CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$ac_cv_prog_CC"; then ac_ct_CC=$CC # Extract the first word of "cc", so it can be a program name with args. set dummy cc; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_CC"; then ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="cc" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi ac_ct_CC=$ac_cv_prog_ac_ct_CC if test -n "$ac_ct_CC"; then echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 echo "${ECHO_T}$ac_ct_CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi CC=$ac_ct_CC else CC="$ac_cv_prog_CC" fi fi if test -z "$CC"; then # Extract the first word of "cc", so it can be a program name with args. set dummy cc; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else ac_prog_rejected=no as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then ac_prog_rejected=yes continue fi ac_cv_prog_CC="cc" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done if test $ac_prog_rejected = yes; then # We found a bogon in the path, so make sure we never use it. set dummy $ac_cv_prog_CC shift if test $# != 0; then # We chose a different compiler from the bogus one. # However, it has the same basename, so the bogon will be chosen # first if we set CC to just the basename; use the full file name. shift ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" fi fi fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then echo "$as_me:$LINENO: result: $CC" >&5 echo "${ECHO_T}$CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$CC"; then if test -n "$ac_tool_prefix"; then for ac_prog in cl do # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. set dummy $ac_tool_prefix$ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$CC"; then ac_cv_prog_CC="$CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_CC="$ac_tool_prefix$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi CC=$ac_cv_prog_CC if test -n "$CC"; then echo "$as_me:$LINENO: result: $CC" >&5 echo "${ECHO_T}$CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$CC" && break done fi if test -z "$CC"; then ac_ct_CC=$CC for ac_prog in cl do # Extract the first word of "$ac_prog", so it can be a program name with args. set dummy $ac_prog; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_CC"; then ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_CC="$ac_prog" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi ac_ct_CC=$ac_cv_prog_ac_ct_CC if test -n "$ac_ct_CC"; then echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 echo "${ECHO_T}$ac_ct_CC" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi test -n "$ac_ct_CC" && break done CC=$ac_ct_CC fi fi test -z "$CC" && { { echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH See \`config.log' for more details." >&5 echo "$as_me: error: no acceptable C compiler found in \$PATH See \`config.log' for more details." >&2;} { (exit 1); exit 1; }; } # Provide some information about the compiler. echo "$as_me:$LINENO:" \ "checking for C compiler version" >&5 ac_compiler=`set X $ac_compile; echo $2` { (eval echo "$as_me:$LINENO: \"$ac_compiler --version &5\"") >&5 (eval $ac_compiler --version &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); } { (eval echo "$as_me:$LINENO: \"$ac_compiler -v &5\"") >&5 (eval $ac_compiler -v &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } { (eval echo "$as_me:$LINENO: \"$ac_compiler -V &5\"") >&5 (eval $ac_compiler -V &5) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5 echo $ECHO_N "checking whether we are using the GNU C compiler... $ECHO_C" >&6 if test "${ac_cv_c_compiler_gnu+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { #ifndef __GNUC__ choke me #endif ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_compiler_gnu=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_compiler_gnu=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_cv_c_compiler_gnu=$ac_compiler_gnu fi echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5 echo "${ECHO_T}$ac_cv_c_compiler_gnu" >&6 GCC=`test $ac_compiler_gnu = yes && echo yes` ac_test_CFLAGS=${CFLAGS+set} ac_save_CFLAGS=$CFLAGS CFLAGS="-g" echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5 echo $ECHO_N "checking whether $CC accepts -g... $ECHO_C" >&6 if test "${ac_cv_prog_cc_g+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_prog_cc_g=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_prog_cc_g=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5 echo "${ECHO_T}$ac_cv_prog_cc_g" >&6 if test "$ac_test_CFLAGS" = set; then CFLAGS=$ac_save_CFLAGS elif test $ac_cv_prog_cc_g = yes; then if test "$GCC" = yes; then CFLAGS="-g -O2" else CFLAGS="-g" fi else if test "$GCC" = yes; then CFLAGS="-O2" else CFLAGS= fi fi echo "$as_me:$LINENO: checking for $CC option to accept ANSI C" >&5 echo $ECHO_N "checking for $CC option to accept ANSI C... $ECHO_C" >&6 if test "${ac_cv_prog_cc_stdc+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_cv_prog_cc_stdc=no ac_save_CC=$CC cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #include #include #include /* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ struct buf { int x; }; FILE * (*rcsopen) (struct buf *, struct stat *, int); static char *e (p, i) char **p; int i; { return p[i]; } static char *f (char * (*g) (char **, int), char **p, ...) { char *s; va_list v; va_start (v,p); s = g (p, va_arg (v,int)); va_end (v); return s; } /* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has function prototypes and stuff, but not '\xHH' hex character constants. These don't provoke an error unfortunately, instead are silently treated as 'x'. The following induces an error, until -std1 is added to get proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an array size at least. It's necessary to write '\x00'==0 to get something that's true only with -std1. */ int osf4_cc_array ['\x00' == 0 ? 1 : -1]; int test (int i, double x); struct s1 {int (*f) (int a);}; struct s2 {int (*f) (double a);}; int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); int argc; char **argv; int main () { return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; ; return 0; } _ACEOF # Don't try gcc -ansi; that turns off useful extensions and # breaks some systems' header files. # AIX -qlanglvl=ansi # Ultrix and OSF/1 -std1 # HP-UX 10.20 and later -Ae # HP-UX older versions -Aa -D_HPUX_SOURCE # SVR4 -Xc -D__EXTENSIONS__ for ac_arg in "" -qlanglvl=ansi -std1 -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" do CC="$ac_save_CC $ac_arg" rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_prog_cc_stdc=$ac_arg break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext done rm -f conftest.$ac_ext conftest.$ac_objext CC=$ac_save_CC fi case "x$ac_cv_prog_cc_stdc" in x|xno) echo "$as_me:$LINENO: result: none needed" >&5 echo "${ECHO_T}none needed" >&6 ;; *) echo "$as_me:$LINENO: result: $ac_cv_prog_cc_stdc" >&5 echo "${ECHO_T}$ac_cv_prog_cc_stdc" >&6 CC="$CC $ac_cv_prog_cc_stdc" ;; esac # Some people use a C++ compiler to compile C. Since we use `exit', # in C++ we need to declare it. In case someone uses the same compiler # for both compiling C and C++ we need to have the C++ compiler decide # the declaration of exit, since it's the most demanding environment. cat >conftest.$ac_ext <<_ACEOF #ifndef __cplusplus choke me #endif _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then for ac_declaration in \ '' \ 'extern "C" void std::exit (int) throw (); using std::exit;' \ 'extern "C" void std::exit (int); using std::exit;' \ 'extern "C" void exit (int) throw ();' \ 'extern "C" void exit (int);' \ 'void exit (int);' do cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_declaration #include int main () { exit (42); ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then : else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 continue fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_declaration int main () { exit (42); ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext done rm -f conftest* if test -n "$ac_declaration"; then echo '#ifdef __cplusplus' >>confdefs.h echo $ac_declaration >>confdefs.h echo '#endif' >>confdefs.h fi else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu depcc="$CC" am_compiler_list= echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6 if test "${am_cv_CC_dependencies_compiler_type+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. cp "$am_depcomp" conftest.dir cd conftest.dir # We will build objects and dependencies in a subdirectory because # it helps to detect inapplicable dependency modes. For instance # both Tru64's cc and ICC support -MD to output dependencies as a # side effect of compilation, but ICC will put the dependencies in # the current directory while Tru64 will put them in the object # directory. mkdir sub am_cv_CC_dependencies_compiler_type=none if test "$am_compiler_list" = ""; then am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` fi for depmode in $am_compiler_list; do # Setup a source with many dependencies, because some compilers # like to wrap large dependency lists on column 80 (with \), and # we should not choose a depcomp mode which is confused by this. # # We need to recreate these files for each test, as the compiler may # overwrite some of them when testing with obscure command lines. # This happens at least with the AIX C compiler. : > sub/conftest.c for i in 1 2 3 4 5 6; do echo '#include "conftst'$i'.h"' >> sub/conftest.c # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with # Solaris 8's {/usr,}/bin/sh. touch sub/conftst$i.h done echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf case $depmode in nosideeffect) # after this tag, mechanisms are not by side-effect, so they'll # only be used when explicitly requested if test "x$enable_dependency_tracking" = xyes; then continue else break fi ;; none) break ;; esac # We check with `-c' and `-o' for the sake of the "dashmstdout" # mode. It turns out that the SunPro C++ compiler does not properly # handle `-M -o', and we need to detect this. 
if depmode=$depmode \ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ >/dev/null 2>conftest.err && grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && ${MAKE-make} -s -f confmf > /dev/null 2>&1; then # icc doesn't choke on unknown options, it will just issue warnings # or remarks (even with -Werror). So we grep stderr for any message # that says an option was ignored or not supported. # When given -MP, icc 7.0 and 7.1 complain thusly: # icc: Command line warning: ignoring option '-M'; no argument required # The diagnosis changed in icc 8.0: # icc: Command line remark: option '-MP' not supported if (grep 'ignoring option' conftest.err || grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else am_cv_CC_dependencies_compiler_type=$depmode break fi fi done cd .. rm -rf conftest.dir else am_cv_CC_dependencies_compiler_type=none fi fi echo "$as_me:$LINENO: result: $am_cv_CC_dependencies_compiler_type" >&5 echo "${ECHO_T}$am_cv_CC_dependencies_compiler_type" >&6 CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type if test "x$enable_dependency_tracking" != xno \ && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then am__fastdepCC_TRUE= am__fastdepCC_FALSE='#' else am__fastdepCC_TRUE='#' am__fastdepCC_FALSE= fi echo "$as_me:$LINENO: checking whether ${MAKE-make} sets \$(MAKE)" >&5 echo $ECHO_N "checking whether ${MAKE-make} sets \$(MAKE)... $ECHO_C" >&6 set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y,:./+-,___p_,'` if eval "test \"\${ac_cv_prog_make_${ac_make}_set+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.make <<\_ACEOF all: @echo 'ac_maketemp="$(MAKE)"' _ACEOF # GNU make sometimes prints "make[1]: Entering...", which would confuse us. eval `${MAKE-make} -f conftest.make 2>/dev/null | grep temp=` if test -n "$ac_maketemp"; then eval ac_cv_prog_make_${ac_make}_set=yes else eval ac_cv_prog_make_${ac_make}_set=no fi rm -f conftest.make fi if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 SET_MAKE= else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 SET_MAKE="MAKE=${MAKE-make}" fi if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. set dummy ${ac_tool_prefix}ranlib; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_RANLIB+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$RANLIB"; then ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi RANLIB=$ac_cv_prog_RANLIB if test -n "$RANLIB"; then echo "$as_me:$LINENO: result: $RANLIB" >&5 echo "${ECHO_T}$RANLIB" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$ac_cv_prog_RANLIB"; then ac_ct_RANLIB=$RANLIB # Extract the first word of "ranlib", so it can be a program name with args. 
set dummy ranlib; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_RANLIB+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_RANLIB"; then ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_RANLIB="ranlib" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done test -z "$ac_cv_prog_ac_ct_RANLIB" && ac_cv_prog_ac_ct_RANLIB=":" fi fi ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB if test -n "$ac_ct_RANLIB"; then echo "$as_me:$LINENO: result: $ac_ct_RANLIB" >&5 echo "${ECHO_T}$ac_ct_RANLIB" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi RANLIB=$ac_ct_RANLIB else RANLIB="$ac_cv_prog_RANLIB" fi # Find a good install program. We prefer a C program (faster), # so one script is as good as another. But avoid the broken or # incompatible versions: # SysV /etc/install, /usr/sbin/install # SunOS /usr/etc/install # IRIX /sbin/install # AIX /bin/install # AmigaOS /C/install, which installs bootblocks on floppy discs # AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag # AFS /usr/afsws/bin/install, which mishandles nonexistent args # SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" # OS/2's system install, which has a completely different semantic # ./install, which can be erroneously created by make from ./install.sh. echo "$as_me:$LINENO: checking for a BSD-compatible install" >&5 echo $ECHO_N "checking for a BSD-compatible install... $ECHO_C" >&6 if test -z "$INSTALL"; then if test "${ac_cv_path_install+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. # Account for people who put trailing slashes in PATH elements. case $as_dir/ in ./ | .// | /cC/* | \ /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ ?:\\/os2\\/install\\/* | ?:\\/OS2\\/INSTALL\\/* | \ /usr/ucb/* ) ;; *) # OSF1 and SCO ODT 3.0 have their own names for install. # Don't use installbsd from OSF since it installs stuff as root # by default. for ac_prog in ginstall scoinst install; do for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then if test $ac_prog = install && grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # AIX install. It has an incompatible calling convention. : elif test $ac_prog = install && grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then # program-specific install script used by HP pwplus--don't use. : else ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" break 3 fi fi done done ;; esac done fi if test "${ac_cv_path_install+set}" = set; then INSTALL=$ac_cv_path_install else # As a last resort, use the slow shell script. We don't cache a # path for INSTALL within a source directory, because that will # break other packages using the cache if that directory is # removed, or if the path is relative. INSTALL=$ac_install_sh fi fi echo "$as_me:$LINENO: result: $INSTALL" >&5 echo "${ECHO_T}$INSTALL" >&6 # Use test -z because SunOS4 sh mishandles braces in ${var-val}. # It thinks the first close brace ends the variable substitution. 
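# For illustration of the quirk described above (hypothetical spelling, shown only as an example): a one-line default such as
#   INSTALL_PROGRAM=${INSTALL_PROGRAM-'${INSTALL}'}
# would be misparsed by SunOS4 sh, which takes the first `}' inside the quoted default '${INSTALL}' as the end of the whole ${var-val} substitution.
# The explicit `test -z "$var" && var=...' assignments below avoid nesting braces entirely.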
test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' if test -n "$PYTHON"; then # If the user set $PYTHON, use it and don't search something else. echo "$as_me:$LINENO: checking whether $PYTHON version >= 2.4" >&5 echo $ECHO_N "checking whether $PYTHON version >= 2.4... $ECHO_C" >&6 prog="import sys, string # split strings by '.' and convert to numeric. Append some zeros # because we need at least 4 digits for the hex conversion. minver = map(int, string.split('2.4', '.')) + [0, 0, 0] minverhex = 0 for i in xrange(0, 4): minverhex = (minverhex << 8) + minver[i] sys.exit(sys.hexversion < minverhex)" if { echo "$as_me:$LINENO: $PYTHON -c "$prog"" >&5 ($PYTHON -c "$prog") >&5 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 else { { echo "$as_me:$LINENO: error: too old" >&5 echo "$as_me: error: too old" >&2;} { (exit 1); exit 1; }; } fi am_display_PYTHON=$PYTHON else # Otherwise, try each interpreter until we find one that satisfies # VERSION. echo "$as_me:$LINENO: checking for a Python interpreter with version >= 2.4" >&5 echo $ECHO_N "checking for a Python interpreter with version >= 2.4... $ECHO_C" >&6 if test "${am_cv_pathless_PYTHON+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else for am_cv_pathless_PYTHON in python python2 python2.5 python2.4 python2.3 python2.2 python2.1 python2.0 python1.6 python1.5 none; do test "$am_cv_pathless_PYTHON" = none && break prog="import sys, string # split strings by '.' and convert to numeric. Append some zeros # because we need at least 4 digits for the hex conversion. minver = map(int, string.split('2.4', '.')) + [0, 0, 0] minverhex = 0 for i in xrange(0, 4): minverhex = (minverhex << 8) + minver[i] sys.exit(sys.hexversion < minverhex)" if { echo "$as_me:$LINENO: $am_cv_pathless_PYTHON -c "$prog"" >&5 ($am_cv_pathless_PYTHON -c "$prog") >&5 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; then break fi done fi echo "$as_me:$LINENO: result: $am_cv_pathless_PYTHON" >&5 echo "${ECHO_T}$am_cv_pathless_PYTHON" >&6 # Set $PYTHON to the absolute path of $am_cv_pathless_PYTHON. if test "$am_cv_pathless_PYTHON" = none; then PYTHON=: else # Extract the first word of "$am_cv_pathless_PYTHON", so it can be a program name with args. set dummy $am_cv_pathless_PYTHON; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_path_PYTHON+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else case $PYTHON in [\\/]* | ?:[\\/]*) ac_cv_path_PYTHON="$PYTHON" # Let the user override the test with a path. ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_path_PYTHON="$as_dir/$ac_word$ac_exec_ext" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done ;; esac fi PYTHON=$ac_cv_path_PYTHON if test -n "$PYTHON"; then echo "$as_me:$LINENO: result: $PYTHON" >&5 echo "${ECHO_T}$PYTHON" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi am_display_PYTHON=$am_cv_pathless_PYTHON fi if test "$PYTHON" = :; then { { echo "$as_me:$LINENO: error: no suitable Python interpreter found" >&5 echo "$as_me: error: no suitable Python interpreter found" >&2;} { (exit 1); exit 1; }; } else echo "$as_me:$LINENO: checking for $am_display_PYTHON version" >&5 echo $ECHO_N "checking for $am_display_PYTHON version... $ECHO_C" >&6 if test "${am_cv_python_version+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else am_cv_python_version=`$PYTHON -c "import sys; print sys.version[:3]"` fi echo "$as_me:$LINENO: result: $am_cv_python_version" >&5 echo "${ECHO_T}$am_cv_python_version" >&6 PYTHON_VERSION=$am_cv_python_version PYTHON_PREFIX='${prefix}' PYTHON_EXEC_PREFIX='${exec_prefix}' echo "$as_me:$LINENO: checking for $am_display_PYTHON platform" >&5 echo $ECHO_N "checking for $am_display_PYTHON platform... $ECHO_C" >&6 if test "${am_cv_python_platform+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else am_cv_python_platform=`$PYTHON -c "import sys; print sys.platform"` fi echo "$as_me:$LINENO: result: $am_cv_python_platform" >&5 echo "${ECHO_T}$am_cv_python_platform" >&6 PYTHON_PLATFORM=$am_cv_python_platform echo "$as_me:$LINENO: checking for $am_display_PYTHON script directory" >&5 echo $ECHO_N "checking for $am_display_PYTHON script directory... $ECHO_C" >&6 if test "${am_cv_python_pythondir+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else am_cv_python_pythondir=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_lib(0,0,prefix='$PYTHON_PREFIX')" 2>/dev/null || echo "$PYTHON_PREFIX/lib/python$PYTHON_VERSION/site-packages"` fi echo "$as_me:$LINENO: result: $am_cv_python_pythondir" >&5 echo "${ECHO_T}$am_cv_python_pythondir" >&6 pythondir=$am_cv_python_pythondir pkgpythondir=\${pythondir}/$PACKAGE echo "$as_me:$LINENO: checking for $am_display_PYTHON extension module directory" >&5 echo $ECHO_N "checking for $am_display_PYTHON extension module directory... $ECHO_C" >&6 if test "${am_cv_python_pyexecdir+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else am_cv_python_pyexecdir=`$PYTHON -c "from distutils import sysconfig; print sysconfig.get_python_lib(1,0,prefix='$PYTHON_EXEC_PREFIX')" 2>/dev/null || echo "${PYTHON_EXEC_PREFIX}/lib/python${PYTHON_VERSION}/site-packages"` fi echo "$as_me:$LINENO: result: $am_cv_python_pyexecdir" >&5 echo "${ECHO_T}$am_cv_python_pyexecdir" >&6 pyexecdir=$am_cv_python_pyexecdir pkgpyexecdir=\${pyexecdir}/$PACKAGE fi # Check whether --with-boost or --without-boost was given. if test "${with_boost+set}" = set; then withval="$with_boost" if test "$withval" = "no"; then want_boost="no" elif test "$withval" = "yes"; then want_boost="yes" ac_boost_path="" else want_boost="yes" ac_boost_path="$withval" fi else want_boost="yes" fi; # Check whether --with-boost-libdir or --without-boost-libdir was given. 
if test "${with_boost_libdir+set}" = set; then withval="$with_boost_libdir" if test -d $withval then ac_boost_lib_path="$withval" else { { echo "$as_me:$LINENO: error: --with-boost-libdir expected directory name" >&5 echo "$as_me: error: --with-boost-libdir expected directory name" >&2;} { (exit 1); exit 1; }; } fi else ac_boost_lib_path="" fi; if test "x$want_boost" = "xyes"; then boost_lib_version_req=1.38.0 boost_lib_version_req_shorten=`expr $boost_lib_version_req : '\([0-9]*\.[0-9]*\)'` boost_lib_version_req_major=`expr $boost_lib_version_req : '\([0-9]*\)'` boost_lib_version_req_minor=`expr $boost_lib_version_req : '[0-9]*\.\([0-9]*\)'` boost_lib_version_req_sub_minor=`expr $boost_lib_version_req : '[0-9]*\.[0-9]*\.\([0-9]*\)'` if test "x$boost_lib_version_req_sub_minor" = "x" ; then boost_lib_version_req_sub_minor="0" fi WANT_BOOST_VERSION=`expr $boost_lib_version_req_major \* 100000 \+ $boost_lib_version_req_minor \* 100 \+ $boost_lib_version_req_sub_minor` echo "$as_me:$LINENO: checking for boostlib >= $boost_lib_version_req" >&5 echo $ECHO_N "checking for boostlib >= $boost_lib_version_req... $ECHO_C" >&6 succeeded=no if test "$ac_boost_path" != ""; then BOOST_LDFLAGS="-L$ac_boost_path/lib" BOOST_CPPFLAGS="-I$ac_boost_path/include" else for ac_boost_path_tmp in /usr /usr/local /opt /opt/local ; do if test -d "$ac_boost_path_tmp/include/boost" && test -r "$ac_boost_path_tmp/include/boost"; then BOOST_LDFLAGS="-L$ac_boost_path_tmp/lib" BOOST_CPPFLAGS="-I$ac_boost_path_tmp/include" break; fi done fi if test "$ac_boost_lib_path" != ""; then BOOST_LDFLAGS="-L$ac_boost_lib_path" fi CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include int main () { #if BOOST_VERSION >= $WANT_BOOST_VERSION // Everything is okay #else # error Boost version is too old #endif ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 succeeded=yes found_system=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu if test "x$succeeded" != "xyes"; then _version=0 if test "$ac_boost_path" != ""; then if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp fi VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BOOST_CPPFLAGS="-I$ac_boost_path/include/boost-$VERSION_UNDERSCORE" done fi else for ac_boost_path in /usr /usr/local /opt /opt/local ; do if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp best_path=$ac_boost_path fi done fi done VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` boost_major_version=`echo "$VERSION_UNDERSCORE" | sed 's/_//;s/_.*//'` BOOST_CPPFLAGS="-I$best_path/include/boost-$VERSION_UNDERSCORE" if test "$ac_boost_lib_path" = "" then BOOST_LDFLAGS="-L$best_path/lib" fi if test "x$BOOST_ROOT" != "x"; then if test -d "$BOOST_ROOT" && test -r "$BOOST_ROOT" && test -d "$BOOST_ROOT/stage/lib" && test -r "$BOOST_ROOT/stage/lib"; then version_dir=`expr //$BOOST_ROOT : '.*/\(.*\)'` stage_version=`echo $version_dir | sed 's/boost_//' | sed 's/_/./g'` stage_version_shorten=`expr $stage_version : '\([0-9]*\.[0-9]*\)'` V_CHECK=`expr $stage_version_shorten \>\= $_version` if test "$V_CHECK" = "1" -a "$ac_boost_lib_path" = "" ; then { echo "$as_me:$LINENO: We will use a staged boost library from $BOOST_ROOT" >&5 echo "$as_me: We will use a staged boost library from $BOOST_ROOT" >&6;} BOOST_CPPFLAGS="-I$BOOST_ROOT" BOOST_LDFLAGS="-L$BOOST_ROOT/stage/lib" fi fi fi fi CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include int main () { #if BOOST_VERSION >= $WANT_BOOST_VERSION // Everything is okay #else # error Boost version is too old #endif ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 succeeded=yes found_system=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu fi if test "$succeeded" != "yes" ; then if test "$_version" = "0" ; then { { echo "$as_me:$LINENO: error: We could not detect the boost libraries (version $boost_lib_version_req_shorten or higher). If you have a staged boost library (still not installed) please specify \$BOOST_ROOT in your environment and do not give a PATH to --with-boost option. If you are sure you have boost installed, then check your version number looking in . See http://randspringer.de/boost for more documentation." >&5 echo "$as_me: error: We could not detect the boost libraries (version $boost_lib_version_req_shorten or higher). If you have a staged boost library (still not installed) please specify \$BOOST_ROOT in your environment and do not give a PATH to --with-boost option. If you are sure you have boost installed, then check your version number looking in . See http://randspringer.de/boost for more documentation." >&2;} { (exit 1); exit 1; }; } else { echo "$as_me:$LINENO: Your boost libraries seem to old (version $_version)." >&5 echo "$as_me: Your boost libraries seem to old (version $_version)." >&6;} fi else cat >>confdefs.h <<\_ACEOF #define HAVE_BOOST _ACEOF fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi # Check whether --with-bam or --without-bam was given. if test "${with_bam+set}" = set; then withval="$with_bam" if test "$withval" = "no"; then want_bam="no" elif test "$withval" = "yes"; then want_bam="yes" ac_bam_path="" else want_bam="yes" ac_bam_path="$withval" fi else want_bam="yes" fi; # Check whether --with-bam-libdir or --without-bam-libdir was given. if test "${with_bam_libdir+set}" = set; then withval="$with_bam_libdir" if test -d $withval then ac_bam_lib_path="$withval" else { { echo "$as_me:$LINENO: error: --with-bam-libdir expected directory name" >&5 echo "$as_me: error: --with-bam-libdir expected directory name" >&2;} { (exit 1); exit 1; }; } fi else ac_bam_lib_path="" fi; if test "x$want_bam" = "xyes"; then # bam_lib_version_req=ifelse([], ,1.20.0,) # bam_lib_version_req_shorten=`expr $bam_lib_version_req : '\([[0-9]]*\.[[0-9]]*\)'` # bam_lib_version_req_major=`expr $bam_lib_version_req : '\([[0-9]]*\)'` # bam_lib_version_req_minor=`expr $bam_lib_version_req : '[[0-9]]*\.\([[0-9]]*\)'` # bam_lib_version_req_sub_minor=`expr $bam_lib_version_req : '[[0-9]]*\.[[0-9]]*\.\([[0-9]]*\)'` # if test "x$bam_lib_version_req_sub_minor" = "x" ; then # bam_lib_version_req_sub_minor="0" # fi # WANT_BAM_VERSION=`expr $bam_lib_version_req_major \* 100000 \+ $bam_lib_version_req_minor \* 100 \+ $bam_lib_version_req_sub_minor` echo "$as_me:$LINENO: checking for bamlib" >&5 echo $ECHO_N "checking for bamlib... 
$ECHO_C" >&6 succeeded=no if test "$ac_bam_path" != ""; then BAM_LDFLAGS="-L$ac_bam_path/lib" BAM_CPPFLAGS="-I$ac_bam_path/include" else for ac_bam_path_tmp in /usr /usr/local /opt /opt/local ; do if test -d "$ac_bam_path_tmp/include/bam" && test -r "$ac_bam_path_tmp/include/bam"; then BAM_LDFLAGS="-L$ac_bam_path_tmp/lib" BAM_CPPFLAGS="-I$ac_bam_path_tmp/include" break; fi done fi if test "$ac_bam_lib_path" != ""; then BAM_LDFLAGS="-L$ac_bam_lib_path" fi CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BAM_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BAM_LDFLAGS" export LDFLAGS ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 succeeded=yes found_system=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu if test "x$succeeded" != "xyes"; then _version=0 if test "$ac_bam_path" != ""; then if test -d "$ac_bam_path" && test -r "$ac_bam_path"; then for i in `ls -d $ac_bam_path/include/bam-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_bam_path##" | sed 's/\/include\/bam-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp fi VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BAM_CPPFLAGS="-I$ac_bam_path/include/bam-$VERSION_UNDERSCORE" done fi else for ac_bam_path in /usr /usr/local /opt /opt/local ; do if test -d "$ac_bam_path" && test -r "$ac_bam_path"; then for i in `ls -d $ac_bam_path/include/bam-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_bam_path##" | sed 's/\/include\/bam-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp best_path=$ac_bam_path fi done fi done VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BAM_CPPFLAGS="-I$best_path/include/bam-$VERSION_UNDERSCORE" if test "$ac_bam_lib_path" = "" then BAM_LDFLAGS="-L$best_path/lib" fi if test "x$BAM_ROOT" != "x"; then if test -d "$BAM_ROOT" && test -r "$BAM_ROOT" && test -d "$BAM_ROOT/stage/lib" && test -r "$BAM_ROOT/stage/lib"; then version_dir=`expr //$BAM_ROOT : '.*/\(.*\)'` stage_version=`echo $version_dir | sed 's/bam_//' | sed 's/_/./g'` 
stage_version_shorten=`expr $stage_version : '\([0-9]*\.[0-9]*\)'` V_CHECK=`expr $stage_version_shorten \>\= $_version` if test "$V_CHECK" = "1" -a "$ac_bam_lib_path" = "" ; then { echo "$as_me:$LINENO: We will use a staged bam library from $BAM_ROOT" >&5 echo "$as_me: We will use a staged bam library from $BAM_ROOT" >&6;} BAM_CPPFLAGS="-I$BAM_ROOT" BAM_LDFLAGS="-L$BAM_ROOT/stage/lib" fi fi fi fi CPPFLAGS="$CPPFLAGS $BAM_CPPFLAGS" export CPPFLAGS LDFLAGS="$LDFLAGS $BAM_LDFLAGS" export LDFLAGS ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 succeeded=yes found_system=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu fi if test "$succeeded" != "yes" ; then if test "$_version" = "0" ; then { { echo "$as_me:$LINENO: error: We could not detect the bam libraries (version $bam_lib_version_req_shorten or higher). If you have a staged bam library (still not installed) please specify \$BAM_ROOT in your environment and do not give a PATH to --with-bam option." >&5 echo "$as_me: error: We could not detect the bam libraries (version $bam_lib_version_req_shorten or higher). If you have a staged bam library (still not installed) please specify \$BAM_ROOT in your environment and do not give a PATH to --with-bam option." >&2;} { (exit 1); exit 1; }; } else { echo "$as_me:$LINENO: Your bam libraries seem too old (version $_version)." >&5 echo "$as_me: Your bam libraries seem too old (version $_version)." >&6;} fi else BAM_LIB="-lbam" cat >>confdefs.h <<\_ACEOF #define HAVE_BAM _ACEOF fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi # Make sure we can run config.sub. $ac_config_sub sun4 >/dev/null 2>&1 || { { echo "$as_me:$LINENO: error: cannot run $ac_config_sub" >&5 echo "$as_me: error: cannot run $ac_config_sub" >&2;} { (exit 1); exit 1; }; } echo "$as_me:$LINENO: checking build system type" >&5 echo $ECHO_N "checking build system type... 
$ECHO_C" >&6 if test "${ac_cv_build+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_cv_build_alias=$build_alias test -z "$ac_cv_build_alias" && ac_cv_build_alias=`$ac_config_guess` test -z "$ac_cv_build_alias" && { { echo "$as_me:$LINENO: error: cannot guess build type; you must specify one" >&5 echo "$as_me: error: cannot guess build type; you must specify one" >&2;} { (exit 1); exit 1; }; } ac_cv_build=`$ac_config_sub $ac_cv_build_alias` || { { echo "$as_me:$LINENO: error: $ac_config_sub $ac_cv_build_alias failed" >&5 echo "$as_me: error: $ac_config_sub $ac_cv_build_alias failed" >&2;} { (exit 1); exit 1; }; } fi echo "$as_me:$LINENO: result: $ac_cv_build" >&5 echo "${ECHO_T}$ac_cv_build" >&6 build=$ac_cv_build build_cpu=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` build_vendor=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` build_os=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` # Check whether --with-boost-thread or --without-boost-thread was given. if test "${with_boost_thread+set}" = set; then withval="$with_boost_thread" if test "$withval" = "no"; then want_boost="no" elif test "$withval" = "yes"; then want_boost="yes" ax_boost_user_thread_lib="" ax_booth_user_system_lib="" else want_boost="yes" echo "using $withval" ax_boost_user_thread_lib="$withval" fi else want_boost="yes" fi; if test "x$want_boost" = "xyes"; then CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS echo "$as_me:$LINENO: checking whether the Boost::Thread library is available" >&5 echo $ECHO_N "checking whether the Boost::Thread library is available... $ECHO_C" >&6 if test "${ax_cv_boost_thread+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_ext=cc ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu CXXFLAGS_SAVE=$CXXFLAGS if test "x$build_os" = "xsolaris" ; then CXXFLAGS="-pthreads $CXXFLAGS" elif test "x$build_os" = "xming32" ; then CXXFLAGS="-mthreads $CXXFLAGS" else CXXFLAGS="-pthread $CXXFLAGS" fi cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include int main () { boost::thread_group thrds; return 0; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ax_cv_boost_thread=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ax_cv_boost_thread=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext CXXFLAGS=$CXXFLAGS_SAVE ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu fi echo "$as_me:$LINENO: result: $ax_cv_boost_thread" >&5 echo "${ECHO_T}$ax_cv_boost_thread" >&6 if test "x$ax_cv_boost_thread" = "xyes"; then if test "x$build_os" = "xsolaris" ; then BOOST_CPPFLAGS="-pthreads $BOOST_CPPFLAGS" elif test "x$build_os" = "xming32" ; then BOOST_CPPFLAGS="-mthreads $BOOST_CPPFLAGS" else BOOST_CPPFLAGS="-pthread $BOOST_CPPFLAGS" fi cat >>confdefs.h <<\_ACEOF #define HAVE_BOOST_THREAD _ACEOF BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/[^\/]*//'` LDFLAGS_SAVE=$LDFLAGS case "x$build_os" in *bsd* ) LDFLAGS="-pthread $LDFLAGS" break; ;; esac if test "x$ax_boost_user_thread_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_thread*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_thread.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_thread*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_thread.*\)\.a*$;\1;'`; do ax_lib=${libextension} as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh` echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5 echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6 if eval "test \"\${$as_ac_Lib+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_check_lib_save_LIBS=$LIBS LIBS="-l$ax_lib $LIBS" cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. */ char exit (); int main () { exit (); ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_Lib=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_Lib=no" fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6 if test `eval echo '${'$as_ac_Lib'}'` = yes; then BOOST_THREAD_LIB="-l$ax_lib"; link_thread="yes"; break else link_thread="no" fi done if test "x$link_thread" != "xyes"; then for libextension in `ls $BOOSTLIBDIR/boost_thread*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_thread.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_thread*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_thread.*\)\.a*$;\1;'` ; do ax_lib=${libextension} as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh` echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5 echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6 if eval "test \"\${$as_ac_Lib+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_check_lib_save_LIBS=$LIBS LIBS="-l$ax_lib $LIBS" cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. */ char exit (); int main () { exit (); ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_Lib=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_Lib=no" fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6 if test `eval echo '${'$as_ac_Lib'}'` = yes; then BOOST_THREAD_LIB="-l$ax_lib"; link_thread="yes"; break else link_thread="no" fi done fi else BOOST_THREAD_LIB="$ax_boost_user_thread_lib"; link_thread="yes"; fi if test "x$link_thread" = "xno"; then { { echo "$as_me:$LINENO: error: Could not link against $ax_lib !" >&5 echo "$as_me: error: Could not link against $ax_lib !" 
>&2;} { (exit 1); exit 1; }; } else case "x$build_os" in *bsd* ) BOOST_LDFLAGS="-pthread $BOOST_LDFLAGS" break; ;; esac fi if test "x$ax_boost_user_system_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_system*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_system.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_system*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_system.*\)\.a*$;\1;'`; do ax_lib=${libextension} as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh` echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5 echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6 if eval "test \"\${$as_ac_Lib+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_check_lib_save_LIBS=$LIBS LIBS="-l$ax_lib $LIBS" cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. */ char exit (); int main () { exit (); ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_Lib=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_Lib=no" fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6 if test `eval echo '${'$as_ac_Lib'}'` = yes; then BOOST_SYSTEM_LIB="-l$ax_lib"; link_system="yes"; break else link_system="no" fi done if test "x$link_system" != "xyes"; then for libextension in `ls $BOOSTLIBDIR/boost_system*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_system.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_system*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_system.*\)\.a*$;\1;'` ; do ax_lib=${libextension} as_ac_Lib=`echo "ac_cv_lib_$ax_lib''_exit" | $as_tr_sh` echo "$as_me:$LINENO: checking for exit in -l$ax_lib" >&5 echo $ECHO_N "checking for exit in -l$ax_lib... $ECHO_C" >&6 if eval "test \"\${$as_ac_Lib+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_check_lib_save_LIBS=$LIBS LIBS="-l$ax_lib $LIBS" cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. 
*/ char exit (); int main () { exit (); ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_Lib=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_Lib=no" fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Lib'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Lib'}'`" >&6 if test `eval echo '${'$as_ac_Lib'}'` = yes; then BOOST_SYSTEM_LIB="-l$ax_lib"; link_system="yes"; break else link_system="no" fi done fi else BOOST_SYSTEM_LIB="$ax_boost_user_system_lib"; link_system="yes"; fi fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi # AX_CHECK_ZLIB() # Checks for header files. ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu echo "$as_me:$LINENO: checking how to run the C preprocessor" >&5 echo $ECHO_N "checking how to run the C preprocessor... $ECHO_C" >&6 # On Suns, sometimes $CPP names a directory. if test -n "$CPP" && test -d "$CPP"; then CPP= fi if test -z "$CPP"; then if test "${ac_cv_prog_CPP+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else # Double quotes because CPP needs to be expanded for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" do ac_preproc_ok=false for ac_c_preproc_warn_flag in '' yes do # Use a header file that comes with gcc, so configuring glibc # with a fresh cross-compiler works. # Prefer to if __STDC__ is defined, since # exists even on freestanding compilers. # On the NeXT, cc -E runs the code through the compiler's parser, # not just through cpp. "Syntax error" is here to catch this case. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #ifdef __STDC__ # include #else # include #endif Syntax error _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then : else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 # Broken: fails on valid input. continue fi rm -f conftest.err conftest.$ac_ext # OK, works on sane cases. Now check whether non-existent headers # can be detected and how. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. 
*/ #include _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then # Broken: success on invalid input. continue else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 # Passes both tests. ac_preproc_ok=: break fi rm -f conftest.err conftest.$ac_ext done # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. rm -f conftest.err conftest.$ac_ext if $ac_preproc_ok; then break fi done ac_cv_prog_CPP=$CPP fi CPP=$ac_cv_prog_CPP else ac_cv_prog_CPP=$CPP fi echo "$as_me:$LINENO: result: $CPP" >&5 echo "${ECHO_T}$CPP" >&6 ac_preproc_ok=false for ac_c_preproc_warn_flag in '' yes do # Use a header file that comes with gcc, so configuring glibc # with a fresh cross-compiler works. # Prefer to if __STDC__ is defined, since # exists even on freestanding compilers. # On the NeXT, cc -E runs the code through the compiler's parser, # not just through cpp. "Syntax error" is here to catch this case. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #ifdef __STDC__ # include #else # include #endif Syntax error _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then : else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 # Broken: fails on valid input. continue fi rm -f conftest.err conftest.$ac_ext # OK, works on sane cases. Now check whether non-existent headers # can be detected and how. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then # Broken: success on invalid input. continue else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 # Passes both tests. ac_preproc_ok=: break fi rm -f conftest.err conftest.$ac_ext done # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. rm -f conftest.err conftest.$ac_ext if $ac_preproc_ok; then : else { { echo "$as_me:$LINENO: error: C preprocessor \"$CPP\" fails sanity check See \`config.log' for more details." >&5 echo "$as_me: error: C preprocessor \"$CPP\" fails sanity check See \`config.log' for more details." 
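# --- Editor's note (not part of the generated configure script) ----------
# The preprocessor sanity check above accepts a candidate CPP only if it
# handles valid input and also *rejects* invalid input (a literal
# "Syntax error" line / an include of a nonexistent header).  A condensed
# sketch of that idea, using "cc -E" purely as an example candidate:
CPP="cc -E"
cat > good.c <<'EOF'
#include <stdio.h>
int ok;
EOF
cat > bad.c <<'EOF'
#include <ac_nonexistent.h>
EOF
if $CPP good.c >/dev/null 2>&1 && ! $CPP bad.c >/dev/null 2>&1; then
  echo "preprocessor looks sane: $CPP"
else
  echo "preprocessor fails sanity check: $CPP"
fi
rm -f good.c bad.c
# --------------------------------------------------------------------------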
>&2;} { (exit 1); exit 1; }; } fi ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu echo "$as_me:$LINENO: checking for egrep" >&5 echo $ECHO_N "checking for egrep... $ECHO_C" >&6 if test "${ac_cv_prog_egrep+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if echo a | (grep -E '(a|b)') >/dev/null 2>&1 then ac_cv_prog_egrep='grep -E' else ac_cv_prog_egrep='egrep' fi fi echo "$as_me:$LINENO: result: $ac_cv_prog_egrep" >&5 echo "${ECHO_T}$ac_cv_prog_egrep" >&6 EGREP=$ac_cv_prog_egrep echo "$as_me:$LINENO: checking for ANSI C header files" >&5 echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6 if test "${ac_cv_header_stdc+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #include #include #include int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_header_stdc=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_header_stdc=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext if test $ac_cv_header_stdc = yes; then # SunOS 4.x string.h does not declare mem*, contrary to ANSI. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | $EGREP "memchr" >/dev/null 2>&1; then : else ac_cv_header_stdc=no fi rm -f conftest* fi if test $ac_cv_header_stdc = yes; then # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | $EGREP "free" >/dev/null 2>&1; then : else ac_cv_header_stdc=no fi rm -f conftest* fi if test $ac_cv_header_stdc = yes; then # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. if test "$cross_compiling" = yes; then : else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #if ((' ' & 0x0FF) == 0x020) # define ISLOWER(c) ('a' <= (c) && (c) <= 'z') # define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) #else # define ISLOWER(c) \ (('a' <= (c) && (c) <= 'i') \ || ('j' <= (c) && (c) <= 'r') \ || ('s' <= (c) && (c) <= 'z')) # define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) #endif #define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) int main () { int i; for (i = 0; i < 256; i++) if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); exit (0); } _ACEOF rm -f conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='./conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then : else echo "$as_me: program exited with status $ac_status" >&5 echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ( exit $ac_status ) ac_cv_header_stdc=no fi rm -f core *.core gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext fi fi fi echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5 echo "${ECHO_T}$ac_cv_header_stdc" >&6 if test $ac_cv_header_stdc = yes; then cat >>confdefs.h <<\_ACEOF #define STDC_HEADERS 1 _ACEOF fi # On IRIX 5.3, sys/types and inttypes.h are conflicting. for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ inttypes.h stdint.h unistd.h do as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh` echo "$as_me:$LINENO: checking for $ac_header" >&5 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6 if eval "test \"\${$as_ac_Header+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default #include <$ac_header> _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_Header=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_Header=no" fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Header'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Header'}'`" >&6 if test `eval echo '${'$as_ac_Header'}'` = yes; then cat >>confdefs.h <<_ACEOF #define `echo "HAVE_$ac_header" | $as_tr_cpp` 1 _ACEOF fi done for ac_header in stdlib.h string.h unistd.h do as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh` if eval "test \"\${$as_ac_Header+set}\" = set"; then echo "$as_me:$LINENO: checking for $ac_header" >&5 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6 if eval "test \"\${$as_ac_Header+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Header'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Header'}'`" >&6 else # Is the header compilable? echo "$as_me:$LINENO: checking $ac_header usability" >&5 echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6 cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default #include <$ac_header> _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_header_compiler=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_header_compiler=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext echo "$as_me:$LINENO: result: $ac_header_compiler" >&5 echo "${ECHO_T}$ac_header_compiler" >&6 # Is the header present? echo "$as_me:$LINENO: checking $ac_header presence" >&5 echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6 cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include <$ac_header> _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then ac_header_preproc=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_header_preproc=no fi rm -f conftest.err conftest.$ac_ext echo "$as_me:$LINENO: result: $ac_header_preproc" >&5 echo "${ECHO_T}$ac_header_preproc" >&6 # So? What about this header? case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in yes:no: ) { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5 echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5 echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;} ac_header_preproc=yes ;; no:yes:* ) { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5 echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: check for missing prerequisite headers?" >&5 echo "$as_me: WARNING: $ac_header: check for missing prerequisite headers?" 
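# --- Editor's note (not part of the generated configure script) ----------
# Each header above is probed two ways: "usability" (does it compile when
# included after the default includes?) and "presence" (does the bare
# preprocessor accept it?).  Disagreement between the two is what produces
# the WARNING messages quoted above.  A stripped-down sketch, with "cc"
# and the header name chosen only as examples:
hdr=stdlib.h
cat > conftest.c <<EOF
#include <stdio.h>         /* stand-in for \$ac_includes_default */
#include <$hdr>
int main () { return 0; }
EOF
compiles=no; present=no
cc -c conftest.c -o conftest.o >/dev/null 2>&1 && compiles=yes
cc -E conftest.c >/dev/null 2>&1 && present=yes
echo "$hdr: compilable=$compiles present=$present"
rm -f conftest.c conftest.o
# --------------------------------------------------------------------------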
>&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5 echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&5 echo "$as_me: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5 echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX ## ----------------------------------------- ## ## Report this to tophat.cufflinks@gmail.com ## ## ----------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; esac echo "$as_me:$LINENO: checking for $ac_header" >&5 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6 if eval "test \"\${$as_ac_Header+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else eval "$as_ac_Header=\$ac_header_preproc" fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Header'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Header'}'`" >&6 fi if test `eval echo '${'$as_ac_Header'}'` = yes; then cat >>confdefs.h <<_ACEOF #define `echo "HAVE_$ac_header" | $as_tr_cpp` 1 _ACEOF fi done # Checks for header files. echo "$as_me:$LINENO: checking for ANSI C header files" >&5 echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6 if test "${ac_cv_header_stdc+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #include #include #include int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_header_stdc=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_header_stdc=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext if test $ac_cv_header_stdc = yes; then # SunOS 4.x string.h does not declare mem*, contrary to ANSI. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | $EGREP "memchr" >/dev/null 2>&1; then : else ac_cv_header_stdc=no fi rm -f conftest* fi if test $ac_cv_header_stdc = yes; then # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. 
*/ #include _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | $EGREP "free" >/dev/null 2>&1; then : else ac_cv_header_stdc=no fi rm -f conftest* fi if test $ac_cv_header_stdc = yes; then # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. if test "$cross_compiling" = yes; then : else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #if ((' ' & 0x0FF) == 0x020) # define ISLOWER(c) ('a' <= (c) && (c) <= 'z') # define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) #else # define ISLOWER(c) \ (('a' <= (c) && (c) <= 'i') \ || ('j' <= (c) && (c) <= 'r') \ || ('s' <= (c) && (c) <= 'z')) # define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) #endif #define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) int main () { int i; for (i = 0; i < 256; i++) if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); exit (0); } _ACEOF rm -f conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='./conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then : else echo "$as_me: program exited with status $ac_status" >&5 echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ( exit $ac_status ) ac_cv_header_stdc=no fi rm -f core *.core gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext fi fi fi echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5 echo "${ECHO_T}$ac_cv_header_stdc" >&6 if test $ac_cv_header_stdc = yes; then cat >>confdefs.h <<\_ACEOF #define STDC_HEADERS 1 _ACEOF fi # Checks for typedefs, structures, and compiler characteristics. echo "$as_me:$LINENO: checking for stdbool.h that conforms to C99" >&5 echo $ECHO_N "checking for stdbool.h that conforms to C99... $ECHO_C" >&6 if test "${ac_cv_header_stdbool_h+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include #ifndef bool # error bool is not defined #endif #ifndef false # error false is not defined #endif #if false # error false is not 0 #endif #ifndef true # error true is not defined #endif #if true != 1 # error true is not 1 #endif #ifndef __bool_true_false_are_defined # error __bool_true_false_are_defined is not defined #endif struct s { _Bool s: 1; _Bool t; } s; char a[true == 1 ? 1 : -1]; char b[false == 0 ? 1 : -1]; char c[__bool_true_false_are_defined == 1 ? 1 : -1]; char d[(bool) -0.5 == true ? 1 : -1]; bool e = &s; char f[(_Bool) -0.0 == false ? 1 : -1]; char g[true]; char h[sizeof (_Bool)]; char i[sizeof s.t]; int main () { return !a + !b + !c + !d + !e + !f + !g + !h + !i; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_header_stdbool_h=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_header_stdbool_h=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_header_stdbool_h" >&5 echo "${ECHO_T}$ac_cv_header_stdbool_h" >&6 echo "$as_me:$LINENO: checking for _Bool" >&5 echo $ECHO_N "checking for _Bool... $ECHO_C" >&6 if test "${ac_cv_type__Bool+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default int main () { if ((_Bool *) 0) return 0; if (sizeof (_Bool)) return 0; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_type__Bool=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_type__Bool=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_type__Bool" >&5 echo "${ECHO_T}$ac_cv_type__Bool" >&6 if test $ac_cv_type__Bool = yes; then cat >>confdefs.h <<_ACEOF #define HAVE__BOOL 1 _ACEOF fi if test $ac_cv_header_stdbool_h = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_STDBOOL_H 1 _ACEOF fi echo "$as_me:$LINENO: checking for inline" >&5 echo $ECHO_N "checking for inline... $ECHO_C" >&6 if test "${ac_cv_c_inline+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_cv_c_inline=no for ac_kw in inline __inline__ __inline; do cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #ifndef __cplusplus typedef int foo_t; static $ac_kw foo_t static_foo () {return 0; } $ac_kw foo_t foo () {return 0; } #endif _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_c_inline=$ac_kw; break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext done fi echo "$as_me:$LINENO: result: $ac_cv_c_inline" >&5 echo "${ECHO_T}$ac_cv_c_inline" >&6 case $ac_cv_c_inline in inline | yes) ;; *) case $ac_cv_c_inline in no) ac_val=;; *) ac_val=$ac_cv_c_inline;; esac cat >>confdefs.h <<_ACEOF #ifndef __cplusplus #define inline $ac_val #endif _ACEOF ;; esac echo "$as_me:$LINENO: checking for pid_t" >&5 echo $ECHO_N "checking for pid_t... $ECHO_C" >&6 if test "${ac_cv_type_pid_t+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default int main () { if ((pid_t *) 0) return 0; if (sizeof (pid_t)) return 0; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_type_pid_t=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_type_pid_t=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_type_pid_t" >&5 echo "${ECHO_T}$ac_cv_type_pid_t" >&6 if test $ac_cv_type_pid_t = yes; then : else cat >>confdefs.h <<_ACEOF #define pid_t int _ACEOF fi echo "$as_me:$LINENO: checking for size_t" >&5 echo $ECHO_N "checking for size_t... $ECHO_C" >&6 if test "${ac_cv_type_size_t+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default int main () { if ((size_t *) 0) return 0; if (sizeof (size_t)) return 0; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_type_size_t=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_type_size_t=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_type_size_t" >&5 echo "${ECHO_T}$ac_cv_type_size_t" >&6 if test $ac_cv_type_size_t = yes; then : else cat >>confdefs.h <<_ACEOF #define size_t unsigned _ACEOF fi echo "$as_me:$LINENO: checking for ptrdiff_t" >&5 echo $ECHO_N "checking for ptrdiff_t... $ECHO_C" >&6 if test "${ac_cv_type_ptrdiff_t+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default int main () { if ((ptrdiff_t *) 0) return 0; if (sizeof (ptrdiff_t)) return 0; ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_type_ptrdiff_t=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_type_ptrdiff_t=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_type_ptrdiff_t" >&5 echo "${ECHO_T}$ac_cv_type_ptrdiff_t" >&6 if test $ac_cv_type_ptrdiff_t = yes; then cat >>confdefs.h <<_ACEOF #define HAVE_PTRDIFF_T 1 _ACEOF fi echo "$as_me:$LINENO: checking for an ANSI C-conforming const" >&5 echo $ECHO_N "checking for an ANSI C-conforming const... $ECHO_C" >&6 if test "${ac_cv_c_const+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ int main () { /* FIXME: Include the comments suggested by Paul. */ #ifndef __cplusplus /* Ultrix mips cc rejects this. */ typedef int charset[2]; const charset x; /* SunOS 4.1.1 cc rejects this. */ char const *const *ccp; char **p; /* NEC SVR4.0.2 mips cc rejects this. */ struct point {int x, y;}; static struct point const zero = {0,0}; /* AIX XL C 1.02.0.0 rejects this. It does not let you subtract one const X* pointer from another in an arm of an if-expression whose if-part is not a constant expression */ const char *g = "string"; ccp = &g + (g ? g-g : 0); /* HPUX 7.0 cc rejects these. */ ++ccp; p = (char**) ccp; ccp = (char const *const *) p; { /* SCO 3.2v4 cc rejects this. */ char *t; char const *s = 0 ? (char *) 0 : (char const *) 0; *t++ = 0; } { /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */ int x[] = {25, 17}; const int *foo = &x[0]; ++foo; } { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */ typedef const int *iptr; iptr p = 0; ++p; } { /* AIX XL C 1.02.0.0 rejects this saying "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. 
*/ struct s { int j; const int *ap[3]; }; struct s *b; b->j = 5; } { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ const int foo = 10; } #endif ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_c_const=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_c_const=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext fi echo "$as_me:$LINENO: result: $ac_cv_c_const" >&5 echo "${ECHO_T}$ac_cv_c_const" >&6 if test $ac_cv_c_const = no; then cat >>confdefs.h <<\_ACEOF #define const _ACEOF fi # Check whether --enable-largefile or --disable-largefile was given. if test "${enable_largefile+set}" = set; then enableval="$enable_largefile" fi; if test "$enable_largefile" != no; then echo "$as_me:$LINENO: checking for special C compiler options needed for large files" >&5 echo $ECHO_N "checking for special C compiler options needed for large files... $ECHO_C" >&6 if test "${ac_cv_sys_largefile_CC+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_cv_sys_largefile_CC=no if test "$GCC" != yes; then ac_save_CC=$CC while :; do # IRIX 6.2 and later do not support large files by default, # so use the C compiler's -n32 option if that helps. cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include /* Check that off_t can represent 2**63 - 1 correctly. We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ #define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext CC="$CC -n32" rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_sys_largefile_CC=' -n32'; break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext break done CC=$ac_save_CC rm -f conftest.$ac_ext fi fi echo "$as_me:$LINENO: result: $ac_cv_sys_largefile_CC" >&5 echo "${ECHO_T}$ac_cv_sys_largefile_CC" >&6 if test "$ac_cv_sys_largefile_CC" != no; then CC=$CC$ac_cv_sys_largefile_CC fi echo "$as_me:$LINENO: checking for _FILE_OFFSET_BITS value needed for large files" >&5 echo $ECHO_N "checking for _FILE_OFFSET_BITS value needed for large files... $ECHO_C" >&6 if test "${ac_cv_sys_file_offset_bits+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else while :; do ac_cv_sys_file_offset_bits=no cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include /* Check that off_t can represent 2**63 - 1 correctly. We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ #define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #define _FILE_OFFSET_BITS 64 #include /* Check that off_t can represent 2**63 - 1 correctly. We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ #define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! 
-s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_sys_file_offset_bits=64; break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext break done fi echo "$as_me:$LINENO: result: $ac_cv_sys_file_offset_bits" >&5 echo "${ECHO_T}$ac_cv_sys_file_offset_bits" >&6 if test "$ac_cv_sys_file_offset_bits" != no; then cat >>confdefs.h <<_ACEOF #define _FILE_OFFSET_BITS $ac_cv_sys_file_offset_bits _ACEOF fi rm -f conftest* echo "$as_me:$LINENO: checking for _LARGE_FILES value needed for large files" >&5 echo $ECHO_N "checking for _LARGE_FILES value needed for large files... $ECHO_C" >&6 if test "${ac_cv_sys_large_files+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else while :; do ac_cv_sys_large_files=no cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include /* Check that off_t can represent 2**63 - 1 correctly. We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ #define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #define _LARGE_FILES 1 #include /* Check that off_t can represent 2**63 - 1 correctly. We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ #define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; int main () { ; return 0; } _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_sys_large_files=1; break else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext break done fi echo "$as_me:$LINENO: result: $ac_cv_sys_large_files" >&5 echo "${ECHO_T}$ac_cv_sys_large_files" >&6 if test "$ac_cv_sys_large_files" != no; then cat >>confdefs.h <<_ACEOF #define _LARGE_FILES $ac_cv_sys_large_files _ACEOF fi rm -f conftest* fi # Checks for libraries. echo "$as_me:$LINENO: checking for gzread in -lz" >&5 echo $ECHO_N "checking for gzread in -lz... $ECHO_C" >&6 if test "${ac_cv_lib_z_gzread+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_check_lib_save_LIBS=$LIBS LIBS="-lz $LIBS" cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. */ char gzread (); int main () { gzread (); ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_lib_z_gzread=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_cv_lib_z_gzread=no fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LIBS=$ac_check_lib_save_LIBS fi echo "$as_me:$LINENO: result: $ac_cv_lib_z_gzread" >&5 echo "${ECHO_T}$ac_cv_lib_z_gzread" >&6 if test $ac_cv_lib_z_gzread = yes; then cat >>confdefs.h <<_ACEOF #define HAVE_LIBZ 1 _ACEOF LIBS="-lz $LIBS" fi # Checks for library functions. for ac_header in stdlib.h do as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh` if eval "test \"\${$as_ac_Header+set}\" = set"; then echo "$as_me:$LINENO: checking for $ac_header" >&5 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6 if eval "test \"\${$as_ac_Header+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Header'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Header'}'`" >&6 else # Is the header compilable? echo "$as_me:$LINENO: checking $ac_header usability" >&5 echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6 cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ $ac_includes_default #include <$ac_header> _ACEOF rm -f conftest.$ac_objext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>conftest.er1 ac_status=$? 
grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest.$ac_objext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_header_compiler=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_header_compiler=no fi rm -f conftest.err conftest.$ac_objext conftest.$ac_ext echo "$as_me:$LINENO: result: $ac_header_compiler" >&5 echo "${ECHO_T}$ac_header_compiler" >&6 # Is the header present? echo "$as_me:$LINENO: checking $ac_header presence" >&5 echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6 cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #include <$ac_header> _ACEOF if { (eval echo "$as_me:$LINENO: \"$ac_cpp conftest.$ac_ext\"") >&5 (eval $ac_cpp conftest.$ac_ext) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } >/dev/null; then if test -s conftest.err; then ac_cpp_err=$ac_c_preproc_warn_flag ac_cpp_err=$ac_cpp_err$ac_c_werror_flag else ac_cpp_err= fi else ac_cpp_err=yes fi if test -z "$ac_cpp_err"; then ac_header_preproc=yes else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ac_header_preproc=no fi rm -f conftest.err conftest.$ac_ext echo "$as_me:$LINENO: result: $ac_header_preproc" >&5 echo "${ECHO_T}$ac_header_preproc" >&6 # So? What about this header? case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in yes:no: ) { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5 echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5 echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;} ac_header_preproc=yes ;; no:yes:* ) { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5 echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: check for missing prerequisite headers?" >&5 echo "$as_me: WARNING: $ac_header: check for missing prerequisite headers?" 
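# --- Editor's note (not part of the generated configure script) ----------
# The large-file checks a little further up rely on a compile-time
# assertion: an array whose size is negative unless off_t can represent
# 2**63 - 1.  The test program is compiled once as-is and once with
# _FILE_OFFSET_BITS=64; whichever succeeds first wins.  Sketch, with "cc"
# as an example compiler:
cat > conftest.c <<'EOF'
#include <sys/types.h>
#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
                    && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1];
int main () { return 0; }
EOF
bits=no
if cc -c conftest.c -o conftest.o >/dev/null 2>&1; then
  bits=default
elif cc -D_FILE_OFFSET_BITS=64 -c conftest.c -o conftest.o >/dev/null 2>&1; then
  bits=64
fi
echo "_FILE_OFFSET_BITS needed: $bits"
rm -f conftest.c conftest.o
# --------------------------------------------------------------------------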
>&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5 echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&5 echo "$as_me: WARNING: $ac_header: section \"Present But Cannot Be Compiled\"" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5 echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;} { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5 echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;} ( cat <<\_ASBOX ## ----------------------------------------- ## ## Report this to tophat.cufflinks@gmail.com ## ## ----------------------------------------- ## _ASBOX ) | sed "s/^/$as_me: WARNING: /" >&2 ;; esac echo "$as_me:$LINENO: checking for $ac_header" >&5 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6 if eval "test \"\${$as_ac_Header+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else eval "$as_ac_Header=\$ac_header_preproc" fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_Header'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_Header'}'`" >&6 fi if test `eval echo '${'$as_ac_Header'}'` = yes; then cat >>confdefs.h <<_ACEOF #define `echo "HAVE_$ac_header" | $as_tr_cpp` 1 _ACEOF fi done echo "$as_me:$LINENO: checking for GNU libc compatible malloc" >&5 echo $ECHO_N "checking for GNU libc compatible malloc... $ECHO_C" >&6 if test "${ac_cv_func_malloc_0_nonnull+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test "$cross_compiling" = yes; then ac_cv_func_malloc_0_nonnull=no else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ #if STDC_HEADERS || HAVE_STDLIB_H # include #else char *malloc (); #endif int main () { exit (malloc (0) ? 0 : 1); ; return 0; } _ACEOF rm -f conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='./conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then ac_cv_func_malloc_0_nonnull=yes else echo "$as_me: program exited with status $ac_status" >&5 echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 ( exit $ac_status ) ac_cv_func_malloc_0_nonnull=no fi rm -f core *.core gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext fi fi echo "$as_me:$LINENO: result: $ac_cv_func_malloc_0_nonnull" >&5 echo "${ECHO_T}$ac_cv_func_malloc_0_nonnull" >&6 if test $ac_cv_func_malloc_0_nonnull = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_MALLOC 1 _ACEOF else cat >>confdefs.h <<\_ACEOF #define HAVE_MALLOC 0 _ACEOF case $LIBOBJS in "malloc.$ac_objext" | \ *" malloc.$ac_objext" | \ "malloc.$ac_objext "* | \ *" malloc.$ac_objext "* ) ;; *) LIBOBJS="$LIBOBJS malloc.$ac_objext" ;; esac cat >>confdefs.h <<\_ACEOF #define malloc rpl_malloc _ACEOF fi for ac_func in memset strdup strrchr strtol strsep do as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` echo "$as_me:$LINENO: checking for $ac_func" >&5 echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6 if eval "test \"\${$as_ac_var+set}\" = set"; then echo $ECHO_N "(cached) $ECHO_C" >&6 else cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ /* Define $ac_func to an innocuous variant, in case declares $ac_func. For example, HP-UX 11i declares gettimeofday. */ #define $ac_func innocuous_$ac_func /* System header to define __stub macros and hopefully few prototypes, which can conflict with char $ac_func (); below. Prefer to if __STDC__ is defined, since exists even on freestanding compilers. */ #ifdef __STDC__ # include #else # include #endif #undef $ac_func /* Override any gcc2 internal prototype to avoid an error. */ #ifdef __cplusplus extern "C" { #endif /* We use char because int might match the return type of a gcc2 builtin and then its argument prototype would still apply. */ char $ac_func (); /* The GNU C library defines this for functions which it implements to always fail with ENOSYS. Some functions are actually named something starting with __ and the normal name is an alias. */ #if defined (__stub_$ac_func) || defined (__stub___$ac_func) choke me #else char (*f) () = $ac_func; #endif #ifdef __cplusplus } #endif int main () { return f != $ac_func; ; return 0; } _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5 (eval $ac_link) 2>conftest.er1 ac_status=$? grep -v '^ *+' conftest.er1 >conftest.err rm -f conftest.er1 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_try='test -s conftest$ac_exeext' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then eval "$as_ac_var=yes" else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 eval "$as_ac_var=no" fi rm -f conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext fi echo "$as_me:$LINENO: result: `eval echo '${'$as_ac_var'}'`" >&5 echo "${ECHO_T}`eval echo '${'$as_ac_var'}'`" >&6 if test `eval echo '${'$as_ac_var'}'` = yes; then cat >>confdefs.h <<_ACEOF #define `echo "HAVE_$ac_func" | $as_tr_cpp` 1 _ACEOF fi done # check the platform echo "$as_me:$LINENO: checking host system type" >&5 echo $ECHO_N "checking host system type... 
$ECHO_C" >&6 if test "${ac_cv_host+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else ac_cv_host_alias=$host_alias test -z "$ac_cv_host_alias" && ac_cv_host_alias=$ac_cv_build_alias ac_cv_host=`$ac_config_sub $ac_cv_host_alias` || { { echo "$as_me:$LINENO: error: $ac_config_sub $ac_cv_host_alias failed" >&5 echo "$as_me: error: $ac_config_sub $ac_cv_host_alias failed" >&2;} { (exit 1); exit 1; }; } fi echo "$as_me:$LINENO: result: $ac_cv_host" >&5 echo "${ECHO_T}$ac_cv_host" >&6 host=$ac_cv_host host_cpu=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` host_vendor=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` host_os=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` # --------------------------------------------------------------------- # Debug and profile # --------------------------------------------------------------------- # set CFLAGS and CXXFLAGS #user_CFLAGS="${CXXFLAGS}" user_CFLAGS=${CFLAGS} generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wuninitialized" ext_CFLAGS="" debug_CFLAGS="" user_LDFLAGS="$LDFLAGS" # Check whether --enable-intel64 or --disable-intel64 was given. if test "${enable_intel64+set}" = set; then enableval="$enable_intel64" ext_CFLAGS="${ext_CFLAGS} -mtune=nocona" fi; # Check whether --enable-debug or --disable-debug was given. if test "${enable_debug+set}" = set; then enableval="$enable_debug" else enable_debug=no fi; # Check whether --enable-optim or --disable-optim was given. if test "${enable_optim+set}" = set; then enableval="$enable_optim" if test "x$enable_optim" = xyes; then enable_optim=3; fi else enable_optim=3 fi; if test "x$enable_optim" != xno; then ext_CFLAGS="$ext_CFLAGS -O$enable_optim" fi if test "x$enable_debug" = xyes; then debug_CFLAGS="-DDEBUG" else debug_CFLAGS="-DNDEBUG" fi CFLAGS="${generic_CFLAGS} ${ext_CFLAGS} ${user_CFLAGS} ${debug_CFLAGS}" CXXFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS $BOOST_CPPFLAGS $BAM_CPPFLAGS -I./SeqAn-1.3" LDFLAGS="$user_LDFLAGS" # test to see if srcdir already configured if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then { { echo "$as_me:$LINENO: error: source directory already configured; run \"make distclean\" there first" >&5 echo "$as_me: error: source directory already configured; run \"make distclean\" there first" >&2;} { (exit 1); exit 1; }; } fi # test whether we have cygpath if test -z "$CYGPATH_W"; then if (cygpath --version) >/dev/null 2>/dev/null; then CYGPATH_W='cygpath -w' else CYGPATH_W=echo fi fi # Define the identity of the package. PACKAGE='tophat' VERSION='2.0.9' cat >>confdefs.h <<_ACEOF #define PACKAGE "$PACKAGE" _ACEOF cat >>confdefs.h <<_ACEOF #define VERSION "$VERSION" _ACEOF # Some tools Automake needs. ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"} AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"} AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"} AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"} MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} install_sh=${install_sh-"$am_aux_dir/install-sh"} # Installed binaries are usually stripped using `strip' when the user # run `make install-strip'. However `strip' might not be the right # tool to use in cross-compilation environments, therefore Automake # will honor the `STRIP' environment variable to overrule this program. if test "$cross_compiling" != no; then if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. 
set dummy ${ac_tool_prefix}strip; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_STRIP+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$STRIP"; then ac_cv_prog_STRIP="$STRIP" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_STRIP="${ac_tool_prefix}strip" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done fi fi STRIP=$ac_cv_prog_STRIP if test -n "$STRIP"; then echo "$as_me:$LINENO: result: $STRIP" >&5 echo "${ECHO_T}$STRIP" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi fi if test -z "$ac_cv_prog_STRIP"; then ac_ct_STRIP=$STRIP # Extract the first word of "strip", so it can be a program name with args. set dummy strip; ac_word=$2 echo "$as_me:$LINENO: checking for $ac_word" >&5 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6 if test "${ac_cv_prog_ac_ct_STRIP+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -n "$ac_ct_STRIP"; then ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. else as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for ac_exec_ext in '' $ac_executable_extensions; do if $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; then ac_cv_prog_ac_ct_STRIP="strip" echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 break 2 fi done done test -z "$ac_cv_prog_ac_ct_STRIP" && ac_cv_prog_ac_ct_STRIP=":" fi fi ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP if test -n "$ac_ct_STRIP"; then echo "$as_me:$LINENO: result: $ac_ct_STRIP" >&5 echo "${ECHO_T}$ac_ct_STRIP" >&6 else echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6 fi STRIP=$ac_ct_STRIP else STRIP="$ac_cv_prog_STRIP" fi fi INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s" # We need awk for the "check" target. The system "awk" is bad on # some platforms. # Always define AMTAR for backward compatibility. AMTAR=${AMTAR-"${am_missing_run}tar"} echo "$as_me:$LINENO: checking how to create a pax tar archive" >&5 echo $ECHO_N "checking how to create a pax tar archive... $ECHO_C" >&6 # Loop over all known methods to create a tar archive until one works. _am_tools='gnutar pax cpio none' _am_tools=${am_cv_prog_tar_pax-$_am_tools} # Do not fold the above two line into one, because Tru64 sh and # Solaris sh will not grok spaces in the rhs of `-'. for _am_tool in $_am_tools do case $_am_tool in gnutar) for _am_tar in tar gnutar gtar; do { echo "$as_me:$LINENO: $_am_tar --version" >&5 ($_am_tar --version) >&5 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && break done am__tar="$_am_tar --format=posix -chf - "'"$$tardir"' am__tar_="$_am_tar --format=posix -chf - "'"$tardir"' am__untar="$_am_tar -xf -" ;; plaintar) # Must skip GNU tar: if it does not support --format= it doesn't create # ustar tarball either. 
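# Illustrative sketch (an assumption about the generated Makefiles, not text
# produced by autoconf itself): the am__tar/am__untar commands selected by this
# loop are consumed by the automake "dist" rules, roughly as follows
# (distdir is the usual automake variable; the gzip step is simplified):
#   tardir=$(distdir) && $(am__tar) | gzip -c >$(distdir).tar.gz    # make dist
#   gzip -dc $(distdir).tar.gz | $(am__untar)                       # unpacking during distcheck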
(tar --version) >/dev/null 2>&1 && continue am__tar='tar chf - "$$tardir"' am__tar_='tar chf - "$tardir"' am__untar='tar xf -' ;; pax) am__tar='pax -L -x pax -w "$$tardir"' am__tar_='pax -L -x pax -w "$tardir"' am__untar='pax -r' ;; cpio) am__tar='find "$$tardir" -print | cpio -o -H pax -L' am__tar_='find "$tardir" -print | cpio -o -H pax -L' am__untar='cpio -i -H pax -d' ;; none) am__tar=false am__tar_=false am__untar=false ;; esac # If the value was cached, stop now. We just wanted to have am__tar # and am__untar set. test -n "${am_cv_prog_tar_pax}" && break # tar/untar a dummy directory, and stop if the command works rm -rf conftest.dir mkdir conftest.dir echo GrepMe > conftest.dir/file { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5 (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } rm -rf conftest.dir if test -s conftest.tar; then { echo "$as_me:$LINENO: $am__untar &5 ($am__untar &5 2>&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } grep GrepMe conftest.dir/file >/dev/null 2>&1 && break fi done rm -rf conftest.dir if test "${am_cv_prog_tar_pax+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else am_cv_prog_tar_pax=$_am_tool fi echo "$as_me:$LINENO: result: $am_cv_prog_tar_pax" >&5 echo "${ECHO_T}$am_cv_prog_tar_pax" >&6 depcc="$CC" am_compiler_list= echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6 if test "${am_cv_CC_dependencies_compiler_type+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. cp "$am_depcomp" conftest.dir cd conftest.dir # We will build objects and dependencies in a subdirectory because # it helps to detect inapplicable dependency modes. For instance # both Tru64's cc and ICC support -MD to output dependencies as a # side effect of compilation, but ICC will put the dependencies in # the current directory while Tru64 will put them in the object # directory. mkdir sub am_cv_CC_dependencies_compiler_type=none if test "$am_compiler_list" = ""; then am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` fi for depmode in $am_compiler_list; do # Setup a source with many dependencies, because some compilers # like to wrap large dependency lists on column 80 (with \), and # we should not choose a depcomp mode which is confused by this. # # We need to recreate these files for each test, as the compiler may # overwrite some of them when testing with obscure command lines. # This happens at least with the AIX C compiler. : > sub/conftest.c for i in 1 2 3 4 5 6; do echo '#include "conftst'$i'.h"' >> sub/conftest.c # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with # Solaris 8's {/usr,}/bin/sh. 
touch sub/conftst$i.h done echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf case $depmode in nosideeffect) # after this tag, mechanisms are not by side-effect, so they'll # only be used when explicitly requested if test "x$enable_dependency_tracking" = xyes; then continue else break fi ;; none) break ;; esac # We check with `-c' and `-o' for the sake of the "dashmstdout" # mode. It turns out that the SunPro C++ compiler does not properly # handle `-M -o', and we need to detect this. if depmode=$depmode \ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ >/dev/null 2>conftest.err && grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && ${MAKE-make} -s -f confmf > /dev/null 2>&1; then # icc doesn't choke on unknown options, it will just issue warnings # or remarks (even with -Werror). So we grep stderr for any message # that says an option was ignored or not supported. # When given -MP, icc 7.0 and 7.1 complain thusly: # icc: Command line warning: ignoring option '-M'; no argument required # The diagnosis changed in icc 8.0: # icc: Command line remark: option '-MP' not supported if (grep 'ignoring option' conftest.err || grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else am_cv_CC_dependencies_compiler_type=$depmode break fi fi done cd .. rm -rf conftest.dir else am_cv_CC_dependencies_compiler_type=none fi fi echo "$as_me:$LINENO: result: $am_cv_CC_dependencies_compiler_type" >&5 echo "${ECHO_T}$am_cv_CC_dependencies_compiler_type" >&6 CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type if test "x$enable_dependency_tracking" != xno \ && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then am__fastdepCC_TRUE= am__fastdepCC_FALSE='#' else am__fastdepCC_TRUE='#' am__fastdepCC_FALSE= fi depcc="$CXX" am_compiler_list= echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6 if test "${am_cv_CXX_dependencies_compiler_type+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then # We make a subdir and do the tests there. Otherwise we can end up # making bogus files that we don't know about and never remove. For # instance it was reported that on HP-UX the gcc test will end up # making a dummy file named `D' -- because `-MD' means `put the output # in D'. mkdir conftest.dir # Copy depcomp to subdir because otherwise we won't find it if we're # using a relative directory. cp "$am_depcomp" conftest.dir cd conftest.dir # We will build objects and dependencies in a subdirectory because # it helps to detect inapplicable dependency modes. For instance # both Tru64's cc and ICC support -MD to output dependencies as a # side effect of compilation, but ICC will put the dependencies in # the current directory while Tru64 will put them in the object # directory. mkdir sub am_cv_CXX_dependencies_compiler_type=none if test "$am_compiler_list" = ""; then am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` fi for depmode in $am_compiler_list; do # Setup a source with many dependencies, because some compilers # like to wrap large dependency lists on column 80 (with \), and # we should not choose a depcomp mode which is confused by this. 
# # We need to recreate these files for each test, as the compiler may # overwrite some of them when testing with obscure command lines. # This happens at least with the AIX C compiler. : > sub/conftest.c for i in 1 2 3 4 5 6; do echo '#include "conftst'$i'.h"' >> sub/conftest.c # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with # Solaris 8's {/usr,}/bin/sh. touch sub/conftst$i.h done echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf case $depmode in nosideeffect) # after this tag, mechanisms are not by side-effect, so they'll # only be used when explicitly requested if test "x$enable_dependency_tracking" = xyes; then continue else break fi ;; none) break ;; esac # We check with `-c' and `-o' for the sake of the "dashmstdout" # mode. It turns out that the SunPro C++ compiler does not properly # handle `-M -o', and we need to detect this. if depmode=$depmode \ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ >/dev/null 2>conftest.err && grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && ${MAKE-make} -s -f confmf > /dev/null 2>&1; then # icc doesn't choke on unknown options, it will just issue warnings # or remarks (even with -Werror). So we grep stderr for any message # that says an option was ignored or not supported. # When given -MP, icc 7.0 and 7.1 complain thusly: # icc: Command line warning: ignoring option '-M'; no argument required # The diagnosis changed in icc 8.0: # icc: Command line remark: option '-MP' not supported if (grep 'ignoring option' conftest.err || grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else am_cv_CXX_dependencies_compiler_type=$depmode break fi fi done cd .. rm -rf conftest.dir else am_cv_CXX_dependencies_compiler_type=none fi fi echo "$as_me:$LINENO: result: $am_cv_CXX_dependencies_compiler_type" >&5 echo "${ECHO_T}$am_cv_CXX_dependencies_compiler_type" >&6 CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type if test "x$enable_dependency_tracking" != xno \ && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then am__fastdepCXX_TRUE= am__fastdepCXX_FALSE='#' else am__fastdepCXX_TRUE='#' am__fastdepCXX_FALSE= fi # makefiles to configure ac_config_files="$ac_config_files Makefile src/Makefile" # make it happen cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure # scripts and configure runs, see configure's option --config-cache. # It is not useful on other systems. If it contains results you don't # want to keep, you may remove or edit it. # # config.status only pays attention to the cache file if you give it # the --recheck option to rerun configure. # # `ac_cv_env_foo' variables (set or unset) will be overridden when # loading this file, other *unset* `ac_cv_foo' will be assigned the # following values. _ACEOF # The following way of writing the cache mishandles newlines in values, # but we know of no workaround that is simple, portable, and efficient. # So, don't put newlines in cache variables' values. # Ultrix sh set writes to stderr and can't be redirected directly, # and sets the high bit in the cache file unless we assign to the vars. 
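# Illustrative usage sketch (not part of the stock autoconf output): the cache
# assembled just below is normally exercised through the standard autoconf
# options; the CXXFLAGS value is only an example.
#   ./configure --config-cache          # write and reuse ./config.cache
#   ./configure -C CXXFLAGS=-O2         # short form, overriding a recorded variable
#   ./config.status --recheck           # rerun configure with the recorded options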
{ (set) 2>&1 | case `(ac_space=' '; set | grep ac_space) 2>&1` in *ac_space=\ *) # `set' does not quote correctly, so add quotes (double-quote # substitution turns \\\\ into \\, and sed turns \\ into \). sed -n \ "s/'/'\\\\''/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" ;; *) # `set' quotes correctly as required by POSIX, so do not add quotes. sed -n \ "s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1=\\2/p" ;; esac; } | sed ' t clear : clear s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ t end /^ac_cv_env/!s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ : end' >>confcache if diff $cache_file confcache >/dev/null 2>&1; then :; else if test -w $cache_file; then test "x$cache_file" != "x/dev/null" && echo "updating cache $cache_file" cat confcache >$cache_file else echo "not updating unwritable cache $cache_file" fi fi rm -f confcache test "x$prefix" = xNONE && prefix=$ac_default_prefix # Let make expand exec_prefix. test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' # VPATH may cause trouble with some makes, so we remove $(srcdir), # ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and # trailing colons and then remove the whole line if VPATH becomes empty # (actually we leave an empty line to preserve line numbers). if test "x$srcdir" = x.; then ac_vpsub='/^[ ]*VPATH[ ]*=/{ s/:*\$(srcdir):*/:/; s/:*\${srcdir}:*/:/; s/:*@srcdir@:*/:/; s/^\([^=]*=[ ]*\):*/\1/; s/:*$//; s/^[^=]*=[ ]*$//; }' fi DEFS=-DHAVE_CONFIG_H ac_libobjs= ac_ltlibobjs= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. ac_i=`echo "$ac_i" | sed 's/\$U\././;s/\.o$//;s/\.obj$//'` # 2. Add them. ac_libobjs="$ac_libobjs $ac_i\$U.$ac_objext" ac_ltlibobjs="$ac_ltlibobjs $ac_i"'$U.lo' done LIBOBJS=$ac_libobjs LTLIBOBJS=$ac_ltlibobjs if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then { { echo "$as_me:$LINENO: error: conditional \"AMDEP\" was never defined. Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"AMDEP\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCXX\" was never defined. Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"am__fastdepCXX\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"am__fastdepCC\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCXX\" was never defined. 
Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"am__fastdepCXX\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi : ${CONFIG_STATUS=./config.status} ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" { echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 echo "$as_me: creating $CONFIG_STATUS" >&6;} cat >$CONFIG_STATUS <<_ACEOF #! $SHELL # Generated by $as_me. # Run this file to recreate the current configuration. # Compiler output produced by configure, useful for debugging # configure, is in config.log if it exists. debug=false ac_cs_recheck=false ac_cs_silent=false SHELL=\${CONFIG_SHELL-$SHELL} _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF ## --------------------- ## ## M4sh Initialization. ## ## --------------------- ## # Be Bourne compatible if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which # is contrary to our usage. Disable this feature. alias -g '${1+"$@"}'='"$@"' elif test -n "${BASH_VERSION+set}" && (set -o posix) >/dev/null 2>&1; then set -o posix fi DUALCASE=1; export DUALCASE # for MKS sh # Support unset when possible. if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then as_unset=unset else as_unset=false fi # Work around bugs in pre-3.0 UWIN ksh. $as_unset ENV MAIL MAILPATH PS1='$ ' PS2='> ' PS4='+ ' # NLS nuisances. for as_var in \ LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \ LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \ LC_TELEPHONE LC_TIME do if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then eval $as_var=C; export $as_var else $as_unset $as_var fi done # Required to use basename. if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi if (basename /) >/dev/null 2>&1 && test "X`basename / 2>&1`" = "X/"; then as_basename=basename else as_basename=false fi # Name of the executable. as_me=`$as_basename "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ X"$0" : 'X\(/\)$' \| \ . : '\(.\)' 2>/dev/null || echo X/"$0" | sed '/^.*\/\([^/][^/]*\)\/*$/{ s//\1/; q; } /^X\/\(\/\/\)$/{ s//\1/; q; } /^X\/\(\/\).*/{ s//\1/; q; } s/.*/./; q'` # PATH needs CR, and LINENO needs CR and PATH. # Avoid depending upon Character Ranges. as_cr_letters='abcdefghijklmnopqrstuvwxyz' as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' as_cr_Letters=$as_cr_letters$as_cr_LETTERS as_cr_digits='0123456789' as_cr_alnum=$as_cr_Letters$as_cr_digits # The user is always right. if test "${PATH_SEPARATOR+set}" != set; then echo "#! /bin/sh" >conf$$.sh echo "exit 0" >>conf$$.sh chmod +x conf$$.sh if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then PATH_SEPARATOR=';' else PATH_SEPARATOR=: fi rm -f conf$$.sh fi as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" || { # Find who we are. Look in the path if we contain no path at all # relative or not. case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break done ;; esac # We did not find ourselves, most probably we were run as `sh COMMAND' # in which case we are not to be found in the path. if test "x$as_myself" = x; then as_myself=$0 fi if test ! 
-f "$as_myself"; then { { echo "$as_me:$LINENO: error: cannot find myself; rerun with an absolute path" >&5 echo "$as_me: error: cannot find myself; rerun with an absolute path" >&2;} { (exit 1); exit 1; }; } fi case $CONFIG_SHELL in '') as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. for as_base in sh bash ksh sh5; do case $as_dir in /*) if ("$as_dir/$as_base" -c ' as_lineno_1=$LINENO as_lineno_2=$LINENO as_lineno_3=`(expr $as_lineno_1 + 1) 2>/dev/null` test "x$as_lineno_1" != "x$as_lineno_2" && test "x$as_lineno_3" = "x$as_lineno_2" ') 2>/dev/null; then $as_unset BASH_ENV || test "${BASH_ENV+set}" != set || { BASH_ENV=; export BASH_ENV; } $as_unset ENV || test "${ENV+set}" != set || { ENV=; export ENV; } CONFIG_SHELL=$as_dir/$as_base export CONFIG_SHELL exec "$CONFIG_SHELL" "$0" ${1+"$@"} fi;; esac done done ;; esac # Create $as_me.lineno as a copy of $as_myself, but with $LINENO # uniformly replaced by the line number. The first 'sed' inserts a # line-number line before each line; the second 'sed' does the real # work. The second script uses 'N' to pair each line-number line # with the numbered line, and appends trailing '-' during # substitution so that $LINENO is not a special case at line end. # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the # second 'sed' script. Blame Lee E. McMahon for sed's syntax. :-) sed '=' <$as_myself | sed ' N s,$,-, : loop s,^\(['$as_cr_digits']*\)\(.*\)[$]LINENO\([^'$as_cr_alnum'_]\),\1\2\1\3, t loop s,-$,, s,^['$as_cr_digits']*\n,, ' >$as_me.lineno && chmod +x $as_me.lineno || { { echo "$as_me:$LINENO: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&5 echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2;} { (exit 1); exit 1; }; } # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensible to this). . ./$as_me.lineno # Exit status is that of the last command. exit } case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in *c*,-n*) ECHO_N= ECHO_C=' ' ECHO_T=' ' ;; *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;; *) ECHO_N= ECHO_C='\c' ECHO_T= ;; esac if expr a : '\(a\)' >/dev/null 2>&1; then as_expr=expr else as_expr=false fi rm -f conf$$ conf$$.exe conf$$.file echo >conf$$.file if ln -s conf$$.file conf$$ 2>/dev/null; then # We could just check for DJGPP; but this test a) works b) is more generic # and c) will remain valid once DJGPP supports symlinks (DJGPP 2.04). if test -f conf$$.exe; then # Don't use ln at all; we don't have any links as_ln_s='cp -p' else as_ln_s='ln -s' fi elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.file if mkdir -p . 2>/dev/null; then as_mkdir_p=: else test -d ./-p && rmdir ./-p as_mkdir_p=false fi as_executable_p="test -f" # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" # Sed expression to map a string onto a valid variable name. as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" # IFS # We need space, tab and new line, in precisely that order. as_nl=' ' IFS=" $as_nl" # CDPATH. $as_unset CDPATH exec 6>&1 # Open the log real soon, to keep \$[0] and so on meaningful, and to # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. Logging --version etc. 
is OK. exec 5>>config.log { echo sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX ## Running $as_me. ## _ASBOX } >&5 cat >&5 <<_CSEOF This file was extended by tophat $as_me 2.0.9, which was generated by GNU Autoconf 2.59. Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS CONFIG_LINKS = $CONFIG_LINKS CONFIG_COMMANDS = $CONFIG_COMMANDS $ $0 $@ _CSEOF echo "on `(hostname || uname -n) 2>/dev/null | sed 1q`" >&5 echo >&5 _ACEOF # Files that config.status was made for. if test -n "$ac_config_files"; then echo "config_files=\"$ac_config_files\"" >>$CONFIG_STATUS fi if test -n "$ac_config_headers"; then echo "config_headers=\"$ac_config_headers\"" >>$CONFIG_STATUS fi if test -n "$ac_config_links"; then echo "config_links=\"$ac_config_links\"" >>$CONFIG_STATUS fi if test -n "$ac_config_commands"; then echo "config_commands=\"$ac_config_commands\"" >>$CONFIG_STATUS fi cat >>$CONFIG_STATUS <<\_ACEOF ac_cs_usage="\ \`$as_me' instantiates files from templates according to the current configuration. Usage: $0 [OPTIONS] [FILE]... -h, --help print this help, then exit -V, --version print version number, then exit -q, --quiet do not print progress messages -d, --debug don't remove temporary files --recheck update $as_me by reconfiguring in the same conditions --file=FILE[:TEMPLATE] instantiate the configuration file FILE --header=FILE[:TEMPLATE] instantiate the configuration header FILE Configuration files: $config_files Configuration headers: $config_headers Configuration commands: $config_commands Report bugs to ." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ tophat config.status 2.0.9 configured by $0, generated by GNU Autoconf 2.59, with options \\"`echo "$ac_configure_args" | sed 's/[\\""\`\$]/\\\\&/g'`\\" Copyright (C) 2003 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." srcdir=$srcdir INSTALL="$INSTALL" _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # If no file are specified by the user, then we need to provide default # value. By we need to know if files were specified by the user. ac_need_defaults=: while test $# != 0 do case $1 in --*=*) ac_option=`expr "x$1" : 'x\([^=]*\)='` ac_optarg=`expr "x$1" : 'x[^=]*=\(.*\)'` ac_shift=: ;; -*) ac_option=$1 ac_optarg=$2 ac_shift=shift ;; *) # This is not an option, so the user has probably given explicit # arguments. ac_option=$1 ac_need_defaults=false;; esac case $ac_option in # Handling of the options. _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) ac_cs_recheck=: ;; --version | --vers* | -V ) echo "$ac_cs_version"; exit 0 ;; --he | --h) # Conflict between --help and --header { { echo "$as_me:$LINENO: error: ambiguous option: $1 Try \`$0 --help' for more information." >&5 echo "$as_me: error: ambiguous option: $1 Try \`$0 --help' for more information." >&2;} { (exit 1); exit 1; }; };; --help | --hel | -h ) echo "$ac_cs_usage"; exit 0 ;; --debug | --d* | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift CONFIG_FILES="$CONFIG_FILES $ac_optarg" ac_need_defaults=false;; --header | --heade | --head | --hea ) $ac_shift CONFIG_HEADERS="$CONFIG_HEADERS $ac_optarg" ac_need_defaults=false;; -q | -quiet | --quiet | --quie | --qui | --qu | --q \ | -silent | --silent | --silen | --sile | --sil | --si | --s) ac_cs_silent=: ;; # This is an error. 
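# Illustrative usage sketch (not part of the stock autoconf output): typical
# invocations matching the options handled above; file names are examples only.
#   ./config.status                              # regenerate all configured files and headers
#   ./config.status src/Makefile                 # regenerate a single output file
#   ./config.status --file=Makefile:Makefile.in  # name the template explicitly
#   ./config.status --header=config.h            # regenerate only the configuration header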
-*) { { echo "$as_me:$LINENO: error: unrecognized option: $1 Try \`$0 --help' for more information." >&5 echo "$as_me: error: unrecognized option: $1 Try \`$0 --help' for more information." >&2;} { (exit 1); exit 1; }; } ;; *) ac_config_targets="$ac_config_targets $1" ;; esac shift done ac_configure_extra_args= if $ac_cs_silent; then exec 6>/dev/null ac_configure_extra_args="$ac_configure_extra_args --silent" fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF if \$ac_cs_recheck; then echo "running $SHELL $0 " $ac_configure_args \$ac_configure_extra_args " --no-create --no-recursion" >&6 exec $SHELL $0 $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF # # INIT-COMMANDS section. # AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF for ac_config_target in $ac_config_targets do case "$ac_config_target" in # Handling of arguments. "Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;; "src/Makefile" ) CONFIG_FILES="$CONFIG_FILES src/Makefile" ;; "depfiles" ) CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; "config.h" ) CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 echo "$as_me: error: invalid argument: $ac_config_target" >&2;} { (exit 1); exit 1; }; };; esac done # If the user did not use the arguments to specify the items to instantiate, # then the envvar interface is used. Set only those that are not. # We use the long form for the default assignment because of an extremely # bizarre bug on SunOS 4.1.3. if $ac_need_defaults; then test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands fi # Have a temporary directory for convenience. Make it in the build tree # simply because there is no reason to put it here, and in addition, # creating and moving files from /tmp can sometimes cause problems. # Create a temporary directory, and hook for its removal unless debugging. $debug || { trap 'exit_status=$?; rm -rf $tmp && exit $exit_status' 0 trap '{ (exit 1); exit 1; }' 1 2 13 15 } # Create a (secure) tmp directory for tmp files. { tmp=`(umask 077 && mktemp -d -q "./confstatXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" } || { tmp=./confstat$$-$RANDOM (umask 077 && mkdir $tmp) } || { echo "$me: cannot create a temporary directory in ." >&2 { (exit 1); exit 1; } } _ACEOF cat >>$CONFIG_STATUS <<_ACEOF # # CONFIG_FILES section. # # No need to generate the scripts if there are no CONFIG_FILES. # This happens for instance when ./config.status config.h if test -n "\$CONFIG_FILES"; then # Protect against being on the right side of a sed subst in config.status. 
sed 's/,@/@@/; s/@,/@@/; s/,;t t\$/@;t t/; /@;t t\$/s/[\\\\&,]/\\\\&/g; s/@@/,@/; s/@@/@,/; s/@;t t\$/,;t t/' >\$tmp/subs.sed <<\\CEOF s,@SHELL@,$SHELL,;t t s,@PATH_SEPARATOR@,$PATH_SEPARATOR,;t t s,@PACKAGE_NAME@,$PACKAGE_NAME,;t t s,@PACKAGE_TARNAME@,$PACKAGE_TARNAME,;t t s,@PACKAGE_VERSION@,$PACKAGE_VERSION,;t t s,@PACKAGE_STRING@,$PACKAGE_STRING,;t t s,@PACKAGE_BUGREPORT@,$PACKAGE_BUGREPORT,;t t s,@exec_prefix@,$exec_prefix,;t t s,@prefix@,$prefix,;t t s,@program_transform_name@,$program_transform_name,;t t s,@bindir@,$bindir,;t t s,@sbindir@,$sbindir,;t t s,@libexecdir@,$libexecdir,;t t s,@datadir@,$datadir,;t t s,@sysconfdir@,$sysconfdir,;t t s,@sharedstatedir@,$sharedstatedir,;t t s,@localstatedir@,$localstatedir,;t t s,@libdir@,$libdir,;t t s,@includedir@,$includedir,;t t s,@oldincludedir@,$oldincludedir,;t t s,@infodir@,$infodir,;t t s,@mandir@,$mandir,;t t s,@build_alias@,$build_alias,;t t s,@host_alias@,$host_alias,;t t s,@target_alias@,$target_alias,;t t s,@DEFS@,$DEFS,;t t s,@ECHO_C@,$ECHO_C,;t t s,@ECHO_N@,$ECHO_N,;t t s,@ECHO_T@,$ECHO_T,;t t s,@LIBS@,$LIBS,;t t s,@INSTALL_PROGRAM@,$INSTALL_PROGRAM,;t t s,@INSTALL_SCRIPT@,$INSTALL_SCRIPT,;t t s,@INSTALL_DATA@,$INSTALL_DATA,;t t s,@CYGPATH_W@,$CYGPATH_W,;t t s,@PACKAGE@,$PACKAGE,;t t s,@VERSION@,$VERSION,;t t s,@ACLOCAL@,$ACLOCAL,;t t s,@AUTOCONF@,$AUTOCONF,;t t s,@AUTOMAKE@,$AUTOMAKE,;t t s,@AUTOHEADER@,$AUTOHEADER,;t t s,@MAKEINFO@,$MAKEINFO,;t t s,@install_sh@,$install_sh,;t t s,@STRIP@,$STRIP,;t t s,@ac_ct_STRIP@,$ac_ct_STRIP,;t t s,@INSTALL_STRIP_PROGRAM@,$INSTALL_STRIP_PROGRAM,;t t s,@mkdir_p@,$mkdir_p,;t t s,@AWK@,$AWK,;t t s,@SET_MAKE@,$SET_MAKE,;t t s,@am__leading_dot@,$am__leading_dot,;t t s,@AMTAR@,$AMTAR,;t t s,@am__tar@,$am__tar,;t t s,@am__untar@,$am__untar,;t t s,@PYTHON@,$PYTHON,;t t s,@CXX@,$CXX,;t t s,@CXXFLAGS@,$CXXFLAGS,;t t s,@LDFLAGS@,$LDFLAGS,;t t s,@CPPFLAGS@,$CPPFLAGS,;t t s,@ac_ct_CXX@,$ac_ct_CXX,;t t s,@EXEEXT@,$EXEEXT,;t t s,@OBJEXT@,$OBJEXT,;t t s,@DEPDIR@,$DEPDIR,;t t s,@am__include@,$am__include,;t t s,@am__quote@,$am__quote,;t t s,@AMDEP_TRUE@,$AMDEP_TRUE,;t t s,@AMDEP_FALSE@,$AMDEP_FALSE,;t t s,@AMDEPBACKSLASH@,$AMDEPBACKSLASH,;t t s,@CXXDEPMODE@,$CXXDEPMODE,;t t s,@am__fastdepCXX_TRUE@,$am__fastdepCXX_TRUE,;t t s,@am__fastdepCXX_FALSE@,$am__fastdepCXX_FALSE,;t t s,@CC@,$CC,;t t s,@CFLAGS@,$CFLAGS,;t t s,@ac_ct_CC@,$ac_ct_CC,;t t s,@CCDEPMODE@,$CCDEPMODE,;t t s,@am__fastdepCC_TRUE@,$am__fastdepCC_TRUE,;t t s,@am__fastdepCC_FALSE@,$am__fastdepCC_FALSE,;t t s,@RANLIB@,$RANLIB,;t t s,@ac_ct_RANLIB@,$ac_ct_RANLIB,;t t s,@PYTHON_VERSION@,$PYTHON_VERSION,;t t s,@PYTHON_PREFIX@,$PYTHON_PREFIX,;t t s,@PYTHON_EXEC_PREFIX@,$PYTHON_EXEC_PREFIX,;t t s,@PYTHON_PLATFORM@,$PYTHON_PLATFORM,;t t s,@pythondir@,$pythondir,;t t s,@pkgpythondir@,$pkgpythondir,;t t s,@pyexecdir@,$pyexecdir,;t t s,@pkgpyexecdir@,$pkgpyexecdir,;t t s,@BOOST_CPPFLAGS@,$BOOST_CPPFLAGS,;t t s,@BOOST_LDFLAGS@,$BOOST_LDFLAGS,;t t s,@BAM_CPPFLAGS@,$BAM_CPPFLAGS,;t t s,@BAM_LDFLAGS@,$BAM_LDFLAGS,;t t s,@BAM_LIB@,$BAM_LIB,;t t s,@build@,$build,;t t s,@build_cpu@,$build_cpu,;t t s,@build_vendor@,$build_vendor,;t t s,@build_os@,$build_os,;t t s,@BOOST_THREAD_LIB@,$BOOST_THREAD_LIB,;t t s,@BOOST_SYSTEM_LIB@,$BOOST_SYSTEM_LIB,;t t s,@CPP@,$CPP,;t t s,@EGREP@,$EGREP,;t t s,@LIBOBJS@,$LIBOBJS,;t t s,@host@,$host,;t t s,@host_cpu@,$host_cpu,;t t s,@host_vendor@,$host_vendor,;t t s,@host_os@,$host_os,;t t s,@LTLIBOBJS@,$LTLIBOBJS,;t t CEOF _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # Split the substitutions into bite-sized pieces for seds with # small 
command number limits, like on Digital OSF/1 and HP-UX. ac_max_sed_lines=48 ac_sed_frag=1 # Number of current file. ac_beg=1 # First line for current file. ac_end=$ac_max_sed_lines # Line after last line for current file. ac_more_lines=: ac_sed_cmds= while $ac_more_lines; do if test $ac_beg -gt 1; then sed "1,${ac_beg}d; ${ac_end}q" $tmp/subs.sed >$tmp/subs.frag else sed "${ac_end}q" $tmp/subs.sed >$tmp/subs.frag fi if test ! -s $tmp/subs.frag; then ac_more_lines=false else # The purpose of the label and of the branching condition is to # speed up the sed processing (if there are no `@' at all, there # is no need to browse any of the substitutions). # These are the two extra sed commands mentioned above. (echo ':t /@[a-zA-Z_][a-zA-Z_0-9]*@/!b' && cat $tmp/subs.frag) >$tmp/subs-$ac_sed_frag.sed if test -z "$ac_sed_cmds"; then ac_sed_cmds="sed -f $tmp/subs-$ac_sed_frag.sed" else ac_sed_cmds="$ac_sed_cmds | sed -f $tmp/subs-$ac_sed_frag.sed" fi ac_sed_frag=`expr $ac_sed_frag + 1` ac_beg=$ac_end ac_end=`expr $ac_end + $ac_max_sed_lines` fi done if test -z "$ac_sed_cmds"; then ac_sed_cmds=cat fi fi # test -n "$CONFIG_FILES" _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF for ac_file in : $CONFIG_FILES; do test "x$ac_file" = x: && continue # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". case $ac_file in - | *:- | *:-:* ) # input from stdin cat >$tmp/stdin ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; *:* ) ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; * ) ac_file_in=$ac_file.in ;; esac # Compute @srcdir@, @top_srcdir@, and @INSTALL@ for subdirectories. ac_dir=`(dirname "$ac_file") 2>/dev/null || $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` { if $as_mkdir_p; then mkdir -p "$ac_dir" else as_dir="$ac_dir" as_dirs= while test ! -d "$as_dir"; do as_dirs="$as_dir $as_dirs" as_dir=`(dirname "$as_dir") 2>/dev/null || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` done test ! -n "$as_dirs" || mkdir $as_dirs fi || { { echo "$as_me:$LINENO: error: cannot create directory \"$ac_dir\"" >&5 echo "$as_me: error: cannot create directory \"$ac_dir\"" >&2;} { (exit 1); exit 1; }; }; } ac_builddir=. if test "$ac_dir" != .; then ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` # A "../" for each directory in $ac_dir_suffix. ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'` else ac_dir_suffix= ac_top_builddir= fi case $srcdir in .) # No --srcdir option. We are building in place. ac_srcdir=. if test -z "$ac_top_builddir"; then ac_top_srcdir=. else ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'` fi ;; [\\/]* | ?:[\\/]* ) # Absolute path. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ;; *) # Relative path. ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_builddir$srcdir ;; esac # Do not use `cd foo && pwd` to compute absolute paths, because # the directories may not exist. 
case `pwd` in .) ac_abs_builddir="$ac_dir";; *) case "$ac_dir" in .) ac_abs_builddir=`pwd`;; [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";; *) ac_abs_builddir=`pwd`/"$ac_dir";; esac;; esac case $ac_abs_builddir in .) ac_abs_top_builddir=${ac_top_builddir}.;; *) case ${ac_top_builddir}. in .) ac_abs_top_builddir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;; *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;; esac;; esac case $ac_abs_builddir in .) ac_abs_srcdir=$ac_srcdir;; *) case $ac_srcdir in .) ac_abs_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;; *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;; esac;; esac case $ac_abs_builddir in .) ac_abs_top_srcdir=$ac_top_srcdir;; *) case $ac_top_srcdir in .) ac_abs_top_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;; *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;; esac;; esac case $INSTALL in [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;; *) ac_INSTALL=$ac_top_builddir$INSTALL ;; esac if test x"$ac_file" != x-; then { echo "$as_me:$LINENO: creating $ac_file" >&5 echo "$as_me: creating $ac_file" >&6;} rm -f "$ac_file" fi # Let's still pretend it is `configure' which instantiates (i.e., don't # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ if test x"$ac_file" = x-; then configure_input= else configure_input="$ac_file. " fi configure_input=$configure_input"Generated from `echo $ac_file_in | sed 's,.*/,,'` by configure." # First look for the input files in the build tree, otherwise in the # src tree. ac_file_inputs=`IFS=: for f in $ac_file_in; do case $f in -) echo $tmp/stdin ;; [\\/$]*) # Absolute (can't be DOS-style, as IFS=:) test -f "$f" || { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } echo "$f";; *) # Relative if test -f "$f"; then # Build tree echo "$f" elif test -f "$srcdir/$f"; then # Source tree echo "$srcdir/$f" else # /dev/null tree { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } fi;; esac done` || { (exit 1); exit 1; } _ACEOF cat >>$CONFIG_STATUS <<_ACEOF sed "$ac_vpsub $extrasub _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF :t /@[a-zA-Z_][a-zA-Z_0-9]*@/!b s,@configure_input@,$configure_input,;t t s,@srcdir@,$ac_srcdir,;t t s,@abs_srcdir@,$ac_abs_srcdir,;t t s,@top_srcdir@,$ac_top_srcdir,;t t s,@abs_top_srcdir@,$ac_abs_top_srcdir,;t t s,@builddir@,$ac_builddir,;t t s,@abs_builddir@,$ac_abs_builddir,;t t s,@top_builddir@,$ac_top_builddir,;t t s,@abs_top_builddir@,$ac_abs_top_builddir,;t t s,@INSTALL@,$ac_INSTALL,;t t " $ac_file_inputs | (eval "$ac_sed_cmds") >$tmp/out rm -f $tmp/stdin if test x"$ac_file" != x-; then mv $tmp/out $ac_file else cat $tmp/out rm -f $tmp/out fi done _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # # CONFIG_HEADER section. # # These sed commands are passed to sed as "A NAME B NAME C VALUE D", where # NAME is the cpp macro being defined and VALUE is the value it is being given. # # ac_d sets the value in "#define NAME VALUE" lines. ac_dA='s,^\([ ]*\)#\([ ]*define[ ][ ]*\)' ac_dB='[ ].*$,\1#\2' ac_dC=' ' ac_dD=',;t' # ac_u turns "#undef NAME" without trailing blanks into "#define NAME VALUE". 
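# Illustrative example (not part of the stock autoconf output): net effect of
# the ac_d*/ac_u* fragments defined here, shown with a hypothetical HAVE_*
# macro from the function checks earlier in this script:
#   config.h.in:  #undef HAVE_GETTIMEOFDAY
#   config.h:     #define HAVE_GETTIMEOFDAY 1
# Macros that configure never defined are instead left as commented-out #undef lines.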
ac_uA='s,^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)' ac_uB='$,\1#\2define\3' ac_uC=' ' ac_uD=',;t' for ac_file in : $CONFIG_HEADERS; do test "x$ac_file" = x: && continue # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". case $ac_file in - | *:- | *:-:* ) # input from stdin cat >$tmp/stdin ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; *:* ) ac_file_in=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_file=`echo "$ac_file" | sed 's,:.*,,'` ;; * ) ac_file_in=$ac_file.in ;; esac test x"$ac_file" != x- && { echo "$as_me:$LINENO: creating $ac_file" >&5 echo "$as_me: creating $ac_file" >&6;} # First look for the input files in the build tree, otherwise in the # src tree. ac_file_inputs=`IFS=: for f in $ac_file_in; do case $f in -) echo $tmp/stdin ;; [\\/$]*) # Absolute (can't be DOS-style, as IFS=:) test -f "$f" || { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } # Do quote $f, to prevent DOS paths from being IFS'd. echo "$f";; *) # Relative if test -f "$f"; then # Build tree echo "$f" elif test -f "$srcdir/$f"; then # Source tree echo "$srcdir/$f" else # /dev/null tree { { echo "$as_me:$LINENO: error: cannot find input file: $f" >&5 echo "$as_me: error: cannot find input file: $f" >&2;} { (exit 1); exit 1; }; } fi;; esac done` || { (exit 1); exit 1; } # Remove the trailing spaces. sed 's/[ ]*$//' $ac_file_inputs >$tmp/in _ACEOF # Transform confdefs.h into two sed scripts, `conftest.defines' and # `conftest.undefs', that substitutes the proper values into # config.h.in to produce config.h. The first handles `#define' # templates, and the second `#undef' templates. # And first: Protect against being on the right side of a sed subst in # config.status. Protect against being in an unquoted here document # in config.status. rm -f conftest.defines conftest.undefs # Using a here document instead of a string reduces the quoting nightmare. # Putting comments in sed scripts is not portable. # # `end' is used to avoid that the second main sed command (meant for # 0-ary CPP macros) applies to n-ary macro definitions. # See the Autoconf documentation for `clear'. cat >confdef2sed.sed <<\_ACEOF s/[\\&,]/\\&/g s,[\\$`],\\&,g t clear : clear s,^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*\)\(([^)]*)\)[ ]*\(.*\)$,${ac_dA}\1${ac_dB}\1\2${ac_dC}\3${ac_dD},gp t end s,^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)$,${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD},gp : end _ACEOF # If some macros were called several times there might be several times # the same #defines, which is useless. Nevertheless, we may not want to # sort them, since we want the *last* AC-DEFINE to be honored. uniq confdefs.h | sed -n -f confdef2sed.sed >conftest.defines sed 's/ac_d/ac_u/g' conftest.defines >conftest.undefs rm -f confdef2sed.sed # This sed command replaces #undef with comments. This is necessary, for # example, in the case of _POSIX_SOURCE, which is predefined and required # on some systems where configure will not decide to define it. cat >>conftest.undefs <<\_ACEOF s,^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*,/* & */, _ACEOF # Break up conftest.defines because some shells have a limit on the size # of here documents, and old seds have small limits too (100 cmds). echo ' # Handle all the #define templates only if necessary.' 
>>$CONFIG_STATUS echo ' if grep "^[ ]*#[ ]*define" $tmp/in >/dev/null; then' >>$CONFIG_STATUS echo ' # If there are no defines, we may have an empty if/fi' >>$CONFIG_STATUS echo ' :' >>$CONFIG_STATUS rm -f conftest.tail while grep . conftest.defines >/dev/null do # Write a limited-size here document to $tmp/defines.sed. echo ' cat >$tmp/defines.sed <>$CONFIG_STATUS # Speed up: don't consider the non `#define' lines. echo '/^[ ]*#[ ]*define/!b' >>$CONFIG_STATUS # Work around the forget-to-reset-the-flag bug. echo 't clr' >>$CONFIG_STATUS echo ': clr' >>$CONFIG_STATUS sed ${ac_max_here_lines}q conftest.defines >>$CONFIG_STATUS echo 'CEOF sed -f $tmp/defines.sed $tmp/in >$tmp/out rm -f $tmp/in mv $tmp/out $tmp/in ' >>$CONFIG_STATUS sed 1,${ac_max_here_lines}d conftest.defines >conftest.tail rm -f conftest.defines mv conftest.tail conftest.defines done rm -f conftest.defines echo ' fi # grep' >>$CONFIG_STATUS echo >>$CONFIG_STATUS # Break up conftest.undefs because some shells have a limit on the size # of here documents, and old seds have small limits too (100 cmds). echo ' # Handle all the #undef templates' >>$CONFIG_STATUS rm -f conftest.tail while grep . conftest.undefs >/dev/null do # Write a limited-size here document to $tmp/undefs.sed. echo ' cat >$tmp/undefs.sed <>$CONFIG_STATUS # Speed up: don't consider the non `#undef' echo '/^[ ]*#[ ]*undef/!b' >>$CONFIG_STATUS # Work around the forget-to-reset-the-flag bug. echo 't clr' >>$CONFIG_STATUS echo ': clr' >>$CONFIG_STATUS sed ${ac_max_here_lines}q conftest.undefs >>$CONFIG_STATUS echo 'CEOF sed -f $tmp/undefs.sed $tmp/in >$tmp/out rm -f $tmp/in mv $tmp/out $tmp/in ' >>$CONFIG_STATUS sed 1,${ac_max_here_lines}d conftest.undefs >conftest.tail rm -f conftest.undefs mv conftest.tail conftest.undefs done rm -f conftest.undefs cat >>$CONFIG_STATUS <<\_ACEOF # Let's still pretend it is `configure' which instantiates (i.e., don't # use $as_me), people would be surprised to read: # /* config.h. Generated by config.status. */ if test x"$ac_file" = x-; then echo "/* Generated by configure. */" >$tmp/config.h else echo "/* $ac_file. Generated by configure. */" >$tmp/config.h fi cat $tmp/in >>$tmp/config.h rm -f $tmp/in if test x"$ac_file" != x-; then if diff $ac_file $tmp/config.h >/dev/null 2>&1; then { echo "$as_me:$LINENO: $ac_file is unchanged" >&5 echo "$as_me: $ac_file is unchanged" >&6;} else ac_dir=`(dirname "$ac_file") 2>/dev/null || $as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_file" : 'X\(//\)[^/]' \| \ X"$ac_file" : 'X\(//\)$' \| \ X"$ac_file" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$ac_file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` { if $as_mkdir_p; then mkdir -p "$ac_dir" else as_dir="$ac_dir" as_dirs= while test ! -d "$as_dir"; do as_dirs="$as_dir $as_dirs" as_dir=`(dirname "$as_dir") 2>/dev/null || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` done test ! 
-n "$as_dirs" || mkdir $as_dirs fi || { { echo "$as_me:$LINENO: error: cannot create directory \"$ac_dir\"" >&5 echo "$as_me: error: cannot create directory \"$ac_dir\"" >&2;} { (exit 1); exit 1; }; }; } rm -f $ac_file mv $tmp/config.h $ac_file fi else cat $tmp/config.h rm -f $tmp/config.h fi # Compute $ac_file's index in $config_headers. _am_stamp_count=1 for _am_header in $config_headers :; do case $_am_header in $ac_file | $ac_file:* ) break ;; * ) _am_stamp_count=`expr $_am_stamp_count + 1` ;; esac done echo "timestamp for $ac_file" >`(dirname $ac_file) 2>/dev/null || $as_expr X$ac_file : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X$ac_file : 'X\(//\)[^/]' \| \ X$ac_file : 'X\(//\)$' \| \ X$ac_file : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X$ac_file | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'`/stamp-h$_am_stamp_count done _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF # # CONFIG_COMMANDS section. # for ac_file in : $CONFIG_COMMANDS; do test "x$ac_file" = x: && continue ac_dest=`echo "$ac_file" | sed 's,:.*,,'` ac_source=`echo "$ac_file" | sed 's,[^:]*:,,'` ac_dir=`(dirname "$ac_dest") 2>/dev/null || $as_expr X"$ac_dest" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$ac_dest" : 'X\(//\)[^/]' \| \ X"$ac_dest" : 'X\(//\)$' \| \ X"$ac_dest" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$ac_dest" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` { if $as_mkdir_p; then mkdir -p "$ac_dir" else as_dir="$ac_dir" as_dirs= while test ! -d "$as_dir"; do as_dirs="$as_dir $as_dirs" as_dir=`(dirname "$as_dir") 2>/dev/null || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` done test ! -n "$as_dirs" || mkdir $as_dirs fi || { { echo "$as_me:$LINENO: error: cannot create directory \"$ac_dir\"" >&5 echo "$as_me: error: cannot create directory \"$ac_dir\"" >&2;} { (exit 1); exit 1; }; }; } ac_builddir=. if test "$ac_dir" != .; then ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` # A "../" for each directory in $ac_dir_suffix. ac_top_builddir=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,../,g'` else ac_dir_suffix= ac_top_builddir= fi case $srcdir in .) # No --srcdir option. We are building in place. ac_srcdir=. if test -z "$ac_top_builddir"; then ac_top_srcdir=. else ac_top_srcdir=`echo $ac_top_builddir | sed 's,/$,,'` fi ;; [\\/]* | ?:[\\/]* ) # Absolute path. ac_srcdir=$srcdir$ac_dir_suffix; ac_top_srcdir=$srcdir ;; *) # Relative path. ac_srcdir=$ac_top_builddir$srcdir$ac_dir_suffix ac_top_srcdir=$ac_top_builddir$srcdir ;; esac # Do not use `cd foo && pwd` to compute absolute paths, because # the directories may not exist. case `pwd` in .) ac_abs_builddir="$ac_dir";; *) case "$ac_dir" in .) ac_abs_builddir=`pwd`;; [\\/]* | ?:[\\/]* ) ac_abs_builddir="$ac_dir";; *) ac_abs_builddir=`pwd`/"$ac_dir";; esac;; esac case $ac_abs_builddir in .) ac_abs_top_builddir=${ac_top_builddir}.;; *) case ${ac_top_builddir}. in .) ac_abs_top_builddir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_builddir=${ac_top_builddir}.;; *) ac_abs_top_builddir=$ac_abs_builddir/${ac_top_builddir}.;; esac;; esac case $ac_abs_builddir in .) 
ac_abs_srcdir=$ac_srcdir;; *) case $ac_srcdir in .) ac_abs_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_srcdir=$ac_srcdir;; *) ac_abs_srcdir=$ac_abs_builddir/$ac_srcdir;; esac;; esac case $ac_abs_builddir in .) ac_abs_top_srcdir=$ac_top_srcdir;; *) case $ac_top_srcdir in .) ac_abs_top_srcdir=$ac_abs_builddir;; [\\/]* | ?:[\\/]* ) ac_abs_top_srcdir=$ac_top_srcdir;; *) ac_abs_top_srcdir=$ac_abs_builddir/$ac_top_srcdir;; esac;; esac { echo "$as_me:$LINENO: executing $ac_dest commands" >&5 echo "$as_me: executing $ac_dest commands" >&6;} case $ac_dest in depfiles ) test x"$AMDEP_TRUE" != x"" || for mf in $CONFIG_FILES; do # Strip MF so we end up with the name of the file. mf=`echo "$mf" | sed -e 's/:.*$//'` # Check whether this is an Automake generated Makefile or not. # We used to match only the files named `Makefile.in', but # some people rename them; so instead we look at the file content. # Grep'ing the first line is not enough: some people post-process # each Makefile.in and add a new line on top of each file to say so. # So let's grep whole file. if grep '^#.*generated by automake' $mf > /dev/null 2>&1; then dirpart=`(dirname "$mf") 2>/dev/null || $as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$mf" : 'X\(//\)[^/]' \| \ X"$mf" : 'X\(//\)$' \| \ X"$mf" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$mf" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` else continue fi # Extract the definition of DEPDIR, am__include, and am__quote # from the Makefile without running `make'. DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` test -z "$DEPDIR" && continue am__include=`sed -n 's/^am__include = //p' < "$mf"` test -z "am__include" && continue am__quote=`sed -n 's/^am__quote = //p' < "$mf"` # When using ansi2knr, U may be empty or an underscore; expand it U=`sed -n 's/^U = //p' < "$mf"` # Find all dependency output files, they are included files with # $(DEPDIR) in their names. We invoke sed twice because it is the # simplest approach to changing $(DEPDIR) to its actual value in the # expansion. for file in `sed -n " s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do # Make sure the directory exists. test -f "$dirpart/$file" && continue fdir=`(dirname "$file") 2>/dev/null || $as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$file" : 'X\(//\)[^/]' \| \ X"$file" : 'X\(//\)$' \| \ X"$file" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$file" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` { if $as_mkdir_p; then mkdir -p $dirpart/$fdir else as_dir=$dirpart/$fdir as_dirs= while test ! -d "$as_dir"; do as_dirs="$as_dir $as_dirs" as_dir=`(dirname "$as_dir") 2>/dev/null || $as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ X"$as_dir" : 'X\(//\)[^/]' \| \ X"$as_dir" : 'X\(//\)$' \| \ X"$as_dir" : 'X\(/\)' \| \ . : '\(.\)' 2>/dev/null || echo X"$as_dir" | sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } /^X\(\/\/\)[^/].*/{ s//\1/; q; } /^X\(\/\/\)$/{ s//\1/; q; } /^X\(\/\).*/{ s//\1/; q; } s/.*/./; q'` done test ! 
-n "$as_dirs" || mkdir $as_dirs fi || { { echo "$as_me:$LINENO: error: cannot create directory $dirpart/$fdir" >&5 echo "$as_me: error: cannot create directory $dirpart/$fdir" >&2;} { (exit 1); exit 1; }; }; } # echo "creating $dirpart/$file" echo '# dummy' > "$dirpart/$file" done done ;; esac done _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF { (exit 0); exit 0; } _ACEOF chmod +x $CONFIG_STATUS ac_clean_files=$ac_clean_files_save # configure is writing to config.log, and then calls config.status. # config.status does its own redirection, appending to config.log. # Unfortunately, on DOS this fails, as config.log is still kept open # by configure, so config.status won't be able to write to it; its # output is simply discarded. So we exec the FD to /dev/null, # effectively closing config.log, so it can be properly (re)opened and # appended to by config.status. When coming back to configure, we # need to make the FD available again. if test "$no_create" != yes; then ac_cs_success=: ac_config_status_args= test "$silent" = yes && ac_config_status_args="$ac_config_status_args --quiet" exec 5>/dev/null $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false exec 5>>config.log # Use ||, not &&, to avoid exiting from the if with $? = 1, which # would make configure fail if this is the last instruction. $ac_cs_success || { (exit 1); exit 1; } fi # dump some configuration confirmations echo \ " -- ${PACKAGE_STRING} Configuration Results -- C++ compiler: ${CXX} ${CXXFLAGS} Linker flags: ${LDFLAGS}" if test x"${GCC}" = x"yes" ; then gcc_version=`${CC} --version | head -n 1` echo " GCC version: ${gcc_version}" else gcc_version='' fi echo \ " Host System type: ${host} Install prefix: ${prefix} Install eprefix: ${exec_prefix} See config.h for further configuration information. Email <${PACKAGE_BUGREPORT}> with questions and bug reports. " if test x"${PYTHON}" = x":" || ! test -x "${PYTHON}"; then echo "WARNING! python was not found and is required to run tophat" echo " Please install python and point configure to the installed location" fi tophat-2.0.9/NEWS0000644000175000017500000001472412122334411012251 0ustar toortoor[ 1.0 (BETA) release - 5/4/2009 ] * TopHat has been almost entirely redesigned and rewritten to handle "second-generation" RNA-Seq data. Reads longer than 50bp and paired end reads are substantially more powerful for finding splice junctions, and TopHat needed new algorithms to take advantage of them. While this release should be considered a beta, and still contains bugs, it has been under development for several months and has been tested by several groups on both first- and second-generation RNA-Seq data in multiple organisms. Longer and/or paired end reads provide a dramatic leap in sensitivity and specificity. Notable improvements include: - Paired-end RNA-Seq read support - Long read support - Improved SAM output - No longer depends on Maq - Mismatches near splicing anchors now allowed - Much more of the pipeline is multithreaded, yielding a massive performance boost - Compiles under GCC 4.3 [ 0.8.3 release - 3/12/2009 ] This release contains the following enhancements and fixes: - Reporting now has a smaller memory footprint - A possible source of erroneous alignments due to hashing collisions has been eliminated - The install scripts now correctly detect whether to build TopHat with 64-bit compiler flags. [ 0.8.2 release - 3/1/2009 ] This release contains the following enhancements and fixes: - TopHat now reports the alignments it finds in the SAM format. 
The SAM tools were written primarily by Heng Li at Sanger, and will allow TopHat users to call expressed SNPs from their RNA-Seq reads. The SAM tools themselves are still under development, so TopHat's SAM support should be considered experimental. - You can now specify a list of junctions for TopHat to check in a raw format, without using a GFF file of genes - The new -o option allows you to change where TopHat puts its output, instead of always writing to "./tophat_out" [ 0.8.1 release - 1/30/2009 ] * New experimental support for user-supplied annotations. TopHat will accept a GFF file, and will look for junctions contained in the GFF file. TopHat will also perform a basic RPKM calculation on the regions in the annotation, normalized to those annotations only (rather than the whole map). The file must contain "gene", "exon" and "mRNA" records, in the normal record ID, Parent hierarchy. Users are encouraged to treat GFF support as unstable and interpret their results with caution. TopHat 0.8.1 uses some code kindly provided by Robert Bradley. The code originally came from Rob's statistical alignment package FSA. * Several minor bugfixes. [ 0.8.0 release - 1/19/2009 ] * Dramatic reduction in false positives. * TopHat now estimates a minor isoform frequency for each splice junction, and filters infrequent events to cut down dramatically on the false positives. By default, minor isoforms must occur at a frequency of at least 15 percent of the major isoform. * The new output file coverage.wig is a UCSC wigglegram of alignment coverage. * TopHat supports multithreading, though not all stages of the pipeline use multiple threads. * TopHat now allows reads to have multiple alignments, and it suppresses alignments for reads that have more than a user-specified number (10, by default). * The memory exhaustion problem associated with converting Bowtie alignments to Maq has been fixed. * You are no longer required to concatenate your reads into a single input file. * TopHat will attempt to automatically determine seed length, quality scale, and FASTA/FASTQ format from your input reads. * If you are missing a Maq binary fasta file for your reference, one will be created in the output directory using bowtie-inspect. You can copy this file to the location of your bowtie index to avoid this step in your next run. [ 0.7.2 release - 12/05/2008 ] * Bowtie 0.9.8 renamed bowtie-convert to bowtie-maqconvert, and TopHat is now compatible with both the new and old name. * Minor cosmetic improvements in the TopHat output log. * Improved checking in the installer to emit sensible error messages when compiling on Solaris. Solaris is currently not supported, but hopefully will be in the next release. Known issues: * TopHat can exhaust memory when run with many (> 50 million) reads on some machines. This will be fixed in the next release. [ 0.7.1 release - 11/08/2008 ] The following issues have been fixed: * Maq 0.7.0 changed the Maq map file format. Bowtie 0.9.7 now supports both the new and old mapping format, and thus so now does TopHat. TopHat now checks the version of Maq on the system and uses the correct format. * Minor command line interface improvements * The -X option has been added to allow the use of FASTQ files that are scaled on the Solexa quality scale, as opposed to Phred (the default). Note that TopHat doesn't support FASTQ-int; only ASCII-encoded qualities are used. 
* The -D option has been added, allowing users to specify when to look for junctions within single islands, as opposed to just between two distinct islands * The -Q option allows the user to specify a Phred quality character below which the island consensus caller will use the reference base call. That is, TopHat will not allow SNPs to be called where base quality drops below a certain threshold. * TopHat now includes Heng Li's fq_all2std.pl format conversion script to make installation easier. [ 0.7.0 release - 10/27/08 ] The first public release of TopHat is now available for download. To use TopHat, you will need to install Bowtie and Maq. Both are open source and freely available under the Artistic license. When you install Bowtie, you should also install the Bowtie index for the genome in your RNA-Seq experiment, if one is available. If there is no pre-built index for the organism you're interested in, you can follow the Bowtie manual's section on how to build one yourself. Because this is the first release, the manual is very limited. Only the basic options have been described. However, we will be updating it frequently, so please check back. If you find something unclear, or have questions about how TopHat works, please email Cole Trapnell. We will be posting a list of frequently asked questions soon. In this release, TopHat does not consider mate pairing between reads. You can analyze paired-end RNA-Seq data with TopHat, but the program won't make use of the mate information. Yet. Use of mate pair information is our top development priority. Check back soon for a release with full paired-end supporttophat-2.0.9/AUTHORS0000644000175000017500000000117212122334411012613 0ustar toortoorTopHat authors Primary contact Cole Trapnell wrote TopHat. TopHat is built on Bowtie, which was written by Ben Langmead and Cole Trapnell. Daehwan Kim and Geo Pertea added support for SOLiD reads, Bowtie 2 and other features and improvements. The SeqAn-1.2 library is used in TopHat and Bowtie and some of its sources are included in TopHat source releases; its authors are Andreas Doring, David Weese, Tobias Rausch, and Knut Reinert. Websites: TopHat: http://tophat.cbcb.umd.edu Bowtie: http://bowtie-bio.sf.net SeqAn: http://www.seqan.de October 2010 tophat-2.0.9/COPYING0000644000175000017500000001212712122334411012600 0ustar toortoorThe Artistic License Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: * "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. * "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder. * "Copyright Holder" is whoever is named in the copyright or copyrights for the package. * "You" is you, if you're thinking about copying or distributing this Package. * "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) 
* "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as ftp.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) accompany any non-standard executables with their corresponding Standard Version executables, giving the non-standard executables non-standard names, and clearly documenting the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whomever generated them, and may be sold commercially, and may be aggregated with this Package. 7. C or perl subroutines supplied by you and linked into this Package shall not be considered part of this Package. 8. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 9. 
THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End This license is approved by the Open Source Initiative (www.opensource.org) for certifying software as OSI Certified Open Source. tophat-2.0.9/configure.ac0000644000175000017500000000671112157116165014051 0ustar toortoorm4_include([ax_boost_base.m4]) m4_include([ax_boost_thread.m4]) m4_include([ax_bam.m4]) #m4_include([ax_check_zlib.m4]) define([svnversion], esyscmd([sh -c "svnversion|tr -d '\n'"]))dnl AC_INIT([tophat],[2.0.9],[tophat.cufflinks@gmail.com]) AC_DEFINE(SVN_REVISION, "svnversion", [SVN Revision]) AC_CONFIG_SRCDIR([config.h.in]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_AUX_DIR([build-aux]) AM_INIT_AUTOMAKE #AM_PATH_CPPUNIT(1.10.2) AC_ARG_VAR(PYTHON, [python program]) # Make sure CXXFLAGS is defined so that AC_PROG_CXX doesn't set it. CXXFLAGS="$CXXFLAGS" CFLAGS="$CFLAGS" AC_LANG(C) # Checks for programs. AC_PROG_AWK AC_PROG_CXX AC_PROG_CC AC_PROG_MAKE_SET AC_PROG_RANLIB AC_PROG_INSTALL AM_PATH_PYTHON([2.4]) AX_BOOST_BASE([1.38.0]) AX_BAM AX_BOOST_THREAD # AX_CHECK_ZLIB() # Checks for header files. AC_CHECK_HEADERS([stdlib.h string.h unistd.h]) # Checks for header files. AC_HEADER_STDC # Checks for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL AC_C_INLINE AC_TYPE_PID_T AC_TYPE_SIZE_T AC_CHECK_TYPES([ptrdiff_t]) AC_C_CONST AC_SYS_LARGEFILE # Checks for libraries. AC_CHECK_LIB([z], [gzread]) # Checks for library functions. AC_FUNC_MALLOC AC_CHECK_FUNCS([memset strdup strrchr strtol strsep]) # check the platform AC_CANONICAL_HOST # --------------------------------------------------------------------- # Debug and profile # --------------------------------------------------------------------- # set CFLAGS and CXXFLAGS #user_CFLAGS="${CXXFLAGS}" user_CFLAGS=${CFLAGS} generic_CFLAGS="-Wall -Wno-strict-aliasing -g -gdwarf-2 -Wuninitialized" ext_CFLAGS="" debug_CFLAGS="" user_LDFLAGS="$LDFLAGS" AC_ARG_ENABLE(intel64, [ --enable-intel64 optimize for Intel64 CPU such as Xeon and Core2], [ext_CFLAGS="${ext_CFLAGS} -mtune=nocona"], []) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [enable debugging info (default is no)])], [], [enable_debug=no]) AC_ARG_ENABLE([optim], [AS_HELP_STRING([--enable-optim@<:@=0|1|2|3@:>@], [set optimization level (default is 3)])], [if test "x$enable_optim" = xyes; then enable_optim=3; fi], [enable_optim=3]) AS_IF([test "x$enable_optim" != xno], [ext_CFLAGS="$ext_CFLAGS -O$enable_optim"]) AS_IF([test "x$enable_debug" = xyes], [debug_CFLAGS="-DDEBUG"], [debug_CFLAGS="-DNDEBUG"]) CFLAGS="${generic_CFLAGS} ${ext_CFLAGS} ${user_CFLAGS} ${debug_CFLAGS}" CXXFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS $BOOST_CPPFLAGS $BAM_CPPFLAGS -I./SeqAn-1.3" LDFLAGS="$user_LDFLAGS" AM_INIT_AUTOMAKE([-Wall foreign tar-pax foreign]) # makefiles to configure AC_CONFIG_FILES([Makefile src/Makefile]) # make it happen AC_OUTPUT # dump some configuration confirmations echo \ " -- ${PACKAGE_STRING} Configuration Results -- C++ compiler: ${CXX} ${CXXFLAGS} Linker flags: ${LDFLAGS}" if test x"${GCC}" = x"yes" ; then gcc_version=`${CC} --version | head -n 1` echo " GCC version: ${gcc_version}" else gcc_version='' fi echo \ " Host System type: ${host} Install prefix: ${prefix} Install eprefix: ${exec_prefix} See config.h for further configuration information. Email <${PACKAGE_BUGREPORT}> with questions and bug reports. " if test x"${PYTHON}" = x":" || ! 
test -x "${PYTHON}"; then echo "WARNING! python was not found and is required to run tophat" echo " Please install python and point configure to the installed location" fi tophat-2.0.9/build-aux/0000755000175000017500000000000012163557417013456 5ustar toortoortophat-2.0.9/build-aux/depcomp0000755000175000017500000003710012122334411015012 0ustar toortoor#! /bin/sh # depcomp - compile a program generating dependencies as side-effects scriptversion=2005-07-09.11 # Copyright (C) 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Originally written by Alexandre Oliva . case $1 in '') echo "$0: No command. Try \`$0 --help' for more information." 1>&2 exit 1; ;; -h | --h*) cat <<\EOF Usage: depcomp [--help] [--version] PROGRAM [ARGS] Run PROGRAMS ARGS to compile a file, generating dependencies as side-effects. Environment variables: depmode Dependency tracking mode. source Source file read by `PROGRAMS ARGS'. object Object file output by `PROGRAMS ARGS'. DEPDIR directory where to store dependencies. depfile Dependency file to output. tmpdepfile Temporary file to use when outputing dependencies. libtool Whether libtool is used (yes/no). Report bugs to . EOF exit $? ;; -v | --v*) echo "depcomp $scriptversion" exit $? ;; esac if test -z "$depmode" || test -z "$source" || test -z "$object"; then echo "depcomp: Variables source, object and depmode must be set" 1>&2 exit 1 fi # Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. depfile=${depfile-`echo "$object" | sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} rm -f "$tmpdepfile" # Some modes work just like other modes, but use different flags. We # parameterize here, but still list the modes in the big case below, # to make depend.m4 easier to write. Note that we *cannot* use a case # here, because this file can only contain one case statement. if test "$depmode" = hp; then # HP compiler uses -M and no extra arg. gccflag=-M depmode=gcc fi if test "$depmode" = dashXmstdout; then # This is just like dashmstdout with a different argument. dashmflag=-xM depmode=dashmstdout fi case "$depmode" in gcc3) ## gcc 3 implements dependency tracking that does exactly what ## we want. Yay! Note: for some reason libtool 1.4 doesn't like ## it if -MD -MP comes after the -MF stuff. Hmm. "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" stat=$? if test $stat -eq 0; then : else rm -f "$tmpdepfile" exit $stat fi mv "$tmpdepfile" "$depfile" ;; gcc) ## There are various ways to get dependency output from gcc. 
Here's ## why we pick this rather obscure method: ## - Don't want to use -MD because we'd like the dependencies to end ## up in a subdir. Having to rename by hand is ugly. ## (We might end up doing this anyway to support other compilers.) ## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like ## -MM, not -M (despite what the docs say). ## - Using -M directly means running the compiler twice (even worse ## than renaming). if test -z "$gccflag"; then gccflag=-MD, fi "$@" -Wp,"$gccflag$tmpdepfile" stat=$? if test $stat -eq 0; then : else rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" echo "$object : \\" > "$depfile" alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ## The second -e expression handles DOS-style file names with drive letters. sed -e 's/^[^:]*: / /' \ -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" ## This next piece of magic avoids the `deleted header file' problem. ## The problem is that when a header file which appears in a .P file ## is deleted, the dependency causes make to die (because there is ## typically no way to rebuild the header). We avoid this by adding ## dummy dependencies for each header file. Too bad gcc doesn't do ## this for us directly. tr ' ' ' ' < "$tmpdepfile" | ## Some versions of gcc put a space before the `:'. On the theory ## that the space means something, we add a space to the output as ## well. ## Some versions of the HPUX 10.20 sed can't process this invocation ## correctly. Breaking it into two sed invocations is a workaround. sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; hp) # This case exists only to let depend.m4 do its work. It works by # looking at the text of this script. This case will never be run, # since it is checked for above. exit 1 ;; sgi) if test "$libtool" = yes; then "$@" "-Wp,-MDupdate,$tmpdepfile" else "$@" -MDupdate "$tmpdepfile" fi stat=$? if test $stat -eq 0; then : else rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files echo "$object : \\" > "$depfile" # Clip off the initial element (the dependent). Don't try to be # clever and replace this with sed code, as IRIX sed won't handle # lines with more than a fixed number of characters (4096 in # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; # the IRIX cc adds comments like `#:fec' to the end of the # dependency line. tr ' ' ' ' < "$tmpdepfile" \ | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \ tr ' ' ' ' >> $depfile echo >> $depfile # The second pass generates a dummy entry for each header file. tr ' ' ' ' < "$tmpdepfile" \ | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ >> $depfile else # The sourcefile does not contain any dependencies, so just # store a dummy comment line, to avoid errors with the Makefile # "include basename.Plo" scheme. echo "#dummy" > "$depfile" fi rm -f "$tmpdepfile" ;; aix) # The C for AIX Compiler uses -M and outputs the dependencies # in a .u file. In older versions, this file always lives in the # current directory. Also, the AIX compiler puts `$object:' at the # start of each line; $object doesn't have directory information. # Version 6 uses the directory in both cases. stripped=`echo "$object" | sed 's/\(.*\)\..*$/\1/'` tmpdepfile="$stripped.u" if test "$libtool" = yes; then "$@" -Wc,-M else "$@" -M fi stat=$? 
if test -f "$tmpdepfile"; then : else stripped=`echo "$stripped" | sed 's,^.*/,,'` tmpdepfile="$stripped.u" fi if test $stat -eq 0; then : else rm -f "$tmpdepfile" exit $stat fi if test -f "$tmpdepfile"; then outname="$stripped.o" # Each line is of the form `foo.o: dependent.h'. # Do two passes, one to just change these to # `$object: dependent.h' and one to simply `dependent.h:'. sed -e "s,^$outname:,$object :," < "$tmpdepfile" > "$depfile" sed -e "s,^$outname: \(.*\)$,\1:," < "$tmpdepfile" >> "$depfile" else # The sourcefile does not contain any dependencies, so just # store a dummy comment line, to avoid errors with the Makefile # "include basename.Plo" scheme. echo "#dummy" > "$depfile" fi rm -f "$tmpdepfile" ;; icc) # Intel's C compiler understands `-MD -MF file'. However on # icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c # ICC 7.0 will fill foo.d with something like # foo.o: sub/foo.c # foo.o: sub/foo.h # which is wrong. We want: # sub/foo.o: sub/foo.c # sub/foo.o: sub/foo.h # sub/foo.c: # sub/foo.h: # ICC 7.1 will output # foo.o: sub/foo.c sub/foo.h # and will wrap long lines using \ : # foo.o: sub/foo.c ... \ # sub/foo.h ... \ # ... "$@" -MD -MF "$tmpdepfile" stat=$? if test $stat -eq 0; then : else rm -f "$tmpdepfile" exit $stat fi rm -f "$depfile" # Each line is of the form `foo.o: dependent.h', # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. # Do two passes, one to just change these to # `$object: dependent.h' and one to simply `dependent.h:'. sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" # Some versions of the HPUX 10.20 sed can't process this invocation # correctly. Breaking it into two sed invocations is a workaround. sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; tru64) # The Tru64 compiler uses -MD to generate dependencies as a side # effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'. # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put # dependencies in `foo.d' instead, so we check for that too. # Subdirectories are respected. dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` test "x$dir" = "x$object" && dir= base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` if test "$libtool" = yes; then # With Tru64 cc, shared objects can also be used to make a # static library. This mecanism is used in libtool 1.4 series to # handle both shared and static libraries in a single compilation. # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d. # # With libtool 1.5 this exception was removed, and libtool now # generates 2 separate objects for the 2 libraries. These two # compilations output dependencies in in $dir.libs/$base.o.d and # in $dir$base.o.d. We have to check for both files, because # one of the two compilations can be disabled. We should prefer # $dir$base.o.d over $dir.libs/$base.o.d because the latter is # automatically cleaned when .libs/ is deleted, while ignoring # the former would cause a distcleancheck panic. tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4 tmpdepfile2=$dir$base.o.d # libtool 1.5 tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5 tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504 "$@" -Wc,-MD else tmpdepfile1=$dir$base.o.d tmpdepfile2=$dir$base.d tmpdepfile3=$dir$base.d tmpdepfile4=$dir$base.d "$@" -MD fi stat=$? 
if test $stat -eq 0; then : else rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" exit $stat fi for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" do test -f "$tmpdepfile" && break done if test -f "$tmpdepfile"; then sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" # That's a tab and a space in the []. sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" else echo "#dummy" > "$depfile" fi rm -f "$tmpdepfile" ;; #nosideeffect) # This comment above is used by automake to tell side-effect # dependency tracking mechanisms from slower ones. dashmstdout) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout, regardless of -o. "$@" || exit $? # Remove the call to Libtool. if test "$libtool" = yes; then while test $1 != '--mode=compile'; do shift done shift fi # Remove `-o $object'. IFS=" " for arg do case $arg in -o) shift ;; $object) shift ;; *) set fnord "$@" "$arg" shift # fnord shift # $arg ;; esac done test -z "$dashmflag" && dashmflag=-M # Require at least two characters before searching for `:' # in the target name. This is to cope with DOS-style filenames: # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise. "$@" $dashmflag | sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile" rm -f "$depfile" cat < "$tmpdepfile" > "$depfile" tr ' ' ' ' < "$tmpdepfile" | \ ## Some versions of the HPUX 10.20 sed can't process this invocation ## correctly. Breaking it into two sed invocations is a workaround. sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; dashXmstdout) # This case only exists to satisfy depend.m4. It is never actually # run, as this mode is specially recognized in the preamble. exit 1 ;; makedepend) "$@" || exit $? # Remove any Libtool call if test "$libtool" = yes; then while test $1 != '--mode=compile'; do shift done shift fi # X makedepend shift cleared=no for arg in "$@"; do case $cleared in no) set ""; shift cleared=yes ;; esac case "$arg" in -D*|-I*) set fnord "$@" "$arg"; shift ;; # Strip any option that makedepend may not understand. Remove # the object too, otherwise makedepend will parse it as a source file. -*|$object) ;; *) set fnord "$@" "$arg"; shift ;; esac done obj_suffix="`echo $object | sed 's/^.*\././'`" touch "$tmpdepfile" ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" rm -f "$depfile" cat < "$tmpdepfile" > "$depfile" sed '1,2d' "$tmpdepfile" | tr ' ' ' ' | \ ## Some versions of the HPUX 10.20 sed can't process this invocation ## correctly. Breaking it into two sed invocations is a workaround. sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" rm -f "$tmpdepfile" "$tmpdepfile".bak ;; cpp) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout. "$@" || exit $? # Remove the call to Libtool. if test "$libtool" = yes; then while test $1 != '--mode=compile'; do shift done shift fi # Remove `-o $object'. 
IFS=" " for arg do case $arg in -o) shift ;; $object) shift ;; *) set fnord "$@" "$arg" shift # fnord shift # $arg ;; esac done "$@" -E | sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' | sed '$ s: \\$::' > "$tmpdepfile" rm -f "$depfile" echo "$object : \\" > "$depfile" cat < "$tmpdepfile" >> "$depfile" sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" rm -f "$tmpdepfile" ;; msvisualcpp) # Important note: in order to support this mode, a compiler *must* # always write the preprocessed file to stdout, regardless of -o, # because we must use -o when running libtool. "$@" || exit $? IFS=" " for arg do case "$arg" in "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") set fnord "$@" shift shift ;; *) set fnord "$@" "$arg" shift shift ;; esac done "$@" -E | sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile" rm -f "$depfile" echo "$object : \\" > "$depfile" . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile" echo " " >> "$depfile" . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile" rm -f "$tmpdepfile" ;; none) exec "$@" ;; *) echo "Unknown depmode $depmode" 1>&2 exit 1 ;; esac exit 0 # Local Variables: # mode: shell-script # sh-indentation: 2 # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-end: "$" # End: tophat-2.0.9/build-aux/missing0000755000175000017500000002540612122334411015042 0ustar toortoor#! /bin/sh # Common stub for a few missing GNU programs while installing. scriptversion=2005-06-08.21 # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005 # Free Software Foundation, Inc. # Originally by Fran,cois Pinard , 1996. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. if test $# -eq 0; then echo 1>&2 "Try \`$0 --help' for more information" exit 1 fi run=: # In the cases where this matters, `missing' is being run in the # srcdir already. if test -f configure.ac; then configure_ac=configure.ac else configure_ac=configure.in fi msg="missing on your system" case "$1" in --run) # Try to run requested program, and just exit if it succeeds. run= shift "$@" && exit 0 # Exit code 63 means version mismatch. This often happens # when the user try to use an ancient version of a tool on # a file that requires a minimum version. In this case we # we should proceed has if the program had been absent, or # if --run hadn't been passed. if test $? = 63; then run=: msg="probably too old" fi ;; -h|--h|--he|--hel|--help) echo "\ $0 [OPTION]... 
PROGRAM [ARGUMENT]... Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an error status if there is no known handling for PROGRAM. Options: -h, --help display this help and exit -v, --version output version information and exit --run try to run the given command, and emulate it if it fails Supported PROGRAM values: aclocal touch file \`aclocal.m4' autoconf touch file \`configure' autoheader touch file \`config.h.in' automake touch all \`Makefile.in' files bison create \`y.tab.[ch]', if possible, from existing .[ch] flex create \`lex.yy.c', if possible, from existing .c help2man touch the output file lex create \`lex.yy.c', if possible, from existing .c makeinfo touch the output file tar try tar, gnutar, gtar, then tar without non-portable flags yacc create \`y.tab.[ch]', if possible, from existing .[ch] Send bug reports to ." exit $? ;; -v|--v|--ve|--ver|--vers|--versi|--versio|--version) echo "missing $scriptversion (GNU Automake)" exit $? ;; -*) echo 1>&2 "$0: Unknown \`$1' option" echo 1>&2 "Try \`$0 --help' for more information" exit 1 ;; esac # Now exit if we have it, but it failed. Also exit now if we # don't have it and --version was passed (most likely to detect # the program). case "$1" in lex|yacc) # Not GNU programs, they don't have --version. ;; tar) if test -n "$run"; then echo 1>&2 "ERROR: \`tar' requires --run" exit 1 elif test "x$2" = "x--version" || test "x$2" = "x--help"; then exit 1 fi ;; *) if test -z "$run" && ($1 --version) > /dev/null 2>&1; then # We have it, but it failed. exit 1 elif test "x$2" = "x--version" || test "x$2" = "x--help"; then # Could not run --version or --help. This is probably someone # running `$TOOL --version' or `$TOOL --help' to check whether # $TOOL exists and not knowing $TOOL uses missing. exit 1 fi ;; esac # If it does not exist, or fails to run (possibly an outdated version), # try to emulate it. case "$1" in aclocal*) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified \`acinclude.m4' or \`${configure_ac}'. You might want to install the \`Automake' and \`Perl' packages. Grab them from any GNU archive site." touch aclocal.m4 ;; autoconf) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified \`${configure_ac}'. You might want to install the \`Autoconf' and \`GNU m4' packages. Grab them from any GNU archive site." touch configure ;; autoheader) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified \`acconfig.h' or \`${configure_ac}'. You might want to install the \`Autoconf' and \`GNU m4' packages. Grab them from any GNU archive site." files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` test -z "$files" && files="config.h" touch_files= for f in $files; do case "$f" in *:*) touch_files="$touch_files "`echo "$f" | sed -e 's/^[^:]*://' -e 's/:.*//'`;; *) touch_files="$touch_files $f.in";; esac done touch $touch_files ;; automake*) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. You might want to install the \`Automake' and \`Perl' packages. Grab them from any GNU archive site." find . -type f -name Makefile.am -print | sed 's/\.am$/.in/' | while read f; do touch "$f"; done ;; autom4te) echo 1>&2 "\ WARNING: \`$1' is needed, but is $msg. You might have modified some files without having the proper tools for further handling them. You can get \`$1' as part of \`Autoconf' from any GNU archive site." 
file=`echo "$*" | sed -n 's/.*--output[ =]*\([^ ]*\).*/\1/p'` test -z "$file" && file=`echo "$*" | sed -n 's/.*-o[ ]*\([^ ]*\).*/\1/p'` if test -f "$file"; then touch $file else test -z "$file" || exec >$file echo "#! /bin/sh" echo "# Created by GNU Automake missing as a replacement of" echo "# $ $@" echo "exit 0" chmod +x $file exit 1 fi ;; bison|yacc) echo 1>&2 "\ WARNING: \`$1' $msg. You should only need it if you modified a \`.y' file. You may need the \`Bison' package in order for those modifications to take effect. You can get \`Bison' from any GNU archive site." rm -f y.tab.c y.tab.h if [ $# -ne 1 ]; then eval LASTARG="\${$#}" case "$LASTARG" in *.y) SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` if [ -f "$SRCFILE" ]; then cp "$SRCFILE" y.tab.c fi SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` if [ -f "$SRCFILE" ]; then cp "$SRCFILE" y.tab.h fi ;; esac fi if [ ! -f y.tab.h ]; then echo >y.tab.h fi if [ ! -f y.tab.c ]; then echo 'main() { return 0; }' >y.tab.c fi ;; lex|flex) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified a \`.l' file. You may need the \`Flex' package in order for those modifications to take effect. You can get \`Flex' from any GNU archive site." rm -f lex.yy.c if [ $# -ne 1 ]; then eval LASTARG="\${$#}" case "$LASTARG" in *.l) SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` if [ -f "$SRCFILE" ]; then cp "$SRCFILE" lex.yy.c fi ;; esac fi if [ ! -f lex.yy.c ]; then echo 'main() { return 0; }' >lex.yy.c fi ;; help2man) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified a dependency of a manual page. You may need the \`Help2man' package in order for those modifications to take effect. You can get \`Help2man' from any GNU archive site." file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` if test -z "$file"; then file=`echo "$*" | sed -n 's/.*--output=\([^ ]*\).*/\1/p'` fi if [ -f "$file" ]; then touch $file else test -z "$file" || exec >$file echo ".ab help2man is required to generate this page" exit 1 fi ;; makeinfo) echo 1>&2 "\ WARNING: \`$1' is $msg. You should only need it if you modified a \`.texi' or \`.texinfo' file, or any other file indirectly affecting the aspect of the manual. The spurious call might also be the consequence of using a buggy \`make' (AIX, DU, IRIX). You might want to install the \`Texinfo' package or the \`GNU make' package. Grab either from any GNU archive site." # The file to touch is that specified with -o ... file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` if test -z "$file"; then # ... or it is the one specified with @setfilename ... infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $infile` # ... or it is derived from the source name (dir/f.texi becomes f.info) test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info fi # If the file does not exist, the user really needs makeinfo; # let's fail without touching anything. test -f $file || exit 1 touch $file ;; tar) shift # We have already tried tar in the generic part. # Look for gnutar/gtar before invocation to avoid ugly error # messages. 
if (gnutar --version > /dev/null 2>&1); then gnutar "$@" && exit 0 fi if (gtar --version > /dev/null 2>&1); then gtar "$@" && exit 0 fi firstarg="$1" if shift; then case "$firstarg" in *o*) firstarg=`echo "$firstarg" | sed s/o//` tar "$firstarg" "$@" && exit 0 ;; esac case "$firstarg" in *h*) firstarg=`echo "$firstarg" | sed s/h//` tar "$firstarg" "$@" && exit 0 ;; esac fi echo 1>&2 "\ WARNING: I can't seem to be able to run \`tar' with the given arguments. You may want to install GNU tar or Free paxutils, or check the command line arguments." exit 1 ;; *) echo 1>&2 "\ WARNING: \`$1' is needed, and is $msg. You might have modified some files without having the proper tools for further handling them. Check the \`README' file, it often tells you about the needed prerequisites for installing this package. You may also peek at any GNU archive site, in case some other package would contain this missing \`$1' program." exit 1 ;; esac exit 0 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-end: "$" # End: tophat-2.0.9/build-aux/config.sub0000755000175000017500000007577712122334411015446 0ustar toortoor#! /bin/sh # Configuration validation subroutine script. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. timestamp='2005-07-08' # This file is (in principle) common to ALL GNU software. # The presence of a machine in this file suggests that SOME GNU software # can handle that machine. It does not imply ALL GNU software can. # # This file is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA # 02110-1301, USA. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Please send patches to . Submit a context # diff and a properly formatted ChangeLog entry. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. # If it is invalid, we print an error message on stderr and exit with code 1. # Otherwise, we print the canonical config type on stdout and succeed. # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases # that are meaningful with *any* GNU software. # Each package is responsible for reporting which valid configurations # it does not support. The user should be able to distinguish # a failure to support a valid configuration from a meaningless # configuration. 
# The goal of this file is to map all the various variations of a given # machine specification into a single specification in the form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM # or in some cases, the newer four-part form: # CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM # It is wrong to echo any other type of specification. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] CPU-MFR-OPSYS $0 [OPTION] ALIAS Canonicalize a configuration name. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.sub ($timestamp) Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" exit 1 ;; *local*) # First pass through any local machine types. echo $1 exit ;; * ) break ;; esac done case $# in 0) echo "$me: missing argument$help" >&2 exit 1;; 1) ;; *) echo "$me: too many arguments$help" >&2 exit 1;; esac # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-dietlibc | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | \ kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] then os=`echo $1 | sed 's/.*-/-/'` else os=; fi ;; esac ### Let's recognize common machines as not being operating systems so ### that things like config.sub decstation-3100 work. We also ### recognize some manufacturers as not being operating systems, so we ### can provide default operating systems below. case $os in -sun*os*) # Prevent following clause from handling this invalid input. ;; -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ -apple | -axis | -knuth | -cray) os= basic_machine=$1 ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 ;; -scout) ;; -wrs) os=-vxworks basic_machine=$1 ;; -chorusos*) os=-chorusos basic_machine=$1 ;; -chorusrdb) os=-chorusrdb basic_machine=$1 ;; -hiux*) os=-hiuxwe2 ;; -sco5) os=-sco3.2v5 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. 
basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -udk*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -lynx*) os=-lynxos ;; -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; -windowsnt*) os=`echo $os | sed -e 's/windowsnt/winnt/'` ;; -psos*) os=-psos ;; -mint | -mint[0-9]*) basic_machine=m68k-atari os=-mint ;; esac # Decode aliases for certain CPU-COMPANY combinations. case $basic_machine in # Recognize the basic CPU types without company name. # Some are omitted here because they have special meanings below. 1750a | 580 \ | a29k \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr \ | bfin \ | c4x | clipper \ | d10v | d30v | dlx | dsp16xx \ | fr30 | frv \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ | m32r | m32rle | m68000 | m68k | m88k | maxq | mcore \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ | mips64vr | mips64vrel \ | mips64orion | mips64orionel \ | mips64vr4100 | mips64vr4100el \ | mips64vr4300 | mips64vr4300el \ | mips64vr5000 | mips64vr5000el \ | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | ms1 \ | msp430 \ | ns16k | ns32k \ | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ | pyramid \ | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b \ | strongarm \ | tahoe | thumb | tic4x | tic80 | tron \ | v850 | v850e \ | we32k \ | x86 | xscale | xscalee[bl] | xstormy16 | xtensa \ | z8k) basic_machine=$basic_machine-unknown ;; m32c) basic_machine=$basic_machine-unknown ;; m6811 | m68hc11 | m6812 | m68hc12) # Motorola 68HC11/12. basic_machine=$basic_machine-unknown os=-none ;; m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) ;; # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) basic_machine=$basic_machine-pc ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
580-* \ | a29k-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ | mips64vr-* | mips64vrel-* \ | mips64orion-* | mips64orionel-* \ | mips64vr4100-* | mips64vr4100el-* \ | mips64vr4300-* | mips64vr4300el-* \ | mips64vr5000-* | mips64vr5000el-* \ | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | ms1-* \ | msp430-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ | pyramid-* \ | romp-* | rs6000-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc86x-* | sparclet-* \ | sparclite-* \ | sparcv8-* | sparcv9-* | sparcv9b-* | strongarm-* | sv1-* | sx?-* \ | tahoe-* | thumb-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tron-* \ | v850-* | v850e-* | vax-* \ | we32k-* \ | x86-* | x86_64-* | xps100-* | xscale-* | xscalee[bl]-* \ | xstormy16-* | xtensa-* \ | ymp-* \ | z8k-*) ;; m32c-*) ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 
386bsd) basic_machine=i386-unknown os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) basic_machine=m68000-att ;; 3b*) basic_machine=we32k-att ;; a29khif) basic_machine=a29k-amd os=-udi ;; abacus) basic_machine=abacus-unknown ;; adobe68k) basic_machine=m68010-adobe os=-scout ;; alliant | fx80) basic_machine=fx80-alliant ;; altos | altos3068) basic_machine=m68k-altos ;; am29k) basic_machine=a29k-none os=-bsd ;; amd64) basic_machine=x86_64-pc ;; amd64-*) basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl os=-sysv ;; amiga | amiga-*) basic_machine=m68k-unknown ;; amigaos | amigados) basic_machine=m68k-unknown os=-amigaos ;; amigaunix | amix) basic_machine=m68k-unknown os=-sysv4 ;; apollo68) basic_machine=m68k-apollo os=-sysv ;; apollo68bsd) basic_machine=m68k-apollo os=-bsd ;; aux) basic_machine=m68k-apple os=-aux ;; balance) basic_machine=ns32k-sequent os=-dynix ;; c90) basic_machine=c90-cray os=-unicos ;; convex-c1) basic_machine=c1-convex os=-bsd ;; convex-c2) basic_machine=c2-convex os=-bsd ;; convex-c32) basic_machine=c32-convex os=-bsd ;; convex-c34) basic_machine=c34-convex os=-bsd ;; convex-c38) basic_machine=c38-convex os=-bsd ;; cray | j90) basic_machine=j90-cray os=-unicos ;; craynv) basic_machine=craynv-cray os=-unicosmp ;; cr16c) basic_machine=cr16c-unknown os=-elf ;; crds | unos) basic_machine=m68k-crds ;; crisv32 | crisv32-* | etraxfs*) basic_machine=crisv32-axis ;; cris | cris-* | etrax*) basic_machine=cris-axis ;; crx) basic_machine=crx-unknown os=-elf ;; da30 | da30-*) basic_machine=m68k-da30 ;; decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) basic_machine=mips-dec ;; decsystem10* | dec10*) basic_machine=pdp10-dec os=-tops10 ;; decsystem20* | dec20*) basic_machine=pdp10-dec os=-tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) basic_machine=m68k-motorola ;; delta88) basic_machine=m88k-motorola os=-sysv3 ;; djgpp) basic_machine=i586-pc os=-msdosdjgpp ;; dpx20 | dpx20-*) basic_machine=rs6000-bull os=-bosx ;; dpx2* | dpx2*-bull) basic_machine=m68k-bull os=-sysv3 ;; ebmon29k) basic_machine=a29k-amd os=-ebmon ;; elxsi) basic_machine=elxsi-elxsi os=-bsd ;; encore | umax | mmax) basic_machine=ns32k-encore ;; es1800 | OSE68k | ose68k | ose | OSE) basic_machine=m68k-ericsson os=-ose ;; fx2800) basic_machine=i860-alliant ;; genix) basic_machine=ns32k-ns ;; gmicro) basic_machine=tron-gmicro os=-sysv ;; go32) basic_machine=i386-pc os=-go32 ;; h3050r* | hiux*) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; h8300hms) basic_machine=h8300-hitachi os=-hms ;; h8300xray) basic_machine=h8300-hitachi os=-xray ;; h8500hms) basic_machine=h8500-hitachi os=-hms ;; harris) basic_machine=m88k-harris os=-sysv3 ;; hp300-*) basic_machine=m68k-hp ;; hp300bsd) basic_machine=m68k-hp os=-bsd ;; hp300hpux) basic_machine=m68k-hp os=-hpux ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) basic_machine=m68000-hp ;; hp9k3[2-9][0-9]) basic_machine=m68k-hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) basic_machine=hppa1.1-hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) basic_machine=hppa1.1-hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; hppa-next) 
os=-nextstep3 ;; hppaosf) basic_machine=hppa1.1-hp os=-osf ;; hppro) basic_machine=hppa1.1-hp os=-proelf ;; i370-ibm* | ibm*) basic_machine=i370-ibm ;; # I'm not sure what "Sysv32" means. Should this be sysv3.2? i*86v32) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; i386-vsta | vsta) basic_machine=i386-unknown os=-vsta ;; iris | iris4d) basic_machine=mips-sgi case $os in -irix*) ;; *) os=-irix4 ;; esac ;; isi68 | isi) basic_machine=m68k-isi os=-sysv ;; m88k-omron*) basic_machine=m88k-omron ;; magnum | m3230) basic_machine=mips-mips os=-sysv ;; merlin) basic_machine=ns32k-utek os=-sysv ;; mingw32) basic_machine=i386-pc os=-mingw32 ;; miniframe) basic_machine=m68000-convergent ;; *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) basic_machine=m68k-atari os=-mint ;; mips3*-*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` ;; mips3*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k os=-coff ;; morphos) basic_machine=powerpc-unknown os=-morphos ;; msdos) basic_machine=i386-pc os=-msdos ;; mvs) basic_machine=i370-ibm os=-mvs ;; ncr3000) basic_machine=i486-ncr os=-sysv4 ;; netbsd386) basic_machine=i386-unknown os=-netbsd ;; netwinder) basic_machine=armv4l-rebel os=-linux ;; news | news700 | news800 | news900) basic_machine=m68k-sony os=-newsos ;; news1000) basic_machine=m68030-sony os=-newsos ;; news-3600 | risc-news) basic_machine=mips-sony os=-newsos ;; necv70) basic_machine=v70-nec os=-sysv ;; next | m*-next ) basic_machine=m68k-next case $os in -nextstep* ) ;; -ns2*) os=-nextstep2 ;; *) os=-nextstep3 ;; esac ;; nh3000) basic_machine=m68k-harris os=-cxux ;; nh[45]000) basic_machine=m88k-harris os=-cxux ;; nindy960) basic_machine=i960-intel os=-nindy ;; mon960) basic_machine=i960-intel os=-mon960 ;; nonstopux) basic_machine=mips-compaq os=-nonstopux ;; np1) basic_machine=np1-gould ;; nsr-tandem) basic_machine=nsr-tandem ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf ;; openrisc | openrisc-*) basic_machine=or32-unknown ;; os400) basic_machine=powerpc-ibm os=-os400 ;; OSE68000 | ose68000) basic_machine=m68000-ericsson os=-ose ;; os68k) basic_machine=m68k-none os=-os68k ;; pa-hitachi) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; paragon) basic_machine=i860-intel os=-osf ;; pbd) basic_machine=sparc-tti ;; pbb) basic_machine=m68k-tti ;; pc532 | pc532-*) basic_machine=ns32k-pc532 ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc ;; pentiumpro | p6 | 6x86 | athlon | athlon_*) basic_machine=i686-pc ;; pentiumii | pentium2 | pentiumiii | pentium3) basic_machine=i686-pc ;; pentium4) basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium4-*) basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould ;; power) basic_machine=power-ibm ;; ppc) basic_machine=powerpc-unknown ;; ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppcle | powerpclittle | ppc-le | powerpc-little) 
basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64le | powerpc64little | ppc64-le | powerpc64-little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm ;; pw32) basic_machine=i586-unknown os=-pw32 ;; rom68k) basic_machine=m68k-rom68k os=-coff ;; rm[46]00) basic_machine=mips-siemens ;; rtpc | rtpc-*) basic_machine=romp-ibm ;; s390 | s390-*) basic_machine=s390-ibm ;; s390x | s390x-*) basic_machine=s390x-ibm ;; sa29200) basic_machine=a29k-amd os=-udi ;; sb1) basic_machine=mipsisa64sb1-unknown ;; sb1el) basic_machine=mipsisa64sb1el-unknown ;; sei) basic_machine=mips-sei os=-seiux ;; sequent) basic_machine=i386-sequent ;; sh) basic_machine=sh-hitachi os=-hms ;; sh64) basic_machine=sh64-unknown ;; sparclite-wrs | simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; sps7) basic_machine=m68k-bull os=-sysv2 ;; spur) basic_machine=spur-unknown ;; st2000) basic_machine=m68k-tandem ;; stratus) basic_machine=i860-stratus os=-sysv4 ;; sun2) basic_machine=m68000-sun ;; sun2os3) basic_machine=m68000-sun os=-sunos3 ;; sun2os4) basic_machine=m68000-sun os=-sunos4 ;; sun3os3) basic_machine=m68k-sun os=-sunos3 ;; sun3os4) basic_machine=m68k-sun os=-sunos4 ;; sun4os3) basic_machine=sparc-sun os=-sunos3 ;; sun4os4) basic_machine=sparc-sun os=-sunos4 ;; sun4sol2) basic_machine=sparc-sun os=-solaris2 ;; sun3 | sun3-*) basic_machine=m68k-sun ;; sun4) basic_machine=sparc-sun ;; sun386 | sun386i | roadrunner) basic_machine=i386-sun ;; sv1) basic_machine=sv1-cray os=-unicos ;; symmetry) basic_machine=i386-sequent os=-dynix ;; t3e) basic_machine=alphaev5-cray os=-unicos ;; t90) basic_machine=t90-cray os=-unicos ;; tic54x | c54x*) basic_machine=tic54x-unknown os=-coff ;; tic55x | c55x*) basic_machine=tic55x-unknown os=-coff ;; tic6x | c6x*) basic_machine=tic6x-unknown os=-coff ;; tx39) basic_machine=mipstx39-unknown ;; tx39el) basic_machine=mipstx39el-unknown ;; toad1) basic_machine=pdp10-xkl os=-tops20 ;; tower | tower-32) basic_machine=m68k-ncr ;; tpf) basic_machine=s390x-ibm os=-tpf ;; udi29k) basic_machine=a29k-amd os=-udi ;; ultra3) basic_machine=a29k-nyu os=-sym1 ;; v810 | necv810) basic_machine=v810-nec os=-none ;; vaxv) basic_machine=vax-dec os=-sysv ;; vms) basic_machine=vax-dec os=-vms ;; vpp*|vx|vx-*) basic_machine=f301-fujitsu ;; vxworks960) basic_machine=i960-wrs os=-vxworks ;; vxworks68) basic_machine=m68k-wrs os=-vxworks ;; vxworks29k) basic_machine=a29k-wrs os=-vxworks ;; w65*) basic_machine=w65-wdc os=-none ;; w89k-*) basic_machine=hppa1.1-winbond os=-proelf ;; xbox) basic_machine=i686-pc os=-mingw32 ;; xps | xps100) basic_machine=xps100-honeywell ;; ymp) basic_machine=ymp-cray os=-unicos ;; z8k-*-coff) basic_machine=z8k-unknown os=-sim ;; none) basic_machine=none-none os=-none ;; # Here we handle the default manufacturer of certain CPU types. It is in # some cases the only manufacturer, in others, it is the most popular. 
w89k) basic_machine=hppa1.1-winbond ;; op50n) basic_machine=hppa1.1-oki ;; op60c) basic_machine=hppa1.1-oki ;; romp) basic_machine=romp-ibm ;; mmix) basic_machine=mmix-knuth ;; rs6000) basic_machine=rs6000-ibm ;; vax) basic_machine=vax-dec ;; pdp10) # there are many clones, so DEC is not a safe bet basic_machine=pdp10-unknown ;; pdp11) basic_machine=pdp11-dec ;; we32k) basic_machine=we32k-att ;; sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; sparc | sparcv8 | sparcv9 | sparcv9b) basic_machine=sparc-sun ;; cydra) basic_machine=cydra-cydrome ;; orion) basic_machine=orion-highlevel ;; orion105) basic_machine=clipper-highlevel ;; mac | mpw | mac-mpw) basic_machine=m68k-apple ;; pmac | pmac-mpw) basic_machine=powerpc-apple ;; *-unknown) # Make sure to match an already-canonicalized machine name. ;; *) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; esac # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` ;; *-commodore*) basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` ;; *) ;; esac # Decode manufacturer-specific aliases for certain operating systems. if [ x"$os" != x"" ] then case $os in # First match some system type aliases # that might get confused with valid system types. # -solaris* is a basic system type, with this one exception. -solaris1 | -solaris1.*) os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; -solaris) os=-solaris2 ;; -svr4*) os=-sysv4 ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; # First accept the basic system types. # The portable systems comes first. # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* | -openbsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ | -chorusos* | -chorusrdb* \ | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -linux-gnu* | -linux-uclibc* | -uxpv* | -beos* | -mpeix* | -udk* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku*) # Remember, each alternative MUST END IN *, to match a version number. 
;; -qnx*) case $basic_machine in x86-* | i*86-*) ;; *) os=-nto$os ;; esac ;; -nto-qnx*) ;; -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc ;; -linux*) os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) os=`echo $os | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) os=`echo $os | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition ;; -os400*) os=-os400 ;; -wince*) os=-wince ;; -osfrose*) os=-osfrose ;; -osf*) os=-osf ;; -utek*) os=-bsd ;; -dynix*) os=-bsd ;; -acis*) os=-aos ;; -atheos*) os=-atheos ;; -syllable*) os=-syllable ;; -386bsd) os=-bsd ;; -ctix* | -uts*) os=-sysv ;; -nova*) os=-rtmk-nova ;; -ns2 ) os=-nextstep2 ;; -nsk*) os=-nsk ;; # Preserve the version number of sinix5. -sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; -sinix*) os=-sysv4 ;; -tpf*) os=-tpf ;; -triton*) os=-sysv3 ;; -oss*) os=-sysv3 ;; -svr4) os=-sysv4 ;; -svr3) os=-sysv3 ;; -sysvr4) os=-sysv4 ;; # This must come after -sysvr4. -sysv*) ;; -ose*) os=-ose ;; -es1800*) os=-ose ;; -xenix) os=-xenix ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; -aros*) os=-aros ;; -kaos*) os=-kaos ;; -zvmoe) os=-zvmoe ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. os=`echo $os | sed 's/[^-]*-//'` echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 exit 1 ;; esac else # Here we handle the default operating systems that come with various machines. # The value should be what the vendor currently ships out the door with their # machine or put another way, the most popular os provided with the machine. # Note that if you're going to try to match "-MANUFACTURER" here (say, # "-sun"), then you have to tell the case statement up towards the top # that MANUFACTURER isn't an operating system. Otherwise, code above # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. case $basic_machine in *-acorn) os=-riscix1.2 ;; arm*-rebel) os=-linux ;; arm*-semi) os=-aout ;; c4x-* | tic4x-*) os=-coff ;; # This must come before the *-dec entry. pdp10-*) os=-tops20 ;; pdp11-*) os=-none ;; *-dec | vax-*) os=-ultrix4.2 ;; m68*-apollo) os=-domain ;; i386-sun) os=-sunos4.0.2 ;; m68000-sun) os=-sunos3 # This also exists in the configure program, but was not the # default. # os=-sunos4 ;; m68*-cisco) os=-aout ;; mips*-cisco) os=-elf ;; mips*-*) os=-elf ;; or32-*) os=-coff ;; *-tti) # must be before sparc entry or we get the wrong os. 
os=-sysv3 ;; sparc-* | *-sun) os=-sunos4.1.1 ;; *-be) os=-beos ;; *-haiku) os=-haiku ;; *-ibm) os=-aix ;; *-knuth) os=-mmixware ;; *-wec) os=-proelf ;; *-winbond) os=-proelf ;; *-oki) os=-proelf ;; *-hp) os=-hpux ;; *-hitachi) os=-hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) os=-sysv ;; *-cbm) os=-amigaos ;; *-dg) os=-dgux ;; *-dolphin) os=-sysv3 ;; m68k-ccur) os=-rtu ;; m88k-omron*) os=-luna ;; *-next ) os=-nextstep ;; *-sequent) os=-ptx ;; *-crds) os=-unos ;; *-ns) os=-genix ;; i370-*) os=-mvs ;; *-next) os=-nextstep3 ;; *-gould) os=-sysv ;; *-highlevel) os=-bsd ;; *-encore) os=-bsd ;; *-sgi) os=-irix ;; *-siemens) os=-sysv4 ;; *-masscomp) os=-rtu ;; f30[01]-fujitsu | f700-fujitsu) os=-uxpv ;; *-rom68k) os=-coff ;; *-*bug) os=-coff ;; *-apple) os=-macos ;; *-atari*) os=-mint ;; *) os=-none ;; esac fi # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. vendor=unknown case $basic_machine in *-unknown) case $os in -riscix*) vendor=acorn ;; -sunos*) vendor=sun ;; -aix*) vendor=ibm ;; -beos*) vendor=be ;; -hpux*) vendor=hp ;; -mpeix*) vendor=hp ;; -hiux*) vendor=hitachi ;; -unos*) vendor=crds ;; -dgux*) vendor=dg ;; -luna*) vendor=omron ;; -genix*) vendor=ns ;; -mvs* | -opened*) vendor=ibm ;; -os400*) vendor=ibm ;; -ptx*) vendor=sequent ;; -tpf*) vendor=ibm ;; -vxsim* | -vxworks* | -windiss*) vendor=wrs ;; -aux*) vendor=apple ;; -hms*) vendor=hitachi ;; -mpw* | -macos*) vendor=apple ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) vendor=atari ;; -vos*) vendor=stratus ;; esac basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` ;; esac echo $basic_machine$os exit # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: tophat-2.0.9/build-aux/install-sh0000755000175000017500000002202112122334411015435 0ustar toortoor#!/bin/sh # install - install a program, script, or datafile scriptversion=2005-05-14.22 # This originates from X11R5 (mit/util/scripts/install.sh), which was # later released in X11R6 (xc/config/util/install.sh) with the # following copyright and license. # # Copyright (C) 1994 X Consortium # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to # deal in the Software without restriction, including without limitation the # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or # sell copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN # AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- # TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # # Except as contained in this notice, the name of the X Consortium shall not # be used in advertising or otherwise to promote the sale, use or other deal- # ings in this Software without prior written authorization from the X Consor- # tium. 
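# Typical invocations, assuming the option set documented in the usage text
# below (the paths here are placeholders, not anything this package requires):
#
#   ./install-sh -d /usr/local/share/doc/tophat
#   ./install-sh -c -m 644 README /usr/local/share/doc/tophat/README
#   ./install-sh -c -s src/tophat /usr/local/bin/tophat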
# # # FSF changes to this file are in the public domain. # # Calling this script install-sh is preferred over install.sh, to prevent # `make' implicit rules from creating a file called install from it # when there is no Makefile. # # This script is compatible with the BSD install script, but was written # from scratch. It can only install one file at a time, a restriction # shared with many OS's install programs. # set DOITPROG to echo to test this script # Don't use :- since 4.3BSD and earlier shells don't like it. doit="${DOITPROG-}" # put in absolute paths if you don't have them in your path; or use env. vars. mvprog="${MVPROG-mv}" cpprog="${CPPROG-cp}" chmodprog="${CHMODPROG-chmod}" chownprog="${CHOWNPROG-chown}" chgrpprog="${CHGRPPROG-chgrp}" stripprog="${STRIPPROG-strip}" rmprog="${RMPROG-rm}" mkdirprog="${MKDIRPROG-mkdir}" chmodcmd="$chmodprog 0755" chowncmd= chgrpcmd= stripcmd= rmcmd="$rmprog -f" mvcmd="$mvprog" src= dst= dir_arg= dstarg= no_target_directory= usage="Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE or: $0 [OPTION]... SRCFILES... DIRECTORY or: $0 [OPTION]... -t DIRECTORY SRCFILES... or: $0 [OPTION]... -d DIRECTORIES... In the 1st form, copy SRCFILE to DSTFILE. In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. In the 4th, create DIRECTORIES. Options: -c (ignored) -d create directories instead of installing files. -g GROUP $chgrpprog installed files to GROUP. -m MODE $chmodprog installed files to MODE. -o USER $chownprog installed files to USER. -s $stripprog installed files. -t DIRECTORY install into DIRECTORY. -T report an error if DSTFILE is a directory. --help display this help and exit. --version display version info and exit. Environment variables override the default commands: CHGRPPROG CHMODPROG CHOWNPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG " while test -n "$1"; do case $1 in -c) shift continue;; -d) dir_arg=true shift continue;; -g) chgrpcmd="$chgrpprog $2" shift shift continue;; --help) echo "$usage"; exit $?;; -m) chmodcmd="$chmodprog $2" shift shift continue;; -o) chowncmd="$chownprog $2" shift shift continue;; -s) stripcmd=$stripprog shift continue;; -t) dstarg=$2 shift shift continue;; -T) no_target_directory=true shift continue;; --version) echo "$0 $scriptversion"; exit $?;; *) # When -d is used, all remaining arguments are directories to create. # When -t is used, the destination is already specified. test -n "$dir_arg$dstarg" && break # Otherwise, the last argument is the destination. Remove it from $@. for arg do if test -n "$dstarg"; then # $@ is not empty: it contains at least $arg. set fnord "$@" "$dstarg" shift # fnord fi shift # arg dstarg=$arg done break;; esac done if test -z "$1"; then if test -z "$dir_arg"; then echo "$0: no input file specified." >&2 exit 1 fi # It's OK to call `install-sh -d' without argument. # This can happen when creating conditional directories. exit 0 fi for src do # Protect names starting with `-'. case $src in -*) src=./$src ;; esac if test -n "$dir_arg"; then dst=$src src= if test -d "$dst"; then mkdircmd=: chmodcmd= else mkdircmd=$mkdirprog fi else # Waiting for this to be detected by the "$cpprog $src $dsttmp" command # might cause directories to be created, which would be especially bad # if $src (and thus $dsttmp) contains '*'. if test ! -f "$src" && test ! -d "$src"; then echo "$0: $src does not exist." >&2 exit 1 fi if test -z "$dstarg"; then echo "$0: no destination specified." >&2 exit 1 fi dst=$dstarg # Protect names starting with `-'. 
case $dst in -*) dst=./$dst ;; esac # If destination is a directory, append the input filename; won't work # if double slashes aren't ignored. if test -d "$dst"; then if test -n "$no_target_directory"; then echo "$0: $dstarg: Is a directory" >&2 exit 1 fi dst=$dst/`basename "$src"` fi fi # This sed command emulates the dirname command. dstdir=`echo "$dst" | sed -e 's,/*$,,;s,[^/]*$,,;s,/*$,,;s,^$,.,'` # Make sure that the destination directory exists. # Skip lots of stat calls in the usual case. if test ! -d "$dstdir"; then defaultIFS=' ' IFS="${IFS-$defaultIFS}" oIFS=$IFS # Some sh's can't handle IFS=/ for some reason. IFS='%' set x `echo "$dstdir" | sed -e 's@/@%@g' -e 's@^%@/@'` shift IFS=$oIFS pathcomp= while test $# -ne 0 ; do pathcomp=$pathcomp$1 shift if test ! -d "$pathcomp"; then $mkdirprog "$pathcomp" # mkdir can fail with a `File exist' error in case several # install-sh are creating the directory concurrently. This # is OK. test -d "$pathcomp" || exit fi pathcomp=$pathcomp/ done fi if test -n "$dir_arg"; then $doit $mkdircmd "$dst" \ && { test -z "$chowncmd" || $doit $chowncmd "$dst"; } \ && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } \ && { test -z "$stripcmd" || $doit $stripcmd "$dst"; } \ && { test -z "$chmodcmd" || $doit $chmodcmd "$dst"; } else dstfile=`basename "$dst"` # Make a couple of temp file names in the proper directory. dsttmp=$dstdir/_inst.$$_ rmtmp=$dstdir/_rm.$$_ # Trap to clean up those temp files at exit. trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 trap '(exit $?); exit' 1 2 13 15 # Copy the file name to the temp name. $doit $cpprog "$src" "$dsttmp" && # and set any options; do chmod last to preserve setuid bits. # # If any of these fail, we abort the whole thing. If we want to # ignore errors from any of these, just make sure not to ignore # errors from the above "$doit $cpprog $src $dsttmp" command. # { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } \ && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } \ && { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } \ && { test -z "$chmodcmd" || $doit $chmodcmd "$dsttmp"; } && # Now rename the file to the real destination. { $doit $mvcmd -f "$dsttmp" "$dstdir/$dstfile" 2>/dev/null \ || { # The rename failed, perhaps because mv can't rename something else # to itself, or perhaps because mv is so ancient that it does not # support -f. # Now remove or move aside any old file at destination location. # We try this two ways since rm can't unlink itself on some # systems and the destination file might be busy for other # reasons. In this case, the final cleanup might fail but the new # file should still install successfully. { if test -f "$dstdir/$dstfile"; then $doit $rmcmd -f "$dstdir/$dstfile" 2>/dev/null \ || $doit $mvcmd -f "$dstdir/$dstfile" "$rmtmp" 2>/dev/null \ || { echo "$0: cannot unlink or rename $dstdir/$dstfile" >&2 (exit 1); exit 1 } else : fi } && # Now rename the file to the real destination. $doit $mvcmd "$dsttmp" "$dstdir/$dstfile" } } fi || { (exit 1); exit 1; } done # The final little trick to "correctly" pass the exit status to the exit trap. { (exit 0); exit 0 } # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "scriptversion=" # time-stamp-format: "%:y-%02m-%02d.%02H" # time-stamp-end: "$" # End: tophat-2.0.9/build-aux/config.guess0000755000175000017500000012463412122334411015766 0ustar toortoor#! /bin/sh # Attempt to guess a canonical system name. 
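# Rough examples of the canonical names this script prints (in the
# CPU-VENDOR-OS triplet form); the actual output depends on uname:
#
#   ./config.guess    # e.g. x86_64-unknown-linux-gnu   (x86-64 GNU/Linux)
#                     # e.g. i386-pc-solaris2.10        (Solaris 10 on x86)
#                     # e.g. powerpc-apple-darwin8.0.0  (Mac OS X 10.4, PowerPC)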
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. timestamp='2005-07-08' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA # 02110-1301, USA. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Originally written by Per Bothner . # Please send patches to . Submit a context # diff and a properly formatted ChangeLog entry. # # This script attempts to guess a canonical system name similar to # config.sub. If it succeeds, it prints the system name on stdout, and # exits with 0. Otherwise, it exits with 1. # # The plan is that this can be called by configure scripts if you # don't specify an explicit build system type. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] Output the configuration name of the system \`$me' is run on. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" >&2 exit 1 ;; * ) break ;; esac done if test $# != 0; then echo "$me: too many arguments$help" >&2 exit 1 fi trap 'exit 1' 1 2 15 # CC_FOR_BUILD -- compiler used by this script. Note that the use of a # compiler to aid in system detection is discouraged as it requires # temporary files to be created and, as you can see below, it is a # headache to deal with in a portable fashion. # Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still # use `HOST_CC' if defined, but it is deprecated. # Portable tmp directory creation inspired by the Autoconf team. 
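# A compiler is only needed for a few of the cases below; the fragment that
# follows picks the first of cc, gcc, c89, c99 that works, unless
# CC_FOR_BUILD, HOST_CC or CC is already set in the environment, e.g.:
#
#   CC_FOR_BUILD=/usr/bin/gcc ./config.guess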
set_cc_for_build=' trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; : ${TMPDIR=/tmp} ; { tmp=`(umask 077 && mktemp -d -q "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in ,,) echo "int x;" > $dummy.c ; for c in cc gcc c89 c99 ; do if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; if test x"$CC_FOR_BUILD" = x ; then CC_FOR_BUILD=no_compiler_found ; fi ;; ,,*) CC_FOR_BUILD=$CC ;; ,*,*) CC_FOR_BUILD=$HOST_CC ;; esac ; set_cc_for_build= ;' # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) if (test -f /.attbin/uname) >/dev/null 2>&1 ; then PATH=$PATH:/.attbin ; export PATH fi UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward # compatibility and a consistent mechanism for selecting the # object file format. # # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ /usr/sbin/$sysctl 2>/dev/null || echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently, or will in the future. case "${UNAME_MACHINE_ARCH}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep __ELF__ >/dev/null then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? os=netbsd else os=netbsdelf fi ;; *) os=netbsd ;; esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. case "${UNAME_VERSION}" in Debian*) release='-gnu' ;; *) release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
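# For example, on NetBSD/i386 3.0 with ELF object files the three parts
# assembled below come out as something like:
#   i386-unknown-netbsdelf3.0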
echo "${machine}-${os}${release}" exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; macppc:MirBSD:*:*) echo powerppc-unknown-mirbsd${UNAME_RELEASE} exit ;; *:MirBSD:*:*) echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on # OSF/1 and Tru64 systems produced since 1995. I hope that # covers most systems running today. This code pipes the CPU # types through head -n 1, so we only detect the type of CPU 0. ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") UNAME_MACHINE="alpha" ;; "EV4.5 (21064)") UNAME_MACHINE="alpha" ;; "LCA4 (21066/21068)") UNAME_MACHINE="alpha" ;; "EV5 (21164)") UNAME_MACHINE="alphaev5" ;; "EV5.6 (21164A)") UNAME_MACHINE="alphaev56" ;; "EV5.6 (21164PC)") UNAME_MACHINE="alphapca56" ;; "EV5.7 (21164PC)") UNAME_MACHINE="alphapca57" ;; "EV6 (21264)") UNAME_MACHINE="alphaev6" ;; "EV6.7 (21264A)") UNAME_MACHINE="alphaev67" ;; "EV6.8CB (21264C)") UNAME_MACHINE="alphaev68" ;; "EV6.8AL (21264B)") UNAME_MACHINE="alphaev68" ;; "EV6.8CX (21264D)") UNAME_MACHINE="alphaev68" ;; "EV6.9A (21264/EV69A)") UNAME_MACHINE="alphaev69" ;; "EV7 (21364)") UNAME_MACHINE="alphaev7" ;; "EV7.9 (21364A)") UNAME_MACHINE="alphaev79" ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` exit ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead # of the specific Alpha model? echo alpha-pc-interix exit ;; 21064:Windows_NT:50:3) echo alpha-dec-winnt3.5 exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition exit ;; *:z/VM:*:*) echo s390-ibm-zvmoe exit ;; *:OS400:*:*) echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; arm:riscos:*:*|arm:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) echo hppa1.1-hitachi-hiuxmpp exit ;; Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
if test "`(/bin/universe) 2>/dev/null`" = att ; then echo pyramid-pyramid-sysv3 else echo pyramid-pyramid-bsd fi exit ;; NILE*:*:*:dcosx) echo pyramid-pyramid-svr4 exit ;; DRS?6000:unix:4.0:6*) echo sparc-icl-nx6 exit ;; DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; i86pc:SunOS:5.*:*) echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in Series*|S4*) UNAME_RELEASE=`uname -v` ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` exit ;; sun3*:SunOS:*:*) echo m68k-sun-sunos${UNAME_RELEASE} exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} ;; sun4) echo sparc-sun-sunos${UNAME_RELEASE} ;; esac exit ;; aushp:SunOS:*:*) echo sparc-auspex-sunos${UNAME_RELEASE} exit ;; # The situation for MiNT is a little confusing. The machine name # can be virtually everything (everything which is not # "atarist" or "atariste" at least should have a processor # > m68000). The system name ranges from "MiNT" over "FreeMiNT" # to the lowercase version "mint" (or "freemint"). Finally # the system name "TOS" denotes a system which is actually not # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. 
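# Accordingly, the MiNT entries below normalize these machines to an
# m68k-*-mint triplet; a Falcon running FreeMiNT 1.16, for instance,
# reports something like m68k-atari-mint1.16.1.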
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) echo m68k-milan-mint${UNAME_RELEASE} exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) echo m68k-hades-mint${UNAME_RELEASE} exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) echo m68k-unknown-mint${UNAME_RELEASE} exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; powerpc:machten:*:*) echo powerpc-apple-machten${UNAME_RELEASE} exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) echo mips-dec-ultrix${UNAME_RELEASE} exit ;; VAX*:ULTRIX*:*:*) echo vax-dec-ultrix${UNAME_RELEASE} exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) echo clipper-intergraph-clix${UNAME_RELEASE} exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { #else int main (argc, argv) int argc; char *argv[]; { #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && SYSTEM_NAME=`$dummy $dummyarg` && { echo "$SYSTEM_NAME"; exit; } echo mips-mips-riscos${UNAME_RELEASE} exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax exit ;; Motorola:*:4.3:PL8-*) echo powerpc-harris-powermax exit ;; Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) echo powerpc-harris-powermax exit ;; Night_Hawk:Power_UNIX:*:*) echo powerpc-harris-powerunix exit ;; m88k:CX/UX:7*:*) echo m88k-harris-cxux7 exit ;; m88k:*:4*:R4*) echo m88k-motorola-sysv4 exit ;; m88k:*:3*:R3*) echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ [ ${TARGET_BINARY_INTERFACE}x = x ] then echo m88k-dg-dgux${UNAME_RELEASE} else echo m88k-dg-dguxbcs${UNAME_RELEASE} fi else echo i586-dg-dgux${UNAME_RELEASE} fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; M88*:*:R3*:*) # Delta 88k system running SVR3 echo m88k-motorola-sysv3 exit ;; XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) echo m88k-tektronix-sysv3 exit ;; Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' i*86:AIX:*:*) echo i386-ibm-aix exit ;; ia64:AIX:*:*) if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include main() { if (!__power_pc()) exit(1); puts("powerpc-ibm-aix3.2.5"); exit(0); } EOF if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` then echo "$SYSTEM_NAME" else echo rs6000-ibm-aix3.2.5 fi elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then echo rs6000-ibm-aix3.2.4 else echo rs6000-ibm-aix3.2 fi exit ;; *:AIX:*:[45]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc fi if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${IBM_ARCH}-ibm-aix${IBM_REV} exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; ibmrt:4.4BSD:*|romp-ibm:BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx exit ;; DPX/2?00:B.O.S.:*:*) echo m68k-bull-sysv3 exit ;; 9000/[34]??:4.3bsd:1.*:*) echo m68k-hp-bsd exit ;; hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in 9000/31? ) HP_ARCH=m68000 ;; 9000/[34]?? ) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in 32) HP_ARCH="hppa2.0n" ;; 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 esac ;; esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #define _HPUX_SOURCE #include #include int main () { #if defined(_SC_KERNEL_BITS) long bits = sysconf(_SC_KERNEL_BITS); #endif long cpu = sysconf (_SC_CPU_VERSION); switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0"); break; case CPU_PA_RISC1_1: puts ("hppa1.1"); break; case CPU_PA_RISC2_0: #if defined(_SC_KERNEL_BITS) switch (bits) { case 64: puts ("hppa2.0w"); break; case 32: puts ("hppa2.0n"); break; default: puts ("hppa2.0"); break; } break; #else /* !defined(_SC_KERNEL_BITS) */ puts ("hppa2.0"); break; #endif default: puts ("hppa1.0"); break; } exit (0); } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac if [ ${HP_ARCH} = "hppa2.0w" ] then eval $set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler # generating 64-bit code. 
GNU and HP use different nomenclature: # # $ CC_FOR_BUILD=cc ./config.guess # => hppa2.0w-hp-hpux11.23 # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | grep __LP64__ >/dev/null then HP_ARCH="hppa2.0w" else HP_ARCH="hppa64" fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} exit ;; ia64:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` echo ia64-hp-hpux${HPUX_REV} exit ;; 3050*:HI-UX:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include int main () { long cpu = sysconf (_SC_CPU_VERSION); /* The order matters, because CPU_IS_HP_MC68K erroneously returns true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct results, however. */ if (CPU_IS_PA_RISC (cpu)) { switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; default: puts ("hppa-hitachi-hiuxwe2"); break; } } else if (CPU_IS_HP_MC68K (cpu)) puts ("m68k-hitachi-hiuxwe2"); else puts ("unknown-hitachi-hiuxwe2"); exit (0); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) echo hppa1.0-hp-bsd exit ;; *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then echo ${UNAME_MACHINE}-unknown-osf1mk else echo ${UNAME_MACHINE}-unknown-osf1 fi exit ;; parisc*:Lites*:*:*) echo hppa1.1-hp-lites exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) 
echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} exit ;; sparc*:BSD/OS:*:*) echo sparc-unknown-bsdi${UNAME_RELEASE} exit ;; *:BSD/OS:*:*) echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; i*:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; i*:windows32*:*) # uname -m includes "-pc" on this system. echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; x86:Interix*:[34]*) echo i586-pc-interix${UNAME_RELEASE}|sed -e 's/\..*//' exit ;; [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we # UNAME_MACHINE based on the output of uname instead of i386? echo i586-pc-interix exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; p*:CYGWIN*:*) echo powerpcle-unknown-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; *:GNU:*:*) # the GNU system echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; arm*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; cris:Linux:*:*) echo cris-axis-linux-gnu exit ;; crisv32:Linux:*:*) echo crisv32-axis-linux-gnu exit ;; frv:Linux:*:*) echo frv-unknown-linux-gnu exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m68*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; mips:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef mips #undef mipsel #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=mipsel #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=mips #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef mips64 #undef mips64el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=mips64el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=mips64 #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; ppc:Linux:*:*) echo powerpc-unknown-linux-gnu exit ;; ppc64:Linux:*:*) echo powerpc64-unknown-linux-gnu exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; PCA57) UNAME_MACHINE=alphapca56 ;; EV6) UNAME_MACHINE=alphaev6 ;; EV67) UNAME_MACHINE=alphaev67 ;; EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in PA7*) echo hppa1.1-unknown-linux-gnu ;; PA8*) echo hppa2.0-unknown-linux-gnu ;; *) echo hppa-unknown-linux-gnu ;; esac exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) echo hppa64-unknown-linux-gnu exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux exit ;; sh64*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; x86_64:Linux:*:*) echo x86_64-unknown-linux-gnu exit ;; i*86:Linux:*:*) # The BFD linker knows what the default object file format is, so # first see if it will tell us. cd to the root directory to prevent # problems with other programs or directories called `ld' in the path. # Set LC_ALL=C to ensure ld outputs messages in English. ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ | sed -ne '/supported targets:/!d s/[ ][ ]*/ /g s/.*supported targets: *// s/ .*// p'` case "$ld_supported_targets" in elf32-i386) TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" ;; a.out-i386-linux) echo "${UNAME_MACHINE}-pc-linux-gnuaout" exit ;; coff-i386) echo "${UNAME_MACHINE}-pc-linux-gnucoff" exit ;; "") # Either a pre-BFD a.out linker (linux-gnuoldld) or # one that does not give us useful --help. echo "${UNAME_MACHINE}-pc-linux-gnuoldld" exit ;; esac # Determine whether the default compiler is a.out or elf eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include #ifdef __ELF__ # ifdef __GLIBC__ # if __GLIBC__ >= 2 LIBC=gnu # else LIBC=gnulibc1 # endif # else LIBC=gnulibc1 # endif #else #ifdef __INTEL_COMPILER LIBC=gnu #else LIBC=gnuaout #endif #endif #ifdef __dietlibc__ LIBC=dietlibc #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` test x"${LIBC}" != x && { echo "${UNAME_MACHINE}-pc-linux-${LIBC}" exit } test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both # sysname and nodename. echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) # Unixware is an offshoot of SVR4, but it has its own version # number series starting with 2... # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. echo ${UNAME_MACHINE}-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) echo ${UNAME_MACHINE}-unknown-stop exit ;; i*86:atheos:*:*) echo ${UNAME_MACHINE}-unknown-atheos exit ;; i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) echo ${UNAME_MACHINE}-pc-msdosdjgpp exit ;; i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} else echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} fi exit ;; i*86:*:5:[678]*) # UnixWare 7.x, OpenUNIX and OpenServer 6. 
case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ && UNAME_MACHINE=i586 (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 echo ${UNAME_MACHINE}-pc-sco$UNAME_REL else echo ${UNAME_MACHINE}-pc-sysv32 fi exit ;; pc:*:*:*) # Left here for compatibility: # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i386. echo i386-pc-msdosdjgpp exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; paragon:*:*:*) echo i860-intel-osf1 exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered. echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) # "miniframe" echo m68010-convergent-sysv exit ;; mc68k:UNIX:SYSTEM5:3.51m) echo m68k-convergent-sysv exit ;; M680?0:D-NIX:5.3:*) echo m68k-diab-dnix exit ;; M68*:*:R3V[5678]*:*) test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) OS_REL='' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) echo m68k-unknown-lynxos${UNAME_RELEASE} exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) echo sparc-unknown-lynxos${UNAME_RELEASE} exit ;; rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) echo mips-dde-sysv${UNAME_RELEASE} exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 exit ;; RM*:SINIX-*:*:*) echo mips-sni-sysv4 exit ;; *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` echo ${UNAME_MACHINE}-sni-sysv4 else echo ns32k-sni-sysv fi exit ;; PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says echo i586-unisys-sysv4 exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes . # How about differentiating between stratus architectures? -djm echo hppa1.1-stratus-sysv4 exit ;; *:*:*:FTX*) # From seanf@swdc.stratus.com. echo i860-stratus-sysv4 exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. echo ${UNAME_MACHINE}-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. 
echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) echo m68k-apple-aux${UNAME_RELEASE} exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then echo mips-nec-sysv${UNAME_RELEASE} else echo mips-unknown-sysv${UNAME_RELEASE} fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. echo powerpc-apple-beos exit ;; BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; SX-5:SUPER-UX:*:*) echo sx5-nec-superux${UNAME_RELEASE} exit ;; SX-6:SUPER-UX:*:*) echo sx6-nec-superux${UNAME_RELEASE} exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; *:Rhapsody:*:*) echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown case $UNAME_PROCESSOR in *86) UNAME_PROCESSOR=i686 ;; unknown) UNAME_PROCESSOR=powerpc ;; esac echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` if test "$UNAME_PROCESSOR" = "x86"; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; NSE-?:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-?:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; BS2000:POSIX*:*:*) echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. if test "$cputype" = "386"; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi echo ${UNAME_MACHINE}-unknown-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-unknown-tops10 exit ;; *:TENEX:*:*) echo pdp10-unknown-tenex exit ;; KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) echo pdp10-dec-tops20 exit ;; XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) echo pdp10-xkl-tops20 exit ;; *:TOPS-20:*:*) echo pdp10-unknown-tops20 exit ;; *:ITS:*:*) echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; esac ;; *:XENIX:*:SysV) echo i386-pc-xenix exit ;; i*86:skyos:*:*) echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' exit ;; esac #echo '(No uname command or uname output not recognized.)' 1>&2 #echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 eval $set_cc_for_build cat >$dummy.c < # include #endif main () { #if defined (sony) #if defined (MIPSEB) /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, I don't know.... 
*/ printf ("mips-sony-bsd\n"); exit (0); #else #include printf ("m68k-sony-newsos%s\n", #ifdef NEWSOS4 "4" #else "" #endif ); exit (0); #endif #endif #if defined (__arm) && defined (__acorn) && defined (__unix) printf ("arm-acorn-riscix\n"); exit (0); #endif #if defined (hp300) && !defined (hpux) printf ("m68k-hp-bsd\n"); exit (0); #endif #if defined (NeXT) #if !defined (__ARCHITECTURE__) #define __ARCHITECTURE__ "m68k" #endif int version; version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; if (version < 4) printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); else printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); exit (0); #endif #if defined (MULTIMAX) || defined (n16) #if defined (UMAXV) printf ("ns32k-encore-sysv\n"); exit (0); #else #if defined (CMU) printf ("ns32k-encore-mach\n"); exit (0); #else printf ("ns32k-encore-bsd\n"); exit (0); #endif #endif #endif #if defined (__386BSD__) printf ("i386-pc-bsd\n"); exit (0); #endif #if defined (sequent) #if defined (i386) printf ("i386-sequent-dynix\n"); exit (0); #endif #if defined (ns32000) printf ("ns32k-sequent-dynix\n"); exit (0); #endif #endif #if defined (_SEQUENT_) struct utsname un; uname(&un); if (strncmp(un.version, "V2", 2) == 0) { printf ("i386-sequent-ptx2\n"); exit (0); } if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ printf ("i386-sequent-ptx1\n"); exit (0); } printf ("i386-sequent-ptx\n"); exit (0); #endif #if defined (vax) # if !defined (ultrix) # include # if defined (BSD) # if BSD == 43 printf ("vax-dec-bsd4.3\n"); exit (0); # else # if BSD == 199006 printf ("vax-dec-bsd4.3reno\n"); exit (0); # else printf ("vax-dec-bsd\n"); exit (0); # endif # endif # else printf ("vax-dec-bsd\n"); exit (0); # endif # else printf ("vax-dec-ultrix\n"); exit (0); # endif #endif #if defined (alliant) && defined (i860) printf ("i860-alliant-bsd\n"); exit (0); #endif exit (1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } # Apollos put the system type in the environment. test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } # Convex versions that predate uname can use getsysinfo(1) if [ -x /usr/convex/getsysinfo ] then case `getsysinfo -f cpu_type` in c1*) echo c1-convex-bsd exit ;; c2*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; c34*) echo c34-convex-bsd exit ;; c38*) echo c38-convex-bsd exit ;; c4*) echo c4-convex-bsd exit ;; esac fi cat >&2 < in order to provide the needed information to handle your system. 
config.guess timestamp = $timestamp uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` /bin/uname -X = `(/bin/uname -X) 2>/dev/null` hostinfo = `(hostinfo) 2>/dev/null` /bin/universe = `(/bin/universe) 2>/dev/null` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` /bin/arch = `(/bin/arch) 2>/dev/null` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` UNAME_MACHINE = ${UNAME_MACHINE} UNAME_RELEASE = ${UNAME_RELEASE} UNAME_SYSTEM = ${UNAME_SYSTEM} UNAME_VERSION = ${UNAME_VERSION} EOF exit 1 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: tophat-2.0.9/ax_bam.m40000644000175000017500000001453012122334411013236 0ustar toortoor# SYNOPSIS # # AX_BAM # # DESCRIPTION # # Test for the BAM libraries of a particular version (or newer) # # If no path to the installed bam library is given the macro searchs # under /usr, /usr/local, /opt and /opt/local and evaluates the # $BAM_ROOT environment variable. # Adapted from AX_BOOST_BASE # # This macro calls: # # AC_SUBST(BAM_CPPFLAGS) / AC_SUBST(BAM_LDFLAGS) # # And sets: # # HAVE_BAM # # LICENSE # # Copyright (c) 2010 Cole Trapnell # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. AC_DEFUN([AX_BAM], [ AC_ARG_WITH([bam], AS_HELP_STRING([--with-bam@<:@=DIR@:>@], [use BAM libraries (default is yes) - it is possible to specify the root directory for BAM (optional)]), [ if test "$withval" = "no"; then want_bam="no" elif test "$withval" = "yes"; then want_bam="yes" ac_bam_path="" else want_bam="yes" ac_bam_path="$withval" fi ], [want_bam="yes"]) AC_ARG_WITH([bam-libdir], AS_HELP_STRING([--with-bam-libdir=LIB_DIR], [Force given directory for bam libraries. 
Note that this will overwrite library path detection, so use this parameter only if default library detection fails and you know exactly where your bam libraries are located.]), [ if test -d $withval then ac_bam_lib_path="$withval" else AC_MSG_ERROR(--with-bam-libdir expected directory name) fi ], [ac_bam_lib_path=""] ) if test "x$want_bam" = "xyes"; then # bam_lib_version_req=ifelse([$1], ,1.20.0,$1) # bam_lib_version_req_shorten=`expr $bam_lib_version_req : '\([[0-9]]*\.[[0-9]]*\)'` # bam_lib_version_req_major=`expr $bam_lib_version_req : '\([[0-9]]*\)'` # bam_lib_version_req_minor=`expr $bam_lib_version_req : '[[0-9]]*\.\([[0-9]]*\)'` # bam_lib_version_req_sub_minor=`expr $bam_lib_version_req : '[[0-9]]*\.[[0-9]]*\.\([[0-9]]*\)'` # if test "x$bam_lib_version_req_sub_minor" = "x" ; then # bam_lib_version_req_sub_minor="0" # fi # WANT_BAM_VERSION=`expr $bam_lib_version_req_major \* 100000 \+ $bam_lib_version_req_minor \* 100 \+ $bam_lib_version_req_sub_minor` AC_MSG_CHECKING(for bamlib) succeeded=no dnl first we check the system location for bam libraries if test "$ac_bam_path" != ""; then BAM_LDFLAGS="-L$ac_bam_path/lib" BAM_CPPFLAGS="-I$ac_bam_path/include" else for ac_bam_path_tmp in /usr /usr/local /opt /opt/local ; do if test -d "$ac_bam_path_tmp/include/bam" && test -r "$ac_bam_path_tmp/include/bam"; then BAM_LDFLAGS="-L$ac_bam_path_tmp/lib" BAM_CPPFLAGS="-I$ac_bam_path_tmp/include" break; fi done fi dnl overwrite ld flags if we have required special directory with dnl --with-bam-libdir parameter if test "$ac_bam_lib_path" != ""; then BAM_LDFLAGS="-L$ac_bam_lib_path" fi CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BAM_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BAM_LDFLAGS" export LDFLAGS AC_LANG_PUSH(C++) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ @%:@include ]], [[ ]])],[ AC_MSG_RESULT(yes) succeeded=yes found_system=yes ],[ ]) AC_LANG_POP([C++]) dnl if we found no bam with system layout we search for bam libraries dnl built and installed without the --layout=system option or for a staged(not installed) version if test "x$succeeded" != "xyes"; then _version=0 if test "$ac_bam_path" != ""; then if test -d "$ac_bam_path" && test -r "$ac_bam_path"; then for i in `ls -d $ac_bam_path/include/bam-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_bam_path##" | sed 's/\/include\/bam-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp fi VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BAM_CPPFLAGS="-I$ac_bam_path/include/bam-$VERSION_UNDERSCORE" done fi else for ac_bam_path in /usr /usr/local /opt /opt/local ; do if test -d "$ac_bam_path" && test -r "$ac_bam_path"; then for i in `ls -d $ac_bam_path/include/bam-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_bam_path##" | sed 's/\/include\/bam-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp best_path=$ac_bam_path fi done fi done VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BAM_CPPFLAGS="-I$best_path/include/bam-$VERSION_UNDERSCORE" if test "$ac_bam_lib_path" = "" then BAM_LDFLAGS="-L$best_path/lib" fi if test "x$BAM_ROOT" != "x"; then if test -d "$BAM_ROOT" && test -r "$BAM_ROOT" && test -d "$BAM_ROOT/stage/lib" && test -r "$BAM_ROOT/stage/lib"; then version_dir=`expr //$BAM_ROOT : '.*/\(.*\)'` stage_version=`echo $version_dir | sed 's/bam_//' | sed 's/_/./g'` stage_version_shorten=`expr $stage_version : '\([[0-9]]*\.[[0-9]]*\)'` V_CHECK=`expr 
$stage_version_shorten \>\= $_version` if test "$V_CHECK" = "1" -a "$ac_bam_lib_path" = "" ; then AC_MSG_NOTICE(We will use a staged bam library from $BAM_ROOT) BAM_CPPFLAGS="-I$BAM_ROOT" BAM_LDFLAGS="-L$BAM_ROOT/stage/lib" fi fi fi fi CPPFLAGS="$CPPFLAGS $BAM_CPPFLAGS" export CPPFLAGS LDFLAGS="$LDFLAGS $BAM_LDFLAGS" export LDFLAGS AC_LANG_PUSH(C++) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ @%:@include ]], [[ ]])],[ AC_MSG_RESULT(yes) succeeded=yes found_system=yes ],[ ]) AC_LANG_POP([C++]) fi if test "$succeeded" != "yes" ; then if test "$_version" = "0" ; then AC_MSG_ERROR([[We could not detect the bam libraries (version $bam_lib_version_req_shorten or higher). If you have a staged bam library (still not installed) please specify \$BAM_ROOT in your environment and do not give a PATH to --with-bam option.]]) else AC_MSG_NOTICE([Your bam libraries seem too old (version $_version).]) fi else BAM_LIB="-lbam" AC_SUBST(BAM_CPPFLAGS) AC_SUBST(BAM_LDFLAGS) AC_SUBST(BAM_LIB) AC_DEFINE(HAVE_BAM,,[define if the BAM library is available]) fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi ]) tophat-2.0.9/Makefile.in0000644000175000017500000004545312157116230013627 0ustar toortoor# Makefile.in generated by automake 1.9.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. @SET_MAKE@ srcdir = @srcdir@ top_srcdir = @top_srcdir@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ top_builddir = . am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd INSTALL = @INSTALL@ install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \ $(srcdir)/Makefile.in $(srcdir)/config.h.in \ $(top_srcdir)/configure AUTHORS COPYING ChangeLog INSTALL NEWS \ THANKS build-aux/config.guess build-aux/config.sub \ build-aux/depcomp build-aux/install-sh build-aux/missing subdir = . 
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/ax_boost_base.m4 \ $(top_srcdir)/ax_boost_thread.m4 $(top_srcdir)/ax_bam.m4 \ $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ configure.lineno configure.status.lineno mkinstalldirs = $(install_sh) -d CONFIG_HEADER = config.h CONFIG_CLEAN_FILES = SOURCES = DIST_SOURCES = RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ html-recursive info-recursive install-data-recursive \ install-exec-recursive install-info-recursive \ install-recursive installcheck-recursive installdirs-recursive \ pdf-recursive ps-recursive uninstall-info-recursive \ uninstall-recursive ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) distdir = $(PACKAGE)-$(VERSION) top_distdir = $(distdir) am__remove_distdir = \ { test ! -d $(distdir) \ || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \ && rm -fr $(distdir); }; } DIST_ARCHIVES = $(distdir).tar.gz GZIP_ENV = --best distuninstallcheck_listfiles = find . -type f -print distcleancheck_listfiles = find . -type f -print ACLOCAL = @ACLOCAL@ AMDEP_FALSE = @AMDEP_FALSE@ AMDEP_TRUE = @AMDEP_TRUE@ AMTAR = @AMTAR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ BAM_CPPFLAGS = @BAM_CPPFLAGS@ BAM_LDFLAGS = @BAM_LDFLAGS@ BAM_LIB = @BAM_LIB@ BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ BOOST_LDFLAGS = @BOOST_LDFLAGS@ BOOST_SYSTEM_LIB = @BOOST_SYSTEM_LIB@ BOOST_THREAD_LIB = @BOOST_THREAD_LIB@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ OBJEXT = @OBJEXT@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PYTHON = @PYTHON@ PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ PYTHON_PLATFORM = @PYTHON_PLATFORM@ PYTHON_PREFIX = @PYTHON_PREFIX@ PYTHON_VERSION = @PYTHON_VERSION@ RANLIB = @RANLIB@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_RANLIB = @ac_ct_RANLIB@ ac_ct_STRIP = @ac_ct_STRIP@ am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ datadir = @datadir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = 
@mkdir_p@ oldincludedir = @oldincludedir@ pkgpyexecdir = @pkgpyexecdir@ pkgpythondir = @pkgpythondir@ prefix = @prefix@ program_transform_name = @program_transform_name@ pyexecdir = @pyexecdir@ pythondir = @pythondir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ ALWAYS_BUILT = src SUBDIRS = $(ALWAYS_BUILT) DIST_SUBDIRS = $(ALWAYS_BUILT) EXTRA_DIST = LICENSE all: config.h $(MAKE) $(AM_MAKEFLAGS) all-recursive .SUFFIXES: am--refresh: @: $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \ cd $(srcdir) && $(AUTOMAKE) --foreign \ && exit 0; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \ cd $(top_srcdir) && \ $(AUTOMAKE) --foreign Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ echo ' $(SHELL) ./config.status'; \ $(SHELL) ./config.status;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) $(SHELL) ./config.status --recheck $(top_srcdir)/configure: $(am__configure_deps) cd $(srcdir) && $(AUTOCONF) $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) config.h: stamp-h1 @if test ! -f $@; then \ rm -f stamp-h1; \ $(MAKE) stamp-h1; \ else :; fi stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status @rm -f stamp-h1 cd $(top_builddir) && $(SHELL) ./config.status config.h $(srcdir)/config.h.in: $(am__configure_deps) cd $(top_srcdir) && $(AUTOHEADER) rm -f stamp-h1 touch $@ distclean-hdr: -rm -f config.h stamp-h1 uninstall-info-am: # This directory's subdirectories are mostly independent; you can cd # into them and run `make' without going through this Makefile. # To change the values of `make' variables: instead of editing Makefiles, # (1) if the variable is set in `config.status', edit `config.status' # (which will cause the Makefiles to be regenerated when you run `make'); # (2) otherwise, pass the desired values on the `make' command line. 
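# For example (the variable and values below are only illustrative): make CFLAGS='-g -O0'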
$(RECURSIVE_TARGETS): @failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ list='$(SUBDIRS)'; for subdir in $$list; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ dot_seen=yes; \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done; \ if test "$$dot_seen" = "no"; then \ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ fi; test -z "$$fail" mostlyclean-recursive clean-recursive distclean-recursive \ maintainer-clean-recursive: @failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ case "$@" in \ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ *) list='$(SUBDIRS)' ;; \ esac; \ rev=''; for subdir in $$list; do \ if test "$$subdir" = "."; then :; else \ rev="$$subdir $$rev"; \ fi; \ done; \ rev="$$rev ."; \ target=`echo $@ | sed s/-recursive//`; \ for subdir in $$rev; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done && test -z "$$fail" tags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ done ctags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ done ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ mkid -fID $$unique tags: TAGS TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) tags=; \ here=`pwd`; \ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ include_option=--etags-include; \ empty_fix=.; \ else \ include_option=--include; \ empty_fix=; \ fi; \ list='$(SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test ! 
-f $$subdir/TAGS || \ tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \ fi; \ done; \ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$tags $$unique; \ fi ctags: CTAGS CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) tags=; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ test -z "$(CTAGS_ARGS)$$tags$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$tags $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && cd $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) $$here distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) $(am__remove_distdir) mkdir $(distdir) $(mkdir_p) $(distdir)/build-aux @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ list='$(DISTFILES)'; for file in $$list; do \ case $$file in \ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ esac; \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ if test "$$dir" != "$$file" && test "$$dir" != "."; then \ dir="/$$dir"; \ $(mkdir_p) "$(distdir)$$dir"; \ else \ dir=''; \ fi; \ if test -d $$d/$$file; then \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ fi; \ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ else \ test -f $(distdir)/$$file \ || cp -p $$d/$$file $(distdir)/$$file \ || exit 1; \ fi; \ done list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test -d "$(distdir)/$$subdir" \ || $(mkdir_p) "$(distdir)/$$subdir" \ || exit 1; \ distdir=`$(am__cd) $(distdir) && pwd`; \ top_distdir=`$(am__cd) $(top_distdir) && pwd`; \ (cd $$subdir && \ $(MAKE) $(AM_MAKEFLAGS) \ top_distdir="$$top_distdir" \ distdir="$$distdir/$$subdir" \ distdir) \ || exit 1; \ fi; \ done -find $(distdir) -type d ! -perm -755 -exec chmod a+rwx,go+rx {} \; -o \ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \ || chmod -R a+r $(distdir) dist-gzip: distdir tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz $(am__remove_distdir) dist-bzip2: distdir tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2 $(am__remove_distdir) dist-tarZ: distdir tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z $(am__remove_distdir) dist-shar: distdir shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz $(am__remove_distdir) dist-zip: distdir -rm -f $(distdir).zip zip -rq $(distdir).zip $(distdir) $(am__remove_distdir) dist dist-all: distdir tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz $(am__remove_distdir) # This target untars the dist file and tries a VPATH configuration. 
Then # it guarantees that the distribution is self-contained by making another # tarfile. distcheck: dist case '$(DIST_ARCHIVES)' in \ *.tar.gz*) \ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\ *.tar.bz2*) \ bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\ *.tar.Z*) \ uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ *.shar.gz*) \ GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\ *.zip*) \ unzip $(distdir).zip ;;\ esac chmod -R a-w $(distdir); chmod a+w $(distdir) mkdir $(distdir)/_build mkdir $(distdir)/_inst chmod a-w $(distdir) dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ && cd $(distdir)/_build \ && ../configure --srcdir=.. --prefix="$$dc_install_base" \ $(DISTCHECK_CONFIGURE_FLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) \ && $(MAKE) $(AM_MAKEFLAGS) dvi \ && $(MAKE) $(AM_MAKEFLAGS) check \ && $(MAKE) $(AM_MAKEFLAGS) install \ && $(MAKE) $(AM_MAKEFLAGS) installcheck \ && $(MAKE) $(AM_MAKEFLAGS) uninstall \ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ distuninstallcheck \ && chmod -R a-w "$$dc_install_base" \ && ({ \ (cd ../.. && umask 077 && mkdir "$$dc_destdir") \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ } || { rm -rf "$$dc_destdir"; exit 1; }) \ && rm -rf "$$dc_destdir" \ && $(MAKE) $(AM_MAKEFLAGS) dist \ && rm -rf $(DIST_ARCHIVES) \ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck $(am__remove_distdir) @(echo "$(distdir) archives ready for distribution: "; \ list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ sed -e '1{h;s/./=/g;p;x;}' -e '$${p;x;}' distuninstallcheck: @cd $(distuninstallcheck_dir) \ && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \ || { echo "ERROR: files left after uninstall:" ; \ if test -n "$(DESTDIR)"; then \ echo " (check DESTDIR support)"; \ fi ; \ $(distuninstallcheck_listfiles) ; \ exit 1; } >&2 distcleancheck: distclean @if test '$(srcdir)' = . ; then \ echo "ERROR: distcleancheck can only run from a VPATH build" ; \ exit 1 ; \ fi @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ || { echo "ERROR: files left in build directory after distclean:" ; \ $(distcleancheck_listfiles) ; \ exit 1; } >&2 check-am: all-am check: check-recursive all-am: Makefile config.h installdirs: installdirs-recursive installdirs-am: install: install-recursive install-exec: install-exec-recursive install-data: install-data-recursive uninstall: uninstall-recursive install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-recursive install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-recursive clean-am: clean-generic mostlyclean-am distclean: distclean-recursive -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm -f Makefile distclean-am: clean-am distclean-generic distclean-hdr distclean-tags dvi: dvi-recursive dvi-am: html: html-recursive info: info-recursive info-am: install-data-am: install-exec-am: install-info: install-info-recursive install-man: installcheck-am: maintainer-clean: maintainer-clean-recursive -rm -f $(am__CONFIG_DISTCLEAN_FILES) -rm -rf $(top_srcdir)/autom4te.cache -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-recursive mostlyclean-am: mostlyclean-generic pdf: pdf-recursive pdf-am: ps: ps-recursive ps-am: uninstall-am: uninstall-info-am uninstall-info: uninstall-info-recursive .PHONY: $(RECURSIVE_TARGETS) CTAGS GTAGS all all-am am--refresh check \ check-am clean clean-generic clean-recursive ctags \ ctags-recursive dist dist-all dist-bzip2 dist-gzip dist-shar \ dist-tarZ dist-zip distcheck distclean distclean-generic \ distclean-hdr distclean-recursive distclean-tags \ distcleancheck distdir distuninstallcheck dvi dvi-am html \ html-am info info-am install install-am install-data \ install-data-am install-exec install-exec-am install-info \ install-info-am install-man install-strip installcheck \ installcheck-am installdirs installdirs-am maintainer-clean \ maintainer-clean-generic maintainer-clean-recursive \ mostlyclean mostlyclean-generic mostlyclean-recursive pdf \ pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \ uninstall-info-am .PHONY: FORCE # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: tophat-2.0.9/ax_boost_thread.m40000644000175000017500000001452212122334411015155 0ustar toortoor# =========================================================================== # http://autoconf-archive.cryp.to/ax_boost_thread.html # =========================================================================== # # SYNOPSIS # # AX_BOOST_THREAD # # DESCRIPTION # # Test for Thread library from the Boost C++ libraries. The macro requires # a preceding call to AX_BOOST_BASE. Further documentation is available at # . # # This macro calls: # # AC_SUBST(BOOST_THREAD_LIB) # AC_SUBST(BOOST_SYSTEM_LIB) # # And sets: # # HAVE_BOOST_THREAD # # LICENSE # # Copyright (c) 2009 Thomas Porschberg # Copyright (c) 2009 Michael Tindal # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. AC_DEFUN([AX_BOOST_THREAD], [ AC_ARG_WITH([boost-thread], AS_HELP_STRING([--with-boost-thread@<:@=special-lib@:>@], [use the Thread library from boost - it is possible to specify a certain library for the linker e.g. 
--with-boost-thread=boost_thread-gcc-mt ]), [ if test "$withval" = "no"; then want_boost="no" elif test "$withval" = "yes"; then want_boost="yes" ax_boost_user_thread_lib="" ax_booth_user_system_lib="" else want_boost="yes" echo "using $withval" ax_boost_user_thread_lib="$withval" fi ], [want_boost="yes"] ) if test "x$want_boost" = "xyes"; then AC_REQUIRE([AC_PROG_CC]) AC_REQUIRE([AC_CANONICAL_BUILD]) CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS AC_CACHE_CHECK(whether the Boost::Thread library is available, ax_cv_boost_thread, [AC_LANG_PUSH([C++]) CXXFLAGS_SAVE=$CXXFLAGS if test "x$build_os" = "xsolaris" ; then CXXFLAGS="-pthreads $CXXFLAGS" elif test "x$build_os" = "xming32" ; then CXXFLAGS="-mthreads $CXXFLAGS" else CXXFLAGS="-pthread $CXXFLAGS" fi AC_COMPILE_IFELSE(AC_LANG_PROGRAM([[@%:@include ]], [[boost::thread_group thrds; return 0;]]), ax_cv_boost_thread=yes, ax_cv_boost_thread=no) CXXFLAGS=$CXXFLAGS_SAVE AC_LANG_POP([C++]) ]) if test "x$ax_cv_boost_thread" = "xyes"; then if test "x$build_os" = "xsolaris" ; then BOOST_CPPFLAGS="-pthreads $BOOST_CPPFLAGS" elif test "x$build_os" = "xming32" ; then BOOST_CPPFLAGS="-mthreads $BOOST_CPPFLAGS" else BOOST_CPPFLAGS="-pthread $BOOST_CPPFLAGS" fi AC_SUBST(BOOST_CPPFLAGS) AC_DEFINE(HAVE_BOOST_THREAD,,[define if the Boost::Thread library is available]) BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` LDFLAGS_SAVE=$LDFLAGS case "x$build_os" in *bsd* ) LDFLAGS="-pthread $LDFLAGS" break; ;; esac if test "x$ax_boost_user_thread_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_thread*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_thread.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_thread*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_thread.*\)\.a*$;\1;'`; do ax_lib=${libextension} AC_CHECK_LIB($ax_lib, exit, [BOOST_THREAD_LIB="-l$ax_lib"; AC_SUBST(BOOST_THREAD_LIB) link_thread="yes"; break], [link_thread="no"]) done if test "x$link_thread" != "xyes"; then for libextension in `ls $BOOSTLIBDIR/boost_thread*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_thread.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_thread*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_thread.*\)\.a*$;\1;'` ; do ax_lib=${libextension} AC_CHECK_LIB($ax_lib, exit, [BOOST_THREAD_LIB="-l$ax_lib"; AC_SUBST(BOOST_THREAD_LIB) link_thread="yes"; break], [link_thread="no"]) done fi else BOOST_THREAD_LIB="$ax_boost_user_thread_lib"; AC_SUBST(BOOST_THREAD_LIB) link_thread="yes"; fi if test "x$link_thread" = "xno"; then AC_MSG_ERROR(Could not link against $ax_lib !) 
else case "x$build_os" in *bsd* ) BOOST_LDFLAGS="-pthread $BOOST_LDFLAGS" break; ;; esac fi if test "x$ax_boost_user_system_lib" = "x"; then for libextension in `ls $BOOSTLIBDIR/libboost_system*.so* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_system.*\)\.so.*$;\1;'` `ls $BOOSTLIBDIR/libboost_system*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^lib\(boost_system.*\)\.a*$;\1;'`; do ax_lib=${libextension} AC_CHECK_LIB($ax_lib, exit, [BOOST_SYSTEM_LIB="-l$ax_lib"; AC_SUBST(BOOST_SYSTEM_LIB) link_system="yes"; break], [link_system="no"]) done if test "x$link_system" != "xyes"; then for libextension in `ls $BOOSTLIBDIR/boost_system*.dll* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_system.*\)\.dll.*$;\1;'` `ls $BOOSTLIBDIR/libboost_system*.a* 2>/dev/null | sed 's,.*/,,' | sed -e 's;^\(boost_system.*\)\.a*$;\1;'` ; do ax_lib=${libextension} AC_CHECK_LIB($ax_lib, exit, [BOOST_SYSTEM_LIB="-l$ax_lib"; AC_SUBST(BOOST_SYSTEM_LIB) link_system="yes"; break], [link_system="no"]) done fi else BOOST_SYSTEM_LIB="$ax_boost_user_system_lib"; AC_SUBST(BOOST_SYSTEM_LIB) link_system="yes"; fi fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi ]) tophat-2.0.9/ax_boost_base.m40000644000175000017500000001670512122334411014625 0ustar toortoor# =========================================================================== # http://autoconf-archive.cryp.to/ax_boost_base.html # =========================================================================== # # SYNOPSIS # # AX_BOOST_BASE([MINIMUM-VERSION]) # # DESCRIPTION # # Test for the Boost C++ libraries of a particular version (or newer) # # If no path to the installed boost library is given the macro searchs # under /usr, /usr/local, /opt and /opt/local and evaluates the # $BOOST_ROOT environment variable. Further documentation is available at # . # # This macro calls: # # AC_SUBST(BOOST_CPPFLAGS) / AC_SUBST(BOOST_LDFLAGS) # # And sets: # # HAVE_BOOST # # LICENSE # # Copyright (c) 2008 Thomas Porschberg # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. AC_DEFUN([AX_BOOST_BASE], [ AC_ARG_WITH([boost], AS_HELP_STRING([--with-boost@<:@=DIR@:>@], [use boost (default is yes) - it is possible to specify the root directory for boost (optional)]), [ if test "$withval" = "no"; then want_boost="no" elif test "$withval" = "yes"; then want_boost="yes" ac_boost_path="" else want_boost="yes" ac_boost_path="$withval" fi ], [want_boost="yes"]) AC_ARG_WITH([boost-libdir], AS_HELP_STRING([--with-boost-libdir=LIB_DIR], [Force given directory for boost libraries. 
Note that this will overwrite library path detection, so use this parameter only if default library detection fails and you know exactly where your boost libraries are located.]), [ if test -d $withval then ac_boost_lib_path="$withval" else AC_MSG_ERROR(--with-boost-libdir expected directory name) fi ], [ac_boost_lib_path=""] ) if test "x$want_boost" = "xyes"; then boost_lib_version_req=ifelse([$1], ,1.20.0,$1) boost_lib_version_req_shorten=`expr $boost_lib_version_req : '\([[0-9]]*\.[[0-9]]*\)'` boost_lib_version_req_major=`expr $boost_lib_version_req : '\([[0-9]]*\)'` boost_lib_version_req_minor=`expr $boost_lib_version_req : '[[0-9]]*\.\([[0-9]]*\)'` boost_lib_version_req_sub_minor=`expr $boost_lib_version_req : '[[0-9]]*\.[[0-9]]*\.\([[0-9]]*\)'` if test "x$boost_lib_version_req_sub_minor" = "x" ; then boost_lib_version_req_sub_minor="0" fi WANT_BOOST_VERSION=`expr $boost_lib_version_req_major \* 100000 \+ $boost_lib_version_req_minor \* 100 \+ $boost_lib_version_req_sub_minor` AC_MSG_CHECKING(for boostlib >= $boost_lib_version_req) succeeded=no dnl first we check the system location for boost libraries dnl this location ist chosen if boost libraries are installed with the --layout=system option dnl or if you install boost with RPM if test "$ac_boost_path" != ""; then BOOST_LDFLAGS="-L$ac_boost_path/lib" BOOST_CPPFLAGS="-I$ac_boost_path/include" else for ac_boost_path_tmp in /usr /usr/local /opt /opt/local ; do if test -d "$ac_boost_path_tmp/include/boost" && test -r "$ac_boost_path_tmp/include/boost"; then BOOST_LDFLAGS="-L$ac_boost_path_tmp/lib" BOOST_CPPFLAGS="-I$ac_boost_path_tmp/include" break; fi done fi dnl overwrite ld flags if we have required special directory with dnl --with-boost-libdir parameter if test "$ac_boost_lib_path" != ""; then BOOST_LDFLAGS="-L$ac_boost_lib_path" fi CPPFLAGS_SAVED="$CPPFLAGS" CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS_SAVED="$LDFLAGS" LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS AC_LANG_PUSH(C++) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ @%:@include ]], [[ #if BOOST_VERSION >= $WANT_BOOST_VERSION // Everything is okay #else # error Boost version is too old #endif ]])],[ AC_MSG_RESULT(yes) succeeded=yes found_system=yes ],[ ]) AC_LANG_POP([C++]) dnl if we found no boost with system layout we search for boost libraries dnl built and installed without the --layout=system option or for a staged(not installed) version if test "x$succeeded" != "xyes"; then _version=0 if test "$ac_boost_path" != ""; then if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp fi VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` BOOST_CPPFLAGS="-I$ac_boost_path/include/boost-$VERSION_UNDERSCORE" done fi else for ac_boost_path in /usr /usr/local /opt /opt/local ; do if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's/\/include\/boost-//' | sed 's/_/./'` V_CHECK=`expr $_version_tmp \> $_version` if test "$V_CHECK" = "1" ; then _version=$_version_tmp best_path=$ac_boost_path fi done fi done VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` boost_major_version=`echo "$VERSION_UNDERSCORE" | sed 's/_//;s/_.*//'` 
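dnl e.g. (illustrative): a directory include/boost-1_46 yields _version=1.46, VERSION_UNDERSCORE=1_46 and boost_major_version=146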
BOOST_CPPFLAGS="-I$best_path/include/boost-$VERSION_UNDERSCORE" if test "$ac_boost_lib_path" = "" then BOOST_LDFLAGS="-L$best_path/lib" fi if test "x$BOOST_ROOT" != "x"; then if test -d "$BOOST_ROOT" && test -r "$BOOST_ROOT" && test -d "$BOOST_ROOT/stage/lib" && test -r "$BOOST_ROOT/stage/lib"; then version_dir=`expr //$BOOST_ROOT : '.*/\(.*\)'` stage_version=`echo $version_dir | sed 's/boost_//' | sed 's/_/./g'` stage_version_shorten=`expr $stage_version : '\([[0-9]]*\.[[0-9]]*\)'` V_CHECK=`expr $stage_version_shorten \>\= $_version` if test "$V_CHECK" = "1" -a "$ac_boost_lib_path" = "" ; then AC_MSG_NOTICE(We will use a staged boost library from $BOOST_ROOT) BOOST_CPPFLAGS="-I$BOOST_ROOT" BOOST_LDFLAGS="-L$BOOST_ROOT/stage/lib" fi fi fi fi CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" export CPPFLAGS LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" export LDFLAGS AC_LANG_PUSH(C++) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ @%:@include ]], [[ #if BOOST_VERSION >= $WANT_BOOST_VERSION // Everything is okay #else # error Boost version is too old #endif ]])],[ AC_MSG_RESULT(yes) succeeded=yes found_system=yes ],[ ]) AC_LANG_POP([C++]) fi if test "$succeeded" != "yes" ; then if test "$_version" = "0" ; then AC_MSG_ERROR([[We could not detect the boost libraries (version $boost_lib_version_req_shorten or higher). If you have a staged boost library (still not installed) please specify \$BOOST_ROOT in your environment and do not give a PATH to --with-boost option. If you are sure you have boost installed, then check your version number looking in . See http://randspringer.de/boost for more documentation.]]) else AC_MSG_NOTICE([Your boost libraries seem to old (version $_version).]) fi else AC_SUBST(BOOST_CPPFLAGS) AC_SUBST(BOOST_LDFLAGS) AC_DEFINE(HAVE_BOOST,,[define if the Boost library is available]) fi CPPFLAGS="$CPPFLAGS_SAVED" LDFLAGS="$LDFLAGS_SAVED" fi ]) tophat-2.0.9/INSTALL0000644000175000017500000002240612122334411012577 0ustar toortoorInstallation Instructions ************************* Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005 Free Software Foundation, Inc. This file is free documentation; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. Basic Installation ================== These are generic installation instructions. The `configure' shell script attempts to guess correct values for various system-dependent variables used during compilation. It uses those values to create a `Makefile' in each directory of the package. It may also create one or more `.h' files containing system-dependent definitions. Finally, it creates a shell script `config.status' that you can run in the future to recreate the current configuration, and a file `config.log' containing compiler output (useful mainly for debugging `configure'). It can also use an optional file (typically called `config.cache' and enabled with `--cache-file=config.cache' or simply `-C') that saves the results of its tests to speed up reconfiguring. (Caching is disabled by default to prevent problems with accidental use of stale cache files.) If you need to do unusual things to compile the package, please try to figure out how `configure' could check whether to do them, and mail diffs or instructions to the address given in the `README' so they can be considered for the next release. If you are using the cache, and at some point `config.cache' contains results you don't want to keep, you may remove or edit it. 
The file `configure.ac' (or `configure.in') is used to create `configure' by a program called `autoconf'. You only need `configure.ac' if you want to change it or regenerate `configure' using a newer version of `autoconf'. The simplest way to compile this package is: 1. `cd' to the directory containing the package's source code and type `./configure' to configure the package for your system. If you're using `csh' on an old version of System V, you might need to type `sh ./configure' instead to prevent `csh' from trying to execute `configure' itself. Running `configure' takes awhile. While running, it prints some messages telling which features it is checking for. 2. Type `make' to compile the package. 3. Optionally, type `make check' to run any self-tests that come with the package. 4. Type `make install' to install the programs and any data files and documentation. 5. You can remove the program binaries and object files from the source code directory by typing `make clean'. To also remove the files that `configure' created (so you can compile the package for a different kind of computer), type `make distclean'. There is also a `make maintainer-clean' target, but that is intended mainly for the package's developers. If you use it, you may have to get all sorts of other programs in order to regenerate files that came with the distribution. Compilers and Options ===================== Some systems require unusual options for compilation or linking that the `configure' script does not know about. Run `./configure --help' for details on some of the pertinent environment variables. You can give `configure' initial values for configuration parameters by setting variables in the command line or in the environment. Here is an example: ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix *Note Defining Variables::, for more details. Compiling For Multiple Architectures ==================================== You can compile the package for more than one kind of computer at the same time, by placing the object files for each architecture in their own directory. To do this, you must use a version of `make' that supports the `VPATH' variable, such as GNU `make'. `cd' to the directory where you want the object files and executables to go and run the `configure' script. `configure' automatically checks for the source code in the directory that `configure' is in and in `..'. If you have to use a `make' that does not support the `VPATH' variable, you have to compile the package for one architecture at a time in the source code directory. After you have installed the package for one architecture, use `make distclean' before reconfiguring for another architecture. Installation Names ================== By default, `make install' will install the package's files in `/usr/local/bin', `/usr/local/man', etc. You can specify an installation prefix other than `/usr/local' by giving `configure' the option `--prefix=PREFIX'. You can specify separate installation prefixes for architecture-specific files and architecture-independent files. If you give `configure' the option `--exec-prefix=PREFIX', the package will use PREFIX as the prefix for installing programs and libraries. Documentation and other data files will still use the regular prefix. In addition, if you use an unusual directory layout you can give options like `--bindir=DIR' to specify different values for particular kinds of files. Run `configure --help' for a list of the directories you can set and what kinds of files go in them. 
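   For example (the directory shown is purely illustrative), to keep an installation under your home directory you might run:

     ./configure --prefix=$HOME/tophat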
If the package supports it, you can cause programs to be installed with an extra prefix or suffix on their names by giving `configure' the option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. Optional Features ================= Some packages pay attention to `--enable-FEATURE' options to `configure', where FEATURE indicates an optional part of the package. They may also pay attention to `--with-PACKAGE' options, where PACKAGE is something like `gnu-as' or `x' (for the X Window System). The `README' should mention any `--enable-' and `--with-' options that the package recognizes. For packages that use the X Window System, `configure' can usually find the X include and library files automatically, but if it doesn't, you can use the `configure' options `--x-includes=DIR' and `--x-libraries=DIR' to specify their locations. Specifying the System Type ========================== There may be some features `configure' cannot figure out automatically, but needs to determine by the type of machine the package will run on. Usually, assuming the package is built to be run on the _same_ architectures, `configure' can figure that out, but if it prints a message saying it cannot guess the machine type, give it the `--build=TYPE' option. TYPE can either be a short name for the system type, such as `sun4', or a canonical name which has the form: CPU-COMPANY-SYSTEM where SYSTEM can have one of these forms: OS KERNEL-OS See the file `config.sub' for the possible values of each field. If `config.sub' isn't included in this package, then this package doesn't need to know the machine type. If you are _building_ compiler tools for cross-compiling, you should use the `--target=TYPE' option to select the type of system they will produce code for. If you want to _use_ a cross compiler, that generates code for a platform different from the build platform, you should specify the "host" platform (i.e., that on which the generated programs will eventually be run) with `--host=TYPE'. Sharing Defaults ================ If you want to set default values for `configure' scripts to share, you can create a site shell script called `config.site' that gives default values for variables like `CC', `cache_file', and `prefix'. `configure' looks for `PREFIX/share/config.site' if it exists, then `PREFIX/etc/config.site' if it exists. Or, you can set the `CONFIG_SITE' environment variable to the location of the site script. A warning: not all `configure' scripts look for a site script. Defining Variables ================== Variables not defined in a site shell script can be set in the environment passed to `configure'. However, some packages may run configure again during the build, and the customized values of these variables may be lost. In order to avoid this problem, you should set them in the `configure' command line, using `VAR=value'. For example: ./configure CC=/usr/local2/bin/gcc causes the specified `gcc' to be used as the C compiler (unless it is overridden in the site shell script). Here is a another example: /bin/bash ./configure CONFIG_SHELL=/bin/bash Here the `CONFIG_SHELL=/bin/bash' operand causes subsequent configuration-related scripts to be executed by `/bin/bash'. `configure' Invocation ====================== `configure' recognizes the following options to control how it operates. `--help' `-h' Print a summary of the options to `configure', and exit. `--version' `-V' Print the version of Autoconf used to generate the `configure' script, and exit. 
`--cache-file=FILE' Enable the cache: use and save the results of the tests in FILE, traditionally `config.cache'. FILE defaults to `/dev/null' to disable caching. `--config-cache' `-C' Alias for `--cache-file=config.cache'. `--quiet' `--silent' `-q' Do not print messages saying which checks are being made. To suppress all normal output, redirect it to `/dev/null' (any error messages will still be shown). `--srcdir=DIR' Look for the package's source code in directory DIR. Usually `configure' can determine that directory automatically. `configure' also accepts some other, not widely useful, options. Run `configure --help' for more details. tophat-2.0.9/README0000644000175000017500000000263012122334411012423 0ustar toortoorTopHat is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. TopHat is a collaborative effort between the Institute of Genetic Medicine at Johns Hopkins University, the Department of Mathematics at the University of California, Berkeley, and the Department of Stem Cell and Regenerative Biology at Harvard University. See http://tophat.cbcb.umd.edu for more information. Notes on compiling the package from source: ------------------------------------------- TopHat requires the Samtools package (http://samtools.sourceforge.net/) in order to generate and handle the compressed, binary alignment files (BAM). The --with-bam= option of the configure script expects the following subdirectories and files: /include/bam/*.h (all the header files from Samtools) /lib/libbam.a (obtained by compiling Samtools) At the time of this writing the Samtools package does not have an install routine the user will have to explicitly create the above directory structure (./include/bam/ and ./lib) and copy the required Samtools files as indicated (the header files and libbam.a) TopHat also requires the Boost libraries (http://www.boost.org). Please refer to the TopHat webpage for installation information. tophat-2.0.9/Makefile.am0000644000175000017500000000016312122334411013576 0ustar toortoor ALWAYS_BUILT = src SUBDIRS = $(ALWAYS_BUILT) DIST_SUBDIRS = $(ALWAYS_BUILT) EXTRA_DIST = LICENSE .PHONY: FORCE tophat-2.0.9/ChangeLog0000644000175000017500000000000012122334411013302 0ustar toortoortophat-2.0.9/config.h.in0000644000175000017500000000577212157116245013613 0ustar toortoor/* config.h.in. Generated from configure.ac by autoheader. */ /* define if the BAM library is available */ #undef HAVE_BAM /* define if the Boost library is available */ #undef HAVE_BOOST /* define if the Boost::Thread library is available */ #undef HAVE_BOOST_THREAD /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H /* Define to 1 if you have the `z' library (-lz). */ #undef HAVE_LIBZ /* Define to 1 if your system has a GNU libc compatible `malloc' function, and to 0 otherwise. */ #undef HAVE_MALLOC /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H /* Define to 1 if you have the `memset' function. */ #undef HAVE_MEMSET /* Define to 1 if the system has the type `ptrdiff_t'. */ #undef HAVE_PTRDIFF_T /* Define to 1 if stdbool.h conforms to C99. */ #undef HAVE_STDBOOL_H /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H /* Define to 1 if you have the `strdup' function. 
*/ #undef HAVE_STRDUP /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H /* Define to 1 if you have the `strrchr' function. */ #undef HAVE_STRRCHR /* Define to 1 if you have the `strsep' function. */ #undef HAVE_STRSEP /* Define to 1 if you have the `strtol' function. */ #undef HAVE_STRTOL /* Define to 1 if you have the header file. */ #undef HAVE_SYS_STAT_H /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL /* Name of package */ #undef PACKAGE /* Define to the address where bug reports for this package should be sent. */ #undef PACKAGE_BUGREPORT /* Define to the full name of this package. */ #undef PACKAGE_NAME /* Define to the full name and version of this package. */ #undef PACKAGE_STRING /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME /* Define to the version of this package. */ #undef PACKAGE_VERSION /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS /* SVN Revision */ #undef SVN_REVISION /* Version number of package */ #undef VERSION /* Number of bits in a file offset, on hosts where this is settable. */ #undef _FILE_OFFSET_BITS /* Define for large files, on AIX-style hosts. */ #undef _LARGE_FILES /* Define to empty if `const' does not conform to ANSI C. */ #undef const /* Define to `__inline__' or `__inline' if that's what the C compiler calls it, or to nothing if 'inline' is not supported under any name. */ #ifndef __cplusplus #undef inline #endif /* Define to rpl_malloc if the replacement function should be used. */ #undef malloc /* Define to `int' if does not define. */ #undef pid_t /* Define to `unsigned' if does not define. */ #undef size_t tophat-2.0.9/src/0000755000175000017500000000000012165261277012351 5ustar toortoortophat-2.0.9/src/bwt_map.cpp0000644000175000017500000020211312122334363014473 0ustar toortoor/* * bwt_map.cpp * TopHat * * Created by Cole Trapnell on 11/17/08. * Copyright 2008 Cole Trapnell. All rights reserved. * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "tokenize.h" #include "reads.h" #include "align_status.h" int gap_length(const vector& cigar) { int edit_dist = 0; for (size_t i = 0; i < cigar.size(); ++i) { const CigarOp& c = cigar[i]; if (c.opcode == INS || c.opcode == iNS || c.opcode == DEL || c.opcode == dEL) edit_dist += c.length; } return edit_dist; } void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness) { uint32_t reference_id = bh.ref_id(); pair ret = _hits_for_ref.insert(make_pair(reference_id, HitList())); HitList& hl = ret.first->second; if (check_uniqueness) { // Check uniqueness, in case we are adding spliced hits from // several spliced alignment sources (e.g. de novo hashing + Bowtie // against a user-supplied index). 
We don't want to count the same // alignment twice if it happened to be found by more than one method HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); for (; lb != ub && lb != hl.end(); ++lb) { if (*lb == bh) { //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id()); return; } if (lb->insert_id() == bh.insert_id() && lb->ref_id() == bh.ref_id() && lb->antisense_align() == bh.antisense_align()) { // If we get here, we may be looking at the same alignment // However, spanning_reads may report a shorter, trimmed alignment // so not all fields will be equal. If they just disagree on the // ends, and don't indicate a different junction coord, the // alignments are the same. if ((lb->left() <= bh.left() && lb->right() >= bh.right()) || (bh.left() <= lb->left() && bh.right() >= lb->right())) { vector > lb_gaps, bh_gaps; lb->gaps(lb_gaps); bh.gaps(bh_gaps); if (lb_gaps == bh_gaps) { // One alignment is contained in the other, they agree on // where the gaps, if any, are, and they share an id // => this is a redundant aligment, so toss it //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id()); return; } } } } } _total_hits++; hl.push_back(bh); } bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2) { return h1.insert_id() < h2.insert_id(); } void LineHitFactory::openStream(HitStream& hs) { if (hs._hit_file==NULL && !hs._hit_file_name.empty()) { //open the file for HitStream here hs._hit_file=fopen(hs._hit_file_name.c_str(),"r"); if (hs._hit_file==NULL) err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str()); return; } if (hs._fzpipe!=NULL) { hs._hit_file=hs._fzpipe->file; } } void LineHitFactory::rewind(HitStream& hs) { if (hs._fzpipe!=NULL) { hs._fzpipe->rewind(); hs._hit_file=hs._fzpipe->file; } else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file)); } void LineHitFactory::seek(HitStream& hs, int64_t offset) { // daehwan - implement this later if (hs._fzpipe != NULL) { hs._fzpipe->seek(offset); hs._hit_file=hs._fzpipe->file; } // else if (hs._hit_file) ::seek((FILE*)(hs._hit_file)); } bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { FILE* f=(FILE *)(hs._hit_file); bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL); if (!new_rec || feof(f)) { hs._eof=true; return false; } ++_line_num; char* nl = strrchr(_hit_buf, '\n'); if (nl) *nl = 0; buf = _hit_buf; buf_size = _hit_buf_max_sz - 1; return true; } void LineHitFactory::closeStream(HitStream& hs) { if (hs._fzpipe!=NULL) { hs._fzpipe->close(); return; } if (hs._hit_file!=NULL) { fclose((FILE*)(hs._hit_file)); hs._hit_file=NULL; } } void BAMHitFactory::openStream(HitStream& hs) { if (hs._hit_file==NULL) { if (hs._hit_file_name.empty()) //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n"); return; //invalid stream, could be just a place holder //open the file here if not already open string fext=getFext(hs._hit_file_name); if (fext=="sam") hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0); else hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0); samfile_t* sam_file=(samfile_t*)(hs._hit_file); if (sam_file == NULL) err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str()); if (sam_file->header == NULL) err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str()); memset(&_next_hit, 0, sizeof(_next_hit)); //_beginning = 
bgzf_tell(sam_file->x.bam); if (inspect_header(hs) == false) err_die("Error: invalid SAM header for file %s\n", hs._hit_file_name.c_str()); if (_sam_header) { bam_header_destroy(sam_file->header); sam_file->header = _sam_header; _sam_header_destroyed = true; } else { _sam_header = sam_file->header; } } } void BAMHitFactory::closeStream(HitStream& hs) { if (hs._hit_file) { if (_sam_header_destroyed) { ((samfile_t*)(hs._hit_file))->header = NULL; } samclose((samfile_t*)(hs._hit_file)); } hs._hit_file=NULL; _sam_header=NULL; } void BAMHitFactory::rewind(HitStream& hs) { /* if (_hit_file && ((samfile_t*)_hit_file)->x.bam) { bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET); _eof = false; } */ this->closeStream(hs); this->openStream(hs); } void BAMHitFactory::seek(HitStream& hs, int64_t offset) { if (hs._hit_file) { bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET); } } string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) { const bam1_t* bamrec=(const bam1_t*)hit_buf; char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec); string sam_line(tamline); free(tamline); return sam_line; } bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { if (_next_hit.data) { free(_next_hit.data); _next_hit.data = NULL; } if (_sam_header == NULL) _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on if (hs.eof() || !hs.ready()) return false; //mark_curr_pos(); memset(&_next_hit, 0, sizeof(_next_hit)); int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit); if (bytes_read <= 0) { hs._eof = true; return false; } buf = (const char*)&_next_hit; buf_size = bytes_read; return true; } BowtieHit HitFactory::create_hit(const string& insert_name, const string& ref_name, const string& ref_name2, int left, const vector& cigar, bool antisense_aln, bool antisense_splice, unsigned char mismatches, unsigned char edit_dist, unsigned char splice_mms, bool end) { uint64_t insert_id = _insert_table.get_id(insert_name); uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); uint32_t reference_id2 = reference_id; if (ref_name2.length() > 0) reference_id2 = _ref_table.get_id(ref_name2, NULL, 0); return BowtieHit(reference_id, reference_id2, insert_id, left, cigar, antisense_aln, antisense_splice, mismatches, edit_dist, splice_mms, end); } BowtieHit HitFactory::create_hit(const string& insert_name, const string& ref_name, uint32_t left, uint32_t read_len, bool antisense_aln, unsigned char mismatches, unsigned char edit_dist, bool end) { uint64_t insert_id = _insert_table.get_id(insert_name); uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); return BowtieHit(reference_id, reference_id, insert_id, left, read_len, antisense_aln, mismatches, edit_dist, end); } int anchor_mismatch = 0; void parseSegReadName(char* name, char*& name_tags, bool strip_slash, bool &end, unsigned int &seg_offset, unsigned int& seg_num, unsigned int & num_segs) { char* pipe = strrchr(name, '|'); if (pipe) { if (name_tags) strcpy(name_tags, pipe); char* tag_buf = pipe + 1; if (strchr(tag_buf, ':')) { sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); if (seg_num + 1 == num_segs) end = true; else end = false; } *pipe = 0; } // Stripping the slash and number following it gives the insert name char* slash = strrchr(name, '/'); if (strip_slash && slash) *slash = 0; } int parseCigar(vector& cigar, const char* cigar_str, bool &spliced_alignment) { const char* p_cig = cigar_str; int refspan=0; //alignment span 
on reference sequence while (*p_cig) { char* t; int op_len = (int)strtol(p_cig, &t, 10); if (op_len <= 0) { fprintf (stderr, "Error: CIGAR op has zero length\n"); return 0; } char op_char = toupper(*t); CigarOpCode opcode; switch (op_char) { case '=': case 'X': case 'M': opcode = MATCH; refspan+=op_len; break; case 'I': opcode = INS; break; case 'D': opcode = DEL; refspan+=op_len; break; case 'N': if (op_len > max_report_intron_length) return 0; opcode = REF_SKIP; spliced_alignment = true; refspan+=op_len; break; case 'S': opcode = SOFT_CLIP; break; case 'H': opcode = HARD_CLIP; break; case 'P': opcode = PAD; break; default: fprintf (stderr, "Error: invalid CIGAR operation\n"); return 0; } p_cig = t + 1; cigar.push_back(CigarOp(opcode, op_len)); } //while cigar codes if (*p_cig) { fprintf (stderr, "Error: unmatched CIGAR operation (%s in %s)\n", p_cig, cigar_str); return 0; } return refspan; } int getBAMmismatches(const bam1_t* buf, vector& cigar, vector& mismatches, int& sam_nm, bool& antisense_splice) { int gspan=0;//genomic span of the alignment sam_nm=0; int num_mismatches=0; uint8_t* ptr = bam_aux_get(buf, "XS"); if (ptr) { char src_strand_char = bam_aux2A(ptr); if (src_strand_char == '-') antisense_splice = true; } ptr = bam_aux_get(buf, "MD"); if (ptr) { const char* p = bam_aux2Z(ptr); int bi=0; //base offset position in the read while (*p != 0) { if (isdigit(*p)) { int v=atoi(p); do { p++; } while (isdigit(*p)); bi+=v; } while (isalpha(*p)) { p++; num_mismatches++; //mismatches.push_back(bi); mismatches[bi]=true; bi++; } if (*p=='^') { //reference deletion p++; while (isalpha(*p)) { //insert read bases p++; bi++; } } } } /* By convention,the NM field of the SAM record * counts an insertion or deletion. I dont' think * we want the mismatch count in the BowtieHit * record to reflect this. Therefore, subtract out * the mismatches due to in/dels */ for(vector::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ switch (itr->opcode) { case MATCH: case REF_SKIP: case PAD: gspan += itr->length; break; case DEL: gspan += itr->length; sam_nm -= itr->length; break; case INS: sam_nm -= itr->length; break; default: break; } } return num_mismatches; } int getSAMmismatches(char* &buf, vector& cigar, vector& mismatches, int& sam_nm, bool& antisense_splice) { int gspan=0;//genomic span of the alignment const char* tag_buf = buf; sam_nm=0; int num_mismatches=0; while((tag_buf = get_token((char**)&buf,"\t"))) { vector tuple_fields; tokenize(tag_buf,":", tuple_fields); if (tuple_fields.size() == 3) { if (tuple_fields[0] == "XS") { if (tuple_fields[2] == "-") antisense_splice = true; } else if (tuple_fields[0] == "NM") { sam_nm = atoi(tuple_fields[2].c_str()); } else if (tuple_fields[0] == "NS") { //ignored for now } else if (tuple_fields[0] == "MD") { const char* p=tuple_fields[2].c_str(); int bi=0; //base offset position in the read while (*p != 0) { if (isdigit(*p)) { int v=atoi(p); do { p++; } while (isdigit(*p)); bi+=v; } while (isalpha(*p)) { p++; num_mismatches++; //mismatches.push_back(bi); mismatches[bi]=true; bi++; } if (*p=='^') { //reference deletion p++; while (isalpha(*p)) { //insert read bases p++; bi++; } } } } //else //{ //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); //return false; //} } } /* By convention,the NM field of the SAM record * counts an insertion or deletion. I dont' think * we want the mismatch count in the BowtieHit * record to reflect this. 
Therefore, subtract out * the mismatches due to in/dels */ for(vector::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ switch (itr->opcode) { case MATCH: case REF_SKIP: case PAD: gspan += itr->length; break; case DEL: gspan += itr->length; sam_nm -= itr->length; break; case INS: sam_nm -= itr->length; break; default: break; } } return num_mismatches; } bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out, char* name_tags, char* seq, char* qual) { if (!orig_bwt_buf || !*orig_bwt_buf) return false; char bwt_buf[2048]; strcpy(bwt_buf, orig_bwt_buf); // Are we still in the header region? if (bwt_buf[0] == '@') return false; char* buf = bwt_buf; char* name = get_token((char**)&buf,"\t"); char* sam_flag_str = get_token((char**)&buf,"\t"); char* text_name = get_token((char**)&buf,"\t"); char* text_offset_str = get_token((char**)&buf,"\t"); const char* map_qual_str = get_token((char**)&buf,"\t"); char* cigar_str = get_token((char**)&buf,"\t"); const char* mate_ref_str = get_token((char**)&buf,"\t"); const char* mate_pos_str = get_token((char**)&buf,"\t"); const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); const char* seq_str = get_token((char**)&buf,"\t"); if (seq) strcpy(seq, seq_str); const char* qual_str = get_token((char**)&buf,"\t"); if (qual) strcpy(qual, qual_str); if (!name || !sam_flag_str || !text_name || !text_offset_str || !map_qual_str || !cigar_str || !mate_ref_str || !mate_pos_str || !inferred_insert_sz_str || !seq_str || !qual_str) { // truncated or malformed SAM record return false; } int sam_flag = atoi(sam_flag_str); string ref_name = text_name, ref_name2 = ""; int text_offset = atoi(text_offset_str); bool end = true; unsigned int seg_offset = 0; unsigned int seg_num = 0; unsigned int num_segs = 0; // Copy the tag out of the name field before we might wipe it out parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); vector cigar; bool spliced_alignment = false; int refspan=parseCigar(cigar, cigar_str, spliced_alignment); if (refspan==0) return false; //vector attributes; //tokenize(tag_buf, " \t",attributes); bool antisense_splice = false; int sam_nm = 0; //the value of the NM tag (edit distance) //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read) vector mismatches; mismatches.resize(strlen(seq_str), false); int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice); int edit_dist = num_mismatches + gap_length(bh.cigar()); if (spliced_alignment) { bh = create_hit(name, ref_name, ref_name2, text_offset - 1, cigar, sam_flag & 0x0010, antisense_splice, num_mismatches, edit_dist, 0, end); } else { //assert(cigar.size() == 1 && cigar[0].opcode == MATCH); bh = create_hit(name, ref_name, ref_name2, text_offset - 1, // SAM files are 1-indexed cigar, sam_flag & 0x0010, false, num_mismatches, edit_dist, 0, end); } return true; } void cigar_add(vector& cigar, CigarOp& op) { if (op.length<=0) return; if (cigar.size()>0 && cigar.back().opcode==op.opcode) { cigar.back().length+=op.length; } cigar.push_back(op); } bool spliceCigar(vector& splcigar, const vector& cigar, vector mismatches, int &left, int spl_start, int spl_len, CigarOpCode spl_code, int& spl_mismatches) { //merge the original 'cigar' with the new insert/gap operation //at position spl_start and place the result into splcigar; //TODO: ideally this should also get and rebuild the MD string (alignment mismatches) //return 
value: mismatches in the insert region for INS case, //or number of mismatches in the anchor region //return -1 if somehow the hit seems bad //these offsets are relative to the beginning of alignment on reference int spl_ofs=spl_start-left; //relative position of splice op if (spl_code == FUSION_FF || spl_code == FUSION_FR || spl_code == FUSION_RF || spl_code == FUSION_RR) spl_ofs = abs(spl_ofs); int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS if (spl_code==INS) spl_ofs_end += spl_len; int ref_ofs=0; //working offset on reference int read_ofs=0; //working offset on the read, relative to the leftmost aligned base bool xfound=false; //if (left<=spl_start+spl_len) { if (spl_ofs_end>0) { int prev_opcode=0; int prev_oplen=0; for (size_t c = 0 ; c < cigar.size(); ++c) { int prev_read_ofs=read_ofs; int cur_op_ofs=ref_ofs; int cur_opcode=cigar[c].opcode; int cur_oplen=cigar[c].length; switch (cur_opcode) { case MATCH: ref_ofs+=cur_oplen; read_ofs+=cur_oplen; if (spl_code==REF_SKIP || spl_code==DEL || spl_code==FUSION_FF || spl_code==FUSION_FR || spl_code==FUSION_RF || spl_code==FUSION_RR) { for (int o=cur_op_ofs;o=spl_ofs && o=spl_ofs_end || ref_ofs<=spl_ofs) { if (cur_op_ofs==spl_ofs_end) { if (spl_code!=INS) { if (cur_opcode!=INS) { xfound=true; //we have to insert the gap here first cigar_add(splcigar, gapop); //also, check } } } CigarOp op(cigar[c]); if (xfound) { if (spl_code == FUSION_FR || spl_code == FUSION_RR) { if (op.opcode == MATCH) op.opcode = mATCH; else if (op.opcode == INS) op.opcode = iNS; else if (op.opcode == DEL) op.opcode = dEL; else if (op.opcode == REF_SKIP) op.opcode = rEF_SKIP; } } else { if (spl_code == FUSION_RF || spl_code == FUSION_RR) { if (op.opcode == MATCH) op.opcode = mATCH; else if (op.opcode == INS) op.opcode = iNS; else if (op.opcode == DEL) op.opcode = dEL; else if (op.opcode == REF_SKIP) op.opcode = rEF_SKIP; } } cigar_add(splcigar, op); } else //if (ref_ofs>spl_ofs) { { //op intersection xfound=true; if (spl_code==INS) { //we have to shorten cur_opcode // find the overlap between current range //int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs; //int ovl_end = (ref_ofs>spl_ofs_end) ? 
spl_ofs_end : ref_ofs; CigarOp op(cigar[c]); op.length=spl_ofs-cur_op_ofs; if (spl_ofs>cur_op_ofs) cigar_add(splcigar, op); if (spl_ofs<0) { CigarOp temp = gapop; temp.length += spl_ofs; if (temp.length>0) cigar_add(splcigar, temp); } else cigar_add(splcigar, gapop); op.length=ref_ofs-spl_ofs_end; if (ref_ofs>spl_ofs_end) cigar_add(splcigar,op); } else {//DEL or REF_SKIP or FUSION_[FR][FR] //spl_ofs == spl_ofs_end //we have to split cur_opcode //look for mismatches within min_anchor_len distance from splice point CigarOp op(cigar[c]); CigarOpCode opcode = op.opcode; op.length=spl_ofs-cur_op_ofs; if (spl_code == FUSION_RF || spl_code == FUSION_RR) { if (opcode == MATCH) op.opcode = mATCH; else if (opcode == INS) op.opcode = iNS; else if (opcode == DEL) op.opcode = dEL; else if (opcode == REF_SKIP) op.opcode = rEF_SKIP; } cigar_add(splcigar, op); cigar_add(splcigar, gapop); op.opcode = opcode; if (spl_code == FUSION_FR || spl_code == FUSION_RR) { if (opcode == MATCH) op.opcode = mATCH; else if (opcode == INS) op.opcode = iNS; else if (opcode == DEL) op.opcode = dEL; else if (opcode == REF_SKIP) op.opcode = rEF_SKIP; } op.length=ref_ofs-spl_ofs; cigar_add(splcigar,op); } } //op intersection prev_opcode=cur_opcode; prev_oplen=cur_oplen; } //for each cigar opcode } //intersection possible //if (!xfound) {//no intersection found between splice event and alignment if (spl_ofs_end<=0) { //alignment starts after the splice event if (spl_code==INS) left-=spl_len; else left+=spl_len; splcigar = cigar; } //else { //alignment ends before the splice event //nothing to do // } //return spl_mismatches; // } if (splcigar.size() < cigar.size() + 2) return false; else if (splcigar.front().opcode != MATCH && splcigar.front().opcode != mATCH) return false; else if (splcigar.back().opcode != MATCH && splcigar.back().opcode != mATCH) return false; else return true; } bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out, char* name_tags, char* seq, char* qual) { if (!orig_bwt_buf || !*orig_bwt_buf) return false; char bwt_buf[2048]; strcpy(bwt_buf, orig_bwt_buf); // Are we still in the header region? 
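/* Illustrative sketch, not part of the original sources: how spliceCigar()
 * above maps a contiguous alignment against the spliced reference back to
 * genomic coordinates. The numbers are made up for the example. A 76M
 * alignment starting at genomic position 100, crossing a 200 bp intron
 * whose first skipped base is at position 130, comes back as 30M 200N 46M:
 *
 *   std::vector<CigarOp> samcigar(1, CigarOp(MATCH, 76));
 *   std::vector<CigarOp> splcigar;
 *   std::vector<bool>    mismatches(76, false);   // per-base mismatch flags from MD
 *   int left = 100, spl_mismatches = 0;
 *   bool ok = spliceCigar(splcigar, samcigar, mismatches, left,
 *                         130, 200, REF_SKIP, spl_mismatches);
 *   // ok == true, left still 100, splcigar == { 30M, 200N, 46M }
 *
 * With spl_code == INS the inserted bases consume read positions instead,
 * so the same call with INS and length 3 would yield 30M 3I 43M; per the
 * comment above, spl_mismatches then reports mismatches inside the insert
 * (INS) or within the anchor region around the splice point (other codes).
 */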
if (bwt_buf[0] == '@') return false; char* buf = bwt_buf; char* name = get_token((char**)&buf,"\t"); char* sam_flag_str = get_token((char**)&buf,"\t"); char* text_name = get_token((char**)&buf,"\t"); char* text_offset_str = get_token((char**)&buf,"\t"); const char* map_qual_str = get_token((char**)&buf,"\t"); char* cigar_str = get_token((char**)&buf,"\t"); const char* mate_ref_str = get_token((char**)&buf,"\t"); const char* mate_pos_str = get_token((char**)&buf,"\t"); const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); //int num_mismatches=0; //int mismatches[1024]; //list of 0-based mismatch positions in this read //parsed from SAM's MD:Z: tag const char* seq_str = get_token((char**)&buf,"\t"); if (seq) strcpy(seq, seq_str); const char* qual_str = get_token((char**)&buf,"\t"); if (qual) strcpy(qual, qual_str); if (!name || !sam_flag_str || !text_name || !text_offset_str || !map_qual_str || !cigar_str || !mate_ref_str || !mate_pos_str || !inferred_insert_sz_str || !seq_str || !qual_str) { // truncated or malformed SAM record return false; } int sam_flag = atoi(sam_flag_str); int text_offset = atoi(text_offset_str); text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based) bool end = true; unsigned int seg_offset = 0; unsigned int seg_num = 0; unsigned int num_segs = 0; // Copy the tag out of the name field before we might wipe it out parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); vector samcigar; bool spliced_alignment = false; int refspan=parseCigar(samcigar, cigar_str, spliced_alignment); if (refspan==0) return false; bool antisense_splice = false; int sam_nm = 0; vector mismatches; mismatches.resize(strlen(seq_str), false); int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice); //############################################## // Add this alignment to the table of hits for this half of the // Bowtie map // Parse the text_name field to recover the splice coords vector toks; tokenize_strict(text_name, "|", toks); int num_extra_toks = (int)toks.size() - 6; if (num_extra_toks >= 0) { static const uint8_t left_window_edge_field = 1; static const uint8_t splice_field = 2; //static const uint8_t right_window_edge_field = 3; static const uint8_t junction_type_field = 4; static const uint8_t strand_field = 5; string contig = toks[0]; for (int t = 1; t <= num_extra_toks; ++t) { contig += "|"; contig += toks[t]; } vector splice_toks; tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); if (splice_toks.size() != 2) { fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); //fprintf(stderr, "\t%s (token: %s)\n", text_name, // toks[num_extra_toks + splice_field].c_str()); return false; } string junction_strand = toks[num_extra_toks + strand_field]; if(junction_strand != "rev" && junction_strand != "fwd"){ fprintf(stderr, "Malformed insertion record\n"); return false; } // // check for an insertion hit // if(toks[num_extra_toks + junction_type_field] == "ins") { //int8_t spliced_read_len = strlen(seq_str); //TODO FIXME: use the CIGAR instead of seq length! // The 0-based position of the left edge of the alignment. Note that this // value may need to be further corrected to account for the presence of // of the insertion. 
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; // The 0-based position of the last genomic sequence before the insertion int left_splice_pos = atoi(splice_toks[0].c_str()); string insertedSequence = splice_toks[1]; // The 0-based position of the first genomic sequence after the insertion vector splcigar; //this also updates left to the adjusted genomic coordinates int spl_num_mismatches=0; bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos+1, insertedSequence.length(), INS, spl_num_mismatches); if (!overlapped) return false; if (spl_num_mismatches<0) return false; num_mismatches-=spl_num_mismatches; bh = create_hit(name, contig, "", left, splcigar, sam_flag & 0x0010, junction_strand == "rev", num_mismatches, num_mismatches + gap_length(splcigar), 0, end); return true; } //"ins" else //"del" or intron { // The 0-based position of the left edge of the alignment. int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; // The 0-based position of the last genomic sequence before the deletion int left_splice_pos = atoi(splice_toks[0].c_str()); int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1; vector splcigar; CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP; int spl_num_mismatches=0; bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos+1, gap_len, opcode, spl_num_mismatches); if (!overlapped) return false; if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches) return false; bh = create_hit(name, contig, "", left, splcigar, (sam_flag & 0x0010), junction_strand == "rev", num_mismatches, num_mismatches + gap_length(splcigar), spl_num_mismatches, end); return true; } } //parse splice data else { fprintf(stderr, "Warning: found malformed splice record, skipping\n"); //fprintf(stderr, "%s\n", orig_bwt_buf); // continue; return false; } return false; } bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out, char* name_tags, char* seq, char* qual) { if (_sam_header==NULL) err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; uint32_t sam_flag = hit_buf->core.flag; int text_offset = hit_buf->core.pos; int text_mate_pos = hit_buf->core.mpos; int target_id = hit_buf->core.tid; int mate_target_id = hit_buf->core.mtid; vector cigar; bool spliced_alignment = false; int num_hits = 1; bool end = true; unsigned int seg_offset = 0; unsigned int seg_num = 0; unsigned int num_segs = 0; // Copy the tag out of the name field before we might wipe it out char* qname = bam1_qname(hit_buf); char* pipe = strrchr(qname, '|'); if (pipe) { if (name_tags) strcpy(name_tags, pipe); char* tag_buf = pipe + 1; if (strchr(tag_buf, ':')) { sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); if (seg_num + 1 == num_segs) end = true; else end = false; } *pipe = 0; } if (target_id < 0) { //assert(cigar.size() == 1 && cigar[0].opcode == MATCH); bh = create_hit(qname, "*", //ref_name 0, //left coord 0, //read_len false, //antisense_aln 0, //mismatches 0, //edit_dist end); return true; } if (seq!=NULL) { char *bseq = (char*)bam1_seq(hit_buf); for(int i=0;i<(hit_buf->core.l_qseq);i++) { char v = bam1_seqi(bseq,i); seq[i]=bam_nt16_rev_table[(int)v]; } seq[hit_buf->core.l_qseq]=0; } if (qual!=NULL) { char *bq = (char*)bam1_qual(hit_buf); for(int i=0;i<(hit_buf->core.l_qseq);i++) { qual[i]=bq[i]+33; } 
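/* Illustrative sketch, not part of the original sources: the samtools
 * accessors used in the loops above. BAM stores the query sequence packed
 * 4 bits per base and the qualities as raw Phred scores, so unpacking a
 * bam1_t record into printable strings looks like this (the helper name is
 * hypothetical):
 *
 *   static void unpack_seq_and_qual(const bam1_t* b, std::string& seq, std::string& qual)
 *   {
 *     int n = b->core.l_qseq;
 *     seq.resize(n);
 *     qual.resize(n);
 *     const uint8_t* s = bam1_seq(b);    // 4-bit packed bases
 *     const uint8_t* q = bam1_qual(b);   // raw Phred values (no +33 offset)
 *     for (int i = 0; i < n; ++i) {
 *       seq[i]  = bam_nt16_rev_table[bam1_seqi(s, i)];  // 4-bit code -> A/C/G/T/N
 *       qual[i] = (char)(q[i] + 33);                    // re-encode as ASCII Phred+33
 *     }
 *   }
 */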
qual[hit_buf->core.l_qseq]=0; } bool antisense_splice=false; unsigned char num_mismatches = 0; unsigned char num_splice_anchor_mismatches = 0; uint8_t* ptr = bam_aux_get(hit_buf, "XS"); if (ptr) { char src_strand_char = bam_aux2A(ptr); if (src_strand_char == '-') antisense_splice = true; } ptr = bam_aux_get(hit_buf, "NM"); if (ptr) { num_mismatches = bam_aux2i(ptr); } ptr = bam_aux_get(hit_buf, "NH"); if (ptr) { num_hits = bam_aux2i(ptr); } int alignment_score = 0; bool has_alignment_score = false; ptr = bam_aux_get(hit_buf, "AS"); if (ptr) { alignment_score = bam_aux2i(ptr); has_alignment_score = true; } string text_name = _sam_header->target_name[target_id]; string text_name2 = ""; bool fusion_alignment = false; string fusion_cigar_str; ptr = bam_aux_get(hit_buf, "XF"); if (ptr) { fusion_alignment = true; char* xf = bam_aux2Z(ptr); // ignore the second part of a fusion alignment if (xf[0] == '2') return false; vector fields; tokenize(xf, " ", fields); vector contigs; tokenize(fields[1], "-", contigs); if (contigs.size() >= 2) { text_name = contigs[0]; text_name2 = contigs[1]; } text_offset = atoi(fields[2].c_str()) - 1; fusion_cigar_str = fields[3].c_str(); if (seq) strcpy(seq, fields[4].c_str()); if (qual) strcpy(qual, fields[5].c_str()); } if (fusion_alignment) { const char* p_cig = fusion_cigar_str.c_str(); while (*p_cig) { char* t; int length = (int)strtol(p_cig, &t, 10); if (length <= 0) { //fprintf (stderr, "CIGAR op has zero length\n"); return false; } char op_char = *t; CigarOpCode opcode; if (op_char == 'M') opcode = MATCH; else if(op_char == 'm') opcode = mATCH; else if (op_char == 'I') opcode = INS; else if (op_char == 'i') opcode = iNS; else if (op_char == 'D') opcode = DEL; else if (op_char == 'd') opcode = dEL; else if (op_char == 'N' || op_char == 'n') { if (length > max_report_intron_length) return false; if (op_char == 'N') opcode = REF_SKIP; else opcode = rEF_SKIP; spliced_alignment = true; } else if (op_char == 'F') { opcode = FUSION_FF; length = length - 1; } else if (op_char == 'S') opcode = SOFT_CLIP; else if (op_char == 'H') opcode = HARD_CLIP; else if (op_char == 'P') opcode = PAD; else { fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char); return false; } p_cig = t + 1; cigar.push_back(CigarOp(opcode, length)); if (opcode == INS) num_mismatches -= length; else if (opcode == DEL) num_mismatches -= length; if (!has_alignment_score) { if (opcode == INS) alignment_score -= (bowtie2_read_gap_open * bowtie2_read_gap_cont * length); else if(opcode == DEL) alignment_score -= (bowtie2_ref_gap_open * bowtie2_ref_gap_cont * length); } /* * update fusion direction. 
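 * (In the combined fusion CIGAR parsed here, uppercase ops describe
 *  segments that walk the reference in increasing coordinates and the
 *  lowercase ops m, i, d, n describe segments that walk it in decreasing
 *  coordinates, so a pattern like "...M...F...m" means the part after the
 *  breakpoint aligns in the reverse direction and the F op is refined from
 *  FUSION_FF to FUSION_FR just below.)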
*/ size_t cigar_size = cigar.size(); if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF) { CigarOpCode prev = cigar[cigar_size - 3].opcode; CigarOpCode next = cigar[cigar_size - 1].opcode; bool increase1 = false, increase2 = false; if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP) increase1 = true; if (next == MATCH || next == DEL || next == INS || next == REF_SKIP) increase2 = true; if (increase1 && !increase2) cigar[cigar_size - 2].opcode = FUSION_FR; else if (!increase1 && increase2) cigar[cigar_size - 2].opcode = FUSION_RF; else if (!increase1 && !increase2) cigar[cigar_size - 2].opcode = FUSION_RR; } } } else { for (int i = 0; i < hit_buf->core.n_cigar; ++i) { int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; if (length <= 0) { fprintf (stderr, "insert_id: %s - BAM error: CIGAR op has zero length\n", qname); return false; } CigarOpCode opcode; switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) { case BAM_CMATCH: opcode = MATCH; break; case BAM_CINS: opcode = INS; break; case BAM_CDEL: opcode = DEL; break; case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; case BAM_CPAD: opcode = PAD; break; case BAM_CREF_SKIP: opcode = REF_SKIP; spliced_alignment = true; if (length > (int)max_report_intron_length) { //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); return false; } break; default: fprintf (stderr, "BAM read: invalid CIGAR operation\n"); return false; } if (opcode != HARD_CLIP) cigar.push_back(CigarOp(opcode, length)); /* * By convention,the NM field of the SAM record * counts an insertion or deletion. I dont' think * we want the mismatch count in the BowtieHit * record to reflect this. Therefore, subtract out * the mismatches due to in/dels */ if (opcode == INS) num_mismatches -= length; else if (opcode == DEL) num_mismatches -= length; if (!has_alignment_score) { if (opcode == INS) alignment_score -= (bowtie2_read_gap_open * bowtie2_read_gap_cont * length); else if(opcode == DEL) alignment_score -= (bowtie2_ref_gap_open * bowtie2_ref_gap_cont * length); } } } if (!has_alignment_score) { ptr = bam_aux_get(hit_buf, "MD"); if (ptr && qual) { const char* p = bam_aux2Z(ptr); int bi=0; //base offset position in the read while (*p != 0) { if (isdigit(*p)) { int v=atoi(p); do { p++; } while (isdigit(*p)); bi+=v; } while (isalpha(*p)) { p++; float penalty = bowtie2_min_penalty + (bowtie2_max_penalty - bowtie2_min_penalty) * min((int)(qual[bi] - '!'), 40) / 40.0; alignment_score -= (int)penalty; bi++; } if (*p=='^') { //reference deletion p++; while (isalpha(*p)) { //insert read bases p++; bi++; } } } } else { alignment_score -= (num_mismatches * (bowtie2_max_penalty + bowtie2_min_penalty) / 2); } } string mrnm; if (mate_target_id >= 0) { if (mate_target_id == target_id) { mrnm = _sam_header->target_name[mate_target_id]; } else { return false; } } else { text_mate_pos = 0; } if (spliced_alignment) { bh = create_hit(qname, text_name, text_name2, text_offset, // BAM files are 0-indexed cigar, sam_flag & 0x0010, antisense_splice, num_mismatches, num_mismatches + gap_length(cigar), num_splice_anchor_mismatches, end); } else { bh = create_hit(qname, text_name, text_name2, text_offset, // BAM files are 0-indexed cigar, sam_flag & 0x0010, false, num_mismatches, num_mismatches + gap_length(cigar), 0, end); } bh.alignment_score(alignment_score); return true; } bool BAMHitFactory::inspect_header(HitStream& hs) { bam_header_t* header = ((samfile_t*)(hs._hit_file))->header; if (header == NULL) { 
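/* Illustrative notes, not part of the original sources, on the MD walk used
 * above to rescore alignments when Bowtie did not emit an AS tag. MD:Z:
 * encodes, in reference order, runs of matching bases as numbers, each
 * substituted reference base as a letter, and deleted reference bases after
 * a '^'. For a 20M alignment whose reference differs at read offsets 5
 * (ref base A) and 12 (ref base G) the tag is
 *     MD:Z:5A6G7
 * and each substitution is charged the quality-scaled penalty
 *     penalty = mn + (mx - mn) * min(q, 40) / 40
 * where q is the base's Phred quality and mn/mx are the configured
 * bowtie2_min_penalty / bowtie2_max_penalty.
 */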
fprintf(stderr, "Warning: No BAM header\n"); return false; } if (header->l_text == 0) { fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n"); return false; } return true; } bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out, char* name_tags, char* seq, char* qual) { if (_sam_header==NULL) err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; uint32_t sam_flag = hit_buf->core.flag; int text_offset = hit_buf->core.pos; int text_mate_pos = hit_buf->core.mpos; int target_id = hit_buf->core.tid; int mate_target_id = hit_buf->core.mtid; vector samcigar; bool spliced_alignment = false; if (seq!=NULL) { char *bseq = (char*)bam1_seq(hit_buf); for(int i=0;i<(hit_buf->core.l_qseq);i++) { char v = bam1_seqi(bseq,i); seq[i]=bam_nt16_rev_table[(int)v]; } seq[hit_buf->core.l_qseq]=0; } if (qual!=NULL) { char *bq = (char*)bam1_qual(hit_buf); for(int i=0;i<(hit_buf->core.l_qseq);i++) { qual[i]=bq[i]+33; } qual[hit_buf->core.l_qseq]=0; } bool end = true; unsigned int seg_offset = 0; unsigned int seg_num = 0; unsigned int num_segs = 0; // Copy the tag out of the name field before we might wipe it out char* name = bam1_qname(hit_buf); parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); if (target_id < 0) { //assert(cigar.size() == 1 && cigar[0].opcode == MATCH); bh = create_hit(name, "*", //ref_name 0, //left coord 0, //read_len false, //antisense_aln 0, //mismatches 0, //edit_dist end); return true; } string text_name = _sam_header->target_name[target_id]; for (int i = 0; i < hit_buf->core.n_cigar; ++i) { int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; if (length <= 0) { fprintf (stderr, "BAM error: CIGAR op has zero length\n"); return false; } CigarOpCode opcode; switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) { case BAM_CMATCH: opcode = MATCH; break; case BAM_CINS: opcode = INS; break; case BAM_CDEL: opcode = DEL; break; case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; case BAM_CPAD: opcode = PAD; break; case BAM_CREF_SKIP: opcode = REF_SKIP; spliced_alignment = true; if (length > (int)max_report_intron_length) { //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); return false; } break; default: fprintf (stderr, "BAM read: invalid CIGAR operation\n"); return false; } if (opcode != HARD_CLIP) samcigar.push_back(CigarOp(opcode, length)); } string mrnm; if (mate_target_id >= 0) { if (mate_target_id == target_id) { mrnm = _sam_header->target_name[mate_target_id]; } else { return false; } } else { text_mate_pos = 0; } bool antisense_splice = false; int sam_nm = 0; vector mismatches; mismatches.resize(strlen(seq), false); int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice); //############################################## // Add this alignment to the table of hits for this half of the // Bowtie map // Parse the text_name field to recover the splice coords vector toks; tokenize_strict(text_name.c_str(), "|", toks); int num_extra_toks = (int)toks.size() - 6; if (num_extra_toks >= 0) { static const uint8_t left_window_edge_field = 1; static const uint8_t splice_field = 2; //static const uint8_t right_window_edge_field = 3; static const uint8_t junction_type_field = 4; static const uint8_t strand_field = 5; string contig = toks[0]; for (int t = 1; t <= num_extra_toks; ++t) { contig += 
"|"; contig += toks[t]; } vector splice_toks; tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); if (splice_toks.size() != 2) { fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); //fprintf(stderr, "\t%s (token: %s)\n", text_name, // toks[num_extra_toks + splice_field].c_str()); return false; } const string& junction_type = toks[num_extra_toks + junction_type_field]; const string junction_strand = toks[num_extra_toks + strand_field]; // // check for an insertion hit // if(junction_type == "ins") { //int8_t spliced_read_len = strlen(seq_str); //TODO FIXME: use the CIGAR instead of seq length! // The 0-based position of the left edge of the alignment. Note that this // value may need to be further corrected to account for the presence of // of the insertion. int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; // The 0-based position of the last genomic sequence before the insertion int left_splice_pos = atoi(splice_toks[0].c_str()); if (left > left_splice_pos) return false; string insertedSequence = splice_toks[1]; // The 0-based position of the first genomic sequence after the insertion vector splcigar; //this also updates left to the adjusted genomic coordinates int spl_num_mismatches = 0; bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos+1, insertedSequence.length(), INS, spl_num_mismatches); if (!overlapped) return false; if (spl_num_mismatches < 0) return false; num_mismatches -= spl_num_mismatches; bh = create_hit(name, contig, "", left, splcigar, sam_flag & 0x0010, junction_strand == "rev", num_mismatches, num_mismatches + gap_length(splcigar), 0, end); return true; } //"ins" else //"del", "intron", or "fusion" { char orientation = (sam_flag & 0x0010 ? '-' : '+'); if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")|| !(orientation == '-' || orientation == '+')) { fprintf(stderr, "Warning: found malformed splice record, skipping\n"); return false; } // The 0-based position of the left edge of the alignment. 
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()); if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr")) left += text_offset; else left -= text_offset; vector splcigar; CigarOpCode opcode; if(junction_type == "del") opcode = DEL; else if(junction_type == "fus") { if (junction_strand == "ff") opcode = FUSION_FF; else if (junction_strand == "fr") opcode = FUSION_FR; else if (junction_strand == "rf") opcode = FUSION_RF; else opcode = FUSION_RR; } else opcode = REF_SKIP; int left_splice_pos = atoi(splice_toks[0].c_str()); // The 0-based position of the last genomic sequence before the deletion int gap_len = 0; if (junction_type == "fus") gap_len = atoi(splice_toks[1].c_str()); else gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1; if (opcode == FUSION_RF || opcode == FUSION_RR) { left_splice_pos -= 1; if (left <= left_splice_pos) return false; } else { left_splice_pos += 1; if (left >= left_splice_pos) return false; } int spl_num_mismatches = 0; bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos, gap_len, opcode, spl_num_mismatches); if (!overlapped) return false; if (spl_num_mismatches < 0) // || spl_num_mismatches>max_anchor_mismatches) return false; string contig2 = ""; if (junction_type == "fus") { vector contigs; tokenize(contig, "-", contigs); if (contigs.size() != 2) return false; contig = contigs[0]; contig2 = contigs[1]; if (junction_strand == "rf" || junction_strand == "rr") orientation = (orientation == '+' ? '-' : '+'); } bh = create_hit(name, contig, contig2, left, splcigar, orientation == '-', junction_strand == "rev", num_mismatches, num_mismatches + gap_length(splcigar), spl_num_mismatches, end); return true; } } //parse splice data else { fprintf(stderr, "Warning: found malformed splice record, skipping\n"); //fprintf(stderr, "%s\n", orig_bwt_buf); // continue; return false; } return false; } void get_mapped_reads(FILE* bwtf, HitTable& hits, HitFactory& hit_factory, bool strip_slash, bool verbose) { char bwt_buf[2048]; uint32_t reads_extracted = 0; while (fgets(bwt_buf, 2048, bwtf)) { // Chomp the newline char* nl = strrchr(bwt_buf, '\n'); if (nl) *nl = 0; if (*bwt_buf == 0) continue; // Get a new record from the tab-delimited Bowtie map BowtieHit bh; if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash)) { // Only check uniqueness if these hits are spliced hits.add_hit(bh, true); } reads_extracted++; } // This will sort the map by insert id. 
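/* Illustrative note, not part of the original sources, on the composite
 * reference names parsed above. Segments aligned to TopHat's spliced or
 * indel scaffold sequences carry names of the form
 *     contig|left_window_edge|X-Y|right_window_edge|type|strand
 * e.g. a hypothetical "chr7|1000|1219-1225|1400|del|fwd". tokenize_strict()
 * on '|' must yield at least 6 tokens (extra tokens arise when the contig
 * name itself contains '|' and are folded back into the contig name). For
 * "del" and intron records X and Y are genomic coordinates, giving
 *     left_splice_pos = 1219,  gap_len = 1225 - 1219 - 1 = 5
 * while for "ins" records Y is the inserted sequence itself and for "fus"
 * records Y is the position on the partner contig.
 */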
hits.finalize(); fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted); } /* AlignStatus status(const BowtieHit* align) { if (!align) return UNALIGNED; if (align->contiguous()) return CONTIGUOUS; return SPLICED; } */ void add_hits_to_coverage(const HitList& hits, vector& DoC) { int max_hit_pos = -1; for (size_t i = 0; i < hits.size(); ++i) { max_hit_pos = max((int)hits[i].right(),max_hit_pos); } if ((int)DoC.size() < max_hit_pos) DoC.resize(max_hit_pos); for (size_t i = 0; i < hits.size(); ++i) { const BowtieHit& bh = hits[i]; // split up the coverage contibution for this reads size_t j = bh.left(); const vector& cigar = bh.cigar(); for (size_t c = 0 ; c < cigar.size(); ++c) { switch(cigar[c].opcode) { case MATCH: for (size_t m = 0; m < cigar[c].length; ++m) { if (DoC[j + m] < 0xFFFF) DoC[j + m]++; } //fall through this case to REF_SKIP is intentional case REF_SKIP: j += cigar[c].length; break; default: break; } } } } void add_hit_to_coverage(const BowtieHit& bh, vector& DoC) { if ((int)DoC.size() < bh.right()) DoC.resize(bh.right()); // split up the coverage contibution for this reads size_t j = bh.left(); const vector& cigar = bh.cigar(); for (size_t c = 0 ; c < cigar.size(); ++c) { switch(cigar[c].opcode) { case MATCH: for (size_t m = 0; m < cigar[c].length; ++m) { if (DoC[j + m] < VMAXINT32) DoC[j + m]++; } //fall through this case to REF_SKIP is intentional case REF_SKIP: j += cigar[c].length; break; default: break; } } } void print_bamhit(GBamWriter& wbam, const char* read_name, const BowtieHit& bh, const char* ref_name, const char* ref_name2, const char* sequence, const char* qualities, bool from_bowtie, const vector* extra_fields) { string seq; string quals; if (sequence) { seq = sequence; quals = qualities; seq.resize(bh.read_len()); quals.resize(bh.read_len()); } else { seq = "*"; } if (qualities) { quals = qualities; quals.resize(bh.read_len()); } else { quals = "*"; } uint32_t sam_flag = 0; if (bh.antisense_align()) { sam_flag |= 0x0010; // BAM_FREVERSE if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. 
{ reverse_complement(seq); reverse(quals.begin(), quals.end()); } } uint32_t sam_pos = bh.left() + 1; uint32_t map_quality = 255; char cigar[256]; cigar[0] = 0; string mate_ref_name = "*"; uint32_t mate_pos = 0; uint32_t insert_size = 0; //string qualities = "*"; const vector& bh_cigar = bh.cigar(); /* * In addition to calculating the cigar string, * we need to figure out how many in/dels are in the * sequence, so that we can give the correct * value for the NM tag */ int indel_distance = 0; CigarOpCode fusion_dir = FUSION_NOTHING; for (size_t c = 0; c < bh_cigar.size(); ++c) { const CigarOp& op = bh_cigar[c]; char ibuf[64]; sprintf(ibuf, "%d", op.length); switch(op.opcode) { case MATCH: case mATCH: strcat(cigar, ibuf); if (bh_cigar[c].opcode == MATCH) strcat(cigar, "M"); else strcat(cigar, "m"); break; case INS: case iNS: strcat(cigar, ibuf); if (bh_cigar[c].opcode == INS) strcat(cigar, "I"); else strcat(cigar, "i"); indel_distance += bh_cigar[c].length; break; case DEL: case dEL: strcat(cigar, ibuf); if (bh_cigar[c].opcode == DEL) strcat(cigar, "D"); else strcat(cigar, "d"); indel_distance += bh_cigar[c].length; break; case REF_SKIP: case rEF_SKIP: strcat(cigar, ibuf); if (bh_cigar[c].opcode == REF_SKIP) strcat(cigar, "N"); else strcat(cigar, "n"); break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: fusion_dir = op.opcode; sprintf(ibuf, "%d", bh_cigar[c].length + 1); strcat(cigar, ibuf); strcat(cigar, "F"); break; default: break; } } char cigar1[256] = {0}, cigar2[256] = {0}; string left_seq, right_seq, left_qual, right_qual; int left1 = -1, left2 = -1; extract_partial_hits(bh, seq, quals, cigar1, cigar2, left_seq, right_seq, left_qual, right_qual, left1, left2); bool containsSplice = false; for (vector::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) { if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP) { containsSplice = true; break; } } vector auxdata; if (extra_fields) auxdata.insert(auxdata.end(), extra_fields->begin(), extra_fields->end()); string nm("NM:i:"); str_appendInt(nm, bh.mismatches() + indel_distance); auxdata.push_back(nm); if (containsSplice) { // do not add more than once bool XS_found = false; for (size_t i = 0; i < auxdata.size(); ++i) { if (auxdata[i].substr(0, 2) == "XS") { XS_found = true; break; } } if (!XS_found) { nm="XS:A:"; nm+=(char)(bh.antisense_splice() ? '-' : '+'); auxdata.push_back(nm); } } if (fusion_dir != FUSION_NOTHING) { char XF[2048] = {0}; sprintf(XF, "XF:Z:1 %s-%s %u %s %s %s", ref_name, ref_name2, sam_pos, cigar, seq.c_str(), quals.c_str()); auxdata.push_back(XF); GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality, cigar1, mate_ref_name.c_str(), mate_pos, insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata); wbam.write(brec); delete brec; sprintf(XF, "XF:Z:2 %s-%s %u %s %s %s", ref_name, ref_name2, sam_pos, cigar, seq.c_str(), quals.c_str()); auxdata.back() = XF; brec = wbam.new_record(read_name, sam_flag, ref_name2, left2 + 1, map_quality, cigar2, mate_ref_name.c_str(), mate_pos, insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata); wbam.write(brec); delete brec; } else { GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality, cigar, mate_ref_name.c_str(), mate_pos, insert_size, seq.c_str(), quals.c_str(), &auxdata); wbam.write(brec); delete brec; } } /** * Print a vector of cigar operations to a file. 
* @param bh_cigar A vector of CigarOps * @return a string representation of the cigar string */ std::string print_cigar(const vector& bh_cigar){ char cigar[256]; cigar[0] = 0; for (size_t c = 0; c < bh_cigar.size(); ++c) { char ibuf[64]; sprintf(ibuf, "%d", bh_cigar[c].length); strcat(cigar, ibuf); switch(bh_cigar[c].opcode) { case MATCH: strcat(cigar, "M"); break; case mATCH: strcat(cigar, "m"); break; case INS: strcat(cigar, "I"); break; case iNS: strcat(cigar, "i"); break; case DEL: strcat(cigar, "D"); break; case dEL: strcat(cigar, "d"); break; case REF_SKIP: strcat(cigar, "N"); break; case rEF_SKIP: strcat(cigar, "n"); break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: strcat(cigar, "F"); break; default: break; } } string result(cigar); return result; } void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual, char* cigar1, char* cigar2, string& seq1, string& seq2, string& qual1, string& qual2, int& left1, int& left2) { const int left = bh.left(); int right = left; int fusion_left = -1, fusion_right = -1; const vector& bh_cigar = bh.cigar(); CigarOpCode fusion_dir = FUSION_NOTHING; size_t fusion_idx = 0; size_t left_part_len = 0; for (size_t c = 0; c < bh_cigar.size(); ++c) { const CigarOp& op = bh_cigar[c]; switch(op.opcode) { case MATCH: case REF_SKIP: case DEL: right += op.length; break; case mATCH: case rEF_SKIP: case dEL: right -= op.length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: { fusion_dir = op.opcode; fusion_idx = c; if (op.opcode == FUSION_FF || op.opcode == FUSION_FR) fusion_left = right - 1; else fusion_left = right + 1; fusion_right = right = op.length; } break; default: break; } if (fusion_dir == FUSION_NOTHING) { if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS) { left_part_len += op.length; } } } if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) { for (size_t c = 0; c < fusion_idx; ++c) { const CigarOp& op = bh_cigar[c]; char ibuf[64]; sprintf(ibuf, "%d", op.length); strcat(cigar1, ibuf); switch (op.opcode) { case MATCH: strcat(cigar1, "M"); break; case INS: strcat(cigar1, "I"); break; case DEL: strcat(cigar1, "D"); break; case REF_SKIP: strcat(cigar1, "N"); break; default: assert (0); break; } } } else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR) { assert (fusion_idx > 0); for (int c = fusion_idx - 1; c >=0; --c) { const CigarOp& op = bh_cigar[c]; char ibuf[64]; sprintf(ibuf, "%d", op.length); strcat(cigar1, ibuf); switch (op.opcode) { case mATCH: strcat(cigar1, "M"); break; case iNS: strcat(cigar1, "I"); break; case dEL: strcat(cigar1, "D"); break; case rEF_SKIP: strcat(cigar1, "N"); break; default: assert (0); break; } } } if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) { for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c) { const CigarOp& op = bh_cigar[c]; char ibuf[64]; sprintf(ibuf, "%d", op.length); strcat(cigar2, ibuf); switch (op.opcode) { case MATCH: strcat(cigar2, "M"); break; case INS: strcat(cigar2, "I"); break; case DEL: strcat(cigar2, "D"); break; case REF_SKIP: strcat(cigar2, "N"); break; default: assert (0); break; } } } else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR) { assert (bh_cigar.size() > 0); for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c) { const CigarOp& op = bh_cigar[c]; char ibuf[64]; sprintf(ibuf, "%d", op.length); strcat(cigar2, ibuf); switch (op.opcode) { case mATCH: strcat(cigar2, "M"); break; case iNS: strcat(cigar2, "I"); break; case dEL: strcat(cigar2, "D"); break; 
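/* Worked example, not part of the original sources, for extract_partial_hits():
 * a FUSION_FF hit with left() == 100 and cigar { 30M, FUSION_FF(5000), 46M }
 * (a 76-base read; the FUSION op's length field carries the 0-based position
 * on the partner contig rather than a length) is split into
 *   cigar1 = "30M", left1 = 100    (first contig)
 *   cigar2 = "46M", left2 = 5000   (second contig)
 * with seq/qual cut after the 30 read bases consumed before the fusion op;
 * for the RF/RR and FR/RR directions the corresponding half is also
 * reverse-complemented. print_bamhit() above then emits the two halves as
 * separate BAM records tagged XF:Z:1 and XF:Z:2.
 */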
case rEF_SKIP: strcat(cigar2, "N"); break; default: assert (0); break; } } } if (fusion_dir != FUSION_NOTHING) { seq1 = seq.substr(0, left_part_len); qual1 = qual.substr(0, left_part_len); if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR) { reverse_complement(seq1); reverse(qual1.begin(), qual1.end()); } seq2 = seq.substr(left_part_len); qual2 = qual.substr(left_part_len); if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR) { reverse_complement(seq2); reverse(qual2.begin(), qual2.end()); } left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left); left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1); } } bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt, bool bDebug) { RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id); RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2); if (!ref_str1 || !ref_str2) return false; if (bDebug) { cout << "check_editdist_consistency" << endl << "insert id: " << _insert_id << endl; } RefSequenceTable::Sequence* ref_str = ref_str1; size_t pos_seq = 0; size_t pos_ref = _left; size_t mismatch = 0; size_t N_mismatch = 0; bool bSawFusion = false; for (size_t i = 0; i < _cigar.size(); ++i) { CigarOp cigar = _cigar[i]; switch(cigar.opcode) { case MATCH: { seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); for (size_t j = 0; j < cigar.length; ++j) { seqan::Dna5 ref_nt = _seq[pos_seq]; if (ref_nt != ref_seq[j]) ++mismatch; if (ref_nt == ref_seq[j] && ref_nt == 'N') ++N_mismatch; if (bDebug) cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl; ++pos_seq; } pos_ref += cigar.length; } break; case mATCH: { seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); seqan::reverseComplement(ref_seq); for (size_t j = 0; j < cigar.length; ++j) { seqan::Dna5 ref_nt = _seq[pos_seq]; if (ref_nt != ref_seq[j]) ++mismatch; if (ref_nt == ref_seq[j] && ref_nt == 'N') ++N_mismatch; if (bDebug) cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl; ++pos_seq; } pos_ref -= cigar.length; } break; case INS: case iNS: { pos_seq += cigar.length; } break; case DEL: case REF_SKIP: { pos_ref += cigar.length; } break; case dEL: case rEF_SKIP: { pos_ref -= cigar.length; } break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: { // We don't allow a read spans more than two chromosomes. if (bSawFusion) return false; ref_str = ref_str2; pos_ref = cigar.length; bSawFusion = true; } break; default: break; } } if (bDebug) cout << "mismatch (real) vs. (calculated):" << mismatch << " vs. 
" << (int)_edit_dist << endl; return mismatch == _mismatches || mismatch + N_mismatch == _mismatches; } void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector& fields) { RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id()); RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2()); if (!ref_str1 || !ref_str2) return; RefSequenceTable::Sequence* ref_str = ref_str1; size_t pos_seq = 0; size_t pos_mismatch = 0; size_t pos_ref = bh.left(); size_t mismatch = 0; size_t N_mismatch = 0; size_t num_gap_opens = 0; size_t num_gap_conts = 0; bool bSawFusion = false; int AS_score = 0; const vector& cigars = bh.cigar(); const string& seq = bh.seq(); const string& qual = bh.qual(); string AS = "AS:i:"; string MD = "MD:Z:"; for (size_t i = 0; i < cigars.size(); ++i) { CigarOp cigar = cigars[i]; switch(cigar.opcode) { case MATCH: case mATCH: { seqan::Dna5String ref_seq; if (cigar.opcode == MATCH) { ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); pos_ref += cigar.length; } else { ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); seqan::reverseComplement(ref_seq); pos_ref -= cigar.length; } for (size_t j = 0; j < cigar.length; ++j) { seqan::Dna5 ref_nt = ref_seq[j]; if (seq[pos_seq] != ref_nt) { ++mismatch; if (pos_seq < qual.length()) { if (seq[pos_seq] == 'N' || ref_nt == 'N') { AS_score -= (int)bowtie2_penalty_for_N; } else { float penalty = bowtie2_min_penalty + (bowtie2_max_penalty - bowtie2_min_penalty) * min((int)(qual[pos_seq] - '!'), 40) / 40.0; AS_score -= (int)penalty; } } str_appendInt(MD, (int)pos_mismatch); MD.push_back((char)ref_nt); pos_mismatch = 0; } else { if (ref_nt == 'N') { ++N_mismatch; AS_score -= (int)bowtie2_penalty_for_N; } ++pos_mismatch; } ++pos_seq; } } break; case INS: case iNS: { pos_seq += cigar.length; AS_score -= bowtie2_read_gap_open; AS_score -= (int)(bowtie2_read_gap_cont * cigar.length); num_gap_opens += 1; num_gap_conts += cigar.length; } break; case DEL: case dEL: { AS_score -= bowtie2_ref_gap_open; AS_score -= (int)(bowtie2_ref_gap_cont * cigar.length); num_gap_opens += 1; num_gap_conts += cigar.length; seqan::Dna5String ref_seq; if (cigar.opcode == DEL) { ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length); pos_ref += cigar.length; } else { ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1); seqan::reverseComplement(ref_seq); pos_ref -= cigar.length; } str_appendInt(MD, (int)pos_mismatch); MD.push_back('^'); for (size_t k = 0; k < length(ref_seq); ++k) MD.push_back((char)ref_seq[k]); pos_mismatch = 0; } break; case REF_SKIP: case rEF_SKIP: { if (cigar.opcode == REF_SKIP) pos_ref += cigar.length; else pos_ref -= cigar.length; } break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: { // We don't allow a read spans more than two chromosomes. 
if (bSawFusion) return; ref_str = ref_str2; pos_ref = cigar.length; bSawFusion = true; } break; default: break; } } str_appendInt(AS, AS_score); fields.push_back(AS); string XM = "XM:i:"; str_appendInt(XM, (int)mismatch); fields.push_back(XM); string XO = "XO:i:"; str_appendInt(XO, (int)num_gap_opens); fields.push_back(XO); string XG = "XG:i:"; str_appendInt(XG, (int)num_gap_conts); fields.push_back(XG); str_appendInt(MD, (int)pos_mismatch); fields.push_back(MD); } tophat-2.0.9/src/qual.cpp0000644000175000017500000000644312122334361014012 0ustar toortoor/* NOTE: This file was written by Ben Langmead, and is borrowed from Bowtie */ /// An array that transforms Phred qualities into their maq-like /// equivalents by dividing by ten and rounding to the nearest 10, /// but saturating at 3. unsigned char qualRounds[] = { 0, 0, 0, 0, 0, // 0 - 4 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, // 5 - 14 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 15 - 24 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 25 - 34 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 35 - 44 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 45 - 54 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 55 - 64 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 65 - 74 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 75 - 84 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 85 - 94 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 95 - 104 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 105 - 114 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 115 - 124 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 125 - 134 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 135 - 144 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 145 - 154 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 155 - 164 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 165 - 174 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 175 - 184 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 185 - 194 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 195 - 204 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 205 - 214 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 215 - 224 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 225 - 234 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 235 - 244 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 245 - 254 30 // 255 }; /** * Lookup table for converting from Solexa-scaled (log-odds) quality * values to Phred-scaled quality values. 
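 *
 * The table implements the standard transformation
 *   Q_phred = round( 10 * log10( 1 + 10^(Q_solexa / 10) ) )
 * indexed from Q_solexa = -10, i.e. solToPhred[sol + 10]. For example,
 * Q_solexa = -10 gives 10*log10(1.1) ~= 0.41 -> 0, Q_solexa = 0 gives
 * 10*log10(2) ~= 3.01 -> 3, and Q_solexa = 10 gives 10*log10(11) ~= 10.4 -> 10,
 * matching the first entries of the table below.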
*/ unsigned char solToPhred[] = { /* -10 */ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, /* 0 */ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, /* 10 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, /* 20 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, /* 30 */ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, /* 40 */ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, /* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, /* 60 */ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, /* 70 */ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, /* 80 */ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, /* 90 */ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, /* 100 */ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, /* 110 */ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, /* 120 */ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, /* 130 */ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, /* 140 */ 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, /* 150 */ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, /* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, /* 170 */ 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, /* 180 */ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, /* 190 */ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, /* 200 */ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, /* 210 */ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, /* 220 */ 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, /* 230 */ 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, /* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, /* 250 */ 250, 251, 252, 253, 254, 255 }; tophat-2.0.9/src/fragments.h0000644000175000017500000000330512122334360014474 0ustar toortoor#ifndef FRAGMENTS_H #define FRAGMENTS_H /* * fragments.h * TopHat * * Created by Cole Trapnell on 1/14/09. * Copyright 2009 Cole Trapnell. All rights reserved. * */ #include "bwt_map.h" #include "align_status.h" typedef BowtieHit FragmentAlignment; struct FragmentAlignmentGrade { FragmentAlignmentGrade() { num_alignments = 0; status = AlignStatus(); } FragmentAlignmentGrade(const BowtieHit& h1, const JunctionSet& gtf_junctions, const JunctionSet& junctions, const InsertionSet& insertions, const DeletionSet& deletions, const FusionSet& fusions, const Coverage& coverage) { status = AlignStatus(h1, gtf_junctions, junctions, insertions, deletions, fusions, coverage); num_alignments = 1; } FragmentAlignmentGrade& operator=(const FragmentAlignmentGrade& rhs) { status = rhs.status; num_alignments = rhs.num_alignments; return *this; } // Returns true if rhs is a "happier" alignment for the ends of this insert // than this InsertStatus. bool operator<(const FragmentAlignmentGrade& rhs) { return status < rhs.status; } AlignStatus status; int num_alignments; // number of equally good alignments for this fragment }; typedef vector > > BestFragmentAlignmentTable; void best_fragment_mappings(uint64_t refid, const string& name, HitList& hits1_in_ref, ReadTable& it, BestFragmentAlignmentTable& best_status_for_fragments); void accept_best_hits(BestFragmentAlignmentTable& best_status_for_fragments); void accept_unique_hits(BestFragmentAlignmentTable& best_status_for_fragments); #endif tophat-2.0.9/src/bwt_map.h0000644000175000017500000007560312122334364014155 0ustar toortoor#ifndef BWT_MAP_H #define BWT_MAP_H #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include using namespace std; #include "common.h" #include "reads.h" #define _FBUF_SIZE 10*1024 /* * bwt_map.h * TopHat * * Created by Cole Trapnell on 11/17/08. * Copyright 2008 Cole Trapnell. All rights reserved. 
* */ enum CigarOpCode { CIGAR_NOTHING = 0, FUSION_NOTHING = 0, MATCH, mATCH, INS, iNS, DEL, dEL, FUSION_FF, FUSION_FR, FUSION_RF, FUSION_RR, REF_SKIP, rEF_SKIP, SOFT_CLIP, HARD_CLIP, PAD }; struct CigarOp { CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {} CigarOpCode opcode; uint32_t length; bool operator==(const CigarOp& rhs) const { return opcode == rhs.opcode && length == rhs.length; } }; typedef uint32_t ReadID; typedef uint32_t RefID; class RefSequenceTable; /* Stores the information from a single record of the bowtie map. A given read may have many of these. Reads up to 255bp are supported. */ struct BowtieHit { BowtieHit() : _ref_id(0), _ref_id2(0), _insert_id(0), _left(0), _antisense_splice(false), _antisense_aln(false), _mismatches(0), _edit_dist(0), _splice_mms(0), _alignment_score(0), _end(false){} BowtieHit(uint32_t ref_id, uint32_t ref_id2, ReadID insert_id, int left, int read_len, bool antisense, unsigned char mismatches, unsigned char edit_dist, bool end) : _ref_id(ref_id), _ref_id2(ref_id2), _insert_id(insert_id), _left(left), _cigar(vector(1,CigarOp(MATCH,read_len))), _antisense_splice(false), _antisense_aln(antisense), _mismatches(mismatches), _edit_dist(edit_dist), _splice_mms(0), _alignment_score(0), _end(end) { assert(_cigar.capacity() == _cigar.size()); } BowtieHit(uint32_t ref_id, uint32_t ref_id2, ReadID insert_id, int left, const vector& cigar, bool antisense_aln, bool antisense_splice, unsigned char mismatches, unsigned char edit_dist, unsigned char splice_mms, bool end) : _ref_id(ref_id), _ref_id2(ref_id2), _insert_id(insert_id), _left(left), _cigar(cigar), _antisense_splice(antisense_splice), _antisense_aln(antisense_aln), _mismatches(mismatches), _edit_dist(edit_dist), _splice_mms(splice_mms), _alignment_score(0), _end(end) { assert(_cigar.capacity() == _cigar.size()); } int read_len() const { uint32_t len = 0; for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; switch(op.opcode) { case MATCH: case mATCH: case INS: case iNS: case SOFT_CLIP: len += op.length; break; default: break; } } return len; } bool operator==(const BowtieHit& rhs) const { return (_insert_id == rhs._insert_id && _ref_id == rhs._ref_id && _ref_id2 == rhs._ref_id2 && _antisense_aln == rhs._antisense_aln && _left == rhs._left && _antisense_splice == rhs._antisense_splice && _edit_dist == rhs._edit_dist && /* DO NOT USE ACCEPTED IN COMPARISON */ _cigar == rhs._cigar); } bool operator<(const BowtieHit& rhs) const { if (_insert_id != rhs._insert_id) return _insert_id < rhs._insert_id; if (_ref_id != rhs._ref_id) return _ref_id < rhs._ref_id; if (_ref_id2 != rhs._ref_id2) return _ref_id2 < rhs._ref_id2; if (_left != rhs._left) return _left < rhs._left; if (_antisense_aln != rhs._antisense_aln) return _antisense_aln < rhs._antisense_aln; if (_mismatches != rhs._mismatches) return _mismatches < rhs._mismatches; if (_edit_dist != rhs._edit_dist) return _edit_dist < rhs._edit_dist; if (_cigar != rhs._cigar) { if (_cigar.size() != rhs._cigar.size()) return _cigar.size() < rhs._cigar.size(); for (size_t i = 0; i < _cigar.size(); ++i) { if (!(_cigar[i] == rhs._cigar[i])) return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length)); } } return false; } uint32_t ref_id() const { return _ref_id; } uint32_t ref_id2() const { return _ref_id2; } ReadID insert_id() const { return _insert_id; } int left() const { return _left; } int right() const { int r = _left; for (size_t i = 0; i < _cigar.size(); 
++i) { const CigarOp& op = _cigar[i]; switch(op.opcode) { case MATCH: case REF_SKIP: case DEL: r += op.length; break; case mATCH: case rEF_SKIP: case dEL: r -= op.length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: r = op.length; break; default: break; } } return r; } bool is_spliced() const { for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; if (op.opcode == REF_SKIP || op.opcode == rEF_SKIP) return true; } return false; } CigarOpCode fusion_opcode() const { for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR) return op.opcode; } return FUSION_NOTHING; } /* * checks whether its coordinate is increasing or decreasing * before its fusion or until the end. */ bool is_forwarding_left() const { for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL) return true; if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL) return false; if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR) break; } return true; } /* * checks whether its coordinate is increasing or decreasing * before its fusion or until the end. */ bool is_forwarding_right() const { for (int i = _cigar.size() - 1; i >= 0; --i) { const CigarOp& op = _cigar[i]; if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL) return true; if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL) return false; if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR) break; } return true; } bool antisense_splice() const { return _antisense_splice; } bool antisense_align() const { return _antisense_aln; } void antisense_align(bool antisense_align) { _antisense_aln = antisense_align; } bool antisense_align2() const { /* * antisense_splice is also used to indicate whether fusion is ff, fr, or rf. 
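 * The two letters give the direction of the alignment on each side of the
 * fusion breakpoint: FUSION_FF walks both references forward, FUSION_FR
 * walks the first forward and the second in reverse, FUSION_RF the
 * opposite, and FUSION_RR walks both in reverse; for FR/RF hits the
 * orientation of the second half is therefore the opposite of
 * antisense_align(), which is what this accessor reports.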
*/ CigarOpCode fusionOpCode = fusion_opcode(); if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR) return antisense_align(); return !antisense_align(); } BowtieHit reverse() const { BowtieHit result; result._ref_id = _ref_id2; result._ref_id2 = _ref_id; result._insert_id = _insert_id; uint32_t right, fusion_pos; right = fusion_pos = _left; for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; switch(op.opcode) { case MATCH: case REF_SKIP: case DEL: right += op.length; break; case mATCH: case rEF_SKIP: case dEL: right -= op.length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: fusion_pos = right; right = op.length; break; default: break; } } if (is_forwarding_left()) fusion_pos -= 1; else fusion_pos += 1; CigarOpCode fusionOpCode = fusion_opcode(); if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR) { if (is_forwarding_left()) result._left = right - 1; else result._left = right + 1; } else { if (fusionOpCode == FUSION_FR) result._left = right + 1; else result._left = right - 1; } result._cigar.clear(); for (int i = _cigar.size() - 1; i >= 0; --i) { CigarOp cigar = _cigar[i]; switch(cigar.opcode) { case MATCH: cigar.opcode = mATCH; break; case mATCH: cigar.opcode = MATCH; break; case INS: cigar.opcode = iNS; break; case iNS: cigar.opcode = INS; break; case DEL: cigar.opcode = dEL; break; case dEL: cigar.opcode = DEL; break; case REF_SKIP: cigar.opcode = rEF_SKIP; break; case rEF_SKIP: cigar.opcode = REF_SKIP; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: cigar.length = fusion_pos; break; default: break; } result._cigar.push_back(cigar); } if (fusionOpCode == FUSION_FR || fusionOpCode == FUSION_RF) result._antisense_aln = !_antisense_aln; else result._antisense_aln = _antisense_aln; result._antisense_splice = _antisense_splice; result._mismatches = _mismatches; result._edit_dist = _edit_dist; result._splice_mms = _splice_mms; result._end = _end; result._seq = _seq; reverse_complement(result._seq); result._qual = _qual; ::reverse(result._qual.begin(), result._qual.end()); return result; } unsigned char mismatches() const { return _mismatches; } void mismatches(unsigned char mm) { _mismatches = mm; } unsigned char edit_dist() const { return _edit_dist; } void edit_dist(unsigned char ed) { _edit_dist = ed; } unsigned char gap_length() const { return _edit_dist - _mismatches; } unsigned char splice_mms() const { return _splice_mms; } int alignment_score() const { return _alignment_score; } void alignment_score(int as) { _alignment_score = as; } // For convenience, if you just want a copy of the gap intervals // for this hit. 
void gaps(vector >& gaps_out) const { gaps_out.clear(); int pos = _left; for (size_t i = 0; i < _cigar.size(); ++i) { const CigarOp& op = _cigar[i]; switch(op.opcode) { case REF_SKIP: gaps_out.push_back(make_pair(pos, pos + op.length - 1)); pos += op.length; break; case rEF_SKIP: gaps_out.push_back(make_pair(pos, pos - op.length + 1)); pos -= op.length; break; case MATCH: case DEL: pos += op.length; break; case mATCH: case dEL: pos -= op.length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: pos = op.length; break; default: break; } } } const vector& cigar() const { return _cigar; } bool contiguous() const { return _cigar.size() == 1 && _cigar[0].opcode == MATCH; } const string& hitfile_rec() const { return _hitfile_rec; } void hitfile_rec(const string& rec) { _hitfile_rec = rec; } const string& seq() const { return _seq; } void seq(const string& seq) { _seq = seq; } const string& qual() const { return _qual; } void qual(const string& qual) { _qual = qual; } bool end() const { return _end; } void end(bool end) { _end = end; } // this is for debugging purpose bool check_editdist_consistency(const RefSequenceTable& rt, bool bDebug = false); private: uint32_t _ref_id; uint32_t _ref_id2; ReadID _insert_id; // Id of the sequencing insert int _left; // Position in the reference of the left side of the alignment vector _cigar; bool _antisense_splice; // Whether the junction spanned is on the reverse strand bool _antisense_aln; // Whether the alignment is to the reverse strand unsigned char _mismatches; unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record) unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction string _hitfile_rec; // Points to the buffer for the record from which this hit came string _seq; string _qual; int _alignment_score; // Bowtie2 outputs AS (alignment score) in SAM, TopHat2 uses the value when selecting the best alignments. 
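// (note added) Bowtie2 alignment scores are "higher is better"; when TopHat2 compares alternative alignments of a read, a larger _alignment_score marks the preferred hit.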
bool _end; // Whether this segment is the last one of the read it belongs to }; class ReadTable { public: ReadTable() : _next_id(1) {} // This function should NEVER return zero ReadID get_id(const string& name) { uint32_t _id = atoi(name.c_str()); //assert(_id); _next_id = max(_next_id, (size_t)_id); return _id; } uint32_t observation_order(ReadID ID) { if (ID == 0) return VMAXINT32; return ID; } size_t size() const { return _next_id; } private: size_t _next_id; }; inline bool REFID_Less(uint32_t ref_id1, uint32_t ref_id2) { return false; } inline bool REFID_Equal(uint32_t ref_id1, uint32_t ref_id2) { return ref_id1 == ref_id2; } class RefSequenceTable { public: typedef seqan::String > > Sequence; struct SequenceInfo { SequenceInfo(uint32_t _order, char* _name, Sequence* _seq, uint32_t _len) : observation_order(_order), name(_name), seq(_seq), len(_len) {} uint32_t observation_order; char* name; Sequence* seq; uint32_t len; }; typedef map IDTable; typedef IDTable::iterator iterator; typedef IDTable::const_iterator const_iterator; RefSequenceTable(bool keep_names) : _next_id(1), _keep_names(keep_names) {} RefSequenceTable(const string& sam_header_filename, bool keep_names) : _next_id(1), _keep_names(keep_names) { if (sam_header_filename != "") { samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0); if (fh == 0) { fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str()); exit(1); } for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i) { const char* name = fh->header->target_name[i]; uint32_t len = fh->header->target_len[i]; get_id(name, NULL, len); // fprintf(stderr, "SQ: %s - %u\n", name, len); } // order_recs_lexicographically(); samclose(fh); } } ~RefSequenceTable() { for (IDTable::iterator i = _by_name.begin(); i != _by_name.end(); ++i) { free(i->second.name); } } // This function should NEVER return zero uint32_t get_id(const string& name, Sequence* seq = NULL, uint32_t len = 0) { pair ret = _by_name.insert(make_pair(name, SequenceInfo(_next_id, NULL, NULL, 0))); if (ret.second == true) { char* _name = NULL; if (_keep_names) _name = strdup(name.c_str()); ret.first->second.name = _name; ret.first->second.seq = seq; ret.first->second.len = len; ret.first->second.observation_order = _next_id; //assert (_refid_to_hash.size() + 1 == _next_id); _refid_to_name.push_back (name); ++_next_id; } else { if (seq) { ret.first->second.seq = seq; ret.first->second.len = len; } } return ret.first->second.observation_order; } const char* get_name(uint32_t ID) const { const string& name = _refid_to_name[ID-1]; IDTable::const_iterator itr = _by_name.find(name); if (itr != _by_name.end()) return itr->second.name; else return NULL; } uint32_t get_len(uint32_t ID) const { const string& name = _refid_to_name[ID-1]; IDTable::const_iterator itr = _by_name.find(name); if (itr != _by_name.end()) return itr->second.len; else return 0; } Sequence* get_seq(uint32_t ID) const { assert (ID > 0 && ID <= _refid_to_name.size()); const string& name = _refid_to_name[ID-1]; IDTable::const_iterator itr = _by_name.find(name); if (itr != _by_name.end()) return itr->second.seq; else return NULL; } const SequenceInfo* get_info(uint32_t ID) const { assert (ID > 0 && ID <= _refid_to_name.size()); const string& name = _refid_to_name[ID-1]; IDTable::const_iterator itr = _by_name.find(name); if (itr != _by_name.end()) { return &(itr->second); } else return NULL; } uint32_t observation_order(uint32_t ID) const { return ID; } iterator begin() { return _by_name.begin(); } iterator end() { return 
_by_name.end(); } const_iterator begin() const { return _by_name.begin(); } const_iterator end() const { return _by_name.end(); } size_t size() const { return _by_name.size(); } void clear() { _by_name.clear(); } // strnum_cmp is taken from samtools. static inline int strnum_cmp(const string &a, const string &b) { char *pa = (char*)a.c_str(), *pb = (char*)b.c_str(); while (*pa && *pb) { if (isdigit(*pa) && isdigit(*pb)) { long ai, bi; ai = strtol(pa, &pa, 10); bi = strtol(pb, &pb, 10); if (ai != bi) return ai vStr; for (IDTable::iterator i = _by_name.begin(); i != _by_name.end(); ++i) { vStr.push_back(i->first); } ::sort(vStr.begin(), vStr.end(), RefSequenceTable::strnum_cmp); _refid_to_name.clear(); size_t new_order = 1; for (vector::iterator i = vStr.begin(); i != vStr.end(); ++i, ++new_order) { _by_name.find(*i)->second.observation_order = new_order; _refid_to_name.push_back(*i); } } private: uint32_t _next_id; bool _keep_names; IDTable _by_name; vector _refid_to_name; }; bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2); typedef vector HitList; /* This class stores all the hits from a Bowtie map */ class HitTable { public: typedef map RefHits; typedef RefHits::const_iterator const_iterator; typedef RefHits::iterator iterator; HitTable() : _total_hits(0) {} const_iterator begin() const { return _hits_for_ref.begin(); } const_iterator end() const { return _hits_for_ref.end(); } iterator begin() { return _hits_for_ref.begin(); } iterator end() { return _hits_for_ref.end(); } void add_hit(const BowtieHit& bh, bool check_uniqueness); void finalize() { for (RefHits::iterator i = _hits_for_ref.begin(); i != _hits_for_ref.end(); ++i) { sort(i->second.begin(), i->second.end(), hit_insert_id_lt); } } HitList* get_hits(uint64_t ref_id) { RefHits::iterator i = _hits_for_ref.find(ref_id); if (i == _hits_for_ref.end()) return NULL; else return &(i->second); } uint32_t total_hits() const { return _total_hits; } private: RefHits _hits_for_ref; uint32_t _total_hits; }; class HitStream; class HitFactory { friend class HitStream; public: HitFactory(ReadTable& insert_table, RefSequenceTable& reference_table) : _insert_table(insert_table), _ref_table(reference_table) {} virtual ~HitFactory() {} virtual void openStream(HitStream& hs)=0; virtual void rewind(HitStream& hs)=0; virtual void seek(HitStream& hs, int64_t offset)=0; virtual void closeStream(HitStream& hs)=0; BowtieHit create_hit(const string& insert_name, const string& ref_name, const string& ref_name2, int left, const vector& cigar, bool antisense_aln, bool antisense_splice, unsigned char mismatches, unsigned char edit_dist, unsigned char splice_mms, bool end); BowtieHit create_hit(const string& insert_name, const string& ref_name, uint32_t left, uint32_t read_len, bool antisense_aln, unsigned char mismatches, unsigned char edit_dist, bool end); virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0; virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0; virtual bool get_hit_from_buf(const char* bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out = NULL, char* name_tags = NULL, char* seq = NULL, char* qual = NULL) = 0; protected: ReadTable& _insert_table; RefSequenceTable& _ref_table; HitStream* _hit_stream; };//class HitFactory class LineHitFactory : public HitFactory { //for text line-based formats like Bowtie and SAM public: LineHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table) : HitFactory(insert_table, reference_table) {} string hitfile_rec(HitStream& hs, const char* 
hit_buf) { string r(hit_buf); return r; } void openStream(HitStream& hs); void rewind(HitStream& hs); void seek(HitStream&hs, int64_t offset); void closeStream(HitStream& hs); bool next_record(HitStream& hs, const char*& buf, size_t& buf_size); protected: static const size_t _hit_buf_max_sz = 10 * 1024; char _hit_buf[_hit_buf_max_sz]; int _line_num; }; class BowtieHitFactory : public LineHitFactory { public: BowtieHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table) : LineHitFactory(insert_table, reference_table) {} }; class SplicedBowtieHitFactory : public LineHitFactory { public: SplicedBowtieHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table, int anchor_length) : LineHitFactory(insert_table, reference_table), _anchor_length(anchor_length){} private: int _anchor_length; int _seg_offset; int _size_buf; }; class SplicedSAMHitFactory : public LineHitFactory { public: SplicedSAMHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table, int anchor_length) : LineHitFactory(insert_table, reference_table), _anchor_length(anchor_length){} bool get_hit_from_buf(const char* bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out = NULL, char* name_tags = NULL, char* seq = NULL, char* qual = NULL); private: int _anchor_length; int _seg_offset; int _size_buf; }; class SAMHitFactory : public LineHitFactory { public: SAMHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table) : LineHitFactory(insert_table, reference_table) {} bool get_hit_from_buf(const char* bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out = NULL, char* name_tags = NULL, char* seq = NULL, char* qual = NULL); }; /****************************************************************************** BAMHitFactory turns SAM alignments into BowtieHits *******************************************************************************/ class BAMHitFactory : public HitFactory { public: BAMHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table, bam_header_t* sam_header = NULL) : HitFactory(insert_table, reference_table), _sam_header(sam_header), _sam_header_destroyed(false) { } void openStream(HitStream& hs); void rewind(HitStream& hs); void seek(HitStream& hs, int64_t offset); void closeStream(HitStream& hs); bool next_record(HitStream& hs, const char*& buf, size_t& buf_size); string hitfile_rec(HitStream& hs, const char* hit_buf); bool get_hit_from_buf(const char* bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out = NULL, char* name_tags = NULL, char* seq = NULL, char* qual = NULL); void set_sam_header(bam_header_t* header) { _sam_header = header; } protected: //int64_t _curr_pos; //int64_t _beginning; bam1_t _next_hit; bam_header_t* _sam_header; bool inspect_header(HitStream& hs); bool _sam_header_destroyed; }; class SplicedBAMHitFactory : public BAMHitFactory { public: SplicedBAMHitFactory(ReadTable& insert_table, RefSequenceTable& reference_table, bam_header_t* sam_header = NULL, int anchor_length = 4) : BAMHitFactory(insert_table, reference_table, sam_header), _anchor_length(anchor_length) { } bool get_hit_from_buf(const char* bwt_buf, BowtieHit& bh, bool strip_slash, char* name_out = NULL, char* name_tags = NULL, char* seq = NULL, char* qual = NULL); private: int _anchor_length; int _seg_offset; int _size_buf; }; struct HitsForRead { HitsForRead() : insert_id(0) {} uint64_t insert_id; vector hits; }; class HitStream { friend class HitFactory; friend class LineHitFactory; friend class BAMHitFactory; //private: HitFactory* _factory; bool _spliced; bool 
_strip_slash; BowtieHit buffered_hit; bool _keep_bufs; bool _keep_seqs; bool _keep_quals; bool _from_bowtie; void* _hit_file; string _hit_file_name; FZPipe* _fzpipe; bool _eof; public: HitStream(void* hit_file, //could be FILE* or samfile_t* HitFactory* hit_factory, bool spliced, bool strip_slash, bool keep_bufs, bool keep_seqs = false, bool keep_quals = false, bool from_bowtie = false) : _factory(hit_factory), _spliced(spliced), _strip_slash(strip_slash), buffered_hit(BowtieHit()), _keep_bufs(keep_bufs), _keep_seqs(keep_seqs), _keep_quals(keep_quals), _from_bowtie(from_bowtie), _hit_file(hit_file), _hit_file_name(), _fzpipe(NULL), _eof(false) { primeStream(); } HitStream(const string& hit_filename, HitFactory* hit_factory, bool spliced, bool strip_slash, bool keep_bufs, bool keep_seqs = false, bool keep_quals = false, bool from_bowtie = false) : _factory(hit_factory), _spliced(spliced), _strip_slash(strip_slash), buffered_hit(BowtieHit()), _keep_bufs(keep_bufs), _keep_seqs(keep_seqs), _keep_quals(keep_quals), _from_bowtie(from_bowtie), _hit_file(NULL), _hit_file_name(hit_filename), _fzpipe(NULL), _eof(false) { _factory->openStream(*this); primeStream(); } HitStream(FZPipe& hit_filepipe, HitFactory* hit_factory, bool spliced, bool strip_slash, bool keep_bufs, bool keep_seqs = false, bool keep_quals = false, bool from_bowtie = false) : _factory(hit_factory), _spliced(spliced), _strip_slash(strip_slash), buffered_hit(BowtieHit()), _keep_bufs(keep_bufs), _keep_seqs(keep_seqs), _keep_quals(keep_quals), _from_bowtie(from_bowtie), _hit_file(NULL), _hit_file_name(), _fzpipe(&hit_filepipe), _eof(false) { _hit_file=_fzpipe->file; primeStream(); } void primeStream() { //why? // Prime the stream by reading a single hit into the buffered_hit HitsForRead dummy = HitsForRead(); next_read_hits(dummy); } bool eof() { return _eof; } bool ready() { return (_hit_file!=NULL); } void reset() { _factory->rewind(*this); _eof=false; // re-prime the stream; buffered_hit = BowtieHit(); primeStream(); } void seek(int64_t offset) { _factory->seek(*this, offset); _eof = false; buffered_hit = BowtieHit(); primeStream(); } bool next_read_hits(HitsForRead& hits_for_read) { hits_for_read.hits.clear(); hits_for_read.insert_id = 0; //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0)) // return false; if (!this->ready()) //err_die("Error: next_read_hits() called on HitFactory with no file handle\n"); return false; if (this->eof() && buffered_hit.insert_id() == 0) { return false; } //char bwt_buf[2048]; bwt_buf[0] = 0; char bwt_seq[2048]; bwt_seq[0] = 0; char bwt_qual[2048]; bwt_qual[0] = 0; char* seq = _keep_seqs ? bwt_seq : NULL; char* qual = _keep_quals ? bwt_qual : NULL; hits_for_read.insert_id = buffered_hit.insert_id(); if (hits_for_read.insert_id) hits_for_read.hits.push_back(buffered_hit); const char* hit_buf; size_t hit_buf_size = 0; while (true) { if (!_factory->next_record(*this, hit_buf, hit_buf_size)) { buffered_hit = BowtieHit(); break; } //string clean_buf = bwt_buf; // Get a new record from the tab-delimited Bowtie map BowtieHit bh; if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash, NULL, NULL, seq, qual)) { if (_keep_bufs) bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf)); if (_keep_seqs) bh.seq(seq); if (_keep_quals) { // when it comes to convert from qual in color to qual in bp, // we need to fill in the two extream qual values using the adjacent qual values. 
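// (worked example, added) for a 6-value colour qual string "XABCDY" the code below copies the neighbouring values into the two boundary positions, giving "AABCDD".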
size_t qual_len = strlen(qual); if (color && qual_len > 2) { qual[0] = qual[1]; qual[qual_len-1] = qual[qual_len-2]; } bh.qual(qual); } if (bh.insert_id() == hits_for_read.insert_id) { hits_for_read.hits.push_back(bh); } else { buffered_hit = bh; break; } } //hit parsed } //while reading hits return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0); } uint64_t next_group_id() const { return buffered_hit.insert_id(); } bool fromBowtie() { return _from_bowtie; } }; typedef uint32_t MateStatusMask; void get_mapped_reads(FILE* bwtf, HitTable& hits, HitFactory& hit_factory, bool strip_slash, bool verbose = false); //bool left_status_better(MateStatusMask left, MateStatusMask right); //bool status_equivalent(MateStatusMask left, MateStatusMask right); typedef uint32_t MateStatusMask; void add_hits_to_coverage(const HitList& hits, vector& DoC); void add_hit_to_coverage(const BowtieHit& bh, vector& DoC); void accept_all_hits(HitTable& hits); int gap_length(const vector& cigar); //print BowtieHit as BAM record void print_bamhit(GBamWriter& wbam, const char* read_name, const BowtieHit& bh, const char* ref_name, const char* ref_name2, const char* sequence, const char* qualities, bool from_bowtie = false, const vector* extra_fields = NULL); void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual, char* cigar1, char* cigar2, string& seq1, string& seq2, string& qual1, string& qual2, int& left1, int& left2); /** * Convert a vector of CigarOps to a string representation */ std::string print_cigar(const vector& bh_cigar); /** * Calculate bowtie (1 or 2) related extra SAM fields such as * AS:i (alignment score) * MD:Z * NM:i * etc */ void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector& fields); #endif tophat-2.0.9/src/reads.h0000644000175000017500000002326012162605263013615 0ustar toortoor#ifndef READS_H #define READS_H /* * reads.h * TopHat * * Created by Cole Trapnell on 9/2/08. * Copyright 2008 Cole Trapnell. All rights reserved. * */ #include #include #include #include #include #include "common.h" using std::string; // Note: qualities are not currently used by TopHat struct Read { Read() { //seq.reserve(MAX_READ_LEN); //qual.reserve(MAX_READ_LEN); } string name; string seq; string alt_name; string qual; bool lengths_equal() { return seq.length() == qual.length(); } void clear() { name.clear(); seq.clear(); qual.clear(); alt_name.clear(); } }; void reverse_complement(string& seq); string str_convert_color_to_bp(const string& color); seqan::String convert_color_to_bp(char base, const seqan::String& color); string convert_bp_to_color(const string& bp, bool remove_primer = false); seqan::String convert_bp_to_color(const seqan::String& bp, bool remove_primer = false); /* This is a dynamic programming to decode a colorspace read, which is from BWA paper. 
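(note added) BWA_decode() below uses that dynamic program to pick, position by position, the base sequence most consistent with the observed colors, their qualities and the reference; the method follows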
Heng Li and Richard Durbin Fast and accurate short read alignment with Burrows-Wheeler transform */ void BWA_decode(const string& color, const string& qual, const string& ref, string& decode); template string DnaString_to_string(const Type& dnaString) { std::string result; std::stringstream ss(std::stringstream::in | std::stringstream::out); ss << dnaString >> result; return result; } class ReadTable; class FLineReader { //simple text line reader class, buffering last line read int len; int allocated; char* buf; bool isEOF; FILE* file; bool is_pipe; bool pushed; //pushed back uint64_t lcount; //counting all lines read by the object public: char* chars() { return buf; } char* line() { return buf; } uint64_t readcount() { return lcount; } //number of lines read int length() { return len; } //length of the last line read bool isEof() {return isEOF; } char* nextLine(); FILE* fhandle() { return file; } void pushBack() { if (lcount) pushed=true; } // "undo" the last getLine request // so the next call will in fact return the same line FLineReader(FILE* stream=NULL) { len=0; isEOF=false; is_pipe=false; allocated=512; buf=(char*)malloc(allocated); lcount=0; buf[0]=0; file=stream; pushed=false; } FLineReader(FZPipe& fzpipe) { len=0; isEOF=false; allocated=512; buf=(char*)malloc(allocated); lcount=0; buf[0]=0; file=fzpipe.file; is_pipe=!fzpipe.pipecmd.empty(); pushed=false; } void reset(FZPipe& fzpipe) { lcount=0; buf[0]=0; file=fzpipe.file; is_pipe=!fzpipe.pipecmd.empty(); pushed=false; } void close() { if (file==NULL) return; if (is_pipe) pclose(file); else fclose(file); } ~FLineReader() { free(buf); //does not call close() -- we might reuse the file handle } }; void skip_lines(FLineReader& fr); bool next_fasta_record(FLineReader& fr, string& defline, string& seq, ReadFormat reads_format); bool next_fastq_record(FLineReader& fr, const string& seq, string& alt_name, string& qual, ReadFormat reads_format); bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format=FASTQ, FLineReader* frq=NULL); #define READSTREAM_BUF_SIZE 500000 struct QReadData { //read data for the priority queue uint64_t id; Read read; char trashCode; //ZT tag value int8_t matenum; //mate number (1,2) 0 if unpaired QReadData():id(0),read(),trashCode(0), matenum(0) { } QReadData(uint64_t rid, Read& rd, bam1_t* bd=NULL): id(rid), read(rd), trashCode(0), matenum(0) { if (bd) { if (bd->core.flag & BAM_FREAD1) { matenum=1; } else if (bd->core.flag & BAM_FREAD2) matenum=2; GBamRecord bamrec(bd); trashCode=bamrec.tag_char("ZT"); } } }; //callback struct for ReadStream::getRead() - called for each read in the stream struct GetReadProc { GBamWriter* um_out; //skipped (unmapped) reads will be written here int64_t* unmapped_counter; int64_t* multimapped_counter; //char um_code; GetReadProc(GBamWriter* bamw=NULL, int64_t* um_counter=NULL, int64_t* mm_counter=NULL): um_out(bamw), unmapped_counter(um_counter), multimapped_counter(mm_counter) { } virtual bool process(QReadData& rdata, bool& found, bool is_unmapped) { //should return True - if it returns FALSE it will cause getRead() to abort //(stops looking for target readId in the stream) and to return false (="not found") return true; } virtual ~GetReadProc() { } }; class ReadStream { FLineReader* flquals; FLineReader* flseqs; bool stream_copy; bam1_t* b; bool bam_alt_name; //from BAM files, look for alt_name tag to retrieve the original read name bool bam_ignoreQC; //from BAM files, ignore QC flag (return the next read even if it has QC fail) protected: struct ReadOrdering { 
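// (note added) comparing with ">" makes std::priority_queue, a max-heap by default, behave as a min-heap on read id, so buffered reads are popped in increasing id order.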
bool operator()(QReadData& lhs, QReadData& rhs) { return (lhs.id > rhs.id); } }; FZPipe fstream; FZPipe* fquals; size_t ReadBufSize; std::priority_queue< QReadData, std::vector, ReadOrdering > read_pq; uint64_t last_id; //keep track of last requested ID, for consistency check bool r_eof; bool next_read(QReadData& rdata, ReadFormat read_format=FASTQ); //get top read from the queue public: ReadStream(int bufsize=READSTREAM_BUF_SIZE):flquals(NULL), flseqs(NULL), stream_copy(false), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(), fquals(NULL),ReadBufSize(bufsize), read_pq(), last_id(0), r_eof(false) { } ReadStream(const string& fname, FZPipe* pquals=NULL, bool guess_packer=false):flquals(NULL), flseqs(NULL), stream_copy(false), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(), fquals(pquals), ReadBufSize(READSTREAM_BUF_SIZE), read_pq(), last_id(0), r_eof(false) { init(fname, pquals, guess_packer); } ReadStream(FZPipe& f_stream, FZPipe* pquals=NULL):flquals(NULL), flseqs(NULL), stream_copy(true), b(NULL), bam_alt_name(false), bam_ignoreQC(false), fstream(f_stream), fquals(pquals), ReadBufSize(READSTREAM_BUF_SIZE), read_pq(), last_id(0), r_eof(false) { //init(f_stream, pquals); if (fstream.is_bam) { b = bam_init1(); } else { flseqs=new FLineReader(fstream.file); skip_lines(*flseqs); } fquals=pquals; if (fquals) { flquals=new FLineReader(fquals->file); skip_lines(*flquals); } } void use_alt_name(bool v=true) { bam_alt_name=v; } void ignoreQC(bool v=true) { bam_ignoreQC=v; } void init(const string& fname, FZPipe* pquals=NULL, bool guess_packer=false) { if (fname.empty()) return; if (fstream.openRead(fname, guess_packer)==NULL) { fprintf(stderr, "Warning: couldn't open file %s\n",fname.c_str()); return; } if (fstream.is_bam) { if (b==NULL) { b = bam_init1(); } } else { if (b) { bam_destroy1(b); b=NULL; } flseqs=new FLineReader(fstream.file); skip_lines(*flseqs); } fquals=pquals; if (fquals) { flquals=new FLineReader(fquals->file); skip_lines(*flquals); } } void init(FZPipe& f_stream, FZPipe* pquals=NULL) { fstream=f_stream; //Warning - original copy may end up with invalid (closed) file handle stream_copy=true; if (fstream.file==NULL) { fprintf(stderr, "Warning: ReadStream not open.\n"); return; } if (fstream.is_bam) { if (b==NULL) { b = bam_init1(); } } else { if (b) { bam_destroy1(b); b=NULL; } flseqs=new FLineReader(fstream.file); skip_lines(*flseqs); } fquals=pquals; if (fquals) { flquals=new FLineReader(fquals->file); skip_lines(*flquals); } } //unbuffered reading from stream bool get_direct(Read& read, ReadFormat read_format=FASTQ); bool isBam() { return fstream.is_bam; } bam1_t* last_b() {//return the latest SAM record data fetched by get_direct() //must only be called after get_direct() return b; } const char* filename() { return fstream.filename.c_str(); } //read_ids must ALWAYS be requested in increasing order bool getRead(uint64_t read_id, Read& read, ReadFormat read_format=FASTQ, bool strip_slash=false, uint64_t begin_id = 0, uint64_t end_id=std::numeric_limits::max(), GetReadProc* rProc=NULL, bool is_unmapped=false //the target read, when found is also written by //rProc into the unmapped BAM file /* GBamWriter* um_out=NULL, //skipped (unmapped) reads will be written here // char um_code=0 int64_t* unmapped_counter=NULL, int64_t* multimapped_counter=NULL */ ); void rewind() { fstream.rewind(); clear(); if (flseqs) { flseqs->reset(fstream); skip_lines(*flseqs); } if (flquals) { flquals->reset(*fquals); } } void seek(int64_t offset) { clear(); 
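// (note added) clear() above empties the read_pq buffer so that reads queued before the seek are not replayed at the new file position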
fstream.seek(offset); } FILE* file() { return fstream.file; } void clear() { read_pq=std::priority_queue< QReadData, std::vector, ReadOrdering > (); } void close() { clear(); fstream.close(); } ~ReadStream() { close(); if (b) { bam_destroy1(b); } if (flquals) delete flquals; if (flseqs) delete flseqs; } }; #endif tophat-2.0.9/src/coverage.h0000755000175000017500000000264012122334360014305 0ustar toortoor#ifndef COVERAGE_H #define COVERAGE_H /* * coverage.h * TopHat * * Created by Daehwan Kim on 2/11/2012 */ #include #include #include #include #include #include "common.h" #include "bwt_map.h" using namespace std; typedef int8_t cov_t; typedef std::map > PosCoverage; typedef std::map GenomeCoverage; static const cov_t cov_max_value = std::numeric_limits::max(); static const cov_t cov_min_value = std::numeric_limits::min(); class Coverage { public: Coverage(); ~Coverage(); public: // this should be called before calculate_coverage() call. void add_coverage(RefID refid, int pos, int length); void merge_with(const Coverage& other); // calculate real coverage instead of difference void calculate_coverage(); // this can be called after calculate_coverage() call. int get_coverage(RefID refid, int pos) const; void clear(); // for debug purposes; void print_info() const; void print_info(const PosCoverage& posCoverage, int begin = 0, int end = std::numeric_limits::max()) const; void print_info(int pos, const vector& cov) const; private: void merge_contig(int pos, vector& cov, int pos2, const vector& cov2); PosCoverage::iterator get_contig(PosCoverage& posCoverage, int pos); private: GenomeCoverage genomeCoverage; public: bool debug; }; #endif tophat-2.0.9/src/GList.hh0000644000175000017500000005053512157116165013720 0ustar toortoor//--------------------------------------------------------------------------- /* Sortable collections of objects and object pointers */ #ifndef _GList_HH #define _GList_HH #include "GVec.hh" #define GLIST_SORTED_ERR "Operation not allowed on a sorted list!\n" #define GLIST_UNSORTED_ERR "Operation not allowed on an unsorted list!\n" //------ useful macros: #define BE_UNSORTED if (fCompareProc!=NULL) { GError(GLIST_SORTED_ERR); return; } #define BE_SORTED if (fCompareProc==NULL) { GError(GLIST_UNSORTED_ERR); return; } #define SORTED (fCompareProc!=NULL) #define UNSORTED (fCompareProc==NULL) // GArray is the sortable array type, requires the comparison operator < to be defined template class GArray:public GVec { protected: bool fUnique; static int DefaultCompareProc(const pointer item1, const pointer item2) { //operator< MUST be defined for OBJ class! 
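// (note added) builds a qsort-style three-way result from operator< alone: returns 1 when item1 > item2, -1 when item1 < item2, and 0 when neither compares less than the other.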
if (*((OBJ*)item2) < *((OBJ*)item1)) return 1; else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1; else return 0; } GCompareProc* fCompareProc; public: GArray(GCompareProc* cmpFunc=NULL); GArray(bool sorted, bool unique=false); GArray(int init_capacity, bool sorted, bool unique=false); GArray(GArray& array); //copy constructor const GArray& operator=(GArray& array); //~GArray(); //assignment operator void setSorted(GCompareProc* cmpFunc); void setSorted(bool sorted) { if (sorted) { if (fCompareProc!=&DefaultCompareProc) { fCompareProc=&DefaultCompareProc; Sort(); } } else fCompareProc=NULL; } //sort the array if cmpFunc not NULL or changes int Add(OBJ* item); // specific implementation if sorted int Add(OBJ& item) { return Add(&item); } //both will CREATE a new OBJ and COPY to it // using OBJ new operator= int cAdd(OBJ item) { return Add(&item); } int cPush(OBJ item) { return Add(&item); } int Push(OBJ& item) { return Add(&item); } void Add(GArray& list); //add copies of all items from another list //this will reject identical items in sorted lists only! void setUnique(bool beUnique) { fUnique = beUnique; }; void Sort(); //explicit sort may be requested bool Sorted() { return fCompareProc!=NULL; } void Replace(int idx, OBJ& item); //Put, use operator= to copy int Unique() { return fUnique; } int IndexOf(OBJ& item); //this needs the == operator to have been defined for OBJ bool Found(OBJ& item, int& idx); // for sorted arrays only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which fCompareProc returns 0 bool Exists(OBJ& item); //same as above without existing index info //unsorted only, place item at position idx: void Move(int curidx, int newidx); void Insert(int idx, OBJ* item); void Insert(int idx, OBJ item) { Insert(idx,&item); } }; //GList is a sortable collection of pointers to objects; requires operator< to be defined, or a custom compare function template class GList:public GPVec { protected: bool fUnique; GCompareProc* fCompareProc; //a pointer to a Compare function static int DefaultCompareProc(const pointer item1, const pointer item2) { //operator< MUST be defined for OBJ class! if (*((OBJ*)item2) < *((OBJ*)item1)) return 1; else if (*((OBJ*)item1) < *((OBJ*)item2)) return -1; else return 0; } public: void sortInsert(int idx, OBJ* item); GList(GCompareProc* compareProc=NULL); //free by default GList(GCompareProc* compareProc, //unsorted by default GFreeProc *freeProc, bool beUnique=false); GList(bool sorted, bool free_elements=true, bool beUnique=false); GList(int init_capacity, bool sorted, bool free_elements=true, bool beUnique=false); GList(GList& list); //copy constructor? GList(GList* list); //kind of a copy constructor const GList& operator=(GList& list); //void Clear(); //~GList(); void setSorted(GCompareProc* compareProc); //sorted if compareProc not NULL; sort the list if compareProc changes ! bool Sorted() { return fCompareProc!=NULL; } void setSorted(bool sorted) { if (sorted) { if (fCompareProc!=&DefaultCompareProc) { fCompareProc=&DefaultCompareProc; Sort(); } } else fCompareProc=NULL; } int Add(OBJ* item); //-- specific implementation if sorted void Add(GList& list); //add all pointers from another list OBJ* AddIfNew(OBJ* item, bool deleteIfFound=true, int* fidx=NULL); // default: delete item if Found() (and pointers are not equal)! 
//returns the equal (==) object if it's in the list already //or the item itself if it is unique and actually added int AddedIfNew(OBJ* item); // if Found(item) (and pointers are not equal) delete item and returns -1 // if added, returns the new item index int Unique() { return fUnique; } //this will reject identical items in sorted lists only! void setUnique(bool beUnique) { fUnique = beUnique; }; GCompareProc* GetCompareProc() {return fCompareProc;} int IndexOf(OBJ* item); //this has a specific implementation for sorted lists //if list is sorted, item data is located by binary search //based on the Compare function //if not, a linear search is performed, but //this needs the == operator to have been defined for OBJ void Put(int idx, OBJ* item, bool re_sort=false); bool Found(OBJ* item, int & idx); // sorted only; //search by content; if found, returns true and idx will be the index //of the first item found matching for which GTCompareProc returns 0 bool Exists(OBJ* item); //same as above without existing index info bool Exists(OBJ& item); //same as above without existing index info void Sort(); //explicit sort may be requested using this function int Remove(OBJ* item); //search for pointer, using binary search if sorted void Insert(int idx, OBJ* item); //unsorted only, place item at position idx void Move(int curidx, int newidx); }; //GList //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GArray::GArray(GArray& array):GVec(0) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; if (this->fCapacity>0) { //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ)); this->fArray=new OBJ[this->fCapacity]; } this->fCount=array.fCount; fUnique=array.fUnique; fCompareProc=array.fCompareProc; // uses OBJ operator= for (int i=0;ifCount;i++) this->fArray[i]=array[i]; } template const GArray& GArray::operator=(GArray& array) { if (&array==this) return *this; GVec::Clear(); this->fCount=array.fCount; this->fUnique=array.fUnique; this->fCapacity=array.fCapacity; if (this->fCapacity>0) { //GMALLOC(this->fArray, this->fCapacity*sizeof(OBJ)); this->fArray=new OBJ[this->fCapacity]; } this->fCompareProc=array.fCompareProc; this->fCount=array.fCount; // uses OBJ operator= for (int i=0;ifCount;i++) { this->fArray[i]=array[i]; } return *this; } template GArray::GArray(GCompareProc* cmpFunc):GVec(0) { fCompareProc = cmpFunc; fUnique = false; //only affects sorted lists } template GArray::GArray(bool sorted, bool unique):GVec(0) { fUnique=unique; fCompareProc = sorted ? DefaultCompareProc : NULL; } template GArray::GArray(int init_capacity, bool sorted, bool unique):GVec(init_capacity) { fUnique=unique; fCompareProc=sorted ? DefaultCompareProc : NULL; } template void GArray::setSorted(GCompareProc* cmpFunc) { GCompareProc* old_proc=fCompareProc; fCompareProc=cmpFunc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GArray::IndexOf(OBJ& item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GArray::Exists(OBJ& item) { int result=0; if (Found(item, result)) return true; else return false; } template int GArray::Add(OBJ* item) { if (item==NULL) return -1; int result; if (SORTED) { if (Found(*item, result)) if (fUnique) return -1; //cannot add a duplicate! //Found sets result to the position where the item should be! 
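// (note added) even when Found() returns false it leaves result at the insertion point, so inserting there keeps the array sorted without a full re-sort.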
GVec::Insert(result, *item); } else { if (fUnique && Found(*item,result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GVec::Grow(); this->fArray[result] = *item; //operator=, copies the item this->fCount++; } return result; } template void GArray::Add(GArray& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;isetCapacity(this->fCapacity+list.fCount); int s=this->fCount; for (int i=0;ifArray[s+i]=list.fArray[i]; this->fCount+=list.fCount; } } template bool GArray::Found(OBJ& item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! int i; idx=-1; if (this->fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on fCompareProc //do the simplest tests first: if ((*fCompareProc)(&(this->fArray[0]),&item)>0) { idx=0; return false; } if ((*fCompareProc)(&item, &(this->fArray[this->fCount-1]))>0) { idx=this->fCount; return false; } int l=0; int h = this->fCount - 1; int c; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(&(this->fArray[i]), &item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { //found! idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (this->fArray[i]==item) { //requires operator== idx=i; return true; } i++; } return false; } } template void GArray::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //forbid this operation on sorted data GVec::Insert(idx, item); } template void GArray::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! if (curidx!=newidx || newidx>=this->fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=this->fArray[curidx]; //copy constructor here this->fArray[curidx]=this->fArray[newidx]; this->fArray[newidx]=tmp; } template void GArray::Replace(int idx, OBJ& item) { //TEST_INDEX(idx); if (idx<0 || idx>=this->fCount) GError(GVEC_INDEX_ERR, __FILE__,__LINE__, idx); this->fArray[idx]=item; if ( SORTED ) Sort(); //re-sort ! 
this could be very expensive, don't do it } template void GArray::Sort() { if (fCompareProc==NULL) { fCompareProc=DefaultCompareProc; } if (this->fArray!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GList implementation -- sortable array of pointers to OBJ template GList::GList(GList& list):GPVec(list) { //copy constructor fUnique=list.fUnique; fCompareProc=list.fCompareProc; } template GList::GList(GList* plist):GPVec(0) { //another copy constructor this->fCapacity=plist->fCapacity; this->fList=NULL; if (this->fCapacity>0) { GMALLOC(this->fList, this->fCapacity*sizeof(OBJ*)); } fUnique=plist->fUnique; fCompareProc=plist->fCompareProc; this->fFreeProc=plist->fFreeProc; this->fCount=plist->fCount; memcpy(this->fList, plist->fList, this->fCount*sizeof(OBJ*)); //for (int i=0;ifCount;i++) Add(plist->Get(i)); } template void GList::Add(GList& list) { if (list.Count()==0) return; if (SORTED) { for (int i=0;isetCapacity(this->fCapacity+list.fCount); memcpy( & (this->fList[this->fCount]), list.fList, list.fCount*sizeof(OBJ*)); this->fCount+=list.fCount; } } template GList::GList(GCompareProc* compareProc, GFreeProc* freeProc, bool beUnique) { fCompareProc = compareProc; this->fFreeProc = freeProc; fUnique = beUnique; //only affects sorted lists } template GList::GList(GCompareProc* compareProc) { fCompareProc = compareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique = false; //only affects sorted lists } template GList::GList(bool sorted, bool free_elements, bool beUnique) { if (sorted) { if (free_elements) { fCompareProc=&DefaultCompareProc; this->fFreeProc = GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=&DefaultCompareProc; this->fFreeProc=NULL; fUnique=beUnique; } } else { if (free_elements) { fCompareProc=NULL; this->fFreeProc=GPVec::DefaultFreeProc; fUnique=beUnique; } else { fCompareProc=NULL; this->fFreeProc=NULL; fUnique=beUnique; } } } template GList::GList(int init_capacity, bool sorted, bool free_elements, bool beUnique):GPVec(init_capacity, free_elements) { if (sorted) { fCompareProc=&DefaultCompareProc; fUnique=beUnique; } else { fCompareProc=NULL; fUnique=beUnique; } } template const GList& GList::operator=(GList& list) { if (&list!=this) { GPVec::Clear(); fCompareProc=list.fCompareProc; this->fFreeProc=list.fFreeProc; //Attention: the object pointers are copied directly, //but the actual objects are NOT duplicated for (int i=0;i void GList::setSorted(GCompareProc* compareProc) { GCompareProc* old_proc=fCompareProc; fCompareProc=compareProc; if (fCompareProc!=old_proc && fCompareProc!=NULL) Sort(); //new compare method } template int GList::IndexOf(OBJ* item) { int result=0; if (Found(item, result)) return result; else return -1; } template bool GList::Exists(OBJ& item) { int result=0; if (Found(&item, result)) return true; else return false; } template bool GList::Exists(OBJ* item) { int result=0; if (Found(item, result)) return true; else return false; } template int GList::Add(OBJ* item) { int result; if (item==NULL) return -1; if (SORTED) { if (Found(item, result)) if (fUnique) return -1; //duplicates forbidden //Found sets result to the position where the item should be! 
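// (note added) sortInsert() below memmoves the tail of the pointer array one slot to the right and drops the new pointer in at result, preserving the order without re-sorting.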
sortInsert(result, item); } else { if (fUnique && Found(item,result)) return -1; //set behaviour result = this->fCount; if (result==this->fCapacity) GPVec::Grow(); this->fList[result]=item; this->fCount++; } return result; } //by default, it deletes the item if it has an equal in the list! //returns the existing equal (==) object if it's in the list already //or returns the item itself if it's unique (and adds it) template OBJ* GList::AddIfNew(OBJ* item, bool deleteIfFound, int* fidx) { int r; if (Found(item, r)) { if (deleteIfFound && (pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } if (fidx!=NULL) *fidx=r; return this->fList[r]; //found } //not found: if (SORTED) { //Found() set result to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } if (fidx!=NULL) *fidx=r; return item; } //if item is found already in the list DELETE it and return -1 //otherwise the item is added and its index is returned template int GList::AddedIfNew(OBJ* item) { int r; if (Found(item, r)) { if ((pointer)item != (pointer)(this->fList[r])) { this->deallocate_item(item); } return -1; } //not found: if (SORTED) { //Found() set r to the position where the item should be inserted: sortInsert(r, item); } else { r = this->fCount; if (r==this->fCapacity) GPVec::Grow(); this->fList[r]=item; this->fCount++; } return r; } template bool GList::Found(OBJ* item, int& idx) { //search the list by using fCompareProc (if defined) //or == operator for a non-sortable list //for sorted lists, even when the result is false, the idx is //set to the closest matching object! int i; idx=-1; if (this->fCount==0) { idx=0;return false;} if (SORTED) { //binary search based on fCompareProc //do the simple test first: if ((*fCompareProc)(this->fList[0],item)>0) { idx=0; return false; } if ((*fCompareProc)(item, this->fList[this->fCount-1])>0) { idx=this->fCount; return false; } int l, h, c; l = 0; h = this->fCount - 1; while (l <= h) { i = (l + h) >> 1; c = (*fCompareProc)(this->fList[i], item); if (c < 0) l = i + 1; else { h = i - 1; if (c == 0) { idx=i; return true; } } } //while idx = l; return false; } else {//not sorted: use linear search // needs == operator to compare user defined objects ! i=0; while (ifCount) { if (*this->fList[i]==*item) { idx=i; return true; } i++; } return false; } } template void GList::sortInsert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); if (this->fCount==this->fCapacity) { GPVec::Grow(idx, item); //expand and also copy/move data and insert the new item return; } //room still left, just move data around and insert the new one if (idxfCount) //copy/move pointers only! memmove(&(this->fList[idx+1]), &(this->fList[idx]), (this->fCount-idx)*sizeof(OBJ*)); this->fList[idx]=item; this->fCount++; } template void GList::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added BE_UNSORTED; //cannot do that with a sorted list! GPVec::Insert(idx,item); } template void GList::Move(int curidx, int newidx) { BE_UNSORTED; //cannot do this in a sorted list! GPVec::Move(curidx,newidx); } template void GList::Put(int idx, OBJ* item, bool re_sort) { //WARNING: this will never free the replaced item! 
// this may BREAK the sort order unless the "re_sort" parameter is given if (idx<0 || idx>this->fCount) GError(GVEC_INDEX_ERR, idx); this->fList[idx]=item; if (SORTED && item!=NULL && re_sort) Sort(); //re-sort } template int GList::Remove(OBJ* item) { //removes an item if it's in our list int result=IndexOf(item); if (result>=0) GPVec::Delete(result); return result; } template void GList::Sort() { if (fCompareProc==NULL) fCompareProc = DefaultCompareProc; if (this->fList!=NULL && this->fCount>0) this->qSort(0, this->fCount-1, fCompareProc); } //--------------------------------------------------------------------------- #endif tophat-2.0.9/src/codons.h0000644000175000017500000000205112122334361013771 0ustar toortoor#ifndef CODONS_H #define CODONS_H #include "GBase.h" #include unsigned short packCodon(char n1, char n2, char n3); //assumes n1,n2,n3 are UPPERCASE! struct Codon { char nuc[3]; Codon(char* str=NULL) { if (str==NULL) { nuc[0]='N'; nuc[1]='N'; nuc[2]='N'; } else { nuc[0]=toupper(str[0]); nuc[1]=toupper(str[1]); nuc[2]=toupper(str[2]); } } Codon(char s1, char s2, char s3) { nuc[0]=toupper(s1); nuc[1]=toupper(s2); nuc[2]=toupper(s3); } char& operator[](int idx) { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char operator[](int idx) const { if (idx<0 || idx>2) GError("Error: Codon index out of bounds!\n"); return nuc[idx]; } char translate(); }; //simple 1st frame forward translation of a given DNA string //will allocated memory for the translation -- the caller is // responsible for freeing the returned string! char* translateDNA(const char* dnastr, int& aalen, int dnalen=0); bool codonTableInit(); #endif tophat-2.0.9/src/tokenize.h0000644000175000017500000000055312122334361014341 0ustar toortoor#ifndef TOKENIZE_H_ #define TOKENIZE_H_ #include #include void tokenize(const std::string& s, const std::string& delims, std::vector& ss); void tokenize_strict(const std::string& s, const std::string& delims, std::vector& ss); #endif /*TOKENIZE_H_*/ tophat-2.0.9/src/segments.h0000644000175000017500000000271312122334361014336 0ustar toortoor/* * segments.h * TopHat * * Created by Cole Trapnell on 2/11/09. * Copyright 2009 __MyCompanyName__. All rights reserved. 
* */ #include #include #include "bwt_map.h" enum eREAD { READ_DONTCARE = 0, READ_LEFT, READ_RIGHT }; enum ePOINT_DIR { POINT_DIR_DONTCARE = 0, POINT_DIR_LEFT, POINT_DIR_RIGHT, POINT_DIR_BOTH }; struct RefSeg { RefSeg() : ref_id(0), points_where(POINT_DIR_DONTCARE), antisense(false), read(READ_DONTCARE), left(0), right(0), support_read("") {} RefSeg(uint32_t i, ePOINT_DIR p, bool antisense, eREAD read, int l, int r, const string& support_read = "") : ref_id(i), points_where(p), antisense(antisense), read(read), left(l), right(r), support_read(support_read) {} bool operator<(const RefSeg& rhs) const { if (ref_id != rhs.ref_id) return ref_id < rhs.ref_id; if (left != rhs.left) return left < rhs.left; if (right != rhs.right) return right < rhs.right; return false; } bool operator==(const RefSeg& rhs) const { return (ref_id == rhs.ref_id && left == rhs.left && right == rhs.right && points_where == rhs.points_where && antisense == rhs.antisense && read == rhs.read && support_read == rhs.support_read); } uint32_t ref_id; ePOINT_DIR points_where; bool antisense; eREAD read; int left; int right; string support_read; }; tophat-2.0.9/src/tophat2.in0000644000175000017500000000036312122334361014250 0ustar toortoor#!/bin/bash prefix="__PREFIX__" pbin="" if [[ -z $prefix ]]; then fl=$(readlink $0) if [[ -z $fl ]]; then pbin=$(dirname $0) else pbin=$(dirname $fl) fi else pbin=$prefix/bin fi export PATH=$pbin:$PATH $pbin/tophat "$@" tophat-2.0.9/src/prep_reads.cpp0000644000175000017500000004600512162605263015200 0ustar toortoor/* * prep_reads.cpp * TopHat * * Created by Cole Trapnell on 9/2/08. * Copyright 2008 Cole Trapnell. All rights reserved. * Derived from maq "catfilter", by Heng Li at Sanger */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include "common.h" #include "reads.h" #include "tokenize.h" #include "qual.h" using namespace std; void format_qual_string(string& qual_str) { for (size_t i = 0; i < qual_str.size(); ++i) { qual_str[i] = charToPhred33(qual_str[i], solexa_quals, phred64_quals); } } vector flt_reads_fnames; bool readmap_loaded=false; vector readmap; //for filter_multihits vector mate_readmap; //for filter_multihits void load_readmap(string& flt_fname, vector& rmap) { //readmap_loaded=false; if (flt_fname.empty()) return; ReadStream rdstream(flt_fname, NULL, true); Read read; while (rdstream.get_direct(read, reads_format)) { uint32_t rnum=(uint32_t)atol(read.name.c_str()); if (rnum>=(uint32_t) rmap.size()) rmap.resize(rnum+1, false); rmap[rnum] = true; } readmap_loaded=true; } bool check_readmap(vector& rmap, uint32_t read_id) { if (read_id>=rmap.size()) return false; else return rmap[read_id]; } void flt_reads_and_hits(vector& reads_files) { if (!readmap_loaded) err_die("Error: filtering reads not enabled, aborting."); if (aux_outfile.empty()) err_die("Error: auxiliary output file not provided."); // -- filter mappings: string fext=getFext(flt_mappings); if (fext=="bam") { samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0); if (fbam==NULL) err_die("Error opening BAM file %s!\n", flt_mappings.c_str()); bam1_t *b = bam_init1(); string aux_index(aux_outfile); aux_index+=".index"; GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index); while (samread(fbam, b) > 0) { char* rname = bam1_qname(b); uint32_t rid=(uint32_t)atol(rname); if (check_readmap(readmap, rid)) { //write this mapping into the output file wbam.write(b, rid); } } bam_destroy1(b); samclose(fbam); } else { bool is_sam=false; string s(flt_mappings); for(size_t i=0; 
i!=s.length(); ++i) s[i] = std::tolower(s[i]); if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true; string unzcmd=getUnpackCmd(flt_mappings); FZPipe hitsfile(flt_mappings, unzcmd); FLineReader fh(hitsfile); FZPipe outfile; outfile.openWrite(aux_outfile.c_str(), zpacker); if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str()); const char* line; while ((line=fh.nextLine())) { if (is_sam && line[0]=='@') { //copy the header fprintf(outfile.file, "%s\n", line); continue; } char* tab=strchr((char*)line, '\t'); if (tab==NULL) err_die("Error: cannot find tab character in %s mappings line:\n%s", flt_mappings.c_str(),line); *tab=0; uint32_t rid = (uint32_t) atol(line); if (rid==0) err_die("Error: invalid read ID (%s) parsed from mapping file %s", line, flt_mappings.c_str()); *tab='\t'; if (check_readmap(readmap, rid)) { fprintf(outfile.file, "%s\n", line); } } outfile.close(); hitsfile.close(); } // -- now filter reads //FILE* findex = NULL; GBamWriter* wbam=NULL; FILE* fout=NULL; if (std_outfile.empty()) { fout=stdout; } else { //output file name explicitely given if (getFext(std_outfile)=="bam") { if (sam_header.empty()) err_die("Error: sam header file not provided.\n"); wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile); //wbam = new GBamWriter(std_outfile, index_outfile); } else { fout = fopen(std_outfile.c_str(), "w"); if (fout==NULL) err_die("Error: cannot create file %s\n", std_outfile.c_str()); } } /* if (wbam==NULL && !index_outfile.empty()) { findex = fopen(index_outfile.c_str(), "w"); if (findex == NULL) err_die("Error: cannot create file %s\n", index_outfile.c_str()); } */ for (size_t fi = 0; fi < reads_files.size(); ++fi) { //only one file expected here, this is not the initial prep_reads Read read; ReadStream readstream(reads_files[fi], NULL, true); //skip_lines(fr); while (readstream.get_direct(read)) { uint32_t rnum=(uint32_t)atol(read.name.c_str()); if (check_readmap(readmap, rnum)) { if (wbam) { GBamRecord bamrec(read.name.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str()); wbam->write(bamrec.get_b(), rnum); } else { fprintf(fout, "@%s\n%s\n+%s\n%s\n", read.name.c_str(), read.seq.c_str(), read.alt_name.c_str(), read.qual.c_str()); } } } } if (wbam) delete wbam; if (fout && fout!=stdout) fclose(fout); } void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0, int matenum=0) { if (wbam==NULL) return; string rnum; str_appendUInt(rnum, rid); string rname(read.name); // attach a primer tag and the following color to the end of the read name for colorspace reads // also, attach a quality value if (color) { rnum.push_back(read.seq[0]); rnum.push_back(read.seq[1]); if (!read.qual.empty()) rnum.push_back(read.qual[1]); else rnum.push_back('!'); } GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str()); if (matenum) { bamrec.set_flag(BAM_FPAIRED); if (matenum==1) bamrec.set_flag(BAM_FREAD1); else if (matenum==2) bamrec.set_flag(BAM_FREAD2); } bamrec.add_aux("ZN", 'Z', rname.length(), (uint8_t*)rname.c_str()); if (trashcode) { bamrec.set_flag(BAM_FQCFAIL); bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode); } wbam->write(bamrec.get_b(), rid); } bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chucked, int& multimap_chucked, GBamWriter* wbam, FILE* fout, FILE* fqindex, int& min_read_len, int& max_read_len, uint64_t& fout_offset, vector& rmap) { if (read.seq.length()<12) { ++num_reads_chucked; writePrepBam(wbam, read, 
next_id, 'S', matenum); return false; } if ((int)read.seq.length()max_read_len) max_read_len=read.seq.length(); if (color && read.seq[1] == '4') { ++num_reads_chucked; writePrepBam(wbam, read, next_id, 'c', matenum); return false; } if (readmap_loaded && check_readmap(rmap, next_id)) { ++num_reads_chucked; ++multimap_chucked; writePrepBam(wbam, read, next_id, 'M', matenum); return false; } format_qual_string(read.qual); std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper); char counts[256]; memset(counts, 0, sizeof(counts)); // Count up the bad characters for (unsigned int i = 0; i != read.seq.length(); ++i) { char c = (char)toupper(read.seq[i]); counts[(size_t)c]++; } double percent_A = (double)(counts[(size_t)'A']) / read.seq.length(); double percent_C = (double)(counts[(size_t)'C']) / read.seq.length(); double percent_G = (double)(counts[(size_t)'G']) / read.seq.length(); double percent_T = (double)(counts[(size_t)'T']) / read.seq.length(); double percent_N = (double)(counts[(size_t)'N']) / read.seq.length(); double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length(); // Chuck the read if there are at least 5 'N's or if it's mostly // (>90%) 'N's and 'A's char trash_code=0; if (percent_A > 0.9 || percent_C > 0.9 || percent_G > 0.9 || percent_T > 0.9) trash_code='L'; else if (percent_N >= 0.1 || percent_4 >=0.1) trash_code='N'; if (trash_code) { ++num_reads_chucked; writePrepBam(wbam, read, next_id, trash_code, matenum); return false; } if (wbam) { if (reads_format == FASTA && !quals) read.qual = string(read.seq.length(), 'I').c_str(); else if (color && quals) read.qual = "!" + read.qual; writePrepBam(wbam, read, next_id, 0, matenum); } else { // daehwan - we should not use buf in printf function // because it may contain some control characters such as "\" from quality values. 
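// (note added) in particular, handing untrusted bytes to printf as the *format* argument is unsafe, since a stray '%' would be parsed as a conversion specifier.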
// Here, buf is only used for calculating the file offset char buf[2048] = {0}; if (reads_format == FASTQ or (reads_format == FASTA && quals)) { sprintf(buf, "@%u\n%s\n+%s\n%s\n", next_id, read.seq.c_str(), read.name.c_str(), read.qual.c_str()); fprintf(fout, "@%u\n%s\n+%s\n%s\n", next_id, read.seq.c_str(), read.name.c_str(), read.qual.c_str()); } else if (reads_format == FASTA) { string qual; if (color) qual = string(read.seq.length()-1, 'I').c_str(); else qual = string(read.seq.length(), 'I').c_str(); sprintf(buf, "@%u\n%s\n+%s\n%s\n", next_id, read.seq.c_str(), read.name.c_str(), qual.c_str()); fprintf(fout, "@%u\n%s\n+%s\n%s\n", next_id, read.seq.c_str(), read.name.c_str(), qual.c_str()); } else { assert(0); } if (fqindex != NULL) { if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0) fprintf(fqindex, "%d\t%lu\n", next_id, (long unsigned)fout_offset); } fout_offset += strlen(buf); } return true; } //validate read const char* ERR_FILE_CREATE="Error: cannot create file %s\n"; void process_reads(vector& reads_fnames, vector& quals_files, vector& mate_fnames, vector& mate_quals_files) { //TODO: add the option to write the garbage reads into separate file(s) int num_reads_chucked = 0; int multimap_chucked = 0; int mate_num_reads_chucked = 0; int mate_multimap_chucked = 0; int min_read_len = 20000000; int max_read_len = 0; int mate_min_read_len = 20000000; int mate_max_read_len = 0; uint32_t next_id = 0; uint32_t num_left = 0; uint32_t num_mates = 0; FILE* fw=NULL; //aux output file string outfname; //std_outfile after instancing template string mate_outfname; string idxfname; //index_outfile after instancing template string mate_idxfname; bool have_mates = (mate_fnames.size() > 0); if (!aux_outfile.empty()) { fw=fopen(aux_outfile.c_str(), "w"); if (fw==NULL) err_die(ERR_FILE_CREATE,aux_outfile.c_str()); } FILE* fqindex = NULL; //fastq index FILE* mate_fqindex = NULL; GBamWriter* wbam=NULL; GBamWriter* mate_wbam=NULL; FILE* fout=NULL; FILE* mate_fout=NULL; uint64_t fout_offset = 0; uint64_t mate_fout_offset = 0; if (std_outfile.empty()) { fout=stdout; //for PE reads, flt_side will decide which side is printed (can't be both) if (have_mates && flt_side==2) err_die("Error: --flt-side option required for PE reads directed to stdout!\n"); mate_fout=stdout; } else { //output file name explicitely given //could be a template if (std_outfile.find("%side%") != string::npos) { outfname=str_replace(std_outfile, "%side%", "left"); if (have_mates) mate_outfname=str_replace(std_outfile, "%side%", "right"); } else { outfname=std_outfile; } if (index_outfile.find("%side%") != string::npos) { idxfname=str_replace(index_outfile, "%side%", "left"); if (have_mates) mate_idxfname=str_replace(index_outfile, "%side%", "right"); } else { idxfname=index_outfile; } if (getFext(outfname)=="bam") { if (sam_header.empty()) err_die("Error: sam header file not provided.\n"); wbam = new GBamWriter(outfname.c_str(), sam_header.c_str(), idxfname); if (!mate_outfname.empty()) { mate_wbam = new GBamWriter(mate_outfname.c_str(), sam_header.c_str(), mate_idxfname); } } else { //fastq output fout = fopen(outfname.c_str(), "w"); if (fout==NULL) err_die(ERR_FILE_CREATE, outfname.c_str()); mate_fout = fopen(mate_outfname.c_str(), "w"); if (mate_fout==NULL) err_die(ERR_FILE_CREATE, mate_outfname.c_str()); } } if (wbam==NULL && !idxfname.empty()) { //fastq file output, indexed fqindex = fopen(idxfname.c_str(), "w"); if (fqindex == NULL) err_die(ERR_FILE_CREATE, idxfname.c_str()); if (!mate_idxfname.empty()) { mate_fqindex = 
fopen(mate_idxfname.c_str(), "w"); if (mate_fqindex == NULL) err_die(ERR_FILE_CREATE, mate_idxfname.c_str()); } } bool possible_mate_mismatch=false; size_t max_files=max(reads_fnames.size(), mate_fnames.size()); for (size_t fi = 0; fi < max_files; ++fi) { Read read; Read mate_read; ReadStream* reads=NULL; ReadStream* mate_reads=NULL; FZPipe* fq=NULL; FZPipe* mate_fq=NULL; bool have_l_reads=(figet_direct(read, reads_format))) num_left++; // Get the next read from the file int matenum=0; // 0 = unpaired, 1 = left, 2 = right if (have_r_reads && (have_r_reads=mate_reads->get_direct(mate_read, reads_format)) ) { num_mates++; } if (have_l_reads && have_r_reads) { matenum = 1; //read is first in a pair if (have_l_reads && have_r_reads && !possible_mate_mismatch) { //check if reads are paired correctly int nl=read.name.length(); bool mate_match=(nl==(int)mate_read.name.length()); int m_len=0, c=0; while (c 2) //more than 2 chars differ mate_match=false; if (!mate_match) { fprintf(stderr, "WARNING: read pairing issues detected (check prep_reads.log) !\n" " Pair #%d name mismatch: %s vs %s\n", next_id+1, read.name.c_str(), mate_read.name.c_str()); possible_mate_mismatch=true; } } //mate check } //paired reads if (have_l_reads || have_r_reads) { //IMPORTANT: to keep paired reads in sync, this must be //incremented BEFORE any reads are chucked ! ++next_id; } if ((flt_side & 1)==0 && have_l_reads) //for unpaired reads or left read in a pair processRead(matenum, read, next_id, num_reads_chucked, multimap_chucked, wbam, fout, fqindex, min_read_len, max_read_len, fout_offset, readmap); if (flt_side>0 && have_r_reads) { matenum = have_l_reads ? 2 : 0; processRead(matenum, mate_read, next_id, mate_num_reads_chucked, mate_multimap_chucked, mate_wbam, mate_fout, mate_fqindex, mate_min_read_len, mate_max_read_len, mate_fout_offset, mate_readmap); } } //while !fr.isEof() if (reads) delete reads; if (mate_reads) delete mate_reads; } //for each input file if (fout!=stdout || (flt_side & 1) == 0) { fprintf(stderr, "%u out of %u reads have been filtered out\n", num_reads_chucked, num_left); if (readmap_loaded) fprintf(stderr, "\t(%u filtered out due to %s)\n", multimap_chucked, flt_reads_fnames[0].c_str()); } if (have_mates && (fout!=stdout || flt_side>0)) { fprintf(stderr, "%u out of %u read mates have been filtered out\n", mate_num_reads_chucked, num_mates); if (readmap_loaded && mate_multimap_chucked) fprintf(stderr, "\t(%u mates filtered out due to %s)\n", mate_multimap_chucked, flt_reads_fnames[1].c_str()); } if (wbam) { delete wbam; } if (mate_wbam) { delete mate_wbam; } if (fout && fout!=stdout) fclose(fout); if (mate_fout) fclose(mate_fout); if (fw!=NULL) { string side(""); if (have_mates) side="left_"; fprintf(fw, "%smin_read_len=%d\n", side.c_str(), min_read_len - (color ? 1 : 0)); fprintf(fw, "%smax_read_len=%d\n", side.c_str(), max_read_len - (color ? 1 : 0)); fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_left); fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_left-num_reads_chucked); if (have_mates) { side="right_"; fprintf(fw, "%smin_read_len=%d\n", side.c_str(), mate_min_read_len - (color ? 1 : 0)); fprintf(fw, "%smax_read_len=%d\n", side.c_str(), mate_max_read_len - (color ? 1 : 0)); fprintf(fw, "%sreads_in =%d\n", side.c_str(), num_mates); fprintf(fw, "%sreads_out=%d\n", side.c_str(), num_mates-mate_num_reads_chucked); } fclose(fw); } if (fqindex) fclose(fqindex); if (mate_fqindex) fclose(mate_fqindex); } void print_usage() { fprintf(stderr, "Usage:\n prep_reads [--filter-multi ] [,..] 
\\" "[ [,..]\n"); } void open_qual_files(vector& quals_files, string& quals_file_list) { vector quals_file_names; tokenize(quals_file_list, ",", quals_file_names); for (size_t i = 0; i < quals_file_names.size(); ++i) { FZPipe seg_file(quals_file_names[i], true); if (seg_file.file == NULL) { fprintf(stderr, "Error: cannot open qual. file %s\n", quals_file_names[i].c_str()); exit(1); } quals_files.push_back(seg_file); } } int main(int argc, char *argv[]) { fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------\n"); int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } string reads_file_list(argv[optind++]); vector reads_filenames; tokenize(reads_file_list, ",",reads_filenames); vector quals_files; if (quals) { if (optind>=argc) { err_die("Error: quality value file(s) not provided !\n"); } string quals_file_list = argv[optind++]; open_qual_files(quals_files, quals_file_list); if (quals_files.size()!=reads_filenames.size()) err_die("Error: number of quality value files must much the number of read files!\n"); } string mate_file_list; vector mate_filenames; vector mate_quals_files; if (optind=argc) { err_die("Error: mate quality value file(s) not provided !\n"); } string mate_quals_file_list = argv[optind++]; open_qual_files(mate_quals_files, mate_quals_file_list); if (mate_quals_files.size()!=mate_filenames.size()) err_die("Error: number of quality value files must much the number of read files!\n"); } } if (!flt_reads.empty()) { //for multi-mapped prefiltering usage readmap_loaded = false; tokenize(flt_reads, ",", flt_reads_fnames); load_readmap(flt_reads_fnames[0], readmap); if (flt_reads_fnames.size()==2) load_readmap(flt_reads_fnames[1], mate_readmap); } if (flt_mappings.empty()) process_reads(reads_filenames, quals_files, mate_filenames, mate_quals_files); else //special use case: filter previous mappings (when prefiltering) flt_reads_and_hits(reads_filenames); return 0; } tophat-2.0.9/src/gdna.cpp0000644000175000017500000000404112122334356013755 0ustar toortoor#include "gdna.h" #include const char* IUPAC_2BIT ="AACCTTGGTTAAAAAACCCCGGAAAAAACCAAAAAA"; const char* IUPAC_2BITN ="001133223300000011112200000011000000"; const char* IUPAC_DEFS ="AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbNnXx-*"; const char* IUPAC_COMP ="TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvNnXx-*"; #define A_2BIT 0 // 00 #define C_2BIT 1 // 01 #define G_2BIT 2 // 10 #define T_2BIT 3 // 11 static byte ntCompTable[256]; static byte nt2bit[256]; //maps any character to a 2bit base value (with N = A) static char v_2bit2nt[4] = {'A','C','G','T'}; //---------------------- static bool gdna_Ready=gDnaInit(); //---------------------- byte gdna2bit(char* &nt, int n) { // Pack n bases into a byte (n can be 1..4) byte out = 0; while (n && *nt) { n--; out <<= 2; out += nt2bit[(int)*nt]; nt++; } #ifdef GDEBUG if (n) { GError("Error: attempt to read 6-mer beyond the end of the string!\n"); } #endif return out; } char ntComplement(char c) { return ntCompTable[(int)c]; } char g2bit2base(byte v2bit) { return v_2bit2nt[v2bit & 0x03 ]; } //in place reverse complement of nucleotide (sub)sequence char* reverseComplement(char* seq, int slen) { if (slen==0) slen=strlen(seq); //reverseChars(seq,len); int l=0; int r=slen-1; register char c; while (l #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include #include "common.h" #include "bwt_map.h" #include "junctions.h" void 
get_junctions_from_hitstream(HitStream& hitstream, ReadTable& it, JunctionSet& junctions) { HitsForRead curr_hit_group; hitstream.next_read_hits(curr_hit_group); uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id); // While we still have unreported hits... while(curr_obs_order != VMAXINT32) { for (size_t i = 0; i < curr_hit_group.hits.size(); ++i) { const BowtieHit& bh = curr_hit_group.hits[i]; junctions_from_alignment(bh, junctions); } //fprintf(stderr, "#Hits = %d\n", curr_hit_group.hits.size()); //curr_hit_group = HitsForRead(); // Get next hit group hitstream.next_read_hits(curr_hit_group); curr_obs_order = it.observation_order(curr_hit_group.insert_id); } hitstream.reset(); } void driver(FILE* hit_map) { ReadTable it; RefSequenceTable rt(sam_header, true); SAMHitFactory hit_factory(it,rt); //HitStream hitstream(hit_map, &hit_factory, false, true, true); JunctionSet junctions; while (hit_map && !feof(hit_map)) { char bwt_buf[2048]; if (!fgets(bwt_buf, 2048, hit_map)) { break; } // Chomp the newline char* nl = strrchr(bwt_buf, '\n'); if (nl) *nl = 0; // Get a new record from the tab-delimited Bowtie map BowtieHit bh; if (hit_factory.get_hit_from_buf(bwt_buf, bh, false)) { junctions_from_alignment(bh, junctions); } } for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr) { const char* ref_name = rt.get_name(itr->first.refid); fprintf(stdout, "%s\t%d\t%d\t%c\n", ref_name, itr->first.left - 1, itr->first.right, itr->first.antisense ? '-' : '+'); } fprintf(stderr, "Extracted %lu junctions\n", junctions.size()); } void print_usage() { fprintf(stderr, "Usage: sam_juncs \n"); // fprintf(stderr, "Usage: tophat_reports [splice_map1.sbwtout]\n"); } int main(int argc, char** argv) { fprintf(stderr, "sam_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------------------\n"); reads_format = FASTQ; int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } string map_filename = argv[optind++]; FILE* map_file = fopen(map_filename.c_str(), "r"); if (!map_file) { fprintf(stderr, "Error: cannot open map file %s for reading\n", map_filename.c_str()); exit(1); } driver(map_file); return 0; } tophat-2.0.9/src/long_spanning_reads.cpp0000644000175000017500000030532312162605263017067 0ustar toortoor#ifdef HAVE_CONFIG_H #include #endif /* * long_spanning_reads.cpp * TopHat * * Created by Cole Trapnell on 2/5/09. * Copyright 2009 Cole Trapnell. All rights reserved. 
* */ #include #include #include #include #include #include #include #include #include #include //#include #include #include #include #include #include #include #include #include #include #include "common.h" #include "utils.h" #include "bwt_map.h" #include "tokenize.h" #include "segments.h" #include "reads.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" using namespace seqan; using namespace std; // daehwan bool bDebug = false; void print_usage() { fprintf(stderr, "Usage: long_spanning_reads [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n"); } bool key_lt(const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; } void get_seqs(istream& ref_stream, RefSequenceTable& rt, bool keep_seqs = true) { while(ref_stream.good() && !ref_stream.eof()) { RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence(); string name; readMeta(ref_stream, name, Fasta()); string::size_type space_pos = name.find_first_of(" \t\r"); if (space_pos != string::npos) { name.resize(space_pos); } seqan::read(ref_stream, *ref_str, Fasta()); rt.get_id(name, keep_seqs ? ref_str : NULL, 0); } } void look_right_for_hit_group(ReadTable& unmapped_reads, vector& contig_hits, size_t curr_file, vector& spliced_hits, const HitsForRead& targets, vector& seg_hits_for_read) { int right_file = curr_file + 1; HitStream& right = contig_hits[right_file]; uint64_t curr_next_group_id = targets.insert_id; int curr_order = unmapped_reads.observation_order(curr_next_group_id); assert (curr_order != -1); while(true) { HitsForRead hit_group; uint64_t right_next_group_id = right.next_group_id(); int right_order = unmapped_reads.observation_order(right_next_group_id); // If we would have seen the hits by now, bail out. if (curr_order < right_order || right_order == -1) { break; } if (right.next_read_hits(hit_group)) { if (hit_group.insert_id == targets.insert_id) { // Some of the targets may be missing, we need to // process them individually seg_hits_for_read[right_file] = hit_group; break; } } } HitsForRead& curr_seg_hits = seg_hits_for_read[right_file]; if (right_file < (int)spliced_hits.size() && right_file >= 0) { // Scan forward in the spliced hits file for this hit group HitsForRead spliced_group; HitsForRead curr_spliced_group; while (spliced_hits[right_file].next_group_id() > 0 && spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order) { spliced_hits[right_file].next_read_hits(curr_spliced_group); if (curr_spliced_group.insert_id == (uint32_t)curr_order) { spliced_group = curr_spliced_group; break; } } if (!spliced_group.hits.empty()) { curr_seg_hits.insert_id = spliced_group.insert_id; curr_seg_hits.hits.insert(curr_seg_hits.hits.end(), spliced_group.hits.begin(), spliced_group.hits.end()); } } if (curr_seg_hits.hits.empty()) return; else if (right_file + 1 < (int)contig_hits.size()) { look_right_for_hit_group(unmapped_reads, contig_hits, curr_file + 1, spliced_hits, curr_seg_hits, seg_hits_for_read); } } BowtieHit merge_chain_color(RefSequenceTable& rt, const string& read_seq, const string& read_quals, std::set& possible_juncs, std::set& possible_insertions, list& hit_chain) { bool antisense = hit_chain.front().antisense_align(); uint32_t reference_id = hit_chain.front().ref_id(); uint64_t insert_id = hit_chain.front().insert_id(); int left = hit_chain.front().left(); list::iterator prev_hit = hit_chain.begin(); list::iterator curr_hit = ++(hit_chain.begin()); string seq; string qual; int old_read_length = 0; int first_seg_length = 
hit_chain.front().seq().length(); for (list::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i) { seq += i->seq(); qual += i->qual(); old_read_length += i->read_len(); } string rev_read_seq, rev_read_quals; if (color && antisense) { rev_read_seq = read_seq; reverse(rev_read_seq.begin() + 1, rev_read_seq.end()); rev_read_quals = read_quals; reverse(rev_read_quals.begin(), rev_read_quals.end()); } while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end()) { /* * Note that the gap size may be negative, since left() and right() return * signed integers, this will be OK. */ int gap = curr_hit->left() - prev_hit->right(); if (gap < -(int)max_insertion_length || (gap > (int)max_deletion_length && (gap < min_report_intron_length || gap > max_report_intron_length))) { return BowtieHit(); } ++prev_hit; ++curr_hit; } prev_hit = hit_chain.begin(); curr_hit = ++(hit_chain.begin()); RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id()); if (!ref_str) return BowtieHit(); int curr_seg_index = 1; while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end()) { /* * This code is assuming that the cigar strings end and start with a match * While segment alignments can actually end with a junction, insertion or deletion, the hope * is that in those cases, the right and left ends of the alignments will correctly * line up, so we won't get to this bit of code */ if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH) { return BowtieHit(); } if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice()) { /* * There is no way that we can splice these together into a valid * alignment */ return BowtieHit(); } bool found_closure = false; /* * Note that antisense_splice is the same for prev and curr */ bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice(); vector new_cigar; int new_left = -1; int mismatch = 0; /* * Take the length of matched bases in each segment into consideration for closures, * this can be a problem for reads of variable lengths. */ int prev_right_end_match_length = prev_hit->cigar().back().length; int curr_left_end_match_length = curr_hit->cigar().front().length; if (prev_hit->right() > curr_hit->left()) { std::set::iterator lb, ub; /* * Note, this offset is determined by the min-anchor length supplied to * juncs_db, which is currently hard-coded at 3 in tophat.py * as this value determines what sort of read segments * should be able to align directly to the splice sequences */ int left_boundary = prev_hit->right() - 4; int right_boundary = curr_hit->left() + 4; /* * Create a dummy sequence to represent the maximum possible insertion */ std::string maxInsertedSequence = ""; maxInsertedSequence.resize(max_insertion_length,'A'); lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, "")); ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence)); int reference_mismatch = 0; while (lb != ub && lb != possible_insertions.end()) { /* * In the following code, we will check to make sure that the segments have the proper * separation and sequence for the insertions, and generate the appropriate merged bowtie hit * In general, reads with insertions must match the inserted sequence exactly. */ if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left())) { /* * Check we have enough matched bases on prev or curr segment. 
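* Illustrative example with made-up coordinates: for prev_hit->right()=1005,
* curr_hit->left()=1002 and a 3 bp candidate insertion at lb->left=1003, the code below
* computes insert_to_prev_right = 1005-1003-1 = 1 and curr_left_to_insert = 1003-1002+1 = 2.
* Since prev_right_end_match_length and curr_left_end_match_length are just the lengths of
* the terminal MATCH ops (e.g. 5 for a 20M5N5M prev segment, 3 for a 3M1I21M curr segment),
* the closure is attempted only when those anchors cover at least 1 bp and 2 bp respectively;
* otherwise this candidate is skipped.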
*/ int insert_to_prev_right = prev_hit->right() - lb->left - 1; int curr_left_to_insert = lb->left - curr_hit->left() + 1; if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length) { ++lb; continue; } /* * Keep track of how many mismatches were made to the genome in the region * where we should actually be matching against the insertion */ int this_reference_mismatch = 0; int insertion_mismatch = 0; int insertion_len = lb->sequence.length(); const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence); /* * First check to see if we need to adjust number of observed errors for the left (prev) * hit. This is only the case if this segment runs into the insertion. To be consistent * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region */ string colorSegmentSequence_prev; if (insert_to_prev_right > 0) { const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right()); const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right)); if (color) { string color; if (antisense) color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1); else color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1); color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1]; colorSegmentSequence_prev = str_convert_color_to_bp(color); } const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence; /* * Scan right in the read until we run out of read */ for (int read_index = 0; read_index < insert_to_prev_right; ++read_index) { /* * Any mismatch to the insertion is a failure */ if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index]) { ++this_reference_mismatch; } if (read_index < insertion_len) { if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index]) { ++insertion_mismatch; break; } } else { if (referenceSequence[read_index - insertion_len] == 'N' || referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index]) { --this_reference_mismatch; } } } } string colorSegmentSequence_curr; if (curr_left_to_insert > 0) { const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1); const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert)); if (color) { string color; if (antisense) color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert); else color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert); color.push_back(curr_hit->seq()[curr_left_to_insert]); reverse(color.begin(), color.end()); string bp = str_convert_color_to_bp(color); reverse(bp.begin(), bp.end()); colorSegmentSequence_curr = bp; } const seqan::Dna5String newSegmentSequence = color ? 
colorSegmentSequence_curr : oldSegmentSequence; /* * Scan left in the read until * We ran out of read sequence (insertion extends past segment) */ for (int read_index = 0; read_index < curr_left_to_insert; ++read_index) { int segmentPosition = curr_left_to_insert - read_index - 1; int insertionPosition = insertion_len - read_index - 1; if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition])) { ++this_reference_mismatch; } if (read_index < insertion_len) { if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition])) { ++insertion_mismatch; break; } } else { if (referenceSequence[segmentPosition + insertion_len] == 'N' || (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition])) { --this_reference_mismatch; } } } } if (found_closure) { fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id); return BowtieHit(); } if (insertion_mismatch == 0) { reference_mismatch = this_reference_mismatch; mismatch = -reference_mismatch; found_closure = true; new_left = prev_hit->left(); new_cigar = prev_hit->cigar(); /* * Need to make a new insert operation between the two match character that begin * and end the intersection of these two junction. Note that we necessarily assume * that this insertion can't span beyond the boundaries of these reads. That should * probably be better enforced somewhere */ new_cigar.back().length -= insert_to_prev_right; if (new_cigar.back().length <= 0) new_cigar.pop_back(); new_cigar.push_back(CigarOp(INS, lb->sequence.size())); vector new_right_cigar = curr_hit->cigar(); new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size()); /* * Finish stitching together the new cigar string */ size_t c = new_right_cigar.front().length > 0 ? 0 : 1; for (; c < new_right_cigar.size(); ++c) { new_cigar.push_back(new_right_cigar[c]); } if (color) { if (insert_to_prev_right > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev); if (curr_left_to_insert > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr); } } } ++lb; } if (!found_closure) { return BowtieHit(); } } /* * Stitch segments together using juctions or deletions if necessary. */ else if (prev_hit->right() < curr_hit->left()) { std::set::iterator lb, ub; int left_boundary = prev_hit->right() - 4; int right_boundary = curr_hit->left() + 4; lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true)); ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false)); int new_diff_mismatches = 0xff; while (lb != ub && lb != possible_juncs.end()) { int dist_to_left = lb->left - prev_hit->right() + 1; int dist_to_right = lb->right - curr_hit->left(); if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right) { /* * Check we have enough matched bases on prev or curr segment. 
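* Illustrative example with made-up coordinates: for a candidate junction lb->left=2001,
* lb->right=2502 between segments with prev_hit->right()=2000 and curr_hit->left()=2500,
* dist_to_left = 2001-2000+1 = 2 and dist_to_right = 2502-2500 = 2; the two distances agree
* and fall within the +/-4 bp search window, so the closure is attempted provided curr_hit
* begins with a MATCH of at least 2 bp (the check below).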
*/ if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length ) { ++lb; continue; } Dna5String new_cmp_str, old_cmp_str; int new_mismatch = 0, old_mismatch = 0; string new_patch_str; // this is for colorspace reads if (dist_to_left > 0) { new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1); old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right); string new_seq; if (color) { string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1)); string color, qual; if (antisense) { color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left); qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left); } else { color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left); qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left); } BWA_decode(color, qual, ref, new_seq); new_seq = new_seq.substr(1); } const string& curr_old_seq = curr_hit->seq(); const string& curr_seq = color ? new_seq : curr_hit->seq(); for (int i = 0; i < dist_to_left; ++i) { if (curr_seq[i] != new_cmp_str[i]) ++new_mismatch; if (curr_old_seq[i] != old_cmp_str[i]) ++old_mismatch; } if (color) new_patch_str = curr_seq.substr(0, dist_to_left); } else if (dist_to_left < 0) { new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left()); old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right()); size_t abs_dist = -dist_to_left; string new_seq; if (color) { string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1)); ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left())); string color, qual; if (antisense) { color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist); qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist); } else { color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist); qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist); } BWA_decode(color, qual, ref, new_seq); new_seq = new_seq.substr(1); } const string& prev_old_seq = prev_hit->seq(); size_t prev_old_seq_len = prev_old_seq.length(); const string& prev_seq = color ? new_seq : prev_hit->seq(); size_t prev_seq_len = prev_seq.length(); for (size_t i = 0; i < abs_dist; ++i) { if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i]) ++new_mismatch; if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i]) ++old_mismatch; } if (color) new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist); } int temp_diff_mismatches = new_mismatch - old_mismatch; if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2) { ++lb; continue; } if (color) { /* * We need to recover the origianl sequence. 
*/ if (found_closure) { seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8, prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4)); } if (dist_to_left > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str); else if (dist_to_left < 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str); } new_diff_mismatches = temp_diff_mismatches; new_left = prev_hit->left(); new_cigar = prev_hit->cigar(); int new_left_back_len = new_cigar.back().length; new_left_back_len += dist_to_left; vector new_right_cig = curr_hit->cigar(); int new_right_front_len = new_right_cig.front().length; new_right_front_len -= dist_to_right; if (new_left_back_len > 0) new_cigar.back().length = new_left_back_len; else new_cigar.pop_back(); /* * FIXME, currently just differentiating between a deletion and a * reference skip based on length. However, would probably be better * to denote the difference explicitly, this would allow the user * to supply their own (very large) deletions */ if ((lb->right - lb->left - 1) <= max_deletion_length) { new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1)); antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice(); } else { new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1)); antisense_closure = lb->antisense; } new_right_cig.front().length = new_right_front_len; size_t c = new_right_front_len > 0 ? 0 : 1; for (; c < new_right_cig.size(); ++c) new_cigar.push_back(new_right_cig[c]); mismatch = new_diff_mismatches; found_closure = true; } ++lb; } if (!found_closure) { return BowtieHit(); } } if (found_closure) { bool end = false; int mismatches = prev_hit->mismatches() + curr_hit->mismatches() + mismatch; BowtieHit merged_hit(reference_id, reference_id, insert_id, new_left, new_cigar, antisense, antisense_closure, mismatches, mismatches + gap_length(new_cigar), prev_hit->splice_mms() + curr_hit->splice_mms(), end); if (curr_seg_index > 1) merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length)); else merged_hit.seq(seq.substr(0, first_seg_length + segment_length)); prev_hit = hit_chain.erase(prev_hit, ++curr_hit); /* * prev_hit now points PAST the last element removed */ prev_hit = hit_chain.insert(prev_hit, merged_hit); /* * merged_hit has been inserted before the old position of * prev_hit. New location of prev_hit is merged_hit */ curr_hit = prev_hit; ++curr_hit; ++curr_seg_index; continue; } ++prev_hit; ++curr_hit; ++curr_seg_index; } bool saw_antisense_splice = false; bool saw_sense_splice = false; vector long_cigar; int num_mismatches = 0; int num_splice_mms = 0; for (list::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s) { num_mismatches += s->mismatches(); num_splice_mms += s->splice_mms(); /* * Check whether the sequence contains any reference skips. 
Previously, * this was just a check to see whether the sequence was contiguous; however * we don't want to count an indel event as a splice */ bool containsSplice = s->is_spliced(); if (containsSplice) { if (s->antisense_splice()) { if (saw_sense_splice) return BowtieHit(); saw_antisense_splice = true; } else { if (saw_antisense_splice) return BowtieHit(); saw_sense_splice = true; } } const vector& cigar = s->cigar(); if (long_cigar.empty()) { long_cigar = cigar; } else { CigarOp& last = long_cigar.back(); /* * If necessary, merge the back and front * cigar operations */ if(last.opcode == cigar[0].opcode){ last.length += cigar[0].length; for (size_t b = 1; b < cigar.size(); ++b) { long_cigar.push_back(cigar[b]); } }else{ for(size_t b = 0; b < cigar.size(); ++b) { long_cigar.push_back(cigar[b]); } } } } bool end = false; BowtieHit new_hit(reference_id, reference_id, insert_id, left, long_cigar, antisense, saw_antisense_splice, num_mismatches, num_mismatches + gap_length(long_cigar), num_splice_mms, end); new_hit.seq(seq); new_hit.qual(qual); int new_read_len = new_hit.read_len(); if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt)) { fprintf(stderr, "Warning: malformed closure\n"); return BowtieHit(); } return new_hit; } BowtieHit merge_chain(RefSequenceTable& rt, const string& read_seq, const string& read_quals, std::set& possible_juncs, std::set& possible_insertions, std::set& possible_fusions, list& hit_chain, int fusion_dir = FUSION_NOTHING) { bool antisense = hit_chain.front().antisense_align(); uint64_t insert_id = hit_chain.front().insert_id(); const int left = hit_chain.front().left(); list::iterator prev_hit = hit_chain.begin(); list::iterator curr_hit = ++(hit_chain.begin()); string seq; string qual; int old_read_length = 0; int first_seg_length = hit_chain.front().seq().length(); for (list::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i) { seq += i->seq(); qual += i->qual(); old_read_length += i->read_len(); } string rev_read_seq, rev_read_quals; if (color && antisense) { rev_read_seq = read_seq; reverse(rev_read_seq.begin() + 1, rev_read_seq.end()); rev_read_quals = read_quals; reverse(rev_read_quals.begin(), rev_read_quals.end()); } size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1; bool fusion_passed = false; while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end()) { if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id()) fusion_passed = true; if (prev_hit->ref_id2() != curr_hit->ref_id()) ++num_fusions; if (curr_hit->fusion_opcode() != FUSION_NOTHING) ++num_fusions; if (prev_hit->ref_id2() == curr_hit->ref_id()) { bool reversed = false; if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed)) reversed = true; /* * Note that the gap size may be negative, since left() and right() return * signed integers, this will be OK. */ int gap; if (reversed) gap = prev_hit->right() - curr_hit->left(); else gap = curr_hit->left() - prev_hit->right(); // daehwan if (bDebug) { cout << "prev: " << prev_hit->ref_id() << ":" << prev_hit->left() << ":" << (prev_hit->antisense_align() ? "-" : "+") << "\t" << prev_hit->ref_id2() << ":" << prev_hit->right() << ":" << (prev_hit->antisense_align2() ? "-" : "+") << endl << "curr: " << curr_hit->ref_id() << ":" << curr_hit->left() << ":" << (curr_hit->antisense_align() ? "-" : "+") << "\t" << curr_hit->ref_id2() << ":" << curr_hit->right() << ":" << (curr_hit->antisense_align2() ? 
"-" : "+") << endl << "gap: " << gap << endl; } if (gap < -(int)max_insertion_length || (gap > (int)max_deletion_length && (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist)))) { fusion_passed = true; ++num_fusions; } } if (num_fusions >= 2) return BowtieHit(); ++prev_hit; ++curr_hit; } prev_hit = hit_chain.begin(); curr_hit = ++(hit_chain.begin()); // daehwan if (bDebug) { cout << "daehwan - test" << endl; } int curr_seg_index = 1; fusion_passed = false; while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end()) { antisense = prev_hit->antisense_align(); // daehwan if (bDebug) { cout << "daehwan - start - stitch" << endl; cout << "prev right: " << prev_hit->right() << endl; cout << "curr left: " << curr_hit->left() << endl; cout << "prev back: " << prev_hit->cigar().back().opcode << endl; cout << "curr front: " << curr_hit->cigar().front().opcode << endl; cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl; cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl; } if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id()) fusion_passed = true; /* * This code is assuming that the cigar strings end and start with a match * While segment alignments can actually end with a junction, insertion or deletion, the hope * is that in those cases, the right and left ends of the alignments will correctly * line up, so we won't get to this bit of code */ if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH || prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH)) { return BowtieHit(); } // daehwan if (bDebug) { cout << "daehwan - pass - enough matched bases" << endl; } if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice()) { /* * There is no way that we can splice these together into a valid * alignment */ return BowtieHit(); } bool found_closure = false; /* * Note that antisense_splice is the same for prev and curr */ bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice(); vector new_cigar; int new_left = -1; int mismatch = 0; /* * Take the length of matched bases in each segment into consideration for closures, * this can be a problem for reads of variable lengths. 
*/ int prev_right_end_match_length = prev_hit->cigar().back().length; int curr_left_end_match_length = curr_hit->cigar().front().length; bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id(); if (prev_hit->ref_id2() == curr_hit->ref_id()) { // daehwan if (bDebug) { cout << "daehwan - start - junction or insertion" << endl; cout << "prev right: " << prev_hit->right() << endl; cout << "curr left: " << curr_hit->left() << endl; cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl; cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl; } bool reversed = false; if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed)) reversed = true; uint32_t reference_id = prev_hit->ref_id2(); RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id); int left_boundary, right_boundary; if (reversed) { left_boundary = curr_hit->left() - 4; right_boundary = prev_hit->right() + 4; } else { left_boundary = prev_hit->right() - 4; right_boundary = curr_hit->left() + 4; } int dist_btw_two; if (reversed) dist_btw_two = prev_hit->right() - curr_hit->left(); else dist_btw_two = curr_hit->left() - prev_hit->right(); if (dist_btw_two < 0 && dist_btw_two >= -(int)max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align()) { std::set::iterator lb, ub; /* * Create a dummy sequence to represent the maximum possible insertion */ std::string maxInsertedSequence = ""; maxInsertedSequence.resize(max_insertion_length,'A'); lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, "")); ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence)); int reference_mismatch = 0; while (lb != ub && lb != possible_insertions.end()) { /* * In the following code, we will check to make sure that the segments have the proper * separation and sequence for the insertions, and generate the appropriate merged bowtie hit * In general, reads with insertions must match the inserted sequence exactly. */ if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left())) { /* * Check we have enough matched bases on prev or curr segment. */ int insert_to_prev_right, curr_left_to_insert; if (reversed) { insert_to_prev_right = lb->left - prev_hit->right(); curr_left_to_insert = curr_hit->left() - lb->left; } else { insert_to_prev_right = prev_hit->right() - lb->left - 1; curr_left_to_insert = lb->left - curr_hit->left() + 1; } if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length) { ++lb; continue; } // daehwan if (bDebug) { cout << "insert_to_prev_right: " << insert_to_prev_right << endl; cout << "curr_left_to_insert: " << curr_left_to_insert << endl; cout << "curr_seg_index: " << curr_seg_index << endl; } /* * Keep track of how many mismatches were made to the genome in the region * where we should actually be matching against the insertion */ int this_reference_mismatch = 0; int insertion_mismatch = 0; int insertion_len = lb->sequence.length(); seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence); if (reversed) { seqan::reverseComplement(insertionSequence); } /* * First check to see if we need to adjust number of observed errors for the left (prev) * hit. This is only the case if this segment runs into the insertion. 
To be consistent * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region */ string colorSegmentSequence_prev; if (insert_to_prev_right > 0) { seqan::Dna5String referenceSequence, oldSegmentSequence; if (reversed) { referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1); seqan::reverseComplement(referenceSequence); string temp; // daehwan if (bDebug) { cout << "reversed: " << read_seq.length() << " " << read_seq << endl; } temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right); oldSegmentSequence = seqan::Dna5String(temp); } else { referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right()); // daehwan if (bDebug) { cout << "non-reversed: " << prev_hit->seq() << endl; } oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right)); } if (color) { string color; if (antisense) color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1); else color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1); color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1]; colorSegmentSequence_prev = str_convert_color_to_bp(color); } const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence; // daehwan if (bDebug) { cout << "ref: " << referenceSequence << endl; cout << "old: " << oldSegmentSequence << endl; cout << "ins: " << insertionSequence << endl; } /* * Scan right in the read until we run out of read */ for (int read_index = 0; read_index < insert_to_prev_right; ++read_index) { /* * Any mismatch to the insertion is a failure */ if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index]) { ++this_reference_mismatch; } if (read_index < insertion_len) { if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index]) { ++insertion_mismatch; break; } } else { if (referenceSequence[read_index - insertion_len] == 'N' || referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index]) { --this_reference_mismatch; } } } } string colorSegmentSequence_curr; if (curr_left_to_insert > 0) { seqan::Dna5String referenceSequence, oldSegmentSequence; if (reversed) { referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1); seqan::reverseComplement(referenceSequence); string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert); oldSegmentSequence = seqan::Dna5String(temp); } else { referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1); oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert)); } if (color) { string color; if (antisense) color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert); else color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert); color.push_back(curr_hit->seq()[curr_left_to_insert]); reverse(color.begin(), color.end()); string bp = str_convert_color_to_bp(color); reverse(bp.begin(), bp.end()); colorSegmentSequence_curr = bp; } const seqan::Dna5String newSegmentSequence = color ? 
colorSegmentSequence_curr : oldSegmentSequence; // daehwan if (bDebug) { cout << "ref: " << referenceSequence << endl; cout << "old: " << oldSegmentSequence << endl; cout << "ins: " << insertionSequence << endl; } /* * Scan left in the read until * We ran out of read sequence (insertion extends past segment) */ for (int read_index = 0; read_index < curr_left_to_insert; ++read_index) { int segmentPosition = curr_left_to_insert - read_index - 1; int insertionPosition = insertion_len - read_index - 1; if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition])) { ++this_reference_mismatch; } if (read_index < insertion_len) { if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition])) { ++insertion_mismatch; break; } } else { if (referenceSequence[segmentPosition + insertion_len] == 'N' || (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition])) { --this_reference_mismatch; } } } } if (found_closure) { // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id); return BowtieHit(); } if (insertion_mismatch == 0) { reference_mismatch = this_reference_mismatch; mismatch = -reference_mismatch; found_closure = true; new_left = prev_hit->left(); new_cigar = prev_hit->cigar(); /* * Need to make a new insert operation between the two match character that begin * and end the intersection of these two junction. Note that we necessarily assume * that this insertion can't span beyond the boundaries of these reads. That should * probably be better enforced somewhere */ new_cigar.back().length -= insert_to_prev_right; if (new_cigar.back().length <= 0) new_cigar.pop_back(); if (reversed) new_cigar.push_back(CigarOp(iNS, lb->sequence.size())); else new_cigar.push_back(CigarOp(INS, lb->sequence.size())); vector new_right_cigar = curr_hit->cigar(); new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size()); /* * Finish stitching together the new cigar string */ size_t c = new_right_cigar.front().length > 0 ? 0 : 1; for (; c < new_right_cigar.size(); ++c) { new_cigar.push_back(new_right_cigar[c]); } if (color) { if (insert_to_prev_right > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev); if (curr_left_to_insert > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr); } } } ++lb; } if (!found_closure) { return BowtieHit(); } } /* * Stitch segments together using juctions or deletions if necessary. 
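* Condensed restatement (not the literal code) of the lookup performed below for the
* non-reversed case, with left_boundary = prev_hit->right()-4 and
* right_boundary = curr_hit->left()+4 written out in full:
*   lb = possible_juncs.upper_bound(Junction(reference_id, prev_hit->right()-4, curr_hit->left()-4, true));
*   ub = possible_juncs.lower_bound(Junction(reference_id, prev_hit->right()+4, curr_hit->left()+4, false));
* Whether an accepted candidate ends up as a DEL or a REF_SKIP CIGAR op is decided later
* purely by its length relative to max_deletion_length (see the FIXME further down).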
*/ else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align()) { std::set::iterator lb, ub; // daehwan if (bDebug) { cout << "junction" << endl; cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl; cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl; } lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true)); ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false)); int new_diff_mismatches = 0xff; while (lb != ub && lb != possible_juncs.end()) { int dist_to_left, dist_to_right; if (reversed) { dist_to_left = lb->left - curr_hit->left(); dist_to_right = lb->right - prev_hit->right() - 1; } else { dist_to_left = lb->left - prev_hit->right() + 1; dist_to_right = lb->right - curr_hit->left(); } if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right) { /* * Check we have enough matched bases on prev or curr segment. */ if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) || (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length))) { ++lb; continue; } // daehwan if (bDebug) { cout << "candidate junction: " << endl; cout << "coords: " << lb->left << "-" << lb->right << endl; cout << "dist to left: " << dist_to_left << endl; } Dna5String new_cmp_str, old_cmp_str; int new_mismatch = 0, old_mismatch = 0; string new_patch_str; // this is for colorspace reads if (dist_to_left > 0) { if (reversed) { new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1); seqan::reverseComplement(new_cmp_str); old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right); seqan::reverseComplement(old_cmp_str); } else { new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1); old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right); } string new_seq; if (color) { string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1)); string color, qual; if (antisense) { color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left); qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left); } else { color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left); qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left); } BWA_decode(color, qual, ref, new_seq); new_seq = new_seq.substr(1); } string curr_hit_seq; if (reversed) curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left); else curr_hit_seq = curr_hit->seq(); const string& curr_old_seq = curr_hit_seq; const string& curr_seq = color ? 
new_seq : curr_hit_seq; for (int i = 0; i < dist_to_left; ++i) { if (curr_seq[i] != new_cmp_str[i]) ++new_mismatch; if (curr_old_seq[i] != old_cmp_str[i]) ++old_mismatch; } if (color) new_patch_str = curr_seq.substr(0, dist_to_left); } else if (dist_to_left < 0) { if (reversed) { new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1); seqan::reverseComplement(new_cmp_str); old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1); seqan::reverseComplement(old_cmp_str); } else { new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left()); old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right()); } size_t abs_dist = -dist_to_left; string new_seq; if (color) { string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1)); ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left())); string color, qual; if (antisense) { color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist); qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist); } else { color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist); qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist); } BWA_decode(color, qual, ref, new_seq); new_seq = new_seq.substr(1); } string prev_hit_seq; if (reversed) prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist); else prev_hit_seq = prev_hit->seq(); // daehwan if (bDebug) { cout << "reverse: " << (int)reversed << endl; cout << "new cmp str: " << new_cmp_str << endl; cout << "old cmp str: " << old_cmp_str << endl; cout << "hit seq: " << prev_hit_seq << endl; cout << "curr seq: " << curr_hit->seq() << endl; cout << read_seq << endl; cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length) << endl; } const string& prev_old_seq = prev_hit_seq; size_t prev_old_seq_len = prev_old_seq.length(); const string& prev_seq = color ? new_seq : prev_hit_seq; size_t prev_seq_len = prev_seq.length(); for (size_t i = 0; i < abs_dist; ++i) { if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i]) ++new_mismatch; if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i]) ++old_mismatch; } if (color) new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist); } int temp_diff_mismatches = new_mismatch - old_mismatch; // daehwan if (bDebug) { cout << "new mismatch: " << new_mismatch << endl; cout << "old mismatch: " << old_mismatch << endl; cout << "new_diff_mismatch: " << new_diff_mismatches << endl; cout << "temp mismatch: " << temp_diff_mismatches << endl; } if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2) { ++lb; continue; } if (color) { /* * We need to recover the origianl sequence. 
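* When several candidate junctions are evaluated in this loop for a colorspace read, the
* stitched sequence 'seq' may already carry the patch from a previously accepted candidate;
* the replace below therefore first restores the 8 bases flanking the segment boundary
* (last 4 of prev_hit->seq() plus first 4 of curr_hit->seq()) at offset
* first_seg_length + (curr_seg_index-1)*segment_length - 4, before the new patch is applied.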
*/ if (found_closure) { seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8, prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4)); } if (dist_to_left > 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str); else if (dist_to_left < 0) seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str); } new_diff_mismatches = temp_diff_mismatches; new_left = prev_hit->left(); new_cigar = prev_hit->cigar(); int new_left_back_len = new_cigar.back().length; if (reversed) new_left_back_len -= dist_to_left; else new_left_back_len += dist_to_left; vector new_right_cig = curr_hit->cigar(); int new_right_front_len = new_right_cig.front().length; if (reversed) new_right_front_len += dist_to_right; else new_right_front_len -= dist_to_right; if (new_left_back_len > 0) new_cigar.back().length = new_left_back_len; else new_cigar.pop_back(); /* * FIXME, currently just differentiating between a deletion and a * reference skip based on length. However, would probably be better * to denote the difference explicitly, this would allow the user * to supply their own (very large) deletions */ if ((lb->right - lb->left - 1) <= max_deletion_length) { if (reversed) new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1)); else new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1)); antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice(); } else { if (reversed) new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1)); else new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1)); antisense_closure = lb->antisense; } new_right_cig.front().length = new_right_front_len; size_t c = new_right_front_len > 0 ? 
0 : 1; for (; c < new_right_cig.size(); ++c) new_cigar.push_back(new_right_cig[c]); mismatch = new_diff_mismatches; found_closure = true; } ++lb; } if (!found_closure) { return BowtieHit(); } } else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align())) check_fusion = true; } if (check_fusion) { std::set::iterator lb, ub; uint32_t ref_id1 = prev_hit->ref_id2(); uint32_t ref_id2 = curr_hit->ref_id(); uint32_t left = prev_hit->right() - 4; uint32_t right = curr_hit->left() - 4; // daehwan if (bDebug) { cout << "daehwan - start - fusion" << endl << "ref_id1: " << ref_id1 << endl << "ref_id2: " << ref_id2 << endl << "left: " << left << endl << "right: " << right << endl << "dir: " << fusion_dir << endl; } bool reversed = false; if (fusion_dir != FUSION_FF && (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right))) { reversed = true; uint32_t temp = ref_id1; ref_id1 = ref_id2; ref_id2 = temp; temp = left; left = right; right = temp; } lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right)); ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8)); RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2()); RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id()); int new_diff_mismatches = 0xff; while (lb != ub && lb != possible_fusions.end()) { int lb_left = lb->left; int lb_right = lb->right; if (reversed) { lb_left = lb->right; lb_right = lb->left; } int dist_to_left, dist_to_right; if (fusion_dir == FUSION_RF) dist_to_left = prev_hit->right() - lb_left + 1; else dist_to_left = lb_left - prev_hit->right() + 1; if (fusion_dir == FUSION_FR) dist_to_right = curr_hit->left() - lb_right; else dist_to_right = lb_right - curr_hit->left(); // daehwan if (bDebug) { cout << "daehwan - fusion gap" << endl; cout << "dist left: " << dist_to_left << endl; cout << "dist right: " << dist_to_right << endl; } if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right) { /* * Check we have enough matched bases on prev or curr segment. */ if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length) { ++lb; continue; } Dna5String new_cmp_str, old_cmp_str; int new_mismatch = 0, old_mismatch = 0; string new_patch_str; // this is for colorspace reads if (dist_to_left > 0) { if (fusion_dir == FUSION_RF) { new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1); seqan::reverseComplement(new_cmp_str); } else new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1); if (fusion_dir == FUSION_FR) { old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1); seqan::reverseComplement(old_cmp_str); } else old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right); // daehwan if (bDebug) { cout << "new str: " << new_cmp_str << endl; cout << "old str: " << old_cmp_str << endl; cout << "curr seq: " << curr_hit->seq() << endl; } string curr_hit_seq; if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR) curr_hit_seq = curr_hit->seq(); else curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length); string new_seq; const string& curr_old_seq = curr_hit_seq; const string& curr_seq = color ? 
new_seq : curr_hit_seq; for (int i = 0; i < dist_to_left; ++i) { if (curr_seq[i] != new_cmp_str[i]) ++new_mismatch; if (curr_old_seq[i] != old_cmp_str[i]) ++old_mismatch; } } else if (dist_to_left < 0) { if (fusion_dir == FUSION_FR) { new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1); seqan::reverseComplement(new_cmp_str); } else new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left()); if (fusion_dir == FUSION_RF) { old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left); seqan::reverseComplement(old_cmp_str); } else old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right()); string prev_hit_seq; if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR) prev_hit_seq = prev_hit->seq(); else prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length); size_t abs_dist = -dist_to_left; string new_seq; const string& prev_old_seq = prev_hit_seq; size_t prev_old_seq_len = prev_old_seq.length(); const string& prev_seq = color ? new_seq : prev_hit_seq; size_t prev_seq_len = prev_seq.length(); for (size_t i = 0; i < abs_dist; ++i) { if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i]) ++new_mismatch; if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i]) ++old_mismatch; } } int temp_diff_mismatches = new_mismatch - old_mismatch; if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2) { ++lb; continue; } new_diff_mismatches = temp_diff_mismatches; new_left = prev_hit->left(); new_cigar = prev_hit->cigar(); int new_left_back_len = new_cigar.back().length; new_left_back_len += dist_to_left; vector new_right_cig = curr_hit->cigar(); int new_right_front_len = new_right_cig.front().length; new_right_front_len -= dist_to_right; if (new_left_back_len > 0) new_cigar.back().length = new_left_back_len; else new_cigar.pop_back(); new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right)); antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice(); new_right_cig.front().length = new_right_front_len; size_t c = new_right_front_len > 0 ? 0 : 1; for (; c < new_right_cig.size(); ++c) new_cigar.push_back(new_right_cig[c]); mismatch = new_diff_mismatches; found_closure = true; ++num_fusions; // daehwan if (bDebug) { cout << "daehwan - fusion gap - found" << endl; } } ++lb; } // daehwan if (bDebug) { cout << "daehwan2 - end - fusion: " << (found_closure ? 
"found" : "not found") << endl; } if (!found_closure) { return BowtieHit(); } } if (found_closure) { bool end = false; int mismatches = prev_hit->mismatches() + curr_hit->mismatches() + mismatch; BowtieHit merged_hit(prev_hit->ref_id(), curr_hit->ref_id2(), insert_id, new_left, new_cigar, antisense, antisense_closure, mismatches, mismatches + gap_length(new_cigar), prev_hit->splice_mms() + curr_hit->splice_mms(), end); // daehwan - should fix this for SOLiD dataset merged_hit.seq(prev_hit->seq() + curr_hit->seq()); // daehwan if (bDebug) { cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl; cout << print_cigar(merged_hit.cigar()) << endl; if (!merged_hit.check_editdist_consistency(rt, bDebug)) { prev_hit->check_editdist_consistency(rt, bDebug); curr_hit->check_editdist_consistency(rt, bDebug); cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl; cout << "this is a malformed hit" << endl; exit(1); } } prev_hit = hit_chain.erase(prev_hit, ++curr_hit); /* * prev_hit now points PAST the last element removed */ prev_hit = hit_chain.insert(prev_hit, merged_hit); /* * merged_hit has been inserted before the old position of * prev_hit. New location of prev_hit is merged_hit */ curr_hit = prev_hit; ++curr_hit; ++curr_seg_index; continue; } // daehwan if (bDebug) { cout << "daehwan - test 0.3" << endl; } ++prev_hit; ++curr_hit; ++curr_seg_index; } // daehwan if (bDebug) { cout << "daehwan - test2" << endl; } bool saw_antisense_splice = false; bool saw_sense_splice = false; vector long_cigar; int num_mismatches = 0; int num_splice_mms = 0; for (list::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s) { num_mismatches += s->mismatches(); num_splice_mms += s->splice_mms(); /* * Check whether the sequence contains any reference skips. 
Previously, * this was just a check to see whether the sequence was contiguous; however * we don't want to count an indel event as a splice */ bool containsSplice = s->is_spliced(); if (containsSplice) { if (s->antisense_splice()) { if (saw_sense_splice) return BowtieHit(); saw_antisense_splice = true; } else { if (saw_antisense_splice) return BowtieHit(); saw_sense_splice = true; } } const vector& cigar = s->cigar(); if (long_cigar.empty()) { long_cigar = cigar; } else { CigarOp& last = long_cigar.back(); /* * If necessary, merge the back and front * cigar operations */ if(last.opcode == cigar[0].opcode){ last.length += cigar[0].length; for (size_t b = 1; b < cigar.size(); ++b) { long_cigar.push_back(cigar[b]); } }else{ for(size_t b = 0; b < cigar.size(); ++b) { long_cigar.push_back(cigar[b]); } } } } bool end = false; BowtieHit new_hit(hit_chain.front().ref_id(), hit_chain.back().ref_id2(), insert_id, left, long_cigar, antisense, saw_antisense_splice, num_mismatches, num_mismatches + gap_length(long_cigar), num_splice_mms, end); if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR) { new_hit.seq(seq); if (bowtie2) { // for the time being, let's compare "seq" and "read_seq" if (seq != read_seq) { string temp_qual = read_quals; reverse(temp_qual.begin(), temp_qual.end()); new_hit.qual(temp_qual); } else new_hit.qual(read_quals); } else new_hit.qual(qual); } else { new_hit.seq(read_seq); new_hit.qual(read_quals); } bool do_reverse = new_hit.ref_id() > new_hit.ref_id2(); if (new_hit.ref_id() == new_hit.ref_id2()) { vector fusions; bool auto_sort = false; fusions_from_spliced_hit(new_hit, fusions, auto_sort); if (fusions.size() > 0) { const Fusion& fusion = fusions[0]; do_reverse = fusion.left > fusion.right; } } if (do_reverse) { new_hit = new_hit.reverse(); } /* if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR) { new_hit.antisense_align(!new_hit.antisense_align()); } */ if (fusion_dir != FUSION_NOTHING) { if (new_hit.seq() != read_seq) new_hit.antisense_align(true); else new_hit.antisense_align(false); } // daehwan if (bDebug) { cout << "daehwan - test3" << endl; cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl; cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl; } int new_read_len = new_hit.read_len(); if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug)) { // daehwan if (bDebug) { cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl; exit(1); } fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id()); return BowtieHit(); } return new_hit; } int multi_closure = 0; int anchor_too_short = 0; int gap_too_short = 0; bool valid_hit(const BowtieHit& bh) { if (bh.insert_id()) { /* * validate the cigar chain - no gaps shorter than an intron, etc. 
* also, * -Don't start or end with an indel or refskip * -Only a match operation is allowed is allowed * adjacent to an indel or refskip * -Indels should confrom to length restrictions */ const CigarOp* prevCig = &(bh.cigar()[0]); const CigarOp* currCig = &(bh.cigar()[1]); for (size_t i = 1; i < bh.cigar().size(); ++i){ currCig = &(bh.cigar()[i]); if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) && !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){ return false; } if(currCig->opcode == INS || currCig->opcode == iNS){ if(currCig->length > max_insertion_length){ return false; } } if(currCig->opcode == DEL || currCig->opcode == dEL){ if(currCig->length > max_deletion_length){ return false; } } if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){ if(currCig->length < (uint64_t)min_report_intron_length){ gap_too_short++; return false; } } prevCig = currCig; } if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) || !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* || (int)bh.cigar().front().length < min_anchor_len|| (int)bh.cigar().back().length < min_anchor_len*/ ) { anchor_too_short++; return false; } } else { multi_closure++; return false; } return true; } void merge_segment_chain(RefSequenceTable& rt, const string& read_seq, const string& read_quals, std::set& possible_juncs, std::set& possible_insertions, std::set& possible_fusions, vector& hits, vector& merged_hits, int fusion_dir = FUSION_NOTHING) { if (hits.size() == 0) return; BowtieHit bh; if (hits.size() > 1) { list hit_chain; if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR) { if (hits.front().antisense_align()) copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain)); else copy(hits.begin(), hits.end(), back_inserter(hit_chain)); } else { bool bSawFusion = false; for (size_t i = 0; i < hits.size(); ++i) { bool pushed = false; if (!bSawFusion) { if (i > 0) { if (hits[i-1].ref_id() != hits[i].ref_id()) bSawFusion = true; else if(hits[i-1].antisense_align() != hits[i].antisense_align()) bSawFusion = true; else { int dist = 0; if (hits[i].antisense_align()) dist = hits[i-1].left() - hits[i].right(); else dist = hits[i].left() - hits[i-1].right(); if (dist >= max_report_intron_length || dist < -(int)max_insertion_length) bSawFusion = true; } } } if (hits[i].fusion_opcode() == FUSION_NOTHING && ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) && hits[i].left() < hits[i].right()) { hit_chain.push_back(hits[i].reverse()); pushed = true; } if (i > 0 && hits[i].fusion_opcode() != FUSION_NOTHING && hits[i].ref_id() != hits[i-1].ref_id()) { hit_chain.push_back(hits[i].reverse()); pushed = true; } if (!bSawFusion) { if (hits[i].fusion_opcode() != FUSION_NOTHING) bSawFusion = true; } if (!pushed) hit_chain.push_back(hits[i]); } } // todo: merge_chain_color needs to be merged into merge_chain fuction. 
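/*
 * Illustrative sketch (not from the original TopHat sources): the #if 0 block
 * below restates, with simplified stand-in types and example thresholds, the
 * structural checks that valid_hit() above applies to a merged CIGAR chain --
 * the chain must begin and end on a match, adjacent indel/intron operations
 * are rejected, indels are bounded by the configured maximum lengths, and a
 * reference skip must be at least the minimum reportable intron length. The
 * names MiniOp/MiniOpCode and the constants are hypothetical, not TopHat's.
 */
#if 0
#include <cstddef>
#include <stdint.h>
#include <vector>

enum MiniOpCode { M_MATCH, M_INS, M_DEL, M_REF_SKIP };

struct MiniOp {
  MiniOpCode opcode;
  uint32_t length;
};

// Example thresholds; the real values are the options defined in common.cpp.
static const uint32_t kMaxInsertionLength    = 3;
static const uint32_t kMaxDeletionLength     = 3;
static const uint32_t kMinReportIntronLength = 50;

// Returns true if the chain obeys the same rules valid_hit() enforces.
bool cigar_chain_is_valid(const std::vector<MiniOp>& cigar)
{
  if (cigar.empty()) return false;
  // No dangling indel or intron at either end of the alignment.
  if (cigar.front().opcode != M_MATCH || cigar.back().opcode != M_MATCH)
    return false;
  for (size_t i = 1; i < cigar.size(); ++i) {
    const MiniOp& prev = cigar[i - 1];
    const MiniOp& curr = cigar[i];
    // Two non-match operations may not be adjacent; a match must separate them.
    if (prev.opcode != M_MATCH && curr.opcode != M_MATCH) return false;
    if (curr.opcode == M_INS && curr.length > kMaxInsertionLength) return false;
    if (curr.opcode == M_DEL && curr.length > kMaxDeletionLength) return false;
    if (curr.opcode == M_REF_SKIP && curr.length < kMinReportIntronLength) return false;
  }
  return true;
}
#endif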
if (color) bh = merge_chain_color(rt, read_seq, read_quals, possible_juncs, possible_insertions, hit_chain); else bh = merge_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, possible_fusions, hit_chain, fusion_dir); } else { bh = hits[0]; bool do_reverse = bh.ref_id() > bh.ref_id2(); if (bh.ref_id() == bh.ref_id2()) { vector fusions; bool auto_sort = false; fusions_from_spliced_hit(bh, fusions, auto_sort); if (fusions.size() > 0) { const Fusion& fusion = fusions[0]; do_reverse = fusion.left > fusion.right; } } if (do_reverse) bh = bh.reverse(); } if (valid_hit(bh)) merged_hits.push_back(bh); } bool dfs_seg_hits(RefSequenceTable& rt, const string& read_seq, const string& read_quals, std::set& possible_juncs, std::set& possible_insertions, std::set& possible_fusions, vector& seg_hits_for_read, size_t curr, vector& seg_hit_stack, vector& joined_hits, int& num_try, int fusion_dir = FUSION_NOTHING) { if (num_try <= 0) return false; assert (!seg_hit_stack.empty()); bool join_success = false; if (curr < seg_hits_for_read.size()) { for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i) { /* * As we reverse segments depending on directions like FR or RF, * it's necessary to recover the original segments. */ BowtieHit bh = seg_hits_for_read[curr].hits[i]; BowtieHit bh_prev = seg_hit_stack.back(); BowtieHit* prevHit = &bh_prev; BowtieHit* currHit = &bh; // daehwan - for debugging purposes // if (bh.insert_id() == 792140) // bDebug = true; /* * Each segment has at most one fusion by assumption, */ bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING; bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING; /* * Count the number of fusions on prev and curr segments, * but this doesn't take into account the gap (if exists) between the two segments. */ size_t num_fusions = prevHit_fused ? 1 : 0; num_fusions += currHit_fused ? 1 : 0; int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode(); if (!fusion_search && num_fusions > 0) continue; /* * We don't allow reads that span more than two fusion points. */ if (num_fusions >= 2) continue; if (fusion_dir != FUSION_NOTHING && currHit_fused) continue; if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR) { if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) || (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2())) continue; } if (bDebug) { cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl; cout << "daehwan - prev sense: " << (prevHit->antisense_align() ? "-" : "+") << "\t" << (prevHit->antisense_align2() ? "-" : "+") << endl; cout << "daehwan - prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl; cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl; cout << "daehwan - curr sense: " << (currHit->antisense_align() ? "-" : "+") << "\t" << (currHit->antisense_align2() ? 
"-" : "+") << endl; cout << "daehwan - curr coords: " << currHit->left() << "\t" << currHit->right() << endl; } if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) && prevHit->ref_id2() != currHit->ref_id()) continue; if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) || (fusion_dir == FUSION_RF && currHit->antisense_align())) continue; if (currHit_fused && dir == FUSION_RR) *currHit = currHit->reverse(); if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF))) { if (currHit_fused) { if ((dir == FUSION_FR && currHit->antisense_align()) || (dir == FUSION_RF && !currHit->antisense_align())) *currHit = currHit->reverse(); } else { if (fusion_dir == FUSION_FR && currHit->antisense_align()) *currHit = currHit->reverse(); } } /* * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases * to make it easier to check the distance in the gap between the two segments. */ else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && (!fusion_search || (prevHit->left() <= currHit->right() + (int)max_report_intron_length && prevHit->left() + (int)max_insertion_length >= currHit->right()))) || (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR) && ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align()))) ) { BowtieHit* tempHit = prevHit; prevHit = currHit; currHit = tempHit; } else if (num_fusions == 0) { if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align()) { int dist = 0; if (prevHit->antisense_align()) dist = prevHit->left() - currHit->right(); else dist = currHit->left() - prevHit->right(); if (dist > max_report_intron_length || dist < -(int)max_insertion_length) { if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) || (!prevHit->antisense_align() && prevHit->left() < currHit->left())) dir = FUSION_FF; else dir = FUSION_RR; } } else { if (prevHit->antisense_align() == currHit->antisense_align()) { if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) || (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id())) dir = FUSION_FF; else dir = FUSION_RR; } else if (!prevHit->antisense_align()) dir = FUSION_FR; else dir = FUSION_RF; if (dir == FUSION_FR) *currHit = currHit->reverse(); else if(dir == FUSION_RF) *prevHit = prevHit->reverse(); } } if (!fusion_search && dir != FUSION_NOTHING) continue; // daehwan - test if (bDebug) { cout << "insert id: " << prevHit->insert_id() << endl; cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl; cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl; cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl; cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl; cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl; cout << "curr corrds: " << currHit->left() << "\t" << currHit->right() << endl; cout << "prev sense: " << (prevHit->antisense_align() ? "-" : "+") << "\t" << (prevHit->antisense_align2() ? "-" : "+") << endl; cout << "curr sense: " << (currHit->antisense_align() ? 
"-" : "+") << "\t" << (currHit->antisense_align2() ? "-" : "+") << endl; } if (num_fusions == 1) { // daehwan if (bDebug) { cout << "direction: " << (int)dir << endl; } /* * orient the fused segment, which depends on a fusion direction. */ if (dir != FUSION_FF && dir != FUSION_RR) { bool prevHit_rep = false; bool currHit_rep = false; if (prevHit_fused) { if ((dir == FUSION_FR && !currHit->antisense_align()) || (dir == FUSION_RF && currHit->antisense_align())) continue; if (prevHit->ref_id2() != currHit->ref_id()) prevHit_rep = true; else if ((dir == FUSION_FR && prevHit->antisense_align()) || (dir == FUSION_RF && !prevHit->antisense_align())) prevHit_rep = true; } if (currHit_fused) { if ((dir == FUSION_FR && prevHit->antisense_align()) || (dir == FUSION_RF && !prevHit->antisense_align())) continue; if (currHit->ref_id() != prevHit->ref_id2()) currHit_rep = true; } if (bDebug) { if (prevHit_rep) cout << "1. reversed in prev" << endl; if (currHit_rep) cout << "1. reversed in curr" << endl; } if (prevHit_rep) *prevHit = prevHit->reverse(); if (currHit_rep) *currHit = currHit->reverse(); prevHit_rep = false; currHit_rep = false; if (prevHit_fused) { if (prevHit->is_forwarding_right() != currHit->is_forwarding_left()) currHit_rep = true; } else { if (prevHit->is_forwarding_right() != currHit->is_forwarding_left()) prevHit_rep = true; } if (prevHit_rep) *prevHit = prevHit->reverse(); if (currHit_rep) *currHit = currHit->reverse(); // daehwan if (bDebug) { if (prevHit_rep) cout << "2. reversed in prev" << endl; if (currHit_rep) cout << "2. reversed in curr" << endl; } } } bool same_contig = prevHit->ref_id2() == currHit->ref_id(); if (!same_contig && num_fusions > 0) continue; if (!fusion_search) { if (!same_contig || num_fusions > 0) continue; } if (same_contig && num_fusions >= 1) { if (prevHit->antisense_align2() != currHit->antisense_align()) continue; } int bh_l = 0, back_right = 0, dist = 0; if (same_contig) { if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) && prevHit->antisense_align2()) { bh_l = prevHit->right() + 1; back_right = currHit->left() + 1; } else { bh_l = currHit->left(); back_right = prevHit->right(); } dist = bh_l - back_right; } // daehwan - pass if (bDebug) { cout << "daehwan - pass" << endl; cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl; cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl; } if (!same_contig || (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) || (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left())) { // daehwan if (bDebug) { cout << "daehwan - really passed!!" << endl; } BowtieHit tempHit = seg_hit_stack.back(); seg_hit_stack.back() = bh_prev; // these hits are compatible, so push bh onto the // stack, recurse, and pop it when done. seg_hit_stack.push_back(bh); bool success = dfs_seg_hits(rt, read_seq, read_quals, possible_juncs, possible_insertions, possible_fusions, seg_hits_for_read, curr + 1, seg_hit_stack, joined_hits, num_try, dir == FUSION_NOTHING ? 
fusion_dir : dir); if (success) join_success = true; if (num_try <= 0) return join_success; seg_hit_stack.pop_back(); seg_hit_stack.back() = tempHit; } } } else { --num_try; merge_segment_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, possible_fusions, seg_hit_stack, joined_hits, fusion_dir); return join_success = true; } return join_success; } bool join_segments_for_read(RefSequenceTable& rt, const string& read_seq, const string& read_quals, std::set& possible_juncs, std::set& possible_insertions, std::set& possible_fusions, vector& seg_hits_for_read, vector& joined_hits) { vector seg_hit_stack; bool join_success = false; // ignore segments that map to more than this many places. if (bowtie2) { for (size_t s = 0; s < seg_hits_for_read.size(); ++s) { if (seg_hits_for_read[s].hits.size() > max_seg_multihits) return join_success; } } for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i) { BowtieHit& bh = seg_hits_for_read[0].hits[i]; // daehwan - remove this //if (bh.insert_id() == 16487) // bDebug = true; if (bh.fusion_opcode() == FUSION_RR) seg_hit_stack.push_back(bh.reverse()); else seg_hit_stack.push_back(bh); const int max_try = 10000; int num_try = max_try; bool success = dfs_seg_hits(rt, read_seq, read_quals, possible_juncs, possible_insertions, possible_fusions, seg_hits_for_read, 1, seg_hit_stack, joined_hits, num_try); if (success) join_success = true; seg_hit_stack.pop_back(); } return join_success; } struct JoinSegmentsWorker { void operator()() { ReadTable it; GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str(), bam_output_fname + ".index"); ReadStream readstream(reads_fname); if (readstream.file() == NULL) err_die("Error: cannot open %s for reading\n", reads_fname.c_str()); if (read_offset > 0) readstream.seek(read_offset); uint32_t curr_contig_obs_order = VMAXINT32; HitStream* first_seg_contig_stream = NULL; uint64_t next_contig_id = 0; if (contig_hits.size() > 0) { first_seg_contig_stream = &(contig_hits.front()); next_contig_id = first_seg_contig_stream->next_group_id(); curr_contig_obs_order = it.observation_order(next_contig_id); } HitsForRead curr_hit_group; uint32_t curr_spliced_obs_order = VMAXINT32; HitStream* first_seg_spliced_stream = NULL; uint64_t next_spliced_id = 0; if (spliced_hits.size() > 0) { first_seg_spliced_stream = &(spliced_hits.front()); next_spliced_id = first_seg_spliced_stream->next_group_id(); curr_spliced_obs_order = it.observation_order(next_spliced_id); } while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) && (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id)) { uint32_t read_in_process; vector seg_hits_for_read; seg_hits_for_read.resize(contig_hits.size()); if (curr_contig_obs_order < curr_spliced_obs_order) { first_seg_contig_stream->next_read_hits(curr_hit_group); seg_hits_for_read.front() = curr_hit_group; next_contig_id = first_seg_contig_stream->next_group_id(); uint32_t next_order = it.observation_order(next_contig_id); read_in_process = curr_contig_obs_order; curr_contig_obs_order = next_order; } else if (curr_spliced_obs_order < curr_contig_obs_order) { first_seg_spliced_stream->next_read_hits(curr_hit_group); seg_hits_for_read.front() = curr_hit_group; next_spliced_id = first_seg_spliced_stream->next_group_id(); uint32_t next_order = it.observation_order(next_spliced_id); read_in_process = curr_spliced_obs_order; curr_spliced_obs_order = next_order; if (read_in_process < begin_id) continue; } else if (curr_contig_obs_order == 
curr_spliced_obs_order && curr_contig_obs_order != VMAXINT32 && curr_spliced_obs_order != VMAXINT32) { first_seg_contig_stream->next_read_hits(curr_hit_group); HitsForRead curr_spliced_group; first_seg_spliced_stream->next_read_hits(curr_spliced_group); curr_hit_group.hits.insert(curr_hit_group.hits.end(), curr_spliced_group.hits.begin(), curr_spliced_group.hits.end()); seg_hits_for_read.front() = curr_hit_group; read_in_process = curr_spliced_obs_order; next_contig_id = first_seg_contig_stream->next_group_id(); uint32_t next_order = it.observation_order(next_contig_id); next_spliced_id = first_seg_spliced_stream->next_group_id(); uint32_t next_spliced_order = it.observation_order(next_spliced_id); curr_spliced_obs_order = next_spliced_order; curr_contig_obs_order = next_order; } else { break; } if (contig_hits.size() > 1) { look_right_for_hit_group(it, contig_hits, 0, spliced_hits, curr_hit_group, seg_hits_for_read); } int last_non_empty = seg_hits_for_read.size() - 1; while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty()) { --last_non_empty; } seg_hits_for_read.resize(last_non_empty + 1); if (!seg_hits_for_read[last_non_empty].hits[0].end()) continue; if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty()) { uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id(); if (insert_id >= begin_id && insert_id < end_id) { Read read; if (readstream.getRead(insert_id, read)) { vector joined_hits; join_segments_for_read(*rt, read.seq.c_str(), read.qual.c_str(), *possible_juncs, *possible_insertions, *possible_fusions, seg_hits_for_read, joined_hits); sort(joined_hits.begin(), joined_hits.end()); vector::iterator new_end = unique(joined_hits.begin(), joined_hits.end()); joined_hits.erase(new_end, joined_hits.end()); for (size_t i = 0; i < joined_hits.size(); i++) { if (joined_hits[i].mismatches() > read_mismatches || joined_hits[i].gap_length() > read_gap_length || joined_hits[i].edit_dist() > read_edit_dist) continue; const char* ref_name = rt->get_name(joined_hits[i].ref_id()); const char* ref_name2 = ""; if (joined_hits[i].fusion_opcode() != FUSION_NOTHING) ref_name2 = rt->get_name(joined_hits[i].ref_id2()); vector extra_fields; if (!color) bowtie_sam_extra(joined_hits[i], *rt, extra_fields); if (color) print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields); else print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2, read.seq.c_str(), read.qual.c_str(), false, &extra_fields); } } else { err_die("Error: could not get read # %d from stream\n", read_in_process); } } } else { //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process); } } } string bam_output_fname; string sam_header_fname; string reads_fname; vector segmap_fnames; vector spliced_segmap_fnames; std::set* possible_juncs; std::set* possible_insertions; std::set* possible_fusions; RefSequenceTable* rt; uint64_t begin_id; uint64_t end_id; int64_t read_offset; vector seg_offsets; vector spliced_seg_offsets; vector contig_hits; vector spliced_hits; }; void driver(const string& bam_output_fname, istream& ref_stream, vector& possible_juncs_files, vector& possible_insertions_files, vector& possible_deletions_files, vector& possible_fusions_files, vector& spliced_segmap_fnames, //.bam files vector& segmap_fnames, //.bam files const string& reads_fname) { if (!parallel) num_threads = 1; if (segmap_fnames.size() == 0) { fprintf(stderr, "No hits to 
process, exiting\n"); exit(0); } RefSequenceTable rt(sam_header, true); fprintf (stderr, "Loading reference sequences...\n"); get_seqs(ref_stream, rt, true); fprintf (stderr, " reference sequences loaded.\n"); fprintf(stderr, "Loading junctions..."); std::set possible_juncs; for (size_t i = 0; i < possible_juncs_files.size(); ++i) { char buf[2048]; while(!feof(possible_juncs_files[i]) && fgets(buf, sizeof(buf), possible_juncs_files[i])) { char junc_ref_name[256]; int left; int right; char orientation; int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation); if (ret != 4) continue; uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0); possible_juncs.insert(Junction(ref_id, left, right, orientation == '-')); } } fprintf(stderr, "done\n"); fprintf(stderr, "Loading deletions..."); for (size_t i = 0; i < possible_deletions_files.size(); ++i) { char splice_buf[2048]; FILE* deletions_file = possible_deletions_files[i]; if(!deletions_file){ continue; } while(fgets(splice_buf, 2048, deletions_file)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord) { err_die("Error: malformed deletion coordinate record\n"); } uint32_t ref_id = rt.get_id(ref_name,NULL,0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false)); } } fprintf(stderr, "done\n"); /* * Read the insertions from the list of insertion * files into a set */ fprintf(stderr, "Loading insertions..."); std::set possible_insertions; for (size_t i=0; i < possible_insertions_files.size(); ++i) { char splice_buf[2048]; FILE* insertions_file = possible_insertions_files[i]; if(!insertions_file){ continue; } while(fgets(splice_buf, 2048, insertions_file)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* scan_sequence = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_sequence || !scan_right_coord) { err_die("Error: malformed insertion coordinate record\n"); } uint32_t ref_id = rt.get_id(ref_name,NULL,0); uint32_t left_coord = atoi(scan_left_coord); std::string sequence(scan_sequence); possible_insertions.insert(Insertion(ref_id, left_coord, sequence)); } } fprintf(stderr, "done\n"); vector read_ids; vector > offsets; if (num_threads > 1) { vector fnames; fnames.push_back(reads_fname); fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend()); fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend()); bool enough_data = calculate_offsets(fnames, read_ids, offsets); if (!enough_data) num_threads = 1; } std::set possible_fusions; if (fusion_search) { fprintf(stderr, "Loading fusions..."); for (size_t i=0; i < possible_fusions_files.size(); ++i) { char splice_buf[2048]; FILE* fusions_file = possible_fusions_files[i]; if(!fusions_file){ continue; } while(fgets(splice_buf, 2048, fusions_file)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name1 = strsep((char**)&buf, "\t"); char* scan_left_coord = strsep((char**)&buf, "\t"); char* ref_name2 = strsep((char**)&buf, "\t"); char* scan_right_coord = 
strsep((char**)&buf, "\t"); char* scan_dir = strsep((char**)&buf, "\t"); if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0); uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); uint32_t dir = FUSION_FF; if (strcmp(scan_dir, "fr") == 0) dir = FUSION_FR; else if(strcmp(scan_dir, "rf") == 0) dir = FUSION_RF; else if (strcmp(scan_dir, "rr") == 0) dir = FUSION_RR; possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir)); } } fprintf(stderr, "done\n"); } vector factories; ReadTable it; samfile_t* common_spliced_bam_file = NULL; if (spliced_segmap_fnames.size() > 0) { common_spliced_bam_file = samopen(spliced_segmap_fnames[0].c_str(), "rb", 0); } vector workers(num_threads); for (int i = 0; i < num_threads; ++i) { JoinSegmentsWorker& worker = workers[i]; if (num_threads == 1) worker.bam_output_fname = bam_output_fname; else { string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4); char filename[1024] = {0}; sprintf(filename, "%s%d.bam", filename_base.c_str(), i); worker.bam_output_fname = filename; } worker.sam_header_fname = sam_header; worker.reads_fname = reads_fname; worker.possible_juncs = &possible_juncs; worker.possible_insertions = &possible_insertions; worker.possible_fusions = &possible_fusions; worker.rt = &rt; if (i == 0) { worker.begin_id = 0; worker.seg_offsets = vector(segmap_fnames.size(), 0); worker.spliced_seg_offsets = vector(spliced_segmap_fnames.size(), 0); worker.read_offset = 0; } else { worker.begin_id = read_ids[i-1]; worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].rbegin(), offsets[i-1].rbegin() + segmap_fnames.size()); worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(), offsets[i-1].rbegin() + segmap_fnames.size(), offsets[i-1].rend() - 1); worker.read_offset = offsets[i-1][0]; } worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits::max(); // create HitFactory and HitStream one by one, which is necessary due to a huge SAM header from spliced segment mapping, // which happens with fusion option enabled. // otherwise, if we do this each thread, it may create lots of holes in memory alignment // (imagine each thread allocates and deallocates memory for each header line). 
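/*
 * Illustrative sketch (not from the original TopHat sources): the worker setup
 * just above assigns each thread a half-open range of read ids -- worker 0
 * starts at id 0, worker i starts at the boundary id computed for it by
 * calculate_offsets(), and the last worker runs to the maximum id. The #if 0
 * block restates that partitioning in isolation; WorkerRange and
 * split_by_read_id are hypothetical names used only for this sketch, and
 * boundary_ids is assumed to hold one entry per additional thread.
 */
#if 0
#include <stdint.h>
#include <limits>
#include <vector>

struct WorkerRange {
  uint64_t begin_id; // first read id this worker owns (inclusive)
  uint64_t end_id;   // first read id it does NOT own (exclusive)
};

std::vector<WorkerRange> split_by_read_id(const std::vector<uint64_t>& boundary_ids,
                                          int num_threads)
{
  std::vector<WorkerRange> ranges(num_threads);
  for (int i = 0; i < num_threads; ++i) {
    ranges[i].begin_id = (i == 0) ? 0 : boundary_ids[i - 1];
    ranges[i].end_id   = (i + 1 < num_threads)
                           ? boundary_ids[i]
                           : std::numeric_limits<uint64_t>::max();
  }
  return ranges;
}
#endif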
bool need_seq = true, need_qual = true; for (size_t j = 0; j < segmap_fnames.size(); ++j) { HitFactory* fac = new BAMHitFactory(it, rt); factories.push_back(fac); HitStream hs(segmap_fnames[j], fac, false, false, false, need_seq, need_qual); if (worker.seg_offsets[j] > 0) hs.seek(worker.seg_offsets[j]); worker.contig_hits.push_back(hs); } for (size_t j = 0; j < spliced_segmap_fnames.size(); ++j) { int anchor_length = 0; HitFactory* fac = new SplicedBAMHitFactory(it, rt, common_spliced_bam_file->header, anchor_length); factories.push_back(fac); HitStream hs(spliced_segmap_fnames[j], fac, true, false, false, need_seq, need_qual); if (worker.spliced_seg_offsets[j] > 0) hs.seek(worker.spliced_seg_offsets[j]); worker.spliced_hits.push_back(hs); } } vector threads; for (int i = 0; i < num_threads; ++i) { if (num_threads > 1 && i + 1 < num_threads) threads.push_back(new boost::thread(workers[i])); else workers[i](); } for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); delete threads[i]; threads[i] = NULL; } threads.clear(); for (size_t fac = 0; fac < factories.size(); ++fac) { delete factories[fac]; } factories.clear(); samclose(common_spliced_bam_file); } //driver int main(int argc, char** argv) { fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "--------------------------------------------\n"); int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } string ref_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string reads_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string juncs_file_list = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string insertions_file_list = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string deletions_file_list = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string fusions_file_list = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string bam_output_fname = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string segmap_file_list = argv[optind++]; string spliced_segmap_flist; if(optind < argc) { spliced_segmap_flist = argv[optind++]; } ifstream ref_stream(ref_file_name.c_str(), ifstream::in); if (!ref_stream.good()) err_die("Error: cannot open %s for reading\n",ref_file_name.c_str()); checkSamHeader(); //FILE* reads_file = fopen(reads_file_name.c_str(), "r"); vector segmap_file_names; tokenize(segmap_file_list, ",",segmap_file_names); vector juncs_file_names; vector juncs_files; tokenize(juncs_file_list, ",",juncs_file_names); for (size_t i = 0; i < juncs_file_names.size(); ++i) { //fprintf(stderr, "Opening %s for reading\n", // juncs_file_names[i].c_str()); FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r"); if (juncs_file == NULL) { fprintf(stderr, "Warning: cannot open %s for reading\n", juncs_file_names[i].c_str()); continue; } juncs_files.push_back(juncs_file); } /* * Read in the deletion file names */ vector deletions_file_names; vector deletions_files; tokenize(deletions_file_list, ",",deletions_file_names); for (size_t i = 0; i < deletions_file_names.size(); ++i) { //fprintf(stderr, "Opening %s for reading\n", // deletions_file_names[i].c_str()); FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r"); if (deletions_file == NULL) { fprintf(stderr, "Warning: cannot open %s for reading\n", deletions_file_names[i].c_str()); continue; } 
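/*
 * Illustrative sketch (not from the original TopHat sources): the deletion
 * files opened in this loop are parsed in driver() above, where each
 * tab-separated record (reference name, left coordinate, right coordinate) is
 * turned into a Junction spanning the deleted bases. The #if 0 block restates
 * that conversion with a hypothetical helper and a simplified struct; the only
 * point it illustrates is the left_coord - 1 adjustment applied when loading.
 */
#if 0
#include <stdint.h>

struct MiniJunction {
  uint32_t ref_id;
  uint32_t left;
  uint32_t right;
  bool antisense;
};

MiniJunction make_deletion_junction(uint32_t ref_id,
                                    uint32_t left_coord,
                                    uint32_t right_coord)
{
  MiniJunction j;
  j.ref_id = ref_id;
  j.left = left_coord - 1; // mirrors the left_coord - 1 used when loading deletions in driver()
  j.right = right_coord;
  j.antisense = false;     // deletions are recorded as sense junctions
  return j;
}
#endif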
deletions_files.push_back(deletions_file); } /* * Read in the list of filenames that contain * insertion coordinates */ vector insertions_file_names; vector insertions_files; tokenize(insertions_file_list, ",",insertions_file_names); for (size_t i = 0; i < insertions_file_names.size(); ++i) { //fprintf(stderr, "Opening %s for reading\n", // insertions_file_names[i].c_str()); FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r"); if (insertions_file == NULL) { fprintf(stderr, "Warning: cannot open %s for reading\n", insertions_file_names[i].c_str()); continue; } insertions_files.push_back(insertions_file); } vector spliced_segmap_file_names; vector spliced_segmap_files; string unzcmd; tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names); vector fusions_file_names; vector fusions_files; tokenize(fusions_file_list, ",",fusions_file_names); for (size_t i = 0; i < fusions_file_names.size(); ++i) { fprintf(stderr, "Opening %s for reading\n", fusions_file_names[i].c_str()); FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r"); if (fusions_file == NULL) { fprintf(stderr, "Warning: cannot open %s for reading\n", fusions_file_names[i].c_str()); continue; } fusions_files.push_back(fusions_file); } driver(bam_output_fname, ref_stream, juncs_files, insertions_files, deletions_files, fusions_files, spliced_segmap_file_names, segmap_file_names, reads_file_name); return 0; } tophat-2.0.9/src/qual.h0000644000175000017500000000737012122334357013464 0ustar toortoor#ifndef QUAL_H_ #define QUAL_H_ /* NOTE: This file was written by Ben Langmead, and is borrowed from Bowtie */ #include #include #include "assert_helpers.h" using namespace std; extern unsigned char qualRounds[]; extern unsigned char solToPhred[]; /// Translate a Phred-encoded ASCII character into a Phred quality static inline uint8_t phredCharToPhredQual(char c) { return ((uint8_t)c >= 33 ? ((uint8_t)c - 33) : 0); } /** * Convert a Solexa-scaled quality value into a Phred-scale quality * value. * * p = probability that base is miscalled * Qphred = -10 * log10 (p) * Qsolexa = -10 * log10 (p / (1 - p)) * See: http://en.wikipedia.org/wiki/FASTQ_format * */ static inline uint8_t solexaToPhred(int sol) { assert_lt(sol, 256); if(sol < -10) return 0; return solToPhred[sol+10]; } class SimplePhredPenalty { public: static uint8_t mmPenalty (uint8_t qual) { return qual; } static uint8_t delPenalty(uint8_t qual) { return qual; } static uint8_t insPenalty(uint8_t qual_left, uint8_t qual_right) { return std::max(qual_left, qual_right); } }; class MaqPhredPenalty { public: static uint8_t mmPenalty (uint8_t qual) { return qualRounds[qual]; } static uint8_t delPenalty(uint8_t qual) { return qualRounds[qual]; } static uint8_t insPenalty(uint8_t qual_left, uint8_t qual_right) { return qualRounds[std::max(qual_left, qual_right)]; } }; static inline uint8_t mmPenalty(bool maq, uint8_t qual) { if(maq) { return MaqPhredPenalty::mmPenalty(qual); } else { return SimplePhredPenalty::mmPenalty(qual); } } static inline uint8_t delPenalty(bool maq, uint8_t qual) { if(maq) { return MaqPhredPenalty::delPenalty(qual); } else { return SimplePhredPenalty::delPenalty(qual); } } static inline uint8_t insPenalty(bool maq, uint8_t qual_left, uint8_t qual_right) { if(maq) { return MaqPhredPenalty::insPenalty(qual_left, qual_right); } else { return SimplePhredPenalty::insPenalty(qual_left, qual_right); } } /** * Take an ASCII-encoded quality value and convert it to a Phred33 * ASCII char. 
*/ inline static char charToPhred33(char c, bool solQuals, bool phred64Quals) { if(c == ' ') { cerr << "Saw a space but expected an ASCII-encoded quality value." << endl << "Are quality values formatted as integers? If so, try --integer-quals." << endl; throw 1; } if (solQuals) { // Convert solexa-scaled chars to phred // http://maq.sourceforge.net/fastq.shtml char cc = solexaToPhred((int)c - 64) + 33; if (cc < 33) { cerr << "Saw ASCII character " << ((int)c) << " but expected 64-based Solexa qual (converts to " << (int)cc << ")." << endl << "Try not specifying --solexa-quals." << endl; throw 1; } c = cc; } else if(phred64Quals) { if (c < 64) { cerr << "Saw ASCII character " << ((int)c) << " but expected 64-based Phred qual." << endl << "Try not specifying --solexa1.3-quals/--phred64-quals." << endl; throw 1; } // Convert to 33-based phred c -= (64-33); } else { // Keep the phred quality if (c < 33) { cerr << "Saw ASCII character " << ((int)c) << " but expected 33-based Phred qual." << endl; throw 1; } } return c; } /** * Take an integer quality value and convert it to a Phred33 ASCII * char. */ inline static char intToPhred33(int iQ, bool solQuals) { int pQ; if (solQuals) { // Convert from solexa quality to phred // quality and translate to ASCII // http://maq.sourceforge.net/qual.shtml pQ = solexaToPhred((int)iQ) + 33; } else { // Keep the phred quality and translate // to ASCII pQ = (iQ <= 93 ? iQ : 93) + 33; } if (pQ < 33) { cerr << "Saw negative Phred quality " << ((int)pQ-33) << "." << endl; throw 1; } assert_geq(pQ, 0); return (int)pQ; } #endif /*QUAL_H_*/ tophat-2.0.9/src/FastaTools.cpp0000644000175000017500000000743412157116165015140 0ustar toortoor// // FastaTools.cpp // TopHat // // Created by Harold Pimentel on 10/27/11. // #include "FastaTools.h" FastaReader::FastaReader() { isPrimed_ = false; } FastaReader::FastaReader(std::string fname) { isPrimed_ = false; init(fname); } FastaReader::~FastaReader() { ifstream_.close(); } void FastaReader::init(std::string fname) { if (isPrimed_) { std::cerr << "Warning: object has already FastaReader has already been " << "initialized with file: " << fname_ << std::endl; return; } std::ios::sync_with_stdio(false); //to speed up slow iostream reading fname_ = fname; ifstream_.open(fname_.c_str(), std::ios::in); if (!ifstream_.good()) { std::cerr << "ERROR: Could not open file " << fname_ << " in FastaReader" << std::endl; exit(1); } // Check the first character to see if it is valid char c = ifstream_.peek(); if (c != '>') { std::cerr << "ERROR: Invalid format for FASTA file. Begins with a '" << c << "'instead of a '>'" << std::endl; exit(1); } isPrimed_ = true; } bool FastaReader::good() const { return ifstream_.good() && !ifstream_.eof(); } // Up to caller to allocate memory. 
// Only deallocates memory when there are no more records left bool FastaReader::next(FastaRecord& rec) { if (!isPrimed_) { std::cerr << "ERROR: Stream has not been primed (FastaReader)" << std::endl; exit(1); } // Get the entire first line and description //ifstream_.getline(line_buf_, LINE_BUF_SIZE); if (ifstream_.eof() || !std::getline(ifstream_, line_buf_)) { rec.clear(); return false; } if (line_buf_.empty() || !good()) { rec.clear(); return false; } if (line_buf_.length()>0 && line_buf_[0]!='>') { std::cerr << "ERROR: no FASTA record start found (FastaReader)" << std::endl; exit(1); } size_t sp_pos = line_buf_.find(' '); if (sp_pos != std::string::npos) { rec.id_=line_buf_.substr(1, sp_pos-1); rec.desc_=line_buf_.substr(sp_pos+1); } else { rec.id_=line_buf_.substr(1); rec.desc_.clear(); } rec.seq_.clear(); // Read until you see another ">" while (ifstream_.peek() != '>') { //ifstream_ >> cur_line >> std::ws; if (std::getline(ifstream_, line_buf_)) rec.seq_ += line_buf_; else { break; // if ifstream_.good() && !ifstream_.eof() && } } return true; } FastaWriter::FastaWriter() { isPrimed_ = false; } FastaWriter::FastaWriter(std::string fname) { isPrimed_ = false; init(fname); } FastaWriter::~FastaWriter() { ofstream_.close(); } void FastaWriter::init(std::string fname) { if (isPrimed_) { std::cerr << "Warning: Cannot allocate FastaWriter to file '" << fname << "'. It has already been allocated to file '" << fname_ << "'" << std::endl; return; } ofstream_.open(fname.c_str(), std::ios::out); if (!ofstream_.good()) { std::cerr << "ERROR: Could not open " << fname << " for writing in " << "FastaWriter" << std::endl; exit(1); } fname_ = fname; isPrimed_ = true; } void FastaWriter::write(FastaRecord& rec, size_t column_size) { if (rec.seq_.length() == 0) return; //don't write empty records ofstream_ << ">" << rec.id_; //<< std::endl; if (rec.desc_.length()) { ofstream_ << " " << rec.desc_; } ofstream_ << std::endl; // iterate throught the string and print out the string size_t start = 0; while (start < rec.seq_.length()) { ofstream_ << rec.seq_.substr(start, column_size) << std::endl; start += column_size; } } tophat-2.0.9/src/alphabet.c0000644000175000017500000000721012122334360014260 0ustar toortoor#include uint8_t dna4Cat[] = { /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, /* - */ /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0, /* A B C D G H K M N */ /* 80 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, /* R S T V W X Y */ /* 96 */ 0, 1, 2, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 2, 2, 0, /* a b c d g h k m n */ /* 112 */ 0, 0, 2, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, /* r s t v w x y */ /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2, /// T=3, N=4 uint8_t charToDna5[] = { /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, /* A C G N */ /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* T */ /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, /* a c g n */ /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* t */ /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /// For converting from ASCII to the reverse-complement Dna5 code where /// A=3, C=2, G=1, T=0, N=4 uint8_t rcCharToDna5[] = { /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 */ 0, 3, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, /* A C G N */ /* 80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* T */ /* 96 */ 0, 3, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, /* a c g n */ /* 112 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* t */ /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; tophat-2.0.9/src/fix_map_ordering.cpp0000644000175000017500000003246512122334360016366 0ustar toortoor/* * fix_map_ordering.cpp * TopHat * * Created by Cole Trapnell on 2/28/09. * Copyright 2009 Cole Trapnell. All rights reserved. 
* */ #include #include #include #include #include #include "common.h" #include "reads.h" #include "bwt_map.h" using namespace seqan; using namespace std; struct TabSplitLine { //split a text line into an array of strings t[] //holds a copy of the text line int tcount; char **t; char *str; //text line, with \0s instead of tabs int tcap; TabSplitLine(const char* line) { tcount=0; t=NULL; tcap=0; str=NULL; if (line==NULL) return; str=strdup(line); //Notes: * destructive operation for s (replaces every \t with \0) // * user must free t when no longer needed tcap=14; t=(char**)malloc(tcap*sizeof(char*)); char prevch=0; for (char* p = str; *p!=0 ;p++) { if (*p=='\t') *p=0; //break the string here if (prevch==0) { //field start if (tcount==tcap) { tcap+=4; t = (char**)realloc(t,tcap*sizeof(char*)); } t[tcount]=p; tcount++; } //field start prevch=*p; if (*p=='\n' || *p=='\r') { *p=0; break; } }//for each character on the line } ~TabSplitLine() { if (str!=NULL) { free(str); free(t); } } }; struct MapOrdering { bool operator()(pair& lhs, pair& rhs) { uint64_t lhs_id = lhs.first; uint64_t rhs_id = rhs.first; return lhs_id > rhs_id; } }; #define NOQUALS 0xFF void copy_quals(const bam1_t& from_bam, bam1_t& to_bam) { const uint8_t *base_bq = bam1_qual(&from_bam); uint8_t *this_bq = bam1_qual(&to_bam); if ((from_bam.core.flag & BAM_FREVERSE) == (to_bam.core.flag & BAM_FREVERSE)) { memcpy(this_bq, base_bq, to_bam.core.l_qseq); } else { for(int i=0;i<(to_bam.core.l_qseq);i++) { this_bq[to_bam.core.l_qseq - i - 1] = base_bq[i]; } } } // "AS:i" (alignment score) is considered. struct BamMapOrdering { bool operator()(pair& lhs, pair& rhs) { uint64_t lhs_id = lhs.first; uint64_t rhs_id = rhs.first; //if (lhs_id != rhs_id || !bowtie2) if (lhs_id != rhs_id) return lhs_id > rhs_id; //they have the same ID here int lhs_score, rhs_score; lhs_score = rhs_score = numeric_limits::min(); if (bowtie2) { uint8_t* ptr = bam_aux_get(lhs.second, "AS"); if (ptr) lhs_score = bam_aux2i(ptr); ptr = bam_aux_get(rhs.second, "AS"); if (ptr) rhs_score = bam_aux2i(ptr); if (lhs_score != rhs_score) return lhs_score < rhs_score; } lhs_score = rhs_score = numeric_limits::min(); uint8_t* ptr = bam_aux_get(lhs.second, "NM"); if (ptr) lhs_score = bam_aux2i(ptr); ptr = bam_aux_get(rhs.second, "NM"); if (ptr) rhs_score = bam_aux2i(ptr); if (lhs_score != rhs_score) return lhs_score < rhs_score; //try to get a stable sort here bam1_t* lb=lhs.second; bam1_t* rb=rhs.second; if (lb->core.tid != rb->core.tid) return lb->core.tidcore.tid; if (lb->core.pos != rb->core.pos) return lb->core.poscore.pos; return lhs.second->core.flag & BAM_FSECONDARY; } }; void writeSamLine(TabSplitLine& l, FILE* f) { if (l.tcount<10) { //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline); return; } int flag=atoi(l.t[1]); //FLAG if ((flag & BAM_FUNMAP) != 0) return; fprintf(f, "%s", l.t[0]); for (int i=1;i11) { for (int i=11;iadd_aux(l.t[i]); }//for each aux field } wbam.write(brec); delete brec; } void driver_bam(string& fname, GBamWriter& bam_writer, GBamWriter* umbam) { tamFile fh=sam_open(sam_header.c_str()); bam_header_t* header=sam_header_read(fh); sam_close(fh); priority_queue< pair, vector >, BamMapOrdering > map_pq; GBamWriter* wmulti=NULL; //for multi-mapped prefiltering if (!aux_outfile.empty() && max_multihits>0) { wmulti=new GBamWriter(aux_outfile.c_str(), sam_header.c_str()); } tamFile fp=sam_open(fname.c_str()); bam1_t *b = bam_init1(); //uint64_t last_id = 0; do { bool new_record = (sam_read1(fp, header, b) >= 0); if (new_record) { 
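// Illustrative sketch (not from the original TopHat sources): the statements
// that follow push every alignment into map_pq, a std::priority_queue whose
// BamMapOrdering comparator returns lhs_id > rhs_id (with further tie-breaking
// on alignment score and edit distance for equal ids). Because priority_queue
// treats its comparator as "less than", that yields a min-heap on read id, so
// buffered records are drained in ascending read-id order. The #if 0 block
// shows the same idiom with plain integers.
#if 0
#include <stdint.h>
#include <functional>
#include <queue>
#include <vector>

void drain_in_ascending_order()
{
  // std::greater as the comparator makes the smallest id the top element.
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t> > pq;
  pq.push(7);
  pq.push(2);
  pq.push(5);
  while (!pq.empty()) {
    uint64_t id = pq.top(); // pops 2, then 5, then 7
    pq.pop();
    (void)id;
  }
}
#endif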
char *qname = bam1_qname(b); uint64_t qid=(uint64_t)atol(qname); if (color) { int qname_len = strlen(qname); assert (qname_len > 3); qid = qid << 24 | (uint64_t)qname[qname_len-3] << 16 | (uint64_t)qname[qname_len-2] << 8 | (uint64_t)qname[qname_len-1]; } bam1_t* bamrec=bam_dup1(b); map_pq.push(make_pair(qid, bamrec)); } while (map_pq.size() > 1000000 || (!new_record && map_pq.size() > 0)) { uint64_t rid=map_pq.top().first; char primer_tag = 0, first_color = 0, first_qual = 0; if (color) { primer_tag = (char)((rid >> 16) & 0xff); first_color = (char)((rid >> 8) & 0xff); first_qual = (char)(rid & 0xff); rid >>= 24; } bam1_t* tb=map_pq.top().second; bool unmapped = (tb->core.flag & BAM_FUNMAP) != 0; if (unmapped) { //unmapped read if (umbam!=NULL) { // add a primer tag with the corresponding dummy quality value '!' if (color) { int l_qseq = tb->core.l_qseq + 2; int data_len = tb->data_len + 3; // one for primer tag and first color, and two for two quality values int m_data = data_len; kroundup32(m_data); uint8_t* data = (uint8_t*)calloc(m_data, 1); memset(data, 0, m_data); int copy_len = tb->core.l_qname + tb->core.n_cigar * 4; memcpy(data, tb->data, copy_len); uint8_t* data_seq = data + copy_len; uint8_t* source_seq = bam1_seq(tb); data_seq[0] = bam_nt16_table[(int)primer_tag] << 4 | bam_nt16_table[(int)first_color]; memcpy(data_seq + 1, source_seq, (tb->core.l_qseq + 1) >> 1); uint8_t* data_qual = data_seq + ((l_qseq + 1) >> 1); data_qual[0] = '!' - 33; data_qual[1] = first_qual - 33; memcpy(data_qual + 2, bam1_qual(tb), tb->core.l_qseq); uint8_t* data_aux = data_qual + l_qseq; memcpy(data_aux, bam1_aux(tb), data_len - (data_aux - data)); free(tb->data); tb->core.l_qseq = l_qseq; tb->data = data; tb->data_len = data_len; tb->m_data = m_data; } umbam->write(tb, rid); } bam_destroy1(tb); map_pq.pop(); } else { //mapped read vector > read_hits; //all mappings of a read are dealt with here //if (!unmapped) { // mapped read //collect all hits for this read read_hits.push_back(map_pq.top()); unsigned int mcount=0; //number of "good" scoring multi-mappings int tbscore=0; //best mapping score for this read (first alignment reported) uint8_t* tbq=bam1_qual(read_hits[0].second); bool need_quals = (tbq[0] == NOQUALS); if (bowtie2) { uint8_t* ptr = bam_aux_get(tb, "AS"); if (ptr) { tbscore=bam_aux2i(ptr); if (tbscore>=bowtie2_min_score) { ++mcount; } } } //bowtie2 only else mcount++; //for bowtie 1 count every mapping map_pq.pop(); while (map_pq.size()>0 && map_pq.top().first==rid) { //read_hits.push_back(map_pq.top()); //no, we'll keep only "acceptable" mappings if (need_quals) { uint8_t* mq=bam1_qual(map_pq.top().second); if (mq[0]!=NOQUALS) { copy_quals(*(map_pq.top().second), *(read_hits[0].second)); need_quals=false; } } if (bowtie2) { uint8_t* ptr = bam_aux_get(map_pq.top().second, "AS"); if (ptr) { int score=bam_aux2i(ptr); if (score>=bowtie2_min_score && score>=tbscore-2) { ++mcount; } } } else mcount++; read_hits.push_back(map_pq.top()); map_pq.pop(); } //for each alignment of the same read int32_t num_hits=read_hits.size(); //this will only count "acceptable" mappings if (wmulti && mcount>max_multihits) { //just filtering out multi-mapped hits if requested (pre-filtering feature) if (num_hits>1) { bam_aux_append(tb, "NH", 'i', 4, (uint8_t*)&num_hits); } wmulti->write(tb); } else { //we're NOT filtering multi-mapped hits // In case of Bowtie2, some of the mapped reads against either transcriptome or genome // may have low alignment scores due to gaps, in which case we will remap those. 
// Later, we may have better alignments that usually involve splice junctions. if (bowtie2 && tbscore<=bowtie2_min_score) { //poor mapping, we want to map this read later in the pipeline //unmapped = true; if (umbam!=NULL) { umbam->write(tb); } } //-- keep all "acceptable" mappings for this read: for (vector >::size_type i=0;i& v = read_hits[i]; v.second->core.flag &= ~BAM_FSECONDARY; if (i>0) { uint8_t* mq=bam1_qual(v.second); if (mq[0]==NOQUALS) copy_quals(*(read_hits[0].second), *v.second); } if (num_hits>1) bam_aux_append(v.second, "NH", 'i', 4, (uint8_t*)&num_hits); bam_writer.write(v.second, v.first); } //for each mapping of this read } //free the read hits for (vector >::size_type i=0;i& v = read_hits[i]; bam_destroy1(v.second); } } // mapped reads } } while (map_pq.size() > 0); //while SAM records bam_destroy1(b); bam_header_destroy(header); if (wmulti) delete wmulti; } void driver_headerless(FILE* map_file, FILE* f_out) { char bwt_buf[4096]; priority_queue< pair, vector >, MapOrdering > map_pq; while (fgets(bwt_buf, sizeof(bwt_buf), map_file)) { // Chomp the newline char* nl = strrchr(bwt_buf, '\n'); if (nl) *nl = 0; if (*bwt_buf == 0) continue; TabSplitLine* l=new TabSplitLine(bwt_buf); if (l->tcount<10 || l->t[0][0]=='@') { delete l; continue; } //char* hitline = strdup(bwt_buf); uint64_t qid = (uint64_t)atol(l->t[0]); map_pq.push(make_pair(qid, l)); if (map_pq.size() > 1000000) { const pair& t = map_pq.top(); writeSamLine(*t.second, f_out); delete t.second; map_pq.pop(); } } while (map_pq.size()) { const pair& t = map_pq.top(); writeSamLine(*t.second, f_out); delete t.second; map_pq.pop(); } } void print_usage() { // fprintf(stderr, "Usage: fix_map_ordering []\n"); fprintf(stderr, "Usage: \nfix_map_ordering [--sam-header ] []\n"); } int main(int argc, char** argv) { int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; //if --sam_header option was given write BAM and expects (headerless) SAM input; // else simply lets SAM lines through if(optind >= argc) { print_usage(); return 1; } string map_file_name = argv[optind++]; string out_file_name("-"); string out_unmapped_fname; if (optind #endif #include #include #include #include #include "bwt_map.h" #include "common.h" #include "inserts.h" using namespace std; bool InsertAlignmentGrade::operator<(const InsertAlignmentGrade& rhs) { // penalty for discordant mapping is reflected in the alignment score. #if 0 if (fusion && !rhs.fusion) return true; if (!fusion && rhs.fusion) return false; #endif // We always prefer a insert alignment with both ends mapped than a // singleton if (num_mapped != rhs.num_mapped) { return num_mapped < rhs.num_mapped; } else if (num_mapped == 2) { // daehwan - I'm testing this! 
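/*
 * Illustrative sketch (not from the original TopHat sources): it condenses the
 * ordering this operator< implements when Bowtie2 alignment scores are in
 * play -- a grade compares as "less" (worse) if it has fewer mapped ends, and
 * among grades with both ends mapped the lower alignment score loses. The
 * singleton tie-breaking on spliced status, intron length and edit distance is
 * deliberately omitted here; MiniGrade is a hypothetical stand-in type.
 */
#if 0
struct MiniGrade {
  int num_mapped;      // 0, 1 or 2 ends aligned
  int alignment_score; // higher is better (Bowtie2 AS tag)
};

bool worse_than(const MiniGrade& lhs, const MiniGrade& rhs)
{
  if (lhs.num_mapped != rhs.num_mapped)
    return lhs.num_mapped < rhs.num_mapped;           // fewer mapped ends is worse
  if (lhs.num_mapped == 2 && lhs.alignment_score != rhs.alignment_score)
    return lhs.alignment_score < rhs.alignment_score; // lower score is worse
  return false;                                       // otherwise treated as equivalent here
}
#endif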
#if 1 if (alignment_score != rhs.alignment_score) return alignment_score < rhs.alignment_score; return false; #else // if significant difference in their inner mate distances if (abs(rhs.inner_dist - inner_dist) >= 30) { // Prefer a pair that is too close or perfect to one that is too far if (too_far && !rhs.too_far) return true; // Prefer a pair that is perfect to one that is too close if (too_close && !(rhs.too_close || rhs.too_far)) return true; // Prefer closer mates if (rhs.inner_dist < inner_dist) return true; } if (edit_dist != rhs.edit_dist) return rhs.edit_dist < edit_dist; // daehwan - do somethings here if (!bowtie2) { // Prefer shorter introns if (longest_ref_skip != rhs.longest_ref_skip) return rhs.longest_ref_skip < longest_ref_skip; } #endif return false; } else { // We prefer a singleton mapping to an insert with neither end mapped if (num_mapped != rhs.num_mapped) { return num_mapped < rhs.num_mapped; // if RHS has MORE READS, RHS is BETTER (lhs < rhs) } else { if (rhs.num_spliced != num_spliced) return rhs.num_spliced < num_spliced;// if RHS is LESS SPLICED, RHS is BETTER (lhs < rhs) // Prefer shorter introns if (longest_ref_skip != rhs.longest_ref_skip) return rhs.longest_ref_skip < longest_ref_skip; // if RHS intron is SHORTER, RHS is BETTER (lhs < rhs) return rhs.edit_dist < edit_dist; // if RHS edit is LOWER, RHS is BETTER (lhs < rhs) } } return false; } bool gap_lt(const pair& lhs, const pair& rhs) { return abs(lhs.second - lhs.first) < abs(rhs.second - rhs.first); } pair pair_distances(const BowtieHit& h1, const BowtieHit& h2) { if (h1.left() <= h2.left() && h1.right() >= h2.right()) return make_pair(h1.right() - h1.left(), 0); else if (h2.left() <= h1.left() && h2.right() >= h1.right()) return make_pair(h2.right() - h2.left(), 0); int minor_hit_start, major_hit_start; int minor_hit_end, major_hit_end; if (h1.left() < h2.left()) { minor_hit_start = (int)h1.left(); minor_hit_end = (int)h1.right(); major_hit_start = (int)h2.left(); major_hit_end = (int)h2.right(); } else { minor_hit_start = (int)h2.left(); minor_hit_end = (int)h2.right(); major_hit_start = (int)h1.left(); major_hit_end = (int)h1.right(); } int inner_dist = major_hit_start - minor_hit_end; int outer_dist = major_hit_end - minor_hit_start; return make_pair(outer_dist, inner_dist); } void best_insert_mappings(uint64_t refid, ReadTable& it, HitList& hits1_in_ref, HitList& hits2_in_ref, BestInsertAlignmentTable& best_status_for_inserts, bool prefer_shorter_pairs) { long chucked_for_shorter_pair = 0; std::set marked; HitList::iterator last_good = hits2_in_ref.begin(); for (size_t i = 0; i < hits1_in_ref.size(); ++i) { BowtieHit& h1 = hits1_in_ref[i]; pair range_pair; range_pair = equal_range(last_good, hits2_in_ref.end(), h1, hit_insert_id_lt); bool found_hit = false; if (range_pair.first != range_pair.second) last_good = range_pair.first; uint32_t obs_order = it.observation_order(h1.insert_id()); for (HitList::iterator f = range_pair.first; f != range_pair.second; ++f) { BowtieHit& h2 = *f; if (h1.insert_id() == h2.insert_id()) { JunctionSet junctions; InsertAlignmentGrade s(h1, h2, junctions); pair >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; // Is the new status better than the current best one? 
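/*
 * Illustrative sketch (not from the original TopHat sources): the
 * prefer_shorter_pairs branch a few lines below compares the inner distances
 * returned by pair_distances() above. This stand-alone restatement, using a
 * hypothetical Interval type, shows that computation with worked numbers: for
 * mates at [100,150] and [300,350] the inner distance is 300 - 150 = 150 and
 * the outer distance is 350 - 100 = 250, while a mate fully contained in the
 * other yields an inner distance of 0.
 */
#if 0
#include <utility>

struct Interval { int left; int right; }; // hypothetical stand-in for a BowtieHit

std::pair<int, int> outer_inner_distance(const Interval& a, const Interval& b)
{
  // One mate entirely contains the other: outer = span of the larger, inner = 0.
  if (a.left <= b.left && a.right >= b.right) return std::make_pair(a.right - a.left, 0);
  if (b.left <= a.left && b.right >= a.right) return std::make_pair(b.right - b.left, 0);
  const Interval& minor = (a.left < b.left) ? a : b;
  const Interval& major = (a.left < b.left) ? b : a;
  int inner = major.left - minor.right; // gap between the facing ends of the mates
  int outer = major.right - minor.left; // full fragment span
  return std::make_pair(outer, inner);
}
#endif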
if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } else if (!(s < current)) { if (prefer_shorter_pairs && current.num_mapped == 2) { pair dc = pair_distances(*(insert_best.second[0].left_alignment), *(insert_best.second[0].right_alignment)); pair ds = pair_distances(h1,h2); if (ds.second < dc.second) { chucked_for_shorter_pair += insert_best.second.size(); insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } } else { insert_best.second.push_back(InsertAlignment(refid, &h1, &h2)); } } marked.insert(f - hits2_in_ref.begin()); found_hit = true; } } if (!found_hit) { pair >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; InsertAlignmentGrade s(h1); if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, &h1, NULL)); } else if (! (s < current)) { insert_best.second.push_back(InsertAlignment(refid, &h1, NULL)); } } } for (size_t i = 0; i < hits2_in_ref.size(); ++i) { BowtieHit& h2 = hits2_in_ref[i]; uint32_t obs_order = it.observation_order(h2.insert_id()); pair >& insert_best = best_status_for_inserts[obs_order]; InsertAlignmentGrade& current = insert_best.first; InsertAlignmentGrade s(h2); // Did we include h2 as part of a pairing already, or is this first time // we've seen it? If so, it's a singleton. if (marked.find(i) == marked.end()) { if (current < s) { insert_best.second.clear(); current = s; insert_best.second.push_back(InsertAlignment(refid, NULL, &h2)); } else if (! (s < current)) { insert_best.second.push_back(InsertAlignment(refid, NULL, &h2)); } } } fprintf(stderr, "Chucked %ld pairs for shorter pairing of same mates\n", chucked_for_shorter_pair); } int long_spliced = 0; int short_spliced = 0; int singleton_splices = 0; bool valid_insert_alignment(const InsertAlignmentGrade& g, const InsertAlignment& a) { if (!a.left_alignment || !a.right_alignment) return false; if (g.num_mapped == 2) { // Take all the contiguously mapped pairs if (g.num_spliced == 0) return true; // Take the pairs that include one or more spliced reads as long as // the inner dist isn't too big // if (g.one_spliced || g.both_spliced) // { // if (g.too_far || g.too_close) // return false; // } return true; } return false; } void insert_best_pairings(RefSequenceTable& rt, ReadTable& it, HitTable& hits1, HitTable& hits2, BestInsertAlignmentTable& best_pairings, bool prefer_shorter_pairs) { for(RefSequenceTable::const_iterator ci = rt.begin(); ci != rt.end(); ++ci) { // Tracks the number of singleton ALIGNMENTS, not the number of singleton // READS in each Bowtie map. vector map1_singletons; vector map2_singletons; vector > happy_mates; uint64_t ref_id = ci->second.observation_order; HitList* hits1_in_ref = hits1.get_hits(ref_id); HitList* hits2_in_ref = hits2.get_hits(ref_id); if (!hits1_in_ref || !hits2_in_ref) continue; //if (verbose) // fprintf(stderr, "Looking for best insert mappings in %s\n", name.c_str()); best_insert_mappings(ref_id, it, *hits1_in_ref, *hits2_in_ref, best_pairings, prefer_shorter_pairs); } } tophat-2.0.9/src/common.cpp0000644000175000017500000012122412122334363014335 0ustar toortoor/* * common.cpp * TopHat * * Created by Cole Trapnell on 11/26/08. * Copyright 2008 Cole Trapnell. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include using namespace std; #include "common.h" #include "tokenize.h" #ifdef MEM_DEBUG //function for debugging memory usage of current program in Linux ////////////////////////////////////////////////////////////////////////////// // process_mem_usage(double &, double &) - takes two doubles by reference, // attempts to read the system-dependent data for a process' virtual memory // size and resident set size, and return the results in KB. // // On failure, returns 0.0, 0.0 void process_mem_usage(double& vm_usage, double& resident_set) { using std::ios_base; using std::ifstream; using std::string; vm_usage = 0.0; resident_set = 0.0; // 'file' stat seems to give the most reliable results ifstream stat_stream("/proc/self/stat",ios_base::in); // dummy vars for leading entries in stat that we don't care about string pid, comm, state, ppid, pgrp, session, tty_nr; string tpgid, flags, minflt, cminflt, majflt, cmajflt; string utime, stime, cutime, cstime, priority, nice; string O, itrealvalue, starttime; // the two fields we want // unsigned long vsize; long rss; stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt >> utime >> stime >> cutime >> cstime >> priority >> nice >> O >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest stat_stream.close(); long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages vm_usage = vsize / 1024.0; resident_set = rss * page_size_kb; } void print_mem_usage() { double vs, rs; process_mem_usage(vs,rs); vs/=1024; rs/=1024; fprintf(stderr, "VMSize: %6.1fMB\tRSize: %6.1fMB\n", vs, rs); } #endif bool bowtie2 = true; int bowtie2_min_score = -10; //FIXME: experimental score threshold (activated by the hidden -W option) //for "soft" filtering in fix_map_ordering //should be removed if a bowtie2 min-score function is used instead int bowtie2_scoreflt = 0; int bowtie2_max_penalty = 6; int bowtie2_min_penalty = 2; int bowtie2_penalty_for_N = 1; int bowtie2_read_gap_open = 5; int bowtie2_read_gap_cont = 3; int bowtie2_ref_gap_open = 5; int bowtie2_ref_gap_cont = 3; // daehwan - temporary bool parallel = true; unsigned int max_insertion_length = 3; unsigned int max_deletion_length = 3; int inner_dist_mean = 200; int inner_dist_std_dev = 20; int max_mate_inner_dist = -1; int min_anchor_len = 8; int min_report_intron_length = 50; int max_report_intron_length = 500000; int min_closure_intron_length = 50; int max_closure_intron_length = 5000; int min_coverage_intron_length = 50; int max_coverage_intron_length = 20000; int min_segment_intron_length = 50; int max_segment_intron_length = 500000; uint32_t min_closure_exon_length = 100; int island_extension = 25; int segment_length = 25; int segment_mismatches = 2; int read_mismatches = 2; int read_gap_length = 2; int read_edit_dist = 2; int read_realign_edit_dist = 2; int max_splice_mismatches = 1; ReadFormat reads_format = FASTQ; bool verbose = false; unsigned int max_multihits = 20; bool suppress_hits = false; unsigned int max_seg_multihits = 40; bool no_closure_search = false; bool no_coverage_search = false; bool no_microexon_search = false; bool butterfly_search = false; int num_threads = 1; float min_isoform_fraction = 0.15f; string output_dir = "tophat_out"; string std_outfile = ""; string aux_outfile = ""; //auxiliary output file name (e.g. 
prep_reads read stats) string index_outfile = ""; string gene_filter = ""; string gff_file = ""; string ium_reads = ""; string sam_header = ""; string sam_readgroup_id = ""; string zpacker = ""; string samtools_path = "samtools"; bool solexa_quals = false; bool phred64_quals = false; bool quals = false; bool integer_quals = false; bool color = false; string gtf_juncs = ""; bool report_secondary_alignments = false; bool report_discordant_pair_alignments = false; bool report_mixed_alignments = false; string flt_reads = ""; string flt_mappings = ""; int flt_side = 2; bool fusion_search = false; size_t fusion_anchor_length = 20; size_t fusion_min_dist = 10000000; size_t fusion_read_mismatches = 2; size_t fusion_multireads = 2; size_t fusion_multipairs = 2; std::vector fusion_ignore_chromosomes; bool fusion_do_not_resolve_conflicts = false; eLIBRARY_TYPE library_type = LIBRARY_TYPE_NONE; extern void print_usage(); /** * Parse an int out of optarg and enforce that it be at least 'lower'; * if it is less than 'lower', than output the given error message and * exit with an error and a usage message. */ int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()) { long l; char *endPtr= NULL; l = strtol(optarg, &endPtr, 10); if (endPtr != NULL) { if (l < lower) { cerr << errmsg << endl; print_usage(); exit(1); } return (int32_t)l; } cerr << errmsg << endl; print_usage(); exit(1); return -1; } /** * Parse an int out of optarg and enforce that it be at least 'lower'; * if it is less than 'lower', than output the given error message and * exit with an error and a usage message. */ static float parseFloatOpt(float lower, float upper, const char *errmsg, void (*print_usage)()) { float l; l = (float)atof(optarg); if (l < lower) { cerr << errmsg << endl; print_usage(); exit(1); } if (l > upper) { cerr << errmsg << endl; print_usage(); exit(1); } return l; cerr << errmsg << endl; print_usage(); exit(1); return -1; } /* this is from http://www.winehq.org/pipermail/wine-patches/2001-November/001322.html */ char* get_token(char** str, const char* delims) { char* token; if (*str == NULL) return NULL; token = *str; while (**str != '\0') { if (strchr(delims, **str) != NULL) { **str = '\0'; ++(*str); return token; } ++(*str); } *str = NULL; return token; } const char *short_options = "QCp:z:N:w:W:"; enum { OPT_FASTA = 127, OPT_FASTQ, OPT_MIN_ANCHOR, OPT_SPLICE_MISMATCHES, OPT_VERBOSE, OPT_INSERT_LENGTH_MEAN, OPT_INSERT_LENGTH_STD_DEV, OPT_MIN_ISOFORM_FRACTION, OPT_OUTPUT_DIR, OPT_GENE_FILTER, OPT_GFF_ANNOTATIONS, OPT_MAX_MULTIHITS, OPT_SUPPRESS_HITS, OPT_MAX_SEG_MULTIHITS, OPT_NO_CLOSURE_SEARCH, OPT_NO_COVERAGE_SEARCH, OPT_NO_MICROEXON_SEARCH, OPT_SEGMENT_LENGTH, OPT_READ_MISMATCHES, OPT_READ_GAP_LENGTH, OPT_READ_EDIT_DIST, OPT_READ_REALIGN_EDIT_DIST, OPT_SEGMENT_MISMATCHES, OPT_MIN_CLOSURE_EXON, OPT_MAX_CLOSURE_INTRON, OPT_MIN_CLOSURE_INTRON, OPT_MAX_COVERAGE_INTRON, OPT_MIN_COVERAGE_INTRON, OPT_MIN_SEGMENT_INTRON, OPT_MAX_SEGMENT_INTRON, OPT_MIN_REPORT_INTRON, OPT_MAX_REPORT_INTRON, OPT_IUM_READS, OPT_BUTTERFLY_SEARCH, OPT_SOLEXA_QUALS, OPT_PHRED64_QUALS, OPT_SAM_HEADER, OPT_SAM_READGROUP_ID, OPT_QUALS, OPT_INTEGER_QUALS, OPT_COLOR, OPT_LIBRARY_TYPE, OPT_MAX_DELETION_LENGTH, OPT_MAX_INSERTION_LENGTH, OPT_NUM_THREADS, OPT_ZPACKER, OPT_SAMTOOLS, OPT_AUX_OUT, OPT_STD_OUT, OPT_INDEX_OUT, OPT_GTF_JUNCS, OPT_FILTER_READS, OPT_FILTER_HITS, OPT_FILTER_SIDE, OPT_REPORT_SECONDARY_ALIGNMENTS, OPT_REPORT_DISCORDANT_PAIR_ALIGNMENTS, OPT_REPORT_MIXED_ALIGNMENTS, OPT_FUSION_SEARCH, OPT_FUSION_ANCHOR_LENGTH, 
OPT_FUSION_MIN_DIST, OPT_FUSION_READ_MISMATCHES, OPT_FUSION_MULTIREADS, OPT_FUSION_MULTIPAIRS, OPT_FUSION_IGNORE_CHROMOSOMES, OPT_FUSION_DO_NOT_RESOLVE_CONFLICTS, OPT_BOWTIE1, OPT_BOWTIE2_MIN_SCORE, OPT_BOWTIE2_MAX_PENALTY, OPT_BOWTIE2_MIN_PENALTY, OPT_BOWTIE2_PENALTY_FOR_N, OPT_BOWTIE2_READ_GAP_OPEN, OPT_BOWTIE2_READ_GAP_CONT, OPT_BOWTIE2_REF_GAP_OPEN, OPT_BOWTIE2_REF_GAP_CONT, OPT_BOWTIE2_SCOREFLT }; static struct option long_options[] = { {"fasta", no_argument, 0, OPT_FASTA}, {"fastq", no_argument, 0, OPT_FASTQ}, {"min-anchor", required_argument, 0, OPT_MIN_ANCHOR}, {"sam-header", required_argument, 0, OPT_SAM_HEADER}, {"rg-id", required_argument, 0, OPT_SAM_READGROUP_ID}, {"splice-mismatches", required_argument, 0, OPT_SPLICE_MISMATCHES}, {"verbose", no_argument, 0, OPT_VERBOSE}, {"inner-dist-mean", required_argument, 0, OPT_INSERT_LENGTH_MEAN}, {"inner-dist-std-dev", required_argument, 0, OPT_INSERT_LENGTH_STD_DEV}, {"output-dir", required_argument, 0, OPT_OUTPUT_DIR}, {"gene-filter", required_argument, 0, OPT_GENE_FILTER}, {"gtf-annotations", required_argument, 0, OPT_GFF_ANNOTATIONS}, {"max-multihits", required_argument, 0, OPT_MAX_MULTIHITS}, {"suppress-hits", no_argument, 0, OPT_SUPPRESS_HITS}, {"max-seg-multihits", required_argument, 0, OPT_MAX_SEG_MULTIHITS}, {"no-closure-search", no_argument, 0, OPT_NO_CLOSURE_SEARCH}, {"no-coverage-search", no_argument, 0, OPT_NO_COVERAGE_SEARCH}, {"no-microexon-search", no_argument, 0, OPT_NO_MICROEXON_SEARCH}, {"segment-length", required_argument, 0, OPT_SEGMENT_LENGTH}, {"segment-mismatches", required_argument, 0, OPT_SEGMENT_MISMATCHES}, {"read-mismatches", required_argument, 0, OPT_READ_MISMATCHES}, {"read-gap-length", required_argument, 0, OPT_READ_GAP_LENGTH}, {"read-edit-dist", required_argument, 0, OPT_READ_EDIT_DIST}, {"read-realign-edit-dist", required_argument, 0, OPT_READ_REALIGN_EDIT_DIST}, {"min-closure-exon", required_argument, 0, OPT_MIN_CLOSURE_EXON}, {"min-closure-intron", required_argument, 0, OPT_MIN_CLOSURE_INTRON}, {"max-closure-intron", required_argument, 0, OPT_MAX_CLOSURE_INTRON}, {"min-coverage-intron", required_argument, 0, OPT_MIN_COVERAGE_INTRON}, {"max-coverage-intron", required_argument, 0, OPT_MAX_COVERAGE_INTRON}, {"min-segment-intron", required_argument, 0, OPT_MIN_SEGMENT_INTRON}, {"max-segment-intron", required_argument, 0, OPT_MAX_SEGMENT_INTRON}, {"min-report-intron", required_argument, 0, OPT_MIN_REPORT_INTRON}, {"max-report-intron", required_argument, 0, OPT_MAX_REPORT_INTRON}, {"min-isoform-fraction",required_argument, 0, OPT_MIN_ISOFORM_FRACTION}, {"ium-reads", required_argument, 0, OPT_IUM_READS}, {"butterfly-search", no_argument, 0, OPT_BUTTERFLY_SEARCH}, {"solexa-quals", no_argument, 0, OPT_SOLEXA_QUALS}, {"phred64-quals", no_argument, 0, OPT_PHRED64_QUALS}, {"quals", no_argument, 0, OPT_QUALS}, {"integer-quals", no_argument, 0, OPT_INTEGER_QUALS}, {"color", no_argument, 0, OPT_COLOR}, {"library-type", required_argument, 0, OPT_LIBRARY_TYPE}, {"max-deletion-length", required_argument, 0, OPT_MAX_DELETION_LENGTH}, {"max-insertion-length", required_argument, 0, OPT_MAX_INSERTION_LENGTH}, {"num-threads", required_argument, 0, OPT_NUM_THREADS}, {"zpacker", required_argument, 0, OPT_ZPACKER}, {"samtools", required_argument, 0, OPT_SAMTOOLS}, {"aux-outfile", required_argument, 0, OPT_AUX_OUT}, {"outfile", required_argument, 0, OPT_STD_OUT}, {"index-outfile", required_argument, 0, OPT_INDEX_OUT}, {"gtf-juncs", required_argument, 0, OPT_GTF_JUNCS}, {"flt-reads",required_argument, 0, OPT_FILTER_READS}, 
{"flt-hits",required_argument, 0, OPT_FILTER_HITS}, {"flt-side",required_argument, 0, OPT_FILTER_SIDE}, {"report-secondary-alignments", no_argument, 0, OPT_REPORT_SECONDARY_ALIGNMENTS}, {"report-discordant-pair-alignments", no_argument, 0, OPT_REPORT_DISCORDANT_PAIR_ALIGNMENTS}, {"report-mixed-alignments", no_argument, 0, OPT_REPORT_MIXED_ALIGNMENTS}, {"fusion-search", no_argument, 0, OPT_FUSION_SEARCH}, {"fusion-anchor-length", required_argument, 0, OPT_FUSION_ANCHOR_LENGTH}, {"fusion-min-dist", required_argument, 0, OPT_FUSION_MIN_DIST}, {"fusion-read-mismatches", required_argument, 0, OPT_FUSION_READ_MISMATCHES}, {"fusion-multireads", required_argument, 0, OPT_FUSION_MULTIREADS}, {"fusion-multipairs", required_argument, 0, OPT_FUSION_MULTIPAIRS}, {"fusion-ignore-chromosomes", required_argument, 0, OPT_FUSION_IGNORE_CHROMOSOMES}, {"fusion-do-not-resolve-conflicts", no_argument, 0, OPT_FUSION_DO_NOT_RESOLVE_CONFLICTS}, {"bowtie1", no_argument, 0, OPT_BOWTIE1}, {"bowtie2-min-score", required_argument, 0, OPT_BOWTIE2_MIN_SCORE}, {"bowtie2-max-penalty", required_argument, 0, OPT_BOWTIE2_MAX_PENALTY}, {"bowtie2-min-penalty", required_argument, 0, OPT_BOWTIE2_MIN_PENALTY}, {"bowtie2-penalty-for-N", required_argument, 0, OPT_BOWTIE2_PENALTY_FOR_N}, {"bowtie2-read-gap-open", required_argument, 0, OPT_BOWTIE2_READ_GAP_OPEN}, {"bowtie2-read-gap-cont", required_argument, 0, OPT_BOWTIE2_READ_GAP_CONT}, {"bowtie2-ref-gap-open", required_argument, 0, OPT_BOWTIE2_REF_GAP_OPEN}, {"bowtie2-ref-gap-cont", required_argument, 0, OPT_BOWTIE2_REF_GAP_CONT}, {0, 0, 0, 0} // terminator }; string str_replace(const string& base_str, const string& oldStr, const string& newStr) { size_t pos = 0; string str(base_str); while((pos = str.find(oldStr, pos)) != string::npos) { str.replace(pos, oldStr.length(), newStr); pos += newStr.length(); } return str; } void str_appendInt(string& str, int64_t v) { char int_str[32] = {0}; sprintf(int_str, "%ld", v); str += int_str; } void str_appendUInt(string& str, uint64_t v) { char uint_str[32] = {0}; sprintf(uint_str, "%lu", v); str += uint_str; } bool str_endsWith(string& str, const char* suffix) { if (str.empty() || str.length()<3) return false; size_t l=strlen(suffix); if (str.length()<=l) return false; if (str.rfind(suffix, str.length()-l-1)!=string::npos) return true; return false; } int parse_options(int argc, char** argv, void (*print_usage)()) { int option_index = 0; int next_option; do { next_option = getopt_long(argc, argv, short_options, long_options, &option_index); switch (next_option) { case -1: break; case OPT_FASTA: reads_format = FASTA; break; case OPT_FASTQ: reads_format = FASTQ; break; case OPT_MIN_ANCHOR: min_anchor_len = (uint32_t)parseIntOpt(3, "--min-anchor arg must be at least 3", print_usage); break; case OPT_SPLICE_MISMATCHES: max_splice_mismatches = parseIntOpt(0, "--splice-mismatches arg must be at least 0", print_usage); break; case OPT_VERBOSE: verbose = true; break; case OPT_INSERT_LENGTH_MEAN: inner_dist_mean = parseIntOpt(-1024, "--inner-dist-mean arg must be at least -1024", print_usage); break; case OPT_INSERT_LENGTH_STD_DEV: inner_dist_std_dev = parseIntOpt(0, "--inner-dist-std-dev arg must be at least 0", print_usage); break; case OPT_OUTPUT_DIR: output_dir = optarg; break; case OPT_GENE_FILTER: gene_filter = optarg; break; case OPT_GFF_ANNOTATIONS: gff_file = optarg; break; case OPT_MAX_MULTIHITS: max_multihits = parseIntOpt(1, "--max-multihits arg must be at least 1", print_usage); break; case OPT_SUPPRESS_HITS: suppress_hits = true; break; 
case OPT_MAX_SEG_MULTIHITS: max_seg_multihits = parseIntOpt(1, "--max-seg-multihits arg must be at least 1", print_usage); break; case OPT_NO_CLOSURE_SEARCH: no_closure_search = true; break; case OPT_NO_COVERAGE_SEARCH: no_coverage_search = true; break; case OPT_NO_MICROEXON_SEARCH: no_microexon_search = true; break; case OPT_SEGMENT_LENGTH: segment_length = parseIntOpt(4, "--segment-length arg must be at least 4", print_usage); break; case OPT_SEGMENT_MISMATCHES: segment_mismatches = parseIntOpt(0, "--segment-mismatches arg must be at least 0", print_usage); break; case 'N': case OPT_READ_MISMATCHES: read_mismatches = parseIntOpt(0, "--read-mismatches arg must be at least 0", print_usage); break; case OPT_READ_GAP_LENGTH: read_gap_length = parseIntOpt(0, "--read-gap-length arg must be at least 0", print_usage); break; case OPT_READ_EDIT_DIST: read_edit_dist = parseIntOpt(0, "--read-edit-dist arg must be at least 0", print_usage); break; case OPT_READ_REALIGN_EDIT_DIST: read_realign_edit_dist = parseIntOpt(0, "--read-realign-edit-dist arg must be at least 0", print_usage); break; case OPT_MIN_CLOSURE_EXON: min_closure_exon_length = parseIntOpt(1, "--min-closure-exon arg must be at least 1", print_usage); break; case OPT_MIN_CLOSURE_INTRON: min_closure_intron_length = parseIntOpt(1, "--min-closure-intron arg must be at least 1", print_usage); break; case OPT_MAX_CLOSURE_INTRON: max_closure_intron_length = parseIntOpt(1, "--max-closure-intron arg must be at least 1", print_usage); break; case OPT_MIN_COVERAGE_INTRON: min_coverage_intron_length = parseIntOpt(1, "--min-coverage-intron arg must be at least 1", print_usage); break; case OPT_MAX_COVERAGE_INTRON: max_coverage_intron_length = parseIntOpt(1, "--max-coverage-intron arg must be at least 1", print_usage); break; case OPT_MIN_SEGMENT_INTRON: min_segment_intron_length = parseIntOpt(1, "--min-segment-intron arg must be at least 1", print_usage); break; case OPT_MAX_SEGMENT_INTRON: max_segment_intron_length = parseIntOpt(1, "--max-segment-intron arg must be at least 1", print_usage); break; case OPT_MIN_REPORT_INTRON: min_report_intron_length = parseIntOpt(1, "--min-report-intron arg must be at least 1", print_usage); break; case OPT_MAX_REPORT_INTRON: max_report_intron_length = parseIntOpt(1, "--max-report-intron arg must be at least 1", print_usage); break; case OPT_MIN_ISOFORM_FRACTION: min_isoform_fraction = parseFloatOpt(0.0f, 1.0f, "--min-isoform-fraction arg must be [0.0,1.0]", print_usage); break; case OPT_IUM_READS: ium_reads = optarg; break; case OPT_SAM_HEADER: sam_header = optarg; break; case OPT_SAM_READGROUP_ID: sam_readgroup_id = optarg; break; case OPT_BUTTERFLY_SEARCH: butterfly_search = true; break; case OPT_SOLEXA_QUALS: solexa_quals = true; break; case OPT_PHRED64_QUALS: phred64_quals = true; break; case 'Q': case OPT_QUALS: quals = true; break; case OPT_INTEGER_QUALS: integer_quals = true; break; case 'C': case OPT_COLOR: color = true; break; case OPT_LIBRARY_TYPE: if (strcmp(optarg, "fr-unstranded") == 0) library_type = FR_UNSTRANDED; else if (strcmp(optarg, "fr-firststrand") == 0) library_type = FR_FIRSTSTRAND; else if (strcmp(optarg, "fr-secondstrand") == 0) library_type = FR_SECONDSTRAND; else if (strcmp(optarg, "ff-unstranded") == 0) library_type = FF_UNSTRANDED; else if (strcmp(optarg, "ff-firststrand") == 0) library_type = FF_FIRSTSTRAND; else if (strcmp(optarg, "ff-secondstrand") == 0) library_type = FF_SECONDSTRAND; break; case OPT_MAX_DELETION_LENGTH: max_deletion_length = parseIntOpt(0, 
"--max-deletion-length must be at least 0", print_usage); break; case OPT_MAX_INSERTION_LENGTH: max_insertion_length = parseIntOpt(0, "--max-insertion-length must be at least 0", print_usage); break; case 'z': case OPT_ZPACKER: zpacker = optarg; break; case OPT_SAMTOOLS: samtools_path = optarg; break; case OPT_AUX_OUT: aux_outfile = optarg; break; case 'w': case OPT_STD_OUT: std_outfile = optarg; break; case OPT_INDEX_OUT: index_outfile = optarg; break; case 'p': case OPT_NUM_THREADS: num_threads=parseIntOpt(1,"-p/--num-threads must be at least 1",print_usage); break; case OPT_GTF_JUNCS: gtf_juncs = optarg; break; case OPT_FILTER_READS: flt_reads = optarg; break; case OPT_FILTER_HITS: flt_mappings = optarg; break; case OPT_FILTER_SIDE: flt_side = (optarg[0]=='0') ? 0 : 1; break; case OPT_REPORT_SECONDARY_ALIGNMENTS: report_secondary_alignments = true; break; case OPT_REPORT_DISCORDANT_PAIR_ALIGNMENTS: report_discordant_pair_alignments = true; break; case OPT_REPORT_MIXED_ALIGNMENTS: report_mixed_alignments = true; break; case OPT_FUSION_SEARCH: fusion_search = true; break; case OPT_FUSION_ANCHOR_LENGTH: fusion_anchor_length = parseIntOpt(10, "--fusion-anchor-length must be at least 10", print_usage); break; case OPT_FUSION_MIN_DIST: fusion_min_dist = parseIntOpt(0, "--fusion-min-dist must be at least 0", print_usage); break; case OPT_FUSION_READ_MISMATCHES: fusion_read_mismatches = parseIntOpt(0, "--fusion-read-mismatches must be at least 0", print_usage); break; case OPT_FUSION_MULTIREADS: fusion_multireads = parseIntOpt(1, "--fusion-multireads must be at least 1", print_usage); break; case OPT_FUSION_MULTIPAIRS: fusion_multipairs = parseIntOpt(1, "--fusion-multipairs must be at least 0", print_usage); break; case OPT_FUSION_IGNORE_CHROMOSOMES: tokenize(optarg, ",", fusion_ignore_chromosomes); break; case OPT_FUSION_DO_NOT_RESOLVE_CONFLICTS: fusion_do_not_resolve_conflicts = true; break; case OPT_BOWTIE1: bowtie2 = false; break; case OPT_BOWTIE2_MIN_SCORE: bowtie2_min_score = -1 * parseIntOpt(0, "--bowtie2-min-score must be at least 0", print_usage); break; case OPT_BOWTIE2_MAX_PENALTY: bowtie2_max_penalty = parseIntOpt(0, "--bowtie2-max-penalty must be at least 0", print_usage); break; case OPT_BOWTIE2_MIN_PENALTY: bowtie2_min_penalty = parseIntOpt(0, "--bowtie2-min-penalty must be at least 0", print_usage); break; case OPT_BOWTIE2_PENALTY_FOR_N: bowtie2_penalty_for_N = parseIntOpt(0, "--bowtie2-penalty-for-N must be at least 0", print_usage); break; case OPT_BOWTIE2_READ_GAP_OPEN: bowtie2_read_gap_open = parseIntOpt(0, "--bowtie2-read-gap-open must be at least 0", print_usage); break; case OPT_BOWTIE2_READ_GAP_CONT: bowtie2_read_gap_cont = parseIntOpt(0, "--bowtie2-read-gap-cont must be at least 0", print_usage); break; case OPT_BOWTIE2_REF_GAP_OPEN: bowtie2_ref_gap_open = parseIntOpt(0, "--bowtie2-ref-gap-open must be at least 0", print_usage); break; case OPT_BOWTIE2_REF_GAP_CONT: bowtie2_ref_gap_cont = parseIntOpt(0, "--bowtie2-ref-gap-cont must be at least 0", print_usage); break; case 'W': case OPT_BOWTIE2_SCOREFLT: bowtie2_scoreflt = -1 * parseIntOpt(1, "-W option must be at least 1", print_usage); break; default: print_usage(); return 1; } } while(next_option != -1); return 0; } // Error routine (prints error message and exits!) 
void err_exit(const char* format,...){ va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #ifdef DEBUG // trigger a core dump for later inspection abort(); #endif exit(1); } FILE* FZPipe::openRead(const char* fname, string& popencmd) { pipecmd=popencmd; filename=fname; if (pipecmd.empty()) { file=fopen(filename.c_str(), "r"); return file; } else { string pcmd(pipecmd); pcmd.append(" '"); pcmd.append(filename); pcmd.append("'"); file=popen(pcmd.c_str(), "r"); } return file; } FILE* FZPipe::openWrite(const char* fname, string& popencmd) { pipecmd=popencmd; filename=fname; if (pipecmd.empty()) { file=fopen(filename.c_str(), "w"); } else { string pcmd(pipecmd); pcmd.append(" - > '"); pcmd.append(filename.c_str()); pcmd.append("'"); file=popen(pcmd.c_str(), "w"); } return file; } FILE* FZPipe::openWrite(const char* fname) { string pcmd; return this->openWrite(fname,pcmd); } void FZPipe::rewind() { if (is_bam && !filename.empty()) { if (bam_file) { samclose(bam_file); bam_file=NULL; } bam_file=samopen(filename.c_str(), "rb", 0); return; } if (pipecmd.empty()) { if (file!=NULL) { ::rewind(file); return; } if (!filename.empty()) { file=fopen(filename.c_str(),"r"); return; } } if (filename.empty()) err_die("Error: FZPipe::rewind() failed (missing filename)!\n"); this->close(); string pcmd(pipecmd); pcmd.append(" '"); pcmd.append(filename); pcmd.append("'"); file=popen(pcmd.c_str(), "r"); if (file==NULL) { err_die("Error: FZPipe::rewind() popen(%s) failed!\n",pcmd.c_str()); } } string getFext(const string& s) { string r(""); //if (xpos!=NULL) *xpos=0; if (s.empty() || s=="-") return r; int slen=(int)s.length(); size_t pos=s.rfind('.'); if (pos==string::npos) return r; int p=(int)pos; int d=s.rfind('/'); if (p<=0 || p>slen-2 || p1) { pipecmd.append(" -p"); str_appendInt(pipecmd,num_threads); } } if (!pipecmd.empty()) pipecmd.append(" -cd"); return pipecmd; } void checkSamHeader() { if (sam_header.empty()) err_die("Error: writeSamHeader() with empty sam_header string\n"); //copy the SAM header FILE* fh=fopen(sam_header.c_str(), "r"); if (fh==NULL) err_die("Error: cannot open SAM header file %s\n",sam_header.c_str()); fclose(fh); } void writeSamHeader(FILE* fout) { if (fout==NULL) err_die("Error: writeSamHeader(NULL)\n"); checkSamHeader(); //copy the SAM header FILE* fh=fopen(sam_header.c_str(), "r"); int ch=-1; while ((ch=fgetc(fh))!=EOF) { if (fputc(ch, fout)==EOF) err_die("Error copying SAM header\n"); } fclose(fh); } //auxiliary functions for BAM record handling uint8_t* realloc_bdata(bam1_t *b, int size) { if (b->m_data < size) { b->m_data = size; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } if (b->data_lendata_len=size; return b->data; } uint8_t* dupalloc_bdata(bam1_t *b, int size) { //same as realloc_bdata, but does not free previous data //but returns it instead //it ALWAYS duplicates data b->m_data = size; kroundup32(b->m_data); uint8_t* odata=b->data; b->data = (uint8_t*)malloc(b->m_data); memcpy((void*)b->data, (void*)odata, b->data_len); b->data_len=size; return odata; //user must FREE this after } extern unsigned short bam_char2flag_table[]; GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) { novel=true; b=bam_init1(); if (pos<=0 || gseq_tid<0) { b->core.pos=-1; //unmapped b->core.flag |= BAM_FUNMAP; gseq_tid=-1; } else b->core.pos=pos-1; //BAM is 0-based b->core.tid=gseq_tid; b->core.mtid=-1; b->core.mpos=-1; 
b->core.qual=255; int l_qseq=strlen(qseq); //this may not be accurate, setting CIGAR is the correct way //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); b->core.l_qname=strlen(qname)+1; //includes the \0 at the end memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); set_cigar(cigar); //this will also set core.bin add_sequence(qseq, l_qseq); add_quals(quals); //quals must be given as Phred33 if (reverse) { b->core.flag |= BAM_FREVERSE; } } GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, int insert_size, const char* qseq, const char* quals, const vector* aux_strings) { novel=true; b=bam_init1(); b->core.tid=g_tid; b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based b->core.qual=map_qual; int l_qseq=strlen(qseq); b->core.l_qname=strlen(qname)+1; //includes the \0 at the end memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); set_cigar(cigar); //this will also set core.bin add_sequence(qseq, l_qseq); add_quals(quals); //quals must be given as Phred33 set_flags(flags); set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); if (aux_strings!=NULL) { for (vector::const_iterator itr=aux_strings->begin(); itr!=aux_strings->end(); ++itr) { add_aux(itr->c_str()); } } } void GBamRecord::set_cigar(const char* cigar) { //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call int doff=b->core.l_qname; uint8_t* after_cigar=NULL; int after_cigar_len=0; uint8_t* prev_bdata=NULL; if (b->data_len>doff) { //cigar string already allocated, replace it int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data after_cigar=b->data+d; after_cigar_len=b->data_len-d; } const char *s; char *t; int i, op; long x; b->core.n_cigar = 0; if (cigar && strcmp(cigar, "*")) { for (s = cigar; *s; ++s) { if (isalpha(*s)) b->core.n_cigar++; else if (!isdigit(*s)) { err_die("Error: invalid CIGAR character (%s)\n",cigar); } } if (after_cigar_len>0) { //replace/insert into existing full data prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); free(prev_bdata); } else { realloc_bdata(b, doff + b->core.n_cigar * 4); } for (i = 0, s = cigar; i != (int)b->core.n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; else if (op == 'S') op = BAM_CSOFT_CLIP; else if (op == 'H') op = BAM_CHARD_CLIP; else if (op == 'P') op = BAM_CPAD; else err_die("Error: invalid CIGAR operation (%s)\n",cigar); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; } if (*s) err_die("Error: unmatched CIGAR operation (%s)\n", cigar); b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); } else {//no CIGAR string given if (!(b->core.flag&BAM_FUNMAP)) { fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); b->core.flag |= BAM_FUNMAP; } b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); } } //set_cigar() void GBamRecord::add_sequence(const char* qseq, int slen) { //must be called AFTER set_cigar (cannot replace existing sequence for now) if (qseq==NULL) return; //should we ever care about this? 
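// [Editorial note, not part of the original TopHat sources] The loop below
// packs the sequence in the standard BAM 4-bit encoding: bam_nt16_table maps
// A->1, C->2, G->4, T->8, N->15, and two bases share one byte with the first
// base in the high nibble. Hypothetical example: "ACGT" occupies
// (4 + 1) / 2 = 2 bytes, 0x12 ('A','C') followed by 0x48 ('G','T'). The
// realloc_bdata() call also reserves l_qseq further bytes so that
// add_quals() can write the quality string right after the packed bases.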
if (slen<0) slen=strlen(qseq); int doff = b->core.l_qname + b->core.n_cigar * 4; if (strcmp(qseq, "*")!=0) { b->core.l_qseq=slen; if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) err_die("Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq); uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; //also allocated quals memory memset(p, 0, (b->core.l_qseq+1)/2); for (int i = 0; i < b->core.l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); } else b->core.l_qseq = 0; } void GBamRecord::add_quals(const char* quals) { //requires core.l_qseq already set //and must be called AFTER add_sequence(), which also allocates the memory for quals uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); if (quals==NULL || strcmp(quals, "*") == 0) { for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; return; } for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; } void GBamRecord::add_aux(const char* str) { //requires: being called AFTER add_quals() // static char tag[2]; // static uint8_t abuf[512]; //requires: being called AFTER add_quals() int strl=strlen(str); //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; //int doff0=doff; if (strl < 6 || str[2] != ':' || str[4] != ':') parse_error("missing colon in auxiliary data"); tag[0] = str[0]; tag[1] = str[1]; uint8_t atype = str[3]; uint8_t* adata=abuf; int alen=0; if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility atype='A'; alen=1; adata=(uint8_t*)&str[5]; } else if (atype == 'I' || atype == 'i') { long long x=(long long)atoll(str + 5); if (x < 0) { if (x >= -127) { atype='c'; abuf[0] = (int8_t)x; alen=1; } else if (x >= -32767) { atype = 's'; *(int16_t*)abuf = (int16_t)x; alen=2; } else { atype='i'; *(int32_t*)abuf = (int32_t)x; alen=4; if (x < -2147483648ll) fprintf(stderr, "Parse warning: integer %lld is out of range.\n", x); } } else { //x >=0 if (x <= 255) { atype = 'C'; abuf[0] = (uint8_t)x; alen=1; } else if (x <= 65535) { atype='S'; *(uint16_t*)abuf = (uint16_t)x; alen=2; } else { atype='I'; *(uint32_t*)abuf = (uint32_t)x; alen=4; if (x > 4294967295ll) fprintf(stderr, "Parse warning: integer %lld is out of range.\n", x); } } } //integer type else if (atype == 'f') { *(float*)abuf = (float)atof(str + 5); alen = sizeof(float); } else if (atype == 'd') { //? 
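// [Editorial note on the integer branch above, not from the original
// sources] An 'i'/'I' value is re-typed to the smallest SAM/BAM auxiliary
// representation that can hold it (c/s/i when negative, C/S/I otherwise).
// Hypothetical examples: add_aux("NM:i:2") stores tag "NM" as type 'C' with
// the single byte 0x02, while add_aux("XS:A:+") keeps type 'A' and stores
// the '+' character as-is.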
*(float*)abuf = (float)atof(str + 9); alen=8; } else if (atype == 'Z' || atype == 'H') { if (atype == 'H') { // check whether the hex string is valid if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); for (int i = 0; i < strl - 5; ++i) { int c = toupper(str[5 + i]); if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) parse_error("invalid hex character"); } } memcpy(abuf, str + 5, strl - 5); abuf[strl-5] = 0; alen=strl-4; //making sure the len includes the terminal \0 } else parse_error("unrecognized aux type"); this->add_aux(tag, atype, alen, adata); }//add_aux() uint8_t* GBamRecord::find_tag(const char tag[2]) { return bam_aux_get(this->b, tag); } char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char uint8_t* s=find_tag(tag); if (s) return ( bam_aux2A(s) ); return 0; } int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag uint8_t *s=find_tag(tag); if (s) return ( bam_aux2i(s) ); return 0; } string GBamRecord::tag_str(const char tag[2]) { //return string value for a tag string r(""); uint8_t *sz=find_tag(tag); if (sz) { r = bam_aux2Z(sz); } return r; } char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag char c=tag_char("XS"); if (c) return c; else return '.'; } string GBamRecord::sequence() { char *s = (char*)bam1_seq(b); string qseq; qseq.resize(b->core.l_qseq); for (int i=0;i<(b->core.l_qseq);i++) { int8_t v = bam1_seqi(s,i); qseq[i] = bam_nt16_rev_table[v]; } return qseq; } string GBamRecord::qualities() { char *qual = (char*)bam1_qual(b); string qv; qv.resize(b->core.l_qseq); for(int i=0;i<(b->core.l_qseq);++i) { qv[i]=qual[i]+33; } return qv; } string GBamRecord::seqData(string* readquals) { static const int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; string seq; string squal; unsigned char *qual = (unsigned char*)bam1_qual(b); unsigned char *s = (unsigned char*)bam1_seq(b); int i; //bool ismapped=((b->core.flag & BAM_FUNMAP) == 0); bool isreversed=((b->core.flag & BAM_FREVERSE) != 0); bool is_paired = ((b->core.flag & BAM_FPAIRED) != 0); int mate_num=0; if (is_paired) { if (b->core.flag & BAM_FREAD1) mate_num=1; else if (b->core.flag & BAM_FREAD2) mate_num=2; } int seqlen = b->core.l_qseq; if (seqlen>0) { seq.resize(seqlen); for(i=0;i0 return seq; } tophat-2.0.9/src/segment_juncs.cpp0000644000175000017500000047536612122334363015734 0ustar toortoor#ifdef HAVE_CONFIG_H #include #endif /* * segment_juncs.cpp * TopHat * * Created by Cole Trapnell on 2/5/09. * Copyright 2009 Cole Trapnell. All rights reserved. 
* */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "utils.h" #include "bwt_map.h" #include "tokenize.h" #include "segments.h" #include "reads.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" using namespace seqan; using namespace std; using namespace __gnu_cxx; // daehwan //geo //#define B_DEBUG 1 void print_usage() { fprintf(stderr, "Usage: segment_juncs [right_reads.fq right_reads.bwtout right_seg1.bwtout,...,right_segN.bwtout]\n"); } // This is the maximum number of bowtie mismatches allower per segment hit static const int num_bowtie_mismatches = 2; static const int max_cov_juncs = 5000000; // static const int max_cov_juncs = std::numeric_limits::max(); static const int max_seg_juncs = 10000000; int max_microexon_stretch = 2000; int butterfly_overhang = 6; int min_cov_length = 20; void get_seqs(istream& ref_stream, RefSequenceTable& rt, bool keep_seqs = true) { while(ref_stream.good() && !ref_stream.eof()) { RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence(); string name; readMeta(ref_stream, name, Fasta()); string::size_type space_pos = name.find_first_of(" \t\r"); if (space_pos != string::npos) { name.resize(space_pos); } fprintf(stderr, "\tLoading %s...", name.c_str()); seqan::read(ref_stream, *ref_str, Fasta()); fprintf(stderr, "done\n"); rt.get_id(name, keep_seqs ? ref_str : NULL, 0); if (!keep_seqs) delete ref_str; } } RefSeg seg_from_bowtie_hit(const BowtieHit& T) { RefSeg r_seg(T.ref_id(), POINT_DIR_DONTCARE, T.antisense_align(), READ_DONTCARE, 0, 0); if (T.antisense_align()) { r_seg.left = max(0, T.right() - 2); r_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches r_seg.points_where = POINT_DIR_RIGHT; } else { r_seg.left = max(0, T.left() - (T.right() - T.left() + 1)); r_seg.right = T.left() + 2; // num allowed bowtie mismatches r_seg.points_where = POINT_DIR_LEFT; } return r_seg; } pair segs_from_bowtie_hits(const BowtieHit& T, const BowtieHit& H) { pair seg_pair; if (H.antisense_align() == false && abs((H.right() + 1) - T.left()) < (int)max_segment_intron_length) { RefSeg left_seg(H.ref_id(), POINT_DIR_RIGHT, H.antisense_align(), READ_DONTCARE, 0, 0); left_seg.left = max(0, H.right() - 2); left_seg.right = H.right() + (H.right() - H.left() + 1); // num allowed bowtie mismatches RefSeg right_seg(T.ref_id(), POINT_DIR_LEFT, T.antisense_align(), READ_DONTCARE, 0, 0); right_seg.left = max(0, T.left() - (T.right() - T.left() + 1)); right_seg.right = T.left() + 2; // num allowed bowtie mismatches seg_pair = make_pair(left_seg, right_seg); } else if (H.antisense_align() == true && abs((T.right() + 1) - H.left()) < (int)max_segment_intron_length) { RefSeg left_seg(T.ref_id(), POINT_DIR_RIGHT, T.antisense_align(), READ_DONTCARE, 0, 0); left_seg.left = max(0, T.right() - 2); left_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches RefSeg right_seg(H.ref_id(), POINT_DIR_LEFT, H.antisense_align(), READ_DONTCARE, 0, 0); right_seg.left = max(0, H.left() - (H.right() - H.left() + 1)); right_seg.right = H.left() + 2; // num allowed bowtie mismatches seg_pair = make_pair(left_seg, right_seg); } return seg_pair; } //static const size_t half_splice_mer_len = 6; //static const size_t splice_mer_len = 2 * half_splice_mer_len; struct MerExtension { static const int MAX_EXTENSION_BP = 14; uint32_t 
left_dna_str : 28; // up to 14bp encoded in 2-bits-per-base uint8_t left_ext_len : 4; // how many bases in this extension uint32_t right_dna_str : 28; // up to 14bp encoded in 2-bits-per-base uint8_t right_ext_len : 4; // how many bases in this extension MerExtension() : left_dna_str(0), left_ext_len(0), right_dna_str(0), right_ext_len(0) {} bool operator<(const MerExtension& rhs) const { if (left_dna_str != rhs.left_dna_str) return left_dna_str < rhs.left_dna_str; if (left_ext_len != rhs.left_ext_len) return left_ext_len < rhs.left_ext_len; if (right_dna_str != rhs.right_dna_str) return right_dna_str < rhs.right_dna_str; if (right_ext_len != rhs.right_ext_len) return right_ext_len < rhs.right_ext_len; return false; } bool operator==(const MerExtension& rhs) const { bool eq = left_dna_str == rhs.left_dna_str && right_dna_str == rhs.right_dna_str && left_ext_len == rhs.left_ext_len && right_ext_len == rhs.right_ext_len; return eq; } }; /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2, /// T=3, N=4 uint8_t charToDna5[] = { /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 48 */ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, /* A C G N */ /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* T */ /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, /* a c g n */ /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* t */ /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef vector > MerExtensionTable; typedef vector MerExtensionCounts; MerExtensionTable extensions; MerExtensionCounts extension_counts; uint64_t dna5str_to_idx(const string& str) { uint64_t idx = 0; for (size_t i = 0; i < str.length(); ++i) { idx <<=2; char c = toupper(str[i]); idx |= (0x3 & charToDna5[(size_t)c]); } return idx; } uint64_t colorstr_to_idx(const string& str) { uint64_t idx = 0; for (size_t i = 0; i < str.length(); ++i) { idx <<=2; char c = str[i]; idx |= (0x3 & charToDna5[(size_t)c]); } return idx; } void store_read_extensions(MerExtensionTable& ext_table, int seq_key_len, int min_ext_len, const string& seq, bool use_precount_table) { // h is will hold the 2-bit-per-base representation of the k-mer seeds for // this read. uint64_t seed = 0; bitset<256> left = 0; bitset<256> right = 0; const char* p = seq.c_str(); unsigned int seq_len = (int)seq.length(); const char* seq_end = p + seq_len; // Build the first seed while (p < seq.c_str() + (2 * seq_key_len)) { seed <<= 2; seed |= (0x3 & charToDna5[(size_t)*p]); ++p; } // Build the rest of them with a sliding window, adding successive bases // to the "right-remainder" word. while (p < seq_end) { right <<= 2; right |= (0x3 & charToDna5[(size_t)*p]); ++p; } // This loop will construct successive seed, along with 32-bit words // containing the left and right remainders for each seed uint32_t i = 0; size_t new_hits = 0; do { // How many base pairs exist in the right remainder beyond what we have // space for ? 
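// [Editorial note, not part of the original TopHat sources] Everything here
// uses the same 2-bit code as dna5str_to_idx() above (A=0, C=1, G=2, T=3),
// e.g. dna5str_to_idx("ACGT") == 0b00011011 == 27. The computation below
// keeps only the MerExtension::MAX_EXTENSION_BP (14) remainder bases nearest
// the seed: for a hypothetical 40 bp read with seq_key_len == 6 and i == 0,
// the right remainder holds 40 - 12 = 28 bases, so extra_right_bp == 14 and
// the shift discards the 14 bases at the read's 3' end.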
int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - MerExtension::MAX_EXTENSION_BP; uint32_t hit_right = 0; if (extra_right_bp > 0) { //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1)); hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong(); } else { hit_right = (uint32_t)right.to_ulong(); } uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong(); //size_t prev_cap = (*mer_table)[seed].capacity(); //(*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement)); //cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit); MerExtension ext; ext.right_dna_str = hit_right; ext.right_ext_len = min(seq_len - (2 * seq_key_len) - i, (unsigned int)MerExtension::MAX_EXTENSION_BP); ext.left_dna_str = hit_left; ext.left_ext_len = min(i, (unsigned int)MerExtension::MAX_EXTENSION_BP); if (use_precount_table) { int curr_seed = --extension_counts[seed]; if (curr_seed < 0 || curr_seed > (int)ext_table[seed].size()) { fprintf(stderr, "Error: curr_seed is %d, max is %lu\n", curr_seed, (long unsigned int)ext_table[seed].size()); } ext_table[seed][curr_seed] = ext; } else { ext_table[seed].push_back(ext); } new_hits++; // Take the leftmost base of the seed and stick it into bp uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2)); // Move that base down to the least significant bits of bp bp >>= ((seq_key_len << 2) - 2); // And tack it onto the left remainder of the read left <<= 2; left |= bp; // Now take the leftmost base of the right remainder and stick it into // the rightmost position of the seed uint32_t right_len = seq_len - (i + seq_key_len * 2); //bp = right & (0x3uLL << ((right_len - 1) << 1)); seed <<= 2; //cout << right << endl; bitset<256> tmp_right = (right >> ((right_len - 1) << 1)); //cout < left = 0; bitset<256> right = 0; const char* p = seq.c_str(); unsigned int seq_len = (int)seq.length(); const char* seq_end = p + seq_len; // Build the first seed while (p < seq.c_str() + (2 * seq_key_len)) { seed <<= 2; seed |= (0x3 & charToDna5[(size_t)*p]); ++p; } // Build the rest of them with a sliding window, adding successive bases // to the "right-remainder" word. 
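// [Editorial note, not part of the original TopHat sources] This function is
// the counting pass of a two-pass table build: it only increments
// ext_counts[seed] for every seed of every read. index_read_mers() then
// sizes each extensions[seed] bucket exactly once, and the storing pass
// (store_read_extensions() with use_precount_table == true) fills the
// pre-sized buckets back to front via --extension_counts[seed] instead of
// growing them with push_back().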
while (p < seq_end) { right <<= 2; right |= (0x3 & charToDna5[(size_t)*p]); ++p; } // This loop will construct successive seed, along with 32-bit words // containing the left and right remainders for each seed uint32_t i = 0; size_t new_hits = 0; do { ext_counts[seed]++; new_hits++; // Take the leftmost base of the seed and stick it into bp uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2)); // Move that base down to the least significant bits of bp bp >>= ((seq_key_len << 2) - 2); // And tack it onto the left remainder of the read left <<= 2; left |= bp; // Now take the leftmost base of the right remainder and stick it into // the rightmost position of the seed uint32_t right_len = seq_len - (i + seq_key_len * 2); //bp = right & (0x3uLL << ((right_len - 1) << 1)); seed <<= 2; //cout << right << endl; bitset<256> tmp_right = (right >> ((right_len - 1) << 1)); //cout < 32) read.seq.resize(32); count_read_extensions(extension_counts, half_splice_mer_len, half_splice_mer_len, read.seq); } //reads_file.rewind(); } void compact_extension_table() { for (size_t i = 0; i < extensions.size(); ++i) { vector& exts = extensions[i]; sort(exts.begin(), exts.end()); vector::iterator new_end = unique(exts.begin(), exts.end()); exts.erase(new_end, exts.end()); vector(exts).swap(exts); } } void prune_extension_table(uint8_t max_extension_bp) { uint32_t mask = ~(0xFFFFFFFFuLL << (max_extension_bp << 1)); for (size_t i = 0; i < extensions.size(); ++i) { vector& exts = extensions[i]; for (size_t j = 0; j < exts.size(); ++j) { MerExtension& ex = exts[j]; if (ex.left_ext_len > max_extension_bp) { ex.left_ext_len = max_extension_bp; ex.left_dna_str &= mask; } if (ex.right_ext_len > max_extension_bp) { ex.right_dna_str >>= ((ex.right_ext_len - max_extension_bp) << 1); ex.right_ext_len = max_extension_bp; } } } } //void store_read_mers(FILE* reads_file, size_t half_splice_mer_len) void store_read_mers(string& reads_file, size_t half_splice_mer_len) { Read read; size_t splice_mer_len = 2 * half_splice_mer_len; size_t mer_table_size = 1 << ((splice_mer_len)<<1); extensions.resize(mer_table_size); size_t num_indexed_reads = 0; ReadStream readstream(reads_file); while (readstream.get_direct(read, reads_format)) { /* FLineReader fr(reads_file); //while(!feof(reads_file)) while(!fr.isEof()) { read.clear(); // Get the next read from the file if (!next_fasta_record(fr, read.name, read.seq, reads_format)) break; if (reads_format == FASTQ) { if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format)) break; } */ if (color && !readstream.isBam()) // erase the primer and the adjacent color read.seq.erase(0, 2); if (read.seq.size() > 32) read.seq.resize(32); store_read_extensions(extensions, half_splice_mer_len, half_splice_mer_len, read.seq, true); // Do NOT index the reverse of the reads num_indexed_reads++; if (num_indexed_reads % 1000000 == 0) { //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads); //compact_extension_table(); } } //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads) uint64_t num_extensions = 0; for (size_t i = 0; i < extensions.size(); ++i) { num_extensions += extensions[i].size(); } //fprintf (stderr, "Total extensions: %lu\n", (long unsigned int)num_extensions); //reads_file.rewind(); } //void index_read_mers(vector reads_files, void index_read_mers(vector& reads_files, size_t half_splice_mer_len) { extensions.clear(); for (size_t i = 0; i < reads_files.size(); ++i) { count_read_mers(reads_files[i], 
half_splice_mer_len); } extensions.resize(extension_counts.size()); for (size_t i = 0; i < extension_counts.size(); ++i) { extensions[i].resize(extension_counts[i]); } for (size_t i = 0; i < reads_files.size(); ++i) { store_read_mers(reads_files[i], half_splice_mer_len); } compact_extension_table(); } /** Returns the number of characters in strings w1 and w2 that match, * starting from right to left */ int get_matching_chars(uint32_t w1, uint32_t w2) { //find the least significant mismatching bit between w1 and w2 int mismatch_bit = ffs(w1 ^ w2); // If there is no mismatching bit, the words are equal if (!mismatch_bit) return -1; // Given the mismatching bit, determine where the mismatching base is mismatch_bit -= 1; mismatch_bit -= ((mismatch_bit) & 1); mismatch_bit >>= 1; // Return the number of matching characters. return mismatch_bit; } /** * Computes the Hamming distance between two 16bp words, up to a specified * maximum number of mismatches. */ uint32_t mismatching_bases(uint32_t w1_word, uint32_t w2_word, int len, uint32_t max_mis) { uint32_t diffs = 0; int shift = 0; int L = len; uint32_t misses = 0; // While we haven't yet exceeded the maximum allowable mismatches, // and there are still unaligned bases, keep shift-anding while (shift < len && misses <= max_mis) { int match_chars = 0; // Get the number of characters matching on the right sides of // both words match_chars = get_matching_chars(w1_word, w2_word); // If they are equal for this shift, we are done, // the loop will stop at the next iteration if (match_chars == -1 || match_chars >= len) { match_chars = len; shift = len; } else { // If there is a mismatch in the remaining words // decide how much to shift by and try again match_chars = min(len, match_chars); int shift_chars = (match_chars + 1); L -= shift_chars; int shift_bits = shift_chars << 1; // Shift right past the matching part and the first mismatching base w1_word >>= (shift_bits); w2_word >>= (shift_bits); shift += shift_chars; diffs++; misses++; } } return diffs; } uint64_t rc_dna_str(uint64_t dna_str) { dna_str = ~dna_str; uint64_t rc = 0; for (int i = 0; i < 32; i++) { rc <<= 2; rc |= (dna_str & 0x3); dna_str >>= 2; } return rc; } uint64_t rc_color_str(uint64_t color_str) { uint64_t rc = 0; for (int i = 0; i < 32; ++i) { rc <<= 2; rc |= (color_str & 0x3); color_str >>= 2; } return rc; } struct DnaSpliceStrings { DnaSpliceStrings(uint64_t f, uint64_t r) : fwd_string(f), rev_string(r), first_in_string('N'), last_in_string('N') {} uint64_t fwd_string; uint64_t rev_string; // for color-space purposes char first_in_string; char last_in_string; bool operator<(const DnaSpliceStrings& rhs) const { if (fwd_string != rhs.fwd_string) return fwd_string < rhs.fwd_string; if (rev_string != rhs.rev_string) return rev_string < rhs.rev_string; return false; } bool operator==(const DnaSpliceStrings& rhs) const { return fwd_string == rhs.fwd_string && rev_string == rhs.rev_string; } }; struct IntronMotifs { IntronMotifs(uint32_t rid) : ref_id(rid) {} uint32_t ref_id; vector > fwd_donors; vector > fwd_acceptors; vector > rev_donors; vector > rev_acceptors; void unique(vector >& f) { sort(f.begin(), f.end()); vector >::iterator i = std::unique(f.begin(), f.end()); f.erase(i, f.end()); } void unique() { unique(fwd_donors); unique(fwd_acceptors); unique(rev_donors); unique(rev_acceptors); } void attach_mers(RefSequenceTable::Sequence& ref_str) { attach_upstream_mers(ref_str, fwd_donors); attach_upstream_mers(ref_str, rev_acceptors); attach_downstream_mers(ref_str, rev_donors); 
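// [Editorial note, not part of the original TopHat sources] A hedged reading
// of the four calls in attach_mers(): for introns on the forward strand the
// donor's exonic flank lies upstream of the splice site and the acceptor's
// lies downstream, while reverse-strand introns appear mirrored in genome
// coordinates; that is presumably why fwd_donors and rev_acceptors take
// upstream mers while rev_donors and fwd_acceptors take downstream mers.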
attach_downstream_mers(ref_str, fwd_acceptors); } void attach_upstream_mers(RefSequenceTable::Sequence& ref_str, vector >& dinucs) { for (size_t i = 0; i < dinucs.size(); ++i) { size_t pos = dinucs[i].first; int half_splice_mer_len = 32; if (color) { if (pos <= (size_t)half_splice_mer_len+1 || pos >= length(ref_str)) continue; Dna5String seg_str = seqan::infixWithLength(ref_str, pos - half_splice_mer_len - 1, half_splice_mer_len + 1); stringstream ss(stringstream::in | stringstream::out); string s; ss << seg_str; ss >> s; string col_seg_str = convert_bp_to_color(s, true); uint64_t idx = colorstr_to_idx(col_seg_str); dinucs[i].second.fwd_string = idx; dinucs[i].second.rev_string = rc_color_str(idx); dinucs[i].second.first_in_string = s[1]; dinucs[i].second.last_in_string = s[half_splice_mer_len]; } else { if (pos <= (size_t)half_splice_mer_len || pos >= length(ref_str)) continue; Dna5String seg_str = seqan::infixWithLength(ref_str, pos - half_splice_mer_len, half_splice_mer_len); stringstream ss(stringstream::in | stringstream::out); string s; ss << seg_str; ss >> s; uint64_t idx = dna5str_to_idx(s); dinucs[i].second.fwd_string = idx; dinucs[i].second.rev_string = rc_dna_str(idx); } } } void attach_downstream_mers(RefSequenceTable::Sequence& ref_str, vector >& dinucs) { for (size_t i = 0; i < dinucs.size(); ++i) { size_t pos = dinucs[i].first; int half_splice_mer_len = 32; if (pos + 2 + half_splice_mer_len >= length(ref_str)) continue; if (color) { Dna5String seg_str = seqan::infixWithLength(ref_str, pos + 2 - 1, half_splice_mer_len + 1); stringstream ss(stringstream::in | stringstream::out); string s; ss << seg_str; ss >> s; string col_seg_str = convert_bp_to_color(s, true); uint64_t idx = colorstr_to_idx(col_seg_str); dinucs[i].second.fwd_string = idx; dinucs[i].second.rev_string = rc_color_str(idx); dinucs[i].second.first_in_string = s[1]; dinucs[i].second.last_in_string = s[half_splice_mer_len]; } else { Dna5String seg_str = seqan::infixWithLength(ref_str, pos + 2, half_splice_mer_len); stringstream ss(stringstream::in | stringstream::out); string s; ss << seg_str; ss >> s; uint64_t idx = dna5str_to_idx(s); dinucs[i].second.fwd_string = idx; dinucs[i].second.rev_string = rc_dna_str(idx); } } } }; struct PackedSplice { PackedSplice() : left(0u), seed(0u), right(0u) {} PackedSplice(uint32_t l, uint64_t s, uint32_t r) : left(l), seed(s), right(r) {} uint32_t left; uint64_t seed; uint32_t right; }; // The second element of these pairs is the left (or right) side of a splice // seed from a possible junction. 
The first element is the sequence flanking // that seed typedef pair PackedSpliceHalf; static inline std::string u32ToDna(uint32_t a, int len) { char buf[17]; assert(len <= 16); for(int i = 0; i < len; i++) { buf[len-i-1] = "ACGT"[a & 3]; a >>= 2; } buf[len] = '\0'; return std::string(buf); } PackedSpliceHalf pack_left_splice_half(const string& seq, uint32_t pos_in_l, unsigned int seq_key_len) { const char* l = seq.c_str(); l += pos_in_l; const char* left_end = l; l -= 16; assert (l + seq_key_len < seq.c_str() + seq.length()); PackedSpliceHalf packed_half = make_pair(0u,0u); // Pack up to 32 bits (16 bases) of sequence into left if (l < seq.c_str()) l = seq.c_str(); while (l < left_end) { packed_half.first <<= 2; packed_half.first |= (0x3 & charToDna5[(size_t)*l]); ++l; } // Pack up the seed bits for (unsigned int i = 0; i < seq_key_len; ++i) { packed_half.second <<= 2; packed_half.second |= (0x3 & charToDna5[(size_t)*(l + i)]); } return packed_half; } PackedSpliceHalf pack_right_splice_half(const string& seq, uint32_t pos, unsigned int seq_key_len) { const char* r = seq.c_str(); r += pos - seq_key_len; PackedSpliceHalf packed_half = make_pair(0u,0u); // Pack the seed bits for (unsigned int i = 0; i < seq_key_len; ++i) { packed_half.second <<= 2; packed_half.second |= (0x3 & charToDna5[(size_t)*(r + i)]); } r += seq_key_len; // Now pack 32 bits (16 bases) of sequence into left const char* right_end = r + 16; if ((size_t)(right_end - seq.c_str()) > seq.length()) right_end = seq.c_str() + seq.length(); while (r < right_end) { packed_half.first <<= 2; packed_half.first |= (0x3 & charToDna5[(size_t)*r]); ++r; } return packed_half; } PackedSplice combine_splice_halves(const PackedSpliceHalf& left_half, const PackedSpliceHalf& right_half, int seq_key_len) { uint64_t seed = left_half.second << (seq_key_len << 1) | right_half.second; return PackedSplice(left_half.first,seed, right_half.first); } PackedSplice pack_splice(const string& seq, int l_pos_in_seq, int r_pos_in_seq, unsigned int seq_key_len) { const char* l = seq.c_str(); // l points to beginning of left exon sequence l += l_pos_in_seq; assert (l + seq_key_len < seq.c_str() + seq.length()); const char* r = seq.c_str(); // r points to beginning of right exon sequence r += r_pos_in_seq - seq_key_len; //r += 2; // r points to right side of junction; uint64_t seed = 0; uint64_t left = 0; uint64_t right = 0; // Pack up the seed bits for (unsigned int i = 0; i < seq_key_len; ++i) { seed <<= 2; seed |= (0x3 & charToDna5[(size_t)*(l + i)]); } for (unsigned int i = 0; i < seq_key_len; ++i) { seed <<= 2; seed |= (0x3 & charToDna5[(size_t)*(r + i)]); } // Now pack 32 bits (16 bases) of sequence into left const char* left_end = l; l -= 16; if (l < seq.c_str()) l = seq.c_str(); while (l < left_end) { left <<= 2; left |= (0x3 & charToDna5[(size_t)*l]); ++l; } r += seq_key_len; // Now pack 32 bits (16 bases) of sequence into left const char* right_end = r + 16; if ((size_t)(right_end - seq.c_str()) > seq.length()) right_end = seq.c_str() + seq.length(); while (r < right_end) { right <<= 2; right |= (0x3 & charToDna5[(size_t)*r]); ++r; } return PackedSplice((uint32_t)left,seed,(uint32_t)right); } /* Represents a hit between a splice seed and a read. */ // TODO: consider packing pos and meta into a single 32-bit int. 
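// [Editorial note, not part of the original TopHat sources] In the struct
// below, the 31-bit meta field and the 1-bit reverse_complement flag are
// laid out to share a single 32-bit word, and __attribute__((packed))
// suppresses padding, so on common GCC targets each ReadHit should occupy
// about 16 bytes; that matters because the mer_table can hold one ReadHit
// per seed per indexed read.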
struct ReadHit { ReadHit(uint32_t l, uint32_t r, uint32_t p, uint32_t m, bool rc) : left(l), right(r), pos(p), meta(m), reverse_complement(rc) {} uint32_t left; // 2-bits per base rep of the left remainder uint32_t right; //2-bits per base rep of the right remainder uint32_t pos; // position of the seed within the read uint32_t meta : 31; bool reverse_complement : 1; } __attribute__((packed)); // A MerTable maps k-mers to hits in indexed reads. See the comment for // mer_table typedef vector ReadHitList; typedef vector MerTable; size_t index_read(MerTable* mer_table, int seq_key_len, const string& seq, unsigned int read_num, bool reverse_complement, vector& seeds) { // h is will hold the 2-bit-per-base representation of the k-mer seeds for // this read. uint64_t seed = 0; bitset<256> left = 0; bitset<256> right = 0; const char* p = seq.c_str(); unsigned int seq_len = (int)seq.length(); const char* seq_end = p + seq_len; // Build the first seed while (p < seq.c_str() + (2 * seq_key_len)) { seed <<= 2; seed |= (0x3 & charToDna5[(size_t)*p]); ++p; } seeds.push_back(seed); // Build the rest of them with a sliding window, adding successive bases // to the "right-remainder" word. while (p < seq_end) { right <<= 2; right |= (0x3 & charToDna5[(size_t)*p]); ++p; } size_t cap_increase = 0; // At this point, seed contains the 5'-most 2*min_anchor_len bases of the // read, and right contains everthing else on the 3' end. // This loop will construct successive seed, along with 32-bit words // containing the left and right remainders for each seed uint32_t i = 0; size_t new_hits = 0; do { // Let's not make an out-of-bounds write, if this fails the global // mer_table is too small assert (!mer_table || seed < mer_table->size()); // How many base pairs exist in the right remainder beyond what we have // space for ? 
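// (Worked example, assuming a 76-bp read and seq_key_len == 6: at seed offset
// i == 0 the right remainder holds 76 - 12 = 64 bases, which is 48 more than
// the 16 bases a 32-bit hit_right word can keep, so the 48 bases furthest from
// the seed are shifted away below.)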
int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - 16; uint32_t hit_right; if (extra_right_bp > 0) { //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1)); hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong(); } else { hit_right = (uint32_t)right.to_ulong(); } uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong(); if (mer_table) { size_t prev_cap = (*mer_table)[seed].capacity(); (*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement)); cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit); } new_hits++; // Take the leftmost base of the seed and stick it into bp uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2)); // Move that base down to the least significant bits of bp bp >>= ((seq_key_len << 2) - 2); // And tack it onto the left remainder of the read left <<= 2; left |= bp; // Now take the leftmost base of the right remainder and stick it into // the rightmost position of the seed uint32_t right_len = seq_len - (i + seq_key_len * 2); //bp = right & (0x3uLL << ((right_len - 1) << 1)); seed <<= 2; //cout << right << endl; bitset<256> tmp_right = (right >> ((right_len - 1) << 1)); //cout < left_extend(const string& ref, const string& read, int ref_pos, int read_pos, int num_mismatches) { string::size_type ext = 0; int mm_encountered = 0; while(ref_pos >= 0 && read_pos >= 0) { //char ref_char = ref[ref_pos]; //char read_char = read[read_pos]; if (ref[ref_pos] != read[read_pos]) { if (mm_encountered + 1 > num_mismatches) return make_pair(ext, mm_encountered); mm_encountered++; } ext++; --ref_pos; --read_pos; } return make_pair(ext, mm_encountered); } pair right_extend(const string& ref, const string& read, int ref_pos, int read_pos, int num_mismatches) { string::size_type ext = 0; int mm_encountered = 0; while(ref_pos < (int)ref.size() && read_pos < (int)read.size()) { if (ref[ref_pos] != read[read_pos]) { if (mm_encountered + 1 > num_mismatches) return make_pair(ext, mm_encountered); mm_encountered++; } ext++; ++ref_pos; ++read_pos; } return make_pair(ext, mm_encountered); } void extend_from_seeds(vector& extensions, const PackedSplice& p, const MerTable& mer_table, const string& ref, const string& read, size_t l_pos_in_ref, size_t r_pos_in_ref, int seq_key_len) { assert(p.seed < mer_table.size()); const ReadHitList& hl = mer_table[p.seed]; for (size_t hit = 0; hit < hl.size(); ++hit) { const ReadHit& rh = hl[hit]; uint32_t pos = rh.pos; pair left_extension; pair right_extension; left_extension = left_extend(ref, read, l_pos_in_ref - seq_key_len + 1, pos, 2); right_extension = right_extend(ref, read, r_pos_in_ref + seq_key_len, pos + 2 * seq_key_len, 2); extensions.push_back(SeedExtension(l_pos_in_ref, r_pos_in_ref, pos + seq_key_len, left_extension.first, right_extension.first, left_extension.second + right_extension.second)); } } typedef pair SpliceHalf; void get_seed_extensions(const string& ref, const string& read, int seq_key_len, MerTable& mer_table, vector& donors, vector& acceptors, vector& extensions) { for (size_t d = 0; d < donors.size(); ++d) { bool broke_out = false; // start pos is a lower bound on downstream acceptor positions // to consider size_t start_pos = donors[d].first + min_report_intron_length; SpliceHalf dummy = make_pair(start_pos,PackedSpliceHalf()); vector::iterator lb = upper_bound(acceptors.begin(), acceptors.end(), dummy); if (lb == acceptors.end()) break; for (size_t a = lb - acceptors.begin(); a < acceptors.size(); ++a) { if 
(acceptors[a].first - donors[d].first > (size_t)max_microexon_stretch) { broke_out = true; break; } size_t l_pos_in_ref = donors[d].first - 1; size_t r_pos_in_ref = acceptors[a].first + 2; PackedSplice p = combine_splice_halves(donors[d].second, acceptors[a].second, seq_key_len); extend_from_seeds(extensions, p, mer_table, ref, read, l_pos_in_ref, r_pos_in_ref, seq_key_len); } if (broke_out) continue; } } void hits_from_seed_extension(uint32_t ref_id, int ref_offset, uint64_t insert_id, bool antisense, vector& extensions, vector& hits_out, int left_read_edge, int right_read_edge, int seq_key_len) { for (size_t i = 0; i < extensions.size(); ++i) { SeedExtension& s = extensions[i]; if (s.read_pos >= right_read_edge || s.read_pos < left_read_edge) continue; if (s.read_pos - seq_key_len - s.left_extent <= left_read_edge && s.read_pos + seq_key_len + s.right_extent >= right_read_edge && s.mismatches <= 2 ) { vector cigar; int off_adjust; if (antisense) { CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge); CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref); CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos); cigar.push_back(m1); cigar.push_back(skip); cigar.push_back(m2); off_adjust = m1.length; } else { CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge + 1); CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref); CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos - 1); cigar.push_back(m1); cigar.push_back(skip); cigar.push_back(m2); off_adjust = m1.length; } // daehwan - check this bool end = false; BowtieHit bh(ref_id, ref_id, insert_id, ref_offset + s.l_pos_in_ref - off_adjust + 1, cigar, antisense, false, s.mismatches, s.mismatches, 0, end); hits_out.push_back(bh); } } } void align(uint32_t ref_id, uint64_t insert_id, bool antisense, const string& ref, const string& read, int ref_offset, int left_read_edge, int right_read_edge, MerTable& mer_table, int seq_key_len, vector& hits_out) { // Reserve an entry for each k-mer we might see size_t mer_table_size = 1 << ((seq_key_len << 1)<<1); mer_table.resize(mer_table_size); vector seeds; index_read(&mer_table, seq_key_len, read, 0, antisense, seeds); vector forward_donors; vector forward_acceptors; vector reverse_donors; vector reverse_acceptors; const string& seq = ref; unsigned int pos = 0; for (size_t z = seq_key_len + 1; z < seq.length() - seq_key_len - 2; ++z) { char l = seq[z - 1]; char r = seq[z]; if (l == 'G' && r == 'T') { size_t donor_pos = pos + z - 1; size_t s = donor_pos - seq_key_len; PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len); forward_donors.push_back(make_pair(donor_pos,p)); } if (l == 'A' && r == 'G') { size_t acceptor_pos = pos + z - 1; size_t s = acceptor_pos + 2 + seq_key_len; PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len); forward_acceptors.push_back(make_pair(acceptor_pos,p)); } if (l == 'C' && r == 'T') { size_t acceptor_pos = pos + z - 1; size_t s = acceptor_pos - seq_key_len; PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len); reverse_acceptors.push_back(make_pair(pos + z - 1,p)); } if (l == 'A' && r == 'C') { size_t donor_pos = pos + z - 1; size_t s = donor_pos + 2 + seq_key_len; PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len); reverse_donors.push_back(make_pair(donor_pos,p)); } } vector fwd_extensions; get_seed_extensions(seq, read, seq_key_len, mer_table, forward_donors, forward_acceptors, fwd_extensions); hits_from_seed_extension(ref_id, ref_offset, insert_id, antisense, fwd_extensions, hits_out, 
left_read_edge, right_read_edge, seq_key_len); //fprintf(stderr, "Found %d seed hits\n", fwd_extensions.size()); vector rev_extensions; get_seed_extensions(seq, read, seq_key_len, mer_table, reverse_donors, reverse_acceptors, rev_extensions); hits_from_seed_extension(ref_id, ref_offset, insert_id, antisense, rev_extensions, hits_out, left_read_edge, right_read_edge, seq_key_len); for (size_t i = 0; i < seeds.size(); ++i) mer_table[seeds[i]].clear(); } int extension_mismatches = 0; bool left_extendable_junction(uint64_t upstream_dna_str, size_t key, size_t splice_mer_len, size_t min_ext_len) { vector& exts = extensions[key]; for (size_t i = 0; i < exts.size(); ++i) { const MerExtension& ext = exts[i]; if (ext.left_ext_len < min_ext_len) continue; uint64_t upstream = upstream_dna_str & ~(0xFFFFFFFFFFFFFFFFull << (ext.left_ext_len << 1)); int mism = mismatching_bases(ext.left_dna_str, upstream, ext.left_ext_len, extension_mismatches); if (mism <= extension_mismatches) return true; } return false; } bool right_extendable_junction(uint64_t downstream_dna_str, size_t key, size_t splice_mer_len, size_t min_ext_len) { vector& exts = extensions[key]; for (size_t i = 0; i < exts.size(); ++i) { const MerExtension& ext = exts[i]; if (ext.right_ext_len < min_ext_len) continue; uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull >> (ext.right_ext_len << 1)); uint64_t downstream = downstream_dna_str & mask; downstream >>= ((32 - ext.right_ext_len) << 1); int mism = mismatching_bases(ext.right_dna_str, downstream, ext.right_ext_len, extension_mismatches); if (mism <= extension_mismatches) return true; } return false; } uint32_t junction_key(uint64_t upstream_dna_str, uint64_t downstream_dna_str, size_t splice_mer_len) { uint64_t upstream_mask = ~(0xFFFFFFFFFFFFFFFFull << (splice_mer_len)); uint64_t upstream_key_half = upstream_dna_str & upstream_mask; uint64_t downstream_mask = ~(0xFFFFFFFFFFFFFFFFull >> (splice_mer_len)); uint64_t downstream_key_half = (downstream_dna_str & downstream_mask) >> (64 - splice_mer_len); uint32_t key = ((uint32_t)upstream_key_half << splice_mer_len) | (uint32_t)downstream_key_half; return key; } bool extendable_junction(uint64_t upstream_dna_str, uint64_t downstream_dna_str, size_t splice_mer_len, size_t min_ext_len, bool reverse, char last_in_upstream = 'N', char first_in_downstream = 'N') { if (color) { string two_bp; two_bp.push_back(last_in_upstream); two_bp.push_back(first_in_downstream); string color = convert_bp_to_color(two_bp, true); char num = (color[0] - '0') & 0x3; if (reverse) { upstream_dna_str = (upstream_dna_str >> 2) << 2; upstream_dna_str |= (uint64_t)num; } else { downstream_dna_str = (downstream_dna_str << 2) >> 2; downstream_dna_str |= ((uint64_t)num << 62); } } uint32_t key = junction_key(upstream_dna_str, downstream_dna_str, splice_mer_len); upstream_dna_str >>= splice_mer_len; downstream_dna_str <<= splice_mer_len; bool extendable = (left_extendable_junction(upstream_dna_str, key, splice_mer_len, min_ext_len) || right_extendable_junction(downstream_dna_str, key, splice_mer_len, min_ext_len)); return extendable; } typedef std::set PotentialJuncs; struct RecordExtendableJuncs { void record(uint32_t ref_id, const vector >& left_sites, const vector >& right_sites, bool antisense, PotentialJuncs& juncs, int min_intron, int max_intron, size_t max_juncs, size_t half_splice_mer_len) { size_t splice_mer_len = 2 * half_splice_mer_len; size_t curr_R = 0; for (size_t L = 0; L < left_sites.size(); ++L) { while (curr_R < right_sites.size() && right_sites[curr_R].first < 
left_sites[L].first + min_intron) { curr_R++; } size_t left_pos = left_sites[L].first; size_t max_right_pos = left_pos + max_intron; for (size_t R = curr_R; R < right_sites.size() && right_sites[R].first < max_right_pos; ++R) { uint64_t upstream_dna_str = left_sites[L].second.fwd_string; char last_in_upstream = left_sites[L].second.last_in_string; uint64_t downstream_dna_str = right_sites[R].second.fwd_string; char first_in_downstream = right_sites[R].second.first_in_string; uint64_t rc_upstream_dna_str = left_sites[L].second.rev_string; uint64_t rc_downstream_dna_str = right_sites[R].second.rev_string; if (extendable_junction(upstream_dna_str, downstream_dna_str, splice_mer_len, 7, false, last_in_upstream, first_in_downstream) || extendable_junction(rc_downstream_dna_str, rc_upstream_dna_str, splice_mer_len, 7, true, last_in_upstream, first_in_downstream)) { juncs.insert(Junction(ref_id, left_sites[L].first - 1, right_sites[R].first + 2, antisense, R - curr_R)); } if (juncs.size() > max_juncs) juncs.erase(*(juncs.rbegin())); } } } }; struct RecordAllJuncs { void record(uint32_t ref_id, const vector >& left_sites, const vector >& right_sites, bool antisense, PotentialJuncs& juncs, int min_intron, int max_intron, size_t max_juncs, size_t half_splice_mer_len) { size_t curr_R = 0; for (size_t L = 0; L < left_sites.size(); ++L) { while (curr_R < right_sites.size() && right_sites[curr_R].first < left_sites[L].first + min_intron) { curr_R++; } size_t left_pos = left_sites[L].first; size_t max_right_pos = left_pos + max_intron; for (size_t R = curr_R; R < right_sites.size() && right_sites[R].first < max_right_pos; ++R) { Junction j(ref_id, left_sites[L].first - 1, right_sites[R].first + 2, antisense, R - curr_R); juncs.insert(j); if (juncs.size() > max_juncs) juncs.erase(*(juncs.rbegin())); } } } }; struct RecordSegmentJuncs { void record(uint32_t ref_id, const vector >& left_sites, const vector >& right_sites, bool antisense, PotentialJuncs& juncs, int min_intron, int max_intron, size_t max_juncs, size_t half_splice_mer_len) { if (left_sites.size() != right_sites.size()) return; for (size_t i = 0; i < left_sites.size(); ++i) { Junction j(ref_id, left_sites[i].first - 1, right_sites[i].first + 2, antisense); juncs.insert(j); if (juncs.size() > max_juncs) juncs.erase(*(juncs.rbegin())); } } }; struct ButterflyKey { uint32_t pos; uint32_t key; ButterflyKey(uint32_t p, uint32_t k) : pos(p), key(k) {} bool operator<(const ButterflyKey& rhs) const { if (key != rhs.key) return key < rhs.key; if (pos != rhs.pos) return pos < rhs.pos; return false; } bool operator==(const ButterflyKey& rhs) const { return pos == rhs.pos && key == rhs.key; } }; uint32_t get_left_butterfly_key(uint64_t upstream_key, const MerExtension& ext, size_t half_splice_mer_len) { uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - half_splice_mer_len) << 1); uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1)); uint64_t top_half = upstream_key & mask; key |= (top_half << (half_splice_mer_len << 1)); return (uint32_t)key; } uint32_t get_right_butterfly_key(uint64_t downstream_key, const MerExtension& ext, size_t half_splice_mer_len) { uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1)); uint64_t key = (ext.left_dna_str & mask) << (half_splice_mer_len << 1); uint64_t bottom_half = (downstream_key >> (half_splice_mer_len << 1)); key |= bottom_half; return (uint32_t)key; } struct RecordButterflyJuncs { void record(uint32_t ref_id, const vector >& all_left_sites, const vector >& all_right_sites, 
bool antisense, PotentialJuncs& juncs, int min_intron, int max_intron, size_t max_juncs, size_t half_splice_mer_len) { size_t key_length = 2 * half_splice_mer_len; size_t extension_length = butterfly_overhang; uint64_t bottom_bit_mask = ~(0xFFFFFFFFFFFFFFFFull << (key_length<<1)); uint64_t top_bit_mask = ~(0xFFFFFFFFFFFFFFFFull >> (key_length<<1)); if (all_left_sites.empty() || all_right_sites.empty()) return; size_t last_site = max(all_left_sites.back().first, all_right_sites.back().first); size_t curr_left_site = 0; size_t curr_right_site = 0; for (size_t window_left_edge = 0; window_left_edge < last_site; window_left_edge += max_intron) { //fprintf(stderr, "\twindow %lu - %lu\n", window_left_edge, window_left_edge + 2 * max_intron); vector > left_sites; vector > right_sites; while(curr_left_site < all_left_sites.size() && all_left_sites[curr_left_site].first < window_left_edge) { curr_left_site++; } while(curr_right_site < all_right_sites.size() && all_right_sites[curr_right_site].first < window_left_edge) { curr_right_site++; } for (size_t ls = curr_left_site; ls < all_left_sites.size(); ++ls) { if (all_left_sites[ls].first < window_left_edge + 2 * max_intron) { left_sites.push_back(all_left_sites[ls]); } } for (size_t rs = curr_right_site; rs < all_right_sites.size(); ++rs) { if (all_right_sites[rs].first < window_left_edge + 2 * max_intron) { right_sites.push_back(all_right_sites[rs]); } } vector left_keys; for (size_t L = 0; L < left_sites.size(); ++L) { uint64_t fwd_upstream_dna_str = left_sites[L].second.fwd_string; uint64_t fwd_upstream_key = fwd_upstream_dna_str & bottom_bit_mask; assert (fwd_upstream_key < extensions.size()); vector& fwd_exts = extensions[fwd_upstream_key]; for (size_t i = 0; i < fwd_exts.size(); ++i) { const MerExtension& ext = fwd_exts[i]; if (ext.right_ext_len < extension_length) continue; /* < f_u_key > NNNNNNNNNN GT */ // take the top bits of the right extension uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - extension_length) << 1); // and the bottom bits of the site key uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1)); uint64_t top_half = fwd_upstream_key & mask; // and cat them together key |= (top_half << (extension_length << 1)); left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key)); } uint64_t rev_upstream_dna_str = left_sites[L].second.rev_string; uint64_t rev_upstream_key = (rev_upstream_dna_str & top_bit_mask) >> (64 - (key_length<<1)); assert (rev_upstream_key < extensions.size()); vector& rev_exts = extensions[rev_upstream_key]; for (size_t i = 0; i < rev_exts.size(); ++i) { const MerExtension& ext = rev_exts[i]; if (ext.left_ext_len < extension_length) continue; /* < r_u_key > NNNNNNNNNN GT */ // reverse complement the left extension, and we will need // what were the bottom bits. these become the top bits in the // rc. uint64_t ext_str = color ? 
rc_color_str(ext.left_dna_str) : rc_dna_str(ext.left_dna_str); ext_str >>= 64 - (ext.left_ext_len << 1); // now take the top bits of the rc, make them the bottom of // the key uint64_t key = ext_str >> ((ext.left_ext_len - extension_length) << 1); // now add in the seed key bottom bits, making them the top of // the key uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1)); uint64_t top_half = fwd_upstream_key & mask; key |= (top_half << (extension_length << 1)); left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key)); } } sort (left_keys.begin(), left_keys.end()); vector::iterator new_end = unique(left_keys.begin(), left_keys.end()); left_keys.erase(new_end, left_keys.end()); vector right_keys; for (size_t R = 0; R < right_sites.size(); ++R) { uint64_t fwd_downstream_dna_str = right_sites[R].second.fwd_string; uint64_t fwd_downstream_key = (fwd_downstream_dna_str & top_bit_mask) >> (64 - (key_length<<1)); assert (fwd_downstream_key < extensions.size()); vector fwd_downstream_keys; if (color) { for(size_t color_value = 0; color_value < 4; ++color_value) { uint64_t tmp_key = (fwd_downstream_key << 2) >> 2 | (color_value << ((key_length - 1) << 1)); fwd_downstream_keys.push_back(tmp_key); } } else { fwd_downstream_keys.push_back(fwd_downstream_key); } for(size_t key = 0; key < fwd_downstream_keys.size(); ++key) { uint64_t tmp_fwd_downstream_key = fwd_downstream_keys[key]; vector& fwd_exts = extensions[tmp_fwd_downstream_key]; for (size_t i = 0; i < fwd_exts.size(); ++i) { const MerExtension& ext = fwd_exts[i]; if (ext.left_ext_len < extension_length) continue; /* < f_d_key > AG NNNNNNNNNN */ // take the bottom bits of the left extension, making them the // top of the key. uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1)); uint64_t key = (ext.left_dna_str & mask) << (extension_length << 1); // add in the top bits of the seed key, making them the bottom bits // of the key. uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1)); key |= bottom_half; right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key)); } } uint64_t rev_downstream_dna_str = right_sites[R].second.rev_string; uint64_t rev_downstream_key = rev_downstream_dna_str & bottom_bit_mask; assert (rev_downstream_key < extensions.size()); vector rev_downstream_keys; if (color) { for(size_t color_value = 0; color_value < 4; ++color_value) { uint64_t tmp_key = (rev_downstream_key >> 2) << 2 | color_value; rev_downstream_keys.push_back(tmp_key); } } else { rev_downstream_keys.push_back(rev_downstream_key); } for(size_t key = 0; key < rev_downstream_keys.size(); ++key) { uint64_t tmp_rev_downstream_key = rev_downstream_keys[key]; uint64_t tmp_fwd_downstream_key = fwd_downstream_key; if (color) { tmp_fwd_downstream_key = rc_color_str(tmp_rev_downstream_key) >> (64 - (key_length << 1)); } vector& rev_exts = extensions[tmp_rev_downstream_key]; for (size_t i = 0; i < rev_exts.size(); ++i) { const MerExtension& ext = rev_exts[i]; if (ext.right_ext_len < extension_length) continue; /* < r_d_key > AG NNNNNNNNNN */ // reverse complement the right_extension. we want the // top bits of the extension, but these become the bottom bits // under the rc. uint64_t ext_str = color ? 
rc_color_str(ext.right_dna_str) : rc_dna_str(ext.right_dna_str); ext_str >>= 64 - (ext.right_ext_len << 1); // take the bottom bits of the rc and make it the top of the key uint64_t key = ext_str << (extension_length << 1); // take the top bits of the seed key and make them the bottom // of the key. uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1)); key |= bottom_half; right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key)); } } } sort (right_keys.begin(), right_keys.end()); new_end = unique(right_keys.begin(), right_keys.end()); right_keys.erase(new_end, right_keys.end()); size_t lk = 0; size_t rk = 0; while (lk < left_keys.size() && rk < right_keys.size()) { while (lk < left_keys.size() && left_keys[lk].key < right_keys[rk].key) { ++lk; } if (lk == left_keys.size()) break; while (rk < right_keys.size() && right_keys[rk].key < left_keys[lk].key) { ++rk; } if (rk == right_keys.size()) break; if (lk < left_keys.size() && rk < right_keys.size() && right_keys[rk].key == left_keys[lk].key) { size_t k = right_keys[rk].key; size_t lk_end = lk; size_t rk_end = rk; while (rk_end < right_keys.size() && right_keys[rk_end].key == k) {++rk_end;} while (lk_end < left_keys.size() && left_keys[lk_end].key == k) {++lk_end;} size_t tmp_lk = lk; while (tmp_lk < lk_end) { size_t tmp_rk = rk; while (tmp_rk < rk_end) { int donor = (int)left_keys[tmp_lk].pos - 1; int acceptor = (int)right_keys[tmp_rk].pos + 2; if (acceptor - donor > min_intron && acceptor - donor < max_intron) { Junction j(ref_id, donor, acceptor, antisense, acceptor - donor); // just prefer shorter introns juncs.insert(j); if (juncs.size() > max_juncs) { juncs.erase(*(juncs.rbegin())); } } ++tmp_rk; } ++tmp_lk; } lk = lk_end; rk = rk_end; } } } } }; template void juncs_from_ref_segs(RefSequenceTable& rt, vector& expected_don_acc_windows, PotentialJuncs& juncs, const DnaString& donor_dinuc, const DnaString& acceptor_dinuc, int max_intron, int min_intron, size_t max_juncs, bool talkative, size_t half_splice_mer_len) { typedef map MotifMap; MotifMap ims; seqan::DnaStringReverseComplement rev_donor_dinuc(donor_dinuc); seqan::DnaStringReverseComplement rev_acceptor_dinuc(acceptor_dinuc); if (talkative) fprintf(stderr, "Collecting potential splice sites in islands\n"); // bool all_both = true; for (size_t r = 0; r < expected_don_acc_windows.size(); ++r) { const RefSeg& seg = expected_don_acc_windows[r]; if (seg.points_where != POINT_DIR_BOTH) all_both = false; RefSequenceTable::Sequence* ref_str = rt.get_seq(seg.ref_id); if (!ref_str) continue; bool skip_fwd = false; bool skip_rev = false; if (library_type == FR_FIRSTSTRAND) { if (seg.read == READ_LEFT) { if (seg.antisense) skip_rev = true; else skip_fwd = true; } else if(seg.read == READ_RIGHT) { if (seg.antisense) skip_fwd = true; else skip_rev = true; } } if (library_type == FR_SECONDSTRAND) { if (seg.read == READ_LEFT) { if (seg.antisense) skip_fwd = true; else skip_rev = true; } else if(seg.read == READ_RIGHT) { if (seg.antisense) skip_rev = true; else skip_fwd = true; } } pair::iterator, bool> ret = ims.insert(make_pair(seg.ref_id, IntronMotifs(seg.ref_id))); IntronMotifs& motifs = ret.first->second; int left_color_offset = 0, right_color_offset = 0; if (color) { if (seg.antisense) right_color_offset = 1; else left_color_offset = -1; } if (seg.left + left_color_offset < 0 || seg.right + right_color_offset >= (int)length(*ref_str) - 1) continue; DnaString org_seg_str = seqan::infix(*ref_str, seg.left + left_color_offset, seg.right + 
right_color_offset); String seg_str; assign(seg_str, org_seg_str); #ifdef B_DEBUG2 cout << "coord: " << seg.left << " " << seg.right << endl; //<< "seg_str: " << seg_str << endl; #endif if (color) { bool remove_primer = true; seg_str = convert_bp_to_color(org_seg_str, remove_primer); } size_t to = 0; size_t seg_len = length(seg_str); size_t read_len = seg.support_read.size(); if (read_len <= 0) to = seg_len - 2; else to = read_len - 2; const size_t max_segment_len = 128; uint8_t left_mismatches[max_segment_len] = {0,}; uint8_t right_mismatches[max_segment_len] = {0,}; if (max_segment_len < read_len) { fprintf(stderr, "Error: read len(%d) is greater than %d\n", (int)read_len, (int)max_segment_len); exit(-1); } if (read_len == seg_len || seg.points_where == POINT_DIR_BOTH) { if (seg.points_where == POINT_DIR_RIGHT || seg.points_where == POINT_DIR_BOTH) { size_t num_mismatches = 0; for (size_t i = 0; i < read_len - 1; ++i) { if (seg_str[i] != seg.support_read[i]) ++num_mismatches; left_mismatches[i] = num_mismatches; if (num_mismatches > 2) { to = i; break; } } } if (seg.points_where == POINT_DIR_LEFT || seg.points_where == POINT_DIR_BOTH) { size_t num_mismatches = 0; for (int i = read_len - 1; i >= 0; --i) { if (seg_str[i + (seg_len - read_len)] != seg.support_read[i]) ++num_mismatches; right_mismatches[i] = num_mismatches; if (num_mismatches > 2) break; } } // daehwan #ifdef B_DEBUG2 cout << "antisense: " << (seg.antisense ? "-" : "+") << endl << seqan::infix(seg_str, 0, segment_length) << " " << seqan::infix(seg_str, length(seg_str) - segment_length, length(seg_str)) << endl << seg.support_read << endl << 0 << " - " << to << endl; for (unsigned int i = 0; i < read_len; ++i) cout << (int)left_mismatches[i]; cout << "\t"; for (unsigned int i = 0; i < read_len; ++i) cout << (int)right_mismatches[i]; cout << endl; #endif } if (seg.points_where == POINT_DIR_BOTH) { for (size_t i = 0; i <= to; ++i) { // Look at a slice of the reference without creating a copy. DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset); if ((!skip_fwd && curr == donor_dinuc) || (!skip_rev && curr == rev_acceptor_dinuc)) { DnaString partner; if (curr == donor_dinuc) partner = acceptor_dinuc; else partner = rev_donor_dinuc; uint8_t left_mismatch = 0; if (i > 0) left_mismatch = left_mismatches[i-1]; // daehwan #ifdef B_DEBUG2 cout << "i: " << i << endl << "mismatches: " << (int)left_mismatch << " - " << (int)right_mismatches[i] << endl; #endif if (left_mismatch + right_mismatches[i] <= 2) { size_t pos = length(seg_str) - (read_len - i) - 2; if (partner == seqan::infix(org_seg_str, pos - left_color_offset, pos + 2 - left_color_offset)) { if (curr == donor_dinuc) { motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); motifs.fwd_acceptors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0))); } else { motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); motifs.rev_donors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0))); } // daehwan #ifdef B_DEBUG2 cout << curr << ":" << partner << " added" << endl; #endif } } } } } else if (seg.points_where == POINT_DIR_LEFT) { // A ref segment that "points left" is one that was flanked // on the right by a partial bowtie hit, indicating that we // should be looking for an intron to the left of the hit // In this seg, that means either an "AG" or an "AC" for (size_t i = 0; i <= to; ++i) { // Look at a slice of the reference without creating a copy. 
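// (In the canonical case the caller passes donor_dinuc == "GT" and
// acceptor_dinuc == "AG"; a site is only recorded below when the dinucleotide
// found at offset i is matched by its partner at the far end of the segment,
// i.e. GT...AG on the forward strand or CT...AC for the reverse complement.)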
DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset); if (curr == acceptor_dinuc && !skip_fwd) motifs.fwd_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); else if (curr == rev_donor_dinuc && !skip_rev) motifs.rev_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); } } else { // A right pointing ref seg wants either a "GT" or a "CT" for (size_t i = 0; i <= to; ++i) { // Look at a slice of the reference without creating a copy. DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset); if (curr == donor_dinuc && !skip_fwd) motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); else if (curr == rev_acceptor_dinuc && !skip_rev) motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0))); } } } if (talkative) { fprintf(stderr, "reporting synthetic splice junctions...\n"); } for (MotifMap::iterator motif_itr = ims.begin(); motif_itr != ims.end(); ++motif_itr) { uint32_t ref_id = motif_itr->first; RefSequenceTable::Sequence* ref_str = rt.get_seq(ref_id); if (!ref_str) err_die("Error: couldn't get ref string for %u\n", ref_id); if (talkative) fprintf(stderr, "Examining donor-acceptor pairings in %s\n", rt.get_name(ref_id)); IntronMotifs& motifs = motif_itr->second; if (!all_both) motifs.unique(); //motifs.attach_mer_counts(*ref_str); motifs.attach_mers(*ref_str); vector >& fwd_donors = motifs.fwd_donors; vector >& fwd_acceptors = motifs.fwd_acceptors; vector >& rev_acceptors = motifs.rev_acceptors; vector >& rev_donors = motifs.rev_donors; //const char* ref_name = rt.get_name(motif_itr->second.ref_id); JunctionRecorder recorder; recorder.record(ref_id, fwd_donors, fwd_acceptors, false, juncs, min_intron, max_intron, max_juncs, half_splice_mer_len); recorder.record(ref_id, rev_acceptors, rev_donors, true, juncs, min_intron, max_intron, max_juncs, half_splice_mer_len); } //fprintf(stderr, "Found %d total splices\n", num_juncs); } /** * Performs a simple global alignment. * This function will perform a restricted global alignment. The restriction is that only one insertion/deletion * is allowed in the final alignment. * @param shortSequence The short sequence to be aligned. * @param leftReference The left end of the reference to be aligned, must be exactly as long as the short sequence * @param leftReference The right end of the reference to be aligned, must be exactly as long as the short sequenc * @param insertPosition This will contain the 0-based index of the first position in the shorter sequence after the insertion/deletion. A value of -1 indicates that the alignment could not be performed. * @param mismatchCount This will contain the number of mismatches in the optimal restricted global alignment. The number and length of insertions/deletions is fixed. A value of -1 indicates that the alignment could not be performed. */ void simpleSplitAlignment(seqan::String& shorterSequence, seqan::String& leftReference, seqan::String& rightReference, vector& insertPositions, int& mismatchCount) { /* * In this restricted alignment, we already know the length and number (1) of insertions/deletions. * We simply need to know where to put it. Do a linear scan through sequence counting the number of induced * errors before and after putting the insertion at each sequence. 
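 *
 * Worked example (hypothetical 4-base sequences): with shorterSequence = ACGT,
 * leftReference = ACCT and rightReference = AGGT, the scan below builds
 * afterErrors = {0,0,1,1} (cumulative mismatches against the left reference,
 * counted from the start) and beforeErrors = {1,1,0,0} (cumulative mismatches
 * against the right reference, counted from the end). The best split is insert
 * position 2, where beforeErrors[2] + afterErrors[1] == 0: the first two bases
 * align to the left reference and the last two to the right reference.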
*/ /* * Note that we could have a case, where both the alignment and the read have the unknown * nucleotide ('N') and we don't want to reward cases where these characters match */ vector beforeErrors(seqan::length(shorterSequence)); for(int idx = seqan::length(shorterSequence) - 1; idx >= 0; idx -= 1){ unsigned short prevCount = 0; /* * We guarentee idx >= 0, so cast to hide the compiler * warning here */ if(((size_t)idx) < seqan::length(shorterSequence) - 1){ prevCount = beforeErrors.at(idx + 1); } unsigned short currentMismatch = 0; if(rightReference[idx] == 'N' || shorterSequence[idx] == 'N' || rightReference[idx] != shorterSequence[idx]){ currentMismatch = 1; } beforeErrors.at(idx) = prevCount + currentMismatch; } vector afterErrors(seqan::length(shorterSequence)); for(size_t idx = 0; idx < seqan::length(shorterSequence) ; idx += 1){ unsigned short prevCount = 0; if(idx > 0){ prevCount = afterErrors.at(idx - 1); } unsigned short currentMismatch = 0; if(leftReference[idx] == 'N' || shorterSequence[idx] == 'N' || leftReference[idx] != shorterSequence[idx]){ currentMismatch = 1; } afterErrors.at(idx) = prevCount + currentMismatch; } mismatchCount = seqan::length(shorterSequence) + 1; insertPositions.clear(); /* * Technically, we could allow the insert position to be at the end or beginning of the sequence, * but we are disallowing it here */ for(size_t currentInsertPosition = 1; currentInsertPosition < seqan::length(shorterSequence); currentInsertPosition += 1){ size_t errorCount = beforeErrors.at(currentInsertPosition) + afterErrors.at(currentInsertPosition - 1); if(((int)errorCount) < mismatchCount){ mismatchCount = (int)errorCount; insertPositions.clear(); insertPositions.push_back(currentInsertPosition); } else if ((int)errorCount == mismatchCount) { insertPositions.push_back(currentInsertPosition); } } return; } /** * Try to detect a small insertion. * This code will try to identify a small insertion based on the ungapped alignment of two neighboring * segments. The general idea is to try to realign the local region, and see if we can reduce the * number of errors. Note that the function makes use of the global parameter "max_insertion_length" to limit the maximum * size of a detected insertion. * @param rt Sequence table used to lookup sequence information * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit. * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit. 
* @param insertions If an insertion is successfully detected, it will be added to this set */ void detect_small_insertion(RefSequenceTable& rt, seqan::String& read_sequence, BowtieHit& leftHit, BowtieHit& rightHit, std::set& insertions) { RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id()); if(!ref_str){ fprintf(stderr, "Error accessing sequence record\n"); }else{ size_t read_length = seqan::length(read_sequence); int begin_offset = 0; int end_offset = 0; if(color){ if(leftHit.antisense_align()) end_offset = 1; else begin_offset = -1; } if(leftHit.left() + begin_offset < 0) return; /* * If there is in fact an insertion, we expect the genomic sequence to be shorter than * the actual read sequence */ int discrepancy = read_length - (rightHit.right() - leftHit.left()); DnaString genomic_sequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, rightHit.right() + end_offset); String genomic_sequence; assign(genomic_sequence, genomic_sequence_temp); if(color) genomic_sequence = convert_bp_to_color(genomic_sequence, true); String left_read_sequence = seqan::infix(read_sequence, 0, 0 + seqan::length(genomic_sequence)); String right_read_sequence = seqan::infix(read_sequence, read_length - seqan::length(genomic_sequence), read_length); vector bestInsertPositions; int minErrors = -1; simpleSplitAlignment(genomic_sequence, left_read_sequence, right_read_sequence, bestInsertPositions, minErrors); if (bestInsertPositions.size() <= 0) return; int bestInsertPosition = bestInsertPositions[0]; /* * Need to decide whether the insertion sufficiently improves the alignment */ /* * If these two segment anchors constitute the entire read, then we require * that this alignment actually reduce the number of errors observed in the alignment. * Otherwise, it is OK as long as the number of errors doesn't increase. */ int adjustment = 0; if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){ adjustment = -1; } if(minErrors <= (leftHit.edit_dist() + rightHit.edit_dist() + adjustment) && bestInsertPosition + discrepancy <= (int)length(left_read_sequence)){ String insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy); if(color) insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence); insertions.insert(Insertion(leftHit.ref_id(), leftHit.left() + bestInsertPosition - 1 + end_offset, seqan::toCString(insertedSequence))); } } return; } /** * Try to detect a small deletion. * This code will try to identify a small deletion based on the ungapped alignment of two neighboring * segments. The general idea is to try to realign the local region, and see if we can reduce the * number of errors. Note that the function makes use of the global parameter "max_deletion_length" to limit the maximum * size of a detected deletion. * @param rt Sequence table used to lookup sequence information * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit. * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
* @param deletions If a deletion is successfully detected, it will be added to this set */ void detect_small_deletion(RefSequenceTable& rt, seqan::String& read_sequence, BowtieHit& leftHit, BowtieHit& rightHit, std::set& deletions) { RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id()); if(!ref_str){ fprintf(stderr, "Error accessing sequence record\n"); }else{ int begin_offset = 0; int end_offset = 0; if(color){ if(leftHit.antisense_align()) end_offset = 1; else begin_offset = -1; } if(leftHit.left() + begin_offset < 0) return; size_t read_length = seqan::length(read_sequence); if (rightHit.right() + begin_offset < (int)read_length) return; int discrepancy = (rightHit.right() - leftHit.left()) - read_length; Dna5String leftGenomicSequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, leftHit.left() + read_length + end_offset); Dna5String rightGenomicSequence_temp = seqan::infix(*ref_str, rightHit.right() - read_length + begin_offset, rightHit.right() + end_offset); if (length(leftGenomicSequence_temp) < read_length || length(rightGenomicSequence_temp) < read_length) return; String leftGenomicSequence; assign(leftGenomicSequence, leftGenomicSequence_temp); String rightGenomicSequence; assign(rightGenomicSequence, rightGenomicSequence_temp); if(color){ leftGenomicSequence = convert_bp_to_color(leftGenomicSequence, true); rightGenomicSequence = convert_bp_to_color(rightGenomicSequence, true); } vector bestInsertPositions; int minErrors = -1; simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors); assert (bestInsertPositions.size() > 0); int bestInsertPosition = bestInsertPositions[0]; /* * Need to decide whether the deletion sufficiently improves the alignment */ int adjustment = 0; /* * If these two segment anchors constitute the entire read, then we require * that this alignment actually reduce the number of errors observed in the alignment. * Otherwise, it is OK as long as the number of errors doesn't increase.
*/ if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){ adjustment = -1; } if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){ deletions.insert(Deletion(leftHit.ref_id(), leftHit.left() + bestInsertPosition - 1 + end_offset, leftHit.left() + bestInsertPosition + discrepancy + end_offset, false)); } } return; } void gappedAlignment(const seqan::String& read, const seqan::String& leftReference, const seqan::String& rightReference, vector& insertLeftPositions, vector& insertRightPositions, int& mismatchCount) { const Score globalScore(0, -5, -1, -10); // (match, mismatch, gapextend, gapopen) Align > align; appendValue(rows(align), read); String genomicSequence; assign(genomicSequence, leftReference); append(genomicSequence, rightReference); appendValue(rows(align), genomicSequence); // int score = globalAlignment(align, globalScore); Row > >::Type& row0 = row(align, 0); Row > >::Type& row1 = row(align, 1); // find gap whose length >= read_len - 10 int start_in_align = -1, end_in_align = -1; int start_in_ref = -1, end_in_ref = -1; int temp_start = -1; int ref_pos = 0; int gap = 0; mismatchCount = 0; int len = length(row0); for (int i = 0; i < len; ++i) { if (row0[i] == '-') { if (temp_start < 0) temp_start = i; } else if (row1[i] != '-') { if (temp_start >= 0) { if (i - temp_start > end_in_align - start_in_align) { end_in_align = i; start_in_align = temp_start; end_in_ref = ref_pos; start_in_ref = ref_pos - (i - temp_start); temp_start = -1; } } if (row0[i] != row1[i]) ++mismatchCount; } if (row0[i] == '-' || row1[i] == '-') ++gap; if (row1[i] != '-') ++ref_pos; } // assume the lengths of read, leftReference, and rightReference are all equal. const int max_gap = end_in_align - start_in_align; if (max_gap < (int)length(read) - 10) return; if (start_in_ref < 0) return; insertLeftPositions.push_back(start_in_ref); insertRightPositions.push_back(end_in_ref - length(leftReference)); #if B_DEBUG if (gap - max_gap >= 0) { cerr << "Score = " << score << endl; cerr << row(align, 0) << endl << row(align, 1) << endl; cerr << "len: " << len << ", gap: " << gap << ", max_gap: " << max_gap << "(" << start_in_align << ", " << end_in_align << "), ref (" << start_in_ref << ", " << end_in_ref << "), mismatch: " << mismatchCount << endl; if (gap - max_gap > 0) cerr << "daehwan" << endl; } #endif } void detect_fusion(RefSequenceTable& rt, seqan::String& read_sequence, BowtieHit& leftHit, BowtieHit& rightHit, FusionSimpleSet& fusions, uint32_t dir) { RefSequenceTable::Sequence* left_ref_str = rt.get_seq(leftHit.ref_id()); RefSequenceTable::Sequence* right_ref_str = rt.get_seq(rightHit.ref_id()); int read_length = (int)seqan::length(read_sequence); Dna5String leftGenomicSequence_temp; Dna5String rightGenomicSequence_temp; if (dir == FUSION_FF || dir == FUSION_FR) { if (leftHit.left() + read_length > (int)seqan::length(*left_ref_str)) return; leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.left(), leftHit.left() + read_length); } else { if (leftHit.right() < read_length) return; leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.right() - read_length, leftHit.right()); seqan::reverseComplement(leftGenomicSequence_temp); } if (dir == FUSION_FF || dir == FUSION_RF) { if (rightHit.right() < read_length) return; rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.right() - read_length, rightHit.right()); } else { if (rightHit.left() + read_length > (int)seqan::length(*right_ref_str)) return; rightGenomicSequence_temp = seqan::infix(*right_ref_str, 
rightHit.left(), rightHit.left() + read_length); seqan::reverseComplement(rightGenomicSequence_temp); } String leftGenomicSequence; assign(leftGenomicSequence, leftGenomicSequence_temp); String rightGenomicSequence; assign(rightGenomicSequence, rightGenomicSequence_temp); vector bestLeftInsertPositions; vector bestRightInsertPositions; int minErrors = -1; // todo - we need to do (efficient) Smith-Waterman Alignment using SIMD like the way Bowtie2 does! // too slow and too many false positives // daehwan - turn off this for now. if (bowtie2 && false) gappedAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestLeftInsertPositions, bestRightInsertPositions, minErrors); else simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestLeftInsertPositions, minErrors); uint32_t total_edit_dist = leftHit.edit_dist() + rightHit.edit_dist(); if (minErrors > (int)total_edit_dist) return; #if 1 if (minErrors > 2) return; for (size_t i = 0; i < bestLeftInsertPositions.size(); ++i) { const int left = bestLeftInsertPositions[i]; if (left < (int)fusion_anchor_length) return; const int right = bestRightInsertPositions.size() > i ? bestRightInsertPositions[i] : left; if (length(rightGenomicSequence) - right < fusion_anchor_length) return; } // daehwan - this is very slow - the older version of "difference" is way faster #else if (read_length <= 60) { /* * check if the two contig from two different chromosome are different enough */ const Score globalScore(0, -1, -2, -2); Align > align; appendValue(rows(align), read_sequence); appendValue(rows(align), leftGenomicSequence); int score = globalAlignment(align, globalScore); assignSource(row(align, 0), read_sequence); assignSource(row(align, 1), rightGenomicSequence); score = max(score, globalAlignment(align, globalScore)); if (abs(score) < read_length / 6) return; } #endif for (size_t i = 0; i < bestLeftInsertPositions.size(); ++i) { int bestLeftInsertPosition = bestLeftInsertPositions[i]; int bestRightInsertPosition = bestLeftInsertPosition; if (bestRightInsertPositions.size() > i) bestRightInsertPosition = bestRightInsertPositions[i]; uint32_t left, right; if (dir == FUSION_FF || dir == FUSION_FR) left = leftHit.left() + bestLeftInsertPosition - 1; else left = leftHit.right() - bestLeftInsertPosition; if (dir == FUSION_FF || dir == FUSION_RF) right = rightHit.right() - (read_length - bestRightInsertPosition); else right = rightHit.left() + (read_length - bestRightInsertPosition) - 1; uint32_t ref_id1 = leftHit.ref_id(); uint32_t ref_id2 = rightHit.ref_id(); #if B_DEBUG cerr << endl << endl << "read id: " << leftHit.insert_id() << endl << "dir: " << dir << ", sense: " << (leftHit.antisense_align() ? 
"-" : "+") << endl << "left ref_id: " << rt.get_name(leftHit.ref_id()) << "\tright ref_id: " << rt.get_name(rightHit.ref_id()) << endl << read_sequence << endl << leftGenomicSequence << "\t" << rightGenomicSequence << endl << "insertion pos: " << bestLeftInsertPosition << endl; if (bowtie2) { Dna5String left_sequence; if (dir == FUSION_FF || dir == FUSION_FR) left_sequence = seqan::infix(*left_ref_str, leftHit.left(), left + 1); else { left_sequence = seqan::infix(*left_ref_str, left, leftHit.right()); seqan::reverseComplement(left_sequence); } Dna5String right_sequence; if (dir == FUSION_FF || dir == FUSION_RF) right_sequence = seqan::infix(*right_ref_str, right, rightHit.right()); else { right_sequence = seqan::infix(*right_ref_str, rightHit.left(), right + 1); seqan::reverseComplement(right_sequence); } cerr << "right insertion pos: " << bestRightInsertPosition << endl << left_sequence << "\t" << right_sequence << endl; } cerr << left << ":" << right << endl << "errors: " << minErrors << endl; #endif uint32_t temp_dir = dir; if ((ref_id2 < ref_id1) || (ref_id1 == ref_id2 && left > right)) { uint32_t temp = ref_id1; ref_id1 = ref_id2; ref_id2 = temp; temp = left; left = right; right = temp; if (dir == FUSION_FF) temp_dir = FUSION_RR; } Fusion fusion(ref_id1, ref_id2, left, right, temp_dir); FusionSimpleSet::iterator itr = fusions.find(fusion); if (itr == fusions.end()) { FusionSimpleStat simpleStat; simpleStat.count = 1; simpleStat.edit_dist = total_edit_dist; simpleStat.skip = false; simpleStat.left_coincide_with_splice_junction = false; simpleStat.right_coincide_with_splice_junction = false; fusions[fusion] = simpleStat; } else { itr->second.count += 1; itr->second.edit_dist = min(itr->second.edit_dist, total_edit_dist); } } } void find_insertions_and_deletions(RefSequenceTable& rt, ReadStream& reads_file, vector& hits_for_read, std::set& deletions, std::set& insertions){ if (hits_for_read.empty()) return; size_t last_segment = hits_for_read.size() - 1; size_t first_segment = 0; if (last_segment == first_segment) return; uint64_t insert_id = 0; for (size_t i = 0; i < hits_for_read.size(); ++i) { if (hits_for_read[i].insert_id != 0) { insert_id = hits_for_read[i].insert_id; break; } } if (insert_id == 0) return; /* * We can check up front whether the first or last element is empty * and avoid doing any more work. Note that the following code requires * that there be at least one elment in each */ #if 0 if (hits_for_read[first_segment].hits.empty() || hits_for_read[last_segment].hits.empty()) return; #endif /* * Need to identify the appropriate insert id for this group of reads */ Read read; bool got_read = reads_file.getRead(insert_id, read); if (!got_read) { err_die("Error: could not get read# %d from stream!", (int)insert_id); return; } for (size_t i = 0; i < hits_for_read.size() - 2; ++i) { /* * Work through all combinations of mappings for the first and last segment to see if any are indicative * of a small insertions or deletion */ HitsForRead& left_segment_hits = hits_for_read[i]; HitsForRead& right_segment_hits = hits_for_read[i+1]; /* * If either of the segment match lists is empty, we could try * to be smarter and work our way in until we find good a segment * match; however, won't do that for noe. 
*/ if (left_segment_hits.hits.empty() || right_segment_hits.hits.empty()) return; seqan::String fullRead, rcRead; if (color) { fullRead = read.seq.substr(1 + i * segment_length, 2 * segment_length); rcRead = fullRead; seqan::reverse(rcRead); } else { fullRead = read.seq.substr(i * segment_length, 2 * segment_length); rcRead = fullRead; seqan::reverseComplement(rcRead); } size_t partial_read_length = seqan::length(fullRead); for (size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); left_segment_index++) { for (size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); right_segment_index++) { BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index]; BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index]; /* * Now we have found a pair of segment hits to investigate. Need to ensure * that * 1. the alignment orientation is consistent * 2. the distance separation is in the appropriate range * 3. Both hits are aligned to the same contig */ if (leftHit->ref_id() != rightHit->ref_id()) continue; if (leftHit->antisense_align() != rightHit->antisense_align()) continue; seqan::String* modifiedRead = &fullRead; /* * If we are dealing with an antisense alignment, then the left * read will actually be on the right, fix this now, to simplify * the rest of the logic, in addition, we will need to use the reverse * complement of the read sequence */ if (leftHit->antisense_align()) { BowtieHit * tmp = leftHit; leftHit = rightHit; rightHit = tmp; modifiedRead = &rcRead; } int apparent_length = rightHit->right() - leftHit->left(); int length_discrepancy = apparent_length - partial_read_length; if (length_discrepancy > 0 && length_discrepancy <= (int)max_deletion_length) { /* * Search for a deletion */ detect_small_deletion(rt, *modifiedRead, *leftHit, *rightHit, deletions); } if(length_discrepancy < 0 && length_discrepancy >= -(int)max_insertion_length) { /* * Search for an insertion */ detect_small_insertion(rt, *modifiedRead, *leftHit, *rightHit, insertions); } } } } } /* */ int map_read_to_contig(const String& contig, const String& read) { int contig_len = length(contig); int read_len = length(read); int pos = -1; int mismatch = 3; for (int i = 0; i < contig_len - read_len; ++i) { int temp_mismatch = 0; for (int j = 0; j < read_len; ++j) { if (contig[i+j] != read[j]) ++temp_mismatch; if (temp_mismatch >= mismatch) break; } if (temp_mismatch < mismatch) { pos = i; mismatch = temp_mismatch; } } return pos; } void find_fusions(RefSequenceTable& rt, ReadStream& reads_file, vector& hits_for_read, HitStream& partner_hit_stream, HitStream& seg_partner_hit_stream, FusionSimpleSet& fusions, eREAD read_side) { if (hits_for_read.empty()) return; size_t last_segment = hits_for_read.size() - 1; while (last_segment > 0) { if (!hits_for_read[last_segment].hits.empty()) break; --last_segment; } // daehwan #if 0 if (last_segment == 0 || hits_for_read[0].hits.empty()) { vector& hits = last_segment == 0 ? 
hits_for_read[0].hits : hits_for_read[1].hits; for (size_t i = 0; i < hits.size(); ++i) { BowtieHit* hit = &hits[i]; static const uint32_t chr_id1 = rt.get_id("chr2"); static const uint32_t chr_id2 = rt.get_id("chr3"); // KPL-4 PPP1R12A 12:80167343-80329235:-1 SEPT10 2:110300380-110371783:-1 // const uint32_t left1 = 80167343, right1 = 80329235, left2 = 110300380, right2 = 110371783; // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1 const uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986; if ((hit->ref_id() == chr_id1 && hit->left() >= left1 && hit->left() <= right1) || (hit->ref_id() == chr_id2 && hit->left() >= left2 && hit->left() <= right2)) { cout << hit->insert_id() << endl; break; #if 0 cout << "insert id: " << hit->insert_id() << "\t num hits: " << hits.size() << endl << hit->ref_id() << ":" << (hit->antisense_align() ? "-" : "+") << " " << (int)hit->edit_dist() << endl << hit->left() << "-" << hit->right() << endl << hit->seq() << endl << endl; #endif } } } #endif size_t first_segment = 0; if (last_segment == first_segment && (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end())) return; uint32_t insert_id = hits_for_read[last_segment].insert_id; HitsForRead partner_hit_group; uint32_t next_order = partner_hit_stream.next_group_id(); bool has_partner = false; while (insert_id >= next_order && next_order != 0) { partner_hit_stream.next_read_hits(partner_hit_group); next_order = partner_hit_stream.next_group_id(); } has_partner = insert_id == partner_hit_group.insert_id; if (!has_partner) { next_order = seg_partner_hit_stream.next_group_id(); while (insert_id >= next_order && next_order != 0) { seg_partner_hit_stream.next_read_hits(partner_hit_group); next_order = seg_partner_hit_stream.next_group_id(); } has_partner = insert_id == partner_hit_group.insert_id; } /* * Need to identify the appropriate insert id for this group of reads */ Read read; bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read); if (!got_read) return; HitsForRead partner_hits_for_read; if (first_segment != last_segment) partner_hits_for_read = hits_for_read[last_segment]; HitsForRead& left_segment_hits = hits_for_read[first_segment]; HitsForRead& right_segment_hits = partner_hits_for_read; seqan::String fullRead, rcRead; fullRead = read.seq; rcRead = read.seq; seqan::reverseComplement(rcRead); size_t read_length = seqan::length(fullRead); bool check_partner = true; if (first_segment != last_segment) { for (size_t i = 0; i < left_segment_hits.hits.size(); ++i) { BowtieHit& leftHit = left_segment_hits.hits[i]; for (size_t j = 0; j < right_segment_hits.hits.size(); ++j) { BowtieHit& rightHit = right_segment_hits.hits[j]; if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align()) { int dist = 0; if (leftHit.antisense_align()) dist = leftHit.left() - rightHit.right(); else dist = rightHit.left() - leftHit.right(); if (dist > -(int)max_insertion_length && dist <= (int)fusion_min_dist) { check_partner = false; break; } } } if (!check_partner) break; } } const int minus_dist = -(int)max_insertion_length * 2; if (check_partner && has_partner) { for (size_t l = 0; l < left_segment_hits.hits.size(); ++l) { BowtieHit& leftHit = left_segment_hits.hits[l]; for (size_t r = 0; r < partner_hit_group.hits.size(); ++r) { BowtieHit& rightHit = partner_hit_group.hits[r]; if (leftHit.ref_id() == rightHit.ref_id()) { if (leftHit.antisense_align() != rightHit.antisense_align()) { 
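// Same chromosome, opposite strands: if the two hits fall within the regular
// mate-pair distance (<= fusion_min_dist), they are skipped below as unlikely
// fusion evidence.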
int dist = 0; if (leftHit.antisense_align()) dist = leftHit.left() - rightHit.right(); else dist = rightHit.left() - leftHit.right(); if (dist > minus_dist && dist <= (int)fusion_min_dist) continue; } } RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id()); const int part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0; const int flanking_seq_len = inner_dist_mean + inner_dist_std_dev; Dna5String right_flanking_seq; size_t left = 0; if (rightHit.antisense_align()) { if (flanking_seq_len <= rightHit.left()) { left = rightHit.left() - flanking_seq_len; right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len); } else break; } else { if (part_seq_len <= rightHit.right()) { left = rightHit.right() - part_seq_len; right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len); } else break; } const size_t check_read_len = min(15, segment_length - segment_mismatches - 3); seqan::String fwd_read = infix(fullRead, read_length - check_read_len, read_length); seqan::String rev_read = infix(rcRead, 0, check_read_len); int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read); if (fwd_pos >= 0) { BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(), left + fwd_pos, check_read_len, false, 0, 0, true); right_segment_hits.hits.push_back(hit); } int rev_pos = map_read_to_contig(right_flanking_seq, rev_read); if (rev_pos >= 0) { BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(), left + rev_pos, check_read_len, true, 0, 0, true); right_segment_hits.hits.push_back(hit); } // daehwan #if 0 if (fwd_pos >= 0 || rev_pos >= 0) { // if (leftHit.insert_id() == 409048 || leftHit.insert_id() == 4341516) { cout << "insert id: " << leftHit.insert_id() << endl << "fwd: " << fwd_read << " " << fwd_pos << endl << "rev: " << rev_read << " " << rev_pos << endl << "ref: " << right_flanking_seq << endl; } } #endif } } } static std::set ignore_chromosomes; if (ignore_chromosomes.size() <= 0 && fusion_ignore_chromosomes.size() > 0) { for (size_t i = 0; i < fusion_ignore_chromosomes.size(); ++i) ignore_chromosomes.insert(rt.get_id(fusion_ignore_chromosomes[i])); } for (size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); ++left_segment_index) { for (size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); ++right_segment_index) { BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index]; BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index]; if (ignore_chromosomes.find(leftHit->ref_id()) != ignore_chromosomes.end() || ignore_chromosomes.find(rightHit->ref_id()) != ignore_chromosomes.end()) continue; if (bowtie2) { if (leftHit->edit_dist() + rightHit->edit_dist() > (segment_mismatches << 1)) continue; } // daehwan #if 0 if (leftHit->ref_id() == rightHit->ref_id() && leftHit->ref_id() == 1119738090) { const uint32_t left1 = 113951556, right1 = 113977987, left2 = 113548692, right2 = 113754053; if ((leftHit->left() >= left1 && leftHit->left() <= right1 && rightHit->left() >= left2 && rightHit->left() <= right2) || (leftHit->left() >= left2 && leftHit->left() <= right2 && rightHit->left() >= left1 && rightHit->left() <= right1)) { cout << "insert id: " << leftHit->insert_id() << "\t num hits: " << left_segment_hits.hits.size() << ":" << right_segment_hits.hits.size() << endl << leftHit->ref_id() << ":" << (leftHit->antisense_align() ? 
"-" : "+") << " " << (int)leftHit->edit_dist() <left() << "-" << leftHit->right() << endl << rightHit->ref_id() << ":" << (rightHit->antisense_align() ? "-" : "+") << " " << (int)rightHit->edit_dist() << endl << rightHit->left() << "-" << rightHit->right() << endl << endl; } } #endif if (leftHit->ref_id() == rightHit->ref_id()) { if (leftHit->antisense_align() == rightHit->antisense_align()) { int dist = 0; if (leftHit->antisense_align()) dist = leftHit->left() - rightHit->right(); else dist = rightHit->left() - leftHit->right(); if (dist > minus_dist && dist <= (int)fusion_min_dist) continue; } } uint32_t dir = FUSION_FF; seqan::String* modifiedRead = &fullRead; if (leftHit->antisense_align() == rightHit->antisense_align()) { if (leftHit->antisense_align()) { BowtieHit * tmp = leftHit; leftHit = rightHit; rightHit = tmp; modifiedRead = &rcRead; } } else if (leftHit->antisense_align() == false && rightHit->antisense_align() == true) dir = FUSION_FR; else dir = FUSION_RF; detect_fusion(rt, *modifiedRead, *leftHit, *rightHit, fusions, dir); } } } void find_gaps(RefSequenceTable& rt, ReadStream& reads_file, vector& hits_for_read, HitStream& partner_hit_stream, HitStream& seg_partner_hit_stream, std::set& seg_juncs, eREAD read_side) { if (hits_for_read.empty()) return; size_t last_segment = hits_for_read.size() - 1; while (last_segment > 0) { if (!hits_for_read[last_segment].hits.empty()) break; --last_segment; } hits_for_read.resize(last_segment + 1); size_t first_segment = 0; if (last_segment == first_segment && (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end())) return; uint32_t insert_id = hits_for_read[last_segment].insert_id; HitsForRead partner_hit_group; uint32_t next_order = partner_hit_stream.next_group_id(); bool has_partner = false; while (insert_id >= next_order && next_order != 0) { partner_hit_stream.next_read_hits(partner_hit_group); next_order = partner_hit_stream.next_group_id(); } has_partner = insert_id == partner_hit_group.insert_id; if (!has_partner) { next_order = seg_partner_hit_stream.next_group_id(); while (insert_id >= next_order && next_order != 0) { seg_partner_hit_stream.next_read_hits(partner_hit_group); next_order = seg_partner_hit_stream.next_group_id(); } has_partner = insert_id == partner_hit_group.insert_id; } Read read; bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read); if (!got_read) { err_die("Error: could not get read# %d from stream!", (int)hits_for_read[last_segment].insert_id); return; } HitsForRead partner_hits_for_read; if (first_segment != last_segment) partner_hits_for_read = hits_for_read[last_segment]; HitsForRead& left_segment_hits = hits_for_read[first_segment]; HitsForRead& right_segment_hits = partner_hits_for_read; bool check_partner = true; if (first_segment != last_segment) { for (size_t i = 0; i < left_segment_hits.hits.size(); ++i) { BowtieHit& leftHit = left_segment_hits.hits[i]; for (size_t j = 0; j < right_segment_hits.hits.size(); ++j) { BowtieHit& rightHit = right_segment_hits.hits[j]; if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align()) { int dist = 0; if (leftHit.antisense_align()) dist = leftHit.left() - rightHit.right(); else dist = rightHit.left() - leftHit.right(); if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length) { check_partner = false; break; } } } if (!check_partner) break; } } if (check_partner && has_partner) { // empty hits from 1 to num_segments - 1 for (size_t i = 
first_segment + 1; i < hits_for_read.size(); ++i) { hits_for_read[i].hits.clear(); } seqan::String fullRead, rcRead; fullRead = read.seq; rcRead = read.seq; seqan::reverseComplement(rcRead); size_t read_length = read.seq.length(); for (size_t l = 0; l < left_segment_hits.hits.size(); ++l) { BowtieHit& leftHit = left_segment_hits.hits[l]; for (size_t r = 0; r < partner_hit_group.hits.size(); ++r) { BowtieHit& rightHit = partner_hit_group.hits[r]; if (leftHit.ref_id() != rightHit.ref_id() || leftHit.antisense_align() == rightHit.antisense_align()) continue; int dist = 0; if (leftHit.antisense_align()) dist = leftHit.left() - rightHit.right(); else dist = rightHit.left() - leftHit.right(); if (dist < min_segment_intron_length && dist >= (int)max_segment_intron_length) continue; RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id()); const int part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0; const int flanking_seq_len = inner_dist_mean + inner_dist_std_dev; Dna5String right_flanking_seq; size_t left = 0; if (rightHit.antisense_align()) { if (flanking_seq_len <= rightHit.left()) { left = rightHit.left() - flanking_seq_len; right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len); } else break; } else { if (part_seq_len <= rightHit.right()) { left = rightHit.right() - part_seq_len; right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len); } else break; } const size_t check_read_len = min(15, segment_length - segment_mismatches - 3); seqan::String fwd_read = infix(fullRead, read_length - check_read_len, read_length); seqan::String rev_read = infix(rcRead, 0, check_read_len); int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read); if (fwd_pos >= 0) { BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(), left + fwd_pos, check_read_len, false, 0, 0, true); hits_for_read[last_segment].hits.push_back(hit); } int rev_pos = map_read_to_contig(right_flanking_seq, rev_read); if (rev_pos >= 0) { BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(), left + rev_pos, check_read_len, true, 0, 0, true); hits_for_read[last_segment].hits.push_back(hit); } // daehwan - for debug purposes #if B_DEBUG cerr << "daehwan!!!" << endl << "insert id: " << rightHit.insert_id() << endl << "first segment: " << first_segment << ", last_segment: " << last_segment << endl << right_flanking_seq << " : " << seqan::length(right_flanking_seq) << endl << fwd_read << " : " << fwd_pos << endl << rev_read << " : " << rev_pos << endl << "left: " << leftHit.left() << "-" << leftHit.right() << (leftHit.antisense_align() ? " -" : " +") << endl << "right: " << rightHit.left() << "-" << rightHit.right() << (rightHit.antisense_align() ? " -" : " +") << endl; if (fwd_pos >= 0 || rev_pos >= 0) { const BowtieHit& hit = hits_for_read[last_segment].hits.back(); cerr << "back: " << hit.left() << "-" << hit.right() << (hit.antisense_align() ? " -" : " +") << endl; } #endif } } } vector expected_don_acc_windows; string seq(read.seq); // ignore segments that map to more than this many places that would otherwise produce // many false splice junctions. 
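/* Illustrative sketch, not part of the original TopHat source. The Bowtie2
 * code path that follows simply drops a read whenever one of its segments
 * maps to more than max_seg_multihits locations, since highly repetitive
 * segments would otherwise seed many false splice junctions. The helper
 * below restates that filter in isolation; the function name is
 * hypothetical, while HitsForRead and max_seg_multihits come from the
 * surrounding code. It is fenced with #if 0 so it stays inert. */
#if 0
// Hypothetical sketch: true if every segment of this read maps to at most
// 'cap' locations; find_gaps() ignores the read when this is false.
static bool segments_within_multihit_cap(const std::vector<HitsForRead>& segs,
                                         size_t cap)
{
  for (size_t s = 0; s < segs.size(); ++s) {
    if (segs[s].hits.size() > cap)
      return false; // too many candidate placements -- skip this read
  }
  return true;
}
#endif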
if (bowtie2) { for (size_t s = 0; s < hits_for_read.size(); ++s) { if (hits_for_read[s].hits.size() > max_seg_multihits) return; } } for (size_t s = 0; s < hits_for_read.size(); ++s) { HitsForRead& curr = hits_for_read[s]; for (size_t h = 0; h < curr.hits.size(); ++h) { bool found_right_seg_partner = s == hits_for_read.size() - 1; BowtieHit& bh = curr.hits[h]; // "drs" is distant seg right partner // "rrs" is right of right seg partner vector drs_bhs; vector rrs_bhs; if (s < hits_for_read.size() - 1) { // Look for a right partner for the current hit HitsForRead& right = hits_for_read[s + 1]; for (size_t r = 0; r < right.hits.size(); ++r) { BowtieHit& rh = right.hits[r]; if (bh.antisense_align() != rh.antisense_align() || bh.ref_id() != rh.ref_id()) continue; if ((bh.antisense_align() && rh.right() == bh.left()) || (!bh.antisense_align() && bh.right() == rh.left() )) { found_right_seg_partner = true; break; } int dist = 0; if (bh.antisense_align()) dist = bh.left() - rh.right(); else dist = rh.left() - bh.right(); if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length) drs_bhs.push_back(&rh); } } if (!found_right_seg_partner && s < hits_for_read.size() - 2) { // Look for a right of right partner for the current hit HitsForRead& right_right = hits_for_read[s + 2]; for (size_t r = 0; r < right_right.hits.size(); ++r) { BowtieHit& rrh = right_right.hits[r]; if (bh.antisense_align() != rrh.antisense_align() || bh.ref_id() != rrh.ref_id()) continue; int dist = 0; if (bh.antisense_align()) dist = bh.left() - rrh.right(); else dist = rrh.left() - bh.right(); if (dist >= min_segment_intron_length + segment_length && dist < (int)max_segment_intron_length + segment_length) rrs_bhs.push_back(&rrh); } } if (!found_right_seg_partner && (drs_bhs.size() > 0 || rrs_bhs.size() > 0)) { const int look_bp = 8; const size_t color_offset = color ? 1 : 0; vector d_bhs = rrs_bhs.size() > 0 ? rrs_bhs : drs_bhs; for (size_t r = 0; r < d_bhs.size(); ++r) { string support_read; if (rrs_bhs.size() <= 0) support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, look_bp * 2); else support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, segment_length + look_bp * 2); BowtieHit& d_bh = *(d_bhs[r]); if (!bh.antisense_align()) { RefSeg right_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read); right_seg.left = max(0, bh.right() - look_bp); right_seg.right = d_bh.left() + look_bp; expected_don_acc_windows.push_back(right_seg); } else { if (color) reverse(support_read.begin(), support_read.end()); else reverse_complement(support_read); RefSeg left_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read); left_seg.left = d_bh.right() - look_bp; left_seg.right = bh.left() + look_bp; expected_don_acc_windows.push_back(left_seg); } // daehwan #ifdef B_DEBUG2 cout << "insert id: " << bh.insert_id() << endl << (bh.antisense_align() ? 
"-" : "+") << endl << seq << endl << "(" << s << ") - " << support_read << endl; #endif } } } } //for each hits_for_read juncs_from_ref_segs(rt, expected_don_acc_windows, seg_juncs, "GT", "AG", max_segment_intron_length, min_segment_intron_length, max_seg_juncs, false, 0); juncs_from_ref_segs(rt, expected_don_acc_windows, seg_juncs, "GC", "AG", max_segment_intron_length, min_segment_intron_length, max_seg_juncs, false, 0); juncs_from_ref_segs(rt, expected_don_acc_windows, seg_juncs, "AT", "AC", max_segment_intron_length, min_segment_intron_length, max_seg_juncs, false, 0); } MerTable mer_table; int seed_alignments = 0; int microaligned_segs = 0; map* > microexon_windows; bool overlap_in_genome(int ll, int lr, int rl, int rr) { if (ll >= rl && ll < rr) return true; if (lr > rl && lr < rr) return true; if (rl >= ll && rl < lr) return true; if (rr > ll && rr < lr) return true; return false; } void add_to_microexon_windows(uint32_t ref_id, int left_boundary, int right_boundary, const string& dna_str, eREAD read) { RefSeg left_dummy(ref_id, POINT_DIR_DONTCARE, false, read, left_boundary, right_boundary); RefSeg right_dummy(ref_id, POINT_DIR_DONTCARE, false, read, right_boundary, right_boundary + 1); map* >::iterator lb = microexon_windows.lower_bound(left_dummy); map* >::iterator ub = microexon_windows.lower_bound(right_dummy); vector* new_vec = NULL; if (lb == microexon_windows.end()) { microexon_windows.insert(make_pair(left_dummy, new vector(1, dna_str))); return; } map* >::iterator first_to_be_erased = microexon_windows.end(); map* >::iterator last_to_be_erased = ub; while (lb != ub) { // everyone in this range that overlaps with the new interval needs to // be merged together. if (overlap_in_genome(lb->first.left, lb->first.right, left_boundary, right_boundary)) { if (!new_vec) new_vec = new vector(); if (first_to_be_erased == microexon_windows.end()) first_to_be_erased = lb; left_dummy.left = min(lb->first.left, left_boundary); left_dummy.right = max(lb->first.right, right_boundary); new_vec->insert(new_vec->end(), lb->second->begin(), lb->second->end()); delete lb->second; } else if (first_to_be_erased != microexon_windows.end()) { last_to_be_erased = lb; } ++lb; } if (first_to_be_erased != microexon_windows.end()) { microexon_windows.erase(first_to_be_erased, last_to_be_erased); } if (!new_vec) { // never found an overlapping window, so just add this one and bail microexon_windows.insert(make_pair(left_dummy, new vector(1, dna_str))); return; } else { new_vec->push_back(dna_str); microexon_windows.insert(make_pair(left_dummy, new_vec)); return; } } void align_microexon_segs(RefSequenceTable& rt, std::set& juncs, int max_juncs, int half_splice_mer_len) { int num_segments = 0; for (map* >::iterator itr = microexon_windows.begin(); itr != microexon_windows.end(); ++itr) { vector& unaligned_segments = *itr->second; num_segments += unaligned_segments.size(); } fprintf(stderr, "Aligning %d microexon segments in %lu windows\n", num_segments, (long unsigned int)microexon_windows.size()); extensions.clear(); size_t splice_mer_len = 2 * half_splice_mer_len; size_t mer_table_size = 1 << ((splice_mer_len)<<1); extensions.resize(mer_table_size); int window_num = 0; for (map* >::iterator itr = microexon_windows.begin(); itr != microexon_windows.end(); ++itr) { window_num++; if ((window_num % 100) == 0) fprintf(stderr, "\twindow %d\n",window_num); stringstream ss(stringstream::in | stringstream::out); for (size_t j = 0; j < extensions.size(); ++j) { extensions[j].clear(); } vector& unaligned_segments 
= *itr->second; for (size_t j = 0; j < unaligned_segments.size(); ++j) { stringstream ss(stringstream::in | stringstream::out); string s; //cerr << w.unaligned_segments[j]; ss << unaligned_segments[j]; ss >> s; store_read_extensions(extensions, half_splice_mer_len, half_splice_mer_len, s, false); } vector segs; segs.push_back(itr->first); RefSeg r = itr->first; r.points_where = POINT_DIR_LEFT; segs.push_back(r); juncs_from_ref_segs(rt, segs, juncs, "GT", "AG", max_microexon_stretch, min_coverage_intron_length, max_juncs, false, half_splice_mer_len); num_segments += unaligned_segments.size(); delete itr->second; } fprintf(stderr, "Checked %d segments against %lu windows for microexon junctions\n", num_segments, (long unsigned int)microexon_windows.size()); fprintf(stderr, "Found %ld potential microexon junctions\n", (long int)juncs.size()); } /* * Easy guys ... this function puts its pants on just like the rest of you -- one leg * at a time. Except, once its pants are on, it makes gold records. Alright, here we * go. */ void look_for_hit_group(RefSequenceTable& rt, ReadStream& readstream, ReadStream& readstream_for_segment_search, ReadStream& readstream_for_indel_discovery, ReadStream& readstream_for_fusion_discovery, ReadTable& unmapped_reads, vector& seg_files, int curr_file, uint64_t insert_id, vector& hits_for_read, //<-- collecting segment hits for this read HitStream& partner_hit_stream_for_segment_search, HitStream& seg_partner_hit_stream_for_segment_search, HitStream& partner_hit_stream_for_fusion_discovery, HitStream& seg_partner_hit_stream_for_fusion_discovery, std::set& juncs, std::set& deletions, std::set& insertions, FusionSimpleSet& fusions, eREAD read_side, uint32_t begin_id, uint32_t end_id) { HitStream& hit_stream = seg_files[curr_file]; HitsForRead hit_group; uint32_t order = unmapped_reads.observation_order(insert_id); int seq_key_len = min((int)min_anchor_len, 6); while(true) { uint64_t next_group_id = hit_stream.next_group_id(); uint32_t next_order = unmapped_reads.observation_order(next_group_id); // If we would have seen the hits by now, stop looking in this stream, // but forward the search to the next (lower) segment if possible. if (order < next_order || next_group_id == 0) { if (curr_file > 0) { //look for next (lower) segment mappings for this read look_for_hit_group(rt, readstream, readstream_for_segment_search, readstream_for_indel_discovery, readstream_for_fusion_discovery, unmapped_reads, seg_files, curr_file - 1, insert_id, hits_for_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, juncs, deletions, insertions, fusions, read_side, begin_id, end_id); } else if (insert_id && !no_microexon_search) { //microexon search Read read; // The hits are missing for the leftmost segment, which means // we should try looking for junctions via seed and extend // using it (helps find junctions to microexons). bool got_read = readstream.getRead(insert_id, read); if (!got_read) { //fprintf(stderr, "Warning: could not get read with insert_id=%d\n", (int)insert_id); //break; //exit loop err_die("Error: could not get read with insert_id=%d from file %s\n", (int)insert_id, readstream.filename()); } string fwd_read = read.seq; if (color) // remove the primer and the adjacent color fwd_read.erase(0, 2); // make sure there are hits for all the other segs, all the // way to the root (right-most) one. 
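/* Illustrative sketch, not part of the original TopHat source. The
 * bookkeeping that follows records the right-most segment index that has no
 * hits; microexon search is attempted only when the left-most segment is the
 * sole unmapped one (empty_seg == 0). The helper below is a hypothetical
 * restatement of that check, reusing the HitsForRead type from the
 * surrounding code, and is fenced with #if 0 so it stays inert. */
#if 0
// Hypothetical sketch: index of the right-most segment without hits
// (0 when segments 1..n-1 all have hits), mirroring the empty_seg loop below.
static size_t rightmost_empty_segment(const std::vector<HitsForRead>& segs)
{
  size_t empty_seg = 0;
  for (size_t h = 1; h < segs.size(); ++h) {
    if (segs[h].hits.empty())
      empty_seg = h;
  }
  return empty_seg;
}
#endif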
int empty_seg = 0; for (size_t h = 1; h < hits_for_read.size(); ++h) { if (hits_for_read[h].hits.empty()) empty_seg = h; } // if not, no microexon alignment for this segment if (empty_seg != 0) break; fwd_read = fwd_read.substr(0, segment_length); string rev_read = fwd_read; //check the reverse if (color) reverse(rev_read.begin(), rev_read.end()); else reverse_complement(rev_read); for (size_t h = 0; h < hits_for_read[empty_seg + 1].hits.size(); ++h) { const BowtieHit& bh = hits_for_read[empty_seg + 1].hits[h]; RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id()); if (ref_str == NULL) continue; int ref_len = length(*ref_str); int left_boundary; int right_boundary; bool antisense = bh.antisense_align(); vector empty_seg_hits; seed_alignments++; if (antisense) { left_boundary = max(0, bh.right() - (int)min_anchor_len); right_boundary = min(ref_len - 2, left_boundary + max_microexon_stretch); if (right_boundary - left_boundary < 2 * seq_key_len) continue; microaligned_segs++; add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, rev_read, read_side); } else { right_boundary = min(ref_len - 2, bh.left() + (int)min_anchor_len); left_boundary = max(0, right_boundary - max_microexon_stretch); if (right_boundary - left_boundary < 2 * seq_key_len) continue; microaligned_segs++; add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, fwd_read, read_side); } } //for h } // !no_microexon_search break; } else if (hit_stream.next_read_hits(hit_group)) { // if we found hits for the target group in the left stream, // add them to the accumulating vector and continue the search if (hit_group.insert_id == insert_id) { hits_for_read[curr_file] = hit_group; if (curr_file > 0) // we need to look left (recursively) for the group we // just read for this stream. look_for_hit_group(rt, readstream, readstream_for_segment_search, readstream_for_indel_discovery, readstream_for_fusion_discovery, unmapped_reads, seg_files, curr_file - 1, insert_id, hits_for_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, juncs, deletions, insertions, fusions, read_side, begin_id, end_id); break; } //same insert_id (group) else if (curr_file >= 0) { // different group, we need to start a whole new // search for it, with a whole new vector of hits. 
vector hits_for_new_read(seg_files.size()); hits_for_new_read[curr_file] = hit_group; if (curr_file > 0) { look_for_hit_group(rt, readstream, readstream_for_segment_search, readstream_for_indel_discovery, readstream_for_fusion_discovery, unmapped_reads, seg_files, curr_file - 1, hit_group.insert_id, hits_for_new_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, juncs, deletions, insertions, fusions, read_side, begin_id, end_id); if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id) { find_insertions_and_deletions(rt, readstream_for_indel_discovery, hits_for_new_read, deletions, insertions); find_gaps(rt, readstream_for_segment_search, hits_for_new_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, juncs, read_side); } } if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id) { if (fusion_search) { find_fusions(rt, readstream_for_fusion_discovery, hits_for_new_read, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, fusions, read_side); } } } //different group }//got next group } //while loop } uint64_t process_next_hit_group(RefSequenceTable& rt, ReadStream& readstream, ReadStream& readstream_for_segment_search, ReadStream& readstream_for_indel_discovery, ReadStream& readstream_for_fusion_discovery, ReadTable& unmapped_reads, vector& seg_files, size_t last_file_idx, HitStream& partner_hit_stream_for_segment_search, HitStream& seg_partner_hit_stream_for_segment_search, HitStream& partner_hit_stream_for_fusion_discovery, HitStream& seg_partner_hit_stream_for_fusion_discovery, std::set& juncs, std::set& deletions, std::set& insertions, FusionSimpleSet& fusions, eREAD read, uint32_t begin_id = 0, uint32_t end_id = VMAXINT32) { HitStream& last_segmap_hitstream = seg_files[last_file_idx]; HitsForRead hit_group; bool result = last_segmap_hitstream.next_read_hits(hit_group); vector hits_for_read(seg_files.size()); hits_for_read.back() = hit_group; if (result && hit_group.insert_id >= end_id) return 0; look_for_hit_group(rt, readstream, readstream_for_segment_search, readstream_for_indel_discovery, readstream_for_fusion_discovery, unmapped_reads, seg_files, (int)last_file_idx - 1, hit_group.insert_id, hits_for_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, juncs, deletions, insertions, fusions, read, begin_id, end_id); if (result) { find_insertions_and_deletions(rt, readstream_for_indel_discovery, hits_for_read, deletions, insertions); if (fusion_search) { find_fusions(rt, readstream_for_fusion_discovery, hits_for_read, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, fusions, read); } find_gaps(rt, readstream_for_segment_search, hits_for_read, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, juncs, read); return hit_group.insert_id; } return 0; } static const int UNCOVERED = 0; static const int LOOK_LEFT = 1; static const int LOOK_RIGHT = 2; uint8_t get_cov(const vector& cov, uint32_t c) { uint32_t b = c >> 2; uint32_t r = c & 0x3; uint8_t s = (r << 1); uint8_t v = cov[b]; v &= (0x3 << s); v >>= s; return v; } void build_coverage_map(ReadTable& it, RefSequenceTable& rt, vector& seg_files, map >& coverage_map) { if (!coverage_map.empty()) return; BAMHitFactory hit_factory(it,rt); for 
(size_t f = 0; f < seg_files.size(); ++f) { //fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f); //seg_files[f]->rewind(); HitStream hs(seg_files[f], &hit_factory, false, false, false); HitsForRead hit_group; while (hs.next_read_hits(hit_group)) { for (size_t h = 0; h < hit_group.hits.size(); ++h) { BowtieHit& bh = hit_group.hits[h]; pair >::iterator, bool> ret = coverage_map.insert(make_pair(bh.ref_id(), vector())); vector& ref_cov = ret.first->second; size_t right_extent = bh.right(); if (right_extent >= ref_cov.size()) { ref_cov.resize(right_extent + 1, 0); } for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c) { ref_cov[c] = true; //if (ref_cov[c]<255) ref_cov[c]++; } } } //while next_read_hits } } void pair_covered_sites(ReadTable& it, RefSequenceTable& rt, vector& segmap_fnames, std::set& cov_juncs, map >& coverage_map, size_t half_splice_mer_len) { vector expected_look_left_windows; vector expected_look_right_windows; build_coverage_map(it,rt, segmap_fnames, coverage_map); static const int extend = 45; int num_islands = 0; vector expected_don_acc_windows; fprintf(stderr, "Recording coverage islands\n"); size_t cov_bases = 0; for (map >::iterator itr = coverage_map.begin(); itr != coverage_map.end(); ++itr) { vector& cov = itr->second; size_t island_left_edge = 0; for (size_t c = 1; c < cov.size(); ++c) { if (cov[c]) { cov_bases++; if (!cov[c - 1]) { num_islands += 1; int edge = (int)c - extend; edge = max(edge, 0); island_left_edge = edge; } } else { if (cov[c - 1]) { expected_don_acc_windows.push_back(RefSeg(itr->first, POINT_DIR_LEFT, false, /* not important */ READ_DONTCARE, island_left_edge, c + extend)); expected_don_acc_windows.push_back(RefSeg(itr->first, POINT_DIR_RIGHT, false, /* not important */ READ_DONTCARE, island_left_edge, c + extend)); } } } } fprintf(stderr, "Found %d islands covering %ld bases\n", num_islands, (long int)cov_bases); juncs_from_ref_segs(rt, expected_don_acc_windows, cov_juncs, "GT", "AG", max_coverage_intron_length, min_coverage_intron_length, max_cov_juncs, true, half_splice_mer_len); fprintf(stderr, "Found %ld potential intra-island junctions\n", (long int)cov_juncs.size()); } struct ReadInfo { ReadID read_id; uint32_t left; uint32_t right; bool operator<(const ReadInfo& rhs) const { if (left != rhs.left) return left < rhs.left; if (right != rhs.right) return right < rhs.right; return false; } }; void capture_island_ends(ReadTable& it, RefSequenceTable& rt, vector& segmap_fnames, std::set& cov_juncs, map >& coverage_map, size_t half_splice_mer_len) { //static int island_repeat_tolerance = 10; vector expected_look_left_windows; vector expected_look_right_windows; // daehwan //#define DEBUG_CHECK_EXONS 1 //#define DEBUG_RANGE_ONLY 1 #ifndef DEBUG_CHECK_EXONS build_coverage_map(it, rt, segmap_fnames, coverage_map); #else //build coverage map here, so we can debug it #ifdef DEBUG_RANGE_ONLY static const uint32_t chr14_id = rt.get_id("chr14"); #endif vector hits; BAMHitFactory hit_factory(it,rt); for (size_t f = 0; f < seg_files.size(); ++f) { fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f); seg_files[f]->rewind(); FILE* fp = seg_files[f]->file; //rewind(fp); HitStream hs(fp, &hit_factory, false, false, false); HitsForRead hit_group; while (hs.next_read_hits(hit_group)) { for (size_t h = 0; h < hit_group.hits.size(); ++h) { BowtieHit& bh = hit_group.hits[h]; // daehwan //if (check_exons) <-- DEBUG_CHECK_EXONS #ifdef DEBUG_RANGE_ONLY if (bh.ref_id() != chr14_id) continue; // if 
(bh.left() < 66567028 && bh.right() > 66604392) if (bh.left() < 66400000 || bh.right() > 66700000) continue; ReadInfo read_info; read_info.read_id = bh.insert_id(); read_info.left = bh.left(); read_info.right = bh.right(); hits.push_back(read_info); #endif pair >::iterator, bool> ret = coverage_map.insert(make_pair(bh.ref_id(), vector())); vector& ref_cov = ret.first->second; size_t right_extent = bh.right(); if (right_extent >= ref_cov.size()) { ref_cov.resize(right_extent + 1, 0); } for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c) { ref_cov[c] = true; } } } } sort(hits.begin(), hits.end()); #endif // static const int min_cov_length = segment_length + 2; long covered_bases = 0; int long_enough_bases = 0; int left_looking = 0; int right_looking = 0; static const int extend = 45; static const int repeat_tol = 5; int num_islands = 0; for (map >::iterator itr = coverage_map.begin(); itr != coverage_map.end(); ++itr) { #ifdef B_DEBUG fprintf (stderr, "Finding pairings in ref seq %s\n", rt.get_name(itr->first)); #endif vector& cov = itr->second; vector long_enough(cov.size()); size_t last_uncovered = 0; static const uint8_t min_cov = 1; for (size_t c = 1; c < cov.size(); ++c) { uint8_t c_cov = cov[c]; //get_cov(cov, c); if (c_cov < min_cov || c == (cov.size()) - 1) { int putative_exon_length = (int)c - (int)last_uncovered; uint32_t last_pos_cov = cov[c - 1]; //get_cov(cov,c - 1); if (last_pos_cov >= min_cov && putative_exon_length >= min_cov_length) { #ifdef B_DEBUG fprintf(stderr, "cov. island: %d-%d\n", (int)(last_uncovered + 1), (int)c); fprintf(stderr, "\t(putative exon length = %d, min_cov_length=%d)\n",putative_exon_length, min_cov_length); #endif covered_bases += (c + 1 - last_uncovered); for (int l = (int)c; l > (int)last_uncovered; --l) { long_enough[l] = true; } } last_uncovered = c; } } vector& ref_cov = long_enough; vector cov_state(ref_cov.size(), UNCOVERED); // daehwan - print islands (exons) //if (check_exons) #ifdef DEBUG_CHECK_EXONS //{ uint32_t left = 0, right = 0; for (size_t c = 1; c < ref_cov.size(); ++c) { if (ref_cov[c] && !ref_cov[c-1]) { left = c; cout << "Exon: " << left << "-"; } else if (!ref_cov[c] && ref_cov[c-1]) { right = c - 1; cout << right << endl; for (size_t k = 0; k < hits.size(); ++k) { const ReadInfo& hit = hits[k]; if (hit.right < left) continue; if (hit.left > right) break; cout << "\t" << hit.read_id << " " << hit.left << "-" << hit.right << endl; } } } //} #endif for (size_t c = 1; c < ref_cov.size(); ++c) { if (ref_cov[c]) { long_enough_bases++; if (!ref_cov[c - 1]) { num_islands += 1; for (int r = (int)c - extend; r >= 0 && r < (int)c + repeat_tol && r < (int)cov_state.size(); ++r) { cov_state[r] |= LOOK_LEFT; } } } else { if (ref_cov[c - 1]) { for (int l = (int)c - repeat_tol; l >= 0 && l < (int)c + extend && l < (int)cov_state.size(); ++l) { cov_state[l] |= LOOK_RIGHT; } } } } RefSeg* curr_look_left = NULL; RefSeg* curr_look_right = NULL; for (size_t c = 1; c < cov_state.size(); ++c) { if (cov_state[c] & LOOK_LEFT) { left_looking++; if (!(cov_state[c-1] & LOOK_LEFT)) { expected_look_left_windows.push_back(RefSeg(itr->first, POINT_DIR_LEFT, false, /* not important */ READ_DONTCARE, c, c + 1)); curr_look_left = &(expected_look_left_windows.back()); } else if (curr_look_left) { curr_look_left->right++; } } else { if ((cov_state[c-1] & LOOK_LEFT)) { curr_look_left = NULL; } } if (cov_state[c] & LOOK_RIGHT) { right_looking++; if (!(cov_state[c-1] & LOOK_RIGHT)) { expected_look_right_windows.push_back(RefSeg(itr->first, 
POINT_DIR_RIGHT, false, /* not important */ READ_DONTCARE, c, c + 1)); curr_look_right = &(expected_look_right_windows.back()); } else if (curr_look_right) { curr_look_right->right++; } } else { if ((cov_state[c-1] & LOOK_RIGHT)) { curr_look_right = NULL; } } } } fprintf(stderr, " Map covers %ld bases\n", covered_bases); fprintf(stderr, " Map covers %d bases in sufficiently long segments\n", long_enough_bases); fprintf(stderr, " Map contains %d good islands\n", num_islands + 1); fprintf(stderr, " %d are left looking bases\n", left_looking); fprintf(stderr, " %d are right looking bases\n", right_looking); vector expected_don_acc_windows; expected_don_acc_windows.insert(expected_don_acc_windows.end(), expected_look_right_windows.begin(), expected_look_right_windows.end()); expected_don_acc_windows.insert(expected_don_acc_windows.end(), expected_look_left_windows.begin(), expected_look_left_windows.end()); if (!butterfly_search) coverage_map.clear(); //free some memory juncs_from_ref_segs(rt, expected_don_acc_windows, cov_juncs, "GT", "AG", max_coverage_intron_length, min_coverage_intron_length, max_cov_juncs, true, half_splice_mer_len); //fprintf(stderr, "Found %ld potential island-end pairing junctions\n", (long int)cov_juncs.size()); } void print_juncs(RefSequenceTable& rt, std::set& juncs, const char* str) { fprintf (stderr, "-- %s --\n", str); for(std::set::iterator itr = juncs.begin(); itr != juncs.end(); ++itr) { const char* ref_name = rt.get_name(itr->refid); fprintf(stderr, "%s\t%d\t%d\t%c\n", ref_name, itr->left, itr->right, itr->antisense ? '-' : '+'); } fprintf (stderr, "-- done --\n"); } struct SegmentSearchWorker { void operator()() { ReadTable it; ReadStream readstream(reads_fname); ReadStream readstream_for_segment_search(reads_fname); ReadStream readstream_for_indel_discovery(reads_fname); ReadStream readstream_for_fusion_discovery(reads_fname); if (readstream.file() == NULL || readstream_for_segment_search.file() == NULL || readstream_for_indel_discovery.file() == NULL || readstream_for_fusion_discovery.file() == NULL) { fprintf(stderr, "Error: cannot open %s for reading\n", reads_fname.c_str()); exit(1); } if (read_offset > 0) { readstream.seek(read_offset); readstream_for_segment_search.seek(read_offset); readstream_for_indel_discovery.seek(read_offset); readstream_for_fusion_discovery.seek(read_offset); } vector hit_factories; hit_factories.push_back(new BAMHitFactory(it, *rt)); HitStream partner_hit_stream_for_segment_search(partner_reads_map_fname, hit_factories.back(), false, false, false); hit_factories.push_back(new BAMHitFactory(it, *rt)); HitStream partner_hit_stream_for_fusion_discovery(partner_reads_map_fname, hit_factories.back(), false, false, false); if (partner_hit_offset > 0) { partner_hit_stream_for_segment_search.seek(partner_hit_offset); partner_hit_stream_for_fusion_discovery.seek(partner_hit_offset); } hit_factories.push_back(new BAMHitFactory(it, *rt)); HitStream seg_partner_hit_stream_for_segment_search(seg_partner_reads_map_fname, hit_factories.back(), false, false, false); hit_factories.push_back(new BAMHitFactory(it, *rt)); HitStream seg_partner_hit_stream_for_fusion_discovery(seg_partner_reads_map_fname, hit_factories.back(), false, false, false); if (seg_partner_hit_offset > 0) { seg_partner_hit_stream_for_segment_search.seek(seg_partner_hit_offset); seg_partner_hit_stream_for_fusion_discovery.seek(seg_partner_hit_offset); } vector hit_streams; for (size_t i = 0; i < segmap_fnames->size(); ++i) { hit_factories.push_back(new BAMHitFactory(it, 
*rt)); HitStream hs((*segmap_fnames)[i], hit_factories.back(), false, false, false); if (seg_offsets[i] > 0) hs.seek(seg_offsets[i]); hit_streams.push_back(hs); } int num_group = 0; uint64_t read_id = 0, last_read_id = 0; while ((read_id = process_next_hit_group(*rt, readstream, readstream_for_segment_search, readstream_for_indel_discovery, readstream_for_fusion_discovery, it, hit_streams, hit_streams.size() - 1, partner_hit_stream_for_segment_search, seg_partner_hit_stream_for_segment_search, partner_hit_stream_for_fusion_discovery, seg_partner_hit_stream_for_fusion_discovery, *juncs, *deletions, *insertions, *fusions, read, begin_id, end_id)) != 0) { num_group++; #if 0 if (num_group % 500000 == 0) { fprintf(stderr, "\tProcessed %lu in %d between %lu and %lu root segment groups\n", num_group, read_id, begin_id, end_id); fprintf(stderr, "\t# of events %lu(j)-%lu(i)-%lu(d)\n", juncs->size(), deletions->size(), insertions->size()); } #endif last_read_id = read_id; } // "microaligned_segs" is not protected against multi-threading // fprintf(stderr, "Microaligned %d segments\n", microaligned_segs); for (size_t i = 0; i < hit_factories.size(); ++i) delete hit_factories[i]; hit_factories.clear(); } RefSequenceTable* rt; string reads_fname; vector* segmap_fnames; string partner_reads_map_fname; string seg_partner_reads_map_fname; std::set* juncs; std::set* deletions; std::set* insertions; FusionSimpleSet* fusions; eREAD read; uint64_t begin_id; uint64_t end_id; int64_t read_offset; vector seg_offsets; int64_t partner_hit_offset; int64_t seg_partner_hit_offset; }; struct SpliceJunctionCoord { uint32_t refid; int coord; SpliceJunctionCoord(uint32_t r, int c) : refid(r), coord(c) {} bool operator< (const SpliceJunctionCoord& r) const { if (refid < r.refid) return true; else if (refid == r.refid && coord < r.coord) return true; else return false; } }; void driver(istream& ref_stream, FILE* juncs_out, FILE* insertions_out, FILE* deletions_out, FILE* fusions_out, string& left_reads_fname, string& left_reads_map_fname, vector& left_segmap_fnames, string& right_reads_fname, string& right_reads_map_fname, vector& right_segmap_fnames) { if (!parallel) num_threads = 1; // turn off parallelization in case of the following search methods if (!no_coverage_search || !no_microexon_search || butterfly_search) num_threads = 1; //fprintf(stderr, ">>>>>>>>>> num_threads = %d\n",num_threads); assert (num_threads > 0); if (left_segmap_fnames.size() == 0) { fprintf(stderr, "No hits to process, exiting\n"); exit(0); } vector > vseg_juncs(num_threads); std::set cov_juncs; std::set butterfly_juncs; std::set juncs; vector > vdeletions(num_threads); vector > vinsertions(num_threads); vector vfusions(num_threads); RefSequenceTable rt(sam_header, true); fprintf (stderr, "Loading reference sequences...\n"); get_seqs(ref_stream, rt, true); string left_seg_fname_for_segment_search = left_segmap_fnames.back(); string right_seg_fname_for_segment_search; if (right_segmap_fnames.size() > 0) right_seg_fname_for_segment_search = right_segmap_fnames.back(); fprintf(stderr, ">> Performing segment-search:\n"); if (left_segmap_fnames.size() > 1) { fprintf( stderr, "Loading left segment hits...\n"); vector read_ids; vector > offsets; vector partner_offsets; vector seg_partner_offsets; if (num_threads > 1) { vector fnames; fnames.push_back(left_reads_fname); fnames.insert(fnames.end(), left_segmap_fnames.begin(), left_segmap_fnames.end()); bool enough_data = calculate_offsets(fnames, read_ids, offsets); if (!enough_data) num_threads = 1; if 
(enough_data && right_reads_map_fname != "") calculate_offsets_from_ids(right_reads_map_fname, read_ids, partner_offsets); if (enough_data && right_seg_fname_for_segment_search != "") calculate_offsets_from_ids(right_seg_fname_for_segment_search, read_ids, seg_partner_offsets); } vector threads; for (int i = 0; i < num_threads; ++i) { SegmentSearchWorker worker; worker.rt = &rt; worker.reads_fname = left_reads_fname; worker.segmap_fnames = &left_segmap_fnames; worker.partner_reads_map_fname = right_reads_map_fname; worker.seg_partner_reads_map_fname = right_seg_fname_for_segment_search; worker.juncs = &vseg_juncs[i]; worker.deletions = &vdeletions[i]; worker.insertions = &vinsertions[i]; worker.fusions = &vfusions[i]; worker.read = READ_LEFT; worker.partner_hit_offset = 0; worker.seg_partner_hit_offset = 0; if (i == 0) { worker.begin_id = 0; worker.seg_offsets = vector(left_segmap_fnames.size(), 0); worker.read_offset = 0; } else { worker.begin_id = read_ids[i-1]; worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin()+1, offsets[i-1].end()); worker.read_offset = offsets[i-1][0]; if (partner_offsets.size() > 0) worker.partner_hit_offset = partner_offsets[i-1]; if (seg_partner_offsets.size() > 0) worker.seg_partner_hit_offset = seg_partner_offsets[i-1]; } worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits::max(); //Geo debug: //fprintf(stderr, "Worker %d: begin_id=%lu, end_id=%lu\n", i, worker.begin_id, worker.end_id); if (num_threads > 1 && i + 1 < num_threads) threads.push_back(new boost::thread(worker)); else worker(); } for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); delete threads[i]; threads[i] = NULL; } threads.clear(); fprintf( stderr, "done.\n"); } if (right_segmap_fnames.size() > 1) { fprintf( stderr, "Loading right segment hits...\n"); vector read_ids; vector > offsets; vector partner_offsets; vector seg_partner_offsets; if (num_threads > 1) { vector fnames; fnames.push_back(right_reads_fname); fnames.insert(fnames.end(), right_segmap_fnames.begin(), right_segmap_fnames.end()); bool enough_data = calculate_offsets(fnames, read_ids, offsets); if (!enough_data) num_threads = 1; if (enough_data) calculate_offsets_from_ids(left_reads_map_fname, read_ids, partner_offsets); if (enough_data) calculate_offsets_from_ids(left_seg_fname_for_segment_search, read_ids, seg_partner_offsets); } vector threads; for (int i = 0; i < num_threads; ++i) { SegmentSearchWorker worker; worker.rt = &rt; worker.reads_fname = right_reads_fname; worker.segmap_fnames = &right_segmap_fnames; worker.partner_reads_map_fname = left_reads_map_fname; worker.seg_partner_reads_map_fname = left_seg_fname_for_segment_search; worker.juncs = &vseg_juncs[i]; worker.deletions = &vdeletions[i]; worker.insertions = &vinsertions[i]; worker.fusions = &vfusions[i]; worker.read = READ_RIGHT; worker.partner_hit_offset = 0; worker.seg_partner_hit_offset = 0; if (i == 0) { worker.begin_id = 0; worker.seg_offsets = vector(right_segmap_fnames.size(), 0); worker.read_offset = 0; } else { worker.begin_id = read_ids[i-1]; worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin() + 1, offsets[i-1].end()); worker.read_offset = offsets[i-1][0]; if (partner_offsets.size() > 0) worker.partner_hit_offset = partner_offsets[i-1]; if (seg_partner_offsets.size() > 0) worker.seg_partner_hit_offset = seg_partner_offsets[i-1]; } worker.end_id = (i+1 < num_threads) ? 
read_ids[i] : std::numeric_limits::max(); if (num_threads > 1 && i + 1 < num_threads) threads.push_back(new boost::thread(worker)); else worker(); } for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); delete threads[i]; threads[i] = NULL; } threads.clear(); fprintf( stderr, "done.\n"); } std::set seg_juncs; std::set deletions; std::set insertions; for (int i = 0; i < num_threads; ++i) { seg_juncs.insert(vseg_juncs[i].begin(), vseg_juncs[i].end()); deletions.insert(vdeletions[i].begin(), vdeletions[i].end()); insertions.insert(vinsertions[i].begin(), vinsertions[i].end()); } FusionSimpleSet fusions = vfusions[0]; for (int i = 1; i < num_threads; ++i) { merge_with(fusions, vfusions[i]); } fprintf(stderr, "\tfound %ld potential split-segment junctions\n", (long int)seg_juncs.size()); fprintf(stderr, "\tfound %ld potential small deletions\n", (long int)deletions.size()); fprintf(stderr, "\tfound %ld potential small insertions\n", (long int)insertions.size()); vector all_segmap_fnames; for (vector::size_type i = 0; i != left_segmap_fnames.size();i++) { all_segmap_fnames.push_back( left_segmap_fnames[i] ); } for (vector::size_type i = 0; i != right_segmap_fnames.size();i++) { all_segmap_fnames.push_back( right_segmap_fnames[i] ); } #if 0 // daehwan - check this out as Cole insists on using segments gives better results. vector all_map_files; if (left_seg_files.size() > 1) { all_map_files.push_back(&left_reads_map_file); } if (right_seg_files.size() > 1) { all_map_files.push_back(&right_reads_map_file); } copy(all_seg_files.begin(), all_seg_files.end(), back_inserter(all_map_files)); #endif ReadTable it; map > coverage_map; if (!no_coverage_search || butterfly_search) { if (ium_reads != "") { vector ium_read_files; tokenize(ium_reads,",", ium_read_files); /* vector iums; string unzcmd=getUnpackCmd(ium_read_files[0],false); //could be BAM file for (size_t ium = 0; ium < ium_read_files.size(); ++ium) { //fprintf (stderr, "Indexing extensions in %s\n", ium_read_files[ium].c_str()); FZPipe ium_file(ium_read_files[ium],unzcmd); if (ium_file.file==NULL) { fprintf (stderr, "Can't open file %s for reading, skipping...\n",ium_read_files[ium].c_str()); continue; } iums.push_back(ium_file); } */ index_read_mers(ium_read_files, 5); } else { //no unmapped reads no_coverage_search = true; butterfly_search = false; } if (!no_coverage_search) { //coverage search // looking for junctions by island end pairings fprintf(stderr, ">> Performing coverage-search:\n"); capture_island_ends(it, rt, all_segmap_fnames, cov_juncs, coverage_map, 5); fprintf(stderr, "\tfound %d potential junctions\n",(int)cov_juncs.size()); } } //coverage search or butterfly search if (butterfly_search) { //looking for junctions between and within islands fprintf(stderr, ">> Performing butterfly-search: \n"); prune_extension_table(butterfly_overhang); compact_extension_table(); pair_covered_sites(it, rt, all_segmap_fnames, butterfly_juncs, coverage_map, 5); fprintf(stderr, "\tfound %d potential junctions\n",(int)butterfly_juncs.size()); } coverage_map.clear(); std::set microexon_juncs; if (!no_microexon_search) { fprintf(stderr, ">> Performing microexon-search: \n"); std::set microexon_juncs; align_microexon_segs(rt, microexon_juncs, max_cov_juncs, 5); fprintf(stderr, "\tfound %d potential junctions\n",(int)microexon_juncs.size()); juncs.insert(microexon_juncs.begin(), microexon_juncs.end()); } juncs.insert(cov_juncs.begin(), cov_juncs.end()); juncs.insert(seg_juncs.begin(), seg_juncs.end()); juncs.insert(butterfly_juncs.begin(), 
butterfly_juncs.end()); //fprintf(stderr, "Reporting potential splice junctions..."); vector splice_junction_coords; for(std::set::iterator itr = juncs.begin(); itr != juncs.end(); ++itr) { const char* ref_name = rt.get_name(itr->refid); fprintf(juncs_out, "%s\t%d\t%d\t%c\n", ref_name, itr->left, itr->right, itr->antisense ? '-' : '+'); if (fusion_search) { splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->left)); splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->right)); } } //close all reading pipes, just to exit cleanly /* for (vector::size_type i = 0; i != all_segmap_fnames.size();i++) { all_segmap_fnames[i]->close(); } */ fprintf(stderr, "Reported %d total potential splices\n", (int)juncs.size()); sort(splice_junction_coords.begin(), splice_junction_coords.end()); fprintf(stderr, "Reporting %lu potential deletions...\n", deletions.size()); if(deletions_out){ for(std::set::iterator itr = deletions.begin(); itr != deletions.end(); ++itr){ const char* ref_name = rt.get_name(itr->refid); /* * We fix up the left co-ordinate to reference the first deleted base */ fprintf(deletions_out, "%s\t%d\t%d\n", ref_name, itr->left + 1, itr->right); } fclose(deletions_out); }else{ fprintf(stderr, "Failed to open deletions file for writing, no deletions reported\n"); } fprintf(stderr, "Reporting %lu potential insertions...\n", insertions.size()); if(insertions_out){ for(std::set::iterator itr = insertions.begin(); itr != insertions.end(); ++itr){ const char* ref_name = rt.get_name(itr->refid); fprintf(insertions_out, "%s\t%d\t%d\t%s\n", ref_name, itr->left, itr->left, itr->sequence.c_str()); } fclose(insertions_out); }else{ fprintf(stderr, "Failed to open insertions file for writing, no insertions reported\n"); } if (fusions_out) { // check if a fusion point coincides with splice junctions. for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr) { const Fusion& fusion = itr->first; FusionSimpleStat& fusion_stat = itr->second; bool found = binary_search(splice_junction_coords.begin(), splice_junction_coords.end(), SpliceJunctionCoord(fusion.refid1, fusion.left)); if (found) fusion_stat.left_coincide_with_splice_junction = true; found = binary_search(splice_junction_coords.begin(), splice_junction_coords.end(), SpliceJunctionCoord(fusion.refid2, fusion.right)); if (found) fusion_stat.right_coincide_with_splice_junction = true; } for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr) { const Fusion& fusion = itr->first; const FusionSimpleStat& fusion_stat = itr->second; // compare the current fusion with the next fusion, pick up the better one. 
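/* Illustrative sketch, not part of the original TopHat source. The loop that
 * follows collapses near-duplicate fusion calls (same reference pair, same
 * orientation, breakpoints within 10 bp) and keeps the call with more
 * supporting reads, breaking ties by how many of its two breakpoints
 * coincide with reported splice junctions. The helper below is a
 * hypothetical restatement of that tie-break; FusionSimpleStat and its
 * fields come from the surrounding code. Fenced with #if 0 so it is inert. */
#if 0
// Hypothetical sketch: true if fusion call 'a' should be kept over a
// near-duplicate call 'b' (more supporting reads first, then more
// breakpoints that coincide with splice junctions).
static bool prefer_fusion_call(const FusionSimpleStat& a,
                               const FusionSimpleStat& b)
{
  if (a.count != b.count)
    return a.count > b.count;
  int a_sj = (int)a.left_coincide_with_splice_junction +
             (int)a.right_coincide_with_splice_junction;
  int b_sj = (int)b.left_coincide_with_splice_junction +
             (int)b.right_coincide_with_splice_junction;
  return a_sj >= b_sj;
}
#endif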
FusionSimpleSet::iterator next_itr = itr; ++next_itr; while (next_itr != fusions.end()) { const Fusion& next_fusion = next_itr->first; const FusionSimpleStat& next_fusion_stat = next_itr->second; int left_diff = abs((int)fusion.left - (int)next_fusion.left); if (fusion.refid1 == next_fusion.refid1 && fusion.refid2 == next_fusion.refid2 && left_diff < 10) { if (fusion.dir == next_fusion.dir && left_diff == abs((int)fusion.right - (int)next_fusion.right)) { if (next_fusion_stat.count > fusion_stat.count) itr->second.skip = true; else if (next_fusion_stat.count == fusion_stat.count) { int curr_count = (int)fusion_stat.left_coincide_with_splice_junction + (int)fusion_stat.right_coincide_with_splice_junction; int next_count = (int)next_fusion_stat.left_coincide_with_splice_junction + (int)next_fusion_stat.right_coincide_with_splice_junction; if (curr_count < next_count) itr->second.skip = true; else next_itr->second.skip = true; } else next_itr->second.skip = true; } ++next_itr; } else break; } if (itr->second.skip && !fusion_do_not_resolve_conflicts) continue; const char* ref_name1 = rt.get_name(fusion.refid1); const char* ref_name2 = rt.get_name(fusion.refid2); const char* dir = ""; if (fusion.dir == FUSION_FR) dir = "fr"; else if(fusion.dir == FUSION_RF) dir = "rf"; else if(fusion.dir == FUSION_RR) dir = "rr"; else dir = "ff"; fprintf(fusions_out, "%s\t%d\t%s\t%d\t%s\n", ref_name1, fusion.left, ref_name2, fusion.right, dir); } fclose(fusions_out); } fprintf(stderr, "Reporting potential fusions...\n"); } int main(int argc, char** argv) { fprintf(stderr, "segment_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------\n"); int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } string ref_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string juncs_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string insertions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string deletions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string fusions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string left_reads_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string left_reads_map_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string left_segment_map_file_list = argv[optind++]; string right_segment_map_file_list; string right_reads_file_name; string right_reads_map_file_name; if (optind < argc) { right_reads_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } right_reads_map_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } right_segment_map_file_list = argv[optind++]; } // Open the approppriate files ifstream ref_stream(ref_file_name.c_str(), ifstream::in); if (!ref_stream.good()) { fprintf(stderr, "Error: cannot open %s for reading\n", ref_file_name.c_str()); exit(1); } FILE* juncs_file = fopen(juncs_file_name.c_str(), "w"); if (!juncs_file) { fprintf(stderr, "Error: cannot open %s for writing\n", juncs_file_name.c_str()); exit(1); } FILE* insertions_file = fopen(insertions_file_name.c_str(), "w"); if (!insertions_file) { fprintf(stderr, "Error: cannot open %s for writing\n", insertions_file_name.c_str()); exit(1); } FILE* deletions_file = fopen(deletions_file_name.c_str(), "w"); if (!deletions_file) { fprintf(stderr, "Error: cannot open 
%s for writing\n", deletions_file_name.c_str()); exit(1); } vector left_segment_map_fnames; string left_segment_file_for_segment_search; tokenize(left_segment_map_file_list, ",",left_segment_map_fnames); FILE* fusions_file = fopen(fusions_file_name.c_str(), "w"); if (!fusions_file) { fprintf(stderr, "Error: cannot open %s for writing\n", fusions_file_name.c_str()); exit(1); } //FILE* left_reads_file = fopen(left_reads_file_name.c_str(), "r"); //FILE* left_reads_file_for_indel_discovery = fopen(left_reads_file_name.c_str(),"r"); string unzcmd=getUnpackCmd(left_reads_file_name, false); FZPipe left_reads_file(left_reads_file_name, unzcmd); FZPipe left_reads_file_for_segment_search(left_reads_file_name, unzcmd); FZPipe left_reads_file_for_indel_discovery(left_reads_file_name, unzcmd); FILE* left_reads_file_for_fusion_discovery = fopen(left_reads_file_name.c_str(),"r"); if (left_reads_file.file==NULL || left_reads_file_for_segment_search.file==NULL || left_reads_file_for_indel_discovery.file==NULL || left_reads_file_for_fusion_discovery==NULL) { fprintf(stderr, "Error: cannot open %s for reading\n", left_reads_file_name.c_str()); exit(1); } vector right_segment_map_fnames; string right_segment_file_for_segment_search; if (right_segment_map_file_list != "") { tokenize(right_segment_map_file_list, ",", right_segment_map_fnames); } // min_cov_length=20; if (min_cov_length>segment_length-2) min_cov_length=segment_length-2; driver(ref_stream, juncs_file, insertions_file, deletions_file, fusions_file, left_reads_file_name, left_reads_map_file_name, left_segment_map_fnames, right_reads_file_name, right_reads_map_file_name, right_segment_map_fnames); return 0; } tophat-2.0.9/src/GFaSeqGet.cpp0000644000175000017500000002376212122334361014621 0ustar toortoor#include "GFaSeqGet.h" #include "gdna.h" #include void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) { if (sovl==0) { GFREE(sq); sqstart=sstart; uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ; sqlen = (slen==0 ? 
max_len : slen); GMALLOC(sq, sqlen); return; } //overlap -- copy the overlapping region char* newsq=NULL; GMALLOC(newsq, slen); memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl); GFREE(sq); sq=newsq; sqstart=sstart; sqlen=slen; } void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) { fh=fopen(fn,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",fn); } fname=Gstrdup(fn); initialParse(fofs, validate); lastsub=new GSubSeq(); } GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) { //for GFastaIndex use mostly -- the important difference is that //the file offset is to the sequence, not to the defline fh=fopen(faname,"rb"); if (fh==NULL) { GError("Error (GFaSeqGet) opening file '%s'\n",faname); } fname=Gstrdup(faname); line_len=l_len; line_blen=l_blen; seq_len=seqlen; if (line_blen0) { //end of the first "sequence" line lendlen++; break; } else {// another EoL char at the end of defline fseqstart++; continue; } }// end-of-line characters line_len++; } //we are at the end of first sequence line while ((c=getc(fh))!=EOF) { if (c=='\n' || c=='\r') lendlen++; else { ungetc(c,fh); break; } } line_blen=line_len+lendlen; if (c==EOF) return; // -- you don't need to check it all if you're sure it's safe if (checkall) { //validate the rest of the FASTA record int llen=0; //last line length int elen=0; //length of last line ending bool waseol=true; while ((c=getc(fh))!=EOF) { if (c=='>' && waseol) { ungetc(c,fh); break; } if (c=='\n' || c=='\r') { // eol char elen++; if (waseol) continue; //2nd eol char waseol=true; elen=1; continue; } if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered //--- on a seq char here: if (waseol) {//beginning of a seq line if (elen && (llen!=line_len || elen!=lendlen)) //GError(gfa_ERRPARSE); GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\ the sequence lines have the same length (except for the last line)"); waseol=false; llen=0; elen=0; } llen++; } //while reading chars }// FASTA checking was requested fseeko(fh,fseqstart,SEEK_SET); } const char* GFaSeqGet::subseq(uint cstart, int& clen) { //cstart is 1-based genomic coordinate within current fasta sequence int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ; //GMessage("--> call: subseq(%u, %d)\n", cstart, clen); if (clen>maxlen) { GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen); return NULL; } if (seq_len>0 && clen+cstart-1>seq_len) { GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len); } if (lastsub->sq==NULL || lastsub->sqlen==0) { lastsub->setup(cstart, clen, 0,0,0,seq_len); loadsubseq(cstart, clen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //allow extension up to MAX_FASUBSEQ uint bstart=lastsub->sqstart; uint bend=lastsub->sqstart+lastsub->sqlen-1; uint cend=cstart+clen-1; int qlen=0; //only the extra len to be allocated/appended/prepended uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file int newlen=0; //the new total length of the buffered sequence lastsub->sq int kovl=0; int czfrom=0;//0-based offsets for copying a previously read sequence chunk int czto=0; uint newstart=cstart; if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer return (const char*) &(lastsub->sq[cstart-bstart]) ; } //extend downward uint newend=GMAX(cend, bend); if (cstartMAX_FASUBSEQ) { newlen=MAX_FASUBSEQ; newend=cstart+newlen-1; //keep newstart, set newend } 
qlen=bstart-cstart; if (newend>bstart) { //overlap if (newend>bend) {// new region is larger & around the old one - so we have two regions to update kovl=bend-bstart+1; czfrom=0; czto=bstart-cstart; lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq qlen=bstart-cstart; loadsubseq(newstart, qlen); qlen=newend-bend; int toread=qlen; loadsubseq(bend+1, qlen); clen-=(toread-qlen); lastsub->sqlen=clen; return (const char*)lastsub->sq; } //newend<=bend kovl=newend-bstart+1; } else { //no overlap with previous buffer if (newend>bend) kovl=bend-bstart+1; else kovl=newend-bstart+1; } qlen=bstart-cstart; czfrom=0; czto=qlen; } //cstart=bstart, possibly extend upwards newstart=bstart; newlen=(newend-newstart+1); if (newlen>MAX_FASUBSEQ) { newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart newlen=MAX_FASUBSEQ; if (newstart<=bend) { //overlap with old buffer kovl=bend-newstart+1; czfrom=newstart-bstart; czto=0; } else { //not overlapping old buffer kovl=0; } } //newstart reassigned else { //we can extend the buffer to include the old one qlen=newend-bend; //how much to read from file qstart=bend+1; kovl=bend-bstart+1; czfrom=0; czto=0; } } lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region lastsub->sqlen-=qlen; //appending may result in a premature eof int toread=qlen; loadsubseq(qstart, qlen); //read the missing chunk, if any clen-=(toread-qlen); lastsub->sqlen+=qlen; return (const char*)(lastsub->sq+(cstart-newstart)); } char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) { if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; const char* gs=subseq(cstart, clen); if (gs==NULL) return NULL; char* r=NULL; GMALLOC(r,clen+1); r[clen]=0; memcpy((void*)r,(void*)gs, clen); if (revCmpl) reverseComplement(r,clen); if (upCase) { for (int i=0;isq space allocated previously //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart] int sofs=cstart-lastsub->sqstart; int lendlen=line_blen-line_len; char* seqp=lastsub->sq+sofs; //find the proper file offset and read the appropriate lines uint seqofs=cstart-1; uint startlno = seqofs/line_len; int lineofs = seqofs % line_len; off_t fstart=fseqstart + (startlno*line_blen); fstart+=lineofs; fseeko(fh, fstart, SEEK_SET); int toread=clen; int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ; if (toread==0) toread=maxlen; //read max allowed, or to the end of file int actualrlen=0; int sublen=0; if (lineofs>0) { //read the partial first line int reqrlen=line_len-lineofs; if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars actualrlen=fread((void*)seqp, 1, reqrlen, fh); if (actualrlen=line_len) { char* rseqp=&(seqp[sublen]); actualrlen=fread((void*)rseqp, 1, line_len, fh); /* char dbuf[256];dbuf[255]=0; strncpy(dbuf,rseqp, actualrlen); dbuf[actualrlen]=0; GMessage("<<0) { char* rseqp=&(seqp[sublen]); actualrlen=fread((void*)rseqp, 1, toread, fh); if (actualrlensqlen+=sublen; clen=sublen; return (const char*)seqp; } tophat-2.0.9/src/junctions.cpp0000644000175000017500000002372212122334362015064 0ustar toortoor/* * junctions.cpp * TopHat * * Created by Cole Trapnell on 12/12/08. * Copyright 2008 Cole Trapnell. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #endif #include #include "common.h" #include "junctions.h" #include "bwt_map.h" void junctions_from_spliced_hit(const BowtieHit& h, vector >& new_juncs) { const vector& cigar = h.cigar(); int j = h.left(); bool bSawFusion = false; for (size_t c = 0 ; c < cigar.size(); ++c) { Junction junc; JunctionStats stats; int opcode = cigar[c].opcode; int length = cigar[c].length; switch(opcode) { case REF_SKIP: case rEF_SKIP: if (bSawFusion) junc.refid = h.ref_id2(); else junc.refid = h.ref_id(); // daehwan - we need to consider indels very next to REF_SKIP, // which is possible due to Bowtie2 assert (c > 0 && c < cigar.size() - 1); assert (cigar[c - 1].length); assert (cigar[c + 1].length); if (opcode == REF_SKIP) { junc.left = j - 1; junc.right = j + length; stats.left_extent = cigar[c - 1].length; stats.right_extent = cigar[c + 1].length; j += length; } else { junc.right = j + 1; junc.left = j - length; stats.right_extent = cigar[c - 1].length; stats.left_extent = cigar[c + 1].length; j -= length; } junc.antisense = h.antisense_splice(); /* * Note that in valid_hit() in tophat_report.cpp * we have tried to ensure that the REF_SKIP operator * will only be surrounded by match operators */ stats.min_splice_mms = h.splice_mms(); stats.supporting_hits++; new_juncs.push_back(make_pair(junc, stats)); break; case MATCH: case DEL: j += cigar[c].length; break; case mATCH: case dEL: j -= cigar[c].length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: j = cigar[c].length; bSawFusion = true; break; default: break; } } } void print_junction(FILE* junctions_out, const char* name, const Junction& j, const JunctionStats& s, uint64_t junc_id) { int left_plus_one = j.left + 1; fprintf(junctions_out, "%s\t%d\t%d\tJUNC%08d\t%d\t%c\t%d\t%d\t255,0,0\t2\t%d,%d\t0,%d\n", name, left_plus_one - s.left_extent, j.right + s.right_extent, (int)junc_id, s.supporting_hits, j.antisense ? 
'-' : '+', left_plus_one - s.left_extent, j.right + s.right_extent, s.left_extent, s.right_extent, j.right - (left_plus_one - s.left_extent)); } void junctions_from_alignment(const BowtieHit& spliced_alignment, JunctionSet& junctions) { vector > juncs; junctions_from_spliced_hit(spliced_alignment, juncs); for (size_t i = 0; i < juncs.size(); ++i) { pair& junc = juncs[i]; JunctionSet::iterator itr = junctions.find(junc.first); if (itr != junctions.end()) { JunctionStats& j = itr->second; j.merge_with(junc.second); } else { assert(junc.first.refid != VMAXINT32); junctions[junc.first] = junc.second; } } } #if !NDEBUG void validate_junctions(const JunctionSet& junctions) { uint32_t invalid_juncs = 0; for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i) { if (!i->first.valid()) invalid_juncs++; } fprintf(stderr, "Found %d invalid junctions\n", invalid_juncs); } #endif int rejected = 0; int rejected_spliced = 0; int total_spliced = 0; int total = 0; /* void junctions_from_alignments(HitTable& hits, JunctionSet& junctions) { for (HitTable::iterator ci = hits.begin(); ci != hits.end(); ++ci) { HitList& rh = ci->second; if (rh.size() == 0) continue; for (size_t i = 0; i < rh.size(); ++i) { BowtieHit& bh = rh[i]; AlignStatus s = status(&bh); total++; if (s == SPLICED) total_spliced++; if (s == SPLICED) { junctions_from_alignment(bh, junctions); } } } } */ bool accept_if_valid(const Junction& j, JunctionStats& s) { if (min(s.left_extent, s.right_extent) < min_anchor_len) { s.accepted = false; return false; } if (s.min_splice_mms > max_splice_mismatches) { s.accepted = false; return false; } // uint32_t junc_doc = 0; // uint8_t extent = 0; // if (s.left_exon_doc > s.right_exon_doc) // { // junc_doc = s.left_exon_doc; // extent = s.left_extent; // } // else // { // junc_doc = s.right_exon_doc; // extent = s.right_extent; // } // // double avg_junc_doc = junc_doc / (double)(extent); //if (avg_junc_doc / (float) s.num_reads > 100.0) // if (s.supporting_hits / avg_junc_doc < min_isoform_fraction) // { // s.accepted = false; // } // else { //fprintf (stderr, "Junction size = %d\n, support = %d", (int)j.right - (int)j.left, (int)s.supporting_hits.size() ); if ((int)j.right - (int)j.left > 50000) { s.accepted = (s.supporting_hits >= 2 && min(s.left_extent, s.right_extent) > 12); } else { s.accepted = true; } } return s.accepted; } void knockout_shadow_junctions(JunctionSet& junctions) { vector ref_ids; for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) { ref_ids.push_back(i->first.refid); } sort(ref_ids.begin(), ref_ids.end()); vector::iterator new_end = unique(ref_ids.begin(), ref_ids.end()); ref_ids.erase(new_end, ref_ids.end()); for(size_t i = 0; i < ref_ids.size(); ++i) { uint32_t refid = ref_ids[i]; Junction dummy_left(refid, 0, 0, true); Junction dummy_right(refid, VMAXINT32, VMAXINT32, true); pair r; r.first = junctions.lower_bound(dummy_left); r.second = junctions.upper_bound(dummy_right); JunctionSet::iterator itr = r.first; while(itr != r.second && itr != junctions.end()) { if (itr->second.accepted && !itr->second.gtf_match) { Junction fuzzy_left = itr->first; Junction fuzzy_right = itr->first; fuzzy_left.left -= min_anchor_len; fuzzy_right.right += min_anchor_len; fuzzy_left.antisense = !itr->first.antisense; fuzzy_right.antisense = !itr->first.antisense; pair s; s.first = junctions.lower_bound(fuzzy_left); s.second = junctions.upper_bound(fuzzy_right); JunctionSet::iterator itr2 = s.first; int junc_support = itr->second.supporting_hits; 
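// A "shadow" junction is one whose left/right boundaries lie within
// min_anchor_len of a junction reported on the opposite strand; the inner
// loop below un-accepts the current junction whenever such a neighbour
// exists with strictly more supporting hits.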
while(itr2 != s.second && itr2 != junctions.end()) { int left_diff = itr->first.left - itr2->first.left; int right_diff = itr->first.right - itr2->first.right; if (itr != itr2 && itr->first.antisense != itr2->first.antisense && (left_diff < min_anchor_len || right_diff < min_anchor_len)) { if (junc_support < itr2->second.supporting_hits) itr->second.accepted = false; } ++itr2; } } ++itr; } } } void filter_junctions(JunctionSet& junctions, const JunctionSet& gtf_junctions) { for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) { if (gtf_junctions.find(i->first) == gtf_junctions.end()) accept_if_valid(i->first, i->second); else {//automatically accept junctions matching GTF i->second.accepted = true; i->second.gtf_match = true; } } knockout_shadow_junctions(junctions); } void accept_all_junctions(JunctionSet& junctions, const uint32_t refid) { fprintf(stderr, "Accepting all junctions\n"); for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr) { itr->second.accepted = true; } } void print_junctions(FILE* junctions_out, const JunctionSet& junctions, RefSequenceTable& ref_sequences) { uint64_t junc_id = 1; fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n"); for (JunctionSet::const_iterator i = junctions.begin(); i != junctions.end(); ++i) { const pair& j_itr = *i; const Junction& j = j_itr.first; const JunctionStats& s = j_itr.second; assert(ref_sequences.get_name(j.refid)); //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+'); print_junction(junctions_out, ref_sequences.get_name(j.refid), j, s, junc_id++); } //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced); } // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file // resets the stream when finished. 
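// Note on the key convention used throughout: a Junction is identified by
// (refid, left, right, antisense), where `left` is the last aligned base
// before the intron and `right` the first aligned base after it (see
// junctions_from_spliced_hit above: junc.left = j - 1, junc.right = j + length).
// A minimal caller sketch (hypothetical setup; HitStream/ReadTable construction
// is declared in bwt_map.h and not shown here):
//
//   JunctionSet junctions;
//   get_junctions_from_hits(hit_stream, read_table, junctions); // collect raw junctions from REF_SKIPs
//   filter_junctions(junctions, gtf_junctions);                 // accept/reject + shadow-junction knockout
//   print_junctions(junctions_out, junctions, ref_sequences);   // write the BED-style junctions track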
void get_junctions_from_hits(HitStream& hit_stream, ReadTable& it, JunctionSet& junctions) { HitsForRead curr_hit_group; hit_stream.next_read_hits(curr_hit_group); uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id); while(curr_obs_order != VMAXINT32) { for (size_t i = 0; i < curr_hit_group.hits.size(); ++i) { BowtieHit& bh = curr_hit_group.hits[i]; if (!bh.contiguous()) { junctions_from_alignment(bh, junctions); } hit_stream.next_read_hits(curr_hit_group); curr_obs_order = it.observation_order(curr_hit_group.insert_id); } } hit_stream.reset(); } void merge_with(JunctionSet& juncs, const JunctionSet& other_juncs) { for (JunctionSet::const_iterator junc = other_juncs.begin(); junc != other_juncs.end(); ++junc) { JunctionSet::iterator itr = juncs.find(junc->first); if (itr != juncs.end()) { JunctionStats& curr = itr->second; curr.merge_with(junc->second); } else { juncs[junc->first] = junc->second; } } } tophat-2.0.9/src/map2gtf.cpp0000644000175000017500000003073712163555303014421 0ustar toortoor/* * Author: Harold Pimentel * Contact: http://cs.berkeley.edu/~pimentel * Date: June 10, 2011 */ #include "map2gtf.h" #include "tokenize.h" void m2g_print_usage() { std::cerr << "Usage: map2gtf annotation.tlst " << "alignments.bam out_file.bam" << std::endl; } void tline_parserr(const std::string& tline, std::string add="") { std::cerr << "Error at parsing .tlst line " << add << ":" << std::endl << '\t' << tline << std::endl; exit(1); } GffTranscript::GffTranscript(const std::string& tline): exons(1), numID(-1), gffID(), refID(), strand(0) { std::istringstream f(tline); std::string token; std::vector tokens; while (std::getline(f, token, ' ')) { tokens.push_back(token); } if (tokens.size()!=4) { tline_parserr(tline); } numID=atoi(tokens[0].c_str()); gffID=tokens[1]; refID=tokens[2]; if (refID.length()<1) { tline_parserr(tline, "(refID empty)"); } strand=refID[refID.length()-1]; if (strand!='-' && strand!='+') { tline_parserr(tline, "(invalid strand)"); } refID.erase(refID.length()-1); f.clear(); //to reset the std::getline() iterator f.str(tokens[3]); while (std::getline(f, token, ',')) { size_t sp_pos=token.find('-'); if (sp_pos == std::string::npos) { std::string s("(invalid exon str: "); s+=token;s+=")"; tline_parserr(tline, s); } std::string s_start=token.substr(0,sp_pos); std::string s_end=token.substr(sp_pos+1); GSeg exon(atoi(s_start.c_str()), atoi(s_end.c_str())); if (exon.start==0 || exon.end==0 || exon.endexon.start) start=exon.start; if (end==0 || endheader; std::cout << "Reading the transcript data: " << gtf_fname_ << std::endl; //gtfReader_.init(gtf_fhandle_, true); //only recognizable transcripts will be loaded //gtfReader_.readAll(); std::string tline; while (std::getline(tlststream, tline)) { if (tline.length()>4) { GffTranscript* t=new GffTranscript(tline); transcripts.Add(t); tidx_to_t[t->numID]=t; } } tlststream.close(); std::cout << "Transcript data loaded." << std::endl; } Map2GTF::~Map2GTF() { std::cout << "map2gtf has completed. Cleaning up." << std::endl; /* if (gtf_fhandle_ != NULL && fclose(gtf_fhandle_)) { std::cerr << "Warning: Error closing annotation: " << gtf_fname_ << std::endl; } */ if (in_fhandle_ != NULL) { samclose(in_fhandle_); } std::cout << "Done. Thanks!" 
<< std::endl; } // bool Map2GTF::next_read_hits(vector& hits, size_t& num_hits, long& read_id) { if (hits.size() > num_hits) { bam1_t* temp = hits[num_hits]; hits[num_hits] = hits.front(); hits.front() = temp; num_hits = 1; char* name = bam1_qname(hits.front()); read_id = atol(name); } else num_hits = 0; while (true) { bam1_t* hit = NULL; if (num_hits >= hits.size()) hits.push_back(bam_init1()); hit = hits[num_hits]; if (samread(in_fhandle_, hit) <= 0) { for (size_t i = num_hits; i < hits.size(); ++i) bam_destroy1(hits[i]); hits.erase(hits.begin() + num_hits, hits.end()); break; } char* name = bam1_qname(hit); long temp_read_id = atol(name); if (num_hits == 0) { read_id = temp_read_id; } else if (read_id != temp_read_id) { break; } ++num_hits; } return num_hits > 0; } void Map2GTF::convert_coords(const std::string& out_fname, const std::string& sam_header) { samfile_t* out_sam_header_file = samopen(sam_header.c_str(), "r", 0); if (out_sam_header_file == NULL) std::cerr << "Error opening sam header: " << sam_header << std::endl; out_sam_header_ = out_sam_header_file->header; string index_out_fname = out_fname + ".index"; GBamWriter bam_writer(out_fname.c_str(), out_sam_header_, index_out_fname); ref_to_id_.clear(); for (int i = 0; i < out_sam_header_->n_targets; ++i) { ref_to_id_[out_sam_header_->target_name[i]] = i; } std::vector read_list; //GffObj* p_trans = NULL; GffTranscript* p_trans = NULL; HitsForRead hit_group; std::vector::iterator bh_it; std::vector::iterator bh_unique_it; BowtieHit bwt_hit; vector hits; size_t num_hits = 0; long read_id = 0; // a hit group is a set of reads with the same name while (next_read_hits(hits, num_hits, read_id)) { for (size_t i = 0; i < num_hits; ++i) { bam1_t* hit = hits[i]; const char* target_name = in_sam_header_->target_name[hit->core.tid]; int trans_idx = atoi(target_name); //p_trans = gtfReader_.gflst.Get(trans_idx); p_trans = tidx_to_t[trans_idx]; TranscriptomeHit converted_out(hit, p_trans); bool success = trans_to_genomic_coords(converted_out); if (success) read_list.push_back(converted_out); } // XXX: Fine for now... 
should come up with a more efficient way though // FIXME: Take frag length into consideration when filtering std::sort(read_list.begin(), read_list.end()); bh_unique_it = std::unique(read_list.begin(), read_list.end()); for (bh_it = read_list.begin(); bh_it != bh_unique_it; ++bh_it) { bam_writer.write(bh_it->hit, read_id); } read_list.clear(); } for (size_t i = 0; i < hits.size(); ++i) { bam_destroy1(hits[i]); } hits.clear(); } bool Map2GTF::trans_to_genomic_coords(TranscriptomeHit& hit) //out.trans must already have the corresponding GffObj* { // read start in genomic coords size_t read_start = 0; //GList& exon_list = hit.trans->exons; GVec& exon_list = hit.trans->exons; //GffExon* cur_exon; //GffExon* next_exon; GSeg* next_exon=NULL; int cur_pos; int match_length; int miss_length; int cur_intron_len = 0; int i = 0; static const int MAX_CIGARS = 1024; int cigars[MAX_CIGARS]; int num_cigars = 0; // TODO: Check this return value bool ret_val = get_read_start(exon_list, hit.hit->core.pos, read_start, i); if (!ret_val) { } cur_pos = read_start; for (int c = 0; c < hit.hit->core.n_cigar; ++c) { int opcode = bam1_cigar(hit.hit)[c] & BAM_CIGAR_MASK; int length = bam1_cigar(hit.hit)[c] >> BAM_CIGAR_SHIFT; if (opcode == BAM_CINS) { cigars[num_cigars] = opcode | (length << BAM_CIGAR_SHIFT); ++num_cigars; } if (opcode != BAM_CMATCH && opcode != BAM_CDEL) continue; int remaining_length = length; for (; i < exon_list.Count(); ++i) { GSeg& cur_exon = exon_list[i]; if (cur_pos >= (int)cur_exon.start && cur_pos + remaining_length - 1 <= (int)cur_exon.end) // read ends in this exon { cigars[num_cigars] = opcode | (remaining_length << BAM_CIGAR_SHIFT); ++num_cigars; cur_pos += remaining_length; break; } // shouldn't need the check... can switch to a regular "else" else if (cur_pos >= (int)cur_exon.start && cur_pos + remaining_length - 1 > (int)cur_exon.end)// read is spliced and overlaps this exon { // XXX: This should _never_ go out of range. 
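// When the remaining match runs past cur_exon.end, the operation is split:
// a match for the portion inside this exon, then a BAM_CREF_SKIP ('N') of
// next_exon->start - cur_exon.end - 1 bases for the intron, after which
// cur_pos should land exactly on the start of the next exon (asserted below).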
// get the max length that fits in this exon, go to next exon // cur_pos should be the next exon start // set assertion to check this // TODO: check this match_length = (int)cur_exon.end - cur_pos + 1; if (match_length > 0) { cigars[num_cigars] = opcode | (match_length << BAM_CIGAR_SHIFT); ++num_cigars; } // XXX: DEBUG if (i + 1 >= exon_list.Count()) { std::cerr << "trying to access: " << i + 2 << " when size is: " << exon_list.Count() << std::endl; print_trans(hit.trans, hit.hit, remaining_length, match_length, cur_pos, read_start); return false; } else next_exon = & (exon_list[i + 1]); // and this miss_length = next_exon->start - cur_exon.end - 1; cur_intron_len += miss_length; cigars[num_cigars] = BAM_CREF_SKIP | (miss_length << BAM_CIGAR_SHIFT); ++num_cigars; cur_pos += match_length + miss_length; remaining_length -= match_length; assert(cur_pos == (int)next_exon->start); } } } hit.hit->core.tid = ref_to_id_[hit.trans->getRefName()]; hit.hit->core.pos = read_start - 1; hit.hit->core.flag &= ~BAM_FSECONDARY; int old_n_cigar = hit.hit->core.n_cigar; if (num_cigars != old_n_cigar) { int data_len = hit.hit->data_len + 4 * (num_cigars - old_n_cigar); int m_data = max(data_len, hit.hit->m_data); kroundup32(m_data); uint8_t* data = (uint8_t*)calloc(m_data, 1); int copy1_len = (uint8_t*)bam1_cigar(hit.hit) - hit.hit->data; memcpy(data, hit.hit->data, copy1_len); int copy2_len = num_cigars * 4; memcpy(data + copy1_len, cigars, copy2_len); int copy3_len = hit.hit->data_len - copy1_len - (old_n_cigar * 4); memcpy(data + copy1_len + copy2_len, bam1_seq(hit.hit), copy3_len); hit.hit->core.n_cigar = num_cigars; free(hit.hit->data); hit.hit->data = data; hit.hit->data_len = data_len; hit.hit->m_data = m_data; } char strand = hit.trans->strand; uint8_t* ptr = bam_aux_get(hit.hit, "XS"); if (ptr) bam_aux_del(hit.hit, ptr); if (strand == '+' || strand == '-') bam_aux_append(hit.hit, "XS", 'A', 1, (uint8_t*)&strand); return true; } void print_trans(GffTranscript* trans, const bam1_t* in, size_t rem_len, size_t match_len, size_t cur_pos, size_t start_pos) { GSeg* p_exon; std::cerr << "\tCur_pos: " << cur_pos << " remaining: " << rem_len << " match_len: " << match_len << std::endl; std::cerr << "\tTranscript:\t" << trans->start << "\t" << trans->end << std::endl; for (int i = 0; i < trans->exons.Count(); ++i) { p_exon = & (trans->exons[i]); std::cerr << "\t\t" << p_exon->start << "\t" << p_exon->end << std::endl; } std::cerr << std::endl; std::cerr << "Read_id: " << bam1_qname(in) << std::endl; std::cerr << "\tgff_start: " << in->core.pos << "\tgenome_start: " << start_pos << std::endl; } // Returns false if not in this exon list //bool get_read_start(GList* exon_list, size_t gtf_start, bool get_read_start(GVec& exon_list, size_t gtf_start, size_t& genome_start, int& exon_idx) { //GffExon* cur_exon; const GSeg* cur_exon; size_t cur_intron_dist = 0; //size_t trans_start = exon_list->First()->start; size_t trans_start = exon_list[0].start; int trans_offset = 0; for (int i = 0; i < exon_list.Count(); ++i) { //cur_exon = exon_list->Get(i); cur_exon = & (exon_list[i]); trans_offset = trans_start + cur_intron_dist; if (gtf_start >= cur_exon->start - trans_offset && gtf_start <= cur_exon->end - trans_offset) { genome_start = gtf_start + trans_start + cur_intron_dist; exon_idx = i; return true; } else { if (i + 1 < exon_list.Count()) //cur_intron_dist += exon_list->Get(i + 1)->start - cur_exon->end - 1; cur_intron_dist += exon_list[i + 1].start - cur_exon->end - 1; else return false; } } return false; } int 
main(int argc, char *argv[]) { int parse_ret = parse_options(argc, argv, m2g_print_usage); if (parse_ret) return parse_ret; if (optind >= argc) { m2g_print_usage(); return 1; } std::string gtf_file(argv[optind++]); std::string in_fname(argv[optind++]); std::string out_fname(argv[optind++]); if (gtf_file == "" || in_fname == "" || out_fname == "") { m2g_print_usage(); exit(1); } Map2GTF gtfMapper(gtf_file, in_fname); gtfMapper.convert_coords(out_fname, sam_header); return 0; } tophat-2.0.9/src/GTFToFasta.cpp0000644000175000017500000001235512157116165014761 0ustar toortoor// // gtfToFasta.cpp // TopHat // // Created by Harold Pimentel on 10/26/11. // #include "GTFToFasta.h" std::string get_exonic_sequence(GffObj &p_trans, FastaRecord &rec, std::string& coords) { GList& exon_list = p_trans.exons; std::string exon_seq(""); size_t length; coords.clear(); std::stringstream ss; for (int i = 0; i < exon_list.Count(); ++i) { GffExon& cur_exon = *(exon_list.Get(i)); length = cur_exon.end - cur_exon.start + 1; exon_seq += rec.seq_.substr(cur_exon.start - 1, length); ss << ',' << cur_exon.start << '-' << cur_exon.end; } coords = ss.str().substr(1); return exon_seq; } GTFToFasta::GTFToFasta(std::string gtf_fname, std::string genome_fname) : genome_fhandle_(genome_fname.c_str(), false) { gtf_fname_ = gtf_fname; gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r"); if (gtf_fhandle_ == NULL) { std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_ << std::endl; exit(1); } std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl; gtfReader_.init(gtf_fhandle_, true); //load recognizable transcript features only gtfReader_.readAll(); genome_fname_ = genome_fname; // Make a map from the GffObj transcript_map(); } GTFToFasta::~GTFToFasta() { ContigTransMap::iterator it; for (it = contigTransMap_.begin(); it != contigTransMap_.end(); ++it) { delete it->second; } } void GTFToFasta::make_transcriptome(std::string out_fname) { std::vector *p_contig_vec; FastaReader fastaReader(genome_fname_); FastaWriter fastaWriter(out_fname); std::string tlst_fname(out_fname); tlst_fname.append(".tlst"); std::ofstream tlst(tlst_fname.c_str()); FastaRecord cur_contig; while (fastaReader.good()) { fastaReader.next(cur_contig); // If this contig isn't in the map, then there are no transcripts // associated with it. Skip it. 
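// contigTransMap_ (built by transcript_map()) maps a contig name to the
// indices, within gtfReader_.gflst, of the transcripts annotated on it;
// each index is used below with gflst.Get() to fetch the corresponding GffObj.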
if (contigTransMap_.find(cur_contig.id_) == contigTransMap_.end()) { continue; } p_contig_vec = contigTransMap_[cur_contig.id_]; FastaRecord out_rec; for (size_t i = 0; i < p_contig_vec->size(); ++i) { int trans_idx = (*p_contig_vec)[i]; GffObj *p_trans = gtfReader_.gflst.Get(trans_idx); //if (p_trans->isDiscarded() || p_trans->exons.Count()==0) continue; std::string coordstr; out_rec.seq_ = get_exonic_sequence(*p_trans, cur_contig, coordstr); if (out_rec.seq_.empty()) continue; std::stringstream ss; ss << trans_idx; out_rec.id_ = ss.str(); //ss.str(std::string()); //clear ss out_rec.desc_=p_trans->getID(); out_rec.desc_.push_back(' '); //ss << p_trans->getID() << ' ' << p_trans->getGSeqName() << p_trans->strand << '\t' << coordstr ; out_rec.desc_.append(cur_contig.id_); out_rec.desc_.push_back(p_trans->strand); out_rec.desc_.push_back(' '); out_rec.desc_.append(coordstr); //list of exon coordinates tlst << out_rec.id_ << ' ' << out_rec.desc_ << std::endl; //out_rec.desc_ = ""; //out_rec.desc_ = ss.str(); //out_rec.seq_ = exon_seq; fastaWriter.write(out_rec); } } tlst.close(); } void GTFToFasta::transcript_map() { GffObj *p_gffObj; const char *p_contig_name; std::vector *p_contig_vec; for (int i = 0; i < gtfReader_.gflst.Count(); ++i) { p_gffObj = gtfReader_.gflst.Get(i); if (p_gffObj->isDiscarded() || p_gffObj->exons.Count()==0) continue; p_contig_name = p_gffObj->getRefName(); std::string contig_name(p_contig_name); // Check if the current contig exists in the map // If it doesn't, add it if (contigTransMap_.find(contig_name) == contigTransMap_.end()) { p_contig_vec = new std::vector; contigTransMap_[contig_name] = p_contig_vec; } else { p_contig_vec = contigTransMap_[contig_name]; } p_contig_vec->push_back(i); } } void GTFToFasta::print_mapping() { std::ofstream out_file("out.names"); GffObj *p_gffObj; for (int i = 0; i < gtfReader_.gflst.Count(); ++i) { p_gffObj = gtfReader_.gflst.Get(i); if (p_gffObj->isDiscarded() || p_gffObj->exons.Count()==0) continue; out_file << i << "\t" << p_gffObj->getID() << std::endl; } out_file.close(); } void gtf2fasta_print_usage() { std::cerr << "Usage: gtf_to_fasta transcripts.gtf genome.fa out_file" << std::endl; } int main(int argc, char *argv[]) { int parse_ret = parse_options(argc, argv, gtf2fasta_print_usage); if (parse_ret) return parse_ret; if (optind >= argc) { gtf2fasta_print_usage(); return 1; } std::string gtf_fname(argv[optind++]); std::string genome_fname(argv[optind++]); std::string out_fname(argv[optind++]); GTFToFasta gtfToFasta(gtf_fname, genome_fname); gtfToFasta.make_transcriptome(out_fname); //gtfToFasta.print_mapping(); return 0; } tophat-2.0.9/src/GFaSeqGet.h0000644000175000017500000000777612157116165014305 0ustar toortoor#ifndef GFASEQGET_H #define GFASEQGET_H #include "GList.hh" #define MAX_FASUBSEQ 0x20000000 //max 512MB sequence data held in memory at a time class GSubSeq { public: uint sqstart; //1-based coord of subseq start on sequence uint sqlen; //length of subseq loaded char* sq; //actual subsequence data will be stored here // (with end-of-line characters removed) /*char* xseq; //the exposed pointer to the last requested subsequence start off_t xstart; //the coordinate start for the last requested subseq off_t xlen; //the last requested subseq len*/ GSubSeq() { sqstart=0; sqlen=0; sq=NULL; /* xseq=NULL; xstart=0; xlen=0;*/ } void forget() { //forget about pointer data, so we can reuse it sq=NULL; sqstart=0; sqlen=0; } ~GSubSeq() { GFREE(sq); } // genomic, 1-based coordinates: void setup(uint sstart, int slen, int sovl=0, 
int qfrom=0, int qto=0, uint maxseqlen=0); //check for overlap with previous window and realloc/extend appropriately //returns offset from seq that corresponds to sstart // the window will keep extending until MAX_FASUBSEQ is reached }; class GFaSeqGet { char* fname; FILE* fh; //raw offset in the file where the sequence actually starts: off_t fseqstart; uint seq_len; //total sequence length, if known (when created from GFastaIndex) int line_len; //length of each line of text int line_blen; //binary length of each line // = line_len + number of EOL character(s) GSubSeq* lastsub; void initialParse(off_t fofs=0, bool checkall=true); const char* loadsubseq(uint cstart, int& clen); void finit(const char* fn, off_t fofs, bool validate); public: GFaSeqGet() { fh=NULL; fseqstart=0; seq_len=0; line_len=0; line_blen=0; fname=NULL; lastsub=NULL; } GFaSeqGet(const char* fn, off_t fofs, bool validate=false) { seq_len=0; finit(fn,fofs,validate); } GFaSeqGet(const char* fn, bool validate=false) { seq_len=0; finit(fn,0,validate); } GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen); //constructor from GFastaIndex record GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false); ~GFaSeqGet() { if (fname!=NULL) { GFREE(fname); fclose(fh); } delete lastsub; } const char* subseq(uint cstart, int& clen); const char* getRange(uint cstart=1, uint cend=0) { if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ; if (cstart>cend) { Gswap(cstart, cend); } int clen=cend-cstart+1; //int rdlen=clen; return subseq(cstart, clen); } //caller is responsible for deallocating the return string char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false); //uncached, read and return allocated buffer //caller is responsible for deallocating the return string char* fetchSeq(int* retlen=NULL) { int clen=(seq_len>0) ? seq_len : MAX_FASUBSEQ; if (lastsub) { delete lastsub; lastsub=NULL; } subseq(1, clen); if (retlen) *retlen=clen; char* r=lastsub->sq; lastsub->forget(); if (clen>0) { r[clen]=0; } else { r=NULL; } return r; } void loadall(uint32 max_len=0) { //TODO: better read the whole sequence differently here - line by line //so when EOF or another '>' line is found, the reading stops! int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ); subseq(1, clen); } void load(uint cstart, uint cend) { //cache as much as possible if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request int clen=cend-cstart+1; subseq(cstart, clen); } int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; } int getseqlen() { return seq_len; } //known when loaded with GFastaIndex off_t getseqofs() { return fseqstart; } int getLineLen() { return line_len; } int getLineBLen() { return line_blen; } //reads a subsequence starting at genomic coordinate cstart (1-based) }; #endif tophat-2.0.9/src/gdna.h0000644000175000017500000000056412122334361013424 0ustar toortoor#ifndef GDNA_H #define GDNA_H #include "GBase.h" char ntComplement(char c); //in-place reverse complement of a nucleotide (sub)sequence char* reverseComplement(char* seq, int slen=0); bool gDnaInit(); byte gdna2bit(char* &nt, int n=4); //pack n bases into a byte (n can be 1..4) char g2bit2base(byte v2bit); //convert the 2-bit value into 'A', 'C', 'G' or 'T' #endif tophat-2.0.9/src/align_status.cpp0000755000175000017500000002001112122334361015533 0ustar toortoor/* * align_status.cpp * TopHat * * Created by Ryan Kelley on 11/09/2010. 
* */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "tokenize.h" #include "reads.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" #include "coverage.h" #include "align_status.h" using namespace std; AlignStatus::AlignStatus() { _alignment_score = std::numeric_limits::min(); } /** * Parse the cigar string of a BowtieHit in order to determine the alignment status. */ AlignStatus::AlignStatus(const BowtieHit& bh, const JunctionSet& gtf_junctions, const JunctionSet& junctions, const InsertionSet& insertions, const DeletionSet& deletions, const FusionSet& fusions, const Coverage& coverage) { // it seems like we need to work on this more // daehwan - it doesn't seem to work. const bool recalculate_indel_score = false; const vector& cigar = bh.cigar(); _alignment_score = bh.alignment_score(); const int read_len = bh.read_len(); const int min_extent = min(read_len / 4, 10); bool recalculate_score = !junctions.empty(); int j = bh.left(); int r = 0; RefID ref_id = bh.ref_id(); for (size_t c = 0 ; c < cigar.size(); ++c) { int opcode = cigar[c].opcode; int length = cigar[c].length; switch(opcode) { case REF_SKIP: case rEF_SKIP: { Junction junc; junc.refid = bh.ref_id(); if (opcode == REF_SKIP) { junc.left = j - 1; junc.right = j + length; j += length; } else { junc.right = j + 1; junc.left = j - length; j -= length; } if (recalculate_score) { junc.antisense = bh.antisense_splice(); if (gtf_junctions.find(junc) == gtf_junctions.end()) { JunctionSet::const_iterator itr = junctions.find(junc); if (itr == junctions.end()) { _alignment_score -= bowtie2_max_penalty; } else { const int left_cov = coverage.get_coverage(ref_id, junc.left + 1); const int right_cov = coverage.get_coverage(ref_id, junc.right - 1); const int avg_cov = (left_cov + right_cov) / 2; int penalty = bowtie2_max_penalty + 2; const int supporting_hits = itr->second.supporting_hits; const int left_extent = itr->second.left_extent; const int right_extent = itr->second.right_extent; float extent_penalty = 0.0f; if (left_extent < min_extent || right_extent < min_extent) extent_penalty = 0.5f; if (supporting_hits >= 5) penalty *= min((float)avg_cov/supporting_hits + extent_penalty, 1.f); // daehwan - check this out // add two points to prefer junction alignments to other that may span one side of split site. // penalty -= 2; if (itr->second.gtf_match) penalty -= bowtie2_max_penalty; int prev_alignment_score = _alignment_score; _alignment_score -= penalty; // daehwan - for debugging purposes if (bh.insert_id() == 325708 && false) { fprintf(stderr, "junc(%d:%d-%d) %d / (%d + %d) = %d => %d\n", junc.refid, junc.left, junc.right, itr->second.supporting_hits, left_cov, right_cov, prev_alignment_score, _alignment_score); fprintf(stderr, "\textent: %d-%d\n", left_extent, right_extent); } } } else { _alignment_score += 2; } } } break; case MATCH: case mATCH: { if (opcode == MATCH) j += length; else j -= length; r += length; } break; case DEL: case dEL: { Deletion deletion; deletion.refid = bh.ref_id(); if (opcode == DEL) { deletion.left = j - 1; deletion.right = j + length; j += length; } else { deletion.right = j + 1; deletion.left = j - length; j -= length; } if (recalculate_score && recalculate_indel_score) { DeletionSet::const_iterator itr = deletions.find(deletion); if (itr != deletions.end()) { const int left_cov = coverage.get_coverage(ref_id, deletion.left + 1); const int right_cov = (length == 1 ? 
left_cov : coverage.get_coverage(ref_id, deletion.right - 1)); const int avg_cov = (left_cov + right_cov) / 2; const int del_penalty = bowtie2_ref_gap_open + bowtie2_ref_gap_cont * length; int addition = del_penalty; const int supporting_hits = itr->second.supporting_hits; const int left_extent = itr->second.left_extent; const int right_extent = itr->second.right_extent; int penalty = 0; if (left_extent < min_extent || right_extent < min_extent) penalty = del_penalty * 0.5f; if (avg_cov > 0 && supporting_hits >= 5) addition *= min((float)supporting_hits/avg_cov, 1.f); else addition = 0; addition -= penalty; if (addition < 0) addition = 0; int prev_alignment_score = _alignment_score; _alignment_score += addition; _alignment_score = min(0, _alignment_score); // daehwan - for debug purposes if (bh.insert_id() == 325708 && false) { fprintf(stderr, "del(%d:%d-%d) %d / (%d + %d) = %d => %d\n", deletion.refid, deletion.left, deletion.right, supporting_hits, left_cov, right_cov, prev_alignment_score, _alignment_score); fprintf(stderr, "\textent: %d-%d\n", left_extent, right_extent); } } } } break; case INS: case iNS: { if (recalculate_score && recalculate_indel_score) { string seq = bh.seq().substr(r, length); Insertion ins(ref_id, j, seq); InsertionSet::const_iterator itr = insertions.find(ins); if (itr != insertions.end()) { const int supporting_hits = itr->second.supporting_hits; const int left_extent = itr->second.left_extent; const int right_extent = itr->second.right_extent; const int left_cov = coverage.get_coverage(ref_id, j); const int right_cov = coverage.get_coverage(ref_id, j + (opcode == INS ? 1 : -1)); const int avg_cov = (left_cov + right_cov) / 2 - supporting_hits; const int ins_penalty = bowtie2_read_gap_open + bowtie2_read_gap_cont * length; int addition = ins_penalty; int extent_penalty = 0.0f; if (left_extent < min_extent || right_extent < min_extent) extent_penalty = ins_penalty * 0.5f; if (avg_cov > 0 && supporting_hits >= 5) addition *= min((float)supporting_hits/avg_cov, 1.f); else addition = 0; addition -= extent_penalty; if (addition < 0) addition = 0; // int prev_alignment_score = _alignment_score; _alignment_score += addition; _alignment_score = min(0, _alignment_score); /* fprintf(stderr, "ins(%d:%d:%s) %d / (%d - %d) = %d => %d (%d)\n", ref_id, ins.left, seq.c_str(), supporting_hits, avg_cov, supporting_hits, prev_alignment_score, _alignment_score, ins_penalty); fprintf(stderr, "\textent: %d-%d\n", left_extent, right_extent); */ } } r += length; } break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: // daehwan - implement this later j = length; ref_id = bh.ref_id2(); break; default: break; } } } /** * Establish an ordering on alignments. * Prefer aligned reads over unaligned reads * Within the space of aligned reads * prefer splice-free reads over splice reads, and * indel-free reads over indel reads. * If a read can either be indel-free or splice-free, * prefer the indel-free alignment */ bool AlignStatus::operator<(const AlignStatus& rhs) const { if (_alignment_score != rhs._alignment_score) return _alignment_score > rhs._alignment_score; return false; } /** * Alignments are only equal if all fields are identical. 
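* (Like the ordering above, equality reduces to comparing _alignment_score;
* the constructor adjusts that score for novel splice junctions, and for
* indels when the hard-coded recalculate_indel_score flag is enabled, using
* junction support, local coverage and anchor extents.)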
*/ bool AlignStatus::operator==(const AlignStatus& rhs) const { return _alignment_score == rhs._alignment_score; } bool AlignStatus::operator!=(const AlignStatus& rhs) const { return _alignment_score != rhs._alignment_score; } tophat-2.0.9/src/sra_to_solid0000755000175000017500000000131412122334360014742 0ustar toortoor#!/usr/bin/env python """ sra_to_solid.py """ import sys use_message = ''' convert SOLiD sequences downloaded from SRA FTP (not via the web interface) to a format TopHat processes. the script simply removes one primer quality value '!' from the sequences. Usage: sra_to_solid input.fastq > output.fastq ''' if __name__ == "__main__": if len(sys.argv) == 2: input_file = open(sys.argv[-1], 'r') expect_qual = 0 for line in input_file: line = line.rstrip('\n') if expect_qual % 4 == 3: line = line[1:] print line expect_qual = (expect_qual + 1) % 4 else: print use_message; tophat-2.0.9/src/bed_to_juncs0000755000175000017500000000343112122334361014722 0ustar toortoor#!/usr/bin/env python # encoding: utf-8 """ bed_to_juncs.py Created by Cole Trapnell on 2008-09-19. Copyright (c) 2008 Cole Trapnell. All rights reserved. """ import sys import getopt help_message = ''' This script converts junctions in BED format produced by TopHat to the internal .juncs format for re-use with future runs. Usage: bed_to_juncs.py < junctions.bed ''' class Usage(Exception): def __init__(self, msg): self.msg = msg def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "h", ["help"]) except getopt.error, msg: raise Usage(msg) for option, value in opts: if option in ("-h", "--help"): raise Usage(help_message) line_num = 0 for line in sys.stdin.readlines(): line = line.strip() cols = line.split() line_num += 1 if len(cols) < 12: print >> sys.stderr, "Warning: malformed line %d, missing columns" % line_num print >> sys.stderr, "\t", line continue chromosome = cols[0] orientation = cols[5] block_starts = [int(x) for x in cols[11].split(",")] block_sizes = [int(x) for x in cols[10].split(",")] left_pos = int(cols[1]) + block_starts[0] + block_sizes[0] - 1 right_pos = int(cols[1]) + block_starts[1] print "%s\t%d\t%d\t%s" % (chromosome, left_pos, right_pos, orientation) except Usage, err: print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) print >> sys.stderr, "\t for help use --help" return 2 if __name__ == "__main__": sys.exit(main()) tophat-2.0.9/src/common.h0000644000175000017500000004532312157116165014015 0ustar toortoor#ifndef COMMON_H #define COMMON_H /* * common.h * TopHat * * Created by Cole Trapnell on 11/26/08. * Copyright 2008 Cole Trapnell. All rights reserved. * */ #include #include #include #include #include #include #include #include "bam/bam.h" #include "bam/sam.h" #define MAX_READ_LEN 1024 //for fastq and bam indexing by read# (multi-threading) #define INDEX_REC_COUNT 1000 #define VMAXINT32 0xFFFFFFFF #ifdef MEM_DEBUG void process_mem_usage(double& vm_usage, double& resident_set); void print_mem_usage(); #endif extern bool bowtie2; extern int bowtie2_min_score; extern int bowtie2_max_penalty; extern int bowtie2_min_penalty; extern int bowtie2_penalty_for_N; extern int bowtie2_read_gap_open; extern int bowtie2_read_gap_cont; extern int bowtie2_ref_gap_open; extern int bowtie2_ref_gap_cont; //geo - enforcing an absolute score filter extern int bowtie2_scoreflt; // daehwan - temporary for parallelization extern bool parallel; /* * Maximum allowable length of an * an insertion. 
Used mainly in * segment_juncs.cpp */ extern unsigned int max_insertion_length; /* * Maximum allowable length of a * deletion. Used mainly in segment_juncs.cpp * and long_spanning_reads.cpp */ extern unsigned int max_deletion_length; extern int inner_dist_mean; extern int inner_dist_std_dev; extern int max_mate_inner_dist; extern int min_anchor_len; extern int min_report_intron_length; extern int max_report_intron_length; extern int min_closure_intron_length; extern int max_closure_intron_length; extern int min_coverage_intron_length; extern int max_coverage_intron_length; extern int min_segment_intron_length; extern int max_segment_intron_length; extern uint32_t min_closure_exon_length; extern int island_extension; extern int num_threads; extern int segment_length; // the read segment length used by the pipeline extern int segment_mismatches; extern int read_mismatches; extern int read_gap_length; extern int read_edit_dist; extern int read_realign_edit_dist; extern int max_splice_mismatches; enum ReadFormat {FASTA, FASTQ}; extern ReadFormat reads_format; extern bool verbose; extern unsigned int max_multihits; extern bool suppress_hits; extern unsigned int max_seg_multihits; extern bool no_closure_search; extern bool no_coverage_search; extern bool no_microexon_search; extern bool butterfly_search; extern float min_isoform_fraction; extern std::string output_dir; extern std::string gff_file; extern std::string gene_filter; extern std::string ium_reads; extern std::string sam_header; extern std::string sam_readgroup_id; extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2) extern std::string samtools_path; //path to samtools executable extern std::string std_outfile; //main output file that some modules can use instead of stdout extern std::string aux_outfile; //auxiliary output file name extern std::string index_outfile; //index output file name extern bool solexa_quals; extern bool phred64_quals; extern bool quals; extern bool integer_quals; extern bool color; extern std::string gtf_juncs; extern bool report_secondary_alignments; extern bool report_discordant_pair_alignments; extern bool report_mixed_alignments; //prep_reads only: --flt-reads // filter out reads if their numeric ID is in this fastq file // OR if flt_mappings was given too, filter out reads if their ID // is NOT in this fastq file extern std::string flt_reads; //prep_reads special usage: filter out mappings whose read ID //is NOT found in the flt_reads file, and write them into // aux_outfile; also reverses the flt_reads filter itself extern std::string flt_mappings; //for on-the-fly search during pre-filtering of PE reads, prep_reads will take both mates as input //but output only one side to stdout (into Bowtie); 0 = left, 1 = right, 2 = both extern int flt_side; extern bool fusion_search; extern size_t fusion_anchor_length; extern size_t fusion_min_dist; extern size_t fusion_read_mismatches; extern size_t fusion_multireads; extern size_t fusion_multipairs; extern std::vector fusion_ignore_chromosomes; extern bool fusion_do_not_resolve_conflicts; enum eLIBRARY_TYPE { LIBRARY_TYPE_NONE = 0, FR_UNSTRANDED, FR_FIRSTSTRAND, FR_SECONDSTRAND, FF_UNSTRANDED, FF_FIRSTSTRAND, FF_SECONDSTRAND, NUM_LIBRARY_TYPE }; extern eLIBRARY_TYPE library_type; std::string getFext(const std::string& s); //returns file extension converted to lowercase std::string getFdir(const std::string& s); //returns file extension converted to lowercase bool str_endsWith(std::string& str, const char* suffix); 
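// Illustrative use of a few of the helpers declared above (a sketch only;
// the definitions live in common.cpp and are not shown here, and the file
// name is hypothetical):
//
//   std::string fn("Reads.FASTQ.GZ");
//   std::string ext = getFext(fn); // documented above to return the lowercased extension, i.e. "gz"
//   FZPipe fz(fn, true);           // reader that guesses a decompressor (see the FZPipe class below)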
void str_appendInt(std::string& str, int64_t v); void str_appendUInt(std::string& str, uint64_t v); std::string str_replace(const std::string& str, const std::string& oldStr, const std::string& newStr); FILE* fzOpen(std::string& fname, const char* mode); int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()); int parse_options(int argc, char** argv, void (*print_usage)()); void err_exit(const char* format,...); // exit with an error char* get_token(char** str, const char* delims); std::string guess_packer(const std::string& fname, bool use_all_cpus=false); std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false); void checkSamHeader(); void writeSamHeader(FILE* fout); class FZPipe { public: union { FILE* file; samfile_t* bam_file; }; std::string filename; std::string pipecmd; bool is_bam; FZPipe(const std::string& fname, bool guess=false):filename(fname),pipecmd() { //this constructor is only to use FZPipe as a READER //also accepts/recognizes BAM files (without needing pipes) //for which it only stores the filename openRead(fname, guess); } FILE* openRead(const std::string& fname, bool guess=false) { filename=fname; pipecmd=""; is_bam=false; if (getFext(fname) == "bam") { //file=(FILE*)this; //just to be non-NULL; is_bam=true; bam_file=samopen(filename.c_str(), "rb", 0); return file; } if (guess) { pipecmd=guess_packer(fname); if (!pipecmd.empty()) pipecmd.append(" -cd"); } else { pipecmd=getUnpackCmd(fname); } if (pipecmd.empty()) { //this->openRead(fname.c_str()); file=fopen(filename.c_str(), "r"); return file; } this->openRead(fname.c_str(), pipecmd); return file; } FILE* openRead(const char* fname, std::string& popencmd); FILE* openRead(const char* fname) { std::string s(fname); return openRead(s); } FILE* openRead(const std::string fname, std::string& popencmd) { return this->openRead(fname.c_str(),popencmd); } FZPipe():filename(),pipecmd() { is_bam=false; file=NULL; } FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) { //open as a compressed file reader if (pipecmd.empty()) { this->openRead(fname); return; } is_bam=false; file=NULL; this->openRead(fname.c_str(), pipecmd); } void close() { if (file!=NULL) { if (is_bam) { samclose(bam_file); bam_file=NULL; } else { if (pipecmd.empty()) fclose(file); else pclose(file); } file=NULL; } } FILE* openWrite(const char* fname, std::string& popencmd); FILE* openWrite(const char* fname); void rewind(); void seek(int64_t foffset) { if (is_bam) { bgzf_seek(bam_file->x.bam, foffset, SEEK_SET); } else { fseek(this->file, foffset, SEEK_SET); } } }; void err_die(const char* format,...); void warn_msg(const char* format,...); class GBamRecord { bam1_t* b; // b->data has the following strings concatenated: // qname (including the terminal \0) // +cigar (each event encoded on 32 bits) // +seq (4bit-encoded) // +qual // +aux bool novel; char tag[2]; uint8_t abuf[1024]; public: GBamRecord(bam1_t* from_b=NULL) { if (from_b==NULL) { b=bam_init1(); novel=true; } else { b=from_b; novel=false; } } void clear() { if (novel) { bam_destroy1(b); } novel=true; b=bam_init1(); } ~GBamRecord() { if (novel) { bam_destroy1(b); } } void parse_error(const char* s) { err_die("BAM parsing error: %s\n", s); } bam1_t* get_b() { return b; } void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available int32_t isize=0) { //mate info for current record b->core.mtid=mtid; b->core.mpos=m0pos; // should be -1 if '*' b->core.isize=isize; //should be 0 if not available } void set_flags(uint16_t flags) 
{ b->core.flag=flags; } void set_flag(uint16_t flag) { //use BAM_F* constants b->core.flag |= flag; } void unset_flag(uint16_t flag) { //use BAM_F* constants b->core.flag &= ~flag; } //creates a new record from 1-based alignment coordinate //quals should be given as Phred33 //Warning: pos and mate_pos must be given 1-based! GBamRecord(const char* qname, int32_t gseq_tid, int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL); GBamRecord(const char* qname, int32_t flags, int32_t g_tid, int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, int insert_size, const char* qseq, const char* quals=NULL, const std::vector* aux_strings=NULL); void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format void add_quals(const char* quals); //quality values string in Phred33 format void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1") void add_aux(const char tag[2], char atype, int len, uint8_t *data) { //IMPORTANT: strings (Z,H) should include the terminal \0 int addz=0; if ((atype=='Z' || atype=='H') && data[len-1]!=0) { addz=1; } int ori_len = b->data_len; b->data_len += 3 + len + addz; b->l_aux += 3 + len + addz; if (b->m_data < b->data_len) { b->m_data = b->data_len; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; b->data[ori_len + 2] = atype; if (addz) { b->data[ori_len+len+3]=0; } memcpy(b->data + ori_len + 3, data, len); } //--reading back aux tags: uint8_t* find_tag(const char tag[2]); //returns pointer at the beginning of tag data, or NULL if tag not found //the returned pointer can then be used by bam_aux2*() functions std::string tag_str(const char tag[2]); //return tag value for tag type 'Z' int tag_int(const char tag[2]); //return numeric value of tag (for numeric types) char tag_char(const char tag[2]); //return char value of tag (for type 'A') char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag //-- std::string qualities(); //return quality string, as is (ignores BAM_FREVERSE) std::string sequence(); //return read sequence as is (ignores BAM_FREVERSE) std::string seqData(std::string* readquals=NULL); //return seq and qv, reversed if BAM_FREVERSE }; class GBamWriter { samfile_t* bam_file; bam_header_t* bam_header; FILE* findex; uint64_t wcount; uint64_t idxcount; int64_t idx_last_id; bool external_header; public: void create(const char* fname, bool uncompressed=false) { findex=NULL; wcount=0; idxcount=0; idx_last_id=0; external_header=false; if (bam_header==NULL) err_die("Error: no bam_header for GBamWriter::create()!\n"); if (uncompressed) { bam_file=samopen(fname, "wbu", bam_header); } else { bam_file=samopen(fname, "wb", bam_header); } if (bam_file==NULL) err_die("Error: could not create BAM file %s!\n",fname); //do we need to call bam_header_write() ? 
} void create(const char* fname, std::string& idxfile) { findex=NULL; wcount=0; idxcount=0; idx_last_id=0; external_header=false; if (bam_header==NULL) err_die("Error: no bam_header for GBamWriter::create()!\n"); bam_file=samopen(fname, "wb", bam_header); if (bam_file==NULL) err_die("Error: could not create BAM file %s!\n",fname); if (!idxfile.empty()) { findex = fopen(idxfile.c_str(), "w"); if (findex == NULL) err_die("Error: cannot create file %s\n", idxfile.c_str()); } } void create(const char* fname, bam_header_t* bh, bool uncompressed=false) { findex=NULL; wcount=0; idxcount=0; idx_last_id=0; external_header=false; bam_header=bh; create(fname, uncompressed); } GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) { create(fname, bh, uncompressed); external_header=true; } GBamWriter(const char* fname, bam_header_t* bh, std::string& idxfile) { bam_header=bh; create(fname, idxfile); external_header=true; } GBamWriter(std::string& fname, std::string& idxfile) { //create BAM with empty header external_header=false; bam_header=bam_header_init(); create(fname.c_str()); } GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) { tamFile samf_in=sam_open(samfname); if (samf_in==NULL) err_die("Error: could not open SAM file %s\n", samfname); bam_header=sam_header_read(samf_in); if (bam_header==NULL) err_die("Error: could not read SAM header from %s!\n",samfname); sam_close(samf_in); create(fname, uncompressed); } GBamWriter(const char* fname, const char* samfname, std::string idxfile) { tamFile samf_in=sam_open(samfname); if (samf_in==NULL) err_die("Error: could not open SAM file %s\n", samfname); bam_header=sam_header_read(samf_in); if (bam_header==NULL) err_die("Error: could not read SAM header from %s!\n",samfname); sam_close(samf_in); create(fname, idxfile); } ~GBamWriter() { samclose(bam_file); if (bam_header && !external_header) bam_header_destroy(bam_header); if (findex != NULL) fclose(findex); } bam_header_t* get_header() { return bam_header; } int32_t get_tid(const char *seq_name) { if (bam_header==NULL) err_die("Error: missing SAM header (get_tid())\n"); return bam_get_tid(bam_header, seq_name); } //just a convenience function for creating a new record, but it's NOT written //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based) GBamRecord* new_record(const char* qname, const char* gseqname, int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) { if (gseqname==NULL || strcmp(gseqname, "*")==0) { //probably an unmapped read //if (pos>0) err_die("Error: genomic position given for unmapped read!\n"); return (new GBamRecord(qname, -1, 0, false, qseq, cigar, qual)); } else { int32_t gseq_tid=get_tid(gseqname); if (gseq_tid < 0) { if (bam_header->n_targets == 0) { err_die("Error: missing/invalid SAM header\n"); } else fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n", gseqname); } return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual)); } } GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname, int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos, int insert_size, const char* qseq, const char* quals=NULL, const std::vector* aux_strings=NULL) { int32_t gseq_tid=get_tid(gseqname); if (gseq_tid < 0 && strcmp(gseqname, "*")) { if (bam_header->n_targets == 0) { err_die("Error: missing/invalid SAM header\n"); } else fprintf(stderr, "Warning: reference '%s' not found in header, will consider it 
'*'.\n", gseqname); } int32_t mgseq_tid=-1; if (mgseqname!=NULL) { if (strcmp(mgseqname, "=")==0) { mgseq_tid=gseq_tid; } else { mgseq_tid=get_tid(mgseqname); if (mgseq_tid < 0 && strcmp(mgseqname, "*")) { fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n", mgseqname); } } } return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar, mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings)); } void write(GBamRecord* brec) { if (brec!=NULL) { if (findex) { bam1_t* b = brec->get_b(); char* name = bam1_qname(b); long read_id = atol(name); write(b, read_id); } else samwrite(this->bam_file, brec->get_b()); wcount++; } } void write(bam1_t* b, int64_t read_id=0) { int64_t pre_block_addr=0; //offsets after last write() int pre_block_offs=0; //but before this write() int64_t pre_pos=0; bool write_index=false; if (findex && read_id) { if (idxcount >= INDEX_REC_COUNT && read_id != idx_last_id) { pre_pos = this->tell(); pre_block_offs = pre_pos & 0xFFFF; pre_block_addr = (pre_pos >> 16) & 0xFFFFFFFFFFFFLL; write_index=true; } idx_last_id=read_id; idxcount++; } samwrite(this->bam_file, b); wcount++; if (write_index) { int64_t offset = this->tell(); int post_block_offs = offset & 0xFFFF; //offsets after this write() int64_t post_block_addr = (offset >> 16) & 0xFFFFFFFFFFFFLL; int data_len = b->data_len+BAM_CORE_SIZE; if (post_block_addr != pre_block_addr && post_block_offs>=data_len) //all data written in this block //WARNING: this check fails for very large BAM records (> 64K) { pre_pos = post_block_addr << 16; } fprintf(findex, "%ld\t%ld\n", (long)read_id, (long)pre_pos); idxcount = 0; } } int64_t tell() { return bam_tell(this->bam_file->x.bam); } int64_t writtenCount() { return wcount; } void flush() { bgzf_flush(this->bam_file->x.bam); } void seek(int64_t offset) { bam_seek(this->bam_file->x.bam, offset, SEEK_SET); } }; #endif tophat-2.0.9/src/junctions.h0000644000175000017500000000672512122334360014533 0ustar toortoor#ifndef JUNCTIONS_H #define JUNCTIONS_H /* * junctions.h * TopHat * * Created by Cole Trapnell on 11/22/08. * Copyright 2008 Cole Trapnell. All rights reserved. 
* */ #include #include #include #include #include #include #include #include #include #include #include "bwt_map.h" using namespace std; struct Junction { Junction (uint32_t ref, uint32_t l, uint32_t r, bool a, int sc = 0) : refid(ref), left(l), right(r), antisense(a), skip_count(sc){} Junction() : refid(0), left(0), right(0), antisense(false), skip_count(0) {} uint32_t refid; uint32_t left; uint32_t right; bool antisense; int skip_count; bool operator<(const Junction& rhs) const { if (refid < rhs.refid) return true; else if (refid > rhs.refid) return false; if (left < rhs.left) return true; else if (left > rhs.left) return false; if (right < rhs.right) return true; else if (right > rhs.right) return false; return antisense < rhs.antisense; } bool operator==(const Junction& rhs) const { return (refid == rhs.refid && left == rhs.left && right == rhs.right && antisense == rhs.antisense); } #if !NDEBUG bool valid() const { return refid != VMAXINT32 && left < right && (left != right); } #endif }; struct skip_count_lt { bool operator()(const Junction& lhs, const Junction& rhs) { if (lhs.skip_count != rhs.skip_count) return lhs.skip_count < rhs.skip_count; return lhs < rhs; } }; struct JunctionStats { JunctionStats() : left_extent(0), right_extent(0), left_exon_doc(0), right_exon_doc(0), min_splice_mms(0), supporting_hits(0), gtf_match(false), accepted(false) {} JunctionStats& merge_with(const JunctionStats& other) { if (this == &other) return *this; left_extent = max(left_extent, other.left_extent); right_extent = max(right_extent, other.right_extent); min_splice_mms = min(min_splice_mms, other.min_splice_mms); supporting_hits += other.supporting_hits; gtf_match |= other.gtf_match; accepted |= other.accepted; return *this; } int left_extent; int right_extent; int left_exon_doc; int right_exon_doc; int min_splice_mms; int supporting_hits; bool gtf_match; bool accepted; }; typedef std::map JunctionSet; // This routine DOES NOT set the real refid! pair junction_from_spliced_hit(const BowtieHit& h); void print_junction(FILE* junctions_out, const string& name, const Junction& j, const JunctionStats& s, uint32_t junc_id); void junctions_from_alignment(const BowtieHit& spliced_alignment, JunctionSet& junctions); void accept_valid_junctions(JunctionSet& junctions, const uint32_t refid, const vector& DoC, double min_isoform_fraction); void accept_all_junctions(JunctionSet& junctions, const uint32_t refid); void print_junctions(FILE* junctions_out, const JunctionSet& junctions, RefSequenceTable& ref_sequences); bool accept_if_valid(const Junction& j, JunctionStats& s); void filter_junctions(JunctionSet& junctions, const JunctionSet& gtf_junctions); void get_junctions_from_hits(HitStream& hit_stream, ReadTable& it, JunctionSet& junctions); void merge_with(JunctionSet& juncs, const JunctionSet& other_juncs); #endif tophat-2.0.9/src/gff.cpp0000644000175000017500000022161212157116165013617 0ustar toortoor#include "gff.h" //GffNames* GffReader::names=NULL; GffNames* GffObj::names=NULL; //global set of feature names, attribute names etc. // -- common for all GffObjs in current application! 
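// GffObj::names is a single, reference-counted registry of feature, attribute
// and sequence names shared by every GffObj in the process; gffnames_ref()
// and gffnames_unref() below adjust its numrefs count and delete it when the
// count reaches zero.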
const uint GFF_MAX_LOCUS = 7000000; //longest known gene in human is ~2.2M, UCSC claims a gene for mouse of ~ 3.1 M const uint GFF_MAX_EXON = 30000; //longest known exon in human is ~11K const uint GFF_MAX_INTRON= 6000000; //Ensembl shows a >5MB human intron bool gff_show_warnings = false; //global setting, set by GffReader->showWarnings() const int gff_fid_mRNA=0; const int gff_fid_transcript=1; const int gff_fid_exon=2; const uint gfo_flag_HAS_ERRORS = 0x00000001; const uint gfo_flag_CHILDREN_PROMOTED= 0x00000002; const uint gfo_flag_IS_GENE = 0x00000004; const uint gfo_flag_IS_TRANSCRIPT = 0x00000008; const uint gfo_flag_HAS_GFF_ID = 0x00000010; //found GFF3 feature line with its own ID const uint gfo_flag_BY_EXON = 0x00000020; //created by subfeature (exon) directly const uint gfo_flag_DISCARDED = 0x00000100; const uint gfo_flag_LST_KEEP = 0x00000200; const uint gfo_flag_LEVEL_MSK = 0x00FF0000; const byte gfo_flagShift_LEVEL = 16; void gffnames_ref(GffNames* &n) { if (n==NULL) n=new GffNames(); n->numrefs++; } void gffnames_unref(GffNames* &n) { if (n==NULL) GError("Error: attempt to remove reference to null GffNames object!\n"); n->numrefs--; if (n->numrefs==0) { delete n; n=NULL; } } const char* strExonType(char xtype) { static const char* extbl[7]={"None", "start_codon", "stop_codon", "CDS", "UTR", "CDS_UTR", "exon"}; if (xtype>0 && xtype<7) return extbl[(int)xtype]; else return "NULL"; } int gfo_cmpByLoc(const pointer p1, const pointer p2) { GffObj& g1=*((GffObj*)p1); GffObj& g2=*((GffObj*)p2); if (g1.gseq_id==g2.gseq_id) { if (g1.start!=g2.start) return (int)(g1.start-g2.start); else if (g1.getLevel()!=g2.getLevel()) return (int)(g1.getLevel()-g2.getLevel()); else if (g1.end!=g2.end) return (int)(g1.end-g2.end); else return strcmp(g1.getID(), g2.getID()); } else return (int)(g1.gseq_id-g2.gseq_id); } char* GffLine::extractAttr(const char* attr, bool caseStrict, bool enforce_GTF2) { //parse a key attribute and remove it from the info string //(only works for attributes that have values following them after ' ' or '=') static const char GTF2_ERR[]="Error parsing attribute %s ('\"' required) at GTF line:\n%s\n"; int attrlen=strlen(attr); char cend=attr[attrlen-1]; //char* pos = (caseStrict) ? strstr(info, attr) : strifind(info, attr); //must make sure attr is not found in quoted text char* pos=info; char prevch=0; bool in_str=false; bool notfound=true; int (*strcmpfn)(const char*, const char*, int) = caseStrict ? 
Gstrcmp : Gstricmp; while (notfound && *pos) { char ch=*pos; if (ch=='"') { in_str=!in_str; pos++; prevch=ch; continue; } if (!in_str && (prevch==0 || prevch==' ' || prevch == ';') && strcmpfn(attr, pos, attrlen)==0) { //attr match found //check for word boundary on right char* epos=pos+attrlen; if (cend=='=' || cend==' ' || *epos==0 || *epos==' ') { notfound=false; break; } //not a perfect match, move on pos=epos; prevch=*(pos-1); continue; } //not a match or in_str prevch=ch; pos++; } if (notfound) return NULL; char* vp=pos+attrlen; while (*vp==' ') vp++; if (*vp==';' || *vp==0) GError("Error parsing value of GFF attribute \"%s\", line:\n%s\n", attr, dupline); bool dq_enclosed=false; //value string enclosed by double quotes if (*vp=='"') { dq_enclosed=true; vp++; } if (enforce_GTF2 && !dq_enclosed) GError(GTF2_ERR,attr, dupline); char* vend=vp; if (dq_enclosed) { while (*vend!='"' && *vend!=';' && *vend!=0) vend++; } else { while (*vend!=';' && *vend!=0) vend++; } if (enforce_GTF2 && *vend!='"') GError(GTF2_ERR, attr, dupline); char *r=Gstrdup(vp, vend-1); //-- now remove this attribute from the info string while (*vend!=0 && (*vend=='"' || *vend==';' || *vend==' ')) vend++; if (*vend==0) vend--; for (char *src=vend, *dest=pos;;src++,dest++) { *dest=*src; if (*src==0) break; } return r; } static char fnamelc[128]; GffLine::GffLine(GffReader* reader, const char* l) { llen=strlen(l); GMALLOC(line,llen+1); memcpy(line, l, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l, llen+1); skip=true; gseqname=NULL; track=NULL; ftype=NULL; info=NULL; _parents=NULL; _parents_len=0; num_parents=0; parents=NULL; is_gff3=false; is_cds=false; is_transcript=false; is_exon=false; is_gene=false; exontype=0; gene_id=NULL; gene_name=NULL; qstart=0; qend=0; qlen=0; ID=NULL; char* t[9]; int i=0; int tidx=1; t[0]=line; while (line[i]!=0) { if (line[i]=='\t') { line[i]=0; t[tidx]=line+i+1; tidx++; if (tidx>8) break; } i++; } if (tidx<8) { // ignore non-GFF lines // GMessage("Warning: error parsing GFF/GTF line:\n%s\n", l); return; } gseqname=t[0]; track=t[1]; ftype=t[2]; info=t[8]; char* p=t[3]; if (!parseUInt(p,fstart)) { //chromosome_band entries in Flybase GMessage("Warning: invalid start coordinate at line:\n%s\n",l); return; } p=t[4]; if (!parseUInt(p,fend)) { GMessage("Warning: invalid end coordinate at line:\n%s\n",l); return; } if (fend=fend, always p=t[5]; if (p[0]=='.' 
&& p[1]==0) { score=0; } else { if (!parseDouble(p,score)) GError("Error parsing feature score from GFF line:\n%s\n",l); } strand=*t[6]; if (strand!='+' && strand!='-' && strand!='.') GError("Error parsing strand (%c) from GFF line:\n%s\n",strand,l); phase=*t[7]; // must be '.', '0', '1' or '2' ID=NULL; // exon/CDS/mrna filter strncpy(fnamelc, ftype, 127); fnamelc[127]=0; strlower(fnamelc); //convert to lower case bool is_t_data=false; if (strstr(fnamelc, "utr")!=NULL) { exontype=exgffUTR; is_exon=true; is_t_data=true; } else if (endsWith(fnamelc, "exon")) { exontype=exgffExon; is_exon=true; is_t_data=true; } else if (strstr(fnamelc, "stop") && (strstr(fnamelc, "codon") || strstr(fnamelc, "cds"))){ exontype=exgffStop; is_cds=true; //though some place it outside the last CDS segment is_t_data=true; } else if (strstr(fnamelc, "start") && ((strstr(fnamelc, "codon")!=NULL) || strstr(fnamelc, "cds")!=NULL)){ exontype=exgffStart; is_cds=true; is_t_data=true; } else if (strcmp(fnamelc, "cds")==0) { exontype=exgffCDS; is_cds=true; is_t_data=true; } else if (startsWith(fnamelc, "intron") || endsWith(fnamelc, "intron")) { exontype=exgffIntron; } else if (endsWith(fnamelc, "gene") || startsWith(fnamelc, "gene")) { is_gene=true; is_t_data=true; //because its name will be attached to parented transcripts } else if (endsWith(fnamelc,"rna") || endsWith(fnamelc,"transcript")) { is_transcript=true; is_t_data=true; } if (reader->transcriptsOnly && !is_t_data) { char* id=extractAttr("ID="); if (id==NULL) id=extractAttr("transcript_id"); //GMessage("Discarding non-transcript line:\n%s\n",l); if (id!=NULL) { reader->discarded_ids.Add(id, new int(1)); GFREE(id); } return; //skip this line, unwanted feature name } ID=extractAttr("ID=",true); char* Parent=extractAttr("Parent=",true); is_gff3=(ID!=NULL || Parent!=NULL); if (is_gff3) { //parse as GFF3 if (ID!=NULL) { //has ID attr so it's likely to be a parent feature //look for explicit gene name gene_name=extractAttr("gene_name="); if (gene_name==NULL) { gene_name=extractAttr("geneName="); if (gene_name==NULL) { gene_name=extractAttr("gene_sym="); if (gene_name==NULL) { gene_name=extractAttr("gene="); } } } gene_id=extractAttr("geneID="); if (gene_id==NULL) { gene_id=extractAttr("gene_id="); } if (is_gene) { //special case: keep the Name and ID attributes of the gene feature if (gene_name==NULL) gene_name=extractAttr("Name="); if (gene_id==NULL) //the ID is also gene_id in this case gene_id=Gstrdup(ID); //skip=false; //return; GFREE(Parent); //TMI, we really don't care about gene Parents? 
} //gene feature }// has GFF3 ID if (Parent!=NULL) { //keep Parent attr //parse multiple parents num_parents=1; p=Parent; int last_delim_pos=-1; while (*p!=';' && *p!=0) { if (*p==',' && *(p+1)!=0 && *(p+1)!=';') { num_parents++; last_delim_pos=(p-Parent); } p++; } _parents_len=p-Parent+1; _parents=Parent; GMALLOC(parents, num_parents*sizeof(char*)); parents[0]=_parents; int i=1; if (last_delim_pos>0) { for (p=_parents+1;p<=_parents+last_delim_pos;p++) { if (*p==',') { char* ep=p-1; while (*ep==' ' && ep>_parents) ep--; *(ep+1)=0; //end the string there parents[i]=p+1; i++; } } } } //has Parent field } //GFF3 else { // GTF-like expected Parent=extractAttr("transcript_id",true); if (Parent!=NULL) { //GTF2 format detected if (is_transcript) { // atypical GTF with a parent transcript line declared ID=Parent; Parent=NULL; } gene_id=extractAttr("gene_id"); // for GTF this is the only attribute accepted as geneID if (gene_id==NULL) gene_id=extractAttr("geneid"); gene_name=extractAttr("gene_name"); if (gene_name==NULL) { gene_name=extractAttr("gene_sym"); if (gene_name==NULL) { gene_name=extractAttr("gene"); if (gene_name==NULL) gene_name=extractAttr("genesymbol"); } } //prepare for parseAttr by adding '=' character instead of spaces for all attributes //after the attribute name p=info; bool noed=true; //not edited after the last delim bool nsp=false; //non-space found after last delim while (*p!=0) { if (*p==' ') { if (nsp && noed) { *p='='; noed=false; p++; continue; } } else nsp=true; //non-space if (*p==';') { noed=true; nsp=false; } p++; } } //GTF2 detected (no parent line) else {// Parent is NULL, check for jigsaw format or other pre-GTF2 format //char* fexon=strstr(fnamelc, "exon"); //if (fexon!=NULL) { if (exontype==exgffExon) { if (startsWith(track,"jigsaw")) { is_cds=true; strcpy(track,"jigsaw"); p=strchr(info,';'); if (p==NULL) { Parent=Gstrdup(info); info=NULL; } else { Parent=Gstrdup(info,p-1); info=p+1; } } } //exon feature? if (Parent==NULL && exontype>=exgffCDS && (i=strcspn(info,"; \t\n\r"))<=(int)(strlen(info)+1)) { //one word ID ? 
really desperate attempt to parse it here Parent=Gstrdup(info,info+i-1); info=NULL; //discard anything else on the line } } if (Parent!=NULL) { //GTF transcript_id for exon/CDS feature _parents=Parent; GMALLOC(parents,sizeof(char*)); num_parents=1; parents[0]=_parents; } } //GTF-like //parse other potentially useful features if (is_gff3) { if ((p=strstr(info,"Target="))!=NULL) { //has Target attr p+=7; while (*p!=';' && *p!=0 && *p!=' ') p++; if (*p!=' ') { GError("Error parsing target coordinates from GFF line:\n%s\n",l); } if (!parseUInt(p,qstart)) GError("Error parsing target start coordinate from GFF line:\n%s\n",l); if (*p!=' ') { GError("Error parsing next target coordinate from GFF line:\n%s\n",l); } p++; if (!parseUInt(p,qend)) GError("Error parsing target end coordinate from GFF line:\n%s\n",l); } if ((p=strifind(info,"Qreg="))!=NULL) { //has Qreg attr p+=5; if (!parseUInt(p,qstart)) GError("Error parsing target start coordinate from GFF line:\n%s\n",l); if (*p!='-') { GError("Error parsing next target coordinate from GFF line:\n%s\n",l); } p++; if (!parseUInt(p,qend)) GError("Error parsing target end coordinate from GFF line:\n%s\n",l); if (*p=='|' || *p==':') { p++; if (!parseUInt(p,qlen)) GError("Error parsing target length from GFF Qreg|: \n%s\n",l); } }//has Qreg attr if (qlen==0 && (p=strifind(info,"Qlen="))!=NULL) { p+=5; if (!parseUInt(p,qlen)) GError("Error parsing target length from GFF Qlen:\n%s\n",l); } }//parsing some useful attributes in GFF3 records if (ID==NULL && parents==NULL) { if (reader->gff_warns) GMessage("Warning: could not parse ID or Parent from GFF line:\n%s\n",dupline); return; //skip } skip=false; } void GffObj::addCDS(uint cd_start, uint cd_end, char phase) { if (cd_start>=this->start) { this->CDstart=cd_start; if (strand=='+') this->CDphase=phase; } else this->CDstart=this->start; if (cd_end<=this->end) { this->CDend=cd_end; if (strand=='-') this->CDphase=phase; } else this->CDend=this->end; isTranscript(true); exon_ftype_id=gff_fid_exon; if (monoFeature()) { if (exons.Count()==0) addExon(this->start, this->end,0,'.',0,0,false,exgffExon); else exons[0]->exontype=exgffExon; } } int GffObj::addExon(GffReader* reader, GffLine* gl, bool keepAttr, bool noExonAttr) { //this will make sure we have the right subftype_id! //int subf_id=-1; if (!isTranscript() && gl->is_cds) { isTranscript(true); exon_ftype_id=gff_fid_exon; if (exons.Count()==1) exons[0]->exontype=exgffExon; } if (isTranscript()) { if (exon_ftype_id<0) {//exon_ftype_id=gff_fid_exon; if (gl->exontype>0) exon_ftype_id=gff_fid_exon; else exon_ftype_id=names->feats.addName(gl->ftype); } //any recognized mRNA segment gets the generic "exon" type (also applies to CDS) if (gl->exontype==0 && !gl->is_transcript) { //extraneous mRNA feature, discard if (reader->gff_warns) GMessage("Warning: discarding unrecognized transcript subfeature '%s' of %s\n", gl->ftype, gffID); return -1; } } else { //non-mRNA parent feature, check this subf type int subf_id=names->feats.addName(gl->ftype); if (exon_ftype_id<0 || exons.Count()==0) //never assigned a subfeature type before (e.g. 
first exon being added) exon_ftype_id=subf_id; else { if (exon_ftype_id!=subf_id) { // if (exon_ftype_id==ftype_id && exons.Count()==1 && exons[0]->start==start && exons[0]->end==end) { //the existing exon was just a dummy one created by default, discard it exons.Clear(); covlen=0; exon_ftype_id=subf_id; //allow the new subfeature to completely takeover } else { //multiple subfeatures, prefer those with if (reader->gff_warns) GMessage("GFF Warning: multiple subfeatures (%s and %s) found for %s, discarding ", names->feats.getName(subf_id), names->feats.getName(exon_ftype_id),gffID); if (gl->exontype!=0) { //new feature is an exon, discard previously parsed subfeatures if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(exon_ftype_id)); exon_ftype_id=subf_id; exons.Clear(); covlen=0; } else { //discard new feature if (reader->gff_warns) GMessage("%s.\n", names->feats.getName(subf_id)); return -1; //skip this 2nd subfeature type for this parent! } } } //incoming subfeature is of different type } //new subfeature type } //non-mRNA parent int eidx=addExon(gl->fstart, gl->fend, gl->score, gl->phase, gl->qstart,gl->qend, gl->is_cds, gl->exontype); if (eidx<0) return eidx; //this should never happen if (keepAttr) { if (noExonAttr) { if (attrs==NULL) //place the parsed attributes directly at transcript level parseAttrs(attrs, gl->info); } else { //need all exon-level attributes parseAttrs(exons[eidx]->attrs, gl->info, true); } } return eidx; } int GffObj::addExon(uint segstart, uint segend, double sc, char fr, int qs, int qe, bool iscds, char exontype) { if (exons.Count()==0) { if (iscds) isCDS=true; //for now, assume CDS only if first "exon" given is a CDS if (exon_ftype_id<0) { exon_ftype_id = isTranscript() ? gff_fid_exon : ftype_id; } } //special treatment of start/stop codon features, they might be broken/split between exons //and in that case some providers will still give the wrong end coordinate as start+2 (e.g. UCSC) //so we should not trust the end coordinate for such features if (exontype==exgffStart || exontype==exgffStop) { if (strand=='-') segstart=segend; else segend=segstart; if (exontype==exgffStart) { if (CDstart==0 || segstartCDend) CDend=segstart; } } else if (iscds) { //update CDS anchors: if (CDstart==0 || segstartCDend) { if (exontype==exgffCDS && strand=='-') CDphase=fr; CDend=segend; } } else { // not a CDS/start/stop isCDS=false; } if (qs || qe) { if (qs>qe) Gswap(qs,qe); if (qs==0) qs=1; } int ovlen=0; if (exontype>0) { //check for overlaps between exon-type segments int oi=exonOverlapIdx(segstart, segend, &ovlen); if (oi>=0) { //overlap existing segment if (ovlen==0) { //adjacent segments will be merged //e.g. CDS to (UTR|exon) if ((exons[oi]->exontype>=exgffUTR && exontype==exgffCDS) || (exons[oi]->exontype==exgffCDS && exontype>=exgffUTR)) { expandExon(oi, segstart, segend, exgffCDSUTR, sc, fr, qs, qe); return oi; } //CDS adjacent to stop_codon: UCSC does (did?) 
this if ((exons[oi]->exontype==exgffStop && exontype==exgffCDS) || (exons[oi]->exontype==exgffCDS && exontype==exgffStop)) { expandExon(oi, segstart, segend, exgffCDS, sc, fr, qs, qe); return oi; } } //only allow this for CDS within exon, stop_codon within (CDS|UTR|exon), // start_codon within (CDS|exon) if (exons[oi]->start<=segstart && exons[oi]->end>=segend) { //larger segment given first, now the smaller included one is redundant if (exons[oi]->exontype>exontype && !(exons[oi]->exontype==exgffUTR && exontype==exgffCDS)) { return oi; //only used to store attributes from current GffLine } else { if (gff_show_warnings && (exons[oi]->startend>segend)) { GMessage("GFF Warning: unusual segment inclusion: %s(%d-%d) within %s(%d-%d) (ID=%s)\n", strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype), exons[oi]->start, exons[oi]->end, this->gffID); } return oi; } } if (exontype>exons[oi]->exontype && segstart<=exons[oi]->start && segend>=exons[oi]->end && !(exontype==exgffUTR && exons[oi]->exontype==exgffCDS)) { //smaller segment given first, so we have to enlarge it expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe); //this should also check for overlapping next exon (oi+1) ? return oi; } //there is also the special case of "ribosomal slippage exception" (programmed frameshift) //where two CDS segments may actually overlap for 1 or 2 bases, but there should be only one encompassing exon //if (ovlen>2 || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) { // had to relax this because of some weird UCSC annotations with exons partially overlapping the CDS segments /* if (ovlen>2 && exons[oi]->exontype!=exgffUTR && exontype!=exgffUTR) { if (gff_show_warnings) GMessage("GFF Warning: discarding overlapping feature segment (%d-%d) (vs %d-%d (%s)) for GFF ID %s on %s\n", segstart, segend, exons[oi]->start, exons[oi]->end, getSubfName(), gffID, getGSeqName()); hasErrors(true); return -1; //segment NOT added } */ if ((ovlen>2 || ovlen==0) || exons[oi]->exontype!=exgffCDS || exontype!=exgffCDS) { if (gff_show_warnings) GMessage("GFF Warning: merging overlapping/adjacent feature segment %s (%d-%d) with %s (%d-%d) for GFF ID %s on %s\n", strExonType(exontype), segstart, segend, strExonType(exons[oi]->exontype), exons[oi]->start, exons[oi]->end, gffID, getGSeqName()); expandExon(oi, segstart, segend, exontype, sc, fr, qs, qe); return oi; } // else add the segment if the overlap is small and between two CDS segments //TODO: we might want to add an attribute here with the slippage coordinate and size? 
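  // -- illustrative note (not part of the original source): when the overlap between two CDS
  // segments is only 1-2 bases ("ribosomal slippage"), the test above does NOT merge them; control
  // falls through to here, the overlap is subtracted from covlen and the new segment is then added
  // as a separate exon below. Hypothetical coordinates: existing CDS 100-200 and incoming CDS
  // 200-300 overlap by 1 base (ovlen==1); both segments are kept and covlen ends up as
  // 101 + 101 - 1 = 201, so the shared base is not counted twice.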
covlen-=ovlen; }//overlap or adjacent to existing segment } //check for overlap // --- no overlap, or accepted micro-overlap (ribosomal slippage) // create & add the new segment /* if (start>0 && exontype==exgffCDS && exons.Count()==0) { //adding a CDS directly as the first subfeature of a declared parent segstart=start; segend=end; } */ GffExon* enew=new GffExon(segstart, segend, sc, fr, qs, qe, exontype); int eidx=exons.Add(enew); if (eidx<0) { //this would actually be acceptable if the object is a "Gene" and "exons" are in fact isoforms if (gff_show_warnings) GMessage("GFF Warning: failed adding segment %d-%d for %s (discarded)!\n", segstart, segend, gffID); delete enew; hasErrors(true); return -1; } covlen+=(int)(exons[eidx]->end-exons[eidx]->start)+1; //adjust parent feature coordinates to contain this exon if (start==0 || start>exons.First()->start) { start=exons.First()->start; } if (endend) end=exons.Last()->end; return eidx; } void GffObj::expandExon(int oi, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe) { //oi is the index of the *first* overlapping segment found that must be enlarged covlen-=exons[oi]->len(); if (segstartstart) exons[oi]->start=segstart; if (qs && qsqstart) exons[oi]->qstart=qs; if (segend>exons[oi]->end) exons[oi]->end=segend; if (qe && qe>exons[oi]->qend) exons[oi]->qend=qe; //warning: score cannot be properly adjusted! e.g. if it's a p-value it's just going to get worse if (sc!=0) exons[oi]->score=sc; covlen+=exons[oi]->len(); //if (exons[oi]->exontype< exontype) -- always true exons[oi]->exontype = exontype; if (exontype==exgffCDS) exons[oi]->phase=fr; //we must check if any more exons are also overlapping this int ni=oi+1; //next exon index after oi while (ni=exons[ni]->start) { // next segment overlaps new enlarged segment //only allow this if next segment is fully included, and a subordinate if (exons[ni]->exontypeend<=segend) { /* I guess we have to relax this due to stupid UCSC hg18 files having a start_codon sticking out chr1 hg18_knownGene start_codon 69806911 69806913 0.000000 + . chr1 hg18_knownGene CDS 69806911 69806912 0.000000 + 0 chr1 hg18_knownGene exon 69805456 69806912 0.000000 + . 
*/ if (exons[ni]->qstartqstart) exons[oi]->qstart=exons[ni]->qstart; if (exons[ni]->qend>exons[oi]->qend) exons[oi]->qend=exons[ni]->qend; exons.Delete(ni); } else { if (gff_show_warnings) GMessage("GFF Warning: overlapping existing exon(%d-%d) while expanding to %d-%d for GFF ID %s\n", exons[ni]->start, exons[ni]->end, segstart, segend, gffID); //hasErrors(true); break; } } // -- make sure any other related boundaries are updated: start=exons.First()->start; end=exons.Last()->end; if (uptr!=NULL) { //collect stats about the underlying genomic sequence GSeqStat* gsd=(GSeqStat*)uptr; if (startmincoord) gsd->mincoord=start; if (end>gsd->maxcoord) gsd->maxcoord=end; if (this->len()>gsd->maxfeat_len) { gsd->maxfeat_len=this->len(); gsd->maxfeat=this; } } } void GffObj::removeExon(int idx) { /* if (idx==0 && segs[0].start==gstart) gstart=segs[1].start; if (idx==segcount && segs[segcount].end==gend) gend=segs[segcount-1].end; */ if (idx<0 || idx>=exons.Count()) return; int segstart=exons[idx]->start; int segend=exons[idx]->end; exons.Delete(idx); covlen -= (int)(segend-segstart)+1; start=exons.First()->start; end=exons.Last()->end; if (isCDS) { CDstart=start; CDend=end; } } void GffObj::removeExon(GffExon* p) { for (int idx=0;idxstart; int segend=exons[idx]->end; exons.Delete(idx); covlen -= (int)(segend-segstart)+1; if (exons.Count() > 0) { start=exons.First()->start; end=exons.Last()->end; if (isCDS) { CDstart=start; CDend=end; } } return; } } } GffObj::GffObj(GffReader *gfrd, GffLine* gffline, bool keepAttr, bool noExonAttr): GSeg(0,0), exons(true,true,false), children(1,false) { xstart=0; xend=0; xstatus=0; partial=false; isCDS=false; uptr=NULL; ulink=NULL; parent=NULL; udata=0; flags=0; CDstart=0; CDend=0; CDphase=0; geneID=NULL; gene_name=NULL; attrs=NULL; gffID=NULL; track_id=-1; gseq_id=-1; ftype_id=-1; exon_ftype_id=-1; strand='.'; if (gfrd==NULL) GError("Cannot use this GffObj constructor with a NULL GffReader!\n"); gffnames_ref(names); if (gfrd->names==NULL) gfrd->names=names; //qlen=0;qstart=0;qend=0; gscore=0; uscore=0; covlen=0; qcov=0; start=gffline->fstart; end=gffline->fend; gseq_id=names->gseqs.addName(gffline->gseqname); track_id=names->tracks.addName(gffline->track); strand=gffline->strand; qlen=gffline->qlen; qstart=gffline->qstart; qend=gffline->qend; //setup flags from gffline isCDS=gffline->is_cds; //for now isGene(gffline->is_gene); isTranscript(gffline->is_transcript || gffline->exontype!=0); //fromGff3(gffline->is_gff3); if (gffline->parents!=NULL && !gffline->is_transcript) { //GTF style -- create a GffObj directly by subfeature //(also possible orphan GFF3 exon line, or an exon given before its parent (chado)) if (gffline->exontype!=0) { //recognized exon-like feature ftype_id=gff_fid_transcript; //so this is some sort of transcript exon_ftype_id=gff_fid_exon; //subfeatures MUST be exons } else {//unrecognized subfeatures //make this GffObj of the same feature type ftype_id=names->feats.addName(gffline->ftype); } if (gffline->ID==NULL) { //typical GTF2 without "transcript" line gffID=Gstrdup(gffline->parents[0]); this->createdByExon(true); //this is likely the first exon/segment of the feature addExon(gfrd, gffline, keepAttr, noExonAttr); } else { //a parented feature with an ID: orphan or premature GFF3 subfeature line if (gffline->is_gff3 && gffline->exontype!=0) { //premature exon given before its parent transcript //create the transcript entry here gffID=Gstrdup(gffline->parents[0]); this->createdByExon(true); //this is the first exon/segment of the transcript 
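 // -- illustrative note (not part of the original source): this branch covers GFF3 inputs where an
 // exon line appears before its parent transcript (e.g. chado-style dumps). Hypothetical input:
 //   chr1  src  exon  1000  1200  .  +  .  ID=exon001;Parent=tx001
 // A GffObj for "tx001" is created on the spot from parents[0] and flagged createdByExon(), so the
 // real transcript line, if it shows up later, can fill in the proper attributes via updateGffRec().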
addExon(gfrd, gffline, keepAttr, noExonAttr); } else { //unrecognized non-exon feature ? use the ID instead this->hasGffID(true); gffID=Gstrdup(gffline->ID); if (keepAttr) this->parseAttrs(attrs, gffline->info); } } } //non-transcript parented subfeature given directly else { //non-parented feature OR a recognizable transcript //create a parent feature in its own right gscore=gffline->score; if (gffline->ID==NULL || gffline->ID[0]==0) GError("Error: no ID found for GFF record start\n"); this->hasGffID(true); gffID=Gstrdup(gffline->ID); //there must be an ID here //if (gffline->is_transcript) ftype_id=gff_fid_mRNA; //else ftype_id=names->feats.addName(gffline->ftype); if (gffline->is_transcript) exon_ftype_id=gff_fid_exon; if (keepAttr) this->parseAttrs(attrs, gffline->info); }//no parent if (gffline->gene_name!=NULL) { gene_name=Gstrdup(gffline->gene_name); } if (gffline->gene_id) { geneID=Gstrdup(gffline->gene_id); } else if (gffline->is_transcript && gffline->parents) { geneID=Gstrdup(gffline->parents[0]); } //GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,names->gseqs.lastNameUsed()),true); GSeqStat* gsd=gfrd->gseqstats.AddIfNew(new GSeqStat(gseq_id,gffline->gseqname), true); uptr=gsd; /* if (startmincoord) gsd->mincoord=start; if (end>gsd->maxcoord) gsd->maxcoord=end; if (this->len()>gsd->maxfeat_len) { gsd->maxfeat_len=this->len(); gsd->maxfeat=this; } */ } GffLine* GffReader::nextGffLine() { if (gffline!=NULL) return gffline; //caller should free gffline after processing while (gffline==NULL) { int llen=0; buflen=GFF_LINELEN-1; char* l=fgetline(linebuf, buflen, fh, &fpos, &llen); if (l==NULL) { return NULL; //end of file } int ns=0; //first nonspace position while (l[ns]!=0 && isspace(l[ns])) ns++; if (l[ns]=='#' || llen<10) continue; gffline=new GffLine(this, l); if (gffline->skip) { delete gffline; gffline=NULL; continue; } if (gffline->ID==NULL && gffline->parents==NULL) { //it must have an ID //this might not be needed, already checked in the GffLine constructor if (gff_warns) GMessage("Warning: malformed GFF line, no parent or record Id (kipping\n"); delete gffline; gffline=NULL; //continue; } } return gffline; } char* GffReader::gfoBuildId(const char* id, const char* ctg) { //caller must free the returned pointer char* buf=NULL; int idlen=strlen(id); GMALLOC(buf, idlen+strlen(ctg)+2); strcpy(buf, id); buf[idlen]='~'; strcpy(buf+idlen+1, ctg); return buf; } /* void GffReader::gfoRemove(const char* id, const char* ctg) { char* buf=gfoBuildId(id,ctg); phash.Remove(buf); GFREE(buf); } */ GffObj* GffReader::gfoAdd(GffObj* gfo) { GPVec* glst=phash.Find(gfo->gffID); if (glst==NULL) glst=new GPVec(false); //GfoHolder gh(gfo); //,idx); int i=glst->Add(gfo); phash.Add(gfo->gffID, glst); return glst->Get(i); } GffObj* GffReader::gfoAdd(GPVec& glst, GffObj* gfo) { int i=glst.Add(gfo); return glst[i]; } GffObj* GffReader::gfoFind(const char* id, const char* ctg, GPVec** glst, char strand, uint start, uint end) { GPVec* gl=phash.Find(id); GffObj* gh=NULL; if (gl) { for (int i=0;iCount();i++) { GffObj& gfo = *(gl->Get(i)); if (ctg!=NULL && strcmp(ctg, gfo.getGSeqName())!=0) continue; if (strand && gfo.strand!='.' 
&& strand != gfo.strand) continue; if (start>0) { if (abs((int)start-(int)gfo.start)> (int)GFF_MAX_LOCUS) continue; if (end>0 && (gfo.start>end || gfo.end=0) { gflst.Put(replaceidx,newgfo); r=gfoAdd(newgfo); } else { int gfoidx=gflst.Add(newgfo); r=gfoAdd(newgfo); } return r; } */ GffObj* GffReader::updateParent(GffObj* newgfo, GffObj* parent) { //assert(parent); //assert(newgfo); parent->children.Add(newgfo); if (newgfo->parent==NULL) newgfo->parent=parent; newgfo->setLevel(parent->getLevel()+1); if (parent->isGene()) { if (parent->gene_name!=NULL && newgfo->gene_name==NULL) newgfo->gene_name=Gstrdup(parent->gene_name); if (parent->geneID!=NULL && newgfo->geneID==NULL) newgfo->geneID=Gstrdup(parent->geneID); } return newgfo; } GffObj* GffReader::newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, GffObj* parent, GffExon* pexon, GPVec* glst) { GffObj* newgfo=new GffObj(this, gffline, keepAttr, noExonAttr); GffObj* r=NULL; //int gfoidx=gflst.Add(newgfo); gflst.Add(newgfo); r=(glst) ? gfoAdd(*glst, newgfo) : gfoAdd(newgfo); if (parent!=NULL) { updateParent(r, parent); if (pexon!=NULL) parent->removeExon(pexon); } /* if (gff_warns) { int* pcount=tids.Find(newgfo->gffID); if (pcount!=NULL) { if (gff_warns) GMessage("Warning: duplicate GFF ID: %s\n", newgfo->gffID); (*pcount)++; } else { tids.Add(newgfo->gffID,new int(1)); } } */ return r; } GffObj* GffReader::updateGffRec(GffObj* prevgfo, GffLine* gffline, bool keepAttr) { if (prevgfo==NULL) return NULL; //prevgfo->gffobj->createdByExon(false); prevgfo->ftype_id=prevgfo->names->feats.addName(gffline->ftype); prevgfo->start=gffline->fstart; prevgfo->end=gffline->fend; prevgfo->isGene(gffline->is_gene); prevgfo->isTranscript(gffline->is_transcript || gffline->exontype!=0); prevgfo->hasGffID(gffline->ID!=NULL); if (keepAttr) { if (prevgfo->attrs!=NULL) prevgfo->attrs->Clear(); prevgfo->parseAttrs(prevgfo->attrs, gffline->info); } return prevgfo; } bool GffReader::addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash& pex, bool noExonAttr) { bool r=true; if (gffline->strand!=prevgfo->strand) { if (prevgfo->strand=='.') { prevgfo->strand=gffline->strand; } else { GMessage("GFF Error at %s (%c): exon %d-%d (%c) found on different strand; discarded.\n", prevgfo->gffID, prevgfo->strand, gffline->fstart, gffline->fend, gffline->strand, prevgfo->getGSeqName()); //r=false; return true; } } int gdist=(gffline->fstart>prevgfo->end) ? gffline->fstart-prevgfo->end : ((gffline->fendstart)? prevgfo->start-gffline->fend : 0 ); if (gdist>(int)GFF_MAX_LOCUS) { //too far apart, most likely this is a duplicate ID GMessage("Error: duplicate GFF ID '%s' (or exons too far apart)!\n",prevgfo->gffID); //validation_errors = true; r=false; if (!gff_warns) exit(1); } int eidx=prevgfo->addExon(this, gffline, !noExonAttr, noExonAttr); if (eidx>=0 && gffline->ID!=NULL && gffline->exontype==0) subfPoolAdd(pex, prevgfo); return r; } CNonExon* GffReader::subfPoolCheck(GffLine* gffline, GHash& pex, char*& subp_name) { CNonExon* subp=NULL; subp_name=NULL; for (int i=0;inum_parents;i++) { if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL) continue; subp_name=gfoBuildId(gffline->parents[i], gffline->gseqname); //e.g. 
mRNA name subp=pex.Find(subp_name); if (subp!=NULL) return subp; GFREE(subp_name); } return NULL; } void GffReader::subfPoolAdd(GHash& pex, GffObj* newgfo) { //this might become a parent feature later if (newgfo->exons.Count()>0) { char* xbuf=gfoBuildId(gffline->ID, gffline->gseqname); pex.Add(xbuf, new CNonExon(newgfo, newgfo->exons[0], gffline)); GFREE(xbuf); } } GffObj* GffReader::promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex, bool keepAttr, bool noExonAttr) { GffObj* prevp=subp->parent; //grandparent of gffline (e.g. gene) //if (prevp!=gflst[subp->idx]) // GError("Error promoting subfeature %s, gflst index mismatch?!\n", subp->gffline->ID); subp->gffline->discardParent(); GffObj* gfoh=newGffRec(subp->gffline, keepAttr, noExonAttr, prevp, subp->exon); pex.Remove(subp_name); //no longer a potential parent, moved it to phash already prevp->promotedChildren(true); return gfoh; //returns the holder of newly promoted feature } //have to parse the whole file because exons and other subfeatures can be scattered, unordered in the input //Trans-splicing and fusions are only accepted in proper GFF3 format, i.e. multiple features with the same ID //are accepted if they are NOT overlapping/continuous // *** BUT (exception): proximal xRNA features with the same ID, on the same strand, will be merged // and the segments will be treated like exons (e.g. TRNAR15 (rna1940) in RefSeq) void GffReader::readAll(bool keepAttr, bool mergeCloseExons, bool noExonAttr) { bool validation_errors = false; //loc_debug=false; GHash pex; //keep track of any "exon"-like features that have an ID //and thus could become promoted to parent features while (nextGffLine()!=NULL) { GffObj* prevseen=NULL; GPVec* prevgflst=NULL; if (gffline->ID && gffline->exontype==0) { //>> for a parent-like IDed feature (mRNA, gene, etc.) //look for same ID on the same chromosome/strand/locus prevseen=gfoFind(gffline->ID, gffline->gseqname, &prevgflst, gffline->strand, gffline->fstart); if (prevseen!=NULL) { //same ID/chromosome combo encountered before if (prevseen->createdByExon()) { if (gff_show_warnings && (prevseen->startfstart || prevseen->end>gffline->fend)) GMessage("GFF Warning: invalid coordinates for %s parent feature (ID=%s)\n", gffline->ftype, gffline->ID); //an exon of this ID was given before //this line has the main attributes for this ID updateGffRec(prevseen, gffline, keepAttr); } else { //- duplicate ID -- this must be a discontinuous feature according to GFF3 specs // e.g. 
a trans-spliced transcript if (prevseen->overlap(gffline->fstart, gffline->fend)) { //overlapping with same ID not allowed GMessage("GFF Error: duplicate/invalid '%s' feature ID=%s\n", gffline->ftype, gffline->ID); //validation_errors = true; if (gff_warns) { delete gffline; gffline=NULL; continue; } else exit(1); } //create a new entry with the same ID int distance=INT_MAX; if (prevseen->isTranscript() && prevseen->strand==gffline->strand) { if (prevseen->start>=gffline->fstart) distance=prevseen->start-gffline->fend; else distance=gffline->fstart-prevseen->end; } if (distance<1000) {//FIXME: arbitrary proximity threshold (yuck) //exception: make this an exon of previous ID //addExonFeature(prevseen, gffline, pex, noExonAttr); prevseen->addExon(this, gffline, false, true); } else { //create a separate entry (true discontinuous feature) prevseen=newGffRec(gffline, keepAttr, noExonAttr, prevseen->parent, NULL, prevgflst); } } //duplicate ID on the same chromosome } //prevseeen != NULL } //parent-like ID feature if (gffline->parents==NULL) {//start GFF3-like record with no parent (mRNA, gene) if (!prevseen) newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, prevgflst); } else { //--- it's a child feature (exon/CDS but could still be a mRNA with gene(s) as parent) //updates all the declared parents with this child bool found_parent=false; GffObj* newgfo=prevseen; GPVec* newgflst=NULL; for (int i=0;inum_parents;i++) { if (transcriptsOnly && discarded_ids.Find(gffline->parents[i])!=NULL) continue; //skipping discarded parent feature GffObj* parentgfo=NULL; if (gffline->is_transcript || gffline->exontype==0) {//possibly a transcript parentgfo=gfoFind(gffline->parents[i], gffline->gseqname, &newgflst, gffline->strand, gffline->fstart, gffline->fend); } else { //for exon-like entities we only need a parent to be in locus distance, //on the same strand parentgfo=gfoFind(gffline->parents[i], gffline->gseqname, &newgflst, gffline->strand, gffline->fstart); } if (parentgfo!=NULL) { //parent GffObj parsed earlier found_parent=true; if (parentgfo->isGene() && gffline->is_transcript && gffline->exontype==0) { //not an exon, but a transcript parented by a gene if (newgfo) { updateParent(newgfo, parentgfo); } else { newgfo=newGffRec(gffline, keepAttr, noExonAttr, parentgfo); } } else { //potential exon subfeature? 
//always discards dummy "intron" features if (!(gffline->exontype==exgffIntron && (parentgfo->isTranscript() || parentgfo->exons.Count()>0))) { if (!addExonFeature(parentgfo, gffline, pex, noExonAttr)) validation_errors=true; } } } //overlapping parent feature found } //for each parsed parent Id if (!found_parent) { //new GTF-like record starting here with a subfeature directly //or it could be some chado GFF3 barf with exons coming BEFORE their parent :( //check if this feature isn't parented by a previously stored "exon" subfeature char* subp_name=NULL; CNonExon* subp=subfPoolCheck(gffline, pex, subp_name); if (subp!=NULL) { //found a subfeature that is the parent of this gffline //promote that subfeature to a full GffObj GffObj* gfoh=promoteFeature(subp, subp_name, pex, keepAttr, noExonAttr); //add current gffline as an exon of the newly promoted subfeature if (!addExonFeature(gfoh, gffline, pex, noExonAttr)) validation_errors=true; } else { //no parent seen before, //loc_debug=true; GffObj* ngfo=prevseen; if (ngfo==NULL) { //if it's an exon type, create directly the parent with this exon //but if it's recognized as a transcript, the object itself is created ngfo=newGffRec(gffline, keepAttr, noExonAttr, NULL, NULL, newgflst); } if (!ngfo->isTranscript() && gffline->ID!=NULL && gffline->exontype==0) subfPoolAdd(pex, ngfo); //even those with errors will be added here! } GFREE(subp_name); } //no previous parent found } //parented feature //-- delete gffline; gffline=NULL; }//while gff lines if (gflst.Count()>0) { gflst.finalize(this, mergeCloseExons, keepAttr, noExonAttr); //force sorting by locus if so constructed gseqStats.setCount(gseqstats.Last()->gseqid+1); for (int gi=0;gigseqid, gseqstats[gi]); //copy the pointer only } } // all gff records are now loaded in GList gflst // so we can free the hash phash.Clear(); //tids.Clear(); if (validation_errors) { exit(1); } } void GfList::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) { //if set, enforce sort by locus if (mustSort) { //force (re-)sorting this->setSorted(false); this->setSorted((GCompareProc*)gfo_cmpByLoc); } GList discarded(false,true,false); for (int i=0;ifinalize(gfr, mergeCloseExons, keepAttrs, noExonAttr); if (fList[i]->isDiscarded()) { discarded.Add(fList[i]); if (fList[i]->children.Count()>0) { for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; if (keepAttrs) fList[i]->children[c]->copyAttrs(fList[i]); //inherit the attributes of discarded parent (e.g. pseudo=true; ) } } this->Forget(i); } } if (discarded.Count()>0) { this->Pack(); } } GffObj* GffObj::finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs, bool noExonAttr) { //merge //always merge adjacent or overlapping segments //but if mergeCloseExons then merge even when distance is up to 5 bases udata=0; uptr=NULL; if (gfr->transcriptsOnly && !(isTranscript() || (isGene() && children.Count()==0))) { isDiscarded(true); } if (ftype_id==gff_fid_transcript && CDstart>0) { ftype_id=gff_fid_mRNA; //exon_ftype_id=gff_fid_exon; } if (exons.Count()>0 && (isTranscript() || exon_ftype_id==gff_fid_exon)) { if (mergeCloseExons) { int mindist=mergeCloseExons ? 
5:1; for (int i=0;iend; while (nistart-mend); if (dist>mindist) break; //no merging with next segment if (gfr!=NULL && gfr->gff_warns && dist!=0 && (exons[ni]->exontype!=exgffUTR && exons[i]->exontype!=exgffUTR)) { GMessage("GFF warning: merging adjacent/overlapping segments of %s on %s (%d-%d, %d-%d)\n", gffID, getGSeqName(), exons[i]->start, exons[i]->end,exons[ni]->start, exons[ni]->end); } mend=exons[ni]->end; covlen-=exons[i]->len(); exons[i]->end=mend; covlen+=exons[i]->len(); covlen-=exons[ni]->len(); if (exons[ni]->attrs!=NULL && (exons[i]->attrs==NULL || exons[i]->attrs->Count()attrs->Count())) { //use the other exon attributes, if more delete(exons[i]->attrs); exons[i]->attrs=exons[ni]->attrs; exons[ni]->attrs=NULL; } exons.Delete(ni); } //check for merge with next exon } //for each exon } //merge close exons //shrink transcript to the exons' span this->start=exons.First()->start; this->end=exons.Last()->end; //also update the stats for the reference sequence if (uptr!=NULL) { //collect stats about the underlying genomic sequence GSeqStat* gsd=(GSeqStat*)uptr; if (startmincoord) gsd->mincoord=start; if (end>gsd->maxcoord) gsd->maxcoord=end; if (this->len()>gsd->maxfeat_len) { gsd->maxfeat_len=this->len(); gsd->maxfeat=this; } } this->uptr=NULL; this->udata=0; } //attribute reduction for GTF records if (keepAttrs && !noExonAttr && !hasGffID() && exons.Count()>0 && exons[0]->attrs!=NULL) { bool attrs_discarded=false; for (int a=0;aattrs->Count();a++) { int attr_name_id=exons[0]->attrs->Get(a)->attr_id; char* attr_name=names->attrs.getName(attr_name_id); char* attr_val =exons[0]->attrs->Get(a)->attr_val; bool sameExonAttr=true; for (int i=1;igetAttr(attr_name_id); if (ov==NULL || (strcmp(ov,attr_val)!=0)) { sameExonAttr=false; break; } } if (sameExonAttr) { //delete this attribute from exons level attrs_discarded=true; this->addAttr(attr_name, attr_val); for (int i=1;iattrs->freeItem(a); } } if (attrs_discarded) exons[0]->attrs->Pack(); } return this; } void GffObj::parseAttrs(GffAttrs*& atrlist, char* info, bool isExon) { if (names==NULL) GError(ERR_NULL_GFNAMES, "parseAttrs()"); if (atrlist==NULL) atrlist=new GffAttrs(); char* endinfo=info+strlen(info); char* start=info; char* pch=start; while (startaddAttr(start, ech); start=pch; continue; } atrlist->add_or_update(names, start, ech); } /* else { //not an attr=value format atrlist->Add(new GffAttr(names->attrs.addName(start),"1")); } */ start=pch; } if (atrlist->Count()==0) { delete atrlist; atrlist=NULL; } } void GffObj::addAttr(const char* attrname, const char* attrvalue) { if (this->attrs==NULL) this->attrs=new GffAttrs(); //this->attrs->Add(new GffAttr(names->attrs.addName(attrname),attrvalue)); this->attrs->add_or_update(names, attrname, attrvalue); } void GffObj::copyAttrs(GffObj* from) { //typically from is the parent gene, and this is a transcript if (from==NULL || from->attrs==NULL) return; if (this->attrs==NULL) { this->attrs=new GffAttrs(); } //special RefSeq case int desc_attr_id=names->attrs.getId("description"); //from gene int prod_attr_id=names->attrs.getId("product"); //from transcript (this) char* prod = (prod_attr_id>=0) ? 
this->attrs->getAttr(prod_attr_id) : NULL; for (int i=0;iattrs->Count();++i) { //this->attrs->add_no_update(names, from->attrs->Get(i)->attr_id, from->attrs->Get(i)->attr_val); int aid=from->attrs->Get(i)->attr_id; //special case for GenBank refseq genes vs transcripts: if (prod && aid==desc_attr_id && strcmp(from->attrs->getAttr(desc_attr_id), prod)==0) continue; //skip description if product already there and the same bool haveit=false; for (int ai=0;aiattrs->Count();++ai) { //do we have it already? if (aid==this->attrs->Get(i)->attr_id) { haveit=true; break; //skip this, don't replace } } if (!haveit) this->attrs->Add(new GffAttr(aid, from->attrs->Get(i)->attr_val)); } } void GffObj::setFeatureName(const char* feature) { //change the feature name/type for a transcript int fid=names->feats.addName(feature); if (monoFeature() && exons.Count()>0) this->exon_ftype_id=fid; this->ftype_id=fid; } void GffObj::setRefName(const char* newname) { //change the feature name/type for a transcript int rid=names->gseqs.addName(newname); this->gseq_id=rid; } int GffObj::removeAttr(const char* attrname, const char* attrval) { if (this->attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one ? for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeAttr(int aid, const char* attrval) { if (this->attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one ? for (int i=0;iattrs->Count();i++) { if (aid==this->attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, this->attrs->Get(i)->attr_val)==0) { delcount++; this->attrs->freeItem(i); } } } if (delcount>0) this->attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, const char* attrname, const char* attrval) { if (exon.attrs==NULL || attrname==NULL || attrname[0]==0) return 0; int aid=this->names->attrs.getId(attrname); if (aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } int GffObj::removeExonAttr(GffExon& exon, int aid, const char* attrval) { if (exon.attrs==NULL || aid<0) return 0; int delcount=0; //could be more than one for (int i=0;iCount();i++) { if (aid==exon.attrs->Get(i)->attr_id) { if (attrval==NULL || strcmp(attrval, exon.attrs->Get(i)->attr_val)==0) { delcount++; exon.attrs->freeItem(i); } } } if (delcount>0) exon.attrs->Pack(); return delcount; } void GffObj::getCDS_ends(uint& cds_start, uint& cds_end) { cds_start=0; cds_end=0; if (CDstart==0 || CDend==0) return; //no CDS info int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } cds_start=CDstart; cds_end=CDend; if (strand=='-') cds_end-=cdsadj; else cds_start+=cdsadj; } void GffObj::mRNA_CDS_coords(uint& cds_mstart, uint& cds_mend) { //sets cds_start and cds_end to the CDS start,end coordinates on the spliced mRNA transcript cds_mstart=0; cds_mend=0; if (CDstart==0 || CDend==0) return; //no CDS info //restore normal coordinates, just in case unxcoord(); int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } /* uint seqstart=CDstart; uint seqend=CDend; */ 
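  // -- illustrative note (not part of the original source): the loop below converts the genomic CDS
  // anchors (CDstart/CDend) into 1-based positions on the spliced transcript by walking the exons
  // and accumulating their lengths in s. Hypothetical '+' strand example: exons 100-200 and 300-400
  // with CDS 150..350 and phase 0 give cds_mstart = 101-(200-150) = 51 and
  // cds_mend = 202-(400-350) = 152, i.e. the 51st and 152nd bases of the spliced mRNA.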
uint seqstart=exons.First()->start; uint seqend=exons.Last()->end; int s=0; //resulting nucleotide counter if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) sgend=seqend; //seqend within this segment s+=(int)(sgend-sgstart)+1; if (CDstart>=sgstart && CDstart<=sgend) { //CDstart in this segment //and we are getting the whole transcript cds_mend=s-(int)(CDstart-sgstart); } if (CDend>=sgstart && CDend<=sgend) { //CDstart in this segment //and we are getting the whole transcript cds_mstart=s-(int)(CDend-cdsadj-sgstart); } } //for each exon } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) sgend=seqend; //seqend within this segment s+=(int)(sgend-sgstart)+1; /* for (uint i=sgstart;i<=sgend;i++) { spliced[s]=gsubseq[i-gstart]; s++; }//for each nt */ if (CDstart>=sgstart && CDstart<=sgend) { //CDstart in this segment cds_mstart=s-(int)(sgend-CDstart-cdsadj); } if (CDend>=sgstart && CDend<=sgend) { //CDend in this segment cds_mend=s-(int)(sgend-CDend); } } //for each exon } // + strand //spliced[s]=0; //if (rlen!=NULL) *rlen=s; //return spliced; } char* GffObj::getUnspliced(GFaSeqGet* faseq, int* rlen, GList* seglst) { if (faseq==NULL) { GMessage("Warning: getUnspliced(NULL,.. ) called!\n"); return NULL; } //restore normal coordinates: unxcoord(); if (exons.Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } char* unspliced=NULL; int seqstart=exons.First()->start; int seqend=exons.Last()->end; int unsplicedlen = 0; unsplicedlen += seqend - seqstart + 1; GMALLOC(unspliced, unsplicedlen+1); //allocate more here //uint seqstart, seqend; int s = 0; //resulting nucleotide counter if (strand=='-') { if (seglst!=NULL) seglst->Add(new GSeg(s+1,s+1+seqend-seqstart)); for (int i=seqend;i>=seqstart;i--) { unspliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt } // - strand else { // + strand if (seglst!=NULL) seglst->Add(new GSeg(s+1,s+1+seqend-seqstart)); for (int i=seqstart;i<=seqend;i++) { unspliced[s]=gsubseq[i-start]; s++; }//for each nt } // + strand //assert(s <= unsplicedlen); unspliced[s]=0; if (rlen!=NULL) *rlen=s; return unspliced; } char* GffObj::getSpliced(GFaSeqGet* faseq, bool CDSonly, int* rlen, uint* cds_start, uint* cds_end, GList* seglst) { if (CDSonly && CDstart==0) return NULL; if (faseq==NULL) { GMessage("Warning: getSpliced(NULL,.. 
) called!\n"); return NULL; } //restore normal coordinates: unxcoord(); if (exons.Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } if (fspan<(int)(end-start+1)) { //special case: stop coordinate was extended past the gseq length, must adjust int endadj=end-start+1-fspan; uint prevend=end; end-=endadj; if (CDend>end) CDend=end; if (exons.Last()->end>end) { exons.Last()->end=end; //this could get us into trouble if exon start is also > end if (exons.Last()->start>exons.Last()->end) { GError("GffObj::getSpliced() error: improper genomic coordinate %d on %s for %s\n", prevend,getGSeqName(), getID()); } covlen-=endadj; } } char* spliced=NULL; GMALLOC(spliced, covlen+1); //allocate more here uint seqstart, seqend; int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } if (CDSonly) { seqstart=CDstart; seqend=CDend; if (strand=='-') seqend-=cdsadj; else seqstart+=cdsadj; } else { seqstart=exons.First()->start; seqend=exons.Last()->end; } int s=0; //resulting nucleotide counter if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) sgend=seqend; //seqend within this segment if (seglst!=NULL) seglst->Add(new GSeg(s+1,s+1+sgend-sgstart)); for (uint i=sgend;i>=sgstart;i--) { spliced[s] = ntComplement(gsubseq[i-start]); s++; }//for each nt if (!CDSonly && cds_start!=NULL && CDstart>0) { if (CDstart>=sgstart && CDstart<=sgend) { //CDstart in this segment //and we are getting the whole transcript *cds_end=s-(CDstart-sgstart); } if (CDend>=sgstart && CDend<=sgend) { //CDstart in this segment //and we are getting the whole transcript *cds_start=s-(CDend-cdsadj-sgstart); } }//update local CDS coordinates } //for each exon } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) sgend=seqend; //seqend within this segment if (seglst!=NULL) seglst->Add(new GSeg(s+1,s+1+sgend-sgstart)); for (uint i=sgstart;i<=sgend;i++) { spliced[s]=gsubseq[i-start]; s++; }//for each nt if (!CDSonly && cds_start!=NULL && CDstart>0) { if (CDstart>=sgstart && CDstart<=sgend) { //CDstart in this segment //and we are getting the whole transcript *cds_start=s-(sgend-CDstart-cdsadj); } if (CDend>=sgstart && CDend<=sgend) { //CDstart in this segment //and we are getting the whole transcript *cds_end=s-(sgend-CDend); } }//update local CDS coordinates } //for each exon } // + strand spliced[s]=0; if (rlen!=NULL) *rlen=s; return spliced; } char* GffObj::getSplicedTr(GFaSeqGet* faseq, bool CDSonly, int* rlen) { if (CDSonly && CDstart==0) return NULL; //restore normal coordinates: unxcoord(); if (exons.Count()==0) return NULL; int fspan=end-start+1; const char* gsubseq=faseq->subseq(start, fspan); if (gsubseq==NULL) { GError("Error getting subseq for %s (%d..%d)!\n", gffID, start, end); } char* translation=NULL; GMALLOC(translation, (int)(covlen/3)+1); uint seqstart, seqend; int cdsadj=0; if (CDphase=='1' || CDphase=='2') { cdsadj=CDphase-'0'; } if (CDSonly) { seqstart=CDstart; seqend=CDend; if (strand=='-') seqend-=cdsadj; else seqstart+=cdsadj; } else { seqstart=exons.First()->start; 
seqend=exons.Last()->end; } Codon codon; int nt=0; //codon nucleotide counter (0..2) int aa=0; //aminoacid count if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) { sgend=seqend; //seqend within this segment } for (uint i=sgend;i>=sgstart;i--) { codon.nuc[nt]=ntComplement(gsubseq[i-start]); nt++; if (nt==3) { nt=0; translation[aa]=codon.translate(); aa++; } }//for each nt } //for each exon } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (seqendsgend) continue; if (seqstart>=sgstart && seqstart<=sgend) sgstart=seqstart; //seqstart within this segment if (seqend>=sgstart && seqend<=sgend) sgend=seqend; //seqend within this segment for (uint i=sgstart;i<=sgend;i++) { codon.nuc[nt]=gsubseq[i-start]; nt++; if (nt==3) { nt=0; translation[aa]=codon.translate(); aa++; } }//for each nt } //for each exon } // + strand translation[aa]=0; if (rlen!=NULL) *rlen=aa; return translation; } void GffObj::printSummary(FILE* fout) { if (fout==NULL) fout=stdout; fprintf(fout, "%s\t%c\t%d\t%d\t%4.2f\t%4.1f\n", gffID, strand, start, end, gscore, (float)qcov/10.0); } void decodeHexChars(char* dbuf, const char* s, int maxlen=1023) { int dlen=0; dbuf[0]=0; if (s==NULL) return; for (const char* p=s;(*p)!=0 && dlen'Z') a^=0x20; //toupper() if (a>'9') a=10+(a-'A'); else a-='0'; int b=p[2]; if (b>'Z') b^=0x20; if (b>'9') b=10+(b-'A'); else b-='0'; char c=(char)((a<<4)+b); if (c==';') c='.'; if (c>' ') { dbuf[dlen]=c; ++p;++p; ++dlen; continue; } } dbuf[dlen]=*p; ++dlen; } dbuf[dlen]=0; } void GffObj::printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars) { char dbuf[1024]; strcpy(dbuf,"."); GffAttrs* xattrs=NULL; if (exidx>=0) { if (exons[exidx]->score) sprintf(dbuf,"%.2f", exons[exidx]->score); xattrs=exons[exidx]->attrs; } if (phase==0 || !iscds) phase='.'; const char* ftype=iscds ? "CDS" : getSubfName(); const char* attrname=NULL; const char* attrval=NULL; if (gff3) { fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\tParent=%s", gseqname, tlabel, ftype, segstart, segend, dbuf, strand, phase, gffID); if (xattrs!=NULL) { for (int i=0;iCount();i++) { attrname=names->attrs.getName(xattrs->Get(i)->attr_id); if (cvtChars) { decodeHexChars(dbuf, xattrs->Get(i)->attr_val); fprintf(fout,";%s=%s", attrname, dbuf); } else { fprintf(fout,";%s=%s", attrname, xattrs->Get(i)->attr_val); } } } fprintf(fout, "\n"); } //GFF3 else {//for GTF -- we print only transcripts //if (isValidTranscript()) fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t%c\ttranscript_id \"%s\";", gseqname, tlabel, ftype, segstart, segend, dbuf, strand, phase, gffID); //char* geneid=(geneID!=NULL)? 
geneID : gffID; if (geneID) fprintf(fout," gene_id \"%s\";",geneID); if (gene_name!=NULL) { //fprintf(fout, " gene_name "); //if (gene_name[0]=='"') fprintf (fout, "%s;",gene_name); // else fprintf(fout, "\"%s\";",gene_name); fprintf(fout," gene_name \"%s\";",gene_name); } if (xattrs!=NULL) { for (int i=0;iCount();i++) { if (xattrs->Get(i)->attr_val==NULL) continue; attrname=names->attrs.getName(xattrs->Get(i)->attr_id); fprintf(fout, " %s ",attrname); if (cvtChars) { decodeHexChars(dbuf, xattrs->Get(i)->attr_val); attrval=dbuf; } else { attrval=xattrs->Get(i)->attr_val; } if (attrval[0]=='"') fprintf(fout, "%s;",attrval); else fprintf(fout, "\"%s\";",attrval); } } //for GTF, also append the GffObj attributes to each exon line if ((xattrs=this->attrs)!=NULL) { for (int i=0;iCount();i++) { if (xattrs->Get(i)->attr_val==NULL) continue; attrname=names->attrs.getName(xattrs->Get(i)->attr_id); fprintf(fout, " %s ",attrname); if (cvtChars) { decodeHexChars(dbuf, xattrs->Get(i)->attr_val); attrval=dbuf; } else { attrval=xattrs->Get(i)->attr_val; } if (attrval[0]=='"') fprintf(fout, "%s;",attrval); else fprintf(fout, "\"%s\";",attrval); } } fprintf(fout, "\n"); }//GTF } void GffObj::printGxf(FILE* fout, GffPrintMode gffp, const char* tlabel, const char* gfparent, bool cvtChars) { //char tmpstr[255]; char dbuf[1024]; if (tlabel==NULL) { tlabel=track_id>=0 ? names->tracks.Get(track_id)->name : (char*)"gffobj" ; } unxcoord(); //if (exons.Count()==0) return; const char* gseqname=names->gseqs.Get(gseq_id)->name; bool gff3 = (gffp>=pgffAny); bool showCDS = (gffp==pgtfAny || gffp==pgtfCDS || gffp==pgffCDS || gffp==pgffAny || gffp==pgffBoth); bool showExon = (gffp<=pgtfExon || gffp==pgffAny || gffp==pgffExon || gffp==pgffBoth); if (gff3) { //print GFF3 mRNA line: if (gscore>0.0) sprintf(dbuf,"%.2f", gscore); else strcpy(dbuf,"."); uint pstart, pend; if (gffp==pgffCDS) { pstart=CDstart; pend=CDend; } else { pstart=start;pend=end; } //const char* ftype=isTranscript() ? "mRNA" : getFeatureName(); const char* ftype=getFeatureName(); fprintf(fout, "%s\t%s\t%s\t%d\t%d\t%s\t%c\t.\tID=%s", gseqname, tlabel, ftype, pstart, pend, dbuf, strand, gffID); if (CDstart>0 && !showCDS/* && !isCDS*/) fprintf(fout,";CDS=%d-%d",CDstart,CDend); if (gfparent!=NULL) { //parent override fprintf(fout, ";Parent=%s",gfparent); } else { if (parent!=NULL && !parent->isDiscarded()) fprintf(fout, ";Parent=%s",parent->getID()); } if (geneID!=NULL) fprintf(fout, ";geneID=%s",geneID); if (gene_name!=NULL) fprintf(fout, ";gene_name=%s",gene_name); if (attrs!=NULL) { for (int i=0;iCount();i++) { const char* attrname=names->attrs.getName(attrs->Get(i)->attr_id); if (cvtChars) { decodeHexChars(dbuf, attrs->Get(i)->attr_val); fprintf(fout,";%s=%s", attrname, dbuf); } else { fprintf(fout,";%s=%s", attrname, attrs->Get(i)->attr_val); } } } fprintf(fout,"\n"); }// gff3 mRNA line bool is_cds_only = (gffp==pgffBoth) ? 
false : isCDS; if (showExon) { //print exons if (isCDS && exons.Count()>0 && ((strand=='-' && exons.Last()->phase<'0') || (strand=='+' && exons.Last()->phase<'0'))) updateExonPhase(); for (int i=0;istart, exons[i]->end, i, exons[i]->phase, gff3, cvtChars); } }//printing exons if (showCDS && !is_cds_only && CDstart>0) { if (isCDS) { for (int i=0;istart, exons[i]->end, i, exons[i]->phase, gff3, cvtChars); } } else { GArray cds(true,true); getCDSegs(cds); for (int i=0;i=0;i--) { exons[i]->phase='0'+ (3-cdsacc%3)%3; cdsacc+=exons[i]->end-exons[i]->start+1; } } else { //forward strand for (int i=0;iphase='0'+ (3-cdsacc%3)%3; cdsacc+=exons[i]->end-exons[i]->start+1; } } } void GffObj::getCDSegs(GArray& cds) { GffCDSeg cdseg; int cdsacc=0; if (CDphase=='1' || CDphase=='2') { cdsacc+= 3-(CDphase-'0'); } if (strand=='-') { for (int x=exons.Count()-1;x>=0;x--) { uint sgstart=exons[x]->start; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //cdstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //cdend within this segment cdseg.start=sgstart; cdseg.end=sgend; cdseg.exonidx=x; //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); cdseg.phase='0'+ (3-cdsacc%3)%3; cdsacc+=sgend-sgstart+1; cds.Add(cdseg); } //for each exon } // - strand else { // + strand for (int x=0;xstart; uint sgend=exons[x]->end; if (CDendsgend) continue; if (CDstart>=sgstart && CDstart<=sgend) sgstart=CDstart; //seqstart within this segment if (CDend>=sgstart && CDend<=sgend) sgend=CDend; //seqend within this segment cdseg.start=sgstart; cdseg.end=sgend; cdseg.exonidx=x; //cdseg.phase='0'+(cdsacc>0 ? (3-cdsacc%3)%3 : 0); cdseg.phase='0' + (3-cdsacc%3)%3 ; cdsacc+=sgend-sgstart+1; cds.Add(cdseg); } //for each exon } // + strand } tophat-2.0.9/src/codons.cpp0000644000175000017500000001074612122334361014336 0ustar toortoor#include "codons.h" static char codonTable[32768]; //32K table for fasta codon decoding // codons are encoded as triplets of 5-bit-encoded nucleotides // (so any codon can be encoded/decoded as a unique 15-bit value) static char codonData[]={ //long list of 3+1 characters (codon+translation) 'A','A','A','K', 'A','A','C','N', 'A','A','G','K', 'A','A','R','K', 'A','A','T','N', 'A','A','Y','N', 'A','C','A','T', 'A','C','B','T', 'A','C','C','T', 'A','C','D','T', 'A','C','G','T', 'A','C','H','T', 'A','C','K','T', 'A','C','M','T', 'A','C','N','T', 'A','C','R','T', 'A','C','S','T', 'A','C','T','T', 'A','C','V','T', 'A','C','W','T', 'A','C','Y','T', 'A','G','A','R', 'A','G','C','S', 'A','G','G','R', 'A','G','R','R', 'A','G','T','S', 'A','G','Y','S', 'A','T','A','I', 'A','T','C','I', 'A','T','G','M', 'A','T','H','I', 'A','T','M','I', 'A','T','T','I', 'A','T','W','I', 'A','T','Y','I', 'C','A','A','Q', 'C','A','C','H', 'C','A','G','Q', 'C','A','R','Q', 'C','A','T','H', 'C','A','Y','H', 'C','C','A','P', 'C','C','B','P', 'C','C','C','P', 'C','C','D','P', 'C','C','G','P', 'C','C','H','P', 'C','C','K','P', 'C','C','M','P', 'C','C','N','P', 'C','C','R','P', 'C','C','S','P', 'C','C','T','P', 'C','C','V','P', 'C','C','W','P', 'C','C','Y','P', 'C','G','A','R', 'C','G','B','R', 'C','G','C','R', 'C','G','D','R', 'C','G','G','R', 'C','G','H','R', 'C','G','K','R', 'C','G','M','R', 'C','G','N','R', 'C','G','R','R', 'C','G','S','R', 'C','G','T','R', 'C','G','V','R', 'C','G','W','R', 'C','G','Y','R', 'C','T','A','L', 'C','T','B','L', 'C','T','C','L', 'C','T','D','L', 'C','T','G','L', 'C','T','H','L', 'C','T','K','L', 'C','T','M','L', 'C','T','N','L', 
'C','T','R','L', 'C','T','S','L', 'C','T','T','L', 'C','T','V','L', 'C','T','W','L', 'C','T','Y','L', 'G','A','A','E', 'G','A','C','D', 'G','A','G','E', 'G','A','R','E', 'G','A','T','D', 'G','A','Y','D', 'G','C','A','A', 'G','C','B','A', 'G','C','C','A', 'G','C','D','A', 'G','C','G','A', 'G','C','H','A', 'G','C','K','A', 'G','C','M','A', 'G','C','N','A', 'G','C','R','A', 'G','C','S','A', 'G','C','T','A', 'G','C','V','A', 'G','C','W','A', 'G','C','Y','A', 'G','G','A','G', 'G','G','B','G', 'G','G','C','G', 'G','G','D','G', 'G','G','G','G', 'G','G','H','G', 'G','G','K','G', 'G','G','M','G', 'G','G','N','G', 'G','G','R','G', 'G','G','S','G', 'G','G','T','G', 'G','G','V','G', 'G','G','W','G', 'G','G','Y','G', 'G','T','A','V', 'G','T','B','V', 'G','T','C','V', 'G','T','D','V', 'G','T','G','V', 'G','T','H','V', 'G','T','K','V', 'G','T','M','V', 'G','T','N','V', 'G','T','R','V', 'G','T','S','V', 'G','T','T','V', 'G','T','V','V', 'G','T','W','V', 'G','T','Y','V', 'M','G','A','R', 'M','G','G','R', 'M','G','R','R', 'N','N','N','X', 'R','A','Y','B', 'S','A','R','Z', 'T','A','A','.', 'T','A','C','Y', 'T','A','G','.', 'T','A','R','.', 'T','A','T','Y', 'T','A','Y','Y', 'T','C','A','S', 'T','C','B','S', 'T','C','C','S', 'T','C','D','S', 'T','C','G','S', 'T','C','H','S', 'T','C','K','S', 'T','C','M','S', 'T','C','N','S', 'T','C','R','S', 'T','C','S','S', 'T','C','T','S', 'T','C','V','S', 'T','C','W','S', 'T','C','Y','S', 'T','G','A','.', 'T','G','C','C', 'T','G','G','W', 'T','G','T','C', 'T','G','Y','C', 'T','R','A','.', 'T','T','A','L', 'T','T','C','F', 'T','T','G','L', 'T','T','R','L', 'T','T','T','F', 'T','T','Y','F', 'X','X','X','X', 'Y','T','A','L', 'Y','T','G','L', 'Y','T','R','L' }; static bool isCodonTableReady=codonTableInit(); unsigned short packCodon(char n1, char n2, char n3) { //assumes they are uppercase already! 
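// Editor's note -- an illustration added for clarity, not part of the original
// TopHat sources. The packing below places n1 in bits 0-4, n2 in bits 5-9 and
// n3 in bits 10-14 of the returned value, i.e. code = (n1-'A') + ((n2-'A')<<5)
// + ((n3-'A')<<10), which is why the 32768-entry codonTable above suffices.
// Worked example (assuming byte is an 8-bit unsigned type):
//   packCodon('A','T','G'):  b1=0, b2=19, b3=6
//     b1 |= (19<<5)         -> b1 = 96  (low 3 bits of n2 end up in bits 5-7)
//     b2  = (19>>3)|(6<<2)  -> b2 = 26  (high 2 bits of n2 plus all 5 bits of n3)
//     return (26<<8)+96     -> 6752 = 0 + 19*32 + 6*1024
//   so the initialization loop would store 'M' (from the "ATG" entry above)
//   at codonTable[6752].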
byte b1=n1-'A'; byte b2=n2-'A'; byte b3=n3-'A'; b1 |= (b2 << 5); b2 = (b2 >> 3) | (b3 << 2); return ( ((unsigned short)b2) << 8) + b1; } bool codonTableInit() { memset((void*)codonTable, 'X', 32768); int cdsize=sizeof(codonData); for (int i=0;i #endif #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "gff.h" #include "GFaSeqGet.h" #include "FastaTools.h" std::string get_exonic_sequence(GffObj& p_trans, FastaRecord& rec, std::string& coords); class GTFToFasta { public: GTFToFasta(std::string gtf_fname, std::string genome_fname); ~GTFToFasta(); void make_transcriptome(std::string out_fname); // for debugging void print_mapping(); private: GffReader gtfReader_; GFaSeqGet genome_fhandle_; std::string gtf_fname_; std::string genome_fname_; FILE* gtf_fhandle_; // "contig" => vector(index_of_gff_obj) // typedef std::map* > ContigTransMap; typedef std::map* > ContigTransMap; ContigTransMap contigTransMap_; void transcript_map(); GTFToFasta(); // Don't want anyone calling the constructor w/o options }; #endif tophat-2.0.9/src/fusions.h0000755000175000017500000001324512122334360014203 0ustar toortoor#ifndef FUSIONS_H #define FUSIONS_H /* * fusions.h * TopHat * * Adapted from junctions.h */ #include #include #include #include #include #include #include #include #include #include #include "bwt_map.h" using namespace std; struct Fusion { Fusion (uint32_t ref1, uint32_t ref2, uint32_t l, uint32_t r, uint32_t d = FUSION_FF) : refid1(ref1), refid2(ref2), left(l), right(r), dir(d) {} Fusion() : refid1(0), refid2(0), left(0), right(0), dir(FUSION_FF) {} uint32_t refid1; uint32_t refid2; uint32_t left; uint32_t right; uint32_t dir; bool operator<(const Fusion& rhs) const { if (refid1 < rhs.refid1) return true; else if (refid1 > rhs.refid1) return false; if (refid2 < rhs.refid2) return true; else if (refid2 > rhs.refid2) return false; if (left < rhs.left) return true; else if (left > rhs.left) return false; if (right < rhs.right) return true; else if (right > rhs.right) return false; if (dir < rhs.dir) return true; else if(dir > rhs.dir) return false; return false; } bool operator==(const Fusion& rhs) const { return refid1 == rhs.refid1 && refid2 == rhs.refid2 && left == rhs.left && right == rhs.right && dir == rhs.dir; } }; struct FusionPairSupport { FusionPairSupport(int l, int r) : ldist(l), rdist(r) {} bool operator<(const FusionPairSupport& rhs) const { if (abs(ldist) + abs(rdist) < abs(rhs.ldist) + abs(rhs.rdist)) return true; else return false; } int ldist; int rdist; }; struct FusionSimpleStat { uint32_t count; // # of reads that support the fusion uint32_t edit_dist; // the smallest edit dist among the supporting reads bool skip; // bool left_coincide_with_splice_junction; // bool right_coincide_with_splice_junction; // FusionSimpleStat& merge_with(const FusionSimpleStat& other) { if (this == &other) return *this; count += other.count; edit_dist = min(edit_dist, other.edit_dist); return *this; } }; struct FusionStat { uint32_t count; // # of reads that support the fusion uint32_t unsupport_count; // # of reads that unsupport the fusion, that is, reads that span one chromosome. 
uint32_t unsupport_count_pair; uint32_t pair_count; // # of pairs that support the fusion uint32_t pair_count_fusion; // # of pairs that support the fusion where one read spans the fusion float symm; string chr1_seq; string chr2_seq; static const uint32_t NUM_BASES = 50; vector left_bases; vector right_bases; vector diffs; uint32_t left_ext; uint32_t right_ext; vector vPairSupport; /* * daehwan - this is a metadata indicating whether Fusion is reversed .. * this is not a good way to ... */ bool reversed; FusionStat() : left_bases(vector(NUM_BASES, 0)), right_bases(vector(NUM_BASES, 0)) { count = 0; unsupport_count = 0; unsupport_count_pair = 0; pair_count = 0; pair_count_fusion = 0; symm = 0.0f; left_ext = right_ext = 0; reversed = false; } FusionStat& merge_with(const FusionStat& other_fusion) { if (this == &other_fusion) return *this; count += other_fusion.count; unsupport_count += other_fusion.unsupport_count; unsupport_count_pair += other_fusion.unsupport_count_pair; pair_count += other_fusion.pair_count; pair_count_fusion += other_fusion.pair_count_fusion; if (other_fusion.count > 0) { symm = other_fusion.symm; chr1_seq = other_fusion.chr1_seq; chr2_seq = other_fusion.chr2_seq; diffs = other_fusion.diffs; } assert (left_bases.size() == right_bases.size()); assert (left_bases.size() == other_fusion.left_bases.size()); assert (right_bases.size() == other_fusion.right_bases.size()); for (size_t i = 0; i < left_bases.size(); ++i) { left_bases[i] += other_fusion.left_bases[i]; right_bases[i] += other_fusion.right_bases[i]; } left_ext = max(left_ext, other_fusion.left_ext); right_ext = max(right_ext, other_fusion.right_ext); vPairSupport.insert(vPairSupport.end(), other_fusion.vPairSupport.begin(), other_fusion.vPairSupport.end()); return *this; } }; struct fusion_comparison { bool operator()(const Fusion& lhs, const Fusion& rhs) const { if (lhs.left != rhs.left) return lhs.left < rhs.left; if (lhs.right != rhs.right) return lhs.right < rhs.right; if (lhs.refid1 != rhs.refid1) return lhs.refid1 < rhs.refid1; if (lhs.refid2 != rhs.refid2) return lhs.refid2 < rhs.refid2; if (lhs.dir != rhs.dir) return lhs.dir < rhs.dir; return false; } }; // this is used in segment_juncs typedef std::map FusionSimpleSet; // this is used in tophat_reports typedef std::map FusionSet; void fusions_from_alignment(const BowtieHit& bh, FusionSet& fusions, RefSequenceTable& rt, bool update_stat = false); void unsupport_fusions(const BowtieHit& bh, FusionSet& fusions, const FusionSet& fusions_ref); void print_fusions(FILE* fusions_out, FusionSet& fusions, RefSequenceTable& ref_sequences); void fusions_from_spliced_hit(const BowtieHit& bh, vector& fusions, bool auto_sort = true); void pair_support(const vector >& best_hits, FusionSet& fusions, FusionSet& fusions_ref); void merge_with(FusionSimpleSet& fusions, const FusionSimpleSet& other_fusions); void merge_with(FusionSet& fusions, const FusionSet& other_fusions); #endif tophat-2.0.9/src/GStr.h0000644000175000017500000002155512157116165013405 0ustar toortoor//--------------------------------------------------------------------------- #ifndef GSTR_H #define GSTR_H //--------------------------------------------------------------------------- #include "GBase.h" #include #include #include // This class uses reference counting and copy-on-write semantics // All indexes are zero-based. For all functions that accept an index, a // negative index specifies an index from the right of the string. 
Also, // for all functions that accept a length, a length of -1 specifies the rest // of the string. enum enTokenizeMode { tkFullString, tkCharSet }; class GStr { friend GStr operator+(const char* s1, const GStr& s2); friend bool operator==(const char* s1, const GStr& s2); friend bool operator<(const char* s1, const GStr& s2); friend bool operator<=(const char* s1, const GStr& s2); friend bool operator>(const char* s1, const GStr& s2); friend bool operator>=(const char* s1, const GStr& s2); friend bool operator!=(const char* s1, const GStr& s2); friend void Gswap(GStr& s1, GStr& s2); public: GStr(); GStr(const GStr& s); GStr(const char* s); GStr(const int i); GStr(const double f); GStr(char c, int n = 1); ~GStr(); operator const char* () const { return my_data->chars;} //inline here char& operator[](int index); char operator[](int index) const; GStr& operator=(const GStr& s); GStr& operator=(const char* s); GStr& operator=(const int i); GStr& operator=(const double f); GStr operator+(const GStr& s) const; GStr operator+(const char* s) const; GStr operator+(const char c) const; GStr operator+(const int i) const; GStr operator+(const double f) const; bool operator==(const GStr& s) const; bool operator==(const char* s) const; bool operator<(const GStr& s) const; bool operator<(const char* s) const; bool operator<=(const GStr& s) const; bool operator<=(const char* s) const; bool operator>(const GStr& s) const; bool operator>(const char* s) const; bool operator>=(const GStr& s) const; bool operator>=(const char* s) const; bool operator!=(const GStr& s) const; bool operator!=(const char* s) const; GStr& operator+=(const GStr& s) { return append(s.chars()); } GStr& operator+=(const char* s) { return append(s); } GStr& operator+=(char c) { return append(c); } GStr& operator+=(int i) { return append(i); } GStr& operator+=(uint i) { return append(i); } GStr& operator+=(long l) { return append(l); } GStr& operator+=(unsigned long l) { return append(l); } GStr& operator+=(double f); //interface: public: int length() const; bool is_empty() const; bool is_space() const; GStr substr(int index = 0, int len = -1) const; GStr to(char c); //return the first part up to first occurence of c //or whole string if c not found GStr from(char c); //same as to, but starting from the right side GStr copy() const; GStr& format(const char *fmt,...); GStr& reverse(); GStr& appendfmt(const char *fmt,...); GStr& cut(int index = 0, int len = -1); //delete a specified length GStr& remove(int from, int to) { return cut(from, to-from+1); } //paste a string at the specified position GStr& paste(const GStr& s, int index = 0, int len=-1); GStr& paste(const char* s, int index = 0, int len = -1); GStr& replace(const char* from, const char* to=NULL); GStr& insert(const GStr& s, int index = 0); GStr& insert(const char* s, int index = 0); GStr& append(const char* s); GStr& append(const GStr& s); GStr& append(char c); GStr& append(int i); GStr& append(long l); GStr& append(double f); GStr& append(uint i); GStr& append(unsigned long l); GStr& upper(); GStr& lower(); GStr& clear();//make empty //character translation or removal: GStr& tr(const char* from, const char* to=NULL); //number of occurences of a char in the string: int count(char c); void startTokenize(const char* delimiter=" \t\n", enTokenizeMode tokenizemode=tkCharSet); bool nextToken(GStr& token); int asInt(int base=10); double asReal(); double asDouble() { return asReal(); } bool asReal(double& r); bool asDouble(double& r) { return asReal(r); } bool asInt(int& r, int 
base=10); int index(const GStr& s, int start_index = 0) const; int index(const char* s, int start_index = 0) const; int index(char c, int start_index = 0) const; int rindex(char c, int end_index = -1) const; int rindex(const char* str, int end_index = -1) const; bool contains(const GStr& s) const; bool contains(const char* s) const; bool contains(char c) const; bool startsWith(const char* s) const; bool startsWith(const GStr& s) const; bool endsWith(const char* s) const; bool endsWith(const GStr& s) const; GStr split(const char* delim); GStr split(char c); /* splits "this" in two parts, at the first (leftmost) encounter of delim: 1st would stay in "this" (which this way is truncated) 2nd will go to the returned string */ GStr splitr(const char* delim); GStr splitr(char c); /* splits "this" in two parts, at the last (rightmost) encounter of delim: 1st would stay in "this" 2nd will be returned */ int peelInt() const; //extract an integer, (left to right), from a //mixed alphanumeric string, e.g. 'T24HC1234b'=> 2 int peelIntR() const; //same as above, but starts from the right side //e.g. 'T2HC1234b'=> 1234 GStr& trim(char c); GStr& trim(const char* c=" \t\n\r"); //trim both ends of characters in given set GStr& trimR(const char* c=" \t\n\r"); //trim only right end GStr& trimR(char c=' '); GStr& chomp(char c='\n') { return trimR(c); } GStr& chomp(const char* cstr); //like trimR, but given string is taken as a whole GStr& trimL(const char* c=" \t\n\r"); //trim only left end GStr& trimL(char c=' '); GStr& padR(int len, char c=' '); //align it in len spaces to the right GStr& padL(int len, char c=' '); //align it in len spaces to the left GStr& padC(int len, char c=' '); //center it size_t read(FILE* stream, const char* delimiter="\n", size_t bufsize=4096); //read next token from stream, using the given string as //a marker where the block should stop const char* chars() const; const char* text() const; protected: char* fTokenDelimiter; int fLastTokenStart; enTokenizeMode fTokenizeMode; void* readbuf; //file read buffer for the read() function size_t readbufsize; //last setting for the readbuf static void invalid_args_error(const char* fname); static void invalid_index_error(const char* fname); struct Data {//structure holding actual //string data and reference count information Data() { ref_count=0; length=0; chars[0] = '\0'; } unsigned int ref_count; int length; char chars[1]; }; static Data* new_data(int length); //alloc a specified length string's Data static Data* new_data(const char* str); //alloc a copy of a specified string void replace_data(int length); void replace_data(Data* data); void make_unique(); char* chrs(); // this is dangerous, length should not be affected static Data null_data; //a null (empty) string Data is available here Data* my_data; //pointer to a Data object holding actual string data }; /***************************************************************************/ inline int GStr::length() const { return my_data->length; } inline const char *GStr::chars() const { return my_data->chars; } inline char *GStr::chrs() { //protected version, allows modification of the chars return my_data->chars; } inline const char *GStr::text() const { return my_data->chars; } inline bool operator>=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) >= 0); } inline bool operator!=(const char *s1, const GStr& s2) { return (strcmp(s1, s2.chars()) != 0); } inline void Gswap(GStr& s1, GStr& s2) { GStr::Data *tmp = s1.my_data; s1.my_data = s2.my_data; s2.my_data = tmp; } #endif 
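// Editor's illustration (not part of the TopHat 2.0.9 archive): a minimal
// usage sketch of the GStr indexing conventions described in the class
// comment above -- a negative index counts from the right end, and a length
// of -1 means "the rest of the string". It assumes only the GStr API declared
// in GStr.h above:
//
//   #include "GStr.h"
//   void gstr_index_sketch() {
//     GStr s("reads_1.fq");
//     char last = s[-1];           // 'q'  : index -1 is the last character
//     GStr ext  = s.substr(-2);    // "fq" : start 2 from the right, len -1 = rest
//     GStr stem = s.substr(0, 7);  // "reads_1"
//   }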
tophat-2.0.9/src/GStr.cpp0000644000175000017500000010417012157116165013733 0ustar toortoor//--------------------------------------------------------------------------- #include "GStr.h" #include #include #include #include "GBase.h" #include #include //--------------------------------------------------------------------------- GStr::Data GStr::null_data; //========================================= GStr::Data * GStr::new_data(int length) { //static method to return a new Data object (allocate length) //content is undefined, but it's null terminated if (length > 0) { Data* data; GMALLOC(data, sizeof(Data)+length); data->ref_count = 0; data->length = length; data->chars[length] = '\0'; return data; } else return &null_data; } GStr::Data* GStr::new_data(const char* str) { //static method to return a new Data object (allocate length) //as a copy of a given string if (str==NULL) return &null_data; int length=strlen(str); if (length > 0) { Data* data; GMALLOC(data, sizeof(Data)+length); strcpy(data->chars, str); data->ref_count = 0; data->length = length; data->chars[length] = '\0'; return data; } else return &null_data; } void GStr::replace_data(int len) { if (len == my_data->length && my_data->ref_count <= 1) return; if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (len > 0) { //my_data = (Data *) malloc(sizeof(Data) + len); GMALLOC(my_data, sizeof(Data) + len); my_data->ref_count = 1; my_data->length = len; my_data->chars[len] = '\0'; } else my_data = &null_data; } void GStr::replace_data(Data *data) { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); if (data != &null_data) data->ref_count++; my_data = data; } void GStr::make_unique() {//make sure it's not a reference to other string if (my_data->ref_count > 1) { Data *data = new_data(length()); ::memcpy(data->chars, chars(), length()); my_data->ref_count--; my_data = data; my_data->ref_count++; } } bool operator==(const char *s1, const GStr& s2){ if (s1==NULL) return s2.is_empty(); return (strcmp(s1, s2.chars()) == 0); } bool operator<(const char *s1, const GStr& s2) { if (s1==NULL) return !s2.is_empty(); return (strcmp(s1, s2.chars()) < 0); } bool operator<=(const char *s1, const GStr& s2){ if (s1==NULL) return true; return (strcmp(s1, s2.chars()) <= 0); } bool operator>(const char *s1, const GStr& s2) { if (s1==NULL) return false; return (strcmp(s1, s2.chars()) > 0); } GStr::GStr():my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; } GStr::GStr(const GStr& s): my_data(&null_data){ fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; replace_data(s.my_data); } GStr::GStr(const char *s): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; my_data=new_data(s); my_data->ref_count = 1; } GStr::GStr(const int i): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(const double f): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; readbufsize=0; char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); } GStr::GStr(char c, int n): my_data(&null_data) { fTokenDelimiter=NULL; fTokenizeMode=tkCharSet; fLastTokenStart=0; readbuf=NULL; 
readbufsize=0; replace_data(n); ::memset(chrs(), c, n); } GStr::~GStr() { if (my_data != &null_data && --my_data->ref_count == 0) GFREE(my_data); GFREE(fTokenDelimiter); GFREE(readbuf); } char& GStr::operator[](int idx){ //returns reference to char (can be l-value) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); make_unique(); //because the user will probably modify this char! return chrs()[idx]; } char GStr::operator[](int idx) const { //returns char copy (cannot be l-value!) if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("operator[]"); return chars()[idx]; } GStr& GStr::operator=(const GStr& s) { make_unique(); //edit operation ahead replace_data(s.my_data); return *this; } GStr& GStr::operator=(const char *s) { make_unique(); //edit operation ahead if (s==NULL) { replace_data(0); return *this; } const int len = ::strlen(s); replace_data(len); ::memcpy(chrs(), s, len); return *this; } GStr& GStr::operator=(const double f) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%f",f); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); return *this; } GStr& GStr::operator=(const int i) { make_unique(); //edit operation ahead char buf[20]; sprintf(buf,"%d",i); const int len = ::strlen(buf); replace_data(len); ::memcpy(chrs(), buf, len); return *this; } bool GStr::operator==(const GStr& s) const { if (s.is_empty()) return is_empty(); return (length() == s.length()) && (memcmp(chars(), s.chars(), length()) == 0); } bool GStr::operator==(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(chars(), s) == 0); } bool GStr::operator<(const GStr& s) const { if (s.is_empty()) return false; return (strcmp(chars(), s.chars()) < 0); } bool GStr::operator<(const char *s) const { if (s==NULL) return false; return (strcmp(chars(), s) < 0); } bool GStr::operator<=(const GStr& s) const { if (s.is_empty()) return is_empty(); return (strcmp(chars(), s.chars()) <= 0); } bool GStr::operator<=(const char *s) const { if (s==NULL) return is_empty(); return (strcmp(chars(), s) <= 0); } bool GStr::operator>(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (strcmp(chars(), s.chars()) > 0); } bool GStr::operator>(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(chars(), s) > 0); } bool GStr::operator>=(const GStr& s) const { if (s.is_empty()) return true; return (strcmp(chars(), s.chars()) >= 0); } bool GStr::operator>=(const char *s) const { if (s==NULL) return true; return (strcmp(chars(), s) >= 0); } bool GStr::operator!=(const GStr& s) const { if (s.is_empty()) return !is_empty(); return (length() != s.length()) || (memcmp(chars(), s.chars(), length()) != 0); } bool GStr::operator!=(const char *s) const { if (s==NULL) return !is_empty(); return (strcmp(chars(), s) != 0); } GStr& GStr::append(char c) { char buf[5]; sprintf(buf,"%c",c); return append(buf); } GStr& GStr::append(int i) { char buf[20]; sprintf(buf,"%d",i); return append(buf); } GStr& GStr::append(uint i) { char buf[20]; sprintf(buf,"%u",i); return append(buf); } GStr& GStr::append(long l) { char buf[20]; sprintf(buf,"%ld",l); return append(buf); } GStr& GStr::append(unsigned long l) { char buf[20]; sprintf(buf,"%lu", l); return append(buf); } GStr& GStr::append(double f) { char buf[30]; sprintf(buf,"%f",f); return append(buf); } bool GStr::is_empty() const { //return my_data == &null_data; return (length()==0); } GStr GStr::copy() const { GStr newstring(*this); return 
newstring; } GStr& GStr::clear() { make_unique(); //edit operation ahead replace_data(0); return *this; } int GStr::index(const GStr& s, int start_index) const { return index(s.chars(), start_index); } bool GStr::contains(const GStr& s) const { return (index(s, 0) >= 0); } bool GStr::contains(const char *s) const { return (index(s, 0) >= 0); } bool GStr::startsWith(const char *s) const { //return (index(s, 0) == 0); return ::startsWith(this->chars(), s); } bool GStr::startsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::startsWith(this->chars(), s.chars()); } bool GStr::endsWith(const char *s) const { //return (index(s, 0) == 0); return ::endsWith(this->chars(), s); } bool GStr::endsWith(const GStr& s) const { //return (index(s, 0) == 0); return ::endsWith(this->chars(), s.chars()); } bool GStr::contains(char c) const { return (index(c, 0) >= 0); } GStr& GStr::format(const char *fmt,...) { // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions int len=vsprintf(buf,fmt,arguments); va_end(arguments); replace_data(len); //this also adds the '\0' at the end! //and sets the right len ::memcpy(chrs(), buf, len); GFREE(buf); return *this; } GStr& GStr::appendfmt(const char *fmt,...) { // Format as in sprintf make_unique(); //edit operation ahead char* buf; GMALLOC(buf, strlen(fmt)+1024); va_list arguments; va_start(arguments,fmt); //+1K buffer, should be enough for common expressions vsprintf(buf,fmt,arguments); va_end(arguments); append(buf); GFREE(buf); return *this; } GStr& GStr::trim(char c) { register int istart; register int iend; for (istart=0; istartistart && chars()[iend]==c;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trim(const char* c) { register int istart; register int iend; for (istart=0; istartistart && strchr(c, chars()[iend])!=NULL;iend--) ; int newlen=iend-istart+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimR(char c) { //only trim the right end //register int istart; register int iend; for (iend=length()-1; iend>=0 && chars()[iend]==c;iend--) ; if (iend==-1) { replace_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::trimR(const char* c) { register int iend; for (iend=length()-1; iend>=0 && strchr(c,chars()[iend])!=NULL;iend--) ; if (iend==-1) { replace_data(0); //string was entirely trimmed return *this; } int newlen=iend+1; if (newlen==length()) //nothing to trim return *this; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::chomp(const char* cstr) { register int iend; if (cstr==NULL || *cstr==0) return *this; //check if this ends with cstr int cend=strlen(cstr)-1; iend=my_data->length-1; while (iend>=0 && cend>=0) { if (my_data->chars[iend]!=cstr[cend]) return *this; iend--; cend--; } if 
(iend==-1) { replace_data(0); //string will be entirely trimmed return *this; } int newlen=iend+1; make_unique(); //edit operation ahead Data *data = new_data(newlen); ::memcpy(data->chars, chars(), newlen); replace_data(data); return *this; } GStr& GStr::trimL(char c) { register int istart; for (istart=0; istartchars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::trimL(const char* c) { register int istart; for (istart=0; istartchars, &chars()[istart], newlen); replace_data(data); return *this; } GStr& GStr::padR(int len, char c) { //actually means align right in len if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memset(data->chars,c,len-length()); ::memcpy(&data->chars[len-length()], chars(), length()); replace_data(data); return *this; } GStr& GStr::padL(int len, char c) { //align left the string if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead Data *data = new_data(len); ::memcpy(data->chars, chars(), length()); ::memset(&data->chars[length()],c,len-length()); replace_data(data); return *this; } GStr& GStr::padC(int len, char c) { if (length()>=len) return *this; //no room for padding make_unique(); //edit operation ahead int istart=(len-length())/2; Data *data = new_data(len); if (istart>0) ::memset(data->chars, c, istart); ::memcpy(&data->chars[istart], chars(), length()); int iend=istart+length(); if (iendchars[iend],c,len-iend); replace_data(data); return *this; } GStr operator+(const char *s1, const GStr& s2) { const int s1_length = ::strlen(s1); if (s1_length == 0) return s2; else { GStr newstring; newstring.replace_data(s1_length + s2.length()); ::memcpy(newstring.chrs(), s1, s1_length); ::memcpy(&(newstring.chrs())[s1_length], s2.chars(), s2.length()); return newstring; } } //========================================= GStr GStr::operator+(const GStr& s) const { if (length() == 0) return s; else if (s.length() == 0) return *this; else { GStr newstring; newstring.replace_data(length() + s.length()); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], s.chars(), s.length()); return newstring; } } //========================================= GStr GStr::operator+(const char *s) const { const int s_length = ::strlen(s); if (s_length == 0) return *this; else { GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], s, s_length); return newstring; } } GStr GStr::operator+(const int i) const { char buf[20]; sprintf(buf, "%d", i); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const char c) const { char buf[4]; sprintf(buf, "%c", c); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } GStr GStr::operator+(const double f) const { char buf[30]; sprintf(buf, "%f", f); const int s_length = ::strlen(buf); GStr newstring; newstring.replace_data(length() + s_length); ::memcpy(newstring.chrs(), chars(), length()); ::memcpy(&(newstring.chrs())[length()], buf, s_length); return newstring; } //========================================= bool GStr::is_space() const { if (my_data 
== &null_data) return false; for (register const char *p = chars(); *p; p++) if (!isspace(*p)) return false; return true; } //========================================= GStr GStr::substr(int idx, int len) const { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. if (len < 0 || len>length()-idx) len = length() - idx; if (idx<0 || idx>=length() || len<0 ) invalid_args_error("substr()"); GStr newstring; newstring.replace_data(len); ::memcpy(newstring.chrs(), &chars()[idx], len); return newstring; } GStr& GStr::reverse() { make_unique(); int l=0; int r=my_data->length-1; char c; while (lchars[l]; my_data->chars[l]=my_data->chars[r]; my_data->chars[r]=c; l++;r--; } return *this; } //transform: any character from 'from' is replaced with a coresponding //char from 'to' GStr& GStr::tr(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); if (rto!=NULL && strlen(rto)!=l) invalid_args_error("tr()"); make_unique(); //edit operation ahead Data *data = new_data(length()); if (rto==NULL) { //deletion case char* s = my_data->chars; char* p; char* dest = data->chars; do { if ((p=strpbrk(s,rfrom))!=NULL) { memcpy(dest,s,p-s); dest+=p-s; s=p+1; } else { strcpy(dest, s); dest+=strlen(s); } } while (p!=NULL); (*dest)='\0'; } else { //char substitution case - easier! const char* p; for (int i=0; ichars[i]))!=NULL) my_data->chars[i]=rto[p-rfrom]; } } data->length=strlen(data->chars); replace_data(data); return *this; } // search and replace all the occurences of a string with another string // or just remove the given string (if replacement is NULL) GStr& GStr::replace(const char *rfrom, const char* rto) { if (length() == 0 || rfrom==NULL || strlen(rfrom)==0) return *this; unsigned int l=strlen(rfrom); unsigned int tl= (rto==NULL)?0:strlen(rto); make_unique(); //edit operation ahead char* p; char* dest; char* newdest=NULL; char* s = my_data->chars; if (tl!=l) { //reallocation if (tl>l) { //possible enlargement GMALLOC(newdest, length()*(tl-l+1)+1); } else {//delete or replace with a shorter string GMALLOC(newdest, length() + 1); } dest=newdest; if (tl==0) {//deletion while ((p=strstr(s,rfrom))!=NULL) { //rfrom found at position p memcpy(dest,s,p-s); dest+=p-s; s+=p-s+l; //s positioned in string after rfrom } //no more occurences, copy the remaining string strcpy(dest, s); } else { //replace with another string while ((p=strstr(s,rfrom))!=NULL) { memcpy(dest,s,p-s); //copy up rto the match dest+=p-s; memcpy(dest,rto,tl); //put the replacement string dest+=tl; s+=p-s+l; } //not found any more, copy rto end of string strcpy(dest, s); } Data* data=new_data(newdest); replace_data(data); GFREE(newdest); } else { //inplace editing: no need rto reallocate while ((p=strstr(s,rfrom))!=NULL) { memcpy(p,rto,l); s+=p-s+l; } } return *this; } GStr& GStr::cut(int idx, int len) { if (len == 0) return *this; make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string, // so the left part will be cut out if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. 
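// (Editor's illustration, not in the original source: with these conventions
//  GStr("abcdef").cut(1,2) leaves "adef", while GStr("abcdef").cut(-2) removes
//  the last two characters and leaves "abcd".)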
if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("cut()"); Data *data = new_data(length() - len); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::strcpy(&data->chars[idx], &chars()[idx+len]); replace_data(data); return *this; } //========================================= GStr& GStr::paste(const GStr& s, int idx, int len) { // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); make_unique(); //edit operation ahead // A length of -1 specifies the rest of the string. if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); if (len == s.length() && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s.chars(), len); else { Data *data = new_data(length() - len + s.length()); if (idx > 0) ::memcpy(data->chars, chars(), idx); if (s.length() > 0) ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &chars()[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::paste(const char *s, int idx, int len) { // A negative idx specifies an idx from the right of the string. make_unique(); //edit operation ahead if (idx < 0) idx += length(); // A length of -1 specifies the rest of the string. if (len == -1) len = length() - idx; if (idx<0 || idx>=length() || len<0 || len>length()-idx) invalid_args_error("replace()"); const int s_length = ::strlen(s); if (len == s_length && my_data->ref_count == 1) ::memcpy(&chrs()[idx], s, len); else { Data *data = new_data(length() - len + s_length); if (idx > 0) ::memcpy(data->chars, chars(), idx); if (s_length > 0) ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &chars()[idx+len]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const GStr& s, int idx) { make_unique(); //edit operation ahead // A negative idx specifies an idx from the right of the string. if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); if (s.length() > 0) { Data *data = new_data(length() + s.length()); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::memcpy(&data->chars[idx], s.chars(), s.length()); ::strcpy(&data->chars[idx+s.length()], &chars()[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::insert(const char *s, int idx) { // A negative idx specifies an idx from the right of the string. 
make_unique(); //edit operation ahead if (idx < 0) idx += length(); if (idx < 0 || idx >= length()) invalid_index_error("insert()"); const int s_length = ::strlen(s); if (s_length > 0) { Data *data = new_data(length() + s_length); if (idx > 0) ::memcpy(data->chars, chars(), idx); ::memcpy(&data->chars[idx], s, s_length); ::strcpy(&data->chars[idx+s_length], &chars()[idx]); replace_data(data); } return *this; } //========================================= GStr& GStr::append(const char* s) { make_unique(); //edit operation ahead int len=::strlen(s); int newlength=len+my_data->length; if (newlength<=my_data->length) return *this; if (my_data->length==0) { replace_data(len); ::memcpy(my_data->chars, s, len); return *this; } //faster solution with realloc GREALLOC(my_data, sizeof(Data)+newlength); ::strcpy(&my_data->chars[my_data->length], s); my_data->length=newlength; my_data->chars[newlength]='\0'; return *this; } GStr& GStr::append(const GStr& s) { return append((const char *)s); } GStr& GStr::upper() { make_unique(); //edit operation ahead for (register char *p = chrs(); *p; p++) *p = (char) toupper(*p); return *this; } //========================================= GStr& GStr::lower() { make_unique(); for (register char *p = chrs(); *p; p++) *p = (char) tolower(*p); return *this; } //========================================= int GStr::index(const char *s, int start_index) const { // A negative index specifies an index from the right of the string. if (strlen(s)>(size_t)length()) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); const char* idx = strstr(&chars()[start_index], s); if (!idx) return -1; else return idx - chars(); } //========================================= int GStr::index(char c, int start_index) const { // A negative index specifies an index from the right of the string. if (length()==0) return -1; if (start_index < 0) start_index += length(); if (start_index < 0 || start_index >= length()) invalid_index_error("index()"); if (c == '\0') return -1; const char *idx=(char *) ::memchr(&chars()[start_index], c, length()-start_index); if (idx==NULL) return -1; else return idx - chars(); } int GStr::rindex(char c, int end_index) const { if (c == 0 || length()==0 || end_index>=length()) return -1; if (end_index<0) end_index=my_data->length-1; for (int i=end_index;i>=0;i--) { if (my_data->chars[i]==c) return i; } return -1; } int GStr::rindex(const char* str, int end_index) const { if (str==NULL || *str == '\0' || length()==0 || end_index>=length()) return -1; int slen=strlen(str); if (end_index<0) end_index=my_data->length-1; //end_index is the index of the right-side boundary //the scanning starts at the end if (end_index>=0 && end_index=0;i--) { if (memcmp((void*)(my_data->chars+i),(void*)str, slen)==0) return i; } return -1; } GStr GStr::split(const char* delim) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! */ GStr result; int i=index(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::split(char c) { /* splits "this" in two parts, at the first (left) encounter of delim: 1st would stay in "this", 2nd part will be returned as a new string! 
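     (Editor's illustration, not part of the original comment: e.g. after
      GStr s("chr7:127471196"); GStr pos = s.split(':');
      s holds "chr7" and pos holds "127471196".)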
*/ GStr result; int i=index(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } GStr GStr::splitr(const char* delim) { GStr result; int i=rindex(delim); if (i>=0){ result=substr(i+strlen(delim)); cut(i); return result; } return result; } GStr GStr::splitr(char c) { GStr result; int i=rindex(c); if (i>=0){ result=substr(i+1); cut(i); return result; } return result; } void GStr::startTokenize(const char* delimiter, enTokenizeMode tokenizemode) { GFREE(fTokenDelimiter); if (delimiter) { GMALLOC(fTokenDelimiter,strlen(delimiter)+1); strcpy(fTokenDelimiter, delimiter); } fLastTokenStart=0; fTokenizeMode=tokenizemode; } bool GStr::nextToken(GStr& token) { if (fTokenDelimiter==NULL) { GError("GStr:: no token delimiter; use StartTokenize first\n"); } if (fLastTokenStart>=length()) {//no more GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } int dlen=strlen(fTokenDelimiter); char* delpos=NULL; //delimiter position int tlen=0; if (fTokenizeMode==tkFullString) { //exact string as a delimiter delpos=(char*)strstr(chars()+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(chars()+length()); //empty records may be returned if (chars()+fLastTokenStart == delpos) { //empty token fLastTokenStart=(delpos-chars())+dlen; token=""; return true; } else { tlen=delpos-(chars()+fLastTokenStart); token.replace_data(tlen); ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen); fLastTokenStart=(delpos-chars())+dlen; return true; } } else { //tkCharSet - any character is a delimiter //empty records are never returned ! if (fLastTokenStart==0) {//skip any starting delimiters delpos=(char*)chars(); while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; if (*delpos!='\0') fLastTokenStart = delpos-chars(); else { //only delimiters here,no tokens GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } } //now fLastTokenStart is on a non-delimiter char //GMessage("String at fLastTokenStart=%d is %s\n", fLastTokenStart, delpos); char* token_end=NULL; delpos=(char*)strpbrk(chars()+fLastTokenStart,fTokenDelimiter); if (delpos==NULL) delpos=(char*)(chars()+length()); token_end=delpos-1; while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL) delpos++; //skip any other delimiters in the set! //now we know that delpos is on the beginning of next token tlen=(token_end-chars())-fLastTokenStart+1; if (tlen==0) { GFREE(fTokenDelimiter); fLastTokenStart=0; return false; } token.replace_data(tlen); ::memcpy(token.chrs(), &chars()[fLastTokenStart], tlen); fLastTokenStart=delpos-chars(); return true; } //return true; } size_t GStr::read(FILE* stream, const char* delimiter, size_t bufsize) { //read up to (and including) the given delimiter string if (readbuf==NULL) { GMALLOC(readbuf, bufsize); readbufsize=bufsize; } else if (bufsize!=readbufsize) { GFREE(readbuf); if (bufsize>0) { GMALLOC(readbuf, bufsize); } readbufsize=bufsize; } if (bufsize==0) { replace_data(0); return 0; //clear the string and free the buffer } size_t numread; size_t acc_len=0; //accumulated length int seplen=strlen(delimiter); void* p=NULL; Data *data = new_data(0); do { numread=fread(readbuf, 1, bufsize, stream); if (numread) { p=Gmemscan(readbuf, bufsize, (void*) delimiter, seplen); if (p!=NULL) {//found the delimiter //position the stream after it int l = (char*)p-(char*)readbuf; fseek(stream, l+seplen-numread, SEEK_CUR); numread=l+seplen; } else {//not found, go back if not eof if (numread==bufsize) { fseek(stream, -seplen, SEEK_CUR); //check if this works! 
numread-=seplen; } } if (data==&null_data) { data=new_data(numread); ::memcpy(data->chars, readbuf, numread); acc_len+=numread; } else { GREALLOC(data, sizeof(Data)+acc_len+numread); memcpy(&data->chars[acc_len], readbuf, numread); acc_len+=numread; data->length=acc_len; data->chars[acc_len]='\0'; } } //if something read } while (p==NULL && numread!=0); replace_data(data); return acc_len; } int GStr::asInt(int base /*=10 */) { return strtol(text(), NULL, base); } bool GStr::asInt(int& r, int base) { errno=0; char*endptr; long val=strtol(text(), &endptr, base); if (errno!=0) return false; if (endptr == text()) return false; /* If we got here, strtol() successfully parsed a number */ r=val; return true; } double GStr::asReal() { return strtod(text(), NULL); } bool GStr::asReal(double& r) { errno=0; char* endptr; double val=strtod(text(), &endptr); if (errno!=0) return false; if (endptr == text()) return false; //no digits to parse r=val; return true; } int GStr::peelInt() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=0;ichars[i])) j++; //set coord else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i-j], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } int GStr::peelIntR() const { if (is_empty()) return 0; char buf[24]; bool started=false; int j=0; int i; for (i=length()-1;i>=0;i--) { if (started) { if (isdigit(my_data->chars[i])) j++; //set length else break; //finished } else if (isdigit(my_data->chars[i])) { j++; started=true; } } if (j>0) { strncpy(buf, &my_data->chars[i+1], j); buf[j]='\0'; return strtol(buf, NULL, 10); } return 0; } GStr GStr::to(char c) { //return the first part up to first occurence of c int i=index(c); if (i>=0) return substr(0,i); else return (*this); } //or whole string if c not found GStr GStr::from(char c) { //same as to, but starting from the right side int i=rindex(c); if (i>=0) return substr(i+1); else return (*this); } int GStr::count(char c){ //return the number of occurences of char c within the string int result=0; for (int i=0;ichars[i]==c) result++; return result; } //========================================= void GStr::invalid_args_error(const char *fname) { GError("GStr:: %s - invalid arguments\n", fname); } //**************************************************************************** void GStr::invalid_index_error(const char *fname) { GError("GStr:: %s - invalid index\n", fname); } //**************************************************************************** tophat-2.0.9/src/tokenize.cpp0000644000175000017500000000230712122334361014673 0ustar toortoor#ifdef HAVE_CONFIG_H #include #endif #include #include #include using namespace std; /** * Split string s according to given delimiters. Mostly borrowed * from C++ Programming HOWTO 7.3. */ void tokenize(const string& s, const string& delims, vector& ss) { string::size_type lastPos = s.find_first_not_of(delims, 0); string::size_type pos = s.find_first_of(delims, lastPos); while (string::npos != pos || string::npos != lastPos) { ss.push_back(s.substr(lastPos, pos - lastPos)); lastPos = s.find_first_not_of(delims, pos); pos = s.find_first_of(delims, lastPos); } } /** * Split string s according to given delimiters. 
If two delimiters occur in * succession, sticks an empty string token at that position in ss */ void tokenize_strict(const string& s, const string& delims, vector& ss) { string::size_type lastPos = s.find_first_not_of(delims, 0); string::size_type pos = s.find_first_of(delims, lastPos); while (lastPos < s.length() || pos < s.length()) { ss.push_back(s.substr(lastPos, pos - lastPos)); if (pos == string::npos) break; lastPos = pos + 1; pos = s.find_first_of(delims, lastPos); } } tophat-2.0.9/src/utils.h0000644000175000017500000000147112122334357013656 0ustar toortoor#ifndef UTILS_H #define UTILS_H /* * utils.h * TopHat * * Created by Daehwan Kim on 12/28/11. * Copyright 2011 Daehwan Kim. All rights reserved. * */ #include #include using namespace std; #include "common.h" // this is for parallelization purposes in segment_juncs, long_spanning_reads, and tophat_reports. // given "index" files, it calculates "read ids" in increasing order and // their corresponding file offsets. bool calculate_offsets(const vector& fnames, vector& ids, vector >& offsets); // given "read ids" as reference read ids, // it finds the closest read ids (with file offsets) not greater than them. void calculate_offsets_from_ids(const string& fname, const vector& ids, vector& offsets); #endif tophat-2.0.9/src/reads.cpp0000644000175000017500000004321512162605263014152 0ustar toortoor/* * reads.cpp * TopHat * * Created by Cole Trapnell on 9/2/48. * Copyright 2448 Cole Trapnell. All rights reserved. * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include "reads.h" #include "bwt_map.h" #include "tokenize.h" using namespace std; char* FLineReader::nextLine() { if(!file) return NULL; if (pushed) { pushed=false; return buf; } //reads a char at a time until \n and/or \r are encountered len=0; int c=0; while ((c=getc(file))!=EOF) { if (len>=allocated-1) { allocated+=512; buf=(char*)realloc(buf,allocated); } if (c=='\n' || c=='\r') { buf[len]='\0'; if (c=='\r') { //DOS file: double-char line terminator, skip the second one if ((c=getc(file))!='\n') ungetc(c,file); } lcount++; return buf; } buf[len]=(char)c; len++; } if (c==EOF) { isEOF=true; if (len==0) return NULL; } buf[len]='\0'; lcount++; return buf; } void skip_lines(FLineReader& fr) { if (fr.fhandle() == NULL) return; char* buf = NULL; while ((buf = fr.nextLine()) != NULL) { if (buf[0] == '\0') continue; if (buf[0] == '>' || buf[0] == '@') { fr.pushBack(); break; } } } bool next_fasta_record(FLineReader& fr, string& defline, string& seq, ReadFormat reads_format) { seq.clear(); defline.clear(); char* buf=NULL; while ((buf=fr.nextLine())!=NULL) { if (buf[0]==0) continue; //skip empty lines if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record if (seq.length()>0) { //current record ending fr.pushBack(); return true; } defline=buf+1; string::size_type space_pos = defline.find_first_of(" \t"); if (space_pos != string::npos) { defline.resize(space_pos); } continue; } //defline // sequence line seq.append(buf); } //line reading loop replace(seq.begin(), seq.end(), '.', color ? 
'4' : 'N'); //shouldn't really be needed for FASTA files return !(seq.empty()); } bool next_fastq_record(FLineReader& fr, const string& seq, string& alt_name, string& qual, ReadFormat reads_format) { alt_name.clear(); qual.clear(); char* fline=fr.nextLine(); if (fline==NULL) return false; while (fline[0]==0) { //skip empty lines fline=fr.nextLine(); if (fline==NULL) return false; } //must be on '+' line here if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') || (reads_format == FASTA && quals && fline[0] != '>')) { err_exit("Error: '+' not found for fastq record %s\n",fline); return false; } alt_name=fline+1; string::size_type space_pos = alt_name.find_first_of(" \t"); if (space_pos != string::npos) alt_name.resize(space_pos); //read qv line(s) now: while ((fline=fr.nextLine())!=NULL) { if (integer_quals) { vector integer_qual_values; tokenize(string(fline), " ", integer_qual_values); string temp_qual; for (size_t i = 0; i < integer_qual_values.size(); ++i) { int qual_value = atoi(integer_qual_values[i].c_str()); if (qual_value < 0) qual_value = 0; temp_qual.push_back((char)(qual_value + 33)); } qual.append(temp_qual); } else qual.append(fline); if (qual.length()>=seq.length()-1) break; } // final check if (color) { if (seq.length()==qual.length()) { //discard first qv qual=qual.substr(1); } if (seq.length()!=qual.length()+1) { err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n", seq.length(), alt_name.c_str()); } } else { if (seq.length()!=qual.length()) { err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n", qual.length(), seq.length(), alt_name.c_str()); //return false; } } // return !(qual.empty()); } bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format, FLineReader* frq) { /* if (fr.pushed_read) { read = fr.last_read; fr.pushed_read = false; return true; } */ read.clear(); char* buf=NULL; while ((buf=fr.nextLine())!=NULL) { if (buf[0]==0) continue; //skip empty lines if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record if (read.seq.length()>0) { //current record ending fr.pushBack(); break; } read.name=buf+1; string::size_type space_pos = read.name.find_first_of(" \t"); if (space_pos != string::npos) { read.name.resize(space_pos); } continue; } //defline // sequence line read.seq.append(buf); } //line reading loop replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files if (reads_format != FASTQ && frq==NULL) return (!read.seq.empty()); if (frq==NULL) frq=&fr; //FASTQ //FASTQ or quals in a separate file -- now read quality values buf=frq->nextLine(); if (buf==NULL) return false; while (buf[0]==0) { //skip empty lines buf=frq->nextLine(); if (buf==NULL) return false; } //must be on '+' line here if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') || (reads_format == FASTA && buf[0] != '>')) { err_exit("Error: beginning of quality values record not found! 
(%s)\n",buf); return false; } read.alt_name=buf+1; string::size_type space_pos = read.alt_name.find_first_of(" \t"); if (space_pos != string::npos) read.alt_name.resize(space_pos); //read qv line(s) now: while ((buf=frq->nextLine())!=NULL) { if (integer_quals) { vector integer_qual_values; tokenize(string(buf), " ", integer_qual_values); string temp_qual; for (size_t i = 0; i < integer_qual_values.size(); ++i) { int qual_value = atoi(integer_qual_values[i].c_str()); if (qual_value < 0) qual_value = 0; temp_qual.push_back((char)(qual_value + 33)); } read.qual.append(temp_qual); } else { read.qual.append(buf); } if (read.qual.length()>=read.seq.length()-1) break; } //while qv lines // final check if (color) { if (read.seq.length()==read.qual.length()) { //discard first qv read.qual=read.qual.substr(1); } if (read.seq.length()!=read.qual.length()+1) { err_exit("Error: length of quality string does not match sequence length (%d) for color read %s!\n", read.seq.length(), read.alt_name.c_str()); } } else { if (read.seq.length()!=read.qual.length()) { err_exit("Error: qual length (%d) differs from seq length (%d) for fastq record %s!\n", read.qual.length(), read.seq.length(), read.alt_name.c_str()); return false; } } //fr.last_read = read; return !(read.seq.empty()); } // This could be faster. void reverse_complement(string& seq) { //fprintf(stderr,"fwd: %s\n", seq.c_str()); for (string::size_type i = 0; i < seq.length(); ++i) { switch(seq[i]) { case 'A' : seq[i] = 'T'; break; case 'T' : seq[i] = 'A'; break; case 'C' : seq[i] = 'G'; break; case 'G' : seq[i] = 'C'; break; default: seq[i] = 'N'; break; } } reverse(seq.begin(), seq.end()); //fprintf(stderr, "rev: %s\n", seq.c_str()); } string str_convert_color_to_bp(const string& color) { if (color.length() <= 0) return ""; char base = color[0]; string bp; for (string::size_type i = 1; i < color.length(); ++i) { char next = color[i]; switch(base) { // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', case 'A': { switch(next) { case '0': next = 'A'; break; case '1': next = 'C'; break; case '2': next = 'G'; break; case '3': next = 'T'; break; default: next = 'N'; break; } } break; case 'C': { // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', switch(next) { case '0': next = 'C'; break; case '1': next = 'A'; break; case '2': next = 'T'; break; case '3': next = 'G'; break; default: next = 'N'; break; } } break; case 'G': { // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', switch(next) { case '0': next = 'G'; break; case '1': next = 'T'; break; case '2': next = 'A'; break; case '3': next = 'C'; break; default: next = 'N'; break; } } break; case 'T': { // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', switch(next) { case '0': next = 'T'; break; case '1': next = 'G'; break; case '2': next = 'C'; break; case '3': next = 'A'; break; default: next = 'N'; break; } } break; default: next = 'N'; break; } bp.push_back(next); base = next; } return bp; } // daehwan - reduce code redundancy! 
seqan::String convert_color_to_bp(char base, const seqan::String& color) { if (seqan::length(color) <= 0) return ""; string bp; for (string::size_type i = 0; i < seqan::length(color); ++i) { char next = color[i]; switch(base) { // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', case 'A': { switch(next) { case '0': next = 'A'; break; case '1': next = 'C'; break; case '2': next = 'G'; break; case '3': next = 'T'; break; default: next = 'N'; break; } } break; case 'C': { // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', switch(next) { case '0': next = 'C'; break; case '1': next = 'A'; break; case '2': next = 'T'; break; case '3': next = 'G'; break; default: next = 'N'; break; } } break; case 'G': { // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', switch(next) { case '0': next = 'G'; break; case '1': next = 'T'; break; case '2': next = 'A'; break; case '3': next = 'C'; break; default: next = 'N'; break; } } break; case 'T': { // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', switch(next) { case '0': next = 'T'; break; case '1': next = 'G'; break; case '2': next = 'C'; break; case '3': next = 'A'; break; default: next = 'N'; break; } } break; default: next = 'N'; break; } bp.push_back(next); base = next; } return bp; } #define check_color(b1, b2, c1, c2) ((b1 == c1 && b2 == c2) || (b1 == c2 && b2 == c1)) #define two_bps_to_color(b1, b2, c) \ if (((b1) == 'A' || (b1) == 'G' || (b1) == 'C' || (b1) == 'T') && (b1) == (b2)) \ c = '0'; \ else if (check_color((b1), (b2), 'A', 'C') || check_color((b1), (b2), 'G', 'T')) \ c = '1'; \ else if (check_color((b1), (b2), 'A', 'G') || check_color((b1), (b2), 'C', 'T')) \ c = '2'; \ else if (check_color((b1), (b2), 'A', 'T') || check_color((b1), (b2), 'C', 'G')) \ c = '3'; \ else \ c = '4'; string convert_bp_to_color(const string& bp, bool remove_primer) { if (bp.length() <= 1) return ""; char base = toupper(bp[0]); string color; if (!remove_primer) color.push_back(base); for (string::size_type i = 1; i < bp.length(); ++i) { char next = toupper(bp[i]); char c = '0'; two_bps_to_color(base, next, c); color.push_back(c); base = next; } return color; } // daehwan - check this - seqan::String convert_bp_to_color(const seqan::String& bp, bool remove_primer) { if (seqan::length(bp) <= 1) return ""; char base = toupper(bp[0]); string color; if (!remove_primer) color.push_back(base); for (string::size_type i = 1; i < seqan::length(bp); ++i) { char next = toupper(bp[i]); char c = '0'; two_bps_to_color(base, next, c); color.push_back(c); base = next; } return color; } /* */ void BWA_decode(const string& color, const string& qual, const string& ref, string& decode) { assert(color.length() == ref.length() - 1); static const size_t max_length = MAX_READ_LEN; const unsigned int max_value = max_length * 0xff; size_t length = color.length(); if (length < 1 || length + 1 > max_length) { return; } unsigned int f[max_length * 4]; char ptr[max_length * 4]; unsigned int q_prev = 0; for (unsigned int i = 0; i < length + 1; ++i) { unsigned int q = (unsigned int) (qual.length() <= i ? 'I' : qual[i]) - 33; for (unsigned int j = 0; j < 4; ++j) { size_t i_j = i * 4 + j; if (i == 0) { f[i_j] = "ACGT"[j] == ref[i] ? 
0 : q; ptr[i_j] = 4; continue; } f[i_j] = max_value; char base = "ACGT"[j]; for (unsigned int k = 0; k < 4; ++k) { char base_prev = "ACGT"[k]; char ref_color; two_bps_to_color(base_prev, base, ref_color); char base_prev_prev = "ACGTN"[(int)ptr[(i-1)*4 + k]]; char ref_color_prev; two_bps_to_color(base_prev_prev, base_prev, ref_color_prev); char color_curr = color[i-1]; char color_prev = i >= 2 ? color[i-2] : '4'; int q_hat = 0; if (color_prev == ref_color_prev && color_prev != '4') { if (color_curr == ref_color) q_hat = q + q_prev; else q_hat = q_prev - q; } else if (color_curr == ref_color) { q_hat = q - q_prev; } unsigned int f_k = f[(i-1) * 4 + k] + (base == ref[i] ? 0 : q_hat) + (color_curr == ref_color ? 0 : q); if (f_k < f[i_j]) { f[i_j] = f_k; ptr[i_j] = k; } } } q_prev = q; } unsigned int min_index = 0; unsigned int min_f = f[length * 4]; for (unsigned int i = 1; i < 4; ++i) { unsigned int temp_f = f[length * 4 + i]; if (temp_f < min_f) { min_f = temp_f; min_index = i; } } decode.resize(length + 1); decode[length] = "ACGT"[min_index]; for (unsigned int i = length; i > 0; --i) { min_index = ptr[i * 4 + min_index]; decode[i-1] = "ACGT"[min_index]; } } void bam2Read(bam1_t *b, Read& rd, bool alt_name=false) { GBamRecord bamrec(b); rd.clear(); rd.seq=bamrec.seqData(&rd.qual); rd.name=bam1_qname(b); if (alt_name) rd.alt_name=bamrec.tag_str("ZN"); } bool ReadStream::next_read(QReadData& rdata, ReadFormat read_format) { while (read_pq.size()core.flag & BAM_FQCFAIL)==0) got_read=true; } } bam2Read(b, r, bam_alt_name); return true; } if (!next_fastx_read(*flseqs, r, read_format, flquals)) { r_eof=true; return false; } return true; } // reads must ALWAYS be requested in increasing order of their ID bool ReadStream::getRead(uint64_t r_id, Read& read, ReadFormat read_format, bool strip_slash, uint64_t begin_id, uint64_t end_id, /* GBamWriter* um_out, //unmapped reads output char um_code, //if non-zero, write the found read to um_out with this code int64_t* unmapped_counter, //update this counter for unmapped/skipped reads *only* int64_t* multimapped_counter //update this counter for too multi-mapped reads */ GetReadProc* rProc, bool is_unmapped ) { if (!fstream.file) err_die("Error: calling ReadStream::getRead() with no file handle!"); if (r_id= end_id) return false; if (rdata.id < begin_id) continue; //silently skip until begin_id found //does not trigger rProc->process() until begin_id if (rdata.id == r_id) { read=rdata.read; //it will be returned found=true; } else if (rdata.id > r_id) { //can't find it, went too far //only happens when reads [mates] were removed for some reason //read_pq.push(make_pair(id, read)); read_pq.push(rdata); break; } if (rProc) { //skipped read processing (unmapped reads) if (!rProc->process(rdata, found, is_unmapped)) // rProc->process() should normally return TRUE return false; //abort search for r_id, return "not found" } } //while target read id not found return found; } tophat-2.0.9/src/gff.h0000644000175000017500000010547312157116165013272 0ustar toortoor#ifndef GFF_H #define GFF_H #include "GBase.h" #include "gdna.h" #include "codons.h" #include "GFaSeqGet.h" #include "GList.hh" #include "GHash.hh" /* const byte exMskMajSpliceL = 0x01; const byte exMskMajSpliceR = 0x02; const byte exMskMinSpliceL = 0x04; const byte exMskMinSpliceR = 0x08; const byte exMskTag = 0x80; */ //reserved Gffnames::feats entries -- basic feature types extern const int gff_fid_mRNA; // "mRNA" feature name extern const int gff_fid_transcript; // *RNA, *transcript feature name extern 
const int gff_fid_exon; extern const uint GFF_MAX_LOCUS; extern const uint GFF_MAX_EXON; extern const uint GFF_MAX_INTRON; extern const uint gfo_flag_CHILDREN_PROMOTED; extern const uint gfo_flag_HAS_ERRORS; extern const uint gfo_flag_IS_GENE; extern const uint gfo_flag_HAS_GFF_ID; //found a GFF3 formatted main feature with its own ID extern const uint gfo_flag_BY_EXON; //created by subfeature (exon) directly //(GTF2 and some chado gff3 dumps with exons given before their mRNA) extern const uint gfo_flag_IS_TRANSCRIPT; //recognized as '*RNA' or '*transcript' extern const uint gfo_flag_DISCARDED; //should not be printed under the "transcriptsOnly" directive extern const uint gfo_flag_LST_KEEP; //GffObj from GffReader::gflst is to be kept (not deallocated) //when GffReader is destroyed extern const uint gfo_flag_LEVEL_MSK; //hierarchical level: 0 = no parent extern const byte gfo_flagShift_LEVEL; extern bool gff_show_warnings; #define GFF_LINELEN 2048 #define ERR_NULL_GFNAMES "Error: GffObj::%s requires a non-null GffNames* names!\n" enum GffExonType { exgffIntron=-1, // useless "intron" feature exgffNone=0, //not a recognizable exon or CDS segment exgffStart, //from "start_codon" feature (within CDS) exgffStop, //from "stop_codon" feature (may be outside CDS) exgffCDS, //from "CDS" feature exgffUTR, //from "UTR" feature exgffCDSUTR, //from a merge of UTR and CDS feature exgffExon, //from "exon" feature }; const char* strExonType(char xtype); class GffReader; class GffLine { char* _parents; //stores a copy of the Parent attribute value, //with commas replaced by \0 int _parents_len; public: char* dupline; //duplicate of original line char* line; //this will have tabs replaced by \0 int llen; char* gseqname; char* track; char* ftype; //feature name: mRNA/gene/exon/CDS char* info; //the last, attributes' field, unparsed uint fstart; uint fend; uint qstart; //overlap coords on query, if available uint qend; uint qlen; //query len, if given double score; char strand; bool skip; bool is_gff3; //if the line appears to be in GFF3 format bool is_cds; //"cds" and "stop_codon" features bool is_exon; //"exon" and "utr" features char exontype; // gffExonType bool is_transcript; //if current feature is *RNA or *transcript bool is_gene; //if current feature is *gene char phase; // '.' , '0', '1' or '2' // -- allocated strings: char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of a gene feature (GFF3) char* gene_id; //value of gene_id attribute (GTF) if present or ID attribute of a gene feature (GFF3) // char** parents; //for GTF only parents[0] is used int num_parents; char* ID; // if a ID=.. 
attribute was parsed, or a GTF with 'transcript' line (transcript_id) GffLine(GffReader* reader, const char* l); //parse the line accordingly void discardParent() { GFREE(_parents); _parents_len=0; num_parents=0; parents=NULL; } char* extractAttr(const char* pre, bool caseStrict=false, bool enforce_GTF2=false); GffLine(GffLine* l):_parents(NULL), _parents_len(0), dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL), ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0), score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false), exontype(0), is_transcript(false), is_gene(false), phase(0), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) { //a copy constructor if (l==NULL || l->line==NULL) GError("Error: invalid GffLine(l)\n"); memcpy((void*)this, (void*)l, sizeof(GffLine)); GMALLOC(line, llen+1); memcpy(line, l->line, llen+1); GMALLOC(dupline, llen+1); memcpy(dupline, l->dupline, llen+1); //--offsets within line[] gseqname=line+(l->gseqname-l->line); track=line+(l->track-l->line); ftype=line+(l->ftype-l->line); info=line+(l->info-l->line); if (num_parents>0 && parents) { parents=NULL; //re-init, just copied earlier GMALLOC(parents, num_parents*sizeof(char*)); //_parents_len=l->_parents_len; copied above _parents=NULL; //re-init, forget pointer copy GMALLOC(_parents, _parents_len); memcpy(_parents, l->_parents, _parents_len); for (int i=0;iparents[i] - l->_parents); } } //-- allocated string copies: ID=Gstrdup(l->ID); if (l->gene_name!=NULL) gene_name=Gstrdup(l->gene_name); if (l->gene_id!=NULL) gene_id=Gstrdup(l->gene_id); } GffLine():_parents(NULL), _parents_len(0), dupline(NULL), line(NULL), llen(0), gseqname(NULL), track(NULL), ftype(NULL), info(NULL), fstart(0), fend(0), qstart(0), qend(0), qlen(0), score(0), strand(0), skip(true), is_gff3(false), is_cds(false), is_exon(false), exontype(0), is_transcript(false), is_gene(false), phase(0), gene_name(NULL), gene_id(NULL), parents(NULL), num_parents(0), ID(NULL) { } ~GffLine() { GFREE(dupline); GFREE(line); GFREE(_parents); GFREE(parents); GFREE(ID); GFREE(gene_name); GFREE(gene_id); } }; class GffAttr { public: int attr_id; char* attr_val; GffAttr(int an_id, const char* av=NULL) { attr_id=an_id; attr_val=NULL; setValue(av); } ~GffAttr() { GFREE(attr_val); } void setValue(const char* av) { if (attr_val!=NULL) { GFREE(attr_val); } if (av==NULL || av[0]==0) return; //trim spaces const char* vstart=av; while (*vstart==' ') av++; const char* vend=vstart; bool keep_dq=false; while (vend[1]!=0) { if (*vend==' ' && vend[1]!=' ') keep_dq=true; else if (*vend==';') keep_dq=true; vend++; } //remove spaces at the end: while (*vend==' ' && vend!=vstart) vend--; //practical clean-up: if it doesn't have any internal spaces just strip those useless double quotes if (!keep_dq && *vstart=='"' && *vend=='"') { vend--; vstart++; } attr_val=Gstrdup(vstart, vend); } bool operator==(GffAttr& d){ return (this==&d); } bool operator>(GffAttr& d){ return (this>&d); } bool operator<(GffAttr& d){ return (this<&d); } }; class GffNameList; class GffNames; class GffNameInfo { friend class GffNameList; public: int idx; char* name; GffNameInfo(const char* n=NULL):idx(-1),name(NULL) { if (n) name=Gstrdup(n); } ~GffNameInfo() { GFREE(name); } bool operator==(GffNameInfo& d){ return (strcmp(this->name, d.name)==0); } bool operator<(GffNameInfo& d){ return (strcmp(this->name, d.name)<0); } }; class GffNameList:public GList { friend class GffNameInfo; friend class GffNames; protected: GHash byName;//hash 
with shared keys int idlast; //fList index of last added/reused name void addStatic(const char* tname) {// fast add GffNameInfo* f=new GffNameInfo(tname); idlast=this->Add(f); f->idx=idlast; byName.shkAdd(f->name,f); } public: GffNameList(int init_capacity=6):GList(init_capacity, false,true,true), byName(false) { idlast=-1; setCapacity(init_capacity); } char* lastNameUsed() { return idlast<0 ? NULL : Get(idlast)->name; } int lastNameId() { return idlast; } char* getName(int nid) { //retrieve name by its ID if (nid<0 || nid>=fCount) GError("GffNameList Error: invalid index (%d)\n",nid); return fList[nid]->name; } int addName(const char* tname) {//returns or create an id for the given name //check idlast first, chances are it's the same feature name checked /*if (idlast>=0 && strcmp(fList[idlast]->name,tname)==0) return idlast;*/ GffNameInfo* f=byName.Find(tname); int fidx=-1; if (f!=NULL) fidx=f->idx; else {//add new entry f=new GffNameInfo(tname); fidx=this->Add(f); f->idx=fidx; byName.shkAdd(f->name,f); } idlast=fidx; return fidx; } int addNewName(const char* tname) { GffNameInfo* f=new GffNameInfo(tname); int fidx=this->Add(f); f->idx=fidx; byName.shkAdd(f->name,f); return fidx; } int getId(const char* tname) { //only returns a name id# if found GffNameInfo* f=byName.Find(tname); if (f==NULL) return -1; return f->idx; } int removeName() { GError("Error: removing names from GffNameList not allowed!\n"); return -1; } }; class GffNames { public: int numrefs; GffNameList tracks; GffNameList gseqs; GffNameList attrs; GffNameList feats; //feature names: 'mRNA', 'exon', 'CDS' etc. GffNames():tracks(),gseqs(),attrs(), feats() { numrefs=0; //the order below is critical! //has to match: gff_fid_mRNA, gff_fid_exon feats.addStatic("mRNA");//index 0=gff_fid_mRNA feats.addStatic("transcript");//index 1=gff_fid_transcript feats.addStatic("exon");//index 1=gff_fid_exon //feats.addStatic("CDS"); //index 2=gff_fid_CDS } }; void gffnames_ref(GffNames* &n); void gffnames_unref(GffNames* &n); enum GffPrintMode { pgtfAny, //print record as read pgtfExon, pgtfCDS, pgffAny, //print record as read pgffExon, pgffCDS, pgffBoth, }; class GffAttrs:public GList { public: GffAttrs():GList(false,true,false) { } void add_or_update(GffNames* names, const char* attrname, const char* val) { int aid=names->attrs.getId(attrname); if (aid>=0) { //attribute found in the dictionary for (int i=0;iattr_id) { //update the value Get(i)->setValue(val); return; } } } else { aid=names->attrs.addNewName(attrname); } this->Add(new GffAttr(aid, val)); } char* getAttr(GffNames* names, const char* attrname) { int aid=names->attrs.getId(attrname); if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } char* getAttr(int aid) { if (aid>=0) for (int i=0;iattr_id) return Get(i)->attr_val; return NULL; } }; class GffExon : public GSeg { public: void* uptr; //for later extensions GffAttrs* attrs; //other attributes kept for this exon double score; // gff score column char phase; //GFF phase column - for CDS segments only // '.' 
= undefined (UTR), '0','1','2' for CDS exons char exontype; // 1="exon" 2="cds" 3="utr" 4="stop_codon" int qstart; // for mRNA/protein exon mappings: coordinates on query int qend; GffExon(int s=0, int e=0, double sc=0, char fr=0, int qs=0, int qe=0, char et=0) { uptr=NULL; attrs=NULL; if (sgetAttr(names, atrname); } char* getAttr(int aid) { if (attrs==NULL) return NULL; return attrs->getAttr(aid); } ~GffExon() { //destructor if (attrs!=NULL) delete attrs; } }; class GffCDSeg:public GSeg { public: char phase; int exonidx; }; //one GFF mRNA object -- e.g. a mRNA with its exons and/or CDS segments class GffObj:public GSeg { //utility segment-merging function for addExon() void expandExon(int xovl, uint segstart, uint segend, char exontype, double sc, char fr, int qs, int qe); protected: //coordinate transformation data: uint xstart; //absolute genomic coordinates of reference region uint xend; char xstatus; //coordinate transform status: //0 : (start,end) coordinates are absolute //'+' : (start,end) coords are relative to xstart..xend region //'-' : (start,end) are relative to the reverse complement of xstart..xend region //-- char* gffID; // ID name for mRNA (parent) feature char* gene_name; //value of gene_name attribute (GTF) if present or Name attribute of the parent gene feature (GFF3) char* geneID; //value of gene_id attribute (GTF) if present or ID attribute of a parent gene feature (GFF3) unsigned int flags; //-- friends: friend class GffReader; friend class GffExon; public: static GffNames* names; // dictionary storage that holds the various attribute names etc. int track_id; // index of track name in names->tracks int gseq_id; // index of genomic sequence name in names->gseqs int ftype_id; // index of this record's feature name in names->feats, or the special gff_fid_mRNA value int exon_ftype_id; //index of child subfeature name in names->feats (that subfeature stored in "exons") //if ftype_id==gff_fid_mRNA then this value is ignored GList exons; //for non-mRNA entries, these can be any subfeature of type subftype_id GPVec children; GffObj* parent; int udata; //user data, flags etc. void* uptr; //user pointer (to a parent object, cluster, locus etc.) 
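/* Usage sketch (illustrative only): after GffReader has finalized a record,
   the exons list above holds the sorted GffExon segments of the transcript,
   so a typical consumer simply walks it, e.g. for a GffObj& t taken from
   GffReader::gflst:
     for (int i = 0; i < t.exons.Count(); ++i)
       printf("%s\t%u\t%u\n", t.getGSeqName(), t.exons[i]->start, t.exons[i]->end);
   start/end here are the 1-based genomic coordinates as parsed from the
   GFF/GTF input. */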
GffObj* ulink; //link to another GffObj (user controlled field) // mRNA specific fields: bool isCDS; //just a CDS, no UTRs bool partial; //partial CDS uint CDstart; //CDS start coord uint CDend; //CDS end coord char CDphase; //initial phase for CDS start bool hasErrors() { return ((flags & gfo_flag_HAS_ERRORS)!=0); } void hasErrors(bool v) { if (v) flags |= gfo_flag_HAS_ERRORS; else flags &= ~gfo_flag_HAS_ERRORS; } bool hasGffID() { return ((flags & gfo_flag_HAS_GFF_ID)!=0); } void hasGffID(bool v) { if (v) flags |= gfo_flag_HAS_GFF_ID; else flags &= ~gfo_flag_HAS_GFF_ID; } bool createdByExon() { return ((flags & gfo_flag_BY_EXON)!=0); } void createdByExon(bool v) { if (v) flags |= gfo_flag_BY_EXON; else flags &= ~gfo_flag_BY_EXON; } bool isGene() { return ((flags & gfo_flag_IS_GENE)!=0); } void isGene(bool v) { if (v) flags |= gfo_flag_IS_GENE; else flags &= ~gfo_flag_IS_GENE; } bool isDiscarded() { return ((flags & gfo_flag_DISCARDED)!=0); } void isDiscarded(bool v) { if (v) flags |= gfo_flag_DISCARDED; else flags &= ~gfo_flag_DISCARDED; } bool isUsed() { return ((flags & gfo_flag_LST_KEEP)!=0); } void isUsed(bool v) { if (v) flags |= gfo_flag_LST_KEEP; else flags &= ~gfo_flag_LST_KEEP; } bool isTranscript() { return ((flags & gfo_flag_IS_TRANSCRIPT)!=0); } void isTranscript(bool v) { if (v) flags |= gfo_flag_IS_TRANSCRIPT; else flags &= ~gfo_flag_IS_TRANSCRIPT; } bool promotedChildren() { return ((flags & gfo_flag_CHILDREN_PROMOTED)!=0); } void promotedChildren(bool v) { if (v) flags |= gfo_flag_CHILDREN_PROMOTED; else flags &= ~gfo_flag_CHILDREN_PROMOTED; } void setLevel(byte v) { if (v==0) flags &= ~gfo_flag_LEVEL_MSK; else flags &= ~(((uint)v) << gfo_flagShift_LEVEL); } byte incLevel() { uint v=((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL); v++; flags &= ~(v << gfo_flagShift_LEVEL); return v; } byte getLevel() { return ((byte)((flags & gfo_flag_LEVEL_MSK) >> gfo_flagShift_LEVEL)); } bool isValidTranscript() { //return (ftype_id==gff_fid_mRNA && exons.Count()>0); return (isTranscript() && exons.Count()>0); } int addExon(uint segstart, uint segend, double sc=0, char fr='.', int qs=0, int qe=0, bool iscds=false, char exontype=0); int addExon(GffReader* reader, GffLine* gl, bool keepAttr=false, bool noExonAttr=true); void removeExon(int idx); void removeExon(GffExon* p); char strand; //true if features are on the reverse complement strand double gscore; double uscore; //custom, user-computed score, if needed int covlen; //total coverage of reference genomic sequence (sum of maxcf segment lengths) //--------- optional data: int qlen; //query length, start, end - if available int qstart; int qend; int qcov; //query coverage - percent GffAttrs* attrs; //other gff3 attributes found for the main mRNA feature //constructor by gff line parsing: GffObj(GffReader* gfrd, GffLine* gffline, bool keepAttrs=false, bool noExonAttr=true); //if gfline->Parent!=NULL then this will also add the first sub-feature // otherwise, only the main feature is created void copyAttrs(GffObj* from); void clearAttrs() { if (attrs!=NULL) { bool sharedattrs=(exons.Count()>0 && exons[0]->attrs==attrs); delete attrs; attrs=NULL; if (sharedattrs) exons[0]->attrs=NULL; } } GffObj(char* anid=NULL):GSeg(0,0), exons(true,true,false), children(1,false) { //exons: sorted, free, non-unique gffID=NULL; uptr=NULL; ulink=NULL; flags=0; udata=0; parent=NULL; ftype_id=-1; exon_ftype_id=-1; if (anid!=NULL) gffID=Gstrdup(anid); gffnames_ref(names); qlen=0; qstart=0; qend=0; qcov=0; partial=true; isCDS=false; CDstart=0; // hasCDS 
<=> CDstart>0 CDend=0; CDphase=0; gseq_id=-1; track_id=-1; xstart=0; xend=0; xstatus=0; strand='.'; gscore=0; uscore=0; attrs=NULL; covlen=0; gene_name=NULL; geneID=NULL; } ~GffObj() { GFREE(gffID); GFREE(gene_name); GFREE(geneID); clearAttrs(); gffnames_unref(names); } //-------------- GffObj* finalize(GffReader* gfr, bool mergeCloseExons=false, bool keepAttrs=false, bool noExonAttr=true); //complete parsing: must be called in order to merge adjacent/close proximity subfeatures void parseAttrs(GffAttrs*& atrlist, char* info, bool isExon=false); const char* getSubfName() { //returns the generic feature type of the entries in exons array //int sid=exon_ftype_id; //if (sid==gff_fid_exon && isCDS) sid=gff_fid_CDS; return names->feats.getName(exon_ftype_id); } void addCDS(uint cd_start, uint cd_end, char phase=0); bool monoFeature() { return (exons.Count()==0 || (exons.Count()==1 && //exon_ftype_id==ftype_id && exons[0]->end==this->end && exons[0]->start==this->start)); } bool hasCDS() { return (CDstart>0); } const char* getFeatureName() { return names->feats.getName(ftype_id); } void setFeatureName(const char* feature); void addAttr(const char* attrname, const char* attrvalue); int removeAttr(const char* attrname, const char* attrval=NULL); int removeAttr(int aid, const char* attrval=NULL); int removeExonAttr(GffExon& exon, const char* attrname, const char* attrval=NULL); int removeExonAttr(GffExon& exon, int aid, const char* attrval=NULL); const char* getAttrName(int i) { if (attrs==NULL) return NULL; return names->attrs.getName(attrs->Get(i)->attr_id); } char* getAttr(const char* attrname, bool checkFirstExon=false) { if (names==NULL || attrname==NULL) return NULL; char* r=NULL; if (attrs==NULL) { if (!checkFirstExon) return NULL; } else r=attrs->getAttr(names, attrname); if (r!=NULL) return r; if (checkFirstExon && exons.Count()>0) { r=exons[0]->getAttr(names, attrname); } return r; } char* getExonAttr(GffExon* exon, const char* attrname) { if (exon==NULL || attrname==NULL) return NULL; return exon->getAttr(names, attrname); } char* getExonAttr(int exonidx, const char* attrname) { if (exonidx<0 || exonidx>=exons.Count() || attrname==NULL) return NULL; return exons[exonidx]->getAttr(names, attrname); } char* getAttrValue(int i) { if (attrs==NULL) return NULL; return attrs->Get(i)->attr_val; } const char* getGSeqName() { return names->gseqs.getName(gseq_id); } const char* getRefName() { return names->gseqs.getName(gseq_id); } void setRefName(const char* newname); const char* getTrackName() { return names->tracks.getName(track_id); } bool exonOverlap(uint s, uint e) {//check if ANY exon overlaps given segment //ignores strand! if (s>e) Gswap(s,e); for (int i=0;ioverlap(s,e)) return true; } return false; } bool exonOverlap(GffObj& m) {//check if ANY exon overlaps given segment //if (gseq_id!=m.gseq_id) return false; // ignores strand and gseq_id, must check in advance for (int i=0;istart>m.exons[j]->end) continue; if (m.exons[j]->start>exons[i]->end) break; //-- overlap if we are here: return true; } } return false; } int exonOverlapIdx(uint s, uint e, int* ovlen=NULL) { //return the exons' index for the overlapping OR ADJACENT exon //ovlen, if given, will return the overlap length if (s>e) Gswap(s,e); s--;e++; //to also catch adjacent exons for (int i=0;istart>e) break; if (s>exons[i]->end) continue; //-- overlap if we are here: if (ovlen!=NULL) { s++;e--; int ovlend= (exons[i]->end>e) ? e : exons[i]->end; *ovlen= ovlend - ((s>exons[i]->start)? 
s : exons[i]->start)+1; } return i; } //for each exon *ovlen=0; return -1; } int exonOverlapLen(GffObj& m) { if (start>m.end || m.start>end) return 0; int i=0; int j=0; int ovlen=0; while (istart; uint iend=exons[i]->end; uint jstart=m.exons[j]->start; uint jend=m.exons[j]->end; if (istart>jend) { j++; continue; } if (jstart>iend) { i++; continue; } //exon overlap uint ovstart=GMAX(istart,jstart); if (iend0) xcoordseg(CDstart, CDend); for (int i=0;istart, exons[i]->end); } if (xstatus=='-') { exons.Reverse(); int flen=end-start; start=xend-end+1; end=start+flen; } else { start=start-xstart+1; end=end-xstart+1; } } //transform an arbitrary segment based on current xstatus/xstart-xend void xcoordseg(uint& segstart, uint &segend) { if (xstatus==0) return; if (xstatus=='-') { int flen=segend-segstart; segstart=xend-segend+1; segend=segstart+flen; return; } else { segstart=segstart-xstart+1; segend=segend-xstart+1; } } void unxcoord() { //revert back to absolute genomic/gff coordinates if xstatus==true if (xstatus==0) return; //nothing to do, no transformation appplied if (CDstart>0) unxcoordseg(CDstart, CDend); //restore all GffExon intervals too for (int i=0;istart, exons[i]->end); } if (xstatus=='-') { exons.Reverse(); int flen=end-start; start=xend-end+1; end=start+flen; } else { start=start+xstart-1; end=end+xstart-1; } xstatus=0; } void unxcoordseg(uint& astart, uint &aend) { //restore an arbitrary interval -- does NOT change the transform state! if (xstatus==0) return; if (xstatus=='-') { int flen=aend-astart; astart=xend-aend+1; aend=astart+flen; } else { astart=astart+xstart-1; aend=aend+xstart-1; } } //--------------------- bool operator==(GffObj& d){ return (gseq_id==d.gseq_id && start==d.start && end==d.end && strcmp(gffID, d.gffID)==0); } bool operator>(GffObj& d){ if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id); if (start==d.start) { if (getLevel()==d.getLevel()) { if (end==d.end) return (strcmp(gffID, d.gffID)>0); else return (end>d.end); } else return (getLevel()>d.getLevel()); } else return (start>d.start); } bool operator<(GffObj& d){ if (gseq_id!=d.gseq_id) return (gseq_id& cds); void updateExonPhase(); //for CDS-only features, updates GExon::phase void printGxfLine(FILE* fout, const char* tlabel, const char* gseqname, bool iscds, uint segstart, uint segend, int exidx, char phase, bool gff3, bool cvtChars=false); void printGxf(FILE* fout, GffPrintMode gffp=pgffExon, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false); void printGtf(FILE* fout, const char* tlabel=NULL, bool cvtChars=false) { printGxf(fout, pgtfAny, tlabel, NULL, cvtChars); } void printGff(FILE* fout, const char* tlabel=NULL, const char* gfparent=NULL, bool cvtChars=false) { printGxf(fout, pgffAny, tlabel, gfparent, cvtChars); } void printTranscriptGff(FILE* fout, char* tlabel=NULL, bool showCDS=false, const char* gfparent=NULL, bool cvtChars=false) { if (isValidTranscript()) printGxf(fout, showCDS ? pgffBoth : pgffExon, tlabel, gfparent, cvtChars); } void printSummary(FILE* fout=NULL); void getCDS_ends(uint& cds_start, uint& cds_end); void mRNA_CDS_coords(uint& cds_start, uint& cds_end); char* getSpliced(GFaSeqGet* faseq, bool CDSonly=false, int* rlen=NULL, uint* cds_start=NULL, uint* cds_end=NULL, GList* seglst=NULL); char* getUnspliced(GFaSeqGet* faseq, int* rlen, GList* seglst); char* getSplicedTr(GFaSeqGet* faseq, bool CDSonly=true, int* rlen=NULL); //bool validCDS(GFaSeqGet* faseq); //has In-Frame Stop Codon ? 
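/* Worked numbers for the coordinate-transform helpers above (xcoordseg /
   unxcoordseg), assuming xstart=1000 and xend=2000: with xstatus=='+' an
   absolute segment [1100,1150] becomes the region-relative [101,151]
   (segstart-xstart+1, segend-xstart+1); with xstatus=='-' the same segment
   becomes [851,901] (segstart=xend-segend+1=851, segend=segstart+flen=901),
   i.e. coordinates on the reverse complement of the xstart..xend window.
   unxcoordseg() applies the inverse arithmetic and restores [1100,1150]. */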
bool empty() { return (start==0); } }; typedef bool GffRecFunc(GffObj* gobj, void* usrptr1, void* usrptr2); //user callback after parsing a mapping object: // Returns: "done with it" status: // TRUE if gobj is no longer needed so it's FREEd upon return // FALSE if the user needs the gobj pointer and is responsible for // collecting and freeing all GffObj objects //GSeqStat: collect basic stats about a common underlying genomic sequence // for multiple GffObj class GSeqStat { public: int gseqid; //gseq id in the global static pool of gseqs char* gseqname; //just a pointer to the name of gseq int fcount;//number of features on this gseq uint mincoord; uint maxcoord; uint maxfeat_len; //maximum feature length on this genomic sequence GffObj* maxfeat; GSeqStat(int id=-1, char* name=NULL) { gseqid=id; gseqname=name; fcount=0; mincoord=MAXUINT; maxcoord=0; maxfeat_len=0; maxfeat=NULL; } bool operator>(GSeqStat& g) { return (gseqid>g.gseqid); } bool operator<(GSeqStat& g) { return (gseqid { //just adding the option to sort by genomic sequence and coordinate bool mustSort; public: GfList(bool sortbyloc=false):GList(false,false,false) { //GffObjs in this list are NOT deleted when the list is cleared //-- for deallocation of these objects, call freeAll() or freeUnused() as needed mustSort=sortbyloc; } void sortedByLoc(bool v=true) { bool prev=mustSort; mustSort=v; if (fCount>0 && mustSort && !prev) { this->setSorted((GCompareProc*)gfo_cmpByLoc); } } void finalize(GffReader* gfr, bool mergeCloseExons, bool keepAttrs=false, bool noExonAttr=true); void freeAll() { for (int i=0;iisUsed()) continue; //inform the children for (int c=0;cchildren.Count();c++) { fList[i]->children[c]->parent=NULL; } delete fList[i]; fList[i]=NULL; } Clear(); } }; /* struct GfoHolder { //int idx; //position in GffReader::gflst array GffObj* gffobj; GfoHolder(GffObj* gfo=NULL) { //, int i=0) { //idx=i; gffobj=gfo; } }; */ class CNonExon { //utility class used in subfeature promotion public: //int idx; GffObj* parent; GffExon* exon; GffLine* gffline; //CNonExon(int i, GffObj* p, GffExon* e, GffLine* gl) { CNonExon(GffObj* p, GffExon* e, GffLine* gl) { parent=p; exon=e; //idx=i; gffline=new GffLine(gl); } ~CNonExon() { delete gffline; } }; class GffReader { friend class GffObj; friend class GffLine; char* linebuf; off_t fpos; int buflen; protected: bool gff_warns; //warn about duplicate IDs, etc. 
even when they are on different chromosomes FILE* fh; char* fname; //optional fasta file with the underlying genomic sequence to be attached to this reader GffLine* gffline; bool transcriptsOnly; //keep only transcripts w/ their exon/CDS features GHash discarded_ids; //for transcriptsOnly mode, keep track // of discarded parent IDs GHash< GPVec > phash; //transcript_id+contig (Parent~Contig) => [gflst index, GffObj] //GHash tids; //just for transcript_id uniqueness char* gfoBuildId(const char* id, const char* ctg); //void gfoRemove(const char* id, const char* ctg); GffObj* gfoAdd(GffObj* gfo); GffObj* gfoAdd(GPVec& glst, GffObj* gfo); // const char* id, const char* ctg, char strand, GVec** glst, uint start, uint end GffObj* gfoFind(const char* id, const char* ctg=NULL, GPVec** glst=NULL, char strand=0, uint start=0, uint end=0); CNonExon* subfPoolCheck(GffLine* gffline, GHash& pex, char*& subp_name); void subfPoolAdd(GHash& pex, GffObj* newgfo); GffObj* promoteFeature(CNonExon* subp, char*& subp_name, GHash& pex, bool keepAttr, bool noExonAttr); GList gseqstats; //list of all genomic sequences seen by this reader, accumulates stats public: GffNames* names; //just a pointer to the global static Gff names repository in GffObj GfList gflst; //accumulate GffObjs being read GffObj* newGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, GffObj* parent=NULL, GffExon* pexon=NULL, GPVec* glst=NULL); //GffObj* replaceGffRec(GffLine* gffline, bool keepAttr, bool noExonAttr, int replaceidx); GffObj* updateGffRec(GffObj* prevgfo, GffLine* gffline, bool keepAttr); GffObj* updateParent(GffObj* newgfh, GffObj* parent); bool addExonFeature(GffObj* prevgfo, GffLine* gffline, GHash& pex, bool noExonAttr); GPVec gseqStats; //only populated after finalize() GffReader(FILE* f=NULL, bool t_only=false, bool sortbyloc=false):discarded_ids(true), phash(true), gseqstats(true,true,true), gflst(sortbyloc), gseqStats(1, false) { gff_warns=gff_show_warnings; names=NULL; gffline=NULL; transcriptsOnly=t_only; fpos=0; fname=NULL; fh=f; GMALLOC(linebuf, GFF_LINELEN); buflen=GFF_LINELEN-1; } void init(FILE *f, bool t_only=false, bool sortbyloc=false) { fname=NULL; fh=f; if (fh!=NULL) rewind(fh); fpos=0; transcriptsOnly=t_only; gflst.sortedByLoc(sortbyloc); } GffReader(char* fn, bool t_only=false, bool sort=false):discarded_ids(true), phash(true), gseqstats(true,true,true), gflst(sort), gseqStats(1,false) { gff_warns=gff_show_warnings; names=NULL; fname=Gstrdup(fn); transcriptsOnly=t_only; fh=fopen(fname, "rb"); fpos=0; gffline=NULL; GMALLOC(linebuf, GFF_LINELEN); buflen=GFF_LINELEN-1; } ~GffReader() { delete gffline; gffline=NULL; fpos=0; gflst.freeUnused(); gflst.Clear(); discarded_ids.Clear(); phash.Clear(); gseqstats.Clear(); GFREE(fname); GFREE(linebuf); } void showWarnings(bool v=true) { gff_warns=v; gff_show_warnings=v; } GffLine* nextGffLine(); // load all subfeatures, re-group them: void readAll(bool keepAttr=false, bool mergeCloseExons=false, bool noExonAttr=true); }; // end of GffReader #endif tophat-2.0.9/src/assert_helpers.h0000644000175000017500000001711212122334361015533 0ustar toortoor#ifndef ASSERT_HELPERS_H_ #define ASSERT_HELPERS_H_ #include #include #include #include /** * Assertion for release-enabled assertions */ class ReleaseAssertException : public std::runtime_error { public: ReleaseAssertException(const std::string& msg = "") : std::runtime_error(msg) {} }; /** * Macros for release-enabled assertions, and helper macros to make * all assertion error messages more helpful. 
*/ #ifndef NDEBUG #define ASSERT_ONLY(x...) x #else #define ASSERT_ONLY(x...) #endif #define rt_assert(b) \ if(!(b)) { \ std::cout << "rt_assert at " << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_msg(b,msg) \ if(!(b)) { \ std::cout << msg << " at " << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #define rt_assert_eq(ex,ac) \ if(!((ex) == (ac))) { \ std::cout << "rt_assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_eq_msg(ex,ac,msg) \ if(!((ex) == (ac))) { \ std::cout << "rt_assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_eq(ex,ac) \ if(!((ex) == (ac))) { \ std::cout << "assert_eq: expected (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_eq_msg(ex,ac,msg) \ if(!((ex) == (ac))) { \ std::cout << "assert_eq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_eq(ex,ac) #define assert_eq_msg(ex,ac,msg) #endif #define rt_assert_neq(ex,ac) \ if(!((ex) != (ac))) { \ std::cout << "rt_assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_neq_msg(ex,ac,msg) \ if(!((ex) != (ac))) { \ std::cout << "rt_assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_neq(ex,ac) \ if(!((ex) != (ac))) { \ std::cout << "assert_neq: expected not (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_neq_msg(ex,ac,msg) \ if(!((ex) != (ac))) { \ std::cout << "assert_neq: " << msg << ": (" << (ex) << ", 0x" << std::hex << (ex) << std::dec << ") got (" << (ac) << ", 0x" << std::hex << (ac) << std::dec << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_neq(ex,ac) #define assert_neq_msg(ex,ac,msg) #endif #define rt_assert_gt(a,b) \ if(!((a) > (b))) { \ std::cout << "rt_assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_gt_msg(a,b,msg) \ if(!((a) > (b))) { \ std::cout << "rt_assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; 
\ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_gt(a,b) \ if(!((a) > (b))) { \ std::cout << "assert_gt: expected (" << (a) << ") > (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_gt_msg(a,b,msg) \ if(!((a) > (b))) { \ std::cout << "assert_gt: " << msg << ": (" << (a) << ") > (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_gt(a,b) #define assert_gt_msg(a,b,msg) #endif #define rt_assert_geq(a,b) \ if(!((a) >= (b))) { \ std::cout << "rt_assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_geq_msg(a,b,msg) \ if(!((a) >= (b))) { \ std::cout << "rt_assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_geq(a,b) \ if(!((a) >= (b))) { \ std::cout << "assert_geq: expected (" << (a) << ") >= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_geq_msg(a,b,msg) \ if(!((a) >= (b))) { \ std::cout << "assert_geq: " << msg << ": (" << (a) << ") >= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_geq(a,b) #define assert_geq_msg(a,b,msg) #endif #define rt_assert_lt(a,b) \ if(!(a < b)) { \ std::cout << "rt_assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_lt_msg(a,b,msg) \ if(!(a < b)) { \ std::cout << "rt_assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_lt(a,b) \ if(!(a < b)) { \ std::cout << "assert_lt: expected (" << a << ") < (" << b << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_lt_msg(a,b,msg) \ if(!(a < b)) { \ std::cout << "assert_lt: " << msg << ": (" << a << ") < (" << b << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_lt(a,b) #define assert_lt_msg(a,b,msg) #endif #define rt_assert_leq(a,b) \ if(!((a) <= (b))) { \ std::cout << "rt_assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(); \ } #define rt_assert_leq_msg(a,b,msg) \ if(!((a) <= (b))) { \ std::cout << "rt_assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ throw ReleaseAssertException(msg); \ } #ifndef NDEBUG #define assert_leq(a,b) \ if(!((a) <= (b))) { \ std::cout << "assert_leq: expected (" << (a) << ") <= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #define assert_leq_msg(a,b,msg) \ if(!((a) <= (b))) { \ std::cout << "assert_leq: " << msg << ": (" << (a) << ") <= (" << (b) << ")" << std::endl; \ std::cout << __FILE__ << ":" << __LINE__ << std::endl; \ assert(0); \ } #else #define assert_leq(a,b) #define assert_leq_msg(a,b,msg) #endif #endif /*ASSERT_HELPERS_H_*/ 
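// Usage sketch for the macros above (illustrative, assuming a caller that
// includes this header): the rt_* variants are always compiled in and throw
// ReleaseAssertException on failure, while the plain assert_* variants
// expand to nothing when NDEBUG is defined.
#if 0
#include "assert_helpers.h"
static void check_lengths(size_t seq_len, size_t qual_len) {
  rt_assert_eq(seq_len, qual_len); // checked even in release builds
  assert_leq_msg(seq_len, (size_t)1024, "read longer than expected"); // debug-only
}
#endif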
tophat-2.0.9/src/insertions.cpp0000755000175000017500000001324212122334361015243 0ustar toortoor/* * insertions.cpp * TopHat * * Created by Ryan Kelley on 11/04/2010. * */ #ifdef HAVE_CONFIG_H #include #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "junctions.h" #include "insertions.h" #include "fragments.h" #include "wiggles.h" #include "tokenize.h" #include "reads.h" #include "inserts.h" /** * Add insertions from an alignment to an InsertionSet. * This will look for insertion in the alignment specified by bh. If the * insertion is already in insertions, it will updated the count. Otherwise, * it will add the insertion to the set and initialize the count to 1. * @param bh The bowtie hit to be used to specify alignment infromation. * @param insertions The InsertionSet that will be updated with the insertion information from teh alignment. */ void insertions_from_alignment(const BowtieHit& bh, InsertionSet& insertions) { vector > new_insertions; insertions_from_spliced_hit(bh, new_insertions); for(size_t i = 0; i < new_insertions.size(); ++i) { const pair& insertion = new_insertions[i]; InsertionSet::iterator itr = insertions.find(insertion.first); if (itr != insertions.end()) { itr->second.supporting_hits += 1; itr->second.left_extent = max(itr->second.left_extent, insertion.second.left_extent); itr->second.right_extent = max(itr->second.right_extent, insertion.second.right_extent); } else { //assert(insertion.refid != VMAXINT32); insertions[insertion.first] = insertion.second; } } return; } /** * Print insertions in BED format. * Note, as per the BED-standard (http://genome.ucsc.edu/FAQ/FAQformat) * -The coordinates should be 0-based * -The chromEnd field should not include the actual feature * -The name will be the inserted sequence * -The score will be the number of supporing counts, which is capped at 1,000 * By (my) convention, the chromStart will be the last genome postion * before hte insertio. * * \t\t\t\t\n * @param insertions_out The output file * @param insertions Maps from insertions to number of supporting reads * @param ref_sequences The table of reference sequences */ void print_insertions(FILE* insertions_out, const InsertionSet& insertions, RefSequenceTable& ref_sequences) { fprintf(insertions_out, "track name=insertions description=\"TopHat insertions\"\n"); for(InsertionSet::const_iterator i = insertions.begin(); i != insertions.end(); ++i){ int counts = i->second.supporting_hits; if(counts > 1000){ counts = 1000; } fprintf(insertions_out, "%s\t%d\t%d\t%s\t%d\n", ref_sequences.get_name(i->first.refid), i->first.left, i->first.left, (i->first.sequence).c_str(), counts); } } /** * Extract a list of insertions from a bowtie hit. * Given a bowtie hit, extract a vector of insertions. * @param bh The bowtie hit to use for alignment information. * @param insertions Used to store the resultant vector of insertions. 
*/ void insertions_from_spliced_hit(const BowtieHit& bh, vector >& insertions){ const vector& cigar = bh.cigar(); unsigned int positionInGenome = bh.left(); unsigned int positionInRead = 0; bool bSawFusion = false; for(size_t c = 0; c < cigar.size(); ++c){ switch(cigar[c].opcode){ case REF_SKIP: positionInGenome += cigar[c].length; break; case rEF_SKIP: positionInGenome -= cigar[c].length; break; case MATCH: case mATCH: if (cigar[c].opcode == MATCH) positionInGenome += cigar[c].length; else positionInGenome -= cigar[c].length; positionInRead += cigar[c].length; break; case DEL: positionInGenome += cigar[c].length; break; case dEL: positionInGenome -= cigar[c].length; break; case INS: case iNS: { Insertion insertion; InsertionStats stats; /* * Note that the reported position in the genome from the SAM * alignment is 1-based, since the insertion object is expecting * a 0-based co-ordinate, we need to subtract 1 */ if (bSawFusion) insertion.refid = bh.ref_id2(); else insertion.refid = bh.ref_id(); if (cigar[c].opcode == INS) insertion.left = positionInGenome - 1; else insertion.left = positionInGenome + 1; insertion.sequence = bh.seq().substr(positionInRead, cigar[c].length); stats.supporting_hits = 1; if (c > 0) stats.left_extent = cigar[c-1].length; if (c + 1 < cigar.size()) stats.right_extent = cigar[c+1].length; insertions.push_back(make_pair(insertion, stats)); positionInRead += cigar[c].length; } break; case FUSION_FF: case FUSION_FR: case FUSION_RF: bSawFusion = true; positionInGenome = cigar[c].length; break; default: break; } } return; } void merge_with(InsertionSet& insertions, const InsertionSet& other) { for (InsertionSet::const_iterator insertion = other.begin(); insertion != other.end(); ++insertion) { InsertionSet::iterator itr = insertions.find(insertion->first); if (itr != insertions.end()) { itr->second.merge_with(insertion->second); } else { insertions[insertion->first] = insertion->second; } } } tophat-2.0.9/src/Makefile.in0000644000175000017500000014645612157340460014427 0ustar toortoor# Makefile.in generated by automake 1.9.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. @SET_MAKE@ #include $(top_srcdir)/build-aux/tophat.mk srcdir = @srcdir@ top_srcdir = @top_srcdir@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ top_builddir = .. 
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd INSTALL = @INSTALL@ install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ bin_PROGRAMS = prep_reads$(EXEEXT) gtf_to_fasta$(EXEEXT) \ fix_map_ordering$(EXEEXT) bam2fastx$(EXEEXT) \ segment_juncs$(EXEEXT) gtf_juncs$(EXEEXT) juncs_db$(EXEEXT) \ long_spanning_reads$(EXEEXT) bam_merge$(EXEEXT) \ map2gtf$(EXEEXT) tophat_reports$(EXEEXT) sam_juncs$(EXEEXT) subdir = src DIST_COMMON = $(dist_bin_SCRIPTS) $(noinst_HEADERS) \ $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/ax_boost_base.m4 \ $(top_srcdir)/ax_boost_thread.m4 $(top_srcdir)/ax_bam.m4 \ $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/config.h CONFIG_CLEAN_FILES = LIBRARIES = $(noinst_LIBRARIES) AR = ar ARFLAGS = cru libgc_a_AR = $(AR) $(ARFLAGS) libgc_a_LIBADD = am_libgc_a_OBJECTS = GBase.$(OBJEXT) codons.$(OBJEXT) gdna.$(OBJEXT) \ GStr.$(OBJEXT) GFaSeqGet.$(OBJEXT) gff.$(OBJEXT) libgc_a_OBJECTS = $(am_libgc_a_OBJECTS) libtophat_a_AR = $(AR) $(ARFLAGS) libtophat_a_LIBADD = am_libtophat_a_OBJECTS = reads.$(OBJEXT) alphabet.$(OBJEXT) \ bwt_map.$(OBJEXT) common.$(OBJEXT) junctions.$(OBJEXT) \ insertions.$(OBJEXT) deletions.$(OBJEXT) fusions.$(OBJEXT) \ align_status.$(OBJEXT) fragments.$(OBJEXT) tokenize.$(OBJEXT) \ inserts.$(OBJEXT) qual.$(OBJEXT) bam_merge_impl.$(OBJEXT) \ utils.$(OBJEXT) coverage.$(OBJEXT) libtophat_a_OBJECTS = $(am_libtophat_a_OBJECTS) am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" \ "$(DESTDIR)$(bindir)" binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(bin_PROGRAMS) am_bam2fastx_OBJECTS = bam2fastx.$(OBJEXT) bam2fastx_OBJECTS = $(am_bam2fastx_OBJECTS) am__DEPENDENCIES_1 = bam2fastx_DEPENDENCIES = $(top_builddir)/src/libgc.a \ $(am__DEPENDENCIES_1) am_bam_merge_OBJECTS = bam_merge.$(OBJEXT) bam_merge_OBJECTS = $(am_bam_merge_OBJECTS) bam_merge_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(top_builddir)/src/libgc.a $(am__DEPENDENCIES_1) am_fix_map_ordering_OBJECTS = fix_map_ordering.$(OBJEXT) fix_map_ordering_OBJECTS = $(am_fix_map_ordering_OBJECTS) fix_map_ordering_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) am_gtf_juncs_OBJECTS = gtf_juncs.$(OBJEXT) gtf_juncs_OBJECTS = $(am_gtf_juncs_OBJECTS) gtf_juncs_DEPENDENCIES = $(top_builddir)/src/libtophat.a libgc.a \ $(am__DEPENDENCIES_1) am_gtf_to_fasta_OBJECTS = GTFToFasta.$(OBJEXT) FastaTools.$(OBJEXT) gtf_to_fasta_OBJECTS = $(am_gtf_to_fasta_OBJECTS) gtf_to_fasta_DEPENDENCIES = $(top_builddir)/src/libtophat.a libgc.a \ $(am__DEPENDENCIES_1) am_juncs_db_OBJECTS = juncs_db.$(OBJEXT) juncs_db_OBJECTS = $(am_juncs_db_OBJECTS) juncs_db_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) am_long_spanning_reads_OBJECTS = long_spanning_reads.$(OBJEXT) long_spanning_reads_OBJECTS = $(am_long_spanning_reads_OBJECTS) long_spanning_reads_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) am_map2gtf_OBJECTS = map2gtf.$(OBJEXT) map2gtf_OBJECTS = $(am_map2gtf_OBJECTS) map2gtf_DEPENDENCIES = 
$(top_builddir)/src/libtophat.a libgc.a \ $(am__DEPENDENCIES_1) am_prep_reads_OBJECTS = prep_reads.$(OBJEXT) prep_reads_OBJECTS = $(am_prep_reads_OBJECTS) prep_reads_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) am_sam_juncs_OBJECTS = sam_juncs.$(OBJEXT) sam_juncs_OBJECTS = $(am_sam_juncs_OBJECTS) sam_juncs_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) am_segment_juncs_OBJECTS = segment_juncs.$(OBJEXT) segment_juncs_OBJECTS = $(am_segment_juncs_OBJECTS) segment_juncs_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) am_tophat_reports_OBJECTS = tophat_reports.$(OBJEXT) tophat_reports_OBJECTS = $(am_tophat_reports_OBJECTS) tophat_reports_DEPENDENCIES = $(top_builddir)/src/libtophat.a \ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \ $(am__DEPENDENCIES_1) binSCRIPT_INSTALL = $(INSTALL_SCRIPT) dist_binSCRIPT_INSTALL = $(INSTALL_SCRIPT) SCRIPTS = $(bin_SCRIPTS) $(dist_bin_SCRIPTS) DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/build-aux/depcomp am__depfiles_maybe = depfiles COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) CCLD = $(CC) LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) CXXLD = $(CXX) CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ -o $@ SOURCES = $(libgc_a_SOURCES) $(libtophat_a_SOURCES) \ $(bam2fastx_SOURCES) $(bam_merge_SOURCES) \ $(fix_map_ordering_SOURCES) $(gtf_juncs_SOURCES) \ $(gtf_to_fasta_SOURCES) $(juncs_db_SOURCES) \ $(long_spanning_reads_SOURCES) $(map2gtf_SOURCES) \ $(prep_reads_SOURCES) $(sam_juncs_SOURCES) \ $(segment_juncs_SOURCES) $(tophat_reports_SOURCES) DIST_SOURCES = $(libgc_a_SOURCES) $(libtophat_a_SOURCES) \ $(bam2fastx_SOURCES) $(bam_merge_SOURCES) \ $(fix_map_ordering_SOURCES) $(gtf_juncs_SOURCES) \ $(gtf_to_fasta_SOURCES) $(juncs_db_SOURCES) \ $(long_spanning_reads_SOURCES) $(map2gtf_SOURCES) \ $(prep_reads_SOURCES) $(sam_juncs_SOURCES) \ $(segment_juncs_SOURCES) $(tophat_reports_SOURCES) HEADERS = $(noinst_HEADERS) ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMDEP_FALSE = @AMDEP_FALSE@ AMDEP_TRUE = @AMDEP_TRUE@ AMTAR = @AMTAR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ BAM_CPPFLAGS = @BAM_CPPFLAGS@ BAM_LDFLAGS = @BAM_LDFLAGS@ BAM_LIB = @BAM_LIB@ BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ BOOST_LDFLAGS = @BOOST_LDFLAGS@ BOOST_SYSTEM_LIB = @BOOST_SYSTEM_LIB@ BOOST_THREAD_LIB = @BOOST_THREAD_LIB@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ OBJEXT = @OBJEXT@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PYTHON = @PYTHON@ 
PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ PYTHON_PLATFORM = @PYTHON_PLATFORM@ PYTHON_PREFIX = @PYTHON_PREFIX@ PYTHON_VERSION = @PYTHON_VERSION@ RANLIB = @RANLIB@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_RANLIB = @ac_ct_RANLIB@ ac_ct_STRIP = @ac_ct_STRIP@ am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ datadir = @datadir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pkgpyexecdir = @pkgpyexecdir@ pkgpythondir = @pkgpythondir@ prefix = @prefix@ program_transform_name = @program_transform_name@ pyexecdir = @pyexecdir@ pythondir = @pythondir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ # Generated with # find SeqAn-1.3 -type f -print | grep -v ".svn" | sed 's/$/ \\/g' # run from src # - and tophat2.in added EXTRA_DIST = \ tophat.py \ tophat2.in \ SeqAn-1.3/COPYING \ SeqAn-1.3/README \ SeqAn-1.3/seqan/chaining/chain_generic.h \ SeqAn-1.3/seqan/chaining/rt_skip_base_element.h \ SeqAn-1.3/seqan/chaining/score_zero.h \ SeqAn-1.3/seqan/chaining/geom_distribution.h \ SeqAn-1.3/seqan/chaining/chain_base.h \ SeqAn-1.3/seqan/chaining/chaining_generated_forwards.h \ SeqAn-1.3/seqan/chaining/rmt_skip_base_element.h \ SeqAn-1.3/seqan/chaining/skip_element.h \ SeqAn-1.3/seqan/chaining/rmt_skip_element.h \ SeqAn-1.3/seqan/chaining/rt_sl_impl.h \ SeqAn-1.3/seqan/chaining/chain_wrapper_point.h \ SeqAn-1.3/seqan/chaining/rmt_compl_algos.h \ SeqAn-1.3/seqan/chaining/rmt_base.h \ SeqAn-1.3/seqan/chaining/skip_list.h \ SeqAn-1.3/seqan/chaining/skip_list_iterator.h \ SeqAn-1.3/seqan/chaining/skip_base_element.h \ SeqAn-1.3/seqan/chaining/fragment.h \ SeqAn-1.3/seqan/chaining/skip_pool_alloc.h \ SeqAn-1.3/seqan/chaining/skip_list_impl.h \ SeqAn-1.3/seqan/chaining/rt_skip_element.h \ SeqAn-1.3/seqan/chaining/tree_chain_sop.h \ SeqAn-1.3/seqan/chaining/rmt_common_algos.h \ SeqAn-1.3/seqan/chaining/score_manhattan.h \ SeqAn-1.3/seqan/chaining/score_chain_sop.h \ SeqAn-1.3/seqan/chaining/rt_impl.h \ SeqAn-1.3/seqan/chaining/tree_chain.h \ SeqAn-1.3/seqan/chaining/tree_chain_utils.h \ SeqAn-1.3/seqan/chaining/skip_list_base.h \ SeqAn-1.3/seqan/chaining/rt_sl_def_algos.h \ SeqAn-1.3/seqan/chaining/range_tree.h \ SeqAn-1.3/seqan/chaining/rmt_def_algos.h \ SeqAn-1.3/seqan/chaining/chain_point.h \ SeqAn-1.3/seqan/chaining/rt_base.h \ SeqAn-1.3/seqan/chaining/chain_meta_fragment.h \ SeqAn-1.3/seqan/chaining/score_chain.h \ SeqAn-1.3/seqan/chaining/rt_common_algos.h \ SeqAn-1.3/seqan/chaining/skip_list_dynamic.h \ SeqAn-1.3/seqan/chaining/rt_sl_base.h \ SeqAn-1.3/seqan/chaining/skip_list_type.h \ SeqAn-1.3/seqan/chaining/range_max_tree.h \ SeqAn-1.3/seqan/chaining/rt_sl_compl_algos.h \ SeqAn-1.3/seqan/file/file_format_cgviz.h \ SeqAn-1.3/seqan/file/stream.h \ 
SeqAn-1.3/seqan/file/file_format_embl.h \ SeqAn-1.3/seqan/file/file_format_fasta.h \ SeqAn-1.3/seqan/file/string_mmap.h \ SeqAn-1.3/seqan/file/file_format_raw.h \ SeqAn-1.3/seqan/file/string_external.h \ SeqAn-1.3/seqan/file/file_filereader.h \ SeqAn-1.3/seqan/file/file_cstyle.h \ SeqAn-1.3/seqan/file/file_page_raid0.h \ SeqAn-1.3/seqan/file/cstream.h \ SeqAn-1.3/seqan/file/file_format_guess.h \ SeqAn-1.3/seqan/file/file_base.h \ SeqAn-1.3/seqan/file/file_format_mmap.h \ SeqAn-1.3/seqan/file/file_forwards.h \ SeqAn-1.3/seqan/file/file_array.h \ SeqAn-1.3/seqan/file/meta.h \ SeqAn-1.3/seqan/file/stream_algorithms.h \ SeqAn-1.3/seqan/file/file_format.h \ SeqAn-1.3/seqan/file/file_filereaderiterator.h \ SeqAn-1.3/seqan/file/file_format_genbank.h \ SeqAn-1.3/seqan/file/file_generated_forwards.h \ SeqAn-1.3/seqan/file/file_format_fasta_align.h \ SeqAn-1.3/seqan/file/chunk_collector.h \ SeqAn-1.3/seqan/file/file_page.h \ SeqAn-1.3/seqan/seeds.h \ SeqAn-1.3/seqan/find_motif.h \ SeqAn-1.3/seqan/LICENSE \ SeqAn-1.3/seqan/find2/find_approx_find_begin.h \ SeqAn-1.3/seqan/find2/find_finder_default.h \ SeqAn-1.3/seqan/find2/find_exact_simple.h \ SeqAn-1.3/seqan/find2/find_multiple_exact_simple.h \ SeqAn-1.3/seqan/find2/find_hamming_simple.h \ SeqAn-1.3/seqan/find2/find_pattern_wild_shiftand.h \ SeqAn-1.3/seqan/find2/find_approx_dpsearch.h \ SeqAn-1.3/seqan/find2/find_exact_shiftand.h \ SeqAn-1.3/seqan/find2/find2_generated_forwards.h \ SeqAn-1.3/seqan/find2/find_base.h \ SeqAn-1.3/seqan/platform.h \ SeqAn-1.3/seqan/sequence_journaled.h \ SeqAn-1.3/seqan/chaining.h \ SeqAn-1.3/seqan/score/score_generated_forwards.h \ SeqAn-1.3/seqan/score/score_matrix_data.h \ SeqAn-1.3/seqan/score/score_matrix.h \ SeqAn-1.3/seqan/score/score_simple.h \ SeqAn-1.3/seqan/score/score_base.h \ SeqAn-1.3/seqan/score/score_edit.h \ SeqAn-1.3/seqan/graph_algorithms.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree_iterator.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_forwards.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree_node.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_iterator.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entry.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_sorted_array.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_generated_forwards.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree.h \ SeqAn-1.3/seqan/align/gaps_iterator_base.h \ SeqAn-1.3/seqan/align/gaps_array.h \ SeqAn-1.3/seqan/align/matrix_base.h \ SeqAn-1.3/seqan/align/gaps_base.h \ SeqAn-1.3/seqan/align/align_iterator_base.h \ SeqAn-1.3/seqan/align/align_algorithms.h \ SeqAn-1.3/seqan/align/align_hirschberg.h \ SeqAn-1.3/seqan/align/align_local_dynprog.h \ SeqAn-1.3/seqan/align/align_cols_base.h \ SeqAn-1.3/seqan/align/align_generated_forwards.h \ SeqAn-1.3/seqan/align/align_base.h \ SeqAn-1.3/seqan/align/gaps_sumlist.h \ SeqAn-1.3/seqan/align/align_local_dynprog_banded.h \ SeqAn-1.3/seqan/align/align_dynprog.h \ SeqAn-1.3/seqan/align/align_myers.h \ SeqAn-1.3/seqan/align/hirschberg_set.h \ SeqAn-1.3/seqan/align/align_trace.h \ SeqAn-1.3/seqan/pipe/pipe_sampler.h \ SeqAn-1.3/seqan/pipe/pipe_filter.h \ SeqAn-1.3/seqan/pipe/pool_sorter.h \ SeqAn-1.3/seqan/pipe/pipe_namer.h \ SeqAn-1.3/seqan/pipe/pipe_joiner.h \ SeqAn-1.3/seqan/pipe/pipe_base.h \ SeqAn-1.3/seqan/pipe/pipe_echoer.h \ SeqAn-1.3/seqan/pipe/pipe_caster.h \ SeqAn-1.3/seqan/pipe/pipe_generated_forwards.h \ 
SeqAn-1.3/seqan/pipe/pipe_counter.h \ SeqAn-1.3/seqan/pipe/pool_mapper.h \ SeqAn-1.3/seqan/pipe/pipe_edit_environment.h \ SeqAn-1.3/seqan/pipe/pool_base.h \ SeqAn-1.3/seqan/pipe/pipe_shifter.h \ SeqAn-1.3/seqan/pipe/pipe_source.h \ SeqAn-1.3/seqan/pipe/pipe_tupler.h \ SeqAn-1.3/seqan/pipe/pipe_iterator.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_vertex.h \ SeqAn-1.3/seqan/graph_types/graph_impl_oracle.h \ SeqAn-1.3/seqan/graph_types/graph_idmanager.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_adjacency.h \ SeqAn-1.3/seqan/graph_types/graph_iterator.h \ SeqAn-1.3/seqan/graph_types/graph_impl_automaton.h \ SeqAn-1.3/seqan/graph_types/graph_impl_trie.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_bfs.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_dfs.h \ SeqAn-1.3/seqan/graph_types/graph_property.h \ SeqAn-1.3/seqan/graph_types/graph_impl_fragment.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_outedge.h \ SeqAn-1.3/seqan/graph_types/graph_utility_parsing.h \ SeqAn-1.3/seqan/graph_types/graph_impl_undirected.h \ SeqAn-1.3/seqan/graph_types/graph_impl_hmm.h \ SeqAn-1.3/seqan/graph_types/graph_base.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_edge.h \ SeqAn-1.3/seqan/graph_types/graph_edgestump.h \ SeqAn-1.3/seqan/graph_types/graph_impl_tree.h \ SeqAn-1.3/seqan/graph_types/graph_impl_wordgraph.h \ SeqAn-1.3/seqan/graph_types/graph_types_generated_forwards.h \ SeqAn-1.3/seqan/graph_types/graph_impl_directed.h \ SeqAn-1.3/seqan/graph_types/graph_drawing.h \ SeqAn-1.3/seqan/graph_types/graph_interface.h \ SeqAn-1.3/seqan/file.h \ SeqAn-1.3/seqan/system.h \ SeqAn-1.3/seqan/refinement.h \ SeqAn-1.3/seqan/pipe.h \ SeqAn-1.3/seqan/system/system_thread.h \ SeqAn-1.3/seqan/system/system_base.h \ SeqAn-1.3/seqan/system/system_event.h \ SeqAn-1.3/seqan/system/file_directory.h \ SeqAn-1.3/seqan/system/system_mutex.h \ SeqAn-1.3/seqan/system/file_manual_forwards.h \ SeqAn-1.3/seqan/system/system_generated_forwards.h \ SeqAn-1.3/seqan/system/file_sync.h \ SeqAn-1.3/seqan/system/file_async.h \ SeqAn-1.3/seqan/system/system_sema.h \ SeqAn-1.3/seqan/system/system_manual_forwards.h \ SeqAn-1.3/seqan/misc/edit_environment.h \ SeqAn-1.3/seqan/misc/misc_parsing.h \ SeqAn-1.3/seqan/misc/misc_generated_forwards.h \ SeqAn-1.3/seqan/misc/misc_base.h \ SeqAn-1.3/seqan/misc/priority_type_heap.h \ SeqAn-1.3/seqan/misc/priority_type_base.h \ SeqAn-1.3/seqan/misc/misc_dequeue.h \ SeqAn-1.3/seqan/misc/misc_set.h \ SeqAn-1.3/seqan/misc/misc_map.h \ SeqAn-1.3/seqan/misc/misc_svg.h \ SeqAn-1.3/seqan/misc/misc_interval_tree.h \ SeqAn-1.3/seqan/misc/misc_long_word.h \ SeqAn-1.3/seqan/misc/misc_cmdparser.h \ SeqAn-1.3/seqan/misc/misc_random.h \ SeqAn-1.3/seqan/platform/platform_mingw.h \ SeqAn-1.3/seqan/platform/platform_gcc.h \ SeqAn-1.3/seqan/platform/platform_windows.h \ SeqAn-1.3/seqan/platform/platform_solaris.h \ SeqAn-1.3/seqan/platform/platform_generated_forwards.h \ SeqAn-1.3/seqan/graph_align/graph_align_gotoh.h \ SeqAn-1.3/seqan/graph_align/graph_align_generated_forwards.h \ SeqAn-1.3/seqan/graph_align/graph_align_smith_waterman.h \ SeqAn-1.3/seqan/graph_align/graph_align_base.h \ SeqAn-1.3/seqan/graph_align/graph_align_hirschberg.h \ SeqAn-1.3/seqan/graph_align/graph_align_needleman_wunsch.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_smith_waterman_clump.h \ SeqAn-1.3/seqan/graph_align/graph_align_interface.h \ SeqAn-1.3/seqan/graph_align/graph_align_smith_waterman_clump.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_gotoh.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_needleman_wunsch.h \ 
SeqAn-1.3/seqan/graph_align/graph_align_config.h \ SeqAn-1.3/seqan/parallel/parallel_generated_forwards.h \ SeqAn-1.3/seqan/parallel/parallel_atomic_misc.h \ SeqAn-1.3/seqan/parallel/parallel_atomic_primitives.h \ SeqAn-1.3/seqan/parallel/parallel_macros.h \ SeqAn-1.3/seqan/store/store_io_gff.h \ SeqAn-1.3/seqan/store/store_library.h \ SeqAn-1.3/seqan/store/store_read.h \ SeqAn-1.3/seqan/store/store_annotation.h \ SeqAn-1.3/seqan/store/store_intervaltree.h \ SeqAn-1.3/seqan/store/store_io_ucsc.h \ SeqAn-1.3/seqan/store/store_all.h \ SeqAn-1.3/seqan/store/store_matepair.h \ SeqAn-1.3/seqan/store/store_align_intervals.h \ SeqAn-1.3/seqan/store/store_io.h \ SeqAn-1.3/seqan/store/store_io_sam.h \ SeqAn-1.3/seqan/store/store_align.h \ SeqAn-1.3/seqan/store/store_io_bam.h \ SeqAn-1.3/seqan/store/store_contig.h \ SeqAn-1.3/seqan/store/store_generated_forwards.h \ SeqAn-1.3/seqan/store/store_base.h \ SeqAn-1.3/seqan/align.h \ SeqAn-1.3/seqan/modifier/modifier_string.h \ SeqAn-1.3/seqan/modifier/modifier_alphabet_expansion.h \ SeqAn-1.3/seqan/modifier/modifier_shortcuts.h \ SeqAn-1.3/seqan/modifier/modifier_alphabet.h \ SeqAn-1.3/seqan/modifier/modifier_iterator.h \ SeqAn-1.3/seqan/modifier/modifier_reverse.h \ SeqAn-1.3/seqan/modifier/modifier_generated_forwards.h \ SeqAn-1.3/seqan/modifier/modifier_view.h \ SeqAn-1.3/seqan/modifier/modifier_functors.h \ SeqAn-1.3/seqan/seeds2.h \ SeqAn-1.3/seqan/index/index_sa_btree.h \ SeqAn-1.3/seqan/index/find_quasar.h \ SeqAn-1.3/seqan/index/index_esa_algs.h \ SeqAn-1.3/seqan/index/index_skew7.h \ SeqAn-1.3/seqan/index/index_pizzachili.h \ SeqAn-1.3/seqan/index/index_sa_lss.h \ SeqAn-1.3/seqan/index/pump_extender7.h \ SeqAn-1.3/seqan/index/index_shawarma.h \ SeqAn-1.3/seqan/index/find_index_approx.h \ SeqAn-1.3/seqan/index/index_lcp.h \ SeqAn-1.3/seqan/index/find_swift.h \ SeqAn-1.3/seqan/index/shape_threshold.h \ SeqAn-1.3/seqan/index/pizzachili_api.h \ SeqAn-1.3/seqan/index/index_sa_mm.h \ SeqAn-1.3/seqan/index/index_esa_stree.h \ SeqAn-1.3/seqan/index/index_qgram.h \ SeqAn-1.3/seqan/index/pump_extender3.h \ SeqAn-1.3/seqan/index/shape_onegapped.h \ SeqAn-1.3/seqan/index/find_index_qgram.h \ SeqAn-1.3/seqan/index/index_skew3.h \ SeqAn-1.3/seqan/index/index_childtab.h \ SeqAn-1.3/seqan/index/pipe_merger7.h \ SeqAn-1.3/seqan/index/pipe_merger3.h \ SeqAn-1.3/seqan/index/repeat_base.h \ SeqAn-1.3/seqan/index/index_pizzachili_find.h \ SeqAn-1.3/seqan/index/index_sa_qsort.h \ SeqAn-1.3/seqan/index/index_shims.h \ SeqAn-1.3/seqan/index/index_manual_forwards.h \ SeqAn-1.3/seqan/index/shape_base.h \ SeqAn-1.3/seqan/index/shape_predefined.h \ SeqAn-1.3/seqan/index/index_pizzachili_string.h \ SeqAn-1.3/seqan/index/index_generated_forwards.h \ SeqAn-1.3/seqan/index/find_index_esa.h \ SeqAn-1.3/seqan/index/pump_separator7.h \ SeqAn-1.3/seqan/index/index_esa_base.h \ SeqAn-1.3/seqan/index/radix.h \ SeqAn-1.3/seqan/index/index_skew7_multi.h \ SeqAn-1.3/seqan/index/index_esa_algs_multi.h \ SeqAn-1.3/seqan/index/index_base.h \ SeqAn-1.3/seqan/index/shape_gapped.h \ SeqAn-1.3/seqan/index/index_lcp_tree.h \ SeqAn-1.3/seqan/index/index_esa_drawing.h \ SeqAn-1.3/seqan/index/find_index.h \ SeqAn-1.3/seqan/index/pump_lcp_core.h \ SeqAn-1.3/seqan/index/index_dfi.h \ SeqAn-1.3/seqan/index/index_sa_bwtwalk.h \ SeqAn-1.3/seqan/index/index_qgram_openaddressing.h \ SeqAn-1.3/seqan/index/index_bwt.h \ SeqAn-1.3/seqan/index/index_wotd.h \ SeqAn-1.3/seqan/statistics/statistics_base.h \ SeqAn-1.3/seqan/statistics/statistics_markov_model.h \ 
SeqAn-1.3/seqan/statistics/statistics_generated_forwards.h \ SeqAn-1.3/seqan/score.h \ SeqAn-1.3/seqan/sequence.h \ SeqAn-1.3/seqan/parallel.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_heap_tree.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithms_generated_forwards.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_hmm.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_lis_his.h \ SeqAn-1.3/seqan/blast.h \ SeqAn-1.3/seqan/store.h \ SeqAn-1.3/seqan/graph_align.h \ SeqAn-1.3/seqan/random/random_lognormal.h \ SeqAn-1.3/seqan/random/ext_MersenneTwister.h \ SeqAn-1.3/seqan/random/random_generated_forwards.h \ SeqAn-1.3/seqan/random/random_rng_functor.h \ SeqAn-1.3/seqan/random/random_base.h \ SeqAn-1.3/seqan/random/random_uniform.h \ SeqAn-1.3/seqan/random/random_shuffle.h \ SeqAn-1.3/seqan/random/random_normal.h \ SeqAn-1.3/seqan/random/random_geometric.h \ SeqAn-1.3/seqan/random/random_mt19937.h \ SeqAn-1.3/seqan/basic.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_progressive.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_distance.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_io.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_kmer.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_refinement.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_msa.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_guidetree.h \ SeqAn-1.3/seqan/graph_msa/graph_msa_generated_forwards.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_library.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_base.h \ SeqAn-1.3/seqan/basic/basic_sse2.h \ SeqAn-1.3/seqan/basic/basic_forwards.h \ SeqAn-1.3/seqan/basic/basic_testing.h \ SeqAn-1.3/seqan/basic/basic_counted_ptr.h \ SeqAn-1.3/seqan/basic/basic_iterator_base.h \ SeqAn-1.3/seqan/basic/basic_pointer.h \ SeqAn-1.3/seqan/basic/basic_allocator_chunkpool.h \ SeqAn-1.3/seqan/basic/basic_parallelism.h \ SeqAn-1.3/seqan/basic/basic_allocator_singlepool.h \ SeqAn-1.3/seqan/basic/basic_iterator_adaptor.h \ SeqAn-1.3/seqan/basic/basic_profile.h \ SeqAn-1.3/seqan/basic/basic_tag.h \ SeqAn-1.3/seqan/basic/basic_compare.h \ SeqAn-1.3/seqan/basic/basic_alphabet_interface.h \ SeqAn-1.3/seqan/basic/basic_iterator_simple.h \ SeqAn-1.3/seqan/basic/basic_definition.h \ SeqAn-1.3/seqan/basic/basic_operator.h \ SeqAn-1.3/seqan/basic/basic_holder_dynamic.h \ SeqAn-1.3/seqan/basic/basic_allocator_multipool.h \ SeqAn-1.3/seqan/basic/basic_proxy.h \ SeqAn-1.3/seqan/basic/basic_alphabet_simple_tabs.h \ SeqAn-1.3/seqan/basic/basic_metaprogramming.h \ SeqAn-1.3/seqan/basic/basic_host.h \ SeqAn-1.3/seqan/basic/basic_allocator_to_std.h \ SeqAn-1.3/seqan/basic/basic_allocator_interface.h \ SeqAn-1.3/seqan/basic/basic_alphabet_interface2.h \ SeqAn-1.3/seqan/basic/basic_logvalue.h \ SeqAn-1.3/seqan/basic/basic_volatile_ptr.h \ SeqAn-1.3/seqan/basic/basic_debug.h \ SeqAn-1.3/seqan/basic/basic_profchar.h \ SeqAn-1.3/seqan/basic/basic_aggregates.h \ SeqAn-1.3/seqan/basic/basic_transport.h \ SeqAn-1.3/seqan/basic/basic_holder.h \ SeqAn-1.3/seqan/basic/basic_converter.h \ SeqAn-1.3/seqan/basic/basic_alphabet_simple.h \ SeqAn-1.3/seqan/basic/basic_type.h \ SeqAn-1.3/seqan/basic/basic_iterator_adapt_std.h \ SeqAn-1.3/seqan/basic/basic_allocator_simple.h \ SeqAn-1.3/seqan/basic/basic_iterator.h \ SeqAn-1.3/seqan/basic/basic_generated_forwards.h \ SeqAn-1.3/seqan/basic/basic_iterator_position.h \ SeqAn-1.3/seqan/basic/basic_alphabet_trait_basic.h \ SeqAn-1.3/seqan/sequence/sequence_forwards.h \ SeqAn-1.3/seqan/sequence/segment_infix.h \ 
SeqAn-1.3/seqan/sequence/string_set_dependent_tight.h \ SeqAn-1.3/seqan/sequence/segment_base.h \ SeqAn-1.3/seqan/sequence/sequence_lexical.h \ SeqAn-1.3/seqan/sequence/string_cstyle.h \ SeqAn-1.3/seqan/sequence/sequence_concatenator.h \ SeqAn-1.3/seqan/sequence/string_packed.h \ SeqAn-1.3/seqan/sequence/sequence_stream.h \ SeqAn-1.3/seqan/sequence/sequence_shortcuts.h \ SeqAn-1.3/seqan/sequence/adapt_std_list.h \ SeqAn-1.3/seqan/sequence/string_base.h \ SeqAn-1.3/seqan/sequence/string_set_owner.h \ SeqAn-1.3/seqan/sequence/adapt_std_string.h \ SeqAn-1.3/seqan/sequence/sequence_generated_forwards.h \ SeqAn-1.3/seqan/sequence/string_set_concat_direct.h \ SeqAn-1.3/seqan/sequence/sequence_interface.h \ SeqAn-1.3/seqan/sequence/adapt_array_pointer.h \ SeqAn-1.3/seqan/sequence/segment_suffix.h \ SeqAn-1.3/seqan/sequence/string_array.h \ SeqAn-1.3/seqan/sequence/iter_concat_virtual.h \ SeqAn-1.3/seqan/sequence/segment_prefix.h \ SeqAn-1.3/seqan/sequence/string_block.h \ SeqAn-1.3/seqan/sequence/string_set_base.h \ SeqAn-1.3/seqan/sequence/adapt_std_vector.h \ SeqAn-1.3/seqan/sequence/string_set_dependent_generous.h \ SeqAn-1.3/seqan/sequence/string_alloc.h \ SeqAn-1.3/seqan/modifier.h \ SeqAn-1.3/seqan/random.h \ SeqAn-1.3/seqan/statistics.h \ SeqAn-1.3/seqan/consensus.h \ SeqAn-1.3/seqan/find.h \ SeqAn-1.3/seqan/find2.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_inexact.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_fragment.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_annotation.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_exact.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_align.h \ SeqAn-1.3/seqan/refinement/refinement_generated_forwards.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_scoring.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_exact_iterative.h \ SeqAn-1.3/seqan/refinement/graph_impl_align.h \ SeqAn-1.3/seqan/refinement/graph_impl_align_adapt.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_aligngraph.h \ SeqAn-1.3/seqan/seeds/global_seed_chain.h \ SeqAn-1.3/seqan/seeds/seedSet_score.h \ SeqAn-1.3/seqan/seeds/seeds_generated_forwards.h \ SeqAn-1.3/seqan/seeds/banded_chain_align_affine.h \ SeqAn-1.3/seqan/seeds/banded_align.h \ SeqAn-1.3/seqan/seeds/memoryManager_int.h \ SeqAn-1.3/seqan/seeds/seedHandlingTags.h \ SeqAn-1.3/seqan/seeds/seedSet_iterator.h \ SeqAn-1.3/seqan/seeds/banded_chain_align.h \ SeqAn-1.3/seqan/seeds/seed_multi.h \ SeqAn-1.3/seqan/seeds/seedSet_base.h \ SeqAn-1.3/seqan/seeds/propertyMap.h \ SeqAn-1.3/seqan/seeds/memoryManager_base.h \ SeqAn-1.3/seqan/seeds/seed_base.h \ SeqAn-1.3/seqan/graph_msa.h \ SeqAn-1.3/seqan/consensus/consensus_score.h \ SeqAn-1.3/seqan/consensus/consensus_base.h \ SeqAn-1.3/seqan/consensus/consensus_generated_forwards.h \ SeqAn-1.3/seqan/consensus/consensus_realign.h \ SeqAn-1.3/seqan/consensus/consensus_library.h \ SeqAn-1.3/seqan/map/map_generated_forwards.h \ SeqAn-1.3/seqan/map/map_base.h \ SeqAn-1.3/seqan/map/map_chooser.h \ SeqAn-1.3/seqan/map/map_vector.h \ SeqAn-1.3/seqan/map/sumlist_mini.h \ SeqAn-1.3/seqan/map/map_adapter_stl.h \ SeqAn-1.3/seqan/map/sumlist.h \ SeqAn-1.3/seqan/map/map_skiplist.h \ SeqAn-1.3/seqan/map/sumlist_skip.h \ SeqAn-1.3/seqan/blast/blast_stream_report.h \ SeqAn-1.3/seqan/blast/blast_run.h \ SeqAn-1.3/seqan/blast/blast_hsp_iterator.h \ SeqAn-1.3/seqan/blast/blast_stream_hit_iterator.h \ SeqAn-1.3/seqan/blast/blast_hsp.h \ SeqAn-1.3/seqan/blast/blast_base.h \ SeqAn-1.3/seqan/blast/blast_hit.h \ SeqAn-1.3/seqan/blast/blast_stream_hit.h \ 
SeqAn-1.3/seqan/blast/blast_iterator.h \ SeqAn-1.3/seqan/blast/blast_parsing.h \ SeqAn-1.3/seqan/blast/blast_report.h \ SeqAn-1.3/seqan/blast/blast_generated_forwards.h \ SeqAn-1.3/seqan/blast/blast_stream_hsp_iterator.h \ SeqAn-1.3/seqan/blast/blast_hit_iterator.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_affine.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining_base.h \ SeqAn-1.3/seqan/seeds2/seeds_extension.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_unordered.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_base.h \ SeqAn-1.3/seqan/seeds2/seeds2_generated_forwards.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_diagonal.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_simple.h \ SeqAn-1.3/seqan/seeds2/seeds_base.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_banded_affine.h \ SeqAn-1.3/seqan/seeds2/seeds_combination.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_non_scored.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_banded_linear.h \ SeqAn-1.3/seqan/seeds2/align_chain_banded.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_linear.h \ SeqAn-1.3/seqan/seeds2/basic_iter_indirect.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining_gusfield.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_chained.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_scored.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_base.h \ SeqAn-1.3/seqan/map.h \ SeqAn-1.3/seqan/find/find_pattern_base.h \ SeqAn-1.3/seqan/find/find_multiple_shiftand.h \ SeqAn-1.3/seqan/find/find_generated_forwards.h \ SeqAn-1.3/seqan/find/find_horspool.h \ SeqAn-1.3/seqan/find/find_abndm.h \ SeqAn-1.3/seqan/find/find_score.h \ SeqAn-1.3/seqan/find/find_set_horspool.h \ SeqAn-1.3/seqan/find/find_wild_shiftand.h \ SeqAn-1.3/seqan/find/find_hamming_simple.h \ SeqAn-1.3/seqan/find/find_shiftand.h \ SeqAn-1.3/seqan/find/find_begin.h \ SeqAn-1.3/seqan/find/find_shiftor.h \ SeqAn-1.3/seqan/find/find_wumanber.h \ SeqAn-1.3/seqan/find/find_multi.h \ SeqAn-1.3/seqan/find/find_simple.h \ SeqAn-1.3/seqan/find/find_bndm.h \ SeqAn-1.3/seqan/find/find_pex.h \ SeqAn-1.3/seqan/find/find_bom.h \ SeqAn-1.3/seqan/find/find_ahocorasick.h \ SeqAn-1.3/seqan/find/find_multiple_bfam.h \ SeqAn-1.3/seqan/find/find_myers_ukkonen.h \ SeqAn-1.3/seqan/find/find_base.h \ SeqAn-1.3/seqan/index.h \ SeqAn-1.3/seqan/graph_types.h \ SeqAn-1.3/seqan/find_motif/profile.h \ SeqAn-1.3/seqan/find_motif/find_motif_pmsp.h \ SeqAn-1.3/seqan/find_motif/sequence_model_types.h \ SeqAn-1.3/seqan/find_motif/find_motif_base.h \ SeqAn-1.3/seqan/find_motif/pseudocount_mode_p.h \ SeqAn-1.3/seqan/find_motif/find_motif_epatternbranching.h \ SeqAn-1.3/seqan/find_motif/find_motif_projection.h \ SeqAn-1.3/seqan/find_motif/find_motif_generated_forwards.h \ SeqAn-1.3/seqan/find_motif/pseudocount_mode_c.h \ SeqAn-1.3/seqan/find_motif/pseudocount_base.h \ SeqAn-1.3/seqan/find_motif/em_algorithm.h \ SeqAn-1.3/seqan/find_motif/find_motif_pms1.h \ SeqAn-1.3/seqan/find_motif/frequency_distribution.h \ SeqAn-1.3/seqan.h bin_SCRIPTS = \ tophat2 \ tophat #-- scripts to be installed in $prefix/bin dist_bin_SCRIPTS = \ contig_to_chr_coords \ bed_to_juncs \ sra_to_solid \ tophat-fusion-post CLEANFILES = \ tophat2 \ tophat #SUFFIXES = .py #.py: # (echo '#!$(PYTHON)'; sed '/^#!/d' $<) > $@ #-- tophat library for linking convienence noinst_LIBRARIES = libtophat.a libgc.a noinst_HEADERS = \ reads.h \ codons.h \ common.h \ GBase.h \ gdna.h \ GFaSeqGet.h \ gff.h \ GHash.hh \ GVec.hh \ GList.hh \ GStr.h \ FastaTools.h \ GTFToFasta.h \ map2gtf.h \ bwt_map.h \ junctions.h \ assert_helpers.h \ insertions.h \ wiggles.h \ deletions.h \ fusions.h \ align_status.h 
\ alphabet.h \ timer.h \ tokenize.h \ fragments.h \ inserts.h \ segments.h \ qual.h \ bam_merge.h \ utils.h \ coverage.h libtophat_a_SOURCES = \ reads.cpp \ alphabet.c \ bwt_map.cpp \ common.cpp \ junctions.cpp \ insertions.cpp \ deletions.cpp \ fusions.cpp \ align_status.cpp \ fragments.cpp \ tokenize.cpp \ inserts.cpp \ qual.cpp \ bam_merge_impl.cpp \ utils.cpp \ coverage.cpp libgc_a_SOURCES = \ GBase.cpp \ codons.cpp \ gdna.cpp \ GStr.cpp \ GFaSeqGet.cpp \ gff.cpp #-- program sources prep_reads_SOURCES = prep_reads.cpp prep_reads_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) prep_reads_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) segment_juncs_SOURCES = segment_juncs.cpp segment_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) segment_juncs_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) long_spanning_reads_SOURCES = long_spanning_reads.cpp long_spanning_reads_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) long_spanning_reads_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) gtf_juncs_SOURCES = gtf_juncs.cpp gtf_juncs_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) gtf_juncs_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) juncs_db_SOURCES = juncs_db.cpp juncs_db_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) juncs_db_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) tophat_reports_SOURCES = tophat_reports.cpp tophat_reports_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) tophat_reports_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) fix_map_ordering_SOURCES = fix_map_ordering.cpp fix_map_ordering_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) fix_map_ordering_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) bam2fastx_SOURCES = bam2fastx.cpp bam2fastx_LDADD = $(top_builddir)/src/libgc.a $(BAM_LIB) bam2fastx_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) bam_merge_SOURCES = bam_merge.cpp bam_merge_LDADD = $(top_builddir)/src/libtophat.a $(top_builddir)/src/libgc.a $(BAM_LIB) bam_merge_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) sam_juncs_SOURCES = sam_juncs.cpp sam_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) sam_juncs_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) map2gtf_SOURCES = map2gtf.cpp map2gtf_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) map2gtf_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) gtf_to_fasta_SOURCES = GTFToFasta.cpp FastaTools.cpp gtf_to_fasta_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) gtf_to_fasta_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) all: all-am .SUFFIXES: .SUFFIXES: .c .cpp .o .obj $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ && exit 0; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \ cd $(top_srcdir) && \ $(AUTOMAKE) --foreign src/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' 
in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh clean-noinstLIBRARIES: -test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES) libgc.a: $(libgc_a_OBJECTS) $(libgc_a_DEPENDENCIES) -rm -f libgc.a $(libgc_a_AR) libgc.a $(libgc_a_OBJECTS) $(libgc_a_LIBADD) $(RANLIB) libgc.a libtophat.a: $(libtophat_a_OBJECTS) $(libtophat_a_DEPENDENCIES) -rm -f libtophat.a $(libtophat_a_AR) libtophat.a $(libtophat_a_OBJECTS) $(libtophat_a_LIBADD) $(RANLIB) libtophat.a install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)" @list='$(bin_PROGRAMS)'; for p in $$list; do \ p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ if test -f $$p \ ; then \ f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ echo " $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) '$$p' '$(DESTDIR)$(bindir)/$$f'"; \ $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) "$$p" "$(DESTDIR)$(bindir)/$$f" || exit 1; \ else :; fi; \ done uninstall-binPROGRAMS: @$(NORMAL_UNINSTALL) @list='$(bin_PROGRAMS)'; for p in $$list; do \ f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \ rm -f "$(DESTDIR)$(bindir)/$$f"; \ done clean-binPROGRAMS: -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) bam2fastx$(EXEEXT): $(bam2fastx_OBJECTS) $(bam2fastx_DEPENDENCIES) @rm -f bam2fastx$(EXEEXT) $(CXXLINK) $(bam2fastx_LDFLAGS) $(bam2fastx_OBJECTS) $(bam2fastx_LDADD) $(LIBS) bam_merge$(EXEEXT): $(bam_merge_OBJECTS) $(bam_merge_DEPENDENCIES) @rm -f bam_merge$(EXEEXT) $(CXXLINK) $(bam_merge_LDFLAGS) $(bam_merge_OBJECTS) $(bam_merge_LDADD) $(LIBS) fix_map_ordering$(EXEEXT): $(fix_map_ordering_OBJECTS) $(fix_map_ordering_DEPENDENCIES) @rm -f fix_map_ordering$(EXEEXT) $(CXXLINK) $(fix_map_ordering_LDFLAGS) $(fix_map_ordering_OBJECTS) $(fix_map_ordering_LDADD) $(LIBS) gtf_juncs$(EXEEXT): $(gtf_juncs_OBJECTS) $(gtf_juncs_DEPENDENCIES) @rm -f gtf_juncs$(EXEEXT) $(CXXLINK) $(gtf_juncs_LDFLAGS) $(gtf_juncs_OBJECTS) $(gtf_juncs_LDADD) $(LIBS) gtf_to_fasta$(EXEEXT): $(gtf_to_fasta_OBJECTS) $(gtf_to_fasta_DEPENDENCIES) @rm -f gtf_to_fasta$(EXEEXT) $(CXXLINK) $(gtf_to_fasta_LDFLAGS) $(gtf_to_fasta_OBJECTS) $(gtf_to_fasta_LDADD) $(LIBS) juncs_db$(EXEEXT): $(juncs_db_OBJECTS) $(juncs_db_DEPENDENCIES) @rm -f juncs_db$(EXEEXT) $(CXXLINK) $(juncs_db_LDFLAGS) $(juncs_db_OBJECTS) $(juncs_db_LDADD) $(LIBS) long_spanning_reads$(EXEEXT): $(long_spanning_reads_OBJECTS) $(long_spanning_reads_DEPENDENCIES) @rm -f long_spanning_reads$(EXEEXT) $(CXXLINK) $(long_spanning_reads_LDFLAGS) $(long_spanning_reads_OBJECTS) $(long_spanning_reads_LDADD) $(LIBS) map2gtf$(EXEEXT): $(map2gtf_OBJECTS) $(map2gtf_DEPENDENCIES) @rm -f map2gtf$(EXEEXT) $(CXXLINK) $(map2gtf_LDFLAGS) $(map2gtf_OBJECTS) $(map2gtf_LDADD) $(LIBS) prep_reads$(EXEEXT): $(prep_reads_OBJECTS) $(prep_reads_DEPENDENCIES) @rm -f prep_reads$(EXEEXT) $(CXXLINK) $(prep_reads_LDFLAGS) $(prep_reads_OBJECTS) $(prep_reads_LDADD) $(LIBS) sam_juncs$(EXEEXT): 
$(sam_juncs_OBJECTS) $(sam_juncs_DEPENDENCIES) @rm -f sam_juncs$(EXEEXT) $(CXXLINK) $(sam_juncs_LDFLAGS) $(sam_juncs_OBJECTS) $(sam_juncs_LDADD) $(LIBS) segment_juncs$(EXEEXT): $(segment_juncs_OBJECTS) $(segment_juncs_DEPENDENCIES) @rm -f segment_juncs$(EXEEXT) $(CXXLINK) $(segment_juncs_LDFLAGS) $(segment_juncs_OBJECTS) $(segment_juncs_LDADD) $(LIBS) tophat_reports$(EXEEXT): $(tophat_reports_OBJECTS) $(tophat_reports_DEPENDENCIES) @rm -f tophat_reports$(EXEEXT) $(CXXLINK) $(tophat_reports_LDFLAGS) $(tophat_reports_OBJECTS) $(tophat_reports_LDADD) $(LIBS) install-binSCRIPTS: $(bin_SCRIPTS) @$(NORMAL_INSTALL) test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)" @list='$(bin_SCRIPTS)'; for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ if test -f $$d$$p; then \ f=`echo "$$p" | sed 's|^.*/||;$(transform)'`; \ echo " $(binSCRIPT_INSTALL) '$$d$$p' '$(DESTDIR)$(bindir)/$$f'"; \ $(binSCRIPT_INSTALL) "$$d$$p" "$(DESTDIR)$(bindir)/$$f"; \ else :; fi; \ done uninstall-binSCRIPTS: @$(NORMAL_UNINSTALL) @list='$(bin_SCRIPTS)'; for p in $$list; do \ f=`echo "$$p" | sed 's|^.*/||;$(transform)'`; \ echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \ rm -f "$(DESTDIR)$(bindir)/$$f"; \ done install-dist_binSCRIPTS: $(dist_bin_SCRIPTS) @$(NORMAL_INSTALL) test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)" @list='$(dist_bin_SCRIPTS)'; for p in $$list; do \ if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ if test -f $$d$$p; then \ f=`echo "$$p" | sed 's|^.*/||;$(transform)'`; \ echo " $(dist_binSCRIPT_INSTALL) '$$d$$p' '$(DESTDIR)$(bindir)/$$f'"; \ $(dist_binSCRIPT_INSTALL) "$$d$$p" "$(DESTDIR)$(bindir)/$$f"; \ else :; fi; \ done uninstall-dist_binSCRIPTS: @$(NORMAL_UNINSTALL) @list='$(dist_bin_SCRIPTS)'; for p in $$list; do \ f=`echo "$$p" | sed 's|^.*/||;$(transform)'`; \ echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \ rm -f "$(DESTDIR)$(bindir)/$$f"; \ done mostlyclean-compile: -rm -f *.$(OBJEXT) distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/FastaTools.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GBase.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GFaSeqGet.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GStr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GTFToFasta.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/align_status.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alphabet.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bam2fastx.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bam_merge.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bam_merge_impl.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bwt_map.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codons.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/common.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/coverage.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/deletions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fix_map_ordering.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fragments.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fusions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gdna.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gff.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gtf_juncs.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/insertions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/inserts.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/juncs_db.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/junctions.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/long_spanning_reads.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/map2gtf.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/prep_reads.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/qual.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reads.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sam_juncs.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/segment_juncs.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenize.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tophat_reports.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utils.Po@am__quote@ .c.o: @am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ @am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi @AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(COMPILE) -c $< .c.obj: @am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ @am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi @AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` .cpp.o: @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< .cpp.obj: @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` uninstall-info-am: ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ mkid -fID $$unique tags: TAGS TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) tags=; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ 
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$tags $$unique; \ fi ctags: CTAGS CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) tags=; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) ' { files[$$0] = 1; } \ END { for (i in files) print i; }'`; \ test -z "$(CTAGS_ARGS)$$tags$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$tags $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && cd $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) $$here distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) $(mkdir_p) $(distdir)/SeqAn-1.3 $(distdir)/SeqAn-1.3/seqan $(distdir)/SeqAn-1.3/seqan/align $(distdir)/SeqAn-1.3/seqan/basic $(distdir)/SeqAn-1.3/seqan/blast $(distdir)/SeqAn-1.3/seqan/chaining $(distdir)/SeqAn-1.3/seqan/consensus $(distdir)/SeqAn-1.3/seqan/file $(distdir)/SeqAn-1.3/seqan/find $(distdir)/SeqAn-1.3/seqan/find2 $(distdir)/SeqAn-1.3/seqan/find_motif $(distdir)/SeqAn-1.3/seqan/graph_algorithms $(distdir)/SeqAn-1.3/seqan/graph_align $(distdir)/SeqAn-1.3/seqan/graph_msa $(distdir)/SeqAn-1.3/seqan/graph_types $(distdir)/SeqAn-1.3/seqan/index $(distdir)/SeqAn-1.3/seqan/map $(distdir)/SeqAn-1.3/seqan/misc $(distdir)/SeqAn-1.3/seqan/modifier $(distdir)/SeqAn-1.3/seqan/parallel $(distdir)/SeqAn-1.3/seqan/pipe $(distdir)/SeqAn-1.3/seqan/platform $(distdir)/SeqAn-1.3/seqan/random $(distdir)/SeqAn-1.3/seqan/refinement $(distdir)/SeqAn-1.3/seqan/score $(distdir)/SeqAn-1.3/seqan/seeds $(distdir)/SeqAn-1.3/seqan/seeds2 $(distdir)/SeqAn-1.3/seqan/sequence $(distdir)/SeqAn-1.3/seqan/sequence_journaled $(distdir)/SeqAn-1.3/seqan/statistics $(distdir)/SeqAn-1.3/seqan/store $(distdir)/SeqAn-1.3/seqan/system @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ list='$(DISTFILES)'; for file in $$list; do \ case $$file in \ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ esac; \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ if test "$$dir" != "$$file" && test "$$dir" != "."; then \ dir="/$$dir"; \ $(mkdir_p) "$(distdir)$$dir"; \ else \ dir=''; \ fi; \ if test -d $$d/$$file; then \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ fi; \ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ else \ test -f $(distdir)/$$file \ || cp -p $$d/$$file $(distdir)/$$file \ || exit 1; \ fi; \ done check-am: all-am check: check-am all-am: Makefile $(LIBRARIES) $(PROGRAMS) $(SCRIPTS) $(HEADERS) installdirs: for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)"; do \ test -z "$$dir" || $(mkdir_p) "$$dir"; \ done install: install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES) distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f 
$(CONFIG_CLEAN_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-noinstLIBRARIES \ mostlyclean-am distclean: distclean-am -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags dvi: dvi-am dvi-am: html: html-am info: info-am info-am: install-data-am: install-exec-am: install-binPROGRAMS install-binSCRIPTS \ install-dist_binSCRIPTS install-info: install-info-am install-man: installcheck-am: maintainer-clean: maintainer-clean-am -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-compile mostlyclean-generic pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-binPROGRAMS uninstall-binSCRIPTS \ uninstall-dist_binSCRIPTS uninstall-info-am .PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ clean-generic clean-noinstLIBRARIES ctags distclean \ distclean-compile distclean-generic distclean-tags distdir dvi \ dvi-am html html-am info info-am install install-am \ install-binPROGRAMS install-binSCRIPTS install-data \ install-data-am install-dist_binSCRIPTS install-exec \ install-exec-am install-info install-info-am install-man \ install-strip installcheck installcheck-am installdirs \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \ tags uninstall uninstall-am uninstall-binPROGRAMS \ uninstall-binSCRIPTS uninstall-dist_binSCRIPTS \ uninstall-info-am tophat2: tophat2.in sed -e 's|__PREFIX__|$(prefix)|' tophat2.in > tophat2 tophat: tophat.py sed -e 's|__VERSION__|$(VERSION)|' tophat.py > tophat # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
.NOEXPORT:
tophat-2.0.9/src/deletions.h0000755000175000017500000000243212122334361014500 0ustar toortoor
#ifndef DELETIONS_H
#define DELETIONS_H
/*
 * deletions.h
 * TopHat
 *
 * Created by Ryan Kelley on 11/09/2010
 *
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "bwt_map.h"
#include "junctions.h"

using namespace std;

typedef Junction Deletion;

struct DeletionStats {
  DeletionStats() : left_extent(0), right_extent(0), supporting_hits(0) {}
  DeletionStats& merge_with(const DeletionStats& other) {
    if (this == &other) return *this;
    left_extent = max(left_extent, other.left_extent);
    right_extent = max(right_extent, other.right_extent);
    supporting_hits += other.supporting_hits;
    return *this;
  }
  int left_extent;
  int right_extent;
  int supporting_hits;
};

typedef std::map DeletionSet;

void deletions_from_alignment(const BowtieHit& spliced_alignment, DeletionSet& junctions);
void deletions_from_spliced_hit(const BowtieHit& bh, vector >& deletions);
void print_deletions(FILE* deletions_out, const DeletionSet& deletions, RefSequenceTable& ref_sequences);
void merge_with(DeletionSet& deletions, const DeletionSet& other);

#endif
tophat-2.0.9/src/GBase.cpp0000644000175000017500000004446112157116165014043 0ustar toortoor
#include "GBase.h"
#include
#include
#include
#ifndef S_ISDIR
#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
#endif
#ifndef S_ISREG
#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
#endif
/*
#ifdef _DEFINE_WIN32_FSEEKO
int fseeko(FILE *stream, off_t offset, int whence) {
}
#endif
#ifdef _DEFINE_WIN32_FTELLO
off_t ftello(FILE *stream) {
}
#endif
*/
/*
int saprintf(char **retp, const char *fmt, ...) {
  va_list argp;
  int len;
  char *buf;
  va_start(argp, fmt);
  len = vsnprintf(NULL, 0, fmt, argp);
  va_end(argp);
  GMALLOC(buf, (len + 1));
  if(buf == NULL) {
    *retp = NULL;
    return -1;
  }
  va_start(argp, fmt);
  vsnprintf(buf, len+1, fmt, argp);
  va_end(argp);
  *retp = buf;
  return len;
}
*/
//************************* Debug helpers **************************
// Assert failed routine
void GAssert(const char* expression, const char* filename, unsigned int lineno){
  char msg[4096];
  sprintf(msg,"%s(%d): ASSERT(%s) failed.\n",filename,lineno,expression);
  fprintf(stderr,"%s",msg);
  //abort();
}
// Error routine (prints error message and exits!)
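// Editorial illustration, not part of the original source: GError below takes a
// printf-style format string, writes the message to stderr (and to a message box
// on Windows) and then exits, while GMessage only prints. A hypothetical caller
// might use the two routines roughly as follows; the file name and handle here
// are made up for the example:
#if 0
void example_caller(const char* fname) {
  FILE* fp = fopen(fname, "rb");
  if (fp == NULL)
    GError("Error: cannot open %s for reading\n", fname); // reports and exits
  GMessage("Reading %s ...\n", fname);                    // reports only, keeps running
  fclose(fp);
}
#endif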
void GError(const char* format,...){ #ifdef __WIN32__ char msg[4096]; va_list arguments; va_start(arguments,format); vsprintf(msg,format,arguments); va_end(arguments); OutputDebugString(msg); fprintf(stderr,"%s",msg); // if a console is available MessageBox(NULL,msg,NULL,MB_OK|MB_ICONEXCLAMATION|MB_APPLMODAL); #else va_list arguments; va_start(arguments,format); vfprintf(stderr,format,arguments); va_end(arguments); #ifdef DEBUG // modify here if you want a core dump abort(); #endif #endif exit(1); } // Warning routine (just print message without exiting) void GMessage(const char* format,...){ char msg[4096]; va_list arguments; va_start(arguments,format); vsprintf(msg,format,arguments); va_end(arguments); #ifdef __WIN32__ OutputDebugString(msg); #endif fprintf(stderr,"%s",msg);fflush(stderr); } /*************** Memory management routines *****************/ // Allocate memory bool GMalloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size!=0) *ptr=malloc(size); return *ptr!=NULL; } // Allocate cleaned memory (0 filled) bool GCalloc(pointer* ptr,unsigned long size){ GASSERT(ptr); *ptr=calloc(size,1); return *ptr!=NULL; } // Resize memory bool GRealloc(pointer* ptr,unsigned long size){ //GASSERT(ptr); if (size==0) { GFree(ptr); return true; } if (*ptr==NULL) {//simple malloc void *p=malloc(size); if (p != NULL) { *ptr=p; return true; } else return false; }//malloc else {//realloc void *p=realloc(*ptr,size); if (p) { *ptr=p; return true; } return false; } } // Free memory, resets ptr to NULL afterward void GFree(pointer* ptr){ GASSERT(ptr); if (*ptr) free(*ptr); *ptr=NULL; } char* Gstrdup(const char* str) { if (str==NULL) return NULL; char *copy=NULL; GMALLOC(copy, strlen(str)+1); strcpy(copy,str); return copy; } char* newEmptyStr() { char* zs=NULL; GMALLOC(zs,1); zs[0]=0; return zs; } char* Gstrdup(const char* sfrom, const char* sto) { if (sfrom==NULL || sto==NULL) return NULL; char *copy=NULL; if (sfrom[0]==0) return newEmptyStr(); GMALLOC(copy, sto-sfrom+2); strncpy(copy, sfrom, sto-sfrom+1); copy[sto-sfrom+1]=0; return copy; } int Gstrcmp(const char* a, const char* b, int n) { if (a==NULL || b==NULL) { return a==NULL ? -1 : 1; } else { if (n<0) return strcmp(a,b); else return strncmp(a,b,n); } } int Gstricmp(const char* a, const char* b, int n) { if (a==NULL || b==NULL) return a==NULL ? -1 : 1; register int ua, ub; if (n<0) { while ((*a!=0) && (*b!=0)) { ua=tolower((unsigned char)*a); ub=tolower((unsigned char)*b); a++;b++; if (ua!=ub) return ua < ub ? -1 : 1; } return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ; } else { while (n && (*a!=0) && (*b!=0)) { ua=tolower((unsigned char)*a); ub=tolower((unsigned char)*b); a++;b++;n--; if (ua!=ub) return ua < ub ? -1 : 1; } //return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ; if (n==0) return 0; else { return (*a == 0) ? ( (*b == 0) ? 0 : -1 ) : 1 ; } } } int strsplit(char* str, char** fields, int maxfields, const char* delim) { //splits by placing 0 where delim chars are found, setting fields[] to the beginning //of each field (stopping after maxfields); returns number of fields parsed int tidx=0; bool afterdelim=true; int i=0; while (str[i]!=0 && tidx=str) { if (*p==ch) return p; p--; } return NULL; } /* DOS/UNIX safer fgets : reads a text line from a (binary) file and update the file position accordingly and the buffer capacity accordingly. 
The given buf is resized to read the entire line in memory -- even when it's abnormally long */ char* fgetline(char* & buf, int& buf_cap, FILE *stream, off_t* f_pos, int* linelen) { //reads a char at a time until \n and/or \r are encountered int i=0; int c=0; off_t fpos=(f_pos!=NULL) ? *f_pos : 0; while ((c=getc(stream))!=EOF) { if (i>=buf_cap-1) { buf_cap+=1024; GREALLOC(buf, buf_cap); } if (c=='\n' || c=='\r') { if (c=='\r') { if ((c=getc(stream))!='\n') ungetc(c,stream); else fpos++; } fpos++; break; } fpos++; buf[i]=(char)c; i++; } //while i=allocated-1) { allocated+=1024; GREALLOC(buf, allocated); } if (c=='\n' || c=='\r') { buf[len]='\0'; if (c=='\r') { //DOS file -- special case if ((c=getc(stream))!='\n') ungetc(c,stream); else f_pos++; } f_pos++; lcount++; return buf; } f_pos++; buf[len]=(char)c; len++; } //while i=str) { for (i=0; i=0 && s[i]==suffix[j]) { i--; j--; } return (j==-1); } char* reverseChars(char* str, int slen) { if (slen==0) slen=strlen(str); int l=0; int r=slen-1; char c; while (l=lend) { for (i=0;i>24; h&=0x0fffffff; } GASSERT(h<=0x0fffffff); return h; } // removes the last part (file or directory name) of a full path // this is a destructive operation for the given string!!! // the trailing '/' is guaranteed to be there void delFileName(char* filepath) { char *p, *sep; if (filepath==NULL) return; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; *sep='\0'; // truncate filepath } // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath) { const char *p, *sep; if (filepath==NULL) return NULL; for (p=filepath, sep=filepath;*p!='\0';p++) if (*p=='/' || *p=='\\') sep=p+1; return sep; } // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath) { const char *p, *dp, *sep; if (filepath==NULL) return NULL; for (p=filepath, dp=filepath, sep=filepath;*p!='\0';p++) { if (*p=='.') dp=p+1; else if (*p=='/' || *p=='\\') sep=p+1; } return (dp>sep) ? dp : NULL ; } int fileExists(const char* fname) { struct stat stFileInfo; int r=0; // Attempt to get the file attributes int fs = stat(fname,&stFileInfo); if (fs == 0) { r=3; // We were able to get the file attributes // so the file obviously exists. if (S_ISREG (stFileInfo.st_mode)) { r=2; } if (S_ISDIR (stFileInfo.st_mode)) { r=1; } } return r; } /*bool fileExists(const char* filepath) { if (filepath==NULL) return false; FILE* ft=fopen(filepath, "rb"); if (ft==NULL) return false; fclose(ft); return true; } */ int64 fileSize(const char* fpath) { struct stat results; if (stat(fpath, &results) == 0) // The size of the file in bytes is in return (int64)results.st_size; else // An error occurred //GMessage("Error at stat(%s)!\n", fpath); return 0; } bool parseNumber(char* &p, double& v) { //skip any spaces.. while (*p==' ' || *p=='\t') p++; char* start=p; /*if (*p=='-') p++; else if (*p=='+') { p++;start++; }*/ /* while ((*p>='1' && *p<='9') || *p=='0' || *p=='.' 
|| *p=='-' || tolower(*p)=='e') p++; */ int numlen=strspn(start, "0123456789eE.-+"); p=start+numlen; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; v=strtod(start,&endptr); *p=saved; if (endptr!=p) return false; return true; } bool parseDouble(char* &p, double& v) { return parseNumber(p,v); } bool parseInt(char* &p, int& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') p++; else if (*p=='+') { p++;start++; } while ((*p>='1' && *p<='9') || *p=='0') p++; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; long l=strtol(start,&endptr,10); i=(int)l; *p=saved; if (endptr!=p || i!=l) return false; return true; } bool parseUInt(char* &p, uint& i) { while (*p==' ' || *p=='\t') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while ((*p>='1' && *p<='9') || *p=='0') p++; //now p is on a non-digit; if (*start=='-' && p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,10); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } bool parseHex(char* &p, uint& i) { //skip initial spaces/prefix while (*p==' ' || *p=='\t' || *p=='0' || *p=='x') p++; char* start=p; if (*p=='-') return false; else if (*p=='+') { p++;start++; } while (isxdigit(*p)) p++; //now p is on a non-hexdigit; if (p==start+1) return false; char saved=*p; *p='\0'; char* endptr=p; unsigned long l=strtoul(start,&endptr,16); i=(uint) l; *p=saved; if (endptr!=p || i!=l) return false; return true; } //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen, int seqlen) { fflush(fw); // write header line only if given! if (seqid!=NULL) { if (descr==NULL || descr[0]==0) fprintf(fw,">%s\n",seqid); else fprintf(fw,">%s %s\n",seqid, descr); } fflush(fw); if (seq==NULL || *seq==0) return; //nothing to print if (linelen==0) { //unlimited line length: write the whole sequence on a line if (seqlen>0) fwrite((const void*)seq, 1, seqlen,fw); else fprintf(fw,"%s",seq); fprintf(fw,"\n"); fflush(fw); return; } int ilen=0; if (seqlen>0) { //seq length given, so we know when to stop for (int i=0; i < seqlen; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } fputc('\n', fw); } else { //seq length not given, stop when 0 encountered for (int i=0; seq[i]!=0; i++, ilen++) { if (ilen == linelen) { fputc('\n', fw); ilen = 0; } fputc(seq[i], fw); } //for fputc('\n', fw); } fflush(fw); } char* commaprint(uint64 n) { int comma = '\0'; char retbuf[48]; char *p = &retbuf[sizeof(retbuf)-1]; int i = 0; if(comma == '\0') { /* struct lconv *lcp = localeconv(); if(lcp != NULL) { if(lcp->thousands_sep != NULL && *lcp->thousands_sep != '\0') comma = *lcp->thousands_sep; else */ comma = ','; // } } *p = '\0'; do { if(i%3 == 0 && i != 0) *--p = comma; *--p = '0' + n % 10; n /= 10; i++; } while(n != 0); return p; } tophat-2.0.9/src/tophat_reports.cpp0000644000175000017500000026537312163075201016136 0ustar toortoor/* * tophat_reports.cpp * TopHat * * Created by Cole Trapnell on 11/20/08. * Copyright 2008 Cole Trapnell. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "utils.h" #include "bwt_map.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" #include "align_status.h" #include "fragments.h" #include "wiggles.h" #include "tokenize.h" #include "reads.h" #include "coverage.h" #include "inserts.h" #include "bam_merge.h" using namespace std; using namespace seqan; using std::set; static const JunctionSet empty_junctions; static const InsertionSet empty_insertions; static const DeletionSet empty_deletions; static const FusionSet empty_fusions; static const Coverage empty_coverage; // daehwan - this is redundancy, which should be removed. void get_seqs(istream& ref_stream, RefSequenceTable& rt, bool keep_seqs = true) { while(ref_stream.good() && !ref_stream.eof()) { RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence(); string name; readMeta(ref_stream, name, Fasta()); string::size_type space_pos = name.find_first_of(" \t\r"); if (space_pos != string::npos) { name.resize(space_pos); } fprintf(stderr, "\tLoading %s...", name.c_str()); seqan::read(ref_stream, *ref_str, Fasta()); fprintf(stderr, "done\n"); rt.get_id(name, keep_seqs ? ref_str : NULL, 0); } } struct cmp_read_alignment { bool operator() (const BowtieHit& l, const BowtieHit& r) const { return l.alignment_score() > r.alignment_score(); } }; struct cmp_read_equal { bool operator() (const BowtieHit& l, const BowtieHit& r) const { if (l.insert_id() != r.insert_id() || l.ref_id() != r.ref_id() || l.ref_id2() != r.ref_id2() || l.left() != r.left() || l.right() != r.right() || l.antisense_align() != r.antisense_align() || l.mismatches() != r.mismatches() || l.edit_dist() != r.edit_dist() || l.cigar().size() != r.cigar().size()) return false; return true; } }; char read_best_alignments(const HitsForRead& hits_for_read, HitsForRead& best_hits, const JunctionSet& gtf_junctions, const JunctionSet& junctions = empty_junctions, const InsertionSet& insertions = empty_insertions, const DeletionSet& deletions = empty_deletions, const FusionSet& fusions = empty_fusions, const Coverage& coverage = empty_coverage, bool final_report = false, boost::mt19937* rng = NULL) { char ret_code=0; const vector& hits = hits_for_read.hits; /* if (hits.size() >= max_multihits * 5) { ret_code |= 16; return ret_code; //too many hits }*/ unsigned int nhits=0; for (size_t i = 0; i < hits.size(); ++i) { if (hits[i].mismatches() > read_mismatches || hits[i].gap_length() > read_gap_length || hits[i].edit_dist() > read_edit_dist) continue; ret_code |= 1; //read has a valid mapping nhits++; if (nhits>1) ret_code |= 4; //read has multiple valid mappings if (nhits>max_multihits) ret_code |= 16; //read has too many valid mappings BowtieHit hit = hits[i]; AlignStatus align_status(hit, gtf_junctions, junctions, insertions, deletions, fusions, coverage); hit.alignment_score(align_status._alignment_score); if (report_secondary_alignments || !final_report) { best_hits.hits.push_back(hit); } else { // Is the new status better than the current best one? if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0])) { best_hits.hits.clear(); best_hits.hits.push_back(hit); } else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good? 
{ best_hits.hits.push_back(hit); } } } // due to indel alignments, there may be alignments with the same location std::sort(best_hits.hits.begin(), best_hits.hits.end()); vector::iterator new_end = std::unique(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_equal()); best_hits.hits.erase(new_end, best_hits.hits.end()); if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0) { sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment()); } if (final_report) { if (suppress_hits && best_hits.hits.size() > max_multihits) best_hits.hits.clear(); if (best_hits.hits.size() > max_multihits) { // there may be several alignments with the same alignment scores, // all of which we can not include because of this limit. // let's pick up some of them randomly up to this many max multihits. vector tie_indexes; int tie_alignment_score = best_hits.hits[max_multihits - 1].alignment_score(); int count_better_alignments = 0; for (size_t i = 0; i < best_hits.hits.size(); ++i) { int temp_alignment_score = best_hits.hits[i].alignment_score(); if (temp_alignment_score == tie_alignment_score) tie_indexes.push_back(i); else if (temp_alignment_score < tie_alignment_score) break; else ++count_better_alignments; } while (count_better_alignments + tie_indexes.size() > max_multihits) { int random_index = (*rng)() % tie_indexes.size(); tie_indexes.erase(tie_indexes.begin() + random_index); } for (size_t i = 0; i < tie_indexes.size(); ++i) { if (count_better_alignments + i != tie_indexes[i]) best_hits.hits[count_better_alignments + i] = best_hits.hits[tie_indexes[i]]; } best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end()); } } return ret_code; } bool is_fusion_insert_alignment(const BowtieHit& lh, const BowtieHit& rh) { bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; if (left_fusion || right_fusion) return true; if (lh.ref_id() != rh.ref_id()) return true; if (lh.ref_id() == rh.ref_id()) { if (lh.antisense_align() == rh.antisense_align()) return true; else { int inner_dist = 0; if (lh.antisense_align()) // used rh.left() instead of rh.right() for the cases, // where reads overlap with each other or reads span introns inner_dist = lh.left() - rh.left(); else inner_dist = rh.left() - lh.left(); if (inner_dist < 0 || inner_dist > (int)fusion_min_dist) return true; } } return false; } bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, const JunctionSet& junctions, InsertAlignmentGrade& grade) { bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING; bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING; if (left_fusion && right_fusion) return false; bool fusion = is_fusion_insert_alignment(lh, rh); grade = InsertAlignmentGrade(lh, rh, junctions, fusion); return true; } struct cmp_pair_alignment { cmp_pair_alignment(const JunctionSet& junctions) : _junctions(&junctions) {} bool operator() (const pair& l, const pair& r) const { InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, *_junctions, gl); InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, *_junctions, gr); bool better = gr < gl; bool worse = gl < gr; if (better && !worse) return true; else return false; } const JunctionSet* _junctions; }; struct cmp_pair_equal { bool operator() (const pair& l, const pair& r) const { if (!cmp_read_equal()(l.first, r.first) || !cmp_read_equal()(l.second, r.second)) return false; return true; } }; struct cmp_pair_less { bool operator() 
(const pair& l, const pair& r) const { if (l.first < r.first) return true; else if (r.first < l.first) return false; if (l.second < r.second) return true; else if (r.second < l.second) return false; return false; } }; struct SAlignStats { int64_t num_aligned_left; //total left reads aligned int64_t num_unmapped_left; //total left reads unmapped, or mapped in too many places (!) int64_t num_aligned_left_multi; //total left reads mapped in more than 1 place int64_t num_aligned_left_xmulti; //total left reads mapped in too many places (> max_multihits) int64_t num_aligned_right; //total right reads aligned int64_t num_unmapped_right; //total right reads unmapped, or mapped in too many places (!) int64_t num_aligned_right_multi; //total right reads in more than 1 place int64_t num_aligned_right_xmulti; //total right reads mapped in too many places (> max_multihits) int64_t num_aligned_pairs; //total pairs aligned int64_t num_aligned_pairs_multi; //total pairs aligned in more than 1 place int64_t num_aligned_pairs_disc; //total pairs aligned discordantly only SAlignStats():num_aligned_left(0), num_unmapped_left(0), num_aligned_left_multi(0), num_aligned_left_xmulti(0), num_aligned_right(0), num_unmapped_right(0), num_aligned_right_multi(0), num_aligned_right_xmulti(0), num_aligned_pairs(0), num_aligned_pairs_multi(0), num_aligned_pairs_disc(0) { } void add(SAlignStats& a) { num_aligned_left+=a.num_aligned_left; num_unmapped_left+=a.num_unmapped_left; num_aligned_left_multi+=a.num_aligned_left_multi; num_aligned_left_xmulti+=a.num_aligned_left_xmulti; num_aligned_right+=a.num_aligned_right; num_unmapped_right+=a.num_unmapped_right; num_aligned_right_multi+=a.num_aligned_right_multi; num_aligned_right_xmulti+=a.num_aligned_right_xmulti; num_aligned_pairs+=a.num_aligned_pairs; num_aligned_pairs_multi+=a.num_aligned_pairs_multi; num_aligned_pairs_disc+=a.num_aligned_pairs_disc; } }; char pair_best_alignments(const HitsForRead& left_hits, const HitsForRead& right_hits, InsertAlignmentGrade& best_grade, vector >& best_hits, const JunctionSet& gtf_junctions, const JunctionSet& junctions = empty_junctions, const InsertionSet& insertions = empty_insertions, const DeletionSet& deletions = empty_deletions, const FusionSet& fusions = empty_fusions, const Coverage& coverage = empty_coverage, bool final_report = false, boost::mt19937* rng = NULL ) { const vector& left = left_hits.hits; const vector& right = right_hits.hits; char ret_code=0; /* if (left.size() >= max_multihits * 5) { ret_code |= 1; } if (right.size() >= max_multihits * 5) { ret_code |= 2; } if (ret_code) return ret_code; */ unsigned int l_nhits=0, r_nhits=0; vector rhits; for (size_t j = 0; j < right.size(); ++j) { if (right[j].mismatches() > read_mismatches || right[j].gap_length() > read_gap_length || right[j].edit_dist() > read_edit_dist) continue; r_nhits++; ret_code|=2; //right read has acceptable mappings if (r_nhits>1) ret_code |= 8; //right read has multiple valid mappings if (r_nhits>max_multihits) { ret_code |= 32; //left read has too many mappings //if (r_nhits > (max_multihits<<2)) break; } BowtieHit rh = right[j]; AlignStatus align_status(rh, gtf_junctions, junctions, insertions, deletions, fusions, coverage); rh.alignment_score(align_status._alignment_score); rhits.push_back(rh); } for (size_t i = 0; i < left.size(); ++i) { if (left[i].mismatches() > read_mismatches || left[i].gap_length() > read_gap_length || left[i].edit_dist() > read_edit_dist) continue; l_nhits++; ret_code|=1; //left read has acceptable mappings if 
(l_nhits>1) ret_code |= 4; //left read has multiple valid mappings if (l_nhits>max_multihits) { ret_code |= 16; //left read has too many mappings //if (l_nhits > (max_multihits<<2)) break; } BowtieHit lh = left[i]; AlignStatus align_status(lh, gtf_junctions, junctions, insertions, deletions, fusions, coverage); lh.alignment_score(align_status._alignment_score); for (size_t j = 0; j < rhits.size(); ++j) { BowtieHit rh = rhits[j]; InsertAlignmentGrade g; bool allowed; allowed = set_insert_alignment_grade(lh, rh, final_report ? junctions : gtf_junctions, g); // daehwan - for debugging purposes #if 0 if (lh.insert_id() == 10790262) { fprintf(stderr, "lh %d:%d %s score: %d (from %d) NM: %d\n", lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(), lh.alignment_score(), left[i].alignment_score(), lh.edit_dist()); fprintf(stderr, "rh %d:%d %s score: %d (from %d) NM: %d\n", rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str(), rh.alignment_score(), rhits[j].alignment_score(), rh.edit_dist()); fprintf(stderr, "combined score: %d is_fusion(%d)\n", g.align_score(), g.is_fusion()); } #endif if (!allowed) continue; bool new_best_grade=false; if (best_grade < g) { best_grade = g; new_best_grade=true; } if (g.fusion && !fusion_search && !report_discordant_pair_alignments) continue; if (report_secondary_alignments || !final_report) { best_hits.push_back(make_pair(lh, rh)); } else { // Is the new status better than the current best one? // if (best_grade < g) if (new_best_grade) { best_hits.clear(); best_hits.push_back(make_pair(lh, rh)); } else if (!(g < best_grade)) { best_hits.push_back(make_pair(lh, rh)); } } }//for j in right mate hits } //for i in left mate hits std::sort(best_hits.begin(), best_hits.end(), cmp_pair_less()); // daehwan - for debugging purposes #if 0 if (best_hits.size() > 0 && best_hits[0].first.insert_id() == 10790262) { for (size_t i = 0; i < best_hits.size(); ++i) { const BowtieHit& lh = best_hits[i].first; const BowtieHit& rh = best_hits[i].second; fprintf(stderr, "%d %d:%d %s %d:%d %s\n", i, lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(), rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str()); } fprintf(stderr, "\n\n\n"); } #endif vector >::iterator new_end = std::unique(best_hits.begin(), best_hits.end(), cmp_pair_equal()); best_hits.erase(new_end, best_hits.end()); // daehwan - for debugging purposes #if 0 if (best_hits.size() > 0 && best_hits[0].first.insert_id() == 10790262) { for (size_t i = 0; i < best_hits.size(); ++i) { const BowtieHit& lh = best_hits[i].first; const BowtieHit& rh = best_hits[i].second; fprintf(stderr, "%d %d:%d %s %d:%d %s\n", i, lh.ref_id(), lh.left(), print_cigar(lh.cigar()).c_str(), rh.ref_id(), rh.left(), print_cigar(rh.cigar()).c_str()); } fprintf(stderr, "\n\n\n"); } #endif if ((report_secondary_alignments || !final_report) && best_hits.size() > 0) { cmp_pair_alignment cmp(final_report ? junctions : gtf_junctions); sort(best_hits.begin(), best_hits.end(), cmp); set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, final_report ? 
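// The char returned by pair_best_alignments() is a small bit mask describing how each mate
// mapped; ReportWorker later turns it into the align_summary counters, and
// read_best_alignments() uses the same bits 1/4/16 for the single mate it handles. Note
// that bit 32 is set in the right-mate loop above (its inline comment says "left" but the
// downstream counter it feeds is num_aligned_right_xmulti). Hedged decoding sketch below;
// PairMapFlags and decode_pair_map_flags are illustrative names, not TopHat API.
#if 0
struct PairMapFlags {
  bool left_mapped,  right_mapped;     // bits 1 and 2: at least one acceptable hit
  bool left_multi,   right_multi;      // bits 4 and 8: more than one acceptable hit
  bool left_xmulti,  right_xmulti;     // bits 16 and 32: more than max_multihits hits
};
static PairMapFlags decode_pair_map_flags(char f)
{
  PairMapFlags d;
  d.left_mapped  = (f & 1)  != 0;  d.right_mapped  = (f & 2)  != 0;
  d.left_multi   = (f & 4)  != 0;  d.right_multi   = (f & 8)  != 0;
  d.left_xmulti  = (f & 16) != 0;  d.right_xmulti  = (f & 32) != 0;
  return d;                        // (f & 3) == 3 means both mates had acceptable hits
}
#endif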
junctions : gtf_junctions, best_grade); } if (final_report) { if (suppress_hits && best_hits.size() > max_multihits) best_hits.clear(); if (best_hits.size() > max_multihits) { vector tie_indexes; InsertAlignmentGrade temp_grade; set_insert_alignment_grade(best_hits[max_multihits - 1].first, best_hits[max_multihits - 1].second, junctions, temp_grade); int tie_alignment_score = temp_grade.align_score(); int count_better_alignments = 0; for (size_t i = 0; i < best_hits.size(); ++i) { set_insert_alignment_grade(best_hits[i].first, best_hits[i].second, junctions, temp_grade); int temp_alignment_score = temp_grade.align_score(); if (temp_alignment_score == tie_alignment_score) tie_indexes.push_back(i); else if (temp_alignment_score < tie_alignment_score) break; else ++count_better_alignments; } while (count_better_alignments + tie_indexes.size() > max_multihits) { int random_index = (*rng)() % tie_indexes.size(); tie_indexes.erase(tie_indexes.begin() + random_index); } for (size_t i = 0; i < tie_indexes.size(); ++i) { if (count_better_alignments + i != tie_indexes[i]) best_hits[count_better_alignments + i] = best_hits[tie_indexes[i]]; } best_hits.erase(best_hits.begin() + max_multihits, best_hits.end()); } } //final report best_grade.num_alignments = best_hits.size(); return ret_code; } enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT}; void add_auxData(vector& auxdata, vector& sam_toks, const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side, int num_hits, const BowtieHit* next_hit, int hitIndex) { bool XS_found = false; if (sam_toks.size()>11) { for (size_t i=11;iref_id()); assert (nh_ref_name != NULL); bool same_contig=(next_hit->ref_id()==bh.ref_id()); aux="CC:Z:"; aux+= (same_contig ? "=" : nh_ref_name); auxdata.push_back(aux); aux="CP:i:"; int nh_gpos=next_hit->left() + 1; str_appendInt(aux, nh_gpos); auxdata.push_back(aux); } //has next_hit // FIXME: this code is still a bit brittle, because it contains no // consistency check that the mates are on opposite strands - a current protocol // requirement, and that the strand indicated by the alignment is consistent // with the orientation of the splices (though that should be handled upstream). 
if (!XS_found) { const string xs_f("XS:A:+"); const string xs_r("XS:A:-"); if (library_type == FR_FIRSTSTRAND) { if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) { if (bh.antisense_align()) auxdata.push_back(xs_f); else auxdata.push_back(xs_r); } else { if (bh.antisense_align()) auxdata.push_back(xs_r); else auxdata.push_back(xs_f); } } else if (library_type == FR_SECONDSTRAND) { if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){ if (bh.antisense_align()) auxdata.push_back(xs_r); else auxdata.push_back(xs_f); } else { if (bh.antisense_align()) auxdata.push_back(xs_f); else auxdata.push_back(xs_r); } } } if (hitIndex >= 0) { string aux("HI:i:"); str_appendInt(aux, hitIndex); auxdata.push_back(aux); } } bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt, const BowtieHit& bh, const char* bwt_buf, const char* read_alt_name, FragmentType insert_side, int num_hits, const BowtieHit* next_hit, bool primary, int hitIndex) { // Rewrite this hit, filling in the alt name, mate mapping // and setting the pair flag vector sam_toks; tokenize(bwt_buf, "\t", sam_toks); string ref_name = sam_toks[2], ref_name2 = ""; char cigar1[1024] = {0}, cigar2[1024] = {0}; string left_seq, right_seq, left_qual, right_qual; int left1 = -1, left2 = -1; bool fusion_alignment = false; size_t XF_index = 0; for (size_t i = 11; i < sam_toks.size(); ++i) { string& tok = sam_toks[i]; if (strncmp(tok.c_str(), "XF", 2) == 0) { fusion_alignment = true; XF_index = i; vector tuple_fields; tokenize(tok.c_str(), " ", tuple_fields); vector contigs; tokenize(tuple_fields[1].c_str(), "-", contigs); if (contigs.size() >= 2) { ref_name = contigs[0]; ref_name2 = contigs[1]; } extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(), cigar1, cigar2, left_seq, right_seq, left_qual, right_qual, left1, left2); break; } else if (strncmp(tok.c_str(), "AS", 2) == 0) { char AS_score[128] = {0}; sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score())); tok = AS_score; } } string qname(read_alt_name); size_t slash_pos=qname.rfind('/'); if (slash_pos!=string::npos) qname.resize(slash_pos); //read_alt_name as QNAME int flag=atoi(sam_toks[1].c_str()); //FLAG if (insert_side != FRAG_UNPAIRED) { //flag = atoi(sam_toks[1].c_str()); // mark this as a singleton mate flag |= 0x0001; if (insert_side == FRAG_LEFT) flag |= 0x0040; else if (insert_side == FRAG_RIGHT) flag |= 0x0080; flag |= 0x0008; //char flag_buf[64]; //sprintf(flag_buf, "%d", flag); //sam_toks[t] = flag_buf; } if (!primary) flag |= 0x100; int gpos=isdigit(sam_toks[3][0]) ? 
atoi(sam_toks[3].c_str()) : 0; int mapQ = 50; if (num_hits > 1) { double err_prob = 1 - (1.0 / num_hits); mapQ = (int)(-10.0 * log(err_prob) / log(10.0)); } int tlen =atoi(sam_toks[8].c_str()); //TLEN int mate_pos=atoi(sam_toks[7].c_str()); string rg_aux = ""; if (!sam_readgroup_id.empty()) rg_aux = string("RG:Z:") + sam_readgroup_id; GBamRecord* bamrec=NULL; if (fusion_alignment) { vector auxdata; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ, cigar1, sam_toks[6].c_str(), mate_pos, tlen, left_seq.c_str(), left_qual.c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; auxdata.clear(); sam_toks[XF_index][5] = '2'; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ, cigar2, sam_toks[6].c_str(), mate_pos, tlen, right_seq.c_str(), right_qual.c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; } else { vector auxdata; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ, sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos, tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; } return true; } bool rewrite_sam_record(GBamWriter& bam_writer, const RefSequenceTable& rt, const BowtieHit& bh, const char* bwt_buf, const char* read_alt_name, const InsertAlignmentGrade& grade, FragmentType insert_side, const BowtieHit* partner, int num_hits, const BowtieHit* next_hit, bool primary, int hitIndex) { // Rewrite this hit, filling in the alt name, mate mapping // and setting the pair flag vector sam_toks; tokenize(bwt_buf, "\t", sam_toks); string qname(read_alt_name); size_t slash_pos=qname.rfind('/'); if (slash_pos!=string::npos) qname.resize(slash_pos); //read_alt_name as QNAME int flag = atoi(sam_toks[1].c_str()); // 0x0010 (strand of query) is assumed to be set correctly // to begin with flag |= 0x0001; //it must be paired if (insert_side == FRAG_LEFT) flag |= 0x0040; else if (insert_side == FRAG_RIGHT) flag |= 0x0080; if (!primary) flag |= 0x100; string ref_name = sam_toks[2], ref_name2 = ""; char cigar1[1024] = {0}, cigar2[1024] = {0}; string left_seq, right_seq, left_qual, right_qual; int left1 = -1, left2 = -1; bool fusion_alignment = false; size_t XF_tok_idx = 11; for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx) { string& tok = sam_toks[XF_tok_idx]; if (strncmp(tok.c_str(), "XF", 2) == 0) { fusion_alignment = true; vector tuple_fields; tokenize(tok.c_str(), " ", tuple_fields); vector contigs; tokenize(tuple_fields[1].c_str(), "-", contigs); if (contigs.size() >= 2) { ref_name = contigs[0]; ref_name2 = contigs[1]; } extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(), cigar1, cigar2, left_seq, right_seq, left_qual, right_qual, left1, left2); break; } else if (strncmp(tok.c_str(), "AS", 2) == 0) { char AS_score[128] = {0}; sprintf(AS_score, "AS:i:%d", min(0, bh.alignment_score())); tok = AS_score; } } int gpos=isdigit(sam_toks[3][0]) ? 
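// Mapping quality, as computed above, depends only on the number of reported hits:
// err_prob = 1 - 1/num_hits and MAPQ = -10*log10(err_prob), with a fixed 50 for unique
// alignments. Worked sketch (mapq_from_hits is an illustrative name): 2 hits give MAPQ 3,
// 3 or 4 hits give 1, and 10 or more give 0.
#if 0
static int mapq_from_hits(int num_hits)
{
  if (num_hits <= 1) return 50;                       // unique alignment: fixed MAPQ
  double err_prob = 1.0 - 1.0 / num_hits;             // chance this placement is wrong
  return (int)(-10.0 * log(err_prob) / log(10.0));    // 2 -> 3, 3 -> 1, 4 -> 1, 10 -> 0
}
#endif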
atoi(sam_toks[3].c_str()) : 0; int mapQ = 50; if (grade.num_alignments > 1) { double err_prob = 1 - (1.0 / grade.num_alignments); mapQ = (int)(-10.0 * log(err_prob) / log(10.0)); } int tlen=0; //TLEN int mate_pos=atoi(sam_toks[7].c_str()); string mate_contig = "*", mate_contig2 = "*"; if (partner) { if (partner->ref_id() == bh.ref_id()) { mate_contig = "="; //same chromosome //TLEN: // a read contains its partner if (bh.left() <= partner->left() && bh.right() >= partner->right() && bh.right() - bh.left() > partner->right() - partner->left()) tlen = bh.right() - bh.left(); else if (partner->left() <= bh.left() && partner->right() >= bh.right() && partner->right() - partner->left() > bh.right() - bh.left()) tlen = partner->left() - partner->right(); else tlen = bh.left() < partner->left() ? partner->right() - bh.left() : partner->left() - bh.right(); } else { //partner on different chromosome/contig mate_contig = rt.get_name(partner->ref_id()); } mate_pos = partner->left() + 1; if (grade.happy()) flag |= 0x0002; if (partner->antisense_align()) flag |= 0x0020; if (fusion_alignment) { if (partner->ref_id() == bh.ref_id2()) mate_contig2 = "="; else mate_contig2 = rt.get_name(partner->ref_id()); } if (fusion_search) { string cigar_str = print_cigar(partner->cigar()); char partner_pos[4096]; if (partner->fusion_opcode() != FUSION_NOTHING) { sprintf(partner_pos, "XP:Z:%s-%s %d %s", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1, cigar_str.c_str()); } else { sprintf(partner_pos, "XP:Z:%s %d %s", rt.get_name(partner->ref_id()), partner->left() + 1, cigar_str.c_str()); } sam_toks.push_back(partner_pos); } } else { mate_pos = 0; flag |= 0x0008; } string rg_aux = ""; if (!sam_readgroup_id.empty()) rg_aux = string("RG:Z:") + sam_readgroup_id; GBamRecord* bamrec=NULL; if (fusion_alignment) { vector auxdata; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ, cigar1, mate_contig.c_str(), mate_pos, tlen, left_seq.c_str(), left_qual.c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; auxdata.clear(); sam_toks[XF_tok_idx][5] = '2'; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ, cigar2, mate_contig2.c_str(), mate_pos, tlen, right_seq.c_str(), right_qual.c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; } else { vector auxdata; add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex); if (rg_aux != "") auxdata.push_back(rg_aux); bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ, sam_toks[5].c_str(), mate_contig.c_str(), mate_pos, tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata); bam_writer.write(bamrec); delete bamrec; } return true; } struct lex_hit_sort { lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits) : _rt(rt), _hits(hits) {} bool operator()(const uint32_t& l, const uint32_t& r) const { const BowtieHit& lhs = _hits.hits[l]; const BowtieHit& rhs = _hits.hits[r]; uint32_t l_id = lhs.ref_id(); uint32_t r_id = rhs.ref_id(); if (l_id != r_id) return l_id < r_id; return lhs.left() < rhs.left(); } const RefSequenceTable& _rt; const HitsForRead& _hits; }; void print_sam_for_single(const RefSequenceTable& rt, const HitsForRead& hits, FragmentType 
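// Recap of the TLEN rule used above for mates on the same chromosome: if this mate fully
// contains its partner, TLEN is this mate's span; if the partner contains this mate, TLEN
// is the partner's span with a negative sign; otherwise it is the signed outer fragment
// length (positive when this mate is leftmost). Mates on different chromosomes keep
// TLEN 0. Sketch only; pair_tlen() is not TopHat API.
#if 0
static int pair_tlen(int left1, int right1, int left2, int right2)
{
  bool contains  = left1 <= left2 && right1 >= right2 && right1 - left1 > right2 - left2;
  bool contained = left2 <= left1 && right2 >= right1 && right2 - left2 > right1 - left1;
  if (contains)  return right1 - left1;   // this mate spans its partner
  if (contained) return left2 - right2;   // partner spans this mate (negative)
  return left1 < left2 ? right2 - left1   // outer fragment length, signed by order
                       : left2 - right1;
}
#endif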
frag_type, const string& alt_name, GBamWriter& bam_writer, boost::mt19937& rng) { //assert(!read.alt_name.empty()); if (hits.hits.empty()) return; lex_hit_sort s(rt, hits); vector index_vector; size_t i; for (i = 0; i < hits.hits.size(); ++i) index_vector.push_back(i); sort(index_vector.begin(), index_vector.end(), s); size_t primaryHit = 0; if (!report_secondary_alignments) primaryHit = rng() % hits.hits.size(); bool multipleHits = (hits.hits.size() > 1); for (i = 0; i < hits.hits.size(); ++i) { size_t index = index_vector[i]; bool primary = (index == primaryHit); const BowtieHit& bh = hits.hits[index]; rewrite_sam_record(bam_writer, rt, bh, bh.hitfile_rec().c_str(), alt_name.c_str(), frag_type, hits.hits.size(), (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL, primary, (multipleHits? i: -1)); } } void print_sam_for_pair(const RefSequenceTable& rt, const vector >& best_hits, const InsertAlignmentGrade& grade, GBamWriter& bam_writer, const string& left_alt_name, const string& right_alt_name, boost::mt19937& rng, uint64_t begin_id = 0, uint64_t end_id = std::numeric_limits::max()) { Read left_read; Read right_read; if (best_hits.empty()) return; size_t i; HitsForRead right_hits; for (i = 0; i < best_hits.size(); ++i) right_hits.hits.push_back(best_hits[i].second); size_t primaryHit = 0; vector index_vector; lex_hit_sort s(rt, right_hits); for (i = 0; i < right_hits.hits.size(); ++i) index_vector.push_back(i); sort(index_vector.begin(), index_vector.end(), s); if (!report_secondary_alignments) primaryHit = rng() % right_hits.hits.size(); bool multipleHits = (best_hits.size() > 1); for (i = 0; i < best_hits.size(); ++i) { size_t index = index_vector[i]; bool primary = (index == primaryHit); const BowtieHit& right_bh = best_hits[index].second; const BowtieHit& left_bh = best_hits[index].first; rewrite_sam_record(bam_writer, rt, right_bh, right_bh.hitfile_rec().c_str(), right_alt_name.c_str(), grade, FRAG_RIGHT, &left_bh, best_hits.size(), (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL, primary, (multipleHits? i: -1)); rewrite_sam_record(bam_writer, rt, left_bh, left_bh.hitfile_rec().c_str(), left_alt_name.c_str(), grade, FRAG_LEFT, &right_bh, best_hits.size(), (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL, primary, (multipleHits? i: -1)); } } /** * Given all of the hits for a particular read, update the read counts for insertions and deletions. 
* @param hits hits The alignments for a particular read * @param insertions Maps from an insertion to the number of supporting reads for that insertion * @param deletions Maps from a deletion to the number of supporting reads for that deletion */ void update_insertions_and_deletions(const HitsForRead& hits, InsertionSet& insertions, DeletionSet& deletions) { for (size_t i = 0; i < hits.hits.size(); ++i) { const BowtieHit& bh = hits.hits[i]; insertions_from_alignment(bh, insertions); deletions_from_alignment(bh, deletions); } } void update_coverage(const HitsForRead& hits, Coverage& coverage) { for (size_t i = 0; i < hits.hits.size(); ++i) { const BowtieHit& hit = hits.hits[i]; const vector& cigar = hit.cigar(); unsigned int positionInGenome = hit.left(); RefID ref_id = hit.ref_id(); for(size_t c = 0; c < cigar.size(); ++c) { int opcode = cigar[c].opcode; int length = cigar[c].length; switch(opcode) { case REF_SKIP: case MATCH: case DEL: if (opcode == MATCH) coverage.add_coverage(ref_id, positionInGenome, length); positionInGenome += length; break; case rEF_SKIP: case mATCH: case dEL: positionInGenome -= length; if (opcode == mATCH) coverage.add_coverage(ref_id, positionInGenome + 1, length); break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: positionInGenome = length; ref_id = hit.ref_id2(); break; default: break; } } } } void update_fusions(const HitsForRead& hits, RefSequenceTable& rt, FusionSet& fusions, const FusionSet& fusions_ref = empty_fusions) { if (!fusion_search) return; if (hits.hits.size() > fusion_multireads) return; bool update_stat = fusions_ref.size() > 0; for (size_t i = 0; i < hits.hits.size(); ++i) { const BowtieHit& bh = hits.hits[i]; if (bh.edit_dist() > fusion_read_mismatches) continue; fusions_from_alignment(bh, fusions, rt, update_stat); if (update_stat) unsupport_fusions(bh, fusions, fusions_ref); } } void update_junctions(const HitsForRead& hits, JunctionSet& junctions) { for (size_t i = 0; i < hits.hits.size(); ++i) { const BowtieHit& bh = hits.hits[i]; junctions_from_alignment(bh, junctions); } } // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file // resets the stream when finished. 
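// update_coverage() above walks each hit's CIGAR left to right, adding depth only for
// match segments; REF_SKIP (introns) and DEL advance the reference coordinate without
// contributing depth, insertions consume no reference bases, and the reversed opcodes
// (mATCH, rEF_SKIP, dEL, used for the far side of fusion alignments) walk the coordinate
// backwards instead. The simplified, forward-only sketch below uses plain (opcode, length)
// pairs and a flat depth vector rather than TopHat's Coverage class, and assumes <vector>
// and <utility> are already included at the top of this file.
static void sketch_add_coverage(const std::vector<std::pair<char, int> >& cigar,
                                size_t left, std::vector<int>& depth)
{
  size_t pos = left;                        // 0-based reference position
  for (size_t i = 0; i < cigar.size(); ++i) {
    char op = cigar[i].first;
    int len = cigar[i].second;
    if (op == 'M') {                        // aligned bases add depth
      for (int k = 0; k < len && pos + k < depth.size(); ++k) depth[pos + k]++;
      pos += len;
    } else if (op == 'N' || op == 'D') {    // intron / deletion: advance only
      pos += len;
    }                                       // 'I' and 'S' consume no reference bases
  }
}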
void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits) { HitsForRead remaining; remaining.insert_id = hits.insert_id; for (size_t i = 0; i < hits.hits.size(); ++i) { BowtieHit& bh = hits.hits[i]; if (bh.mismatches() > read_mismatches || bh.gap_length() > read_gap_length || bh.edit_dist() > read_edit_dist) continue; bool filter_hit = false; if (!bh.contiguous()) { JunctionSet bh_juncs; junctions_from_alignment(bh, bh_juncs); for (JunctionSet::iterator itr = bh_juncs.begin(); itr != bh_juncs.end(); itr++) { const Junction& j = itr->first; JunctionSet::const_iterator target = junctions.find(j); if (target == junctions.end() || !target->second.accepted) { filter_hit = true; break; } } } if (!filter_hit) remaining.hits.push_back(bh); } hits = remaining; } void realign_reads(HitsForRead& hits, const RefSequenceTable& rt, const JunctionSet& junctions, const JunctionSet& rev_junctions, const InsertionSet& insertions, const DeletionSet& deletions, const DeletionSet& rev_deletions, const FusionSet& fusions) { if (color) return; vector additional_hits; for (size_t i = 0; i < hits.hits.size(); ++i) { BowtieHit& bh = hits.hits[i]; if (fusion_search && bh.fusion_opcode() != FUSION_NOTHING) return; const vector& cigars = bh.cigar(); int pos = bh.left(); int refid = bh.ref_id(); for (size_t j = 0; j < cigars.size(); ++j) { const CigarOp& op = cigars[j]; if (j == 0 || j == cigars.size() - 1) { // let's do this for MATCH case only, if (op.opcode == MATCH || op.opcode == mATCH) { int left1, left2; if (op.opcode == MATCH) { left1 = pos; left2 = pos + op.length - 1; } else { left1 = pos - op.length + 1; left2 = pos; } { static const size_t max_temp_juncs = 5; JunctionSet::const_iterator lb, ub; JunctionSet temp_junctions; if (j == 0) { lb = rev_junctions.upper_bound(Junction(refid, left1, 0, true)); ub = rev_junctions.lower_bound(Junction(refid, left2, left2, false)); while (lb != ub && lb != rev_junctions.end()) { Junction temp_junction = lb->first; temp_junction.left = lb->first.right; temp_junction.right = lb->first.left; temp_junctions[temp_junction] = lb->second; ++lb; if (temp_junctions.size() > max_temp_juncs) break; } } if (j == cigars.size() - 1) { int common_right = left2 + max_report_intron_length; lb = junctions.upper_bound(Junction(refid, left1, common_right, true)); ub = junctions.lower_bound(Junction(refid, left2, common_right, false)); while (lb != ub && lb != junctions.end()) { temp_junctions[lb->first] = lb->second; ++lb; if (temp_junctions.size() > max_temp_juncs) break; } } // daehwan - for debugging purposes /* if (bh.insert_id() == 15461 && cigars.size() == 1) { printf("%d: %s\n", bh.insert_id(), print_cigar(bh.cigar()).c_str()); printf("candidate junctions: %d - max junctions: %d\n", temp_junctions.size(), max_temp_juncs); JunctionSet::const_iterator junc_iter = temp_junctions.begin(); for (; junc_iter != temp_junctions.end(); ++junc_iter) { Junction junc = junc_iter->first; fprintf(stderr, "%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", bh.insert_id(), bh.left(), bh.right(), print_cigar(bh.cigar()).c_str(), bh.alignment_score(), bh.edit_dist(), junc.left, junc.right); } } */ if (temp_junctions.size() > max_temp_juncs) continue; JunctionSet::const_iterator junc_iter = temp_junctions.begin(); for (; junc_iter != temp_junctions.end(); ++junc_iter) { Junction junc = junc_iter->first; #if 0 fprintf(stderr, "%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", bh.insert_id(), bh.left(), bh.right(), print_cigar(bh.cigar()).c_str(), bh.alignment_score(), 
bh.edit_dist(), junc.left, junc.right); #endif int new_left = bh.left(); int intron_length = junc.right - junc.left - 1; vector new_cigars; bool anchored = false; if (j == 0 && bh.left() > (int)junc.left) { new_left -= intron_length; int before_match_length = junc.left - new_left + 1;; int after_match_length = op.length - before_match_length; if (before_match_length > 0 && after_match_length > 0) { anchored = true; new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(REF_SKIP, intron_length)); new_cigars.push_back(CigarOp(MATCH, after_match_length)); new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); } } else if (j == cigars.size() - 1 && pos < (int)junc.left) { new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); int before_match_length = junc.left - pos + 1; int after_match_length = op.length - before_match_length; if (before_match_length > 0 && after_match_length > 0) { anchored = true; new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(REF_SKIP, intron_length)); new_cigars.push_back(CigarOp(MATCH, after_match_length)); } } if (!anchored) continue; BowtieHit new_bh(bh.ref_id(), bh.ref_id2(), bh.insert_id(), new_left, new_cigars, bh.antisense_align(), junc.antisense, 0, /* mismatches - needs to be recalculated */ 0, /* edit_dist - needs to be recalculated */ 0, /* splice_mms - needs to be recalculated */ false); new_bh.seq(bh.seq()); new_bh.qual(bh.qual()); const RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id()); if (new_left >= 0 && new_bh.right() <= (int)length(*ref_str)) { vector aux_fields; bowtie_sam_extra(new_bh, rt, aux_fields); vector::const_iterator aux_iter = aux_fields.begin(); for (; aux_iter != aux_fields.end(); ++aux_iter) { const string& aux_field = *aux_iter; if (strncmp(aux_field.c_str(), "AS", 2) == 0) { int alignment_score = atoi(aux_field.c_str() + 5); new_bh.alignment_score(alignment_score); } else if (strncmp(aux_field.c_str(), "XM", 2) == 0) { int XM_value = atoi(aux_field.c_str() + 5); new_bh.mismatches(XM_value); new_bh.edit_dist(XM_value + gap_length(new_cigars)); } } string NM = "NM:i:"; str_appendInt(NM, new_bh.edit_dist()); aux_fields.push_back(NM); // replace the previous sam auxiliary fields with the new ones vector sam_toks; tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks); char coord[20] = {0,}; sprintf(coord, "%d", new_bh.left() + 1); sam_toks[3] = coord; sam_toks[5] = print_cigar(new_bh.cigar()); for (size_t a = 11; a < sam_toks.size(); ++a) { string& sam_tok = sam_toks[a]; for (size_t b = 0; b < aux_fields.size(); ++b) { const string& aux_tok = aux_fields[b]; if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0) { sam_tok = aux_tok; break; } } } if (!bh.is_spliced()) { if (junc.antisense) sam_toks.push_back("XS:A:-"); else sam_toks.push_back("XS:A:+"); } string new_rec = ""; for (size_t d = 0; d < sam_toks.size(); ++d) { new_rec += sam_toks[d]; if (d < sam_toks.size() - 1) new_rec += "\t"; } new_bh.hitfile_rec(new_rec); if (new_bh.edit_dist() <= bh.edit_dist()) additional_hits.push_back(new_bh); #if 0 fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", new_bh.insert_id(), new_bh.left(), new_bh.right(), print_cigar(new_bh.cigar()).c_str(), new_bh.alignment_score(), new_bh.edit_dist(), junc.left, junc.right); #endif } } } #if 0 { DeletionSet::const_iterator lb, ub; bool use_rev_deletions = (j == 0); const DeletionSet& curr_deletions = (use_rev_deletions ? 
rev_deletions : deletions); if (use_rev_deletions) { lb = curr_deletions.upper_bound(Deletion(refid, left1, 0, true)); ub = curr_deletions.lower_bound(Deletion(refid, left2, left2, false)); } else { int common_right = left2 + 100; lb = curr_deletions.upper_bound(Deletion(refid, left1, common_right, true)); ub = curr_deletions.lower_bound(Deletion(refid, left2, common_right, false)); } while (lb != curr_deletions.end() && lb != ub) { Deletion del = lb->first; if (use_rev_deletions) { int temp = del.left; del.left = del.right; del.right = temp; } // daehwan - for debuggin purposes /* fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", !use_rev_junctions, bh.insert_id(), bh.left(), bh.right(), print_cigar(bh.cigar()).c_str(), bh.alignment_score(), bh.edit_dist(), junc.left, junc.right); */ int del_length = del.right - del.left - 1; int new_left = bh.left(); if (j == 0) new_left -= del_length; vector new_cigars; if (j == 0) { int before_match_length = del.left - new_left + 1;; int after_match_length = op.length - before_match_length; if (before_match_length > 0) new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(DEL, del_length)); if (after_match_length > 0) new_cigars.push_back(CigarOp(MATCH, after_match_length)); new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); } else { new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); int before_match_length = del.left - pos + 1; int after_match_length = op.length - before_match_length; if (before_match_length > 0) new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(DEL, del_length)); if (after_match_length > 0) new_cigars.push_back(CigarOp(MATCH, after_match_length)); } BowtieHit new_bh(bh.ref_id(), bh.ref_id2(), bh.insert_id(), new_left, new_cigars, bh.antisense_align(), bh.antisense_splice(), 0, /* edit_dist - needs to be recalculated */ 0, /* splice_mms - needs to be recalculated */ false); new_bh.seq(bh.seq()); new_bh.qual(bh.qual()); vector aux_fields; bowtie_sam_extra(new_bh, rt, aux_fields); vector::const_iterator aux_iter = aux_fields.begin(); for (; aux_iter != aux_fields.end(); ++aux_iter) { const string& aux_field = *aux_iter; if (strncmp(aux_field.c_str(), "AS", 2) == 0) { int alignment_score = atoi(aux_field.c_str() + 5); new_bh.alignment_score(alignment_score); } else if (strncmp(aux_field.c_str(), "XM", 2) == 0) { int XM_value = atoi(aux_field.c_str() + 5); new_bh.edit_dist(XM_value); } } vector sam_toks; tokenize(bh.hitfile_rec().c_str(), "\t", sam_toks); char coord[20] = {0,}; sprintf(coord, "%d", new_bh.left() + 1); sam_toks[3] = coord; sam_toks[5] = print_cigar(new_bh.cigar()); for (size_t a = 11; a < sam_toks.size(); ++a) { string& sam_tok = sam_toks[a]; for (size_t b = 0; b < aux_fields.size(); ++b) { const string& aux_tok = aux_fields[b]; if (strncmp(sam_tok.c_str(), aux_tok.c_str(), 5) == 0) { sam_tok = aux_tok; break; } } } string new_rec = ""; for (size_t d = 0; d < sam_toks.size(); ++d) { new_rec += sam_toks[d]; if (d < sam_toks.size() - 1) new_rec += "\t"; } new_bh.hitfile_rec(new_rec); if (new_bh.edit_dist() <= bh.edit_dist()) additional_hits.push_back(new_bh); /* fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", new_bh.insert_id(), new_bh.left(), new_bh.right(), print_cigar(new_bh.cigar()).c_str(), new_bh.alignment_score(), new_bh.edit_dist(), junc.left, junc.right); */ ++lb; } } { InsertionSet::const_iterator lb, ub; lb = insertions.upper_bound(Insertion(refid, left1, "")); ub = 
insertions.lower_bound(Insertion(refid, left2, "")); while (lb != insertions.end() && lb != ub) { Insertion ins = lb->first; // daehwan - for debugging purposes /* fprintf(stderr, "(type%d) %d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", !use_rev_junctions, bh.insert_id(), bh.left(), bh.right(), print_cigar(bh.cigar()).c_str(), bh.alignment_score(), bh.edit_dist(), junc.left, junc.right); */ int ins_length = ins.sequence.length(); int new_left = bh.left(); if (j == 0) new_left -= ins_length; vector new_cigars; if (j == 0) { int before_match_length = ins.left - new_left + 1;; int after_match_length = op.length - before_match_length - ins_length; if (before_match_length > 0) new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(INS, ins_length)); if (after_match_length > 0) new_cigars.push_back(CigarOp(MATCH, after_match_length)); new_cigars.insert(new_cigars.end(), cigars.begin() + 1, cigars.end()); } else { new_cigars.insert(new_cigars.end(), cigars.begin(), cigars.end() - 1); int before_match_length = ins.left - pos + 1; int after_match_length = op.length - before_match_length - ins_length; if (before_match_length > 0) new_cigars.push_back(CigarOp(MATCH, before_match_length)); new_cigars.push_back(CigarOp(INS, ins_length)); if (after_match_length > 0) new_cigars.push_back(CigarOp(MATCH, after_match_length)); } BowtieHit new_bh(bh.ref_id(), bh.ref_id2(), bh.insert_id(), new_left, new_cigars, bh.antisense_align(), bh.antisense_splice(), 0, /* edit_dist - needs to be recalculated */ 0, /* splice_mms - needs to be recalculated */ false); new_bh.seq(bh.seq()); new_bh.qual(bh.qual()); vector aux_fields; bowtie_sam_extra(new_bh, rt, aux_fields); vector::const_iterator aux_iter = aux_fields.begin(); for (; aux_iter != aux_fields.end(); ++aux_iter) { const string& aux_field = *aux_iter; if (strncmp(aux_field.c_str(), "AS", 2) == 0) { int alignment_score = atoi(aux_field.c_str() + 5); new_bh.alignment_score(alignment_score); } else if (strncmp(aux_field.c_str(), "XM", 2) == 0) { int XM_value = atoi(aux_field.c_str() + 5); new_bh.edit_dist(XM_value); } } /* fprintf(stderr, "\t%d %d-%d %s (AS:%d XM:%d) with junc %u-%u\n", new_bh.insert_id(), new_bh.left(), new_bh.right(), print_cigar(new_bh.cigar()).c_str(), new_bh.alignment_score(), new_bh.edit_dist(), junc.left, junc.right); */ ++lb; } } #endif } } switch(op.opcode) { case REF_SKIP: pos += op.length; break; case rEF_SKIP: pos -= op.length; break; case MATCH: case DEL: pos += op.length; break; case mATCH: case dEL: pos -= op.length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: pos = op.length; refid = bh.ref_id2(); break; default: break; } } } hits.hits.insert(hits.hits.end(), additional_hits.begin(), additional_hits.end()); std::sort(hits.hits.begin(), hits.hits.end()); vector::iterator new_end = std::unique(hits.hits.begin(), hits.hits.end()); hits.hits.erase(new_end, hits.hits.end()); } class MultipleBAMReader { public: MultipleBAMReader(ReadTable& it, RefSequenceTable& rt, const vector& fnames, long begin_id, long end_id) : _it(it), _rt(rt), _begin_id(begin_id), _end_id(end_id), _bam_hit_factory(it, rt), _bam_merge(NULL) { // calculate file offsets vector offsets; for (size_t i = 0; i < fnames.size(); ++i) { const string& fname = fnames[i]; vector next_file_read_ids; vector temp_fnames; if (fname.substr(fname.length() - 4) == ".bam") { temp_fnames.push_back(fname); next_file_read_ids.push_back(0); } else { size_t j = 0; while (true) { char suffix[128]; sprintf(suffix, "%lu.bam", j); 
string temp_fname = fname + suffix; string temp_index_fname = temp_fname + ".index"; ifstream index_file(temp_index_fname.c_str()); if (!index_file.is_open()) { next_file_read_ids.push_back(0); break; } temp_fnames.push_back(temp_fname); if (j > 0) { string line; int64_t offset = 0; uint64_t read_id = 0; if (getline(index_file, line)) { istringstream istream(line); istream >> read_id >> offset; next_file_read_ids.push_back(read_id); } else { next_file_read_ids.push_back(0); } } ++j; } } for (size_t j = 0; j < temp_fnames.size(); ++j) { ifstream reads_index_file((temp_fnames[j] + ".index").c_str()); if (!reads_index_file.is_open()) continue; bool pushed = false; int64_t offset = 0, last_offset = 0; uint64_t read_id = 0, last_read_id = 0; string line; while (getline(reads_index_file, line)) { istringstream istream(line); istream >> read_id >> offset; if (read_id > _begin_id && last_read_id <= _begin_id) { pushed = true; _fnames.push_back(temp_fnames[j]); offsets.push_back(last_offset); #if 0 fprintf(stderr, "bet %lu and %lu - %s %lu %ld\n", _begin_id, _end_id, temp_fnames[j].c_str(), last_offset, last_read_id); #endif break; } last_offset = offset; last_read_id = read_id; } if (!pushed) { if(next_file_read_ids[j] > _begin_id && last_read_id <= _begin_id) { pushed = true; _fnames.push_back(temp_fnames[j]); offsets.push_back(last_offset); #if 0 fprintf(stderr, "2 bet %lu and %lu - %s %lu %ld\n", _begin_id, _end_id, temp_fnames[j].c_str(), last_offset, last_read_id); #endif } } if (read_id >= _end_id) break; if (read_id == 0) { _fnames.push_back(temp_fnames[j]); offsets.push_back(0); } } } _bam_merge = new BamMerge(_fnames, offsets); _bam_hit_factory.set_sam_header(_bam_merge->get_sam_header()); } ~MultipleBAMReader() { if (_bam_merge) delete _bam_merge; } bool next_read_hits(HitsForRead& hits) { hits.insert_id = 0; if (!_bam_merge) return false; vector bam_lines; while (true) { if (!_bam_merge->next_bam_lines(bam_lines)) return false; if (bam_lines.size() <= 0) return false; uint64_t read_id = bam_lines[0].read_id; if (read_id >= _begin_id && read_id < _end_id) { hits.hits.clear(); for (size_t i = 0; i < bam_lines.size(); ++i) { CBamLine& bam_line = bam_lines[i]; BowtieHit bh; char seq[MAX_READ_LEN + 1] = {0}; char qual[MAX_READ_LEN + 1] = {0}; bool success = _bam_hit_factory.get_hit_from_buf((const char*)bam_line.b, bh, true, NULL, NULL, seq, qual); if (success) { bh.seq(seq); bh.qual(qual); char* sam_line = bam_format1(_bam_merge->get_sam_header(), bam_line.b); bh.hitfile_rec(sam_line); free(sam_line); hits.insert_id = bh.insert_id(); hits.hits.push_back(bh); } bam_line.b_free(); } bam_lines.clear(); if (hits.hits.size() > 0) return true; } for (size_t i = 0; i < bam_lines.size(); ++i) bam_lines[i].b_free(); bam_lines.clear(); if (read_id >= _end_id) break; } return false; } private: ReadTable& _it; RefSequenceTable& _rt; vector _fnames; uint64_t _begin_id; uint64_t _end_id; BAMHitFactory _bam_hit_factory; BamMerge* _bam_merge; }; // events include splice junction, indels, and fusion points. 
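// MultipleBAMReader above splits the work among threads by consulting the ".index" file
// written next to each temporary BAM: every line holds a read id and the byte offset at
// which that read's records start, so a worker responsible for reads >= begin_id seeks to
// the last offset whose read id is still <= begin_id. Self-contained sketch of that scan
// (sketch_find_start_offset is an illustrative name; <fstream>, <sstream> and <string> are
// assumed to be included at the top of this file, as the class above already uses them).
static int64_t sketch_find_start_offset(const std::string& index_fname, uint64_t begin_id)
{
  std::ifstream in(index_fname.c_str());
  std::string line;
  int64_t last_offset = 0;
  uint64_t last_read_id = 0;
  while (std::getline(in, line)) {
    std::istringstream is(line);
    uint64_t read_id; int64_t offset;
    if (!(is >> read_id >> offset)) continue;       // each line: "<read_id> <offset>"
    if (read_id > begin_id && last_read_id <= begin_id)
      return last_offset;                           // records for begin_id start here
    last_offset = offset;
    last_read_id = read_id;
  }
  return last_offset;                               // begin_id is past the last entry
}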
struct ConsensusEventsWorker { void operator()() { ReadTable it; MultipleBAMReader l_hs(it, *rt, left_map_fnames, begin_id, end_id); MultipleBAMReader r_hs(it, *rt, right_map_fnames, begin_id, end_id); HitsForRead curr_left_hit_group; HitsForRead curr_right_hit_group; l_hs.next_read_hits(curr_left_hit_group); r_hs.next_read_hits(curr_right_hit_group); uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); // While we still have unreported hits... while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) && (curr_left_obs_order < end_id || curr_right_obs_order < end_id)) { // Chew up left singletons while (curr_left_obs_order < curr_right_obs_order && curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) { HitsForRead best_hits; best_hits.insert_id = curr_left_obs_order; // Process hits for left singleton, select best alignments read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions); // update_coverage(best_hits, *coverage); update_junctions(best_hits, *junctions); update_insertions_and_deletions(best_hits, *insertions, *deletions); update_fusions(best_hits, *rt, *fusions); // Get next hit group l_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id); } // Chew up right singletons while (curr_left_obs_order > curr_right_obs_order && curr_right_obs_order < end_id && curr_right_obs_order != VMAXINT32) { HitsForRead best_hits; best_hits.insert_id = curr_right_obs_order; if (curr_right_obs_order >= begin_id) { // Process hit for right singleton, select best alignments read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions); // update_coverage(best_hits, *coverage); update_junctions(best_hits, *junctions); update_insertions_and_deletions(best_hits, *insertions, *deletions); update_fusions(best_hits, *rt, *fusions); } // Get next hit group r_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } // Since we have both left hits and right hits for this insert, // Find the best pairing and print both while (curr_left_obs_order == curr_right_obs_order && curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) { vector > best_hits; InsertAlignmentGrade grade; pair_best_alignments(curr_left_hit_group, curr_right_hit_group, grade, best_hits, *gtf_junctions); HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; if (best_hits.size() > 0) { for (size_t i = 0; i < best_hits.size(); ++i) { best_left_hit_group.hits.push_back(best_hits[i].first); best_right_hit_group.hits.push_back(best_hits[i].second); } } else { best_left_hit_group.hits = curr_left_hit_group.hits; best_right_hit_group.hits = curr_right_hit_group.hits; } // update_coverage(best_left_hit_group, *coverage); update_junctions(best_left_hit_group, *junctions); update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions); update_fusions(best_left_hit_group, *rt, *fusions); // update_coverage(best_right_hit_group, *coverage); update_junctions(best_right_hit_group, *junctions); update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions); update_fusions(best_right_hit_group, *rt, *fusions); l_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = 
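// ConsensusEventsWorker here (and ReportWorker below) walk the left-mate and right-mate
// BAM streams in lock-step, keyed by observation order: while the left id is smaller the
// group is handled as a left singleton, while the right id is smaller as a right
// singleton, and when the ids match the two hit groups are paired. Skeleton of that merge
// with plain ids; next_left/next_right/process_* are placeholders, not TopHat API.
#if 0
while (left_id != VMAXINT32 || right_id != VMAXINT32) {
  while (left_id < right_id && left_id != VMAXINT32) {        // left singletons
    process_left(left_id);   left_id = next_left();
  }
  while (right_id < left_id && right_id != VMAXINT32) {       // right singletons
    process_right(right_id); right_id = next_right();
  }
  while (left_id == right_id && left_id != VMAXINT32) {       // proper pairs
    process_pair(left_id);
    left_id = next_left();   right_id = next_right();
  }
}
#endif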
it.observation_order(curr_left_hit_group.insert_id); r_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id); } } } vector left_map_fnames; vector right_map_fnames; RefSequenceTable* rt; JunctionSet* gtf_junctions; uint64_t begin_id; uint64_t end_id; int64_t left_map_offset; int64_t right_map_offset; JunctionSet* junctions; InsertionSet* insertions; DeletionSet* deletions; FusionSet* fusions; Coverage* coverage; }; void print_alnStats(SAlignStats& alnStats) { string fname(output_dir); fname+="/align_summary.txt"; FILE* f = fopen(fname.c_str(), "w"); int64_t total_left=alnStats.num_aligned_left+alnStats.num_unmapped_left; int64_t total_right=alnStats.num_aligned_right+alnStats.num_unmapped_right; //int64_t accepted_left =alnStats.num_aligned_left -alnStats.num_aligned_left_xmulti; //accepted mappings, < max_multihits //int64_t accepted_right=alnStats.num_aligned_right-alnStats.num_aligned_right_xmulti; //accepted right mappings string rdn("Left reads"); if (total_right==0) rdn="Reads"; fprintf(f, "%s:\n", rdn.c_str()); fprintf(f, " Input: %9ld\n", total_left); double perc=(100.0*alnStats.num_aligned_left)/total_left; fprintf(f, " Mapped: %9ld (%4.1f%% of input)\n", alnStats.num_aligned_left, perc); if (alnStats.num_aligned_left) { perc=(100.0*alnStats.num_aligned_left_multi)/alnStats.num_aligned_left; fprintf(f," of these: %9ld (%4.1f%%) have multiple alignments (%ld have >%d)\n", alnStats.num_aligned_left_multi, perc, alnStats.num_aligned_left_xmulti, max_multihits); } /*perc=(100.0*accepted_left)/total_left; fprintf(f, " Mapped acceptably: %9ld (%4.1f%% of input)\n", accepted_left, perc); */ int64_t total_mapped=alnStats.num_aligned_left; int64_t total_input=total_left; int64_t total_pairs=0; if (total_right) { fprintf(f, "Right reads:\n"); fprintf(f, " Input: %9ld\n", total_right); total_input+=total_right; perc=(100.0*alnStats.num_aligned_right)/total_right; fprintf(f, " Mapped: %9ld (%4.1f%% of input)\n", alnStats.num_aligned_right, perc); if (alnStats.num_aligned_right) { perc=(100.0* alnStats.num_aligned_right_multi)/alnStats.num_aligned_right; fprintf(f," of these: %9ld (%4.1f%%) have multiple alignments (%ld have >%d)\n", alnStats.num_aligned_right_multi, perc, alnStats.num_aligned_right_xmulti, max_multihits); } /* perc=(100.0*accepted_right)/total_left; fprintf(f, " Mapped acceptably: %9ld (%4.1f%% of input)\n", accepted_right, perc); */ total_mapped+=alnStats.num_aligned_right; total_pairs=(total_rightWUM\t%c\t%s\t%c\t/%d\n", found?'F':'U', // rname.c_str(), (um_code && found) ? um_code: '-', rdata.matenum); //-- DEBUG. size_t slash_pos=rname.rfind('/'); if (slash_pos!=string::npos) rname.resize(slash_pos); GBamRecord bamrec(rname.c_str(), -1, 0, false, rdata.read.seq.c_str(), NULL, rdata.read.qual.c_str()); if (rdata.matenum) { bamrec.set_flag(BAM_FPAIRED); if (rdata.matenum==1) bamrec.set_flag(BAM_FREAD1); else bamrec.set_flag(BAM_FREAD2); } //if (found && um_code && !rdata.trashCode) { //rdata.trashCode=um_code; //} if (rdata.trashCode) { //multi-mapped reads did not really QC-fail //should also not be written to unmapped.bam bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&rdata.trashCode); if (rdata.trashCode!='M') { bamrec.set_flag(BAM_FQCFAIL); //to be excluded from further processing? 
} } if (is_unmapped || (rdata.trashCode!='M' && !found)) { um_out->write(&bamrec); } // if (unmapped_counter && !found) { if (rdata.trashCode!='M') (*unmapped_counter)++; else if (multimapped_counter) (*multimapped_counter)++; } return true; } }; struct ReportWorker { ReportWorker(RefSequenceTable* r=NULL, SAlignStats* s=NULL): gtf_junctions(NULL), junctions(NULL), rev_junctions(NULL), insertions(NULL), deletions(NULL), rev_deletions(NULL), fusions(NULL), coverage(NULL), final_junctions(NULL), final_insertions(NULL), final_deletions(NULL), final_fusions(NULL), rt(r), begin_id(0), end_id(0), left_reads_offset(0), left_map_offset(0), right_reads_offset(0), right_map_offset(0), alnStats(s), is_paired(false) { } void write_singleton_alignments(uint64_t curr_obs_order, HitsForRead& curr_hit_group, ReadStream& reads_file, GBamWriter& bam_writer, FragmentType fragment_type, GetReadProc* readProc=NULL, Read* gotRead=NULL) { int64_t* unmapped_counter = & alnStats->num_unmapped_left; int64_t* aligned_counter = & alnStats->num_aligned_left; int64_t* aligned_counter_multi = & alnStats->num_aligned_left_multi; int64_t* aligned_counter_xmulti = & alnStats->num_aligned_left_xmulti; if (fragment_type == FRAG_RIGHT) { unmapped_counter = & alnStats->num_unmapped_right; aligned_counter = & alnStats->num_aligned_right; aligned_counter_multi = & alnStats->num_aligned_right_multi; aligned_counter_xmulti = & alnStats->num_aligned_right_xmulti; } if (is_paired && !report_mixed_alignments) { // FIXME: how did we get here? // can this be a problem if input reads are mixed: paired + single ? if (!gotRead) { if (curr_hit_group.hits.size()>1) { (*aligned_counter_multi)++; } else { (*aligned_counter)++; } } return; } HitsForRead best_hits; best_hits.insert_id = curr_obs_order; realign_reads(curr_hit_group, *rt, *junctions, *rev_junctions, *insertions, *deletions, *rev_deletions, *fusions); exclude_hits_on_filtered_junctions(*junctions, curr_hit_group); // Process hits for singleton, select best alignments const bool final_report = true; char map_flags=read_best_alignments(curr_hit_group, best_hits, *gtf_junctions, *junctions, *insertions, *deletions, *fusions, *coverage, final_report, &rng); string read_alt_name; bool got_read = false; if (gotRead!=NULL) { read_alt_name=gotRead->alt_name; got_read = true; } else { Read read; char map_code=0; if (map_flags & 4) { (*aligned_counter_multi)++; } if (map_flags & 16) { map_code='M'; (*aligned_counter_xmulti)++; } if (map_flags & 1) (*aligned_counter)++; //acceptable mappings found else (*unmapped_counter)++; //CReadProc readProc(&um_out, unmapped_counter, aligned_counter_xmulti); got_read=reads_file.getRead(curr_obs_order, read, reads_format, false, begin_id, end_id, readProc, (map_flags & 1)==0 ); //&um_out, map_code, unmapped_counter, aligned_counter_xmulti); read_alt_name=read.alt_name; } if (best_hits.hits.size() > 0) { if (got_read) { update_junctions(best_hits, *final_junctions); update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions); update_fusions(best_hits, *rt, *final_fusions, *fusions); print_sam_for_single(*rt, best_hits, fragment_type, read_alt_name, bam_writer, rng); } else { //Should never happen! fprintf(stderr, "Warning: getRead() failed for id# %ld.\n", curr_obs_order); } } //else reads_file. 
} void operator()() { rng.seed(1); ReadTable it; GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str()); is_paired = right_map_fnames.size() > 0; ReadStream left_reads_file(left_reads_fname); if (left_reads_file.file() == NULL) err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str()); if (left_reads_file.isBam()) { left_reads_file.use_alt_name(); left_reads_file.ignoreQC(); } if (left_reads_offset > 0) left_reads_file.seek(left_reads_offset); GBamWriter* left_um_out = new GBamWriter(left_um_fname.c_str(), sam_header.c_str()); GBamWriter* right_um_out = NULL; ReadStream right_reads_file(right_reads_fname); if (right_reads_offset > 0) right_reads_file.seek(right_reads_offset); if (!right_reads_fname.empty()) { if (right_reads_file.isBam()) { right_reads_file.use_alt_name(); right_reads_file.ignoreQC(); right_um_out = new GBamWriter(right_um_fname.c_str(), sam_header.c_str()); } } MultipleBAMReader left_hs(it, *rt, left_map_fnames, begin_id, end_id); MultipleBAMReader right_hs(it, *rt, right_map_fnames, begin_id, end_id); HitsForRead curr_left_hit_group; HitsForRead curr_right_hit_group; left_hs.next_read_hits(curr_left_hit_group); right_hs.next_read_hits(curr_right_hit_group); uint64_t curr_left_obs_order = it.observation_order( curr_left_hit_group.insert_id); uint64_t curr_right_obs_order = it.observation_order( curr_right_hit_group.insert_id); const bool final_report = true; // While we still have unreported hits... Read l_read; Read r_read; CReadProc l_readProc(left_um_out, &(alnStats->num_unmapped_left), &(alnStats->num_aligned_left_xmulti)); CReadProc r_readProc(right_um_out, &(alnStats->num_unmapped_right), &(alnStats->num_aligned_right_xmulti)); while ((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) && (curr_left_obs_order < end_id || curr_right_obs_order < end_id)) { /*if (curr_left_obs_order >= 3463 || curr_right_obs_order >= 3463) { fprintf(stderr, "Debug target reached!\n"); }*/ // Chew up left singletons (pairs with right reads unmapped) while (curr_left_obs_order < curr_right_obs_order && curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) { write_singleton_alignments(curr_left_obs_order, curr_left_hit_group, left_reads_file, bam_writer, //*left_um_out, is_paired ? 
FRAG_LEFT : FRAG_UNPAIRED, &l_readProc); // Get next hit group left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order( curr_left_hit_group.insert_id); } //left singletons // Chew up right singletons while (curr_left_obs_order > curr_right_obs_order && curr_right_obs_order < end_id && curr_right_obs_order != VMAXINT32) { write_singleton_alignments(curr_right_obs_order, curr_right_hit_group, right_reads_file, bam_writer, FRAG_RIGHT, &r_readProc); // Get next hit group right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order( curr_right_hit_group.insert_id); } // Since we have both left hits and right hits for this insert, // Find the best pairing and print both while (curr_left_obs_order == curr_right_obs_order && curr_left_obs_order < end_id && curr_left_obs_order != VMAXINT32) { realign_reads(curr_left_hit_group, *rt, *junctions, *rev_junctions, *insertions, *deletions, *rev_deletions, *fusions); exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group); realign_reads(curr_right_hit_group, *rt, *junctions, *rev_junctions, *insertions, *deletions, *rev_deletions, *fusions); exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group); vector > best_hits; bool paired_alignments = curr_left_hit_group.hits.size() > 0 && curr_right_hit_group.hits.size() > 0; InsertAlignmentGrade grade; bool got_left_read = false; bool got_right_read = false; if (paired_alignments) { char pair_map_flags=pair_best_alignments(curr_left_hit_group, curr_right_hit_group, grade, best_hits, *gtf_junctions, *junctions, *insertions, *deletions, *fusions, *coverage, final_report, &rng); if (pair_map_flags & 1) alnStats->num_aligned_left++; else alnStats->num_unmapped_left++; if (pair_map_flags & 2) alnStats->num_aligned_right++; else alnStats->num_unmapped_right++; if ((pair_map_flags & 3)==3) //at least one acceptable alignment was found for each read alnStats->num_aligned_pairs++; if (pair_map_flags & 4) alnStats->num_aligned_left_multi++; if (pair_map_flags & 8) alnStats->num_aligned_right_multi++; if ((pair_map_flags & 12) == 12) alnStats->num_aligned_pairs_multi++; char left_map_code=0; if (pair_map_flags & 16) { left_map_code='M'; alnStats->num_aligned_left_xmulti++; } char right_map_code=0; if (pair_map_flags & 32) { right_map_code='M'; alnStats->num_aligned_right_xmulti++; } got_left_read = left_reads_file.getRead( curr_left_obs_order, l_read, reads_format, false, begin_id, end_id, &l_readProc, (pair_map_flags & 1)==0); //left_um_out, left_map_code, &(alnStats->num_unmapped_left), //&(alnStats->num_aligned_left_xmulti)); got_right_read = right_reads_file.getRead( curr_right_obs_order, r_read, reads_format, false, begin_id, end_id, &r_readProc, (pair_map_flags & 2)==0); //right_um_out, right_map_code, &(alnStats->num_unmapped_right), //&(alnStats->num_aligned_right_xmulti)); //FIXME: what's the best way to check here if the pair alignment is discordant? 
//if (((pair_map_flags & 3)==3) && (best_hits.size() <= 0 || grade.fusion)) { if (((pair_map_flags & 3)==3) && !grade.concordant()) { alnStats->num_aligned_pairs_disc++; } if (report_mixed_alignments) { if (best_hits.size() <= 0 || (grade.fusion && !fusion_search && !report_discordant_pair_alignments)) paired_alignments = false; } } if (paired_alignments) { HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order; HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order; for (size_t i = 0; i < best_hits.size(); ++i) { best_left_hit_group.hits.push_back(best_hits[i].first); best_right_hit_group.hits.push_back(best_hits[i].second); } if (best_hits.size() > 0) { /* bool got_left_read = left_reads_file.getRead( best_hits[0].first.insert_id(), l_read, reads_format, false, begin_id, end_id, left_um_out, 0, &(alnStats->num_unmapped_left)); bool got_right_read = right_reads_file.getRead( best_hits[0].first.insert_id(), r_read, reads_format, false, begin_id, end_id, right_um_out, 0, &(alnStats->num_unmapped_right)); */ if (got_left_read && got_right_read) { update_junctions(best_left_hit_group, *final_junctions); update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions); update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions); update_junctions(best_right_hit_group, *final_junctions); update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions); update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions); pair_support(best_hits, *final_fusions, *fusions); print_sam_for_pair(*rt, best_hits, grade, bam_writer, l_read.alt_name, r_read.alt_name, rng, begin_id, end_id); } else { fprintf(stderr, "Warning: couldn't get reads for pair #%ld (%d, %d)\n", curr_left_obs_order, int(got_left_read), int(got_right_read)); } } } else { //alignments not paired properly if (curr_left_hit_group.hits.size() > 0) { write_singleton_alignments(curr_left_obs_order, curr_left_hit_group, left_reads_file, bam_writer, //*left_um_out, is_paired ? FRAG_LEFT : FRAG_UNPAIRED, &l_readProc, &l_read); } if (curr_right_hit_group.hits.size() > 0) { //only right read mapped write_singleton_alignments(curr_right_obs_order, curr_right_hit_group, right_reads_file, bam_writer, //*right_um_out, FRAG_RIGHT, &r_readProc, &r_read); } } left_hs.next_read_hits(curr_left_hit_group); curr_left_obs_order = it.observation_order( curr_left_hit_group.insert_id); right_hs.next_read_hits(curr_right_hit_group); curr_right_obs_order = it.observation_order( curr_right_hit_group.insert_id); } //both mates have alignments } //while we still have unreported hits.. //print the remaining unmapped reads at the end of each reads' stream left_reads_file.getRead(VMAXINT32, l_read, reads_format, false, begin_id, end_id, &l_readProc); //left_um_out, 0, &(alnStats->num_unmapped_left), &(alnStats->num_aligned_left_xmulti)); if (right_reads_file.file()) right_reads_file.getRead(VMAXINT32, r_read, reads_format, false, begin_id, end_id, &r_readProc); //right_um_out, 0, &(alnStats->num_unmapped_right), &(alnStats->num_aligned_right_xmulti)); // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow, // resulting in deadlock like behavior. 
delete left_um_out; delete right_um_out; } string bam_output_fname; string sam_header_fname; string left_reads_fname; vector left_map_fnames; string right_reads_fname; vector right_map_fnames; string left_um_fname; string right_um_fname; JunctionSet* gtf_junctions; JunctionSet* junctions; JunctionSet* rev_junctions; InsertionSet* insertions; DeletionSet* deletions; DeletionSet* rev_deletions; FusionSet* fusions; Coverage* coverage; JunctionSet* final_junctions; InsertionSet* final_insertions; DeletionSet* final_deletions; FusionSet* final_fusions; RefSequenceTable* rt; uint64_t begin_id; uint64_t end_id; int64_t left_reads_offset; int64_t left_map_offset; int64_t right_reads_offset; int64_t right_map_offset; //read alignment accounting: SAlignStats* alnStats; bool is_paired; boost::mt19937 rng; }; void driver(const string& bam_output_fname, istream& ref_stream, const vector& left_map_fnames, const string& left_reads_fname, const vector& right_map_fnames, const string& right_reads_fname, FILE* junctions_out, FILE* insertions_out, FILE* deletions_out, FILE* fusions_out) { if (!parallel) num_threads = 1; RefSequenceTable rt(sam_header, true); get_seqs(ref_stream, rt, true); srandom(1); JunctionSet gtf_junctions; if (!gtf_juncs.empty()) { char splice_buf[4096]; FILE* splice_coords = fopen(gtf_juncs.c_str(), "r"); if (splice_coords) { while (fgets(splice_buf, sizeof(splice_buf), splice_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* orientation = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord || !orientation) { fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n", gtf_juncs.c_str(), buf); exit(1); } uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); bool antisense = *orientation == '-'; JunctionStats junction_stat; junction_stat.gtf_match = true; junction_stat.accepted = true; gtf_junctions.insert(make_pair(Junction(ref_id, left_coord, right_coord, antisense), junction_stat)); } } fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str()); } vector read_ids; vector > offsets; if (num_threads > 1) { vector fnames; if (right_map_fnames.size() > 0) { fnames.push_back(right_reads_fname); fnames.push_back(right_map_fnames.back()); } fnames.push_back(left_reads_fname); fnames.push_back(left_map_fnames.back()); bool enough_data = calculate_offsets(fnames, read_ids, offsets); if (!enough_data) num_threads = 1; } vector vjunctions(num_threads); vector vinsertions(num_threads); vector vdeletions(num_threads); vector vfusions(num_threads); vector vcoverages(num_threads); vector threads; for (int i = 0; i < num_threads; ++i) { ConsensusEventsWorker worker; worker.left_map_fnames = left_map_fnames; worker.right_map_fnames = right_map_fnames; worker.rt = &rt; worker.gtf_junctions = >f_junctions; worker.junctions = &vjunctions[i]; worker.insertions = &vinsertions[i]; worker.deletions = &vdeletions[i]; worker.fusions = &vfusions[i]; worker.coverage = &vcoverages[i]; worker.right_map_offset = 0; if (i == 0) { worker.begin_id = 0; worker.left_map_offset = 0; } else { size_t offsets_size = offsets[i-1].size(); worker.begin_id = read_ids[i-1]; worker.left_map_offset = offsets[i-1].back(); if (offsets_size == 4) worker.right_map_offset = 
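// The junction file loaded at the top of driver() above has one tab-separated record per
// line: reference name, left coordinate, right coordinate, strand ('+' or '-'). Minimal
// parse of one such line using the same get_token() helper; the example coordinates are
// made up for illustration.
#if 0
const char* example = "chr1\t12057\t12178\t+";
char buf[256]; strcpy(buf, example);
char* p = buf;
char* ref_name    = get_token(&p, "\t");
char* left_coord  = get_token(&p, "\t");
char* right_coord = get_token(&p, "\t");
char* orientation = get_token(&p, "\t");
bool antisense = orientation && *orientation == '-';
// Junction key built from this record: (rt.get_id(ref_name, NULL, 0),
//                                       atoi(left_coord), atoi(right_coord), antisense)
#endif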
          offsets[i-1][1];
    }
    worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();

    if (num_threads > 1 && i + 1 < num_threads)
      threads.push_back(new boost::thread(worker));
    else
      worker();
  }

  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i]->join();
    delete threads[i];
    threads[i] = NULL;
  }
  threads.clear();

  JunctionSet& junctions = vjunctions[0];
  InsertionSet& insertions = vinsertions[0];
  DeletionSet& deletions = vdeletions[0];
  FusionSet& fusions = vfusions[0];
  Coverage& coverage = vcoverages[0];
  for (int i = 1; i < num_threads; ++i) {
    merge_with(junctions, vjunctions[i]);
    vjunctions[i].clear();
    merge_with(insertions, vinsertions[i]);
    vinsertions[i].clear();
    merge_with(deletions, vdeletions[i]);
    vdeletions[i].clear();
    merge_with(fusions, vfusions[i]);
    vfusions[i].clear();
    coverage.merge_with(vcoverages[i]);
    vcoverages[i].clear();
  }

  merge_with(junctions, gtf_junctions);
  coverage.calculate_coverage();

  JunctionSet rev_junctions;
  JunctionSet::const_iterator junction_iter = junctions.begin();
  for (; junction_iter != junctions.end(); ++junction_iter) {
    const Junction& junction = junction_iter->first;
    Junction rev_junction = junction;
    rev_junction.left = junction.right;
    rev_junction.right = junction.left;
    rev_junctions[rev_junction] = junction_iter->second;
  }

  DeletionSet rev_deletions;
#if 0
  DeletionSet::const_iterator deletion_iter = deletions.begin();
  for (; deletion_iter != deletions.end(); ++deletion_iter) {
    const Deletion& deletion = deletion_iter->first;
    Deletion rev_deletion = deletion;
    rev_deletion.left = deletion.right;
    rev_deletion.right = deletion.left;
    rev_deletions[rev_deletion] = deletion_iter->second;
  }
#endif

  size_t num_unfiltered_juncs = junctions.size();
  fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);

  // Read hits, extract junctions, and toss the ones that aren't strongly enough supported.
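  // (Hedged illustration, not TopHat's actual filter_junctions() implementation,
  //  which is defined elsewhere in the source tree. A minimal filter of the kind
  //  the comment above describes would drop entries whose supporting evidence is
  //  too weak, while keeping anything confirmed by the GTF junctions, e.g.:
  //
  //    for (JunctionSet::iterator i = junctions.begin(); i != junctions.end();) {
  //      bool in_gtf = gtf_junctions.find(i->first) != gtf_junctions.end();
  //      if (!in_gtf && i->second.supporting_hits == 0)   // illustrative threshold only
  //        junctions.erase(i++);
  //      else
  //        ++i;
  //    }
  //
  //  The final-report loop further below applies the same pattern to final_junctions,
  //  erasing entries with no supporting hits or with anchors shorter than 8 bp.)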
  filter_junctions(junctions, gtf_junctions);

  //size_t num_juncs_after_filter = junctions.size();
  //fprintf(stderr, "Filtered %lu junctions\n",
  //        num_unfiltered_juncs - num_juncs_after_filter);

  /*
  size_t small_overhangs = 0;
  for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i) {
    if (i->second.accepted &&
        (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len)) {
      small_overhangs++;
    }
  }

  if (small_overhangs > 0)
    fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
  */

  JunctionSet vfinal_junctions[num_threads];
  InsertionSet vfinal_insertions[num_threads];
  DeletionSet vfinal_deletions[num_threads];
  FusionSet vfinal_fusions[num_threads];
  vector<SAlignStats> alnStats(num_threads);

  for (int i = 0; i < num_threads; ++i) {
    ReportWorker worker(&rt, &alnStats[i]);
    worker.sam_header_fname = sam_header;

    char filename[1024] = {0};
    sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
    worker.bam_output_fname = filename;
    string tmpoutdir = getFdir(worker.bam_output_fname);
    worker.left_um_fname = tmpoutdir;
    sprintf(filename, "unmapped_left_%d.bam", i);
    worker.left_um_fname += filename;

    if (right_reads_fname != "") {
      sprintf(filename, "unmapped_right_%d.bam", i);
      worker.right_um_fname = tmpoutdir;
      worker.right_um_fname += filename;
    }

    worker.left_reads_fname = left_reads_fname;
    worker.left_map_fnames = left_map_fnames;
    worker.right_reads_fname = right_reads_fname;
    worker.right_map_fnames = right_map_fnames;

    worker.gtf_junctions = &gtf_junctions;
    worker.junctions = &junctions;
    worker.rev_junctions = &rev_junctions;
    worker.insertions = &insertions;
    worker.deletions = &deletions;
    worker.rev_deletions = &rev_deletions;
    worker.fusions = &fusions;
    worker.coverage = &coverage;

    worker.final_junctions = &vfinal_junctions[i];
    worker.final_insertions = &vfinal_insertions[i];
    worker.final_deletions = &vfinal_deletions[i];
    worker.final_fusions = &vfinal_fusions[i];

    //worker.rt = &rt;
    //worker.right_reads_offset = 0;
    //worker.right_map_offset = 0;

    /*if (i == 0) {
      worker.begin_id = 0;
      worker.left_reads_offset = 0;
      worker.left_map_offset = 0;
    } else */
    if (i != 0) {
      size_t offsets_size = offsets[i-1].size();
      worker.begin_id = read_ids[i-1];
      worker.left_reads_offset = offsets[i-1][offsets_size - 2];
      worker.left_map_offset = offsets[i-1].back();
      if (offsets_size == 4) {
        worker.right_reads_offset = offsets[i-1][0];
        worker.right_map_offset = offsets[i-1][1];
      }
    }
    worker.end_id = (i+1 < num_threads) ?
read_ids[i] : std::numeric_limits::max(); if (num_threads > 1 && i + 1 < num_threads) threads.push_back(new boost::thread(worker)); else worker(); } //for each thread for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); delete threads[i]; threads[i] = NULL; } threads.clear(); JunctionSet& final_junctions = vfinal_junctions[0]; InsertionSet& final_insertions = vfinal_insertions[0]; DeletionSet& final_deletions = vfinal_deletions[0]; FusionSet& final_fusions = vfinal_fusions[0]; for (int i = 1; i < num_threads; ++i) { alnStats[0].add(alnStats[i]); //merge alignment stats merge_with(final_junctions, vfinal_junctions[i]); vfinal_junctions[i].clear(); merge_with(final_insertions, vfinal_insertions[i]); vfinal_insertions[i].clear(); merge_with(final_deletions, vfinal_deletions[i]); vfinal_deletions[i].clear(); merge_with(final_fusions, vfinal_fusions[i]); vfinal_fusions[i].clear(); } //small_overhangs = 0; for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();) { if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8) { final_junctions.erase(i++); } else { ++i; } } // if (small_overhangs > 0) // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs); print_alnStats(alnStats[0]); fprintf (stderr, "Printing junction BED track..."); print_junctions(junctions_out, final_junctions, rt); fprintf (stderr, "done\n"); fprintf (stderr, "Printing insertions..."); print_insertions(insertions_out, final_insertions,rt); fclose(insertions_out); fprintf (stderr, "done\n"); fprintf (stderr, "Printing deletions..."); print_deletions(deletions_out, final_deletions, rt); fclose(deletions_out); fprintf (stderr, "done\n"); if (fusion_search) { fprintf (stderr, "Printing fusions..."); print_fusions(fusions_out, final_fusions, rt); fclose(fusions_out); fprintf (stderr, "done\n"); } fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size()); } void print_usage() { fprintf(stderr, "Usage: tophat_reports [right_map1,...,right_mapN] [right_reads.fq]\n"); // fprintf(stderr, "Usage: tophat_reports [splice_map1.sbwtout]\n"); } int main(int argc, char** argv) { fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------------------\n"); reads_format = FASTQ; int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } string ref_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string junctions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string insertions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string deletions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string fusions_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string accepted_hits_file_name = argv[optind++]; if(optind >= argc) { print_usage(); return 1; } string left_map_filename_list = argv[optind++]; vector left_map_filenames; tokenize(left_map_filename_list, ",", left_map_filenames); if(optind >= argc) { print_usage(); return 1; } string left_reads_filename = argv[optind++]; string unzcmd=getUnpackCmd(left_reads_filename, false); string right_map_filename_list; vector right_map_filenames; string right_reads_filename; if (optind < argc) { right_map_filename_list = argv[optind++]; tokenize(right_map_filename_list, ",", 
right_map_filenames); if(optind >= argc) { print_usage(); return 1; } right_reads_filename=argv[optind++]; } ifstream ref_stream(ref_file_name.c_str(), ifstream::in); if (!ref_stream.good()) { fprintf(stderr, "Error: cannot open %s for reading\n", ref_file_name.c_str()); exit(1); } FILE* junctions_file = fopen(junctions_file_name.c_str(), "w"); if (junctions_file == NULL) { fprintf(stderr, "Error: cannot open BED file %s for writing\n", junctions_file_name.c_str()); exit(1); } FILE* insertions_file = fopen(insertions_file_name.c_str(), "w"); if (insertions_file == NULL) { fprintf(stderr, "Error: cannot open VCF file %s for writing\n", insertions_file_name.c_str()); exit(1); } FILE* deletions_file = fopen(deletions_file_name.c_str(), "w"); if (deletions_file == NULL) { fprintf(stderr, "Error: cannot open VCF file %s for writing\n", deletions_file_name.c_str()); exit(1); } FILE* fusions_file = NULL; if (fusion_search) { fusions_file = fopen(fusions_file_name.c_str(), "w"); if (fusions_file == NULL) { fprintf(stderr, "Error: cannot open VCF file %s for writing\n", fusions_file_name.c_str()); exit(1); } } driver(accepted_hits_file_name, ref_stream, left_map_filenames, left_reads_filename, right_map_filenames, right_reads_filename, junctions_file, insertions_file, deletions_file, fusions_file); return 0; } tophat-2.0.9/src/bam_merge_impl.cpp0000644000175000017500000000740112157116165016012 0ustar toortoor#include "bam_merge.h" #define ERR_BAM_OPEN "Error: bam_merge failed to open BAM file %s\n" bool raw_merge = false; void CBamLine::b_init(bam_header_t* header) { if (b) { char *name = bam1_qname(b); if (raw_merge) { read_id=0; return; } read_id=(uint64_t)atol(name); if (read_id<1 && header) { char* samline=bam_format1(header, b); err_die("Error: invalid read Id (must be numeric) for BAM record:\n%s\n", samline); } } } void CBamLine::b_free() { if (b!=NULL) { bam_destroy1(b); b=NULL; } } BamMerge::BamMerge(const vector& bam_fnames, vector file_offsets) : _bam_fnames(bam_fnames), _lines(less_bam(true)), _last_id(0) { if (bam_fnames.size() <= 0) return; for (size_t i = 0; i < _bam_fnames.size(); ++i) { const char* fname = _bam_fnames[i].c_str(); samfile_t* fp = samopen(fname, "rb", 0); if (fp==0) { warn_msg(ERR_BAM_OPEN, fname); exit(1); } if (bam_fnames.size() == file_offsets.size() && file_offsets[i] > 0) bgzf_seek(fp->x.bam, file_offsets[i], SEEK_SET); bam1_t* b = bam_init1(); if (samread(fp, b) > 0) { _src_files.push_back(fp); CBamLine brec(_lines.size(), b, fp->header); _lines.push(brec); } else { bam_destroy1(b); } } if (_lines.size() == 0) { warn_msg("Warning: no input BAM records found.\n"); exit(1); } } BamMerge::~BamMerge() { while (_lines.size() > 0) { CBamLine brec(_lines.top()); brec.b_free(); _lines.pop(); }; for (size_t i = 0; i < _src_files.size(); ++i) samclose(_src_files[i]); _src_files.clear(); } bool BamMerge::next_bam_lines(vector& bam_lines) { if (_lines.size() <= 0) return false; bam_lines.clear(); vector temp_bam_lines; while (_lines.size() > 0) { CBamLine brec(_lines.top()); //should have the smallest read_id assert (brec.filenum>=0 && brec.b!=NULL); if ((raw_merge || _last_id != brec.read_id) && temp_bam_lines.size() > 0) { break; } _lines.pop(); _last_id = brec.read_id; temp_bam_lines.push_back(brec); //reuse brec brec.b = bam_init1(); if (samread(_src_files[brec.filenum], brec.b)>0) { brec.b_init(_src_files[brec.filenum]->header); _lines.push(brec); } else { //no more BAM records brec.b_free(); } } if (temp_bam_lines.size() <= 0) return false; // we need to eliminate 
duplicate alignments, which can happen when using Bowtie2 // as we may often map the same read against transcriptome, genome, and novel/known splice junctions. std::sort (temp_bam_lines.begin(), temp_bam_lines.end(), less_bam()); bool sense_strand = false, antisense_strand = false; vector free_indexes(temp_bam_lines.size(), false); for (size_t i = 0; i < temp_bam_lines.size(); ++i) { bool do_write = true; CBamLine& bam_line = temp_bam_lines[i]; uint8_t* ptr = bam_aux_get(bam_line.b, "XS"); char strand = 0; if (ptr) strand = bam_aux2A(ptr); if (i > 0) { if (equal_bam()(temp_bam_lines[i-1], bam_line)) { if (strand == 0) { do_write = false; } else { if (strand == '+' && sense_strand) do_write = false; else sense_strand = true; if (strand == '-' && antisense_strand) do_write = false; else antisense_strand = true; } } else { sense_strand = false; antisense_strand = false; } } if (strand == '+') sense_strand = true; else if (strand == '-') antisense_strand = true; if (do_write) bam_lines.push_back(bam_line); else free_indexes[i] = true; } for (size_t i = 0; i < free_indexes.size(); ++i) { if (free_indexes[i]) temp_bam_lines[i].b_free(); } return bam_lines.size() > 0; } tophat-2.0.9/src/bam2fastx.cpp0000644000175000017500000002413012162605263014736 0ustar toortoor#include #include #include #include #include #include #include "bam/bam.h" #include "bam/sam.h" using namespace std; bool is_fastq=true; //default is fastq bool sam_input=false; //default is BAM bool all_reads=false; bool mapped_only=false; bool add_matenum=false; bool pairs=false; bool color=false; bool ignoreQC=false; // ignore qc fail flag 0x400 bool ignoreOQ=false; // ignore OQ tag string outfname; #define USAGE "Usage: bam2fastx [--fasta|-a|--fastq|-q] [--color] [-Q] [--sam|-s|-t]\n\ [-M|--mapped-only|-A|--all] [-o ] [-P|--paired] [-N] \n\ \nNote: By default, reads flagged as not passing quality controls are\n\ discarded; the -Q option can be used to ignore the QC flag.\n\ \nUse the -N option if the /1 and /2 suffixes should be appended to\n\ read names according to the SAM flags\n\ \nUse the -O option to ignore the OQ tag, if present, when writing quality values\n" const char *short_options = "o:ac:qstOQMAPN"; enum { OPT_FASTA = 127, OPT_FASTQ, OPT_SAM, OPT_PAIRED, OPT_MAPPED_ONLY, OPT_ALL, OPT_COLOR }; struct Read { string name; int mate; string seq; string qual; void clear() { name.clear(); mate=0; seq.clear(); qual.clear(); } }; struct option long_options[] = { {"fasta", no_argument, 0, OPT_FASTA}, {"fastq", no_argument, 0, OPT_FASTQ}, {"sam", no_argument, 0, OPT_SAM}, {"paired", no_argument, 0, OPT_PAIRED}, {"mapped-only", no_argument, 0, OPT_MAPPED_ONLY}, {"all", no_argument, 0, OPT_ALL}, {"color", no_argument, 0, OPT_COLOR}, {0, 0, 0, 0} // terminator }; int parse_options(int argc, char** argv) { int option_index = 0; int next_option; do { next_option = getopt_long(argc, argv, short_options, long_options, &option_index); switch (next_option) { case -1: break; case 'a': case OPT_FASTA: is_fastq = false; break; case 'q': case OPT_FASTQ: is_fastq = true; break; case 's': case 't': case OPT_SAM: //sam (text) input sam_input = true; break; case 'M': case OPT_MAPPED_ONLY: mapped_only = true; break; case 'A': case OPT_ALL: all_reads = true; break; case OPT_COLOR: color = true; break; case 'P': case OPT_PAIRED: pairs = true; break; case 'Q': ignoreQC = true; break; case 'O': ignoreOQ = true; break; case 'o': outfname=optarg; break; case 'N': add_matenum=true; break; default: return 1; } } while(next_option != -1); if (all_reads && 
mapped_only) { fprintf(stderr, "Error: incompatible options !\n"); exit(2); } return 0; } void getRead(const bam1_t *b, samfile_t* fp, Read& rd) { static const int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; rd.clear(); char *name = bam1_qname(b); rd.name=name; unsigned char *qual = NULL; unsigned char *s = (unsigned char*)bam1_seq(b); int i; if ((b->core.flag & BAM_FQCFAIL) && !ignoreQC) return; bool ismapped=((b->core.flag & BAM_FUNMAP) == 0); if (ismapped && !(all_reads || mapped_only)) return; if (mapped_only && !ismapped) return; bool isreversed=((b->core.flag & BAM_FREVERSE) != 0); // bool is_paired = ((b->core.flag & BAM_FPAIRED) != 0); //if (add_matenum) { if (b->core.flag & BAM_FREAD1) rd.mate=1; else if (b->core.flag & BAM_FREAD2) rd.mate=2; // } int seqlen = b->core.l_qseq; if (seqlen>0) { rd.seq.resize(seqlen); for(i=0;i0 ? rd.mate : 4); if (is_fastq) { if (rd.mate && add_matenum) { fprintf(fout, "@%s/%d\n%s\n",rd.name.c_str(), rd.mate, rd.seq.c_str()); } else { fprintf(fout, "@%s\n%s\n",rd.name.c_str(), rd.seq.c_str()); } fprintf(fout, "+\n%s\n",rd.qual.c_str()); } else { if (rd.mate && add_matenum) { fprintf(fout, ">%s/%d\n%s\n",rd.name.c_str(), rd.mate, rd.seq.c_str()); } else { fprintf(fout, ">%s\n%s\n",rd.name.c_str(), rd.seq.c_str()); } } } void writePaired(Read& rd, int& wpair, FILE* fout, FILE* fout2) { if (rd.mate==1) { writeRead(rd, wpair, fout); } else if (rd.mate==2) { writeRead(rd, wpair, fout2); } else { fprintf(stderr, "Error: unpaired read encountered (%s)\n", rd.name.c_str()); exit(1); } } void err_order(string& last) { fprintf(stderr, "Error: couldn't retrieve both reads for pair %s. " "Perhaps the input file is not sorted by name?\n" "(using 'samtools sort -n' might fix this)\n", last.c_str()); exit(1); } string getFBase(const string& s, string& ext, string& pocmd) { string fbase(s); ext=""; pocmd=""; if (s.empty() || s=="-") return fbase; //size_t slen=s.length(); size_t p=s.rfind('.'); size_t d=s.rfind('/'); if (p==string::npos || (d!=string::npos && p2) fext=fext.substr(0,2); for(size_t i=0; i!=fext.length(); i++) fext[i] = std::tolower(fext[i]); if (fext=="gz" || fext=="z") pocmd="gzip -c "; else if (fext=="bz") pocmd="bzip2 -c "; if (!pocmd.empty()) { p=fbase.rfind('.'); d=fbase.rfind('/'); if (p==string::npos || (d!=string::npos && p=argc) { fprintf(stderr, USAGE); return -1; } fname=argv[optind++]; if (fname==NULL || fname[0]==0) { fprintf(stderr, USAGE); return 1; } if (sam_input) fp = samopen(fname, "r", 0); else fp = samopen(fname, "rb", 0); if (fp == 0) { fprintf(stderr, "Error: bam2fastx failed to open BAM file %s\n", fname); return 1; } FILE* fout=stdout; FILE* fout2=NULL; if (pairs && outfname.empty()) { fprintf(stderr, "Error: paired output (-P) requires the -o option.\n"); return 1; } if (!outfname.empty()) { string fext; string pocmd; string fbase=getFBase(outfname, fext, pocmd); if (pairs) { outfname=fbase+".1."+fext; string out2=fbase+".2."+fext; if (!pocmd.empty()) { out2=pocmd+">"+out2; fout2=popen(out2.c_str(),"w"); } else { fout2=fopen(out2.c_str(),"w"); } if (fout2==NULL) { fprintf(stderr, "Error opening file stream: %s\n", out2.c_str()); return 1; } } string out1(outfname); if (!pocmd.empty()) { out1=pocmd+">"+out1; fout=popen(out1.c_str(),"w"); use_pclose=true; } else { fout=fopen(out1.c_str(),"w"); } if (fout==NULL) { fprintf(stderr, "Error opening file stream: %s\n", out1.c_str()); return 1; } } bam1_t *b = bam_init1(); Read rd; //bool write_mapped=(all_reads || mapped_only); string last; int 
wpair=0; //writing pair status bitmask (bit 1 set mate 1 was written, // bit 2 set if mate 2 was written, bit 3 set if unpaired read was written) while (samread(fp, b) >= 0) { getRead(b, fp, rd); if (rd.seq.empty()) continue; //skip secondary alignments with no sequence int pstatus=rd.mate; if (rd.mate==0) pstatus=4; if (last!=rd.name) { if (pairs && !last.empty() && wpair!=3) err_order(last); wpair=0; last=rd.name; } if ( (pstatus & wpair)==0) { if (pairs) { writePaired(rd, wpair, fout, fout2); } //paired else { //single reads writeRead(rd, wpair, fout); } } //new pair } if (fout!=stdout) { if (use_pclose) pclose(fout); else fclose(fout); } if (fout2) { if (use_pclose) pclose(fout2); else fclose(fout2); } bam_destroy1(b); samclose(fp); return 0; } tophat-2.0.9/src/wiggles.h0000644000175000017500000000060612122334362014152 0ustar toortoor#ifndef WIGGLES_H #define WIGGLES_H /* * wiggles.h * TopHat * * Created by Cole Trapnell on 12/12/08. * Copyright 2008 Cole Trapnell. All rights reserved. * */ #include "common.h" #include "bwt_map.h" void print_wiggle_header(FILE* coverage_out); void print_wiggle_for_ref(FILE* coverage_out, const string& ref_name, const vector& DoC); #endif tophat-2.0.9/src/alphabet.h0000644000175000017500000000571312122334361014274 0ustar toortoor#ifndef ALPHABETS_H_ #define ALPHABETS_H_ #include #include #include #include #include "assert_helpers.h" using namespace std; using namespace seqan; /** * Helper function to print a uint32_t as a DNA string where each 2-bit * stretch is a character and more significiant bits appear to the left * of less singificant bits. */ static inline std::string u32ToDna(uint32_t a, int len) { char buf[17]; // TODO: return a new string; by value I guess assert_leq(len, 16); for(int i = 0; i < len; i++) { buf[len-i-1] = "ACGT"[a & 3]; a >>= 2; } buf[len] = '\0'; return std::string(buf); } /** * Return a new TStr containing the reverse-complement of s. */ template static inline TStr reverseComplement(const TStr& s) { typedef typename Value::Type TVal; TStr s_rc; size_t slen = length(s); resize(s_rc, slen); for(size_t i = 0; i < slen; i++) { int sv = (int)s[slen-i-1]; if(sv == 4) { s_rc[i] = (TVal)4; } else { s_rc[i] = (TVal)(sv ^ 3); } } return s_rc; } /// Reverse a string in-place template static inline void reverseInPlace(TStr& s) { typedef typename Value::Type TVal; size_t len = length(s); for(size_t i = 0; i < (len>>1); i++) { TVal tmp = s[i]; s[i] = s[len-i-1]; s[len-i-1] = tmp; } } /** * Return the reverse-complement of s. */ template static inline TStr reverseCopy(const TStr& s) { typedef typename Value::Type TVal; TStr s_rc; size_t slen = length(s); resize(s_rc, slen); for(size_t i = 0; i < slen; i++) { s_rc[i] = (TVal)((int)s[slen-i-1]); } return s_rc; } /** * Return the reverse-complement of s. */ static inline bool isReverseComplement(const String& s1, const String& s2) { if(length(s1) != length(s2)) return false; size_t slen = length(s1); for(size_t i = 0; i < slen; i++) { int i1 = (int)s1[i]; int i2 = (int)s2[slen - i - 1]; if(i1 == 4) { if(i2 != 4) return false; } else if(i1 != (i2 ^ 3)) return false; } return true; } /** * Return true iff the first string is dollar-less-than the second. * This means that we pretend that a 'dollar sign' character, * lexicographically larger than all other characters, exists at the * end of both strings. 
*/ template static inline bool dollarLt(const TStr& l, const TStr& r) { return isPrefix(r, l) || (l < r && !isPrefix(l, r)); } /** * Return true iff the first string is dollar-greater-than the second. * This means that we pretend that a 'dollar sign' character, * lexicographically larger than all other characters, exists at the * end of both strings. */ template static inline bool dollarGt(const TStr& l, const TStr& r) { return !dollarLt(l, r); } extern uint8_t dna4Cat[]; extern uint8_t charToDna5[]; extern uint8_t rcCharToDna5[]; extern uint8_t dna4Cat[]; extern uint8_t charToDna5[]; extern uint8_t rcCharToDna5[]; #endif /*ALPHABETS_H_*/ tophat-2.0.9/src/bam_merge.h0000644000175000017500000000631512122334360014430 0ustar toortoor#ifndef BAM_MERGE_H #define BAM_MERGE_H #include #include #include #include #include #include "common.h" using namespace std; extern bool raw_merge; struct CBamLine { int filenum; uint64_t read_id; bam1_t* b; CBamLine(int fno=-1, bam1_t* br=NULL, bam_header_t* header = NULL) : filenum(fno), read_id(0), b(br) { b_init(header); } void b_init(bam_header_t* header = NULL); void b_free(); }; struct equal_bam { bool operator() (const CBamLine& first, const CBamLine& second) const { if (raw_merge) return false; if (first.read_id != second.read_id) return false; if (first.b->core.tid != second.b->core.tid) return false; if (first.b->core.pos != second.b->core.pos) return false; if (first.b->core.n_cigar != second.b->core.n_cigar) return false; for (int i = 0; i < first.b->core.n_cigar; ++i){ if (bam1_cigar(first.b)[i] != bam1_cigar(second.b)[i]) return false; } // for fusion alignments, two alignments are always not equal if (bam_aux_get(first.b, "XF") || bam_aux_get(second.b, "XF")) return false; return true; } }; struct less_bam { bool rev_cmp; //reverse the comparison less_bam(bool reverse_cmp = false) { rev_cmp = reverse_cmp; } bool operator() (const CBamLine& f, const CBamLine& s) const { if (raw_merge) return false; const CBamLine* first = &f; const CBamLine* second = &s; if (rev_cmp) { first = &s; second = &f; } if (first->read_id != second->read_id) return first->read_id < second->read_id; if (first->b->core.tid != second->b->core.tid) return first->b->core.tid < second->b->core.tid; if (first->b->core.pos != second->b->core.pos) return first->b->core.pos < second->b->core.pos; if (first->b->core.n_cigar != second->b->core.n_cigar) return first->b->core.n_cigar < second->b->core.n_cigar; for (int i = 0; i < first->b->core.n_cigar; ++i){ if (bam1_cigar(first->b)[i] != bam1_cigar(second->b)[i]) return bam1_cigar(first->b)[i] < bam1_cigar(second->b)[i]; } // prefer a record with XS attribute char strand1 = 0, strand2 = 0; uint8_t* ptr = bam_aux_get(first->b, "XS"); if (ptr) strand1 = bam_aux2A(ptr); ptr = bam_aux_get(second->b, "XS"); if (ptr) strand2 = bam_aux2A(ptr); if (strand1 != strand2) { if (strand1 == '+' || strand2 == 0) return true; else return false; } // prefer more aux fields. 
if (bowtie2) { if (first->b->data_len != second->b->data_len) return first->b->data_len > second->b->data_len; } return false; } }; class BamMerge { public: BamMerge(const vector& bam_fnames, vector file_offsets = vector()); ~BamMerge(); public: bool next_bam_lines(vector& bam_lines); bam_header_t* get_sam_header() { if (_src_files.size() > 0) return _src_files[0]->header; else return NULL; } private: vector _bam_fnames; priority_queue, less_bam> _lines; vector _src_files; //array of SAM file handles uint64_t _last_id; }; #endif tophat-2.0.9/src/bam_merge.cpp0000644000175000017500000000222512122334360014757 0ustar toortoor#include "bam_merge.h" #define USAGE "Usage: bam_merge [-Q] [...]\n" void print_usage() { fprintf(stderr, USAGE); } void write_bam_lines(GBamWriter& bw, vector& bam_lines) { for (size_t i = 0; i < bam_lines.size(); ++i) { CBamLine& bam_line = bam_lines[i]; bw.write(bam_line.b, bam_line.read_id); bam_line.b_free(); } bam_lines.clear(); } int main(int argc, char *argv[]) { int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if (quals) raw_merge=true; //hijack the -Q flag for this specific merging option char* outfname=NULL; if (argc-optind<3) { print_usage(); if (argc>1) warn_msg("Error: only %d arguments given.\n", argc-1); return -1; } outfname=argv[optind]; vector bam_fnames; for (int i=optind+1;i bam_lines; while (bamMerge.next_bam_lines(bam_lines)) { write_bam_lines(bamwriter, bam_lines); } return 0; } tophat-2.0.9/src/juncs_db.cpp0000644000175000017500000004516212122334361014640 0ustar toortoor/* * juncs_db.cpp * TopHat * * Created by Cole Trapnell on 12/12/08. * Copyright 2008 Cole Trapnell. All rights reserved. * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "tokenize.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" using namespace std; using namespace seqan; using std::set; // Length of the outer dimension of a single insert from the paired-end library static int read_length = -1; void print_usage() { fprintf(stderr, "Usage: juncs_db \n"); } typedef vector Mapped; bool splice_junc_lt(const pair& lhs, const pair& rhs) { if (lhs.first < rhs.first) return true; else return lhs.second < rhs.second; } /** * Given an insertion set, this code will print FASTA entries * for the surrounding sequence. The names of the FASTA entries * contain information about the exact location and nature of the * insertion. The entry is generally of the form * ||-||ins|<[fwd|rev]> */ template void print_insertion(const Insertion& insertion, int read_len, TStr& ref_str, const string& ref_name, ostream& splice_db) { int half_splice_len = read_len - min_anchor_len; size_t left_start, right_start; size_t left_end, right_end; size_t ref_len = length(ref_str); if (insertion.left >= 0 && insertion.left <= ref_len) { left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0; left_end = left_start + half_splice_len; right_start = (int)left_end; right_end = right_start + half_splice_len < ref_len ? 
right_start + half_splice_len : ref_len; if (left_start < left_end && left_end <= ref_len && right_start < right_end && right_end <= ref_len) { Infix::Type left_splice = infix(ref_str, left_start, left_end); Infix::Type right_splice = infix(ref_str, right_start, right_end); splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence << "|" << right_end << "|ins|" << ("fwd") << endl; splice_db << left_splice << insertion.sequence << right_splice << endl; } } } template void print_splice(const Junction& junction, int read_len, const string& tag, TStr& ref_str, const string& ref_name, ostream& splice_db) { // daehwan - this is tentative, let's think about this more :) // int half_splice_len = read_len - min_anchor_len; int half_splice_len = read_len; size_t left_start, right_start; size_t left_end, right_end; size_t ref_len = length(ref_str); if (junction.left >= 0 && junction.left <= ref_len && junction.right >= 0 && junction.right <= ref_len) { left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0; left_end = left_start + half_splice_len; right_start = junction.right; right_end = right_start + half_splice_len < ref_len ? right_start + half_splice_len : ref_len; if (left_start < left_end && left_end <= ref_len && right_start < right_end && right_end <= ref_len) { Infix::Type left_splice = infix(ref_str, left_start, left_end); Infix::Type right_splice = infix(ref_str, right_start, right_end); splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left << "-" << junction.right << "|" << right_end << "|" << tag << endl; splice_db << left_splice << right_splice << endl; } } } template void print_fusion(const Fusion& fusion, int read_len, TStr& left_ref_str, TStr& right_ref_str, const char* left_ref_name, const char* right_ref_name, ostream& fusion_db) { int half_splice_len = read_len - min_anchor_len; size_t left_start, right_start; size_t left_end, right_end; size_t left_ref_len = length(left_ref_str); size_t right_ref_len = length(right_ref_str); if (fusion.left >= 0 && fusion.left < left_ref_len && fusion.right >= 0 && fusion.right < right_ref_len) { if (fusion.dir == FUSION_FF || fusion.dir == FUSION_FR) { left_start = fusion.left + 1 >= (size_t)half_splice_len ? fusion.left - half_splice_len + 1 : 0; left_end = left_start + half_splice_len; } else { left_start = fusion.left; left_end = left_start + half_splice_len < left_ref_len ? left_start + half_splice_len : left_ref_len; } if (fusion.dir == FUSION_FF || fusion.dir == FUSION_RF) { right_start = fusion.right; right_end = right_start + half_splice_len < right_ref_len ? right_start + half_splice_len : right_ref_len; } else { right_end = fusion.right + 1; right_start = right_end >= (size_t)half_splice_len ? 
right_end - half_splice_len : 0; } if (left_start < left_end && left_end <= left_ref_len && right_start < right_end && right_end <= right_ref_len) { seqan::Dna5String left_splice = infix(left_ref_str, left_start, left_end); seqan::Dna5String right_splice = infix(right_ref_str, right_start, right_end); if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR) { seqan::reverseComplement(left_splice); left_start = left_end - 1; } if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR) { seqan::reverseComplement(right_splice); right_end = right_start - 1; } const char* dir = "ff"; if (fusion.dir == FUSION_FR) dir = "fr"; else if (fusion.dir == FUSION_RF) dir = "rf"; else if (fusion.dir == FUSION_RR) dir = "rr"; fusion_db << ">" << left_ref_name << "-" << right_ref_name << "|" << left_start << "|" << fusion.left << "-" << fusion.right << "|" << right_end << "|fus|" << dir << endl; fusion_db << left_splice << right_splice << endl; } } } /** * Parse an int out of optarg and enforce that it be at least 'lower'; * if it is less than 'lower', than output the given error message and * exit with an error and a usage message. */ static int parse_oInt(int lower, char* arg, const char *errmsg) { long l; char *endPtr= NULL; l = strtol(arg, &endPtr, 10); if (endPtr != NULL) { if (l < lower) { cerr << errmsg << endl; print_usage(); exit(1); } return (int32_t)l; } cerr << errmsg << endl; print_usage(); exit(1); return -1; } // //int parse_options(int argc, char** argv) //{ // int option_index = 0; // int next_option; // do { // next_option = getopt_long(argc, argv, short_options, long_options, &option_index); // switch (next_option) { // case 'v': // verbose = true; // break; // case -1: /* Done with options. */ // break; // default: // print_usage(); // return 1; // } // } while(next_option != -1); // // return 0; //} void get_seqs(istream& ref_stream, RefSequenceTable& rt, bool keep_seqs = true) { while(ref_stream.good() && !ref_stream.eof()) { RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence(); string name; readMeta(ref_stream, name, Fasta()); string::size_type space_pos = name.find_first_of(" \t\r"); if (space_pos != string::npos) { name.resize(space_pos); } fprintf(stderr, "\tLoading %s...", name.c_str()); seqan::read(ref_stream, *ref_str, Fasta()); fprintf(stderr, "done\n"); rt.get_id(name, keep_seqs ? 
ref_str : NULL, 0); if (!keep_seqs) delete ref_str; } } void driver(const vector& splice_coords_files, const vector& insertion_coords_files, const vector& deletion_coords_files, const vector& fusion_coords_files, ifstream& ref_stream) { char splice_buf[2048]; RefSequenceTable rt(sam_header, true); get_seqs(ref_stream, rt, true); JunctionSet junctions; for (size_t i = 0; i < splice_coords_files.size(); ++i) { FILE* splice_coords = splice_coords_files[i]; if (!splice_coords) continue; while (fgets(splice_buf, 2048, splice_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* orientation = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord || !orientation) { fprintf(stderr,"Error: malformed splice coordinate record\n"); exit(1); } uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); bool antisense = *orientation == '-'; junctions.insert(make_pair(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats())); } } /* * Read in the deletion coordinates * and store in a set */ std::set deletions; for(size_t i=0; i < deletion_coords_files.size(); ++i){ FILE* deletion_coords = deletion_coords_files[i]; if(!deletion_coords){ continue; } while (fgets(splice_buf, 2048, deletion_coords)) { char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; /** Fields are: 1) reference name 2) left coord of splice (last char of the left exon) 3) right coord of splice (first char of the right exon) */ char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_right_coord) { fprintf(stderr,"Error: malformed deletion coordinate record\n"); exit(1); } /* * Note that when reading in a deletion, the left co-ord is the position of the * first deleted based. 
Since we are co-opting the junction data structure, need * to fix up this location */ uint32_t ref_id = rt.get_id(ref_name, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false)); } } /* * Read in the insertion coordinates * and store in a set */ std::set insertions; for(size_t i=0; i < insertion_coords_files.size(); ++i){ FILE* insertion_coords = insertion_coords_files[i]; if(!insertion_coords){ continue; } while(fgets(splice_buf, 2048, insertion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name = get_token((char**)&buf, "\t"); char* scan_left_coord = get_token((char**)&buf, "\t"); char* scan_right_coord = get_token((char**)&buf, "\t"); char* scan_sequence = get_token((char**)&buf, "\t"); if (!scan_left_coord || !scan_sequence || !scan_right_coord) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } seqan::Dna5String sequence = seqan::Dna5String(scan_sequence); bool containsN = false; for(size_t index = 0; index < seqan::length(sequence); index += 1){ /* * Don't allow any ambiguities in the insertion */ if(sequence[index] == 'N'){ containsN = true; break; } } if(containsN){ continue; } seqan::CharString charSequence = sequence; uint32_t ref_id = rt.get_id(ref_name,NULL,0); uint32_t left_coord = atoi(scan_left_coord); insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence))); } } std::set fusions; for(size_t i=0; i < fusion_coords_files.size(); ++i){ FILE* fusion_coords = fusion_coords_files[i]; if(!fusion_coords){ continue; } while(fgets(splice_buf, 2048, fusion_coords)){ char* nl = strrchr(splice_buf, '\n'); char* buf = splice_buf; if (nl) *nl = 0; char* ref_name1 = strsep((char**)&buf, "\t"); char* scan_left_coord = strsep((char**)&buf, "\t"); char* ref_name2 = strsep((char**)&buf, "\t"); char* scan_right_coord = strsep((char**)&buf, "\t"); char* scan_dir = strsep((char**)&buf, "\t"); if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir) { fprintf(stderr,"Error: malformed insertion coordinate record\n"); exit(1); } uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0); uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0); uint32_t left_coord = atoi(scan_left_coord); uint32_t right_coord = atoi(scan_right_coord); uint32_t dir = FUSION_FF; if (strcmp(scan_dir, "fr") == 0) dir = FUSION_FR; else if(strcmp(scan_dir, "rf") == 0) dir = FUSION_RF; else if(strcmp(scan_dir, "rr") == 0) dir = FUSION_RR; fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir)); } } { JunctionSet::iterator itr = junctions.begin(); for (; itr != junctions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->first.refid); print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout); } } { std::set::iterator itr = deletions.begin(); for (; itr != deletions.end(); ++itr) { RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_splice((Junction)*itr, read_length, itr->antisense ? 
"del|rev" : "del|fwd", *ref_str, name, cout); } } { std::set::iterator itr = insertions.begin(); for (; itr != insertions.end(); ++itr){ RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid); if (ref_str == NULL) continue; const char* name = rt.get_name(itr->refid); print_insertion(*itr, read_length, *ref_str, name, cout); } } { std::set::iterator itr = fusions.begin(); for (; itr != fusions.end(); ++itr){ RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1); RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2); if (left_ref_str == NULL || right_ref_str == NULL) continue; const char* left_ref_name = rt.get_name(itr->refid1); const char* right_ref_name = rt.get_name(itr->refid2); print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout); } } } int main(int argc, char** argv) { fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------\n"); int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 1; } min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3"); if(optind >= argc) { print_usage(); return 1; } read_length = parse_oInt(4, argv[optind++], "read length must be at least 4"); if(optind >= argc) { print_usage(); return 1; } string splice_coords_file_list = argv[optind++]; vector splice_coords_file_names; vector coords_files; tokenize(splice_coords_file_list, ",", splice_coords_file_names); for (size_t s = 0; s < splice_coords_file_names.size(); ++s) { FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r"); if (!coords_file) { fprintf(stderr, "Warning: cannot open %s for reading\n", splice_coords_file_names[s].c_str()); continue; } coords_files.push_back(coords_file); } if(optind >= argc) { print_usage(); return 1; } /* * Read in the insertion co-ordinates */ string insertion_coords_file_list = argv[optind++]; vector insertion_coords_file_names; vector insertion_coords_files; tokenize(insertion_coords_file_list, ",", insertion_coords_file_names); for(size_t s = 0; s < insertion_coords_file_names.size(); ++s) { FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r"); if(!insertion_coords_file) { fprintf(stderr, "Warning: cannot open %s for reading\n", insertion_coords_file_names[s].c_str()); continue; } insertion_coords_files.push_back(insertion_coords_file); } if(optind >= argc) { print_usage(); return 1; } /* * Read in the deletion co-ordinates */ string deletion_coords_file_list = argv[optind++]; vector deletion_coords_file_names; vector deletion_coords_files; tokenize(deletion_coords_file_list, ",", deletion_coords_file_names); for(size_t s = 0; s < deletion_coords_file_names.size(); ++s) { FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r"); if(!deletion_coords_file) { fprintf(stderr, "Warning: cannot open %s for reading\n", deletion_coords_file_names[s].c_str()); continue; } deletion_coords_files.push_back(deletion_coords_file); } if(optind >= argc) { print_usage(); return 1; } /* */ string fusion_coords_file_list = argv[optind++]; vector fusion_coords_file_names; vector fusion_coords_files; tokenize(fusion_coords_file_list, ",", fusion_coords_file_names); for(size_t s = 0; s < fusion_coords_file_names.size(); ++s) { FILE* fusion_coords_file = fopen(fusion_coords_file_names[s].c_str(),"r"); if(!fusion_coords_file) { fprintf(stderr, "Warning: cannot open %s for reading\n", 
fusion_coords_file_names[s].c_str()); continue; } fusion_coords_files.push_back(fusion_coords_file); } if(optind >= argc) { print_usage(); return 1; } string ref_file_name = argv[optind++]; ifstream ref_stream(ref_file_name.c_str()); if (!ref_stream.good()) { fprintf(stderr, "Error: cannot open %s for reading\n", ref_file_name.c_str()); exit(1); } driver(coords_files, insertion_coords_files, deletion_coords_files, fusion_coords_files, ref_stream); return 0; } tophat-2.0.9/src/inserts.h0000644000175000017500000001642412162605263014212 0ustar toortoor#ifndef INSERTS_H #define INSERTS_H /* * inserts.h * TopHat * * Created by Cole Trapnell on 1/14/09. * Copyright 2009 Cole Trapnell. All rights reserved. * */ #include "bwt_map.h" #include "junctions.h" struct InsertAlignment { InsertAlignment(uint64_t _refid, BowtieHit* _left_alignment, BowtieHit* _right_alignment) : refid(_refid), left_alignment(_left_alignment), right_alignment(_right_alignment) {} uint64_t refid; BowtieHit* left_alignment; BowtieHit* right_alignment; }; pair pair_distances(const BowtieHit& h1, const BowtieHit& h2); bool gap_lt(const pair& lhs, const pair& rhs); struct InsertAlignmentGrade { InsertAlignmentGrade() : too_close(false), too_far(false), num_spliced(0), num_mapped(0), opposite_strands(false), consistent_splices(false), longest_ref_skip(0x7FFFFu), edit_dist(0x1F), fusion(true), inner_dist(99999999), alignment_score(std::numeric_limits::min()) {} InsertAlignmentGrade(const BowtieHit& h1, bool fusion = false) : too_close(false), too_far(false), num_spliced(0), num_mapped(0), opposite_strands(false), consistent_splices(false), edit_dist(0x1F), fusion(fusion), num_alignments(0), inner_dist(99999999), alignment_score(std::numeric_limits::min()) { if (!h1.contiguous()) num_spliced++; num_mapped = 1; longest_ref_skip = min(0x3FFFFu, (unsigned int)get_longest_ref_skip(h1) / 100); edit_dist = h1.edit_dist(); num_alignments = 1; } InsertAlignmentGrade(const BowtieHit& h1, const BowtieHit& h2, const JunctionSet& junctions, bool fusion = false) : too_close(false), too_far(false), num_spliced(0), num_mapped(0), opposite_strands(false), consistent_splices(false), edit_dist(0x1F), fusion(fusion), num_alignments(0), alignment_score(std::numeric_limits::min()) { int min_inner_distance = inner_dist_mean - inner_dist_std_dev; int max_inner_distance = inner_dist_mean + inner_dist_std_dev; pair distances = pair_distances(h1, h2); inner_dist = distances.second; num_mapped = 2; if (!h1.contiguous()) num_spliced++; if (!h2.contiguous()) num_spliced++; too_far = (inner_dist > max_inner_distance); too_close = (inner_dist < min_inner_distance); opposite_strands = (h1.antisense_align() != h2.antisense_align()); consistent_splices = (num_spliced == 2 && h1.antisense_splice() == h2.antisense_splice()); uint32_t ls = max(get_longest_ref_skip(h1), get_longest_ref_skip(h2)); longest_ref_skip = min (ls / 100, 0x3FFFFu); edit_dist = h1.edit_dist() + h2.edit_dist(); num_alignments = 1; assert(!(too_far && too_close)); if (too_far && !fusion) { int inner1, inner2; if (h1.left() < h2.left()) inner1 = h1.right(), inner2 = h2.left(); else inner1 = h2.right(), inner2 = h1.left(); JunctionSet::const_iterator lb, ub; lb = junctions.upper_bound(Junction(h1.ref_id(), inner1, inner1, true)); ub = junctions.lower_bound(Junction(h1.ref_id(), inner2, inner2, false)); while (lb != ub && lb != junctions.end()) { const Junction& junction = lb->first; const JunctionStats& junction_stat = lb->second; if (inner1 <= (int)junction.left && inner2 >= (int)junction.right && 
junction_stat.supporting_hits >= 10) { int temp_dist = junction.left - inner1 + inner2 - junction.right; if (temp_dist >= min_inner_distance && temp_dist <= max_inner_distance) { // daehwan - for debugging purposes /* fprintf(stderr, "read id: %d\t %s %d-%d %d-%d %s\t %d->%d) \n", h1.insert_id(), print_cigar(h1.cigar()).c_str(), h1.left(), h1.right(), h2.left(), h2.right(), print_cigar(h2.cigar()).c_str(), inner_dist, temp_dist); */ inner_dist = temp_dist; too_far = false; break; } } ++lb; } } // static const int penalty_for_long_inner_dist = bowtie2_max_penalty; static const int penalty_for_discordant = bowtie2_max_penalty; alignment_score = h1.alignment_score() + h2.alignment_score(); if (!fusion) { if (too_far) { int penalty = penalty_for_long_inner_dist; if (inner_dist - max_inner_distance < inner_dist_std_dev) { penalty = penalty_for_long_inner_dist / 2; } alignment_score -= penalty; } else if (too_close) { int penalty = min(penalty_for_long_inner_dist/2, (min_inner_distance - inner_dist) / inner_dist_std_dev + 1); alignment_score -= penalty; } static const int penalty_for_same_strand = bowtie2_max_penalty; if (!opposite_strands) alignment_score -= penalty_for_same_strand; } else { if (!fusion_search) alignment_score -= penalty_for_discordant; } } InsertAlignmentGrade& operator=(const InsertAlignmentGrade& rhs) { too_close = rhs.too_close; too_far = rhs.too_far; num_spliced = rhs.num_spliced; num_mapped = rhs.num_mapped; opposite_strands = rhs.opposite_strands; consistent_splices = rhs.consistent_splices; longest_ref_skip = rhs.longest_ref_skip; edit_dist = rhs.edit_dist; fusion = rhs.fusion; num_alignments = rhs.num_alignments; inner_dist = rhs.inner_dist; alignment_score = rhs.alignment_score; return *this; } static int get_longest_ref_skip(const BowtieHit& h1) { vector > gaps; h1.gaps(gaps); if (gaps.empty()) return 0; vector >::iterator max_itr = max_element(gaps.begin(), gaps.end(), gap_lt); return abs(max_itr->second - max_itr->first); } // Returns true if rhs is a "happier" alignment for the ends of this insert // than this InsertStatus. 
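  // (Hedged usage sketch; "best_grade" and "candidate" are illustrative names,
  //  not part of this header. Because the comparison orders grades from worse
  //  to happier, callers can keep the best pairing seen so far with a plain
  //  operator< test:
  //
  //    InsertAlignmentGrade best_grade;                     // default: worst possible grade
  //    InsertAlignmentGrade candidate(h1, h2, junctions);   // grade one left/right pairing
  //    if (best_grade < candidate)                          // candidate is "happier"
  //      best_grade = candidate;
  //  )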
bool operator<(const InsertAlignmentGrade& rhs); bool happy() const { return num_mapped == 2 && opposite_strands && (num_spliced != 2 || consistent_splices) && !too_far; } bool concordant() const { return (num_mapped == 2 && opposite_strands && inner_dist<=max_report_intron_length); } int align_score() { return alignment_score; } bool is_fusion() const { return fusion; } bool too_close; bool too_far; uint8_t num_spliced; uint8_t num_mapped; bool opposite_strands; bool consistent_splices; uint32_t longest_ref_skip; // in 100s of bp unsigned char edit_dist; bool fusion; int num_alignments; // number of equally good alignments for the insert int inner_dist; // distance between inner edges of mates int alignment_score; }; typedef vector > > BestInsertAlignmentTable; void accept_valid_hits(BestInsertAlignmentTable& best_status_for_inserts); void accept_all_best_hits(BestInsertAlignmentTable& best_status_for_inserts); void best_insert_mappings(uint64_t refid, ReadTable& it, HitList& hits1_in_ref, HitList& hits2_in_ref, BestInsertAlignmentTable& best_insert_alignments, bool prefer_shorter_pairs = false); void insert_best_pairings(RefSequenceTable& rt, ReadTable& it, HitTable& hits1, HitTable& hits2, BestInsertAlignmentTable& best_pairings, bool prefer_shorter_pairs = false); #endif tophat-2.0.9/src/coverage.cpp0000755000175000017500000001521012122334361014636 0ustar toortoor/* * coverage.cpp * TopHat * * Created by Daehwan Kim on 2/11/2012 */ #ifdef HAVE_CONFIG_H #include #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include "coverage.h" Coverage::Coverage() { debug = false; } Coverage::~Coverage() { clear(); } void Coverage::add_coverage(RefID refid, int pos, int length) { GenomeCoverage::iterator itr = genomeCoverage.find(refid); if (itr == genomeCoverage.end()) { genomeCoverage[refid] = PosCoverage(); itr = genomeCoverage.find(refid); } PosCoverage::iterator itr2 = get_contig(itr->second, pos); if (itr2 == itr->second.end()) { itr->second[pos] = vector(length + 1, 0); itr2 = itr->second.find(pos); vector& contig_coverage = itr2->second; contig_coverage[0] = 1; contig_coverage[length] = -1; } else { // daehwan - remove this if (debug) { fprintf(stderr, "found2\n"); } const size_t first = pos - itr2->first; const size_t last = first + length; const size_t resize = last + 1; if (resize > itr2->second.size()) itr2->second.resize(resize, 0); if (itr2->second[first] < cov_max_value) itr2->second[first] += 1; if (itr2->second[last] > cov_min_value) itr2->second[last] -= 1; } PosCoverage::iterator itr_lower = itr2; ++itr_lower; PosCoverage::iterator itr_upper = itr->second.upper_bound(itr2->first + itr2->second.size() - 1); PosCoverage::iterator itr_temp = itr_lower; while (itr_temp != itr_upper) { merge_contig(itr2->first, itr2->second, itr_temp->first, itr_temp->second); ++itr_temp; } if (itr_lower != itr_upper) itr->second.erase(itr_lower, itr_upper); } void Coverage::merge_with(const Coverage& other) { GenomeCoverage::const_iterator other_itr = other.genomeCoverage.begin(); for (; other_itr != other.genomeCoverage.end(); ++other_itr) { GenomeCoverage::iterator itr = genomeCoverage.find(other_itr->first); if (itr == genomeCoverage.end()) { genomeCoverage[other_itr->first] = other_itr->second; continue; } PosCoverage::const_iterator other_pos_itr = other_itr->second.begin(); for (; other_pos_itr != other_itr->second.end(); ++other_pos_itr) { PosCoverage::iterator insert_itr = get_contig(itr->second, other_pos_itr->first); if (insert_itr == itr->second.end()) { 
itr->second[other_pos_itr->first] = other_pos_itr->second; insert_itr = itr->second.find(other_pos_itr->first); } else { merge_contig(insert_itr->first, insert_itr->second, other_pos_itr->first, other_pos_itr->second); } int pos = insert_itr->first; size_t length = insert_itr->second.size(); PosCoverage::iterator itr_lower = insert_itr; ++itr_lower; PosCoverage::iterator itr_upper = itr->second.upper_bound(pos + length - 1); PosCoverage::iterator itr_temp = itr_lower; while (itr_temp != itr_upper) { merge_contig(insert_itr->first, insert_itr->second, itr_temp->first, itr_temp->second); ++itr_temp; } if (itr_lower != itr_upper) itr->second.erase(itr_lower, itr_upper); } } } void Coverage::merge_contig(int pos, vector& cov, int pos2, const vector& cov2) { assert (pos <= pos2); size_t resize = cov2.size() + pos2 - pos; if (resize > cov.size()) cov.resize(resize, 0); for (size_t i = 0; i < cov2.size(); ++i) { size_t update_pos = i + pos2 - pos; int temp_value = cov[update_pos] + cov2[i]; if (temp_value > cov_max_value) cov[update_pos] = cov_max_value; else if (temp_value < cov_min_value) cov[update_pos] = cov_min_value; else cov[update_pos] = (cov_t)temp_value; } } PosCoverage::iterator Coverage::get_contig(PosCoverage& posCoverage, int pos) { PosCoverage::iterator itr_contig = posCoverage.lower_bound(pos + 1); if (itr_contig != posCoverage.begin()) { --itr_contig; if (pos >= itr_contig->first && pos < itr_contig->first + (int)itr_contig->second.size()) return itr_contig; } return posCoverage.end(); } void Coverage::calculate_coverage() { GenomeCoverage::iterator itr = genomeCoverage.begin(); for (; itr != genomeCoverage.end(); ++itr) { PosCoverage::iterator itr2 = itr->second.begin(); for (; itr2 != itr->second.end(); ++itr2) { vector& contig_coverage = itr2->second; for (size_t i = 1; i < contig_coverage.size(); ++i) { int temp_value = contig_coverage[i] + contig_coverage[i-1]; if (temp_value > cov_max_value) contig_coverage[i] = cov_max_value; else if (temp_value < 0) contig_coverage[i] = 0; else contig_coverage[i] = (cov_t)temp_value; } } } } int Coverage::get_coverage(RefID refid, int pos) const { assert (pos >= 0); int coverage = 0; GenomeCoverage::const_iterator itr = genomeCoverage.find(refid); if (itr != genomeCoverage.end()) { PosCoverage::const_iterator itr_contig = itr->second.lower_bound(pos + 1); if (itr_contig != itr->second.begin()) { --itr_contig; const vector& contig_coverage = itr_contig->second; if (pos >= itr_contig->first) { int index = pos - itr_contig->first; if (index < (int)contig_coverage.size()) coverage = contig_coverage[index]; } } } return coverage; } void Coverage::clear() { genomeCoverage.clear(); } void Coverage::print_info() const { size_t total_bases = 0; GenomeCoverage::const_iterator itr = genomeCoverage.begin(); for (; itr != genomeCoverage.end(); ++itr) { fprintf(stderr, "Reference %d\n", itr->first); size_t bases = 0; PosCoverage::const_iterator itr2 = itr->second.begin(); for (; itr2 != itr->second.end(); ++itr2) bases += (itr2->second.size() - 1); fprintf(stderr, "# of islands: %lu, # of bases covered: %lu\n", itr->second.size(), bases); total_bases += bases; } fprintf(stderr, "# of total bases: %lu\n", total_bases); itr = genomeCoverage.begin(); for (; itr != genomeCoverage.end(); ++itr) { fprintf(stderr, "Reference %d\n", itr->first); print_info(itr->second); } } void Coverage::print_info(const PosCoverage& posCoverage, int begin, int end) const { PosCoverage::const_iterator itr = posCoverage.begin(); for (; itr != posCoverage.end(); ++itr) if 
(itr->first >= begin && itr->first < end) print_info(itr->first, itr->second); } void Coverage::print_info(int pos, const vector& cov) const { fprintf(stderr, "\tPos: %d, size: %lu\n", pos, cov.size()); for (size_t i = 0; i < cov.size(); ++i) fprintf(stderr, "\t\t%d (%d)\n", cov[i], pos + (int)i); } tophat-2.0.9/src/timer.h0000644000175000017500000000234612122334362013634 0ustar toortoor#ifndef TIMER_H_ #define TIMER_H_ #include #include #include using namespace std; /** * Use time() call to keep track of elapsed time between creation and * destruction. If verbose is true, Timer will print a message showing * elapsed time to the given output stream upon destruction. */ class Timer { public: Timer(ostream& out = cout, const char *msg = "", bool verbose = true) : _t(time(0)), _out(out), _msg(msg), _verbose(verbose) { } /// Optionally print message ~Timer() { if(_verbose) write(_out); } /// Return elapsed time since Timer object was created time_t elapsed() const { return time(0) - _t; } void write(ostream& out) { time_t passed = elapsed(); // Print the message supplied at construction time followed // by time elapsed formatted HH:MM:SS unsigned int hours = (passed / 60) / 60; unsigned int minutes = (passed / 60) % 60; unsigned int seconds = (passed % 60); out << _msg << setfill ('0') << setw (2) << hours << ":" << setfill ('0') << setw (2) << minutes << ":" << setfill ('0') << setw (2) << seconds << endl; } private: time_t _t; ostream& _out; const char *_msg; bool _verbose; }; #endif /*TIMER_H_*/ tophat-2.0.9/src/Makefile.am0000644000175000017500000006340412157340452014406 0ustar toortoor#include $(top_srcdir)/build-aux/tophat.mk # Generated with # find SeqAn-1.3 -type f -print | grep -v ".svn" | sed 's/$/ \\/g' # run from src # - and tophat2.in added EXTRA_DIST = \ tophat.py \ tophat2.in \ SeqAn-1.3/COPYING \ SeqAn-1.3/README \ SeqAn-1.3/seqan/chaining/chain_generic.h \ SeqAn-1.3/seqan/chaining/rt_skip_base_element.h \ SeqAn-1.3/seqan/chaining/score_zero.h \ SeqAn-1.3/seqan/chaining/geom_distribution.h \ SeqAn-1.3/seqan/chaining/chain_base.h \ SeqAn-1.3/seqan/chaining/chaining_generated_forwards.h \ SeqAn-1.3/seqan/chaining/rmt_skip_base_element.h \ SeqAn-1.3/seqan/chaining/skip_element.h \ SeqAn-1.3/seqan/chaining/rmt_skip_element.h \ SeqAn-1.3/seqan/chaining/rt_sl_impl.h \ SeqAn-1.3/seqan/chaining/chain_wrapper_point.h \ SeqAn-1.3/seqan/chaining/rmt_compl_algos.h \ SeqAn-1.3/seqan/chaining/rmt_base.h \ SeqAn-1.3/seqan/chaining/skip_list.h \ SeqAn-1.3/seqan/chaining/skip_list_iterator.h \ SeqAn-1.3/seqan/chaining/skip_base_element.h \ SeqAn-1.3/seqan/chaining/fragment.h \ SeqAn-1.3/seqan/chaining/skip_pool_alloc.h \ SeqAn-1.3/seqan/chaining/skip_list_impl.h \ SeqAn-1.3/seqan/chaining/rt_skip_element.h \ SeqAn-1.3/seqan/chaining/tree_chain_sop.h \ SeqAn-1.3/seqan/chaining/rmt_common_algos.h \ SeqAn-1.3/seqan/chaining/score_manhattan.h \ SeqAn-1.3/seqan/chaining/score_chain_sop.h \ SeqAn-1.3/seqan/chaining/rt_impl.h \ SeqAn-1.3/seqan/chaining/tree_chain.h \ SeqAn-1.3/seqan/chaining/tree_chain_utils.h \ SeqAn-1.3/seqan/chaining/skip_list_base.h \ SeqAn-1.3/seqan/chaining/rt_sl_def_algos.h \ SeqAn-1.3/seqan/chaining/range_tree.h \ SeqAn-1.3/seqan/chaining/rmt_def_algos.h \ SeqAn-1.3/seqan/chaining/chain_point.h \ SeqAn-1.3/seqan/chaining/rt_base.h \ SeqAn-1.3/seqan/chaining/chain_meta_fragment.h \ SeqAn-1.3/seqan/chaining/score_chain.h \ SeqAn-1.3/seqan/chaining/rt_common_algos.h \ SeqAn-1.3/seqan/chaining/skip_list_dynamic.h \ SeqAn-1.3/seqan/chaining/rt_sl_base.h \ 
SeqAn-1.3/seqan/chaining/skip_list_type.h \ SeqAn-1.3/seqan/chaining/range_max_tree.h \ SeqAn-1.3/seqan/chaining/rt_sl_compl_algos.h \ SeqAn-1.3/seqan/file/file_format_cgviz.h \ SeqAn-1.3/seqan/file/stream.h \ SeqAn-1.3/seqan/file/file_format_embl.h \ SeqAn-1.3/seqan/file/file_format_fasta.h \ SeqAn-1.3/seqan/file/string_mmap.h \ SeqAn-1.3/seqan/file/file_format_raw.h \ SeqAn-1.3/seqan/file/string_external.h \ SeqAn-1.3/seqan/file/file_filereader.h \ SeqAn-1.3/seqan/file/file_cstyle.h \ SeqAn-1.3/seqan/file/file_page_raid0.h \ SeqAn-1.3/seqan/file/cstream.h \ SeqAn-1.3/seqan/file/file_format_guess.h \ SeqAn-1.3/seqan/file/file_base.h \ SeqAn-1.3/seqan/file/file_format_mmap.h \ SeqAn-1.3/seqan/file/file_forwards.h \ SeqAn-1.3/seqan/file/file_array.h \ SeqAn-1.3/seqan/file/meta.h \ SeqAn-1.3/seqan/file/stream_algorithms.h \ SeqAn-1.3/seqan/file/file_format.h \ SeqAn-1.3/seqan/file/file_filereaderiterator.h \ SeqAn-1.3/seqan/file/file_format_genbank.h \ SeqAn-1.3/seqan/file/file_generated_forwards.h \ SeqAn-1.3/seqan/file/file_format_fasta_align.h \ SeqAn-1.3/seqan/file/chunk_collector.h \ SeqAn-1.3/seqan/file/file_page.h \ SeqAn-1.3/seqan/seeds.h \ SeqAn-1.3/seqan/find_motif.h \ SeqAn-1.3/seqan/LICENSE \ SeqAn-1.3/seqan/find2/find_approx_find_begin.h \ SeqAn-1.3/seqan/find2/find_finder_default.h \ SeqAn-1.3/seqan/find2/find_exact_simple.h \ SeqAn-1.3/seqan/find2/find_multiple_exact_simple.h \ SeqAn-1.3/seqan/find2/find_hamming_simple.h \ SeqAn-1.3/seqan/find2/find_pattern_wild_shiftand.h \ SeqAn-1.3/seqan/find2/find_approx_dpsearch.h \ SeqAn-1.3/seqan/find2/find_exact_shiftand.h \ SeqAn-1.3/seqan/find2/find2_generated_forwards.h \ SeqAn-1.3/seqan/find2/find_base.h \ SeqAn-1.3/seqan/platform.h \ SeqAn-1.3/seqan/sequence_journaled.h \ SeqAn-1.3/seqan/chaining.h \ SeqAn-1.3/seqan/score/score_generated_forwards.h \ SeqAn-1.3/seqan/score/score_matrix_data.h \ SeqAn-1.3/seqan/score/score_matrix.h \ SeqAn-1.3/seqan/score/score_simple.h \ SeqAn-1.3/seqan/score/score_base.h \ SeqAn-1.3/seqan/score/score_edit.h \ SeqAn-1.3/seqan/graph_algorithms.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree_iterator.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_forwards.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree_node.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_iterator.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entry.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_sorted_array.h \ SeqAn-1.3/seqan/sequence_journaled/sequence_journaled_generated_forwards.h \ SeqAn-1.3/seqan/sequence_journaled/journal_entries_unbalanced_tree.h \ SeqAn-1.3/seqan/align/gaps_iterator_base.h \ SeqAn-1.3/seqan/align/gaps_array.h \ SeqAn-1.3/seqan/align/matrix_base.h \ SeqAn-1.3/seqan/align/gaps_base.h \ SeqAn-1.3/seqan/align/align_iterator_base.h \ SeqAn-1.3/seqan/align/align_algorithms.h \ SeqAn-1.3/seqan/align/align_hirschberg.h \ SeqAn-1.3/seqan/align/align_local_dynprog.h \ SeqAn-1.3/seqan/align/align_cols_base.h \ SeqAn-1.3/seqan/align/align_generated_forwards.h \ SeqAn-1.3/seqan/align/align_base.h \ SeqAn-1.3/seqan/align/gaps_sumlist.h \ SeqAn-1.3/seqan/align/align_local_dynprog_banded.h \ SeqAn-1.3/seqan/align/align_dynprog.h \ SeqAn-1.3/seqan/align/align_myers.h \ SeqAn-1.3/seqan/align/hirschberg_set.h \ SeqAn-1.3/seqan/align/align_trace.h \ SeqAn-1.3/seqan/pipe/pipe_sampler.h \ SeqAn-1.3/seqan/pipe/pipe_filter.h \ SeqAn-1.3/seqan/pipe/pool_sorter.h \ SeqAn-1.3/seqan/pipe/pipe_namer.h \ 
SeqAn-1.3/seqan/pipe/pipe_joiner.h \ SeqAn-1.3/seqan/pipe/pipe_base.h \ SeqAn-1.3/seqan/pipe/pipe_echoer.h \ SeqAn-1.3/seqan/pipe/pipe_caster.h \ SeqAn-1.3/seqan/pipe/pipe_generated_forwards.h \ SeqAn-1.3/seqan/pipe/pipe_counter.h \ SeqAn-1.3/seqan/pipe/pool_mapper.h \ SeqAn-1.3/seqan/pipe/pipe_edit_environment.h \ SeqAn-1.3/seqan/pipe/pool_base.h \ SeqAn-1.3/seqan/pipe/pipe_shifter.h \ SeqAn-1.3/seqan/pipe/pipe_source.h \ SeqAn-1.3/seqan/pipe/pipe_tupler.h \ SeqAn-1.3/seqan/pipe/pipe_iterator.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_vertex.h \ SeqAn-1.3/seqan/graph_types/graph_impl_oracle.h \ SeqAn-1.3/seqan/graph_types/graph_idmanager.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_adjacency.h \ SeqAn-1.3/seqan/graph_types/graph_iterator.h \ SeqAn-1.3/seqan/graph_types/graph_impl_automaton.h \ SeqAn-1.3/seqan/graph_types/graph_impl_trie.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_bfs.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_dfs.h \ SeqAn-1.3/seqan/graph_types/graph_property.h \ SeqAn-1.3/seqan/graph_types/graph_impl_fragment.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_outedge.h \ SeqAn-1.3/seqan/graph_types/graph_utility_parsing.h \ SeqAn-1.3/seqan/graph_types/graph_impl_undirected.h \ SeqAn-1.3/seqan/graph_types/graph_impl_hmm.h \ SeqAn-1.3/seqan/graph_types/graph_base.h \ SeqAn-1.3/seqan/graph_types/graph_iterator_edge.h \ SeqAn-1.3/seqan/graph_types/graph_edgestump.h \ SeqAn-1.3/seqan/graph_types/graph_impl_tree.h \ SeqAn-1.3/seqan/graph_types/graph_impl_wordgraph.h \ SeqAn-1.3/seqan/graph_types/graph_types_generated_forwards.h \ SeqAn-1.3/seqan/graph_types/graph_impl_directed.h \ SeqAn-1.3/seqan/graph_types/graph_drawing.h \ SeqAn-1.3/seqan/graph_types/graph_interface.h \ SeqAn-1.3/seqan/file.h \ SeqAn-1.3/seqan/system.h \ SeqAn-1.3/seqan/refinement.h \ SeqAn-1.3/seqan/pipe.h \ SeqAn-1.3/seqan/system/system_thread.h \ SeqAn-1.3/seqan/system/system_base.h \ SeqAn-1.3/seqan/system/system_event.h \ SeqAn-1.3/seqan/system/file_directory.h \ SeqAn-1.3/seqan/system/system_mutex.h \ SeqAn-1.3/seqan/system/file_manual_forwards.h \ SeqAn-1.3/seqan/system/system_generated_forwards.h \ SeqAn-1.3/seqan/system/file_sync.h \ SeqAn-1.3/seqan/system/file_async.h \ SeqAn-1.3/seqan/system/system_sema.h \ SeqAn-1.3/seqan/system/system_manual_forwards.h \ SeqAn-1.3/seqan/misc/edit_environment.h \ SeqAn-1.3/seqan/misc/misc_parsing.h \ SeqAn-1.3/seqan/misc/misc_generated_forwards.h \ SeqAn-1.3/seqan/misc/misc_base.h \ SeqAn-1.3/seqan/misc/priority_type_heap.h \ SeqAn-1.3/seqan/misc/priority_type_base.h \ SeqAn-1.3/seqan/misc/misc_dequeue.h \ SeqAn-1.3/seqan/misc/misc_set.h \ SeqAn-1.3/seqan/misc/misc_map.h \ SeqAn-1.3/seqan/misc/misc_svg.h \ SeqAn-1.3/seqan/misc/misc_interval_tree.h \ SeqAn-1.3/seqan/misc/misc_long_word.h \ SeqAn-1.3/seqan/misc/misc_cmdparser.h \ SeqAn-1.3/seqan/misc/misc_random.h \ SeqAn-1.3/seqan/platform/platform_mingw.h \ SeqAn-1.3/seqan/platform/platform_gcc.h \ SeqAn-1.3/seqan/platform/platform_windows.h \ SeqAn-1.3/seqan/platform/platform_solaris.h \ SeqAn-1.3/seqan/platform/platform_generated_forwards.h \ SeqAn-1.3/seqan/graph_align/graph_align_gotoh.h \ SeqAn-1.3/seqan/graph_align/graph_align_generated_forwards.h \ SeqAn-1.3/seqan/graph_align/graph_align_smith_waterman.h \ SeqAn-1.3/seqan/graph_align/graph_align_base.h \ SeqAn-1.3/seqan/graph_align/graph_align_hirschberg.h \ SeqAn-1.3/seqan/graph_align/graph_align_needleman_wunsch.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_smith_waterman_clump.h \ SeqAn-1.3/seqan/graph_align/graph_align_interface.h \ 
SeqAn-1.3/seqan/graph_align/graph_align_smith_waterman_clump.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_gotoh.h \ SeqAn-1.3/seqan/graph_align/graph_align_banded_needleman_wunsch.h \ SeqAn-1.3/seqan/graph_align/graph_align_config.h \ SeqAn-1.3/seqan/parallel/parallel_generated_forwards.h \ SeqAn-1.3/seqan/parallel/parallel_atomic_misc.h \ SeqAn-1.3/seqan/parallel/parallel_atomic_primitives.h \ SeqAn-1.3/seqan/parallel/parallel_macros.h \ SeqAn-1.3/seqan/store/store_io_gff.h \ SeqAn-1.3/seqan/store/store_library.h \ SeqAn-1.3/seqan/store/store_read.h \ SeqAn-1.3/seqan/store/store_annotation.h \ SeqAn-1.3/seqan/store/store_intervaltree.h \ SeqAn-1.3/seqan/store/store_io_ucsc.h \ SeqAn-1.3/seqan/store/store_all.h \ SeqAn-1.3/seqan/store/store_matepair.h \ SeqAn-1.3/seqan/store/store_align_intervals.h \ SeqAn-1.3/seqan/store/store_io.h \ SeqAn-1.3/seqan/store/store_io_sam.h \ SeqAn-1.3/seqan/store/store_align.h \ SeqAn-1.3/seqan/store/store_io_bam.h \ SeqAn-1.3/seqan/store/store_contig.h \ SeqAn-1.3/seqan/store/store_generated_forwards.h \ SeqAn-1.3/seqan/store/store_base.h \ SeqAn-1.3/seqan/align.h \ SeqAn-1.3/seqan/modifier/modifier_string.h \ SeqAn-1.3/seqan/modifier/modifier_alphabet_expansion.h \ SeqAn-1.3/seqan/modifier/modifier_shortcuts.h \ SeqAn-1.3/seqan/modifier/modifier_alphabet.h \ SeqAn-1.3/seqan/modifier/modifier_iterator.h \ SeqAn-1.3/seqan/modifier/modifier_reverse.h \ SeqAn-1.3/seqan/modifier/modifier_generated_forwards.h \ SeqAn-1.3/seqan/modifier/modifier_view.h \ SeqAn-1.3/seqan/modifier/modifier_functors.h \ SeqAn-1.3/seqan/seeds2.h \ SeqAn-1.3/seqan/index/index_sa_btree.h \ SeqAn-1.3/seqan/index/find_quasar.h \ SeqAn-1.3/seqan/index/index_esa_algs.h \ SeqAn-1.3/seqan/index/index_skew7.h \ SeqAn-1.3/seqan/index/index_pizzachili.h \ SeqAn-1.3/seqan/index/index_sa_lss.h \ SeqAn-1.3/seqan/index/pump_extender7.h \ SeqAn-1.3/seqan/index/index_shawarma.h \ SeqAn-1.3/seqan/index/find_index_approx.h \ SeqAn-1.3/seqan/index/index_lcp.h \ SeqAn-1.3/seqan/index/find_swift.h \ SeqAn-1.3/seqan/index/shape_threshold.h \ SeqAn-1.3/seqan/index/pizzachili_api.h \ SeqAn-1.3/seqan/index/index_sa_mm.h \ SeqAn-1.3/seqan/index/index_esa_stree.h \ SeqAn-1.3/seqan/index/index_qgram.h \ SeqAn-1.3/seqan/index/pump_extender3.h \ SeqAn-1.3/seqan/index/shape_onegapped.h \ SeqAn-1.3/seqan/index/find_index_qgram.h \ SeqAn-1.3/seqan/index/index_skew3.h \ SeqAn-1.3/seqan/index/index_childtab.h \ SeqAn-1.3/seqan/index/pipe_merger7.h \ SeqAn-1.3/seqan/index/pipe_merger3.h \ SeqAn-1.3/seqan/index/repeat_base.h \ SeqAn-1.3/seqan/index/index_pizzachili_find.h \ SeqAn-1.3/seqan/index/index_sa_qsort.h \ SeqAn-1.3/seqan/index/index_shims.h \ SeqAn-1.3/seqan/index/index_manual_forwards.h \ SeqAn-1.3/seqan/index/shape_base.h \ SeqAn-1.3/seqan/index/shape_predefined.h \ SeqAn-1.3/seqan/index/index_pizzachili_string.h \ SeqAn-1.3/seqan/index/index_generated_forwards.h \ SeqAn-1.3/seqan/index/find_index_esa.h \ SeqAn-1.3/seqan/index/pump_separator7.h \ SeqAn-1.3/seqan/index/index_esa_base.h \ SeqAn-1.3/seqan/index/radix.h \ SeqAn-1.3/seqan/index/index_skew7_multi.h \ SeqAn-1.3/seqan/index/index_esa_algs_multi.h \ SeqAn-1.3/seqan/index/index_base.h \ SeqAn-1.3/seqan/index/shape_gapped.h \ SeqAn-1.3/seqan/index/index_lcp_tree.h \ SeqAn-1.3/seqan/index/index_esa_drawing.h \ SeqAn-1.3/seqan/index/find_index.h \ SeqAn-1.3/seqan/index/pump_lcp_core.h \ SeqAn-1.3/seqan/index/index_dfi.h \ SeqAn-1.3/seqan/index/index_sa_bwtwalk.h \ SeqAn-1.3/seqan/index/index_qgram_openaddressing.h \ SeqAn-1.3/seqan/index/index_bwt.h 
\ SeqAn-1.3/seqan/index/index_wotd.h \ SeqAn-1.3/seqan/statistics/statistics_base.h \ SeqAn-1.3/seqan/statistics/statistics_markov_model.h \ SeqAn-1.3/seqan/statistics/statistics_generated_forwards.h \ SeqAn-1.3/seqan/score.h \ SeqAn-1.3/seqan/sequence.h \ SeqAn-1.3/seqan/parallel.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_heap_tree.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithms_generated_forwards.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_hmm.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm.h \ SeqAn-1.3/seqan/graph_algorithms/graph_algorithm_lis_his.h \ SeqAn-1.3/seqan/blast.h \ SeqAn-1.3/seqan/store.h \ SeqAn-1.3/seqan/graph_align.h \ SeqAn-1.3/seqan/random/random_lognormal.h \ SeqAn-1.3/seqan/random/ext_MersenneTwister.h \ SeqAn-1.3/seqan/random/random_generated_forwards.h \ SeqAn-1.3/seqan/random/random_rng_functor.h \ SeqAn-1.3/seqan/random/random_base.h \ SeqAn-1.3/seqan/random/random_uniform.h \ SeqAn-1.3/seqan/random/random_shuffle.h \ SeqAn-1.3/seqan/random/random_normal.h \ SeqAn-1.3/seqan/random/random_geometric.h \ SeqAn-1.3/seqan/random/random_mt19937.h \ SeqAn-1.3/seqan/basic.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_progressive.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_distance.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_io.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_kmer.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_refinement.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_msa.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_guidetree.h \ SeqAn-1.3/seqan/graph_msa/graph_msa_generated_forwards.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_library.h \ SeqAn-1.3/seqan/graph_msa/graph_align_tcoffee_base.h \ SeqAn-1.3/seqan/basic/basic_sse2.h \ SeqAn-1.3/seqan/basic/basic_forwards.h \ SeqAn-1.3/seqan/basic/basic_testing.h \ SeqAn-1.3/seqan/basic/basic_counted_ptr.h \ SeqAn-1.3/seqan/basic/basic_iterator_base.h \ SeqAn-1.3/seqan/basic/basic_pointer.h \ SeqAn-1.3/seqan/basic/basic_allocator_chunkpool.h \ SeqAn-1.3/seqan/basic/basic_parallelism.h \ SeqAn-1.3/seqan/basic/basic_allocator_singlepool.h \ SeqAn-1.3/seqan/basic/basic_iterator_adaptor.h \ SeqAn-1.3/seqan/basic/basic_profile.h \ SeqAn-1.3/seqan/basic/basic_tag.h \ SeqAn-1.3/seqan/basic/basic_compare.h \ SeqAn-1.3/seqan/basic/basic_alphabet_interface.h \ SeqAn-1.3/seqan/basic/basic_iterator_simple.h \ SeqAn-1.3/seqan/basic/basic_definition.h \ SeqAn-1.3/seqan/basic/basic_operator.h \ SeqAn-1.3/seqan/basic/basic_holder_dynamic.h \ SeqAn-1.3/seqan/basic/basic_allocator_multipool.h \ SeqAn-1.3/seqan/basic/basic_proxy.h \ SeqAn-1.3/seqan/basic/basic_alphabet_simple_tabs.h \ SeqAn-1.3/seqan/basic/basic_metaprogramming.h \ SeqAn-1.3/seqan/basic/basic_host.h \ SeqAn-1.3/seqan/basic/basic_allocator_to_std.h \ SeqAn-1.3/seqan/basic/basic_allocator_interface.h \ SeqAn-1.3/seqan/basic/basic_alphabet_interface2.h \ SeqAn-1.3/seqan/basic/basic_logvalue.h \ SeqAn-1.3/seqan/basic/basic_volatile_ptr.h \ SeqAn-1.3/seqan/basic/basic_debug.h \ SeqAn-1.3/seqan/basic/basic_profchar.h \ SeqAn-1.3/seqan/basic/basic_aggregates.h \ SeqAn-1.3/seqan/basic/basic_transport.h \ SeqAn-1.3/seqan/basic/basic_holder.h \ SeqAn-1.3/seqan/basic/basic_converter.h \ SeqAn-1.3/seqan/basic/basic_alphabet_simple.h \ SeqAn-1.3/seqan/basic/basic_type.h \ SeqAn-1.3/seqan/basic/basic_iterator_adapt_std.h \ SeqAn-1.3/seqan/basic/basic_allocator_simple.h \ SeqAn-1.3/seqan/basic/basic_iterator.h \ SeqAn-1.3/seqan/basic/basic_generated_forwards.h \ SeqAn-1.3/seqan/basic/basic_iterator_position.h \ 
SeqAn-1.3/seqan/basic/basic_alphabet_trait_basic.h \ SeqAn-1.3/seqan/sequence/sequence_forwards.h \ SeqAn-1.3/seqan/sequence/segment_infix.h \ SeqAn-1.3/seqan/sequence/string_set_dependent_tight.h \ SeqAn-1.3/seqan/sequence/segment_base.h \ SeqAn-1.3/seqan/sequence/sequence_lexical.h \ SeqAn-1.3/seqan/sequence/string_cstyle.h \ SeqAn-1.3/seqan/sequence/sequence_concatenator.h \ SeqAn-1.3/seqan/sequence/string_packed.h \ SeqAn-1.3/seqan/sequence/sequence_stream.h \ SeqAn-1.3/seqan/sequence/sequence_shortcuts.h \ SeqAn-1.3/seqan/sequence/adapt_std_list.h \ SeqAn-1.3/seqan/sequence/string_base.h \ SeqAn-1.3/seqan/sequence/string_set_owner.h \ SeqAn-1.3/seqan/sequence/adapt_std_string.h \ SeqAn-1.3/seqan/sequence/sequence_generated_forwards.h \ SeqAn-1.3/seqan/sequence/string_set_concat_direct.h \ SeqAn-1.3/seqan/sequence/sequence_interface.h \ SeqAn-1.3/seqan/sequence/adapt_array_pointer.h \ SeqAn-1.3/seqan/sequence/segment_suffix.h \ SeqAn-1.3/seqan/sequence/string_array.h \ SeqAn-1.3/seqan/sequence/iter_concat_virtual.h \ SeqAn-1.3/seqan/sequence/segment_prefix.h \ SeqAn-1.3/seqan/sequence/string_block.h \ SeqAn-1.3/seqan/sequence/string_set_base.h \ SeqAn-1.3/seqan/sequence/adapt_std_vector.h \ SeqAn-1.3/seqan/sequence/string_set_dependent_generous.h \ SeqAn-1.3/seqan/sequence/string_alloc.h \ SeqAn-1.3/seqan/modifier.h \ SeqAn-1.3/seqan/random.h \ SeqAn-1.3/seqan/statistics.h \ SeqAn-1.3/seqan/consensus.h \ SeqAn-1.3/seqan/find.h \ SeqAn-1.3/seqan/find2.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_inexact.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_fragment.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_annotation.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_exact.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_align.h \ SeqAn-1.3/seqan/refinement/refinement_generated_forwards.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_scoring.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_exact_iterative.h \ SeqAn-1.3/seqan/refinement/graph_impl_align.h \ SeqAn-1.3/seqan/refinement/graph_impl_align_adapt.h \ SeqAn-1.3/seqan/refinement/graph_algorithm_refine_aligngraph.h \ SeqAn-1.3/seqan/seeds/global_seed_chain.h \ SeqAn-1.3/seqan/seeds/seedSet_score.h \ SeqAn-1.3/seqan/seeds/seeds_generated_forwards.h \ SeqAn-1.3/seqan/seeds/banded_chain_align_affine.h \ SeqAn-1.3/seqan/seeds/banded_align.h \ SeqAn-1.3/seqan/seeds/memoryManager_int.h \ SeqAn-1.3/seqan/seeds/seedHandlingTags.h \ SeqAn-1.3/seqan/seeds/seedSet_iterator.h \ SeqAn-1.3/seqan/seeds/banded_chain_align.h \ SeqAn-1.3/seqan/seeds/seed_multi.h \ SeqAn-1.3/seqan/seeds/seedSet_base.h \ SeqAn-1.3/seqan/seeds/propertyMap.h \ SeqAn-1.3/seqan/seeds/memoryManager_base.h \ SeqAn-1.3/seqan/seeds/seed_base.h \ SeqAn-1.3/seqan/graph_msa.h \ SeqAn-1.3/seqan/consensus/consensus_score.h \ SeqAn-1.3/seqan/consensus/consensus_base.h \ SeqAn-1.3/seqan/consensus/consensus_generated_forwards.h \ SeqAn-1.3/seqan/consensus/consensus_realign.h \ SeqAn-1.3/seqan/consensus/consensus_library.h \ SeqAn-1.3/seqan/map/map_generated_forwards.h \ SeqAn-1.3/seqan/map/map_base.h \ SeqAn-1.3/seqan/map/map_chooser.h \ SeqAn-1.3/seqan/map/map_vector.h \ SeqAn-1.3/seqan/map/sumlist_mini.h \ SeqAn-1.3/seqan/map/map_adapter_stl.h \ SeqAn-1.3/seqan/map/sumlist.h \ SeqAn-1.3/seqan/map/map_skiplist.h \ SeqAn-1.3/seqan/map/sumlist_skip.h \ SeqAn-1.3/seqan/blast/blast_stream_report.h \ SeqAn-1.3/seqan/blast/blast_run.h \ SeqAn-1.3/seqan/blast/blast_hsp_iterator.h \ SeqAn-1.3/seqan/blast/blast_stream_hit_iterator.h \ 
SeqAn-1.3/seqan/blast/blast_hsp.h \ SeqAn-1.3/seqan/blast/blast_base.h \ SeqAn-1.3/seqan/blast/blast_hit.h \ SeqAn-1.3/seqan/blast/blast_stream_hit.h \ SeqAn-1.3/seqan/blast/blast_iterator.h \ SeqAn-1.3/seqan/blast/blast_parsing.h \ SeqAn-1.3/seqan/blast/blast_report.h \ SeqAn-1.3/seqan/blast/blast_generated_forwards.h \ SeqAn-1.3/seqan/blast/blast_stream_hsp_iterator.h \ SeqAn-1.3/seqan/blast/blast_hit_iterator.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_affine.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining_base.h \ SeqAn-1.3/seqan/seeds2/seeds_extension.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_unordered.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_base.h \ SeqAn-1.3/seqan/seeds2/seeds2_generated_forwards.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_diagonal.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_simple.h \ SeqAn-1.3/seqan/seeds2/seeds_base.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_banded_affine.h \ SeqAn-1.3/seqan/seeds2/seeds_combination.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_non_scored.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_banded_linear.h \ SeqAn-1.3/seqan/seeds2/align_chain_banded.h \ SeqAn-1.3/seqan/seeds2/align_dynprog_linear.h \ SeqAn-1.3/seqan/seeds2/basic_iter_indirect.h \ SeqAn-1.3/seqan/seeds2/seeds_global_chaining_gusfield.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_chained.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_set_scored.h \ SeqAn-1.3/seqan/seeds2/seeds_seed_base.h \ SeqAn-1.3/seqan/map.h \ SeqAn-1.3/seqan/find/find_pattern_base.h \ SeqAn-1.3/seqan/find/find_multiple_shiftand.h \ SeqAn-1.3/seqan/find/find_generated_forwards.h \ SeqAn-1.3/seqan/find/find_horspool.h \ SeqAn-1.3/seqan/find/find_abndm.h \ SeqAn-1.3/seqan/find/find_score.h \ SeqAn-1.3/seqan/find/find_set_horspool.h \ SeqAn-1.3/seqan/find/find_wild_shiftand.h \ SeqAn-1.3/seqan/find/find_hamming_simple.h \ SeqAn-1.3/seqan/find/find_shiftand.h \ SeqAn-1.3/seqan/find/find_begin.h \ SeqAn-1.3/seqan/find/find_shiftor.h \ SeqAn-1.3/seqan/find/find_wumanber.h \ SeqAn-1.3/seqan/find/find_multi.h \ SeqAn-1.3/seqan/find/find_simple.h \ SeqAn-1.3/seqan/find/find_bndm.h \ SeqAn-1.3/seqan/find/find_pex.h \ SeqAn-1.3/seqan/find/find_bom.h \ SeqAn-1.3/seqan/find/find_ahocorasick.h \ SeqAn-1.3/seqan/find/find_multiple_bfam.h \ SeqAn-1.3/seqan/find/find_myers_ukkonen.h \ SeqAn-1.3/seqan/find/find_base.h \ SeqAn-1.3/seqan/index.h \ SeqAn-1.3/seqan/graph_types.h \ SeqAn-1.3/seqan/find_motif/profile.h \ SeqAn-1.3/seqan/find_motif/find_motif_pmsp.h \ SeqAn-1.3/seqan/find_motif/sequence_model_types.h \ SeqAn-1.3/seqan/find_motif/find_motif_base.h \ SeqAn-1.3/seqan/find_motif/pseudocount_mode_p.h \ SeqAn-1.3/seqan/find_motif/find_motif_epatternbranching.h \ SeqAn-1.3/seqan/find_motif/find_motif_projection.h \ SeqAn-1.3/seqan/find_motif/find_motif_generated_forwards.h \ SeqAn-1.3/seqan/find_motif/pseudocount_mode_c.h \ SeqAn-1.3/seqan/find_motif/pseudocount_base.h \ SeqAn-1.3/seqan/find_motif/em_algorithm.h \ SeqAn-1.3/seqan/find_motif/find_motif_pms1.h \ SeqAn-1.3/seqan/find_motif/frequency_distribution.h \ SeqAn-1.3/seqan.h #-- progs to be installed in $prefix/bin bin_PROGRAMS = \ prep_reads \ gtf_to_fasta \ fix_map_ordering \ bam2fastx \ segment_juncs \ gtf_juncs \ juncs_db \ long_spanning_reads \ bam_merge \ map2gtf \ tophat_reports \ sam_juncs bin_SCRIPTS = \ tophat2 \ tophat #-- scripts to be installed in $prefix/bin dist_bin_SCRIPTS = \ contig_to_chr_coords \ bed_to_juncs \ sra_to_solid \ tophat-fusion-post CLEANFILES = \ tophat2 \ tophat tophat2: tophat2.in sed -e 's|__PREFIX__|$(prefix)|' 
tophat2.in > tophat2 tophat: tophat.py sed -e 's|__VERSION__|$(VERSION)|' tophat.py > tophat #SUFFIXES = .py #.py: # (echo '#!$(PYTHON)'; sed '/^#!/d' $<) > $@ #-- tophat library for linking convienence noinst_LIBRARIES = libtophat.a libgc.a noinst_HEADERS = \ reads.h \ codons.h \ common.h \ GBase.h \ gdna.h \ GFaSeqGet.h \ gff.h \ GHash.hh \ GVec.hh \ GList.hh \ GStr.h \ FastaTools.h \ GTFToFasta.h \ map2gtf.h \ bwt_map.h \ junctions.h \ assert_helpers.h \ insertions.h \ wiggles.h \ deletions.h \ fusions.h \ align_status.h \ alphabet.h \ timer.h \ tokenize.h \ fragments.h \ inserts.h \ segments.h \ qual.h \ bam_merge.h \ utils.h \ coverage.h libtophat_a_SOURCES = \ reads.cpp \ alphabet.c \ bwt_map.cpp \ common.cpp \ junctions.cpp \ insertions.cpp \ deletions.cpp \ fusions.cpp \ align_status.cpp \ fragments.cpp \ tokenize.cpp \ inserts.cpp \ qual.cpp \ bam_merge_impl.cpp \ utils.cpp \ coverage.cpp libgc_a_SOURCES = \ GBase.cpp \ codons.cpp \ gdna.cpp \ GStr.cpp \ GFaSeqGet.cpp \ gff.cpp #-- program sources prep_reads_SOURCES = prep_reads.cpp prep_reads_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) prep_reads_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) segment_juncs_SOURCES = segment_juncs.cpp segment_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) segment_juncs_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) long_spanning_reads_SOURCES = long_spanning_reads.cpp long_spanning_reads_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) long_spanning_reads_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) gtf_juncs_SOURCES = gtf_juncs.cpp gtf_juncs_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) gtf_juncs_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) juncs_db_SOURCES = juncs_db.cpp juncs_db_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) juncs_db_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) tophat_reports_SOURCES = tophat_reports.cpp tophat_reports_LDADD = $(top_builddir)/src/libtophat.a $(BOOST_THREAD_LIB) $(BOOST_SYSTEM_LIB) $(BAM_LIB) tophat_reports_LDFLAGS = $(LDFLAGS) $(BOOST_LDFLAGS) $(BAM_LDFLAGS) fix_map_ordering_SOURCES = fix_map_ordering.cpp fix_map_ordering_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) fix_map_ordering_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) bam2fastx_SOURCES = bam2fastx.cpp bam2fastx_LDADD = $(top_builddir)/src/libgc.a $(BAM_LIB) bam2fastx_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) bam_merge_SOURCES = bam_merge.cpp bam_merge_LDADD = $(top_builddir)/src/libtophat.a $(top_builddir)/src/libgc.a $(BAM_LIB) bam_merge_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) sam_juncs_SOURCES = sam_juncs.cpp sam_juncs_LDADD = $(top_builddir)/src/libtophat.a $(BAM_LIB) sam_juncs_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) map2gtf_SOURCES = map2gtf.cpp map2gtf_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) map2gtf_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) gtf_to_fasta_SOURCES = GTFToFasta.cpp FastaTools.cpp gtf_to_fasta_LDADD = $(top_builddir)/src/libtophat.a libgc.a $(BAM_LIB) gtf_to_fasta_LDFLAGS = $(LDFLAGS) $(BAM_LDFLAGS) tophat-2.0.9/src/deletions.cpp0000755000175000017500000001116412122334360015034 0ustar toortoor/* * deletions.cpp * TopHat * * Created by Ryan Kelley on 10/09/2010. 
 * */ #ifdef HAVE_CONFIG_H #include #endif #include #include "common.h" #include "deletions.h" /* * Print deletions in BED format * As per the BED-standard (http://genome.ucsc.edu/FAQ/FAQformat) * -The coordinates should be 0-based * -The chromEnd field should not contain the actual feature * -The name will be "-" * -The score will be count of supporting reads (max of 1,000) * * chromStart refers to the position of the first deleted base * \t\t\t-\t\n * @param deletions_out The output file * @param deletions Maps from deletions to number of supporting reads * @param ref_sequences The table of reference sequences * */ void print_deletions(FILE* deletions_out, const DeletionSet& deletions, RefSequenceTable& ref_sequences){ fprintf(deletions_out, "track name=deletions description=\"TopHat deletions\"\n"); for(DeletionSet::const_iterator i = deletions.begin(); i != deletions.end(); ++i){ fprintf(deletions_out, "%s\t%d\t%d\t-\t%d\n", ref_sequences.get_name(i->first.refid), i->first.left + 1, i->first.right, i->second.supporting_hits); } } /** * Add deletions from an alignment to a DeletionSet. * This will look for deletions in the alignment specified by bh. If the * deletion is already in deletions, it will update the count. Otherwise, * it will add the deletion to the set and initialize the count to 1. * @param bh The bowtie hit to be used to specify alignment information. * @param deletions The DeletionSet that will be updated with the deletion information from the alignment. */ void deletions_from_alignment(const BowtieHit& bh, DeletionSet& deletions) { vector > new_deletions; deletions_from_spliced_hit(bh, new_deletions); for(size_t i = 0; i < new_deletions.size(); ++i){ const pair& deletion = new_deletions[i]; DeletionSet::iterator itr = deletions.find(deletion.first); if (itr != deletions.end()) { itr->second.supporting_hits += 1; itr->second.left_extent = max(itr->second.left_extent, deletion.second.left_extent); itr->second.right_extent = max(itr->second.right_extent, deletion.second.right_extent); } else { deletions[deletion.first] = deletion.second; } } return; } /** * Extract a list of deletions from a bowtie hit. * Given a bowtie hit, extract a vector of deletions. * @param bh The bowtie hit to use for alignment information. * @param deletions Used to store the resultant vector of deletions.
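 * For illustration (the coordinates here are hypothetical, not from the original documentation): assuming a hit whose leftmost genomic position bh.left() is 1000 (0-based) and whose CIGAR is 20M5D30M, positionInGenome reaches 1020 after the 20M block, so the DEL operation yields a Deletion with left = 1019 (the last reference base before the deleted span) and right = 1025 (the first base after it), together with a DeletionStats carrying supporting_hits = 1, left_extent = 20 and right_extent = 30 taken from the flanking match lengths.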
*/ void deletions_from_spliced_hit(const BowtieHit& bh, vector >& deletions){ const vector& cigar = bh.cigar(); unsigned int positionInGenome = bh.left(); unsigned int positionInRead = 0; bool bSawFusion = false; for(size_t c = 0; c < cigar.size(); ++c){ switch(cigar[c].opcode){ case REF_SKIP: positionInGenome += cigar[c].length; break; case rEF_SKIP: positionInGenome -= cigar[c].length; break; case MATCH: case mATCH: if (cigar[c].opcode == MATCH) positionInGenome += cigar[c].length; else positionInGenome -= cigar[c].length; positionInRead += cigar[c].length; break; case DEL: case dEL: { Deletion deletion; DeletionStats stats; if (bSawFusion) deletion.refid = bh.ref_id2(); else deletion.refid = bh.ref_id(); if (cigar[c].opcode == DEL) { deletion.left = positionInGenome - 1; deletion.right = positionInGenome + cigar[c].length; } else { deletion.left = positionInGenome - cigar[c].length; deletion.right = positionInGenome + 1; } stats.supporting_hits = 1; if (c > 0) stats.left_extent = cigar[c-1].length; if (c + 1 < cigar.size()) stats.right_extent = cigar[c+1].length; deletions.push_back(make_pair(deletion, stats)); positionInGenome += cigar[c].length; } break; case INS: case iNS: positionInRead += cigar[c].length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: bSawFusion = true; positionInGenome = cigar[c].length; break; default: break; } } return; } void merge_with(DeletionSet& deletions, const DeletionSet& other) { for (DeletionSet::const_iterator deletion = other.begin(); deletion != other.end(); ++deletion) { DeletionSet::iterator itr = deletions.find(deletion->first); if (itr != deletions.end()) { itr->second.merge_with(deletion->second); } else { deletions[deletion->first] = deletion->second; } } } tophat-2.0.9/src/utils.cpp0000644000175000017500000000756112157667340014230 0ustar toortoor/* * utils.cpp * TopHat * * Created by Daehwan Kim on 12/28/11. * Copyright 2011 Daehwan Kim. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include "utils.h" bool calculate_offsets(const vector& fnames, vector& ids, vector >& offsets) { vector > > index_list(fnames.size()); for (size_t i = 0; i < fnames.size(); ++i) { const string& fname = fnames[i]; vector temp_fnames; if (fname.substr(fname.length() - 4) == ".bam") { temp_fnames.push_back(fname); } else { size_t j = 0; while (true) { char suffix[128]; sprintf(suffix, "%lu.bam", j); string temp_fname = fname + suffix; temp_fnames.push_back(temp_fname); string temp_index_fname = temp_fname + ".index"; ifstream index_file(temp_index_fname.c_str()); if (!index_file.is_open()) break; ++j; } } for (size_t j = 0; j < temp_fnames.size(); ++j) { ifstream reads_index_file((temp_fnames[j] + ".index").c_str()); if (!reads_index_file.is_open()) continue; string line; while (getline(reads_index_file, line)) { istringstream istream(line); uint64_t read_id; int64_t offset; istream >> read_id >> offset; index_list[i].push_back(make_pair(read_id, offset)); } } } // too small for blocking for (size_t i = 0; i < index_list.size(); ++i) { if (index_list[i].size() < (size_t)num_threads) return false; } offsets.resize(num_threads - 1); for (int i = 1; i < num_threads; ++i) { size_t index = index_list.back().size() / num_threads * i; const pair& data = index_list.back()[index]; uint64_t id = data.first; int64_t offset = data.second; ids.push_back(id); offsets[i-1].push_back(offset); // fprintf(stderr, "%lu)read %lu - offset %ld\n", index_list.size() - 1, id, offset); for (int j = index_list.size() - 2; j >= 0; --j) { size_t other_index = index_list[j].size() / num_threads * i; uint64_t other_id = index_list[j][other_index].first; while (other_id > id && other_index > 0) { other_index -= 1; other_id = index_list[j][other_index].first; } while (other_index + 1 < index_list[j].size() && index_list[j][other_index+1].first < id) { other_index += 1; other_id = index_list[j][other_index].first; } int64_t other_offset = index_list[j][other_index].second; if (other_id > id) { other_id = 0; other_offset = 0; } id = other_id; offsets[i-1].push_back(other_offset); // fprintf(stderr, "\t%d)read %lu - offset %lu\n", j, other_id, other_offset); } reverse(offsets[i-1].begin(), offsets[i-1].end()); } return true; } void calculate_offsets_from_ids(const string& fname, const vector& ids, vector& offsets) { vector > index_list; ifstream reads_index_file((fname + ".index").c_str()); string line; uint64_t last_id = 0; int64_t last_offset = 0; for (size_t i = 0; i < ids.size(); ++i) { uint64_t ref_id = ids[i]; while (getline(reads_index_file, line)) { istringstream istream(line); uint64_t read_id; int64_t offset; istream >> read_id >> offset; if (read_id > ref_id) { assert(last_id <= ref_id); offsets.push_back(last_offset); // fprintf(stderr, "ref read: %lu\n", ref_id); // fprintf(stderr, "\tread %lu - offset %lu\n", last_id, last_offset); } last_id = read_id; last_offset = offset; if (last_id > ref_id) break; } } if (ids.size() != offsets.size()) offsets.clear(); } tophat-2.0.9/src/insertions.h0000755000175000017500000000545312122334361014715 0ustar toortoor#ifndef INSERTIONS_H #define INSERTIONS_H /* * insertions.h * TopHat * * Adapted from junctions.h */ #include #include #include #include #include #include #include #include #include #include #include "bwt_map.h" using namespace std; /** * Data structure to represent an insertion. * Need to keep track of the actual position of the insertion * as well the actual inserted sequence. 
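 * For illustration (hypothetical values): an alignment that inserts the bases "ACG" immediately after 0-based reference position 1203 would be stored as Insertion(ref, 1203, "ACG"), where ref is the numeric id of the reference sequence; operator== below treats two insertions as identical only when refid, left and sequence all match, while operator< orders them by refid, then left, and finally by the length (not the content) of the inserted sequence.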
*/ struct Insertion { Insertion (uint32_t ref, uint32_t l, const std::string& seq) : refid(ref), left(l), sequence(seq){} Insertion() : refid(0), left(0), sequence("") {} /** * The ID of the assoicated reference sequence (eg, chr22). In order to actually turn this into * a string, need to use the map associated with the reference table */ uint32_t refid; /** * The position of the insertion. * This is the 0-based position of the last genomic nucleotide before the insertion */ uint32_t left; /** * The actual inserted sequence. */ std::string sequence; bool operator<(const Insertion& rhs) const { if (refid < rhs.refid) return true; else if (refid > rhs.refid) return false; if (left < rhs.left) return true; else if (left > rhs.left) return false; if (sequence.length() < rhs.sequence.length()) return true; return false; } bool operator==(const Insertion& rhs) const { return (refid == rhs.refid && left == rhs.left && sequence == rhs.sequence); } }; struct InsertionStats { InsertionStats() : left_extent(0), right_extent(0), supporting_hits(0) {} InsertionStats& merge_with(const InsertionStats& other) { if (this == &other) return *this; left_extent = max(left_extent, other.left_extent); right_extent = max(right_extent, other.right_extent); supporting_hits += other.supporting_hits; return *this; } int left_extent; int right_extent; int supporting_hits; }; /** * A function used to compare Insertions, specifically for use * with std::sets. My C++ is a little weak, but I imagine * there should be someway to directly address the overloaded * operator function and ditch this code. */ struct insertion_comparison { bool operator()(const Insertion& lhs, const Insertion& rhs) { return lhs < rhs; } }; typedef std::map InsertionSet; void insertions_from_alignment(const BowtieHit& bh, InsertionSet& insertions); void print_insertions(FILE* insertions_out, const InsertionSet& insertions, RefSequenceTable& ref_sequences); void insertions_from_spliced_hit(const BowtieHit& bh, vector >& insertions); void merge_with(InsertionSet& insertions, const InsertionSet& other); #endif tophat-2.0.9/src/tophat.py0000755000175000017500000051406512162605263014232 0ustar toortoor#!/usr/bin/env python # encoding: utf-8 """ tophat.py Created by Cole Trapnell on 2008-12-25. Copyright (c) 2008 Cole Trapnell. All rights reserved. Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010. """ import sys try: import psyco psyco.full() except ImportError: pass import getopt import subprocess import errno import os import warnings import re import glob import signal from datetime import datetime, date, time from shutil import copy, rmtree import logging use_message = ''' TopHat maps short sequences from spliced transcripts to whole genomes. 
Usage: tophat [options] [reads1[,reads2,...]] \\ [quals1,[quals2,...]] [quals1[,quals2,...]] Options: -v/--version -o/--output-dir [ default: ./tophat_out ] --bowtie1 [ default: bowtie2 ] -N/--read-mismatches [ default: 2 ] --read-gap-length [ default: 2 ] --read-edit-dist [ default: 2 ] --read-realign-edit-dist [ default: "read-edit-dist" + 1 ] -a/--min-anchor [ default: 8 ] -m/--splice-mismatches <0-2> [ default: 0 ] -i/--min-intron-length [ default: 50 ] -I/--max-intron-length [ default: 500000 ] -g/--max-multihits [ default: 20 ] --suppress-hits -x/--transcriptome-max-hits [ default: 60 ] -M/--prefilter-multihits ( for -G/--GTF option, enable an initial bowtie search against the genome ) --max-insertion-length [ default: 3 ] --max-deletion-length [ default: 3 ] --solexa-quals --solexa1.3-quals (same as phred64-quals) --phred64-quals (same as solexa1.3-quals) -Q/--quals --integer-quals -C/--color (Solid - color space) --color-out --library-type (fr-unstranded, fr-firststrand, fr-secondstrand) -p/--num-threads [ default: 1 ] -R/--resume ( try to resume execution ) -G/--GTF (GTF/GFF with known transcripts) --transcriptome-index (transcriptome bowtie index) -T/--transcriptome-only (map only to the transcriptome) -j/--raw-juncs --insertions --deletions -r/--mate-inner-dist [ default: 50 ] --mate-std-dev [ default: 20 ] --no-novel-juncs --no-novel-indels --no-gtf-juncs --no-coverage-search --coverage-search --microexon-search --keep-tmp --tmp-dir [ default: /tmp ] -z/--zpacker [ default: gzip ] -X/--unmapped-fifo [use mkfifo to compress more temporary files for color space reads] Advanced Options: --report-secondary-alignments --no-discordant --no-mixed --segment-mismatches [ default: 2 ] --segment-length [ default: 25 ] --bowtie-n [ default: bowtie -v ] --min-coverage-intron [ default: 50 ] --max-coverage-intron [ default: 20000 ] --min-segment-intron [ default: 50 ] --max-segment-intron [ default: 500000 ] --no-sort-bam (Output BAM is not coordinate-sorted) --no-convert-bam (Do not output bam format. 
Output is /accepted_hit.sam) --keep-fasta-order --allow-partial-mapping Bowtie2 related options: Preset options in --end-to-end mode (local alignment is not used in TopHat2) --b2-very-fast --b2-fast --b2-sensitive --b2-very-sensitive Alignment options --b2-N [ default: 0 ] --b2-L [ default: 20 ] --b2-i [ default: S,1,1.25 ] --b2-n-ceil [ default: L,0,0.15 ] --b2-gbar [ default: 4 ] Scoring options --b2-mp , [ default: 6,2 ] --b2-np [ default: 1 ] --b2-rdg , [ default: 5,3 ] --b2-rfg , [ default: 5,3 ] --b2-score-min [ default: L,-0.6,-0.6 ] Effort options --b2-D [ default: 15 ] --b2-R [ default: 2 ] Fusion related options: --fusion-search --fusion-anchor-length [ default: 20 ] --fusion-min-dist [ default: 10000000 ] --fusion-read-mismatches [ default: 2 ] --fusion-multireads [ default: 2 ] --fusion-multipairs [ default: 2 ] --fusion-ignore-chromosomes [ e.g, ] --fusion-do-not-resolve-conflicts [this is for test purposes ] SAM Header Options (for embedding sequencing run metadata in output): --rg-id (read group ID) --rg-sample (sample ID) --rg-library (library ID) --rg-description (descriptive string, no tabs allowed) --rg-platform-unit (e.g Illumina lane ID) --rg-center (sequencing center name) --rg-date (ISO 8601 date of the sequencing run) --rg-platform (Sequencing platform descriptor) ''' # Deprecated: # --min-closure-exon [ default: 100 ] # --min-closure-intron [ default: 50 ] # --max-closure-intron [ default: 5000 ] # --no-closure-search # --closure-search # --butterfly-search # --no-butterfly-search # -F/--min-isoform-fraction [ default: 0.15 ] class Usage(Exception): def __init__(self, msg): self.msg = msg output_dir = "./tophat_out/" logging_dir = output_dir + "logs/" run_log = None tophat_log = None #main log file handle tophat_logger = None # main logging object run_cmd = None tmp_dir = output_dir + "tmp/" bin_dir = sys.path[0] + "/" use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False) use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color # enabled by -X/-unmapped-fifo option (unless -z0) unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the # unmapped reads into a compressed file samtools_path = None bowtie_path = None fail_str = "\t[FAILED]\n" gtf_juncs = None #file name with junctions extracted from given GFF file # version of GFF transcriptome parser accepted for pre-built transcriptome indexes # TopHat will automatically rebuild a transcriptome index if the version # found in the {transcriptome_index}.ver file is lower than this value # -do NOT increment this unless you want TopHat to force a rebuild of all users' transcriptome indexes! 
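# (For illustration, with a hypothetical number: if an existing {transcriptome_index}.ver file records version 208, that is lower than GFF_T_VER below, so TopHat would rebuild that transcriptome index before reusing it.)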
GFF_T_VER = 209 #GFF parser version #mapping types: _reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1,5) # execution resuming stages (for now, execution can be resumed only for stages # after the pre-filter and transcriptome searches): _stage_prep, _stage_map_start, _stage_map_segments, _stage_find_juncs, _stage_juncs_db, _stage_map2juncs, _stage_tophat_reports, _stage_alldone = range(1,9) stageNames = ["start", "prep_reads", "map_start", "map_segments", "find_juncs", "juncs_db", "map2juncs", "tophat_reports", "alldone"] # 0 1 2 3 4 5 6 7 , 8 runStages = dict([(stageNames[st], st) for st in range(0, 9)]) currentStage = 0 resumeStage = 0 def getResumeStage(rlog): #returns tuple: (resumeStage, old_cmd_args) oldargv = None try: flog=open(rlog) #first line must be the actual tophat command used thcmd=None try: thcmd = flog.next() except StopIteration: die("Error: cannot resume, run.log is empty.") oldargv=thcmd.split() resume_tag = None for line in flog: #scan for last resume code, if any r=re.match("^#>(\w+):$", line) if r: resume_tag=r.group(1) #global resumeStage if resume_tag: if resume_tag in runStages: resume_stage = runStages[resume_tag] else: die("Error: unrecognized run stage '"+resume_tag+"'") else: die("Error: resuming requested but no valid stage found in run.log") flog.close() except IOError: die("Error: cannot resume, failed to open "+rlog) return (resume_stage, oldargv) def doResume(odir): #must return the original list of arguments rlog = odir+"/logs/run.log" rstage = 0 rargv = None r0log = odir+"/logs/run.resume0.log" r0stage = 0 r0argv = None if fileExists(r0log): r0stage, r0argv = getResumeStage(r0log) else: if fileExists(rlog, 10): copy(rlog, r0log) rstage, rargv = getResumeStage(rlog) best_stage = rstage best_argv = rargv[:] if r0stage > rstage: best_stage = r0stage best_argv = r0argv[:] if best_stage == _stage_alldone: print >> sys.stderr, "Nothing to resume." sys.exit(1) global resumeStage resumeStage = best_stage return best_argv def setRunStage(stnum): global currentStage print >> run_log, "#>"+stageNames[stnum]+":" currentStage = stnum def init_logger(log_fname): global tophat_logger tophat_logger = logging.getLogger('project') formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]') tophat_logger.setLevel(logging.DEBUG) hstream = logging.StreamHandler(sys.stderr) hstream.setFormatter(formatter) tophat_logger.addHandler(hstream) # # Output logging information to file if os.path.isfile(log_fname): os.remove(log_fname) global tophat_log logfh = logging.FileHandler(log_fname) logfh.setFormatter(formatter) tophat_logger.addHandler(logfh) tophat_log=logfh.stream # TopHatParams captures all of the runtime paramaters used by TopHat, and many # of these are passed as command line options to exectubles run by the pipeline # This class and its nested classes also do options parsing through parse_options() # and option validation via the member function check() class BowtieFltFiles: def __init__(self, seqfiles=None, qualfiles=None, mappings=None, unmapped_reads=None, multihit_reads=None): self.seqfiles=seqfiles self.qualfiles=qualfiles self.mappings=mappings self.unmapped_reads=unmapped_reads self.multihit_reads=multihit_reads class TopHatParams: # SpliceConstraints is a group of runtime parameters that specify what # constraints to put on junctions discovered by the program. These constraints # are used to filter out spurious/false positive junctions. 
class SpliceConstraints: def __init__(self, min_anchor_length, min_intron_length, max_intron_length, splice_mismatches, min_isoform_fraction): self.min_anchor_length = min_anchor_length self.min_intron_length = min_intron_length self.max_intron_length = max_intron_length self.splice_mismatches = splice_mismatches self.min_isoform_fraction = min_isoform_fraction def parse_options(self, opts): for option, value in opts: if option in ("-m", "--splice-mismatches"): self.splice_mismatches = int(value) elif option in ("-a", "--min-anchor"): self.min_anchor_length = int(value) elif option in ("-F", "--min-isoform-fraction"): self.min_isoform_fraction = float(value) elif option in ("-i", "--min-intron-length"): self.min_intron_length = int(value) elif option in ("-I", "--max-intron-length"): self.max_intron_length = int(value) def check(self): if self.splice_mismatches not in [0,1,2]: die("Error: arg to --splice-mismatches must be 0, 1, or 2") if self.min_anchor_length < 4: die("Error: arg to --min-anchor-len must be greater than 4") if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0: die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0") if self.min_intron_length <= 0: die("Error: arg to --min-intron-length must be greater than 0") if self.max_intron_length <= 0: die("Error: arg to --max-intron-length must be greater than 0") # SystemParams is a group of runtime parameters that determine how to handle # temporary files produced during a run and how many threads to use for threaded # stages of the pipeline (e.g. Bowtie) class SystemParams: def __init__(self, num_threads, keep_tmp): self.num_threads = num_threads self.keep_tmp = keep_tmp self.zipper = "gzip" self.zipper_opts= [] def parse_options(self, opts): global use_zpacker global use_BWT_FIFO for option, value in opts: if option in ("-p", "--num-threads"): self.num_threads = int(value) elif option == "--keep-tmp": self.keep_tmp = True elif option in ("-z","--zpacker"): if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]: value="" self.zipper = value #if not self.zipper: # self.zipper='gzip' elif option in ("-X", "--unmapped-fifo"): use_BWT_FIFO=True if self.zipper: use_zpacker=True if self.num_threads>1 and not self.zipper_opts: if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'): self.zipper_opts.append('-p'+str(self.num_threads)) else: use_zpacker=False if use_BWT_FIFO: use_BWT_FIFO=False def cmd(self): cmdline=[] if self.zipper: cmdline.extend(['-z',self.zipper]) if self.num_threads>1: cmdline.extend(['-p'+str(self.num_threads)]) return cmdline def check(self): if self.num_threads<1 : die("Error: arg to --num-threads must be greater than 0") if self.zipper: xzip=which(self.zipper) if not xzip: die("Error: cannot find compression program "+self.zipper) # ReadParams is a group of runtime parameters that specify various properties # of the user's reads (e.g. which quality scale their are on, how long the # fragments are, etc). 
class ReadParams: def __init__(self, solexa_quals, phred64_quals, quals, integer_quals, color, library_type, seed_length, reads_format, mate_inner_dist, mate_inner_dist_std_dev, read_group_id, sample_id, library_id, description, seq_platform_unit, seq_center, seq_run_date, seq_platform): self.solexa_quals = solexa_quals self.phred64_quals = phred64_quals self.quals = quals self.integer_quals = integer_quals self.color = color self.library_type = library_type self.seed_length = seed_length self.reads_format = reads_format self.mate_inner_dist = mate_inner_dist self.mate_inner_dist_std_dev = mate_inner_dist_std_dev self.read_group_id = read_group_id self.sample_id = sample_id self.library_id = library_id self.description = description self.seq_platform_unit = seq_platform_unit self.seq_center = seq_center self.seq_run_date = seq_run_date self.seq_platform = seq_platform def parse_options(self, opts): for option, value in opts: if option == "--solexa-quals": self.solexa_quals = True elif option in ("--solexa1.3-quals", "--phred64-quals"): self.phred64_quals = True elif option in ("-Q", "--quals"): self.quals = True elif option == "--integer-quals": self.integer_quals = True elif option in ("-C", "--color"): self.color = True elif option == "--library-type": self.library_type = value elif option in ("-s", "--seed-length"): self.seed_length = int(value) elif option in ("-r", "--mate-inner-dist"): self.mate_inner_dist = int(value) elif option == "--mate-std-dev": self.mate_inner_dist_std_dev = int(value) elif option == "--rg-id": self.read_group_id = value elif option == "--rg-sample": self.sample_id = value elif option == "--rg-library": self.library_id = value elif option == "--rg-description": self.description = value elif option == "--rg-platform-unit": self.seq_platform_unit = value elif option == "--rg-center": self.seq_center = value elif option == "--rg-date": self.seq_run_date = value elif option == "--rg-platform": self.seq_platform = value def check(self): if self.seed_length and self.seed_length < 20: die("Error: arg to --seed-length must be at least 20") if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0: die("Error: arg to --mate-std-dev must at least 0") if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id): die("Error: --rg-id and --rg-sample must be specified or omitted together") # SearchParams is a group of runtime parameters that specify how TopHat will # search for splice junctions class SearchParams: def __init__(self, min_closure_exon, min_closure_intron, max_closure_intron, min_coverage_intron, max_coverage_intron, min_segment_intron, max_segment_intron): self.min_closure_exon_length = min_closure_exon self.min_closure_intron_length = min_closure_intron self.max_closure_intron_length = max_closure_intron self.min_coverage_intron_length = min_coverage_intron self.max_coverage_intron_length = max_coverage_intron self.min_segment_intron_length = min_segment_intron self.max_segment_intron_length = max_segment_intron def parse_options(self, opts): for option, value in opts: if option == "--min-closure-exon": self.min_closure_exon_length = int(value) if option == "--min-closure-intron": self.min_closure_intron_length = int(value) if option == "--max-closure-intron": self.max_closure_intron_length = int(value) if option == "--min-coverage-intron": self.min_coverage_intron_length = int(value) if option == "--max-coverage-intron": self.max_coverage_intron_length = int(value) if option == "--min-segment-intron": 
self.min_segment_intron_length = int(value) if option == "--max-segment-intron": self.max_segment_intron_length = int(value) def check(self): if self.min_closure_exon_length < 0: die("Error: arg to --min-closure-exon must be at least 20") if self.min_closure_intron_length < 0: die("Error: arg to --min-closure-intron must be at least 20") if self.max_closure_intron_length < 0: die("Error: arg to --max-closure-intron must be at least 20") if self.min_coverage_intron_length < 0: die("Error: arg to --min-coverage-intron must be at least 20") if self.max_coverage_intron_length < 0: die("Error: arg to --max-coverage-intron must be at least 20") if self.min_segment_intron_length < 0: die("Error: arg to --min-segment-intron must be at least 20") if self.max_segment_intron_length < 0: die("Error: arg to --max-segment-intron must be at least 20") class ReportParams: def __init__(self): self.sort_bam = True self.convert_bam = True def parse_options(self, opts): for option, value in opts: if option == "--no-sort-bam": self.sort_bam = False if option == "--no-convert-bam": self.convert_bam = False class Bowtie2Params: def __init__(self): self.very_fast = False self.fast = False self.sensitive = False self.very_sensitive = False self.N = 0 self.L = 20 self.i = "S,1,1.25" self.n_ceil = "L,0,0.15" self.gbar = 4 self.mp = "6,2" self.np = 1 self.rdg = "5,3" self.rfg = "5,3" # self.score_min = "L,-0.6,-0.6" self.score_min = None self.D = 15 self.R = 2 def parse_options(self, opts): for option, value in opts: if option == "--b2-very-fast": self.very_fast = True if option == "--b2-fast": self.fast = True if option == "--b2-sensitive": self.sensitive = True if option == "--b2-very-sensitive": self.very_sensitive = True if option == "--b2-N": self.N = int(value) if option == "--b2-L": self.L = 20 if option == "--b2-i": self.i = value if option == "--b2-n-ceil": self.n_ceil = value if option == "--b2-gbar": self.gbar = 4 if option == "--b2-mp": self.mp = value if option == "--b2-np": self.np = int(value) if option == "--b2-rdg": self.rdg = value if option == "--b2-rfg": self.rfg = value if option == "--b2-score-min": self.score_min = value if option == "--b2-D": self.D = int(value) if option == "--b2-R": self.R = int(value) def check(self): more_than_once = False if self.very_fast: if self.fast or self.sensitive or self.very_sensitive: more_than_once = True else: if self.fast: if self.sensitive or self.very_sensitive: more_than_once = True else: if self.sensitive and self.very_sensitive: more_than_once = True if more_than_once: die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive") if not self.N in [0, 1]: die("Error: arg to --b2-N must be either 0 or 1") function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$' function_match = re.search(function_re, self.i) if not function_match: die("Error: arg to --b2-i must be (e.g. --b2-i S,1,1.25)") function_match = re.search(function_re, self.n_ceil) if not function_match: die("Error: arg to --b2-n-ceil must be (e.g. --b2-n-ceil L,0,0.15)") if self.score_min: function_match = re.search(function_re, self.score_min) if not function_match: die("Error: arg to --b2-score-min must be (e.g. --b2-score-min L,-0.6,-0.6)") pair_re = r'^\d+,\d+$' pair_match = re.search(pair_re, self.mp) if not pair_match: die("Error: arg to --b2-mp must be , (e.g. --b2-mp 6,2)") pair_match = re.search(pair_re, self.rdg) if not pair_match: die("Error: arg to --b2-rdg must be , (e.g. 
--b2-mp 5,3)") pair_match = re.search(pair_re, self.rfg) if not pair_match: die("Error: arg to --b2-rfg must be , (e.g. --b2-mp 5,3)") def __init__(self): self.splice_constraints = self.SpliceConstraints(8, # min_anchor 50, # min_intron 500000, # max_intron 0, # splice_mismatches 0.15) # min_isoform_frac self.preflt_data = [ BowtieFltFiles(), BowtieFltFiles() ] self.sam_header = None self.read_params = self.ReadParams(False, # solexa_scale False, False, # quals None, # integer quals False, # SOLiD - color space "", # library type (e.g. "illumina-stranded-pair-end") None, # seed_length "fastq", # quality_format None, # mate inner distance 20, # mate inner dist std dev None, # read group id None, # sample id None, # library id None, # description None, # platform unit (i.e. lane) None, # sequencing center None, # run date None) # sequencing platform self.system_params = self.SystemParams(1, # bowtie_threads (num_threads) False) # keep_tmp self.search_params = self.SearchParams(100, # min_closure_exon_length 50, # min_closure_intron_length 5000, # max_closure_intron_length 50, # min_coverage_intron_length 20000, # max_coverage_intron_length 50, # min_segment_intron_length 500000) # max_segment_intron_length self.report_params = self.ReportParams() self.bowtie2_params = self.Bowtie2Params() self.bowtie2 = True self.gff_annotation = None self.transcriptome_only = False self.transcriptome_index = None self.transcriptome_outdir = None self.raw_junctions = None self.resume_dir = None self.find_novel_juncs = True self.find_novel_indels = True self.find_novel_fusions = True self.find_GFF_juncs = True self.max_hits = 20 self.suppress_hits = False self.t_max_hits = 60 self.max_seg_hits = 40 self.prefilter_multi = False self.read_mismatches = 2 self.read_gap_length = 2 self.read_edit_dist = 2 self.read_realign_edit_dist = None self.segment_length = 25 self.segment_mismatches = 2 self.bowtie_alignment_option = "-v" self.max_insertion_length = 3 self.max_deletion_length = 3 self.raw_insertions = None self.raw_deletions = None self.coverage_search = None self.closure_search = False #self.butterfly_search = None self.butterfly_search = False self.microexon_search = False self.report_secondary_alignments = False self.report_discordant_pair_alignments = True self.report_mixed_alignments = True # experimental -W option to activate score and edit distance filtering # in fix_map_ordering (hits post processing) self.b2scoreflt = False self.keep_fasta_order = False self.partial_mapping = False self.fusion_search = False self.fusion_anchor_length = 20 self.fusion_min_dist = 10000000 self.fusion_read_mismatches = 2 self.fusion_multireads = 2 self.fusion_multipairs = 2 self.fusion_ignore_chromosomes = [] self.fusion_do_not_resolve_conflicts = False def check(self): self.splice_constraints.check() self.read_params.check() self.system_params.check() if self.segment_length < 10: die("Error: arg to --segment-length must at least 10") if self.segment_mismatches < 0 or self.segment_mismatches > 3: die("Error: arg to --segment-mismatches must in [0, 3]") if self.read_params.color: if self.bowtie2: th_log("Warning: bowtie2 in colorspace is not supported; --bowtie1 option assumed.") self.bowtie2=False if self.fusion_search: die("Error: fusion-search in colorspace is not yet supported") if self.butterfly_search: die("Error: butterfly-search in colorspace is not yet supported") self.bowtie2_params.check() if self.bowtie2 and self.fusion_search: th_logp("\tWarning: --fusion-search with Bowtie2 may not work well as it may 
require much memory space and produce many spurious fusions. Please try --bowtie1 option if this doesn't work.") library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"] if self.read_params.library_type and self.read_params.library_type not in library_types: die("Error: library-type should be one of: "+', '.join(library_types)) self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length, self.search_params.max_closure_intron_length) self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length, self.search_params.max_segment_intron_length) self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length, self.search_params.max_coverage_intron_length) if self.max_insertion_length >= self.segment_length: die("Error: the max insertion length ("+self.max_insertion_length+") can not be equal to or greater than the segment length ("+self.segment_length+")") if self.max_insertion_length < 0: die("Error: the max insertion length ("+self.max_insertion_length+") can not be less than 0") if self.max_deletion_length >= self.splice_constraints.min_intron_length: die("Error: the max deletion length ("+self.max_deletion_length+") can not be equal to or greater than the min intron length ("+self.splice_constraints.min_intron_length+")") if self.max_deletion_length < 0: die("Error: the max deletion length ("+self.max_deletion_length+") can not be less than 0") if self.read_mismatches > self.read_edit_dist or self.read_gap_length > self.read_edit_dist: die("Error: the read mismatches (" + str(self.read_mismatches) + ") and the read gap length (" + str(self.read_edit_dist) + ") should be less than or equal to the read edit dist (" + str(self.read_edit_dist) + ")\n" + \ "Either decrease --read-mismatches or --read-gap-length, or increase --read-edit-dist") self.search_params.min_segment_intron_length = min(self.search_params.min_segment_intron_length, self.splice_constraints.min_intron_length) self.search_params.max_segment_intron_length = max(self.search_params.max_segment_intron_length, self.splice_constraints.max_intron_length) def cmd(self): cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length), "--splice-mismatches", str(self.splice_constraints.splice_mismatches), "--min-report-intron", str(self.splice_constraints.min_intron_length), "--max-report-intron", str(self.splice_constraints.max_intron_length), "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction), "--output-dir", output_dir, "--max-multihits", str(self.max_hits), "--max-seg-multihits", str(self.max_seg_hits), "--segment-length", str(self.segment_length), "--segment-mismatches", str(self.segment_mismatches), "--min-closure-exon", str(self.search_params.min_closure_exon_length), "--min-closure-intron", str(self.search_params.min_closure_intron_length), "--max-closure-intron", str(self.search_params.max_closure_intron_length), "--min-coverage-intron", str(self.search_params.min_coverage_intron_length), "--max-coverage-intron", str(self.search_params.max_coverage_intron_length), "--min-segment-intron", str(self.search_params.min_segment_intron_length), "--max-segment-intron", str(self.search_params.max_segment_intron_length), "--read-mismatches", str(self.read_mismatches), "--read-gap-length", str(self.read_gap_length), "--read-edit-dist", str(self.read_edit_dist), "--read-realign-edit-dist", str(self.read_realign_edit_dist), "--max-insertion-length", str(self.max_insertion_length), 
"--max-deletion-length", str(self.max_deletion_length)] if self.suppress_hits: cmd.extend(["--suppress-hits"]) if not self.bowtie2: cmd.extend(["--bowtie1"]) if self.fusion_search: cmd.extend(["--fusion-search", "--fusion-anchor-length", str(self.fusion_anchor_length), "--fusion-min-dist", str(self.fusion_min_dist), "--fusion-read-mismatches", str(self.fusion_read_mismatches), "--fusion-multireads", str(self.fusion_multireads), "--fusion-multipairs", str(self.fusion_multipairs)]) if self.fusion_ignore_chromosomes: cmd.extend(["--fusion-ignore-chromosomes", ",".join(self.fusion_ignore_chromosomes)]) if self.fusion_do_not_resolve_conflicts: cmd.extend(["--fusion-do-not-resolve-conflicts"]) cmd.extend(self.system_params.cmd()) if self.read_params.mate_inner_dist != None: cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist), "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)]) if self.gff_annotation != None: cmd.extend(["--gtf-annotations", str(self.gff_annotation)]) if gtf_juncs: cmd.extend(["--gtf-juncs", gtf_juncs]) if self.closure_search == False: cmd.append("--no-closure-search") if not self.coverage_search: cmd.append("--no-coverage-search") if not self.microexon_search: cmd.append("--no-microexon-search") if self.butterfly_search: cmd.append("--butterfly-search") if self.read_params.solexa_quals: cmd.append("--solexa-quals") if self.read_params.quals: cmd.append("--quals") if self.read_params.integer_quals: cmd.append("--integer-quals") if self.read_params.color: cmd.append("--color") if self.read_params.library_type: cmd.extend(["--library-type", self.read_params.library_type]) if self.read_params.read_group_id: cmd.extend(["--rg-id", self.read_params.read_group_id]) if self.read_params.phred64_quals: cmd.append("--phred64-quals") return cmd # This is the master options parsing routine, which calls parse_options for # the delegate classes (e.g. SpliceConstraints) that handle certain groups # of options. 
def parse_options(self, argv): try: opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:R:MQCW", ["version", "help", "output-dir=", "bowtie1", "solexa-quals", "solexa1.3-quals", "phred64-quals", "quals", "integer-quals", "color", "library-type=", "num-threads=", "splice-mismatches=", "max-multihits=", "suppress-hits", "min-isoform-fraction=", "min-anchor-length=", "min-intron-length=", "max-intron-length=", "GTF=", "transcriptome-only", "transcriptome-max-hits=", "transcriptome-index=", "raw-juncs=", "no-novel-juncs", "allow-fusions", "fusion-search", "fusion-anchor-length=", "fusion-min-dist=", "fusion-read-mismatches=", "fusion-multireads=", "fusion-multipairs=", "fusion-ignore-chromosomes=", "fusion-do-not-resolve-conflicts", "no-novel-indels", "no-gtf-juncs", "mate-inner-dist=", "mate-std-dev=", "no-coverage-search", "coverage-search", "prefilter-multihits", "microexon-search", "min-coverage-intron=", "max-coverage-intron=", "min-segment-intron=", "max-segment-intron=", "resume=", "seed-length=", "read-mismatches=", "read-gap-length=", "read-edit-dist=", "read-realign-edit-dist=", "segment-length=", "segment-mismatches=", "bowtie-n", "keep-tmp", "rg-id=", "rg-sample=", "rg-library=", "rg-description=", "rg-platform-unit=", "rg-center=", "rg-date=", "rg-platform=", "tmp-dir=", "zpacker=", "unmapped-fifo", "max-insertion-length=", "max-deletion-length=", "insertions=", "deletions=", "no-sort-bam", "no-convert-bam", "report-secondary-alignments", "no-discordant", "no-mixed", "keep-fasta-order", "allow-partial-mapping", "b2-very-fast", "b2-fast", "b2-sensitive", "b2-very-sensitive", "b2-N=", "b2-L=", "b2-i=", "b2-n-ceil=", "b2-gbar=", "b2-ma=", "b2-mp=", "b2-np=", "b2-rdg=", "b2-rfg=", "b2-score-min=", "b2-D=", "b2-R="]) except getopt.error, msg: raise Usage(msg) self.splice_constraints.parse_options(opts) self.system_params.parse_options(opts) self.read_params.parse_options(opts) self.search_params.parse_options(opts) self.report_params.parse_options(opts) self.bowtie2_params.parse_options(opts) global use_BWT_FIFO global use_BAM_Unmapped if not self.read_params.color: use_BWT_FIFO=False use_BAM_Unmapped=True global output_dir global logging_dir global tmp_dir custom_tmp_dir = None custom_out_dir = None # option processing for option, value in opts: if option in ("-v", "--version"): print "TopHat v%s" % (get_version()) sys.exit(0) if option in ("-h", "--help"): raise Usage(use_message) if option == "--bowtie1": self.bowtie2 = False if option in ("-g", "--max-multihits"): self.max_hits = int(value) self.max_seg_hits = max(10, self.max_hits * 2) if option == "--suppress-hits": self.suppress_hits = True if option in ("-x", "--transcriptome-max-hits"): self.t_max_hits = int(value) if option in ("-G", "--GTF"): self.gff_annotation = value if option in ("-T", "--transcriptome-only"): self.transcriptome_only = True if option == "--transcriptome-index": self.transcriptome_index = value if option in("-M", "--prefilter-multihits"): self.prefilter_multi = True if option in ("-j", "--raw-juncs"): self.raw_junctions = value if option == "--no-novel-juncs": self.find_novel_juncs = False if option == "--no-novel-indels": self.find_novel_indels = False if option == "--fusion-search": self.fusion_search = True if option == "--fusion-anchor-length": self.fusion_anchor_length = int(value) if option == "--fusion-min-dist": self.fusion_min_dist = int(value) if option == "--fusion-read-mismatches": self.fusion_read_mismatches = int(value) if option == "--fusion-multireads": 
self.fusion_multireads = int(value) if option == "--fusion-multipairs": self.fusion_multipairs = int(value) if option == "--fusion-ignore-chromosomes": self.fusion_ignore_chromosomes = value.split(",") if option == "--fusion-do-not-resolve-conflicts": self.fusion_do_not_resolve_conflicts = True if option == "--no-gtf-juncs": self.find_GFF_juncs = False if option == "--no-coverage-search": self.coverage_search = False if option == "--coverage-search": self.coverage_search = True # -W option : score and edit distance filtering in fix_map_ordering # this is *soft* post-processing of bowtie2 results, should be # more effectively implemented by using bowtie2's score function if option == "-W": self.b2scoreflt = True self.closure_search = False #if option == "--no-closure-search": # self.closure_search = False #if option == "--closure-search": # self.closure_search = True if option == "--microexon-search": self.microexon_search = True self.butterfly_search = False #if option == "--butterfly-search": # self.butterfly_search = True #if option == "--no-butterfly-search": # self.butterfly_search = False if option in ("-N", "--read-mismatches"): self.read_mismatches = int(value) if option == "--read-gap-length": self.read_gap_length = int(value) if option == "--read-edit-dist": self.read_edit_dist = int(value) if option == "--read-realign-edit-dist": self.read_realign_edit_dist = int(value) if option == "--segment-length": self.segment_length = int(value) if option == "--segment-mismatches": self.segment_mismatches = int(value) if option == "--bowtie-n": self.bowtie_alignment_option = "-n" if option == "--max-insertion-length": self.max_insertion_length = int(value) if option == "--max-deletion-length": self.max_deletion_length = int(value) if option == "--insertions": self.raw_insertions = value if option == "--deletions": self.raw_deletions = value if option == "--report-secondary-alignments": self.report_secondary_alignments = True if option == "--no-discordant": self.report_discordant_pair_alignments = False if option == "--no-mixed": self.report_mixed_alignments = False if option == "--keep-fasta-order": self.keep_fasta_order = True if option == "--allow-partial-mapping": self.partial_mapping = True if option in ("-o", "--output-dir"): custom_out_dir = value + "/" if option in ("-R", "--resume"): self.resume_dir = value if option == "--tmp-dir": custom_tmp_dir = value + "/" if self.transcriptome_only: self.find_novel_juncs=False self.find_novel_indels=False if custom_out_dir: output_dir = custom_out_dir logging_dir = output_dir + "logs/" tmp_dir = output_dir + "tmp/" sam_header = tmp_dir + "stub_header.sam" if custom_tmp_dir: tmp_dir = custom_tmp_dir sam_header = tmp_dir + "stub_header.sam" if len(args) < 2 and not self.resume_dir: raise Usage(use_message) if self.read_realign_edit_dist == None: self.read_realign_edit_dist = self.read_edit_dist + 1 return args def nonzeroFile(filepath): if os.path.exists(filepath): fpath, fname=os.path.split(filepath) fbase, fext =os.path.splitext(fname) if fext.lower() == ".bam": samtools_view_cmd = ["samtools", "view", filepath] samtools_view = subprocess.Popen(samtools_view_cmd, stdout=subprocess.PIPE) head_cmd = ["head", "-1"] head = subprocess.Popen(head_cmd, stdin=samtools_view.stdout, stdout=subprocess.PIPE) samtools_view.stdout.close() # as per http://bugs.python.org/issue7678 output = head.communicate()[0][:-1] if len(output) > 0: return True else: if os.path.getsize(filepath)>25: return True return False # check if a file exists and has non-zero (or 
minimum) size def fileExists(filepath, minfsize=2): if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize: return True else: return False def removeFileWithIndex(filepath): if os.path.exists(filepath): os.remove(filepath) fileindexpath = filepath + ".index" if os.path.exists(fileindexpath): os.remove(fileindexpath) def getFileDir(filepath): #if fullpath given, returns path including the ending / fpath, fname=os.path.split(filepath) if fpath: fpath+='/' return fpath def getFileBaseName(filepath): fpath, fname=os.path.split(filepath) fbase, fext =os.path.splitext(fname) fx=fext.lower() if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0: return fbase elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0: fb, fext = os.path.splitext(fbase) fx=fext.lower() if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0: return fb else: return fbase else: if len(fbase)>0: return fbase else: return fname # Returns the current time in a nice format def right_now(): curr_time = datetime.now() return curr_time.strftime("%c") # The TopHat logging formatter def th_log(out_str): if tophat_logger: tophat_logger.info(out_str) def th_logp(out_str=""): print >> sys.stderr, out_str if tophat_log: print >> tophat_log, out_str def die(msg=None): if msg is not None: th_logp(msg) sys.exit(1) # Ensures that the output, logging, and temp directories are present. If not, # they are created def prepare_output_dir(): #th_log("Preparing output location "+output_dir) if os.path.exists(output_dir): pass else: os.mkdir(output_dir) if os.path.exists(logging_dir): pass else: os.mkdir(logging_dir) if os.path.exists(tmp_dir): pass else: try: os.makedirs(tmp_dir) except OSError, o: die("\nError creating directory %s (%s)" % (tmp_dir, o)) # to be added as preexec_fn for every subprocess.Popen() call: # see http://bugs.python.org/issue1652 def subprocess_setup(): # Python installs a SIGPIPE handler by default, which causes # gzip or other de/compression pipes to complain about "stdout: Broken pipe" signal.signal(signal.SIGPIPE, signal.SIG_DFL) # Check that the Bowtie index specified by the user is present and all files # are there. def check_bowtie_index(idx_prefix, is_bowtie2, add="(genome)"): if currentStage >= resumeStage: th_log("Checking for Bowtie index files "+add+"..") idxext="ebwt" bowtie_ver="" if is_bowtie2: idxext="bt2" bowtie_ver="2 " idx_fwd_1 = idx_prefix + ".1."+idxext idx_fwd_2 = idx_prefix + ".2."+idxext idx_rev_1 = idx_prefix + ".rev.1."+idxext idx_rev_2 = idx_prefix + ".rev.2."+idxext #bwtbotherr = "Warning: we do not recommend to have both Bowtie1 and Bowtie2 indexes in the same directory \n the genome sequence (*.fa) may not be compatible with one of them" bwtbotherr = "\tFound both Bowtie1 and Bowtie2 indexes." 
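# ---------------------------------------------------------------------------
# Standalone sketch, not called by TopHat: nonzeroFile() above checks whether
# a BAM file has any records by chaining "samtools view" into "head -1" and
# closing the upstream stdout handle in the parent, as recommended in
# http://bugs.python.org/issue7678, so the upstream process receives SIGPIPE
# once the reader exits. The helper name below is hypothetical and samtools
# is assumed to be on the PATH; it only restates the idiom in isolation.
def _example_first_bam_record(bam_path):
    import subprocess
    view = subprocess.Popen(["samtools", "view", bam_path],
                            stdout=subprocess.PIPE)
    head = subprocess.Popen(["head", "-1"],
                            stdin=view.stdout, stdout=subprocess.PIPE)
    view.stdout.close()   # let "samtools view" see SIGPIPE when head exits early
    return head.communicate()[0]   # first record line, or "" for an empty BAM
# ---------------------------------------------------------------------------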
if os.path.exists(idx_fwd_1) and \ os.path.exists(idx_fwd_2) and \ os.path.exists(idx_rev_1) and \ os.path.exists(idx_rev_2): if os.path.exists(idx_prefix + ".1.ebwt") and os.path.exists(idx_prefix + ".1.bt2"): print >> sys.stderr, bwtbotherr return else: bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")" if is_bowtie2: bwtidx_env = os.environ.get("BOWTIE2_INDEXES") else: bwtidx_env = os.environ.get("BOWTIE_INDEXES") if bwtidx_env == None: die(bwtidxerr) if os.path.exists(bwtidx_env+idx_fwd_1) and \ os.path.exists(bwtidx_env+idx_fwd_2) and \ os.path.exists(bwtidx_env+idx_rev_1) and \ os.path.exists(bwtidx_env+idx_rev_2): if os.path.exists(bwtidx_env + idx_prefix + ".1.ebwt") and os.path.exists(bwtidx_env + idx_prefix + ".1.bt2"): print >> sys.stderr, bwtbotherr return else: die(bwtidxerr) # Reconstructs the multifasta file from which the Bowtie index was created, if # it's not already there. def bowtie_idx_to_fa(idx_prefix, is_bowtie2): idx_name = idx_prefix.split('/')[-1] th_log("Reconstituting reference FASTA file from Bowtie index") try: tmp_fasta_file_name = tmp_dir + idx_name + ".fa" tmp_fasta_file = open(tmp_fasta_file_name, "w") inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w") if is_bowtie2: inspect_cmd = [prog_path("bowtie2-inspect")] else: inspect_cmd = [prog_path("bowtie-inspect")] inspect_cmd += [idx_prefix] th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name) ret = subprocess.call(inspect_cmd, stdout=tmp_fasta_file, stderr=inspect_log) # Bowtie reported an error if ret != 0: die(fail_str+"Error: bowtie-inspect returned an error\n"+log_tail(logging_dir + "bowtie_inspect_recons.log")) # Bowtie not found except OSError, o: if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?") return tmp_fasta_file_name # Checks whether the multifasta file for the genome is present alongside the # Bowtie index files for it. def check_fasta(idx_prefix, is_bowtie2): th_log("Checking for reference FASTA file") idx_fasta = idx_prefix + ".fa" if os.path.exists(idx_fasta): return idx_fasta else: if is_bowtie2: bowtie_idx_env_var = os.environ.get("BOWTIE2_INDEXES") else: bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES") if bowtie_idx_env_var: idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa" if os.path.exists(idx_fasta): return idx_fasta th_logp("\tWarning: Could not find FASTA file " + idx_fasta) idx_fa = bowtie_idx_to_fa(idx_prefix, is_bowtie2) return idx_fa # Check that both the Bowtie index and the genome's fasta file are present def check_index(idx_prefix, is_bowtie2): check_bowtie_index(idx_prefix, is_bowtie2) ref_fasta_file = check_fasta(idx_prefix, is_bowtie2) return (ref_fasta_file, None) # Retrive a tuple containing the system's version of Bowtie. 
Parsed from # `bowtie --version` def get_bowtie_version(): try: # Launch Bowtie to capture its version info proc = subprocess.Popen([bowtie_path, "--version"], stdout=subprocess.PIPE) stdout_value = proc.communicate()[0] bowtie_version = None if not stdout_value: stdout_value='' bowtie_out = stdout_value.splitlines()[0] version_str=" version " ver_str_idx = bowtie_out.find(version_str) if ver_str_idx != -1: version_val = bowtie_out[(ver_str_idx + len(version_str)):] bvers=re.findall(r'\d+', version_val) bowtie_version = [int(x) for x in bvers] while len(bowtie_version)<4: bowtie_version.append(0) return bowtie_version except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: bowtie not found on this system" die(errmsg) def get_index_sam_header(params, idx_prefix, name = ""): noSkip = currentStage >= resumeStage try: temp_sam_header_filename = tmp_dir + "temp.samheader.sam" temp_sam_header_file = None if noSkip: temp_sam_header_file = open(temp_sam_header_filename, "w") bowtie_header_cmd = [bowtie_path] read_params = params.read_params if not params.bowtie2: bowtie_header_cmd += ["--sam"] if read_params.color: bowtie_header_cmd.append('-C') bowtie_header_cmd.extend([idx_prefix, '/dev/null']) if noSkip: subprocess.call(bowtie_header_cmd, stdout=temp_sam_header_file, stderr=open('/dev/null')) temp_sam_header_file.close() temp_sam_header_file = open(temp_sam_header_filename, "r") bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1] if name != "": bowtie_sam_header_filename += ("_" + name) bowtie_sam_header_filename += ".bwt.samheader.sam" if not noSkip: return bowtie_sam_header_filename bowtie_sam_header_file = open(bowtie_sam_header_filename, "w") preamble = [] sq_dict_lines = [] for line in temp_sam_header_file.readlines(): line = line.strip() if line.find("@SQ") != -1: # Sequence dictionary record cols = line.split('\t') seq_name = None for col in cols: fields = col.split(':') #print fields if len(fields) > 0 and fields[0] == "SN": seq_name = fields[1] if seq_name == None: die("Error: malformed sequence dictionary in sam header") sq_dict_lines.append([seq_name,line]) elif line.find("CL"): continue else: preamble.append(line) print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate" if read_params.read_group_id and read_params.sample_id: rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id, read_params.sample_id) if read_params.library_id: rg_str += "\tLB:%s" % read_params.library_id if read_params.description: rg_str += "\tDS:%s" % read_params.description if read_params.seq_platform_unit: rg_str += "\tPU:%s" % read_params.seq_platform_unit if read_params.seq_center: rg_str += "\tCN:%s" % read_params.seq_center if read_params.mate_inner_dist: rg_str += "\tPI:%s" % read_params.mate_inner_dist if read_params.seq_run_date: rg_str += "\tDT:%s" % read_params.seq_run_date if read_params.seq_platform: rg_str += "\tPL:%s" % read_params.seq_platform print >> bowtie_sam_header_file, rg_str if not params.keep_fasta_order: sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0])) for [name, line] in sq_dict_lines: print >> bowtie_sam_header_file, line print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd) bowtie_sam_header_file.close() temp_sam_header_file.close() return bowtie_sam_header_filename except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: bowtie not found on this system" die(errmsg) # Make sure Bowtie is installed and is 
recent enough to be useful def check_bowtie(params): bowtie_req="" if params.bowtie2: bowtie_req="2-align" log_msg = "Checking for Bowtie" th_log(log_msg) bowtie_bin = "bowtie"+bowtie_req global bowtie_path bowtie_version = None bowtie_path=which(bowtie_bin) if bowtie_path: bowtie_version = get_bowtie_version() if params.bowtie2 and bowtie_version == None: th_logp(" Bowtie 2 not found, checking for older version..") #try to fallback on bowtie 1 params.bowtie2=False bowtie_path=which('bowtie') if bowtie_path: bowtie_version=get_bowtie_version() if bowtie_version == None: die("Error: Bowtie not found on this system.") if params.bowtie2: if bowtie_version[1] < 1 and bowtie_version[2] < 5: die("Error: TopHat requires Bowtie 2.0.5 or later") else: if bowtie_version[0] < 1 and (bowtie_version[1] < 12 or bowtie_version[2] < 9): die("Error: TopHat requires Bowtie 0.12.9 or later") th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version])) # Retrive a tuple containing the system's version of samtools. Parsed from # `samtools` def get_samtools_version(): try: # Launch Bowtie to capture its version info proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE) samtools_out = proc.communicate()[1] # Find the version identifier version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out) samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]] if version_match.group(4): samtools_version_arr.append(version_match.group(4)) else: samtools_version_arr.append(0) return version_match.group(), samtools_version_arr except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: samtools not found on this system" die(errmsg) # Make sure the SAM tools are installed and are recent enough to be useful def check_samtools(): th_log("Checking for Samtools") global samtools_path samtools_path=prog_path("samtools") samtools_version_str, samtools_version_arr = get_samtools_version() if samtools_version_str == None: die("Error: Samtools not found on this system") elif samtools_version_arr[1] < 1 or samtools_version_arr[2] < 7: die("Error: TopHat requires Samtools 0.1.7 or later") th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr])) class FastxReader: def __init__(self, i_file, is_color=0, fname=''): self.bufline=None self.format=None self.ifile=i_file self.nextRecord=None self.eof=None self.fname=fname self.lastline=None self.numrecords=0 self.isColor=0 if is_color : self.isColor=1 # determine file type #no records processed yet, skip custom header lines if any hlines=10 # allow maximum 10 header lines self.lastline=" " while hlines>0 and self.lastline[0] not in "@>" : self.lastline=self.ifile.readline() hlines-=1 if self.lastline[0] == '@': self.format='fastq' self.nextRecord=self.nextFastq elif self.lastline[0] == '>': self.format='fasta' self.nextRecord=self.nextFasta else: die("Error: cannot determine record type in input file %s" % fname) self.bufline=self.lastline self.lastline=None def nextFastq(self): # returning tuple: (seqID, sequence_string, seq_len, qv_string) seqid,seqstr,qstr,seq_len='','','',0 if self.eof: return (seqid, seqstr, seq_len, qstr) fline=self.getLine #shortcut to save a bit of time line=fline() if not line : return (seqid, seqstr, seq_len, qstr) while len(line.rstrip())==0: # skip empty lines line=fline() if not line : return (seqid, seqstr,seq_len, qstr) try: if line[0] != "@": raise ValueError("Records in Fastq files should start with '@' 
character") seqid = line[1:].rstrip() seqstr = fline().rstrip() #There may now be more sequence lines, or the "+" quality marker line: while True: line = fline() if not line: raise ValueError("Premature end of file (missing quality values for "+seqid+")") if line[0] == "+": # -- sequence string ended #qtitle = line[1:].rstrip() #if qtitle and qtitle != seqid: # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \ # % (seqid, qtitle)) break seqstr += line.rstrip() #removes trailing newlines #loop until + found seq_len = len(seqstr) #at least one line of quality data should follow qstrlen=0 #now read next lines as quality values until seq_len is reached while True: line=fline() if not line : break #end of file qstr += line.rstrip() qstrlen=len(qstr) if qstrlen + self.isColor >= seq_len : break # qv string has reached the length of seq string #loop until qv has the same length as seq if self.isColor: # and qstrlen==seq_len : if qstrlen==seq_len: #qual string may have a dummy qv at the beginning, should be stripped qstr = qstr[1:] qstrlen -= 1 if qstrlen!=seq_len-1: raise ValueError("Length mismatch between sequence and quality strings "+ \ "for %s (%i vs %i)." % (seqid, seq_len, qstrlen)) else: if seq_len != qstrlen : raise ValueError("Length mismatch between sequence and quality strings "+ \ "for %s (%i vs %i)." % (seqid, seq_len, qstrlen)) except ValueError, err: die("\nError encountered parsing file "+self.fname+":\n "+str(err)) #return the record self.numrecords+=1 ##--discard the primer base [NO] if self.isColor : seq_len-=1 seqstr = seqstr[1:] return (seqid, seqstr, seq_len, qstr) def nextFasta(self): # returning tuple: (seqID, sequence_string, seq_len) seqid,seqstr,seq_len='','',0 fline=self.getLine # shortcut to readline function of f line=fline() # this will use the buffer line if it's there if not line : return (seqid, seqstr, seq_len, None) while len(line.rstrip())==0: # skip empty lines line=fline() if not line : return (seqid, seqstr, seq_len, None) try: if line[0] != ">": raise ValueError("Records in Fasta files must start with '>' character") seqid = line[1:].split()[0] #more sequence lines, or the ">" quality marker line: while True: line = fline() if not line: break if line[0] == '>': #next sequence starts here self.ungetLine() break seqstr += line.rstrip() #loop until '>' found seq_len = len(seqstr) if seq_len < 3: raise ValueError("Read %s too short (%i)." \ % (seqid, seq_len)) except ValueError, err: die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err)) #return the record and continue self.numrecords+=1 if self.isColor : # -- discard primer base seq_len-=1 seqstr=seqstr[1:] return (seqid, seqstr, seq_len, None) def getLine(self): if self.bufline: #return previously buffered line r=self.bufline self.bufline=None return r else: #read a new line from stream and return it if self.eof: return None self.lastline=self.ifile.readline() if not self.lastline: self.eof=1 return None return self.lastline def ungetLine(self): if self.lastline is None: th_logp("Warning: FastxReader called ungetLine() with no prior line!") self.bufline=self.lastline self.lastline=None #< class FastxReader def fa_write(fhandle, seq_id, seq): """ Write to a file in the FASTA format. 
Arguments: - `fhandle`: A file handle open for writing - `seq_id`: The sequence id string for this sequence - `seq`: An unformatted string of the sequence to write """ line_len = 60 fhandle.write(">" + seq_id + "\n") for i in xrange(len(seq) / line_len + 1): start = i * line_len #end = (i+1) * line_len if (i+1) * line_len < len(seq) else len(seq) if (i+1) * line_len < len(seq): end = (i+1) * line_len else: end = len(seq) fhandle.write( seq[ start:end ] + "\n") class ZReader: def __init__(self, filename, params, guess=True): self.fname=filename self.file=None self.fsrc=None self.popen=None sys_params = params.system_params pipecmd=[] s=filename.lower() if s.endswith(".bam"): pipecmd=[prog_path("bam2fastx")] if params.read_params.color: pipecmd+=["--color"] pipecmd+=["--all", "-"] else: if guess: if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"): pipecmd=['gzip'] else: if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"): pipecmd=['bzip2'] if len(pipecmd)>0 and which(pipecmd[0]) is None: die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename)) if len(pipecmd)>0: if pipecmd[0]=='gzip' and sys_params.zipper.endswith('pigz'): pipecmd[0]=sys_params.zipper pipecmd.extend(sys_params.zipper_opts) elif pipecmd[0]=='bzip2' and sys_params.zipper.endswith('pbzip2'): pipecmd[0]=sys_params.zipper pipecmd.extend(sys_params.zipper_opts) else: #not guessing, but must still check if it's a compressed file if use_zpacker and filename.endswith(".z"): pipecmd=[sys_params.zipper] pipecmd.extend(sys_params.zipper_opts) if pipecmd: pipecmd+=['-cd'] if pipecmd: try: self.fsrc=open(self.fname, 'rb') self.popen=subprocess.Popen(pipecmd, preexec_fn=subprocess_setup, stdin=self.fsrc, stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True) except Exception: die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname) self.file=self.popen.stdout else: self.file=open(filename) def close(self): if self.fsrc: self.fsrc.close() self.file.close() if self.popen: self.popen.wait() self.popen=None class ZWriter: def __init__(self, filename, sysparams): self.fname=filename if use_zpacker: pipecmd=[sysparams.zipper,"-cf", "-"] self.ftarget=open(filename, "wb") try: self.popen=subprocess.Popen(pipecmd, preexec_fn=subprocess_setup, stdin=subprocess.PIPE, stderr=tophat_log, stdout=self.ftarget, close_fds=True) except Exception: die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname) self.file=self.popen.stdin # client writes to this end of the pipe else: #no compression self.file=open(filename, "w") self.ftarget=None self.popen=None def close(self): self.file.close() if self.ftarget: self.ftarget.close() if self.popen: self.popen.wait() #! required to actually flush the pipes (eek!) 
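# ---------------------------------------------------------------------------
# Standalone sketch, not used by TopHat: ZWriter above compresses its output
# by handing the caller a pipe into an external packer process whose stdout
# is the target file, and the wait() on close is what finally flushes the
# data to disk. The function below is hypothetical, assumes gzip is on the
# PATH, and only restates that pattern in miniature.
def _example_compressed_writer(out_path, lines):
    import subprocess
    target = open(out_path, "wb")
    packer = subprocess.Popen(["gzip", "-c"],
                              stdin=subprocess.PIPE, stdout=target)
    for line in lines:
        packer.stdin.write(line)
    packer.stdin.close()
    packer.wait()    # required to actually flush the pipe before closing the file
    target.close()
# ---------------------------------------------------------------------------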
self.popen=None # check_reads_format() examines the first few records in the user files # to determines the file format def check_reads_format(params, reads_files): #seed_len = params.read_params.seed_length fileformat = params.read_params.reads_format observed_formats = set([]) # observed_scales = set([]) min_seed_len = 99999 max_seed_len = 0 files = reads_files.split(',') for f_name in files: #try: zf = ZReader(f_name, params) #except IOError: # die("Error: could not open file "+f_name) freader=FastxReader(zf.file, params.read_params.color, zf.fname) toread=4 #just sample the first 4 reads while toread>0: seqid, seqstr, seq_len, qstr = freader.nextRecord() if not seqid: break toread-=1 if seq_len < 20: th_logp("Warning: found a read < 20bp in "+f_name) else: min_seed_len = min(seq_len, min_seed_len) max_seed_len = max(seq_len, max_seed_len) zf.close() observed_formats.add(freader.format) if len(observed_formats) > 1: die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.") fileformat=list(observed_formats)[0] #if seed_len != None: # seed_len = max(seed_len, max_seed_len) #else: # seed_len = max_seed_len #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len) th_logp("\tformat:\t\t %s" % fileformat) if fileformat == "fastq": quality_scale = "phred33 (default)" if params.read_params.solexa_quals and not params.read_params.phred64_quals: quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)" elif params.read_params.phred64_quals: quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)" th_logp("\tquality scale:\t %s" % quality_scale) elif fileformat == "fasta": if params.read_params.color: params.read_params.integer_quals = True #print seed_len, format, solexa_scale #NOTE: seed_len will be re-evaluated later by prep_reads return TopHatParams.ReadParams(params.read_params.solexa_quals, params.read_params.phred64_quals, params.read_params.quals, params.read_params.integer_quals, params.read_params.color, params.read_params.library_type, #seed_len, params.read_params.seed_length, fileformat, params.read_params.mate_inner_dist, params.read_params.mate_inner_dist_std_dev, params.read_params.read_group_id, params.read_params.sample_id, params.read_params.library_id, params.read_params.description, params.read_params.seq_platform_unit, params.read_params.seq_center, params.read_params.seq_run_date, params.read_params.seq_platform) def grep_file(logfile, regex="warning"): f=open(logfile, "r") r=[] for line in f: if re.match(regex, line, re.IGNORECASE): r += [line.rstrip()] return r def log_tail(logfile, lines=1): f=open(logfile, "r") f.seek(0, 2) fbytes= f.tell() size=lines block=-1 while size > 0 and fbytes+block*1024 > 0: if (fbytes+block*1024 > 0): ##Seek back once more, if possible f.seek( block*1024, 2 ) else: #Seek to the beginning f.seek(0, 0) data= f.read( 1024 ) linesFound= data.count('\n') size -= linesFound block -= 1 if (fbytes + block*1024 > 0): f.seek(block*1024, 2) else: f.seek(0,0) #f.readline() # find a newline lastBlocks= list( f.readlines() ) f.close() return "".join(lastBlocks[-lines:]) # Format a DateTime as a pretty string. # FIXME: Currently doesn't support days! 
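# ---------------------------------------------------------------------------
# Standalone sketch, not called by TopHat: log_tail() above returns the last
# lines of a (possibly large) log by seeking backwards from the end of the
# file in 1 KB steps until enough newlines are in view, instead of reading
# the whole file. The helper below is hypothetical and just condenses that
# idea.
def _example_tail(path, lines=1):
    f = open(path, "rb")
    f.seek(0, 2)                         # jump to end of file
    size = f.tell()
    data = ""
    while data.count("\n") <= lines and size > 0:
        size -= min(1024, size)          # widen the window by up to 1 KB
        f.seek(size, 0)
        data = f.read()                  # read from the window start to EOF
    f.close()
    return "".join(data.splitlines(True)[-lines:])
# ---------------------------------------------------------------------------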
def formatTD(td): days = td.days hours = td.seconds // 3600 minutes = (td.seconds % 3600) // 60 seconds = td.seconds % 60 if days > 0: return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds) else: return '%02d:%02d:%02d' % (hours, minutes, seconds) class PrepReadsInfo: def __init__(self, fname, out_fname): self.min_len = [0, 0] self.max_len = [0, 0] self.in_count = [0, 0] self.out_count= [0, 0] self.kept_reads = [None, None] try: f=open(fname,"r") self.min_len[0]=int(f.readline().split("=")[-1]) self.max_len[0]=int(f.readline().split("=")[-1]) self.in_count[0]=int(f.readline().split("=")[-1]) self.out_count[0]=int(f.readline().split("=")[-1]) if (self.out_count[0]==0) or (self.max_len[0]<16): raise Exception() line=f.readline() if line and line.find("=") > 0: self.min_len[1]=int(line.split("=")[-1]) self.max_len[1]=int(f.readline().split("=")[-1]) self.in_count[1]=int(f.readline().split("=")[-1]) self.out_count[1]=int(f.readline().split("=")[-1]) if (self.out_count[1]==0) or (self.max_len[1]<16): raise Exception() except Exception: die(fail_str+"Error retrieving prep_reads info.") sides=["left", "right"] for ri in (0,1): if self.in_count[ri]==0: break trashed=self.in_count[ri]-self.out_count[ri] self.kept_reads[ri]=out_fname.replace("%side%", sides[ri]) th_logp("\t%5s reads: min. length=%s, max. length=%s, %s kept reads (%s discarded)" % (sides[ri], self.min_len[ri], self.max_len[ri], self.out_count[ri], trashed)) def prep_reads_cmd(params, l_reads_list, l_quals_list=None, r_reads_list=None, r_quals_list=None, out_file=None, aux_file=None, index_file=None, filter_reads=[], hits_to_filter=[]): #generate a prep_reads cmd arguments prep_cmd = [prog_path("prep_reads")] prep_cmd.extend(params.cmd()) if params.read_params.reads_format == "fastq": prep_cmd += ["--fastq"] elif params.read_params.reads_format == "fasta": prep_cmd += ["--fasta"] if hits_to_filter: prep_cmd += ["--flt-hits=" + ",".join(hits_to_filter)] if aux_file: prep_cmd += ["--aux-outfile="+aux_file] if index_file: prep_cmd += ["--index-outfile="+index_file] # could be a template if filter_reads: prep_cmd += ["--flt-reads=" + ",".join(filter_reads)] if params.sam_header: prep_cmd += ["--sam-header="+params.sam_header] if out_file: prep_cmd += ["--outfile="+out_file] #could be a template prep_cmd.append(l_reads_list) if l_quals_list: prep_cmd.append(l_quals_list) if r_reads_list: prep_cmd.append(r_reads_list) if r_quals_list: prep_cmd.append(r_quals_list) return prep_cmd # Calls the prep_reads executable, which prepares an internal read library. # The read library features reads with monotonically increasing integer IDs. # prep_reads also filters out very low complexy or garbage reads as well as # polyA reads. 
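# ---------------------------------------------------------------------------
# Standalone sketch, not part of TopHat: PrepReadsInfo above consumes the
# prep_reads info file as a sequence of "key=value" lines, taking the value
# to the right of the '=' on each line. A generic version of that parsing,
# with a hypothetical helper name and info file, could be:
def _example_read_info(path):
    info = {}
    f = open(path, "r")
    for line in f:
        if line.find("=") > 0:
            key, sep, value = line.rstrip().partition("=")
            info[key.strip()] = value.strip()
    f.close()
    return info
# e.g. _example_read_info("prep_reads.info").get("min_read_len")
# ---------------------------------------------------------------------------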
#--> returns a PrepReadsInfo structure def prep_reads(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list, prefilter_reads=[]): reads_suffix = ".bam" use_bam = True #if params.read_params.color: # reads_suffix = ".fq" # use_bam = False # for parallelization, we don't compress the read files do_use_zpacker = use_zpacker and not use_bam if do_use_zpacker and params.system_params.num_threads > 1: do_use_zpacker = False if do_use_zpacker: reads_suffix += ".z" out_suffix = "_kept_reads" + reads_suffix #kept_reads_filename = tmp_dir + output_name + reads_suffix for side in ("left", "right"): kept_reads_filename = tmp_dir + side + out_suffix if resumeStage<1 and os.path.exists(kept_reads_filename): os.remove(kept_reads_filename) out_tmpl="left" out_fname=None kept_reads = None #output file handle if r_reads_list: out_tmpl="%side%" info_file = output_dir+"prep_reads.info" if fileExists(info_file,10) and resumeStage>0 : return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix) if use_bam: out_fname = tmp_dir + out_tmpl + out_suffix else: #assumed no right reads given here, only one side is being processed kept_reads = open(tmp_dir + out_tmpl + out_suffix, "wb") log_fname=logging_dir + "prep_reads.log" filter_log = open(log_fname,"w") index_file = out_fname + ".index" if do_use_zpacker: index_file=None prep_cmd=prep_reads_cmd(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list, out_fname, info_file, index_file, prefilter_reads) shell_cmd = ' '.join(prep_cmd) #finally, add the compression pipe if needed zip_cmd=[] if do_use_zpacker: zip_cmd=[ params.system_params.zipper ] zip_cmd.extend(params.system_params.zipper_opts) zip_cmd.extend(['-c','-']) shell_cmd +=' | '+' '.join(zip_cmd) if not use_bam: shell_cmd += ' >' +kept_reads_filename retcode = None try: print >> run_log, shell_cmd if do_use_zpacker: filter_proc = subprocess.Popen(prep_cmd, stdout=subprocess.PIPE, stderr=filter_log) zip_proc=subprocess.Popen(zip_cmd, preexec_fn=subprocess_setup, stdin=filter_proc.stdout, stderr=tophat_log, stdout=kept_reads) filter_proc.stdout.close() #as per http://bugs.python.org/issue7678 zip_proc.communicate() retcode=filter_proc.poll() if retcode==0: retcode=zip_proc.poll() else: if use_bam: retcode = subprocess.call(prep_cmd, stderr=filter_log) else: retcode = subprocess.call(prep_cmd, stdout=kept_reads, stderr=filter_log) if retcode: die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname)) except OSError, o: errmsg=fail_str+str(o) die(errmsg+"\n"+log_tail(log_fname)) if kept_reads: kept_reads.close() warnings=grep_file(log_fname) if warnings: th_logp("\n"+"\n".join(warnings)+"\n") return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix) # Call bowtie def bowtie(params, bwt_idx_prefix, sam_headers, reads_list, reads_format, num_mismatches, gap_length, edit_dist, realign_edit_dist, mapped_reads, unmapped_reads, extra_output = "", mapping_type = _reads_vs_G, multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search start_time = datetime.now() bwt_idx_name = bwt_idx_prefix.split('/')[-1] reads_file=reads_list[0] readfile_basename=getFileBaseName(reads_file) g_mapping, t_mapping, seg_mapping = False, False, False sam_header_filename = None genome_sam_header_filename = None if mapping_type == _reads_vs_T: t_mapping = True sam_header_filename = sam_headers[0] genome_sam_header_filename = sam_headers[1] else: sam_header_filename = sam_headers if mapping_type >= _segs_vs_G: seg_mapping = True else: g_mapping = True 
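# ---------------------------------------------------------------------------
# Standalone sketch, not called anywhere: prep_reads() above, and most of the
# other tool wrappers in this script, launch an external program the same
# way -- echo the command to the run log, send its stderr to a per-step log
# file, and on a non-zero exit status die() with the tail of that log. The
# helper below is hypothetical; it reuses die() and log_tail(), which are
# defined earlier in this script.
def _example_run_step(cmd, log_fname):
    import subprocess
    step_log = open(log_fname, "w")
    retcode = subprocess.call(cmd, stderr=step_log)
    step_log.close()
    if retcode != 0:
        die("Error running " + " ".join(cmd) + "\n" + log_tail(log_fname, 10))
# ---------------------------------------------------------------------------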
bowtie_str = "Bowtie" if params.bowtie2: bowtie_str += "2" if seg_mapping: if not params.bowtie2: backup_bowtie_alignment_option = params.bowtie_alignment_option params.bowtie_alignment_option = "-v" resume_skip = resumeStage > currentStage unmapped_reads_out=None if unmapped_reads: unmapped_reads_out=unmapped_reads+".fq" mapped_reads += ".bam" if unmapped_reads: unmapped_reads_out = unmapped_reads + ".bam" use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color if use_FIFO: unmapped_reads_out+=".z" if resume_skip: #skipping this step return (mapped_reads, unmapped_reads_out) bwt_logname=logging_dir + 'bowtie.'+readfile_basename+'.log' if t_mapping: th_log("Mapping %s to transcriptome %s with %s %s" % (readfile_basename, bwt_idx_name, bowtie_str, extra_output)) else: qryname = readfile_basename if len(reads_list) > 1: bnames=[] for fname in reads_list: bnames += [getFileBaseName(fname)] qryname = ",".join(bnames) th_log("Mapping %s to genome %s with %s %s" % (qryname, bwt_idx_name, bowtie_str, extra_output)) if use_FIFO: global unmapped_reads_fifo unmapped_reads_fifo=unmapped_reads+".fifo" if os.path.exists(unmapped_reads_fifo): os.remove(unmapped_reads_fifo) try: os.mkfifo(unmapped_reads_fifo) except OSError, o: die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o)) # Launch Bowtie try: bowtie_cmd = [bowtie_path] if reads_format == "fastq": bowtie_cmd += ["-q"] elif reads_format == "fasta": bowtie_cmd += ["-f"] if params.read_params.color: bowtie_cmd += ["-C", "--col-keepends"] unzip_cmd=None bam_input=False if len(reads_list) > 0 and reads_list[0].endswith('.bam'): bam_input=True unzip_cmd=[ prog_path('bam2fastx'), "--all" ] if params.read_params.color: unzip_cmd.append("--color") if reads_format: unzip_cmd.append("--" + reads_format) unzip_cmd+=[reads_list[0]] if use_zpacker and (unzip_cmd is None): unzip_cmd=[ params.system_params.zipper ] unzip_cmd.extend(params.system_params.zipper_opts) unzip_cmd+=['-cd'] fifo_pid=None if use_FIFO: unm_zipcmd=[ params.system_params.zipper ] unm_zipcmd.extend(params.system_params.zipper_opts) unm_zipcmd+=['-c'] print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & ' fifo_pid=os.fork() if fifo_pid==0: def on_sig_exit(sig, func=None): os._exit(os.EX_OK) signal.signal(signal.SIGTERM, on_sig_exit) subprocess.call(unm_zipcmd, stdin=open(unmapped_reads_fifo, "r"), stderr=tophat_log, stdout=open(unmapped_reads_out, "wb")) os._exit(os.EX_OK) fix_map_cmd = [prog_path('fix_map_ordering')] if params.read_params.color: fix_map_cmd += ["--color"] if params.bowtie2: #if t_mapping or g_mapping: max_penalty, min_penalty = params.bowtie2_params.mp.split(',') max_penalty, min_penalty = int(max_penalty), int(min_penalty) min_score = (max_penalty - 1) * realign_edit_dist fix_map_cmd += ["--bowtie2-min-score", str(min_score)] # testing score filtering if params.b2scoreflt: fix_map_cmd +=["-W"+str(min_score+max_penalty)] fix_map_cmd += ["--read-mismatches", str(params.read_mismatches), "--read-gap-length", str(params.read_gap_length), "--read-edit-dist", str(params.read_edit_dist), "--read-realign-edit-dist", str(params.read_realign_edit_dist)] #write BAM file out_bam = mapped_reads if not t_mapping: fix_map_cmd += ["--index-outfile", mapped_reads + ".index"] if not params.bowtie2: fix_map_cmd += ["--bowtie1"] if multihits_out != None: fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads] fix_map_cmd += ["--max-multihits", str(params.max_hits)] if 
t_mapping: out_bam = "-" # we'll pipe into map2gtf fix_map_cmd += ["--sam-header", sam_header_filename, "-", out_bam] if unmapped_reads: fix_map_cmd += [unmapped_reads_out] if t_mapping: max_hits = params.t_max_hits elif seg_mapping: max_hits = params.max_seg_hits else: max_hits = params.max_hits if num_mismatches > 3: num_mismatches = 3 if params.bowtie2: if seg_mapping or multihits_out != None: # since bowtie2 does not suppress reads that map to too many places, # we suppress those in segment_juncs and long_spanning_reads. bowtie_cmd += ["-k", str(max_hits + 1)] else: bowtie_cmd += ["-k", str(max_hits)] bowtie2_params = params.bowtie2_params if seg_mapping: # after intensive testing, # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1, # but room for further improvements remains. bowtie_cmd += ["-N", str(min(num_mismatches, 1))] bowtie_cmd += ["-L", str(min(params.segment_length, 20))] # bowtie_cmd += ["-i", "C,10000,0"] # allow only one seed # bowtie_cmd += ["-L", "14"] else: bowtie2_preset = "" if bowtie2_params.very_fast: bowtie2_preset = "--very-fast" elif bowtie2_params.fast: bowtie2_preset = "--fast" elif bowtie2_params.sensitive: bowtie2_preset = "--sensitive" elif bowtie2_params.very_sensitive: bowtie2_preset = "--very-sensitive" if bowtie2_preset != "": bowtie_cmd += [bowtie2_preset] else: bowtie_cmd += ["-D", str(bowtie2_params.D), "-R", str(bowtie2_params.R), "-N", str(bowtie2_params.N), "-L", str(bowtie2_params.L), "-i", bowtie2_params.i] score_min = bowtie2_params.score_min if not score_min: max_penalty, min_penalty = bowtie2_params.mp.split(',') score_min_value = int(max_penalty) * edit_dist + 2 score_min = "C,-%d,0" % score_min_value # "--n-ceil" is not correctly parsed in Bowtie2, # I (daehwan) already talked to Ben who will fix the problem. 
bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil, "--gbar", str(bowtie2_params.gbar), "--mp", bowtie2_params.mp, "--np", str(bowtie2_params.np), "--rdg", bowtie2_params.rdg, "--rfg", bowtie2_params.rfg, "--score-min", score_min] else: bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches), "-k", str(max_hits), "-m", str(max_hits), "-S"] bowtie_cmd += ["-p", str(params.system_params.num_threads)] if params.bowtie2: #always use headerless SAM file bowtie_cmd += ["--sam-no-hd"] else: bowtie_cmd += ["--sam-nohead"] if not params.bowtie2: if multihits_out != None: bowtie_cmd += ["--max", params.preflt_data[multihits_out].multihit_reads] else: bowtie_cmd += ["--max", "/dev/null"] if params.bowtie2: bowtie_cmd += ["-x"] bowtie_cmd += [ bwt_idx_prefix ] bowtie_proc=None shellcmd="" unzip_proc=None if multihits_out != None: #special prefilter bowtie run: we use prep_reads on the fly #in order to get multi-mapped reads to exclude later prep_cmd = prep_reads_cmd(params, params.preflt_data[0].seqfiles, params.preflt_data[0].qualfiles, params.preflt_data[1].seqfiles, params.preflt_data[1].qualfiles) prep_cmd.insert(1,"--flt-side="+str(multihits_out)) sides=["left", "right"] preplog_fname=logging_dir + "prep_reads.prefilter_%s.log" % sides[multihits_out] prepfilter_log = open(preplog_fname,"w") unzip_proc = subprocess.Popen(prep_cmd, stdout=subprocess.PIPE, stderr=prepfilter_log) shellcmd=' '.join(prep_cmd) + "|" else: z_input=use_zpacker and reads_file.endswith(".z") if z_input: unzip_proc = subprocess.Popen(unzip_cmd, stdin=open(reads_file, "rb"), stderr=tophat_log, stdout=subprocess.PIPE) shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|" else: #must be uncompressed fastq input (unmapped reads from a previous run) #or a BAM file with unmapped reads if bam_input: unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE) shellcmd=' '.join(unzip_cmd) + "|" else: bowtie_cmd += [reads_file] if not unzip_proc: bowtie_proc = subprocess.Popen(bowtie_cmd, stdout=subprocess.PIPE, stderr=open(bwt_logname, "w")) if unzip_proc: #input is compressed OR prep_reads is used as a filter bowtie_cmd += ['-'] bowtie_proc = subprocess.Popen(bowtie_cmd, stdin=unzip_proc.stdout, stdout=subprocess.PIPE, stderr=open(bwt_logname, "w")) unzip_proc.stdout.close() # see http://bugs.python.org/issue7678 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd) pipeline_proc = None fix_order_proc = None #write BAM format directly if t_mapping: #pipe into map2gtf fix_order_proc = subprocess.Popen(fix_map_cmd, stdin=bowtie_proc.stdout, stdout=subprocess.PIPE, stderr=tophat_log) bowtie_proc.stdout.close() m2g_cmd = [prog_path("map2gtf")] m2g_cmd += ["--sam-header", genome_sam_header_filename] #m2g_cmd.append(params.gff_annotation) m2g_cmd.append(params.transcriptome_index+".fa.tlst") m2g_cmd.append("-") #incoming uncompressed BAM stream m2g_cmd.append(mapped_reads) m2g_log = logging_dir + "m2g_"+readfile_basename+".out" m2g_err = logging_dir + "m2g_"+readfile_basename+".err" shellcmd += ' | '+' '.join(m2g_cmd)+ ' > '+m2g_log pipeline_proc = subprocess.Popen(m2g_cmd, stdin=fix_order_proc.stdout, stdout=open(m2g_log, "w"), stderr=open(m2g_err, "w")) fix_order_proc.stdout.close() else: fix_order_proc = subprocess.Popen(fix_map_cmd, stdin=bowtie_proc.stdout, stderr=tophat_log) bowtie_proc.stdout.close() pipeline_proc = fix_order_proc print >> run_log, shellcmd retcode = None if pipeline_proc: pipeline_proc.communicate() retcode = pipeline_proc.returncode bowtie_proc.wait() 
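# ---------------------------------------------------------------------------
# Aside, not executed by TopHat: when the user gives no --b2-score-min, the
# code above derives a constant Bowtie2 score threshold from the maximum
# mismatch penalty (the first field of --b2-mp) and the allowed edit
# distance, and formats it as the Bowtie2 function string "C,<min>,0". The
# hypothetical helper below repeats that calculation in isolation.
def _example_bowtie2_score_min(mp="6,2", edit_dist=2):
    max_penalty = int(mp.split(",")[0])
    score_min_value = max_penalty * edit_dist + 2
    return "C,-%d,0" % score_min_value   # defaults give "C,-14,0"
# ---------------------------------------------------------------------------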
r=bowtie_proc.returncode if r: die(fail_str+"Error running bowtie:\n"+log_tail(bwt_logname,100)) if use_FIFO: if fifo_pid and not os.path.exists(unmapped_reads_out): try: os.kill(fifo_pid, signal.SIGTERM) except: pass if retcode: die(fail_str+"Error running:\n"+shellcmd) except OSError, o: die(fail_str+"Error: "+str(o)) # Success #finish_time = datetime.now() #duration = finish_time - start_time #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration) if use_FIFO: try: os.remove(unmapped_reads_fifo) except: pass if multihits_out != None and not os.path.exists(params.preflt_data[multihits_out].multihit_reads): open(params.preflt_data[multihits_out].multihit_reads, "w").close() if seg_mapping: if not params.bowtie2: params.bowtie_alignment_option = backup_bowtie_alignment_option return (mapped_reads, unmapped_reads_out) # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable def get_gtf_juncs(gff_annotation): th_log("Reading known junctions from GTF file") gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w") gff_prefix = gff_annotation.split('/')[-1].split('.')[0] gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs" gtf_juncs_out = open(gtf_juncs_out_name, "w") gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation] try: print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name retcode = subprocess.call(gtf_juncs_cmd, stderr=gtf_juncs_log, stdout=gtf_juncs_out) # cvg_islands returned an error if retcode == 1: th_logp("\tWarning: TopHat did not find any junctions in GTF file") return (False, gtf_juncs_out_name) elif retcode != 0: die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode)) # cvg_islands not found except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: gtf_juncs not found on this system" die(errmsg) return (True, gtf_juncs_out_name) # Call bowtie-build on the FASTA file of sythetic splice junction sequences def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color): th_log("Indexing splices") bowtie_build_log = open(logging_dir + "bowtie_build.log", "w") #user_splices_out_prefix = output_dir + "user_splices_idx" if is_bowtie2: bowtie_build_cmd = [prog_path("bowtie2-build")] else: bowtie_build_cmd = [prog_path("bowtie-build")] if color: bowtie_build_cmd += ["-C"] bowtie_build_cmd += [external_splice_prefix + ".fa", external_splice_prefix] try: print >> run_log, " ".join(bowtie_build_cmd) retcode = subprocess.call(bowtie_build_cmd, stdout=bowtie_build_log) if retcode != 0: die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode)) except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: bowtie-build not found on this system" die(errmsg) return external_splice_prefix # Build a splice index from a .juncs file, suitable for use with specified read # (or read segment) lengths def build_juncs_index(is_bowtie2, min_anchor_length, max_seg_len, juncs_prefix, external_juncs, external_insertions, external_deletions, external_fusions, reference_fasta, color): th_log("Retrieving sequences for splices") juncs_file_list = ",".join(external_juncs) insertions_file_list = ",".join(external_insertions) deletions_file_list = ",".join(external_deletions) fusions_file_list = ",".join(external_fusions) # do not use insertions and deletions in case of Bowtie2 if is_bowtie2: insertions_file_list = "/dev/null" deletions_file_list = "/dev/null" juncs_db_log = open(logging_dir + "juncs_db.log", 
"w") external_splices_out_prefix = tmp_dir + juncs_prefix external_splices_out_name = external_splices_out_prefix + ".fa" external_splices_out = open(external_splices_out_name, "w") # juncs_db_cmd = [bin_dir + "juncs_db", juncs_db_cmd = [prog_path("juncs_db"), str(min_anchor_length), str(max_seg_len), juncs_file_list, insertions_file_list, deletions_file_list, fusions_file_list, reference_fasta] try: print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name retcode = subprocess.call(juncs_db_cmd, stderr=juncs_db_log, stdout=external_splices_out) if retcode != 0: die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode)) # juncs_db not found except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: juncs_db not found on this system" die(errmsg) external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color) return external_splices_out_prefix def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color): """ Build a bowtie index from a FASTA file. Arguments: - `fasta_fname`: File path to FASTA file. - `out_dir`: Output directory to place index in. (includes os.sep) Returns: - The path to the Bowtie index. """ bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "") if is_bowtie2: bowtie_idx_cmd = [prog_path("bowtie2-build")] else: bowtie_idx_cmd = [prog_path("bowtie-build")] if color: bowtie_idx_cmd += ["-C"] bowtie_idx_cmd += [fasta_fname, bwt_idx_path] try: th_log("Building Bowtie index from " + os.path.basename(fasta_fname)) print >> run_log, " ".join(bowtie_idx_cmd) retcode = subprocess.call(bowtie_idx_cmd, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w")) if retcode != 0: die(fail_str + "Error: Couldn't build bowtie index with err = " + str(retcode)) except OSError, o: errmsg=fail_str+str(o)+"\n" if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: errmsg+="Error: bowtie-build not found on this system" die(errmsg) return bwt_idx_path # Print out the sam header, embedding the user's specified library properties. 
# FIXME: also needs SQ dictionary lines def write_sam_header(read_params, sam_file): print >> sam_file, "@HD\tVN:1.0\tSO:coordinate" if read_params.read_group_id and read_params.sample_id: rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id, read_params.sample_id) if read_params.library_id: rg_str += "\tLB:%s" % read_params.library_id if read_params.description: rg_str += "\tDS:%s" % read_params.description if read_params.seq_platform_unit: rg_str += "\tPU:%s" % read_params.seq_platform_unit if read_params.seq_center: rg_str += "\tCN:%s" % read_params.seq_center if read_params.mate_inner_dist: rg_str += "\tPI:%s" % read_params.mate_inner_dist if read_params.seq_run_date: rg_str += "\tDT:%s" % read_params.seq_run_date if read_params.seq_platform: rg_str += "\tPL:%s" % read_params.seq_platform print >> sam_file, rg_str print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd) # Write final TopHat output, via tophat_reports and wiggles def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation): th_log("Reporting output tracks") left_maps, right_maps = mappings left_reads, right_reads = readfiles # left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)] left_maps = ','.join(left_maps) if len(right_maps) > 0: # right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)] right_maps = ','.join(right_maps) log_fname = logging_dir + "reports.log" report_log = open(log_fname, "w") junctions = output_dir + "junctions.bed" insertions = output_dir + "insertions.bed" deletions = output_dir + "deletions.bed" accepted_hits = output_dir + "accepted_hits" report_cmdpath = prog_path("tophat_reports") fusions = output_dir + "fusions.out" report_cmd = [report_cmdpath] alignments_output_filename = tmp_dir + "accepted_hits" report_cmd.extend(params.cmd()) report_cmd += ["--sam-header", sam_header_filename] if params.report_secondary_alignments: report_cmd += ["--report-secondary-alignments"] if params.report_discordant_pair_alignments: report_cmd += ["--report-discordant-pair-alignments"] if params.report_mixed_alignments: report_cmd += ["--report-mixed-alignments"] report_cmd.extend(["--samtools="+samtools_path]) b2_params = params.bowtie2_params max_penalty, min_penalty = b2_params.mp.split(',') report_cmd += ["--bowtie2-max-penalty", max_penalty, "--bowtie2-min-penalty", min_penalty] report_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)] read_gap_open, read_gap_cont = b2_params.rdg.split(',') report_cmd += ["--bowtie2-read-gap-open", read_gap_open, "--bowtie2-read-gap-cont", read_gap_cont] ref_gap_open, ref_gap_cont = b2_params.rfg.split(',') report_cmd += ["--bowtie2-ref-gap-open", ref_gap_open, "--bowtie2-ref-gap-cont", ref_gap_cont] report_cmd.extend([ref_fasta, junctions, insertions, deletions, fusions, alignments_output_filename, left_maps, left_reads]) if len(right_maps) > 0 and right_reads: report_cmd.append(right_maps) report_cmd.append(right_reads) try: print >> run_log, " ".join(report_cmd) report_proc=subprocess.call(report_cmd, preexec_fn=subprocess_setup, stderr=report_log) if report_proc != 0: die(fail_str+"Error running "+" ".join(report_cmd)+"\n"+log_tail(log_fname)) bam_parts = [] for i in range(params.system_params.num_threads): bam_part_filename = "%s%d.bam" % (alignments_output_filename, i) if os.path.exists(bam_part_filename): bam_parts.append(bam_part_filename) else: break num_bam_parts = len(bam_parts) if params.report_params.sort_bam: pids = [0 for i in 
range(num_bam_parts)] sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)] for i in range(num_bam_parts): bamsort_cmd = [samtools_path, "sort", bam_parts[i], sorted_bam_parts[i]] sorted_bam_parts[i] += ".bam" print >> run_log, " ".join(bamsort_cmd) if i + 1 < num_bam_parts: pid = os.fork() if pid == 0: subprocess.call(bamsort_cmd, stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w")) os._exit(os.EX_OK) else: pids[i] = pid else: subprocess.call(bamsort_cmd, stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w")) for i in range(len(pids)): if pids[i] > 0: result = os.waitpid(pids[i], 0) pids[i] = 0 for bam_part in bam_parts: os.remove(bam_part) bam_parts = sorted_bam_parts[:] #-- endif sort_bam if num_bam_parts > 1: if params.report_params.sort_bam: bammerge_cmd = [samtools_path, "merge","-f","-h", sam_header_filename] if not params.report_params.convert_bam: bammerge_cmd += ["-u"] else: #not sorted, so just raw merge bammerge_cmd = [prog_path("bam_merge"), "-Q", "--sam-header", sam_header_filename] if params.report_params.convert_bam: bammerge_cmd += ["%s.bam" % accepted_hits] bammerge_cmd += bam_parts print >> run_log, " ".join(bammerge_cmd) subprocess.call(bammerge_cmd, stderr=open(logging_dir + "reports.merge_bam.log", "w")) else: #make .sam bammerge_cmd += ["-"] bammerge_cmd += bam_parts merge_proc = subprocess.Popen(bammerge_cmd, stdout=subprocess.PIPE, stderr=open(logging_dir + "reports.merge_bam.log", "w")) bam2sam_cmd = [samtools_path, "view", "-h", "-"] sam_proc = subprocess.Popen(bam2sam_cmd, stdin=merge_proc.stdout, stdout=open(accepted_hits + ".sam", "w"), stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w")) merge_proc.stdout.close() shellcmd = " ".join(bammerge_cmd) + " | " + " ".join(bam2sam_cmd) print >> run_log, shellcmd sam_proc.communicate() retcode = sam_proc.returncode if retcode: die(fail_str+"Error running:\n"+shellcmd) for bam_part in bam_parts: os.remove(bam_part) else: # only one file os.rename(bam_parts[0], accepted_hits+".bam") if not params.report_params.convert_bam: #just convert to .sam bam2sam_cmd = [samtools_path, "view", "-h", accepted_hits+".bam"] shellcmd = " ".join(bam2sam_cmd) + " > " + accepted_hits + ".sam" print >> run_log, shellcmd r = subprocess.call(bam2sam_cmd, stdout=open(accepted_hits + ".sam", "w"), stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w")) if r != 0: die(fail_str+"Error running: "+shellcmd) os.remove(accepted_hits+".bam") except OSError, o: die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname)) try: # -- merge the unmapped files um_parts = [] um_merged = output_dir + "unmapped.bam" for i in range(params.system_params.num_threads): left_um_file = tmp_dir + "unmapped_left_%d.bam" % i right_um_file = tmp_dir + "unmapped_right_%d.bam" % i um_len = len(um_parts) if nonzeroFile(left_um_file): um_parts.append(left_um_file) if right_reads and nonzeroFile(right_um_file): um_parts.append(right_um_file) if len(um_parts) > 0: if len(um_parts)==1: os.rename(um_parts[0], um_merged) else: merge_cmd=[prog_path("bam_merge"), "-Q", "--sam-header", sam_header_filename, um_merged] merge_cmd += um_parts print >> run_log, " ".join(merge_cmd) ret = subprocess.call( merge_cmd, stderr=open(logging_dir + "bam_merge_um.log", "w") ) if ret != 0: die(fail_str+"Error executing: 
"+" ".join(merge_cmd)+"\n"+log_tail(logging_dir+"bam_merge_um.log")) for um_part in um_parts: os.remove(um_part) except OSError, o: die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname)) return junctions # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file # for each segment This function needs to be fixed to support mixed read length # inputs def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params): i = num_files_prev + 1 while i <= num_files: segfname=prefix+("_seg%d" % i)+extension out_segf.append(ZWriter(segfname,params.system_params)) i += 1 def split_reads(reads_filename, prefix, fasta, params, segment_length): #reads_file = open(reads_filename) out_segfiles = [] if fasta: extension = ".fa" else: extension = ".fq" if use_zpacker: extension += ".z" existing_seg_files = glob.glob(prefix+"_seg*"+extension) if resumeStage > currentStage and len(existing_seg_files)>0: #skip this, we are going to return the existing files return existing_seg_files zreads = ZReader(reads_filename, params, False) def convert_color_to_bp(color_seq): decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N', 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N', 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N', 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N', 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N', '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' } base = color_seq[0] bp_seq = base for ch in color_seq[1:]: base = decode_dic[base+ch] bp_seq += base return bp_seq def convert_bp_to_color(bp_seq): encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0', 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1', 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2', 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3', 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4', '.A':'4', '.C':'4', '.G':'4', '.T':'4', '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4', 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4', 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' } base = bp_seq[0] color_seq = base for ch in bp_seq[1:]: color_seq += encode_dic[base + ch] base = ch return color_seq def split_record(read_name, read_seq, read_qual, out_segf, offsets, color): if color: color_offset = 1 read_seq_temp = convert_color_to_bp(read_seq) seg_num = 1 while seg_num + 1 < len(offsets): if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']: return seg_num += 1 else: color_offset = 0 seg_num = 0 last_seq_offset = 0 while seg_num + 1 < len(offsets): f = out_segf[seg_num].file seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset] print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1) if color: print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq) else: print >> f, seg_seq if not fasta: seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]] print >> f, "+" print >> f, seg_qual seg_num += 1 last_seq_offset = offsets[seg_num] line_state = 0 read_name = "" read_seq = "" read_quals = "" num_segments = 0 offsets = [] for line in zreads.file: if line.strip() == "": continue if line_state == 0: read_name = line.strip() elif line_state == 1: read_seq = line.strip() read_length = len(read_seq) tmp_num_segments = read_length / segment_length offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)] # Bowtie's minimum read length here is 20bp, so if the last segment # is between 20 and segment_length bp long, go ahead and write it out if read_length % segment_length >= 
min(segment_length - 2, 20): offsets.append(read_length) tmp_num_segments += 1 else: offsets[-1] = read_length if tmp_num_segments == 1: offsets = [0, read_length] if tmp_num_segments > num_segments: open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params) num_segments = tmp_num_segments if fasta: split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color) elif line_state == 2: line = line.strip() else: read_quals = line.strip() if not fasta: split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color) line_state += 1 if fasta: line_state %= 2 else: line_state %= 4 zreads.close() out_fnames=[] for zf in out_segfiles: zf.close() out_fnames.append(zf.fname) #return [o.fname for o in out_segfiles] return out_fnames # Find possible splice junctions using the "closure search" strategy, and report # them in closures.juncs. Calls the executable closure_juncs def junctions_from_closures(params, sam_header_filename, left_maps, right_maps, ref_fasta): th_log("Searching for junctions via mate-pair closures") #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)] #if len(maps) == 0: # return None slash = left_maps[0].rfind('/') juncs_out = "" if slash != -1: juncs_out += left_maps[0][:slash+1] fusions_out = juncs_out juncs_out += "closure.juncs" fusions_out += "closure.fusions" juncs_log = open(logging_dir + "closure.log", "w") juncs_cmdpath=prog_path("closure_juncs") juncs_cmd = [juncs_cmdpath] left_maps = ','.join(left_maps) right_maps = ','.join(right_maps) juncs_cmd.extend(params.cmd()) juncs_cmd.extend(["--sam-header", sam_header_filename, juncs_out, fusions_out, ref_fasta, left_maps, right_maps]) try: print >> run_log, ' '.join(juncs_cmd) retcode = subprocess.call(juncs_cmd, stderr=juncs_log) # spanning_reads returned an error if retcode != 0: die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode)) # cvg_islands not found except OSError, o: if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: th_logp(fail_str + "Error: closure_juncs not found on this system") die(str(o)) return [juncs_out] # Find possible junctions by examining coverage and split segments in the initial # map and segment maps. Report junctions, insertions, and deletions in segment.juncs, # segment.insertions, and segment.deletions. Calls the executable # segment_juncs def junctions_from_segments(params, sam_header_filename, left_reads, left_reads_map, left_seg_maps, right_reads, right_reads_map, right_seg_maps, unmapped_reads, reads_format, ref_fasta): # if left_reads_map != left_seg_maps[0]: out_path=getFileDir(left_seg_maps[0]) juncs_out=out_path+"segment.juncs" insertions_out=out_path+"segment.insertions" deletions_out =out_path+"segment.deletions" fusions_out = out_path+"segment.fusions" if resumeStage>currentStage and fileExists(juncs_out): return [juncs_out, insertions_out, deletions_out, fusions_out] th_log("Searching for junctions via segment mapping") if params.coverage_search == True: print >> sys.stderr, "\tCoverage-search algorithm is turned on, making this step very slow" print >> sys.stderr, "\tPlease try running TopHat again with the option (--no-coverage-search) if this step takes too much time or memory." 
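# --- Illustrative sketch, not part of the original TopHat code ---
# The offset arithmetic that split_reads()/split_record() above rely on,
# isolated into one helper for clarity: a read is cut into segment_length-sized
# pieces, and the trailing remainder becomes its own segment only if it is
# still long enough for Bowtie to map (>= min(segment_length - 2, 20) bp);
# otherwise it is folded into the last segment. The helper name and the worked
# values below are examples only.
def _example_segment_offsets(read_length, segment_length):
    num_segs = read_length // segment_length
    offsets = [segment_length * i for i in range(0, num_segs + 1)]
    if read_length % segment_length >= min(segment_length - 2, 20):
        offsets.append(read_length)    # remainder is long enough: extra segment
    else:
        offsets[-1] = read_length      # remainder too short: stretch last segment
    return offsets
# _example_segment_offsets(75, 25) -> [0, 25, 50, 75]      (three 25bp segments)
# _example_segment_offsets(95, 25) -> [0, 25, 50, 75, 95]  (trailing 20bp kept as a segment)
# _example_segment_offsets(90, 25) -> [0, 25, 50, 90]      (15bp remainder folded into last)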
left_maps = ','.join(left_seg_maps) log_fname = logging_dir + "segment_juncs.log" segj_log = open(log_fname, "w") segj_cmd = [prog_path("segment_juncs")] segj_cmd.extend(params.cmd()) segj_cmd.extend(["--sam-header", sam_header_filename, "--ium-reads", ",".join(unmapped_reads), ref_fasta, juncs_out, insertions_out, deletions_out, fusions_out, left_reads, left_reads_map, left_maps]) if right_seg_maps: right_maps = ','.join(right_seg_maps) segj_cmd.extend([right_reads, right_reads_map, right_maps]) try: print >> run_log, " ".join(segj_cmd) retcode = subprocess.call(segj_cmd, preexec_fn=subprocess_setup, stderr=segj_log) # spanning_reads returned an error if retcode != 0: die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname)) # cvg_islands not found except OSError, o: if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT: th_logp(fail_str + "Error: segment_juncs not found on this system") die(str(o)) return [juncs_out, insertions_out, deletions_out, fusions_out] # Joins mapped segments into full-length read alignments via the executable # long_spanning_reads def join_mapped_segments(params, sam_header_filename, reads, ref_fasta, possible_juncs, possible_insertions, possible_deletions, possible_fusions, contig_seg_maps, spliced_seg_maps, alignments_out_name): rn="" contig_seg_maps = ','.join(contig_seg_maps) possible_juncs = ','.join(possible_juncs) possible_insertions = ",".join(possible_insertions) possible_deletions = ",".join(possible_deletions) possible_fusions = ",".join(possible_fusions) if resumeStage > currentStage: return if len(contig_seg_maps)>1: th_log("Joining segment hits") rn=".segs" else: th_log("Processing bowtie hits") log_fname=logging_dir + "long_spanning_reads"+rn+".log" align_log = open(log_fname, "w") align_cmd = [prog_path("long_spanning_reads")] align_cmd.extend(params.cmd()) align_cmd += ["--sam-header", sam_header_filename] b2_params = params.bowtie2_params max_penalty, min_penalty = b2_params.mp.split(',') align_cmd += ["--bowtie2-max-penalty", max_penalty, "--bowtie2-min-penalty", min_penalty] align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)] read_gap_open, read_gap_cont = b2_params.rdg.split(',') align_cmd += ["--bowtie2-read-gap-open", read_gap_open, "--bowtie2-read-gap-cont", read_gap_cont] ref_gap_open, ref_gap_cont = b2_params.rfg.split(',') align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open, "--bowtie2-ref-gap-cont", ref_gap_cont] align_cmd.append(ref_fasta) align_cmd.extend([reads, possible_juncs, possible_insertions, possible_deletions, possible_fusions, alignments_out_name, contig_seg_maps]) if spliced_seg_maps: spliced_seg_maps = ','.join(spliced_seg_maps) align_cmd.append(spliced_seg_maps) try: print >> run_log, " ".join(align_cmd) ret = subprocess.call(align_cmd, stderr=align_log) if ret: die(fail_str+"Error running 'long_spanning_reads':"+log_tail(log_fname)) except OSError, o: die(fail_str+"Error: "+str(o)) # This class collects spliced and unspliced alignments for each of the # left and right read files provided by the user. class Maps: def __init__(self, unspliced_sam, seg_maps, unmapped_segs, segs): self.unspliced_sam = unspliced_sam self.seg_maps = seg_maps self.unmapped_segs = unmapped_segs self.segs = segs # Map2GTF stuff def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname): """ajjkljlks Arguments: - `params`: TopHat parameters - `gtf_fname`: File name pointing to the annotation. - `reads`: The reads to convert coords (in Bowtie format). 
- `out_fname`: The file name pointing to the output. """ m2g_cmd = [prog_path("map2gtf")] m2g_cmd.extend(params.cmd()) m2g_cmd += ["--sam-header", sam_header_filename] m2g_cmd.append(gtf_fname) m2g_cmd.append(reads) #could be BAM file m2g_cmd.append(out_fname) fbasename = getFileBaseName(reads) m2g_log = logging_dir + "m2g_" + fbasename + ".out" m2g_err = logging_dir + "m2g_" + fbasename + ".err" try: th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)") print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log ret = subprocess.call(m2g_cmd, stdout=open(m2g_log, "w"), stderr=open(m2g_err, "w")) if ret != 0: die(fail_str + " Error: map2gtf returned an error") except OSError, o: err_msg = fail_str + str(o) die(err_msg + "\n") def gtf_to_fasta(params, trans_gtf, genome, out_basename): """ Build the transcriptome data files from a GTF. Arguments: - `trans_gtf`: - `genome`: - `out_basename`: Returns: - name of the FASTA file """ out_fname=out_basename + ".fa" out_fver=out_basename + ".ver" if resumeStage > currentStage and fileExists(out_fname) and fileExists(out_fver): return out_fname g2f_cmd = [prog_path("gtf_to_fasta")] g2f_cmd.extend(params.cmd()) g2f_cmd.append(trans_gtf) g2f_cmd.append(genome) g2f_cmd.append(out_fname) g2f_log = logging_dir + "g2f.out" g2f_err = logging_dir + "g2f.err" try: print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log ret = subprocess.call(g2f_cmd, stdout = open(g2f_log, "w"), stderr = open(g2f_err, "w")) if ret != 0: die(fail_str + " Error: gtf_to_fasta returned an error.") except OSError, o: err_msg = fail_str + str(o) die(err_msg + "\n") fver = open(out_fver, "w", 0) print >> fver, "%d %d %d" % (GFF_T_VER, os.path.getsize(trans_gtf), os.path.getsize(out_fname)) fver.close() return out_fname def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads): """ Main GTF mapping function Arguments: - `params`: The TopHat parameters. - `ref_fasta`: The reference genome. - `left_reads`: A list of reads. - `right_reads`: A list of reads (empty if single-end). 
""" test_input_file(params.gff_annotation) # th_log("Reading in GTF file: " + params.gff_annotation) # transcripts = gtf_to_transcripts(params.gff_annotation) gtf_name = getFileBaseName(params.gff_annotation) m2g_bwt_idx = None t_out_dir = tmp_dir if currentStage < resumeStage or (params.transcriptome_index and not params.transcriptome_outdir): m2g_bwt_idx = params.transcriptome_index th_log("Using pre-built transcriptome data..") else: th_log("Building transcriptome data files..") if params.transcriptome_outdir: t_out_dir=params.transcriptome_outdir+"/" m2g_ref_name = t_out_dir + gtf_name m2g_ref_fasta = gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_name) m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color) params.transcriptome_index = m2g_bwt_idx transcriptome_header_filename = get_index_sam_header(params, m2g_bwt_idx) mapped_gtf_list = [] unmapped_gtf_list = [] # do the initial mapping in GTF coordinates for reads in [left_reads, right_reads]: if reads == None or os.path.getsize(reads) < 25 : continue fbasename = getFileBaseName(reads) mapped_gtf_out = tmp_dir + fbasename + ".m2g" #if use_zpacker: # mapped_gtf_out+=".z" unmapped_gtf = tmp_dir + fbasename + ".m2g_um" #if use_BWT_FIFO: # unmapped_gtf += ".z" (mapped_gtf_map, unmapped) = bowtie(params, m2g_bwt_idx, [transcriptome_header_filename, genome_sam_header_filename], [reads], "fastq", params.read_mismatches, params.read_gap_length, params.read_edit_dist, params.read_realign_edit_dist, mapped_gtf_out, unmapped_gtf, "", _reads_vs_T) mapped_gtf_list.append(mapped_gtf_map) unmapped_gtf_list.append(unmapped) if len(mapped_gtf_list) < 2: mapped_gtf_list.append(None) if len(unmapped_gtf_list) < 2: unmapped_gtf_list.append(None) return (mapped_gtf_list, unmapped_gtf_list) # end Map2GTF def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped): ## extract mappings and unmapped reads from prefilter mappings and preflt_ium ## #this is accomplished by a special prep_reads usage (triggered by --flt-hits) out_bam=None #if params.read_params.color: # out_unmapped += ".fq" # #if use_zpacker: out_unmapped += ".z" #else: out_unmapped += ".bam" out_bam = out_unmapped # no colorspace reads if resumeStage: return (out_mappings, out_unmapped) do_use_zpacker = use_zpacker and not out_bam prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None, None, None, # right-side mates out_bam, # stdout file out_mappings, # aux file (filtered mappings) None, # no index for out_bam [target_reads], # prefilter reads [params.preflt_data[ri].mappings]) # mappings to filter if not out_bam: um_reads = open(out_unmapped, "wb") sides=["left","right"] log_fname=logging_dir + "prep_reads.from_preflt."+sides[ri]+".log" filter_log = open(log_fname,"w") shell_cmd = " ".join(prep_cmd) #add the compression pipe zip_cmd=[] if do_use_zpacker: zip_cmd=[ params.system_params.zipper ] zip_cmd.extend(params.system_params.zipper_opts) zip_cmd.extend(['-c','-']) shell_cmd +=' | '+' '.join(zip_cmd) if not out_bam: shell_cmd += ' >' + out_unmapped retcode=0 try: print >> run_log, shell_cmd if do_use_zpacker: prep_proc = subprocess.Popen(prep_cmd, stdout=subprocess.PIPE, stderr=filter_log) zip_proc = subprocess.Popen(zip_cmd, preexec_fn=subprocess_setup, stdin=prep_proc.stdout, stderr=tophat_log, stdout=um_reads) prep_proc.stdout.close() #as per http://bugs.python.org/issue7678 zip_proc.communicate() retcode=prep_proc.poll() if retcode==0: retcode=zip_proc.poll() else: if out_bam: retcode = 
subprocess.call(prep_cmd, stderr=filter_log) else: retcode = subprocess.call(prep_cmd, stdout=um_reads, stderr=filter_log) if retcode: die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname)) except OSError, o: errmsg=fail_str+str(o) die(errmsg+"\n"+log_tail(log_fname)) if not out_bam: um_reads.close() return (out_mappings, out_unmapped) # The main aligment routine of TopHat. This function executes most of the # workflow producing a set of candidate alignments for each cDNA fragment in a # pair of SAM alignment files (for paired end reads). def spliced_alignment(params, bwt_idx_prefix, sam_header_filename, ref_fasta, read_len, segment_len, prepared_reads, user_supplied_junctions, user_supplied_insertions, user_supplied_deletions): possible_juncs = [] possible_juncs.extend(user_supplied_junctions) possible_insertions = [] possible_insertions.extend(user_supplied_insertions) possible_deletions = [] possible_deletions.extend(user_supplied_deletions) possible_fusions = [] left_reads, right_reads = prepared_reads maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads_mapping_data # Before anything, map the reads using Map2GTF (if using annotation) m2g_maps = [ None, None ] # left, right initial_reads = [ left_reads, right_reads ] setRunStage(_stage_map_start) if params.gff_annotation: (mapped_gtf_list, unmapped_gtf_list) = \ map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads) m2g_left_maps, m2g_right_maps = mapped_gtf_list m2g_maps = [m2g_left_maps, m2g_right_maps] if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]): # The case where the user doesn't want to map to anything other # than the transcriptome OR we have no unmapped reads maps[0] = [m2g_left_maps] if right_reads: maps[1] = [m2g_right_maps] return maps # Feed the unmapped reads into spliced_alignment() initial_reads = unmapped_gtf_list[:] if currentStage >= resumeStage: th_log("Resuming TopHat pipeline with unmapped reads") if not nonzeroFile(initial_reads[0]) and \ (not initial_reads[1] or not nonzeroFile(initial_reads[1])): if m2g_maps[1]: return [[m2g_maps[0]], [m2g_maps[1]]] else: return [[m2g_maps[0]], []] max_seg_len = segment_len #this is the ref seq span on either side of the junctions #to be extracted into segment_juncs.fa num_segs = int(read_len / segment_len) if (read_len % segment_len) >= min(segment_len-2, 20): #remainder is shorter but long enough to become a new segment num_segs += 1 else: # the last segment is longer if num_segs>1: max_seg_len += (read_len % segment_len) if num_segs <= 1: th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments") # Using the num_segs value returned by check_reads(), # decide which junction discovery strategy to use if num_segs < 3: #if params.butterfly_search != False: # params.butterfly_search = True if params.coverage_search != False: params.coverage_search = True if num_segs == 1: segment_len = read_len else: #num_segs >= 3: # if we have at least three segments, just use split segment search, # which is the most sensitive and specific, fastest, and lightest-weight. 
# so unless specifically requested, disable the other junction searches if params.closure_search != True: params.closure_search = False if params.coverage_search != True: params.coverage_search = False if params.butterfly_search != True: params.butterfly_search = False # Perform the first part of the TopHat work flow on the left and right # reads of paired ends separately - we'll use the pairing information later have_left_IUM = False for ri in (0,1): reads=initial_reads[ri] if reads == None or not nonzeroFile(reads): continue fbasename=getFileBaseName(reads) unspliced_out = tmp_dir + fbasename + ".mapped" unspliced_sam = None unmapped_reads = None #if use_zpacker: unspliced_out+=".z" unmapped_unspliced = tmp_dir + fbasename + "_unmapped" if params.prefilter_multi: #unmapped_unspliced += ".z" (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced) else: # Perform the initial Bowtie mapping of the full length reads (unspliced_sam, unmapped_reads) = bowtie(params, bwt_idx_prefix, sam_header_filename, [reads], "fastq", params.read_mismatches, params.read_gap_length, params.read_edit_dist, params.read_realign_edit_dist, unspliced_out, unmapped_unspliced, "", _reads_vs_G) seg_maps = [] unmapped_segs = [] segs = [] have_IUM = nonzeroFile(unmapped_reads) if ri==0 and have_IUM: have_left_IUM = True setRunStage(_stage_map_segments) if num_segs > 1 and have_IUM: # split up the IUM reads into segments # unmapped_reads can be in BAM format read_segments = split_reads(unmapped_reads, tmp_dir + fbasename, False, params, segment_len) # Map each segment file independently with Bowtie for i in range(len(read_segments)): seg = read_segments[i] fbasename=getFileBaseName(seg) seg_out = tmp_dir + fbasename unmapped_seg = tmp_dir + fbasename + "_unmapped" extra_output = "(%d/%d)" % (i+1, len(read_segments)) (seg_map, unmapped) = bowtie(params, bwt_idx_prefix, sam_header_filename, [seg], "fastq", params.segment_mismatches, params.segment_mismatches, params.segment_mismatches, params.segment_mismatches, seg_out, unmapped_seg, extra_output, _segs_vs_G) seg_maps.append(seg_map) unmapped_segs.append(unmapped) segs.append(seg) # Collect the segment maps for left and right reads together maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs) else: # if there's only one segment, just collect the initial map as the only # map to be used downstream for coverage-based junction discovery read_segments = [reads] maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads]) # XXX: At this point if using M2G, have three sets of reads: # mapped to transcriptome, mapped to genome, and unmapped (potentially # spliced or poly-A tails) - hp unmapped_reads = [] if maps[0]: left_reads_map = maps[0].unspliced_sam left_seg_maps = maps[0].seg_maps unmapped_reads = maps[0].unmapped_segs else: left_reads_map = None left_seg_maps = None if right_reads and maps[1]: right_reads_map = maps[1].unspliced_sam right_seg_maps = maps[1].seg_maps unmapped_reads.extend(maps[1].unmapped_segs) else: right_reads_map = None right_seg_maps = None if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels: # Call segment_juncs to infer a list of possible splice junctions from # the regions of the genome covered in the initial and segment maps #if params.find_novel_juncs: #TODO: in m2g case, we might want to pass the m2g mappings as well, # or perhaps the GTF file directly # -> this could improve alternative junction detection? 
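# --- Illustrative sketch, not part of the original TopHat code ---
# A condensed restatement of the search-selection logic near the top of
# spliced_alignment(): the *_search flags are tri-state (None = not set by the
# user, True/False = an explicit choice), so the "!= False" / "!= True" tests
# fill in a default without overriding an explicit user setting. Plain booleans
# stand in for the params object here; the helper name is made up.
def _example_pick_searches(num_segs, coverage_search, closure_search, butterfly_search):
    if num_segs < 3:
        # with few segments the slower coverage search is needed, unless the
        # user explicitly disabled it
        if coverage_search != False:
            coverage_search = True
    else:
        # with 3+ segments, split-segment search alone is sensitive enough, so
        # the slower searches are turned off unless explicitly requested
        if closure_search != True:
            closure_search = False
        if coverage_search != True:
            coverage_search = False
        if butterfly_search != True:
            butterfly_search = False
    return coverage_search, closure_search, butterfly_search
# _example_pick_searches(3, None, None, None) -> (False, False, False)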
setRunStage(_stage_find_juncs) juncs = junctions_from_segments(params, sam_header_filename, left_reads, left_reads_map, left_seg_maps, right_reads, right_reads_map, right_seg_maps, unmapped_reads, "fastq", ref_fasta) if not params.system_params.keep_tmp: for unmapped_seg in unmapped_reads: removeFileWithIndex(unmapped_seg) if os.path.getsize(juncs[0]) != 0: possible_juncs.append(juncs[0]) if params.find_novel_indels: if os.path.getsize(juncs[1]) != 0: possible_insertions.append(juncs[1]) if os.path.getsize(juncs[2]) != 0: possible_deletions.append(juncs[2]) if params.find_novel_fusions: if os.path.getsize(juncs[3]) != 0: possible_fusions.append(juncs[3]) # Optionally, and for paired reads only, use a closure search to # discover addtional junctions if currentStage >= resumeStage and params.closure_search and left_reads and right_reads: juncs = junctions_from_closures(params, sam_header_filename, [maps[initial_reads[left_reads]].unspliced_sam, maps[initial_reads[left_reads]].seg_maps[-1]], [maps[initial_reads[right_reads]].unspliced_sam, maps[initial_reads[right_reads]].seg_maps[-1]], ref_fasta) if os.path.getsize(juncs[0]) != 0: possible_juncs.extend(juncs) if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0: spliced_seg_maps = None junc_idx_prefix = None else: junc_idx_prefix = "segment_juncs" if len(possible_insertions) == 0: possible_insertions.append(os.devnull) # print >> sys.stderr, "Warning: insertions database is empty!" if len(possible_deletions) == 0: possible_deletions.append(os.devnull) # print >> sys.stderr, "Warning: deletions database is empty!" if len(possible_juncs) == 0: possible_juncs.append(os.devnull) th_logp("Warning: junction database is empty!") if len(possible_fusions) == 0: possible_fusions.append(os.devnull) setRunStage(_stage_juncs_db) juncs_bwt_samheader = None juncs_bwt_idx = None if junc_idx_prefix: jdb_prefix = tmp_dir + junc_idx_prefix if currentStage> sys.stderr, "[DBGLOG]:"+msg def test_input_file(filename): try: test_file = open(filename, "r") except IOError: die("Error: Opening file %s" % filename) return def validate_transcriptome(params): tgff=params.transcriptome_index+".gff" if os.path.exists(tgff): if params.gff_annotation and tgff!=params.gff_annotation: if (os.path.getsize(tgff)!=os.path.getsize(params.gff_annotation)): return False tfa=params.transcriptome_index+".fa" tverf=params.transcriptome_index+".ver" tver=0 tfa_size=0 tgff_size=0 if os.path.exists(tverf): inf = open(tverf, 'r') fline = inf.readline() inf.close() dlst = fline.split() if len(dlst)>2: tver, tgff_size, tfa_size = map(lambda f: int(f), dlst) else: return False tlst=tfa+".tlst" if os.path.exists(tlst) and os.path.getsize(tlst)>0 and \ os.path.exists(tfa) and os.path.getsize(tfa)>0 and os.path.getsize(tfa)== tfa_size and \ os.path.exists(tgff) and os.path.getsize(tgff)>0 and os.path.getsize(tgff)==tgff_size \ and tver >= GFF_T_VER: return True return False def main(argv=None): warnings.filterwarnings("ignore", "tmpnam is a potential security risk") # Initialize default parameter values params = TopHatParams() run_argv = sys.argv[:] try: if argv is None: argv = sys.argv args = params.parse_options(argv) if params.resume_dir: run_argv=doResume(params.resume_dir) args = params.parse_options(run_argv) params.check() bwt_idx_prefix = args[0] left_reads_list = args[1] left_quals_list, right_quals_list = None, None if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 
3): if params.read_params.mate_inner_dist == None: params.read_params.mate_inner_dist = 50 #die("Error: you must set the mean inner distance between mates with -r") right_reads_list = args[2] if params.read_params.quals: left_quals_list = args[3] right_quals_list = args[4] else: right_reads_list = None if params.read_params.quals: left_quals_list = args[2] start_time = datetime.now() prepare_output_dir() init_logger(logging_dir + "tophat.log") th_logp() if resumeStage>0: th_log("Resuming TopHat run in directory '"+output_dir+"' stage '"+stageNames[resumeStage]+"'") else: th_log("Beginning TopHat run (v"+get_version()+")") th_logp("-----------------------------------------------") global run_log run_log = open(logging_dir + "run.log", "w", 0) global run_cmd run_cmd = " ".join(run_argv) print >> run_log, run_cmd check_bowtie(params) check_samtools() # Validate all the input files, check all prereqs before committing # to the run if params.gff_annotation: if not os.path.exists(params.gff_annotation): die("Error: cannot find transcript file %s" % params.gff_annotation) if os.path.getsize(params.gff_annotation)<10: die("Error: invalid transcript file %s" % params.gff_annotation) if params.transcriptome_index: if params.gff_annotation: #gff file given, so transcriptome data will be written there gff_basename = getFileBaseName(params.gff_annotation) #just in case, check if it's not already there (-G/--GTF given again by mistake) tpath, tname = os.path.split(params.transcriptome_index) new_subdir=False if tpath in (".", "./") or not tpath : if not os.path.exists(params.transcriptome_index): os.makedirs(params.transcriptome_index) new_subdir=True if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)): params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename) if not validate_transcriptome(params): #(re)generate the transcriptome data files tpath, tname = os.path.split(params.transcriptome_index) params.transcriptome_outdir=tpath t_gff=params.transcriptome_index+".gff" if params.transcriptome_outdir: #will create the transcriptome data files if not os.path.exists(params.transcriptome_outdir): os.makedirs(params.transcriptome_outdir) if params.gff_annotation: copy(params.gff_annotation, t_gff) else: #try to use existing transcriptome data files #if validate_transcriptome(params): check_bowtie_index(params.transcriptome_index, params.bowtie2, "(transcriptome)") params.gff_annotation = t_gff #end @ transcriptome_index given (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2) if currentStage >= resumeStage: th_log("Generating SAM header for "+bwt_idx_prefix) # we need to provide another name for this sam header as genome and transcriptome may have the same prefix. 
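# --- Illustrative sketch, not part of the original TopHat code ---
# The core of the validate_transcriptome() check used above: a pre-built
# transcriptome index is reused only if the format version and the recorded
# sizes of the .gff and .fa files still match the ".ver" metadata written by
# gtf_to_fasta(). This sketch omits the .tlst and file-existence checks the
# real function performs; the helper name is made up.
def _example_transcriptome_is_current(index_prefix, min_version):
    import os
    ver_file = index_prefix + ".ver"
    if not os.path.exists(ver_file):
        return False
    fields = open(ver_file).readline().split()
    if len(fields) < 3:
        return False
    ver, gff_size, fa_size = int(fields[0]), int(fields[1]), int(fields[2])
    return (ver >= min_version and
            os.path.getsize(index_prefix + ".gff") == gff_size and
            os.path.getsize(index_prefix + ".fa") == fa_size)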
sam_header_filename = get_index_sam_header(params, bwt_idx_prefix, "genome") params.sam_header = sam_header_filename #if not params.skip_check_reads: reads_list = left_reads_list if right_reads_list: reads_list = reads_list + "," + right_reads_list params.read_params = check_reads_format(params, reads_list) user_supplied_juncs = [] user_supplied_insertions = [] user_supplied_deletions = [] user_supplied_fusions = [] global gtf_juncs if params.gff_annotation and params.find_GFF_juncs: test_input_file(params.gff_annotation) (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation) ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job ## but we still need them loaded in gtf_juncs for later splice verification if found_juncs: ## and not params.gff_annotation: user_supplied_juncs.append(gtf_juncs) #else: # gtf_juncs = None if params.raw_junctions: test_input_file(params.raw_junctions) user_supplied_juncs.append(params.raw_junctions) if params.raw_insertions: test_input_file(params.raw_insertions) user_supplied_insertions.append(params.raw_insertions) if params.raw_deletions: test_input_file(params.raw_deletions) user_supplied_deletions.append(params.raw_deletions) global unmapped_reads_fifo unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo" # Now start the time consuming stuff if params.prefilter_multi: sides=("left","right") read_lists=(left_reads_list, right_reads_list) qual_lists=(left_quals_list, right_quals_list) for ri in (0,1): reads_list=read_lists[ri] if not reads_list: continue fmulti_ext="bam" if not params.bowtie2: fmulti_ext="fq" params.preflt_data[ri].seqfiles = reads_list params.preflt_data[ri].qualfiles = qual_lists[ri] params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped."+fmulti_ext side_imap = tmp_dir + sides[ri]+"_im" #if use_zpacker: side_imap+=".z" side_ium = tmp_dir + sides[ri]+"_ium" #if use_BWT_FIFO and not params.bowtie2: # side_ium += ".z" th_log("Pre-filtering multi-mapped "+sides[ri]+" reads") rdlist=reads_list.split(',') bwt=bowtie(params, bwt_idx_prefix, sam_header_filename, rdlist, params.read_params.reads_format, params.read_mismatches, params.read_gap_length, params.read_edit_dist, params.read_realign_edit_dist, side_imap, side_ium, "", _reads_vs_G, ri ) # multi-mapped reads will be in params.preflt_data[ri].multihit_reads params.preflt_data[ri].mappings = bwt[0] # initial mappings params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads setRunStage(_stage_prep) prep_info=None if currentStage >= resumeStage: th_log("Preparing reads") else: th_log("Prepared reads:") multihit_reads = [] if params.preflt_data[0].multihit_reads: multihit_reads += [params.preflt_data[0].multihit_reads] if params.preflt_data[1].multihit_reads: multihit_reads += [params.preflt_data[1].multihit_reads] prep_info= prep_reads(params, left_reads_list, left_quals_list, right_reads_list, right_quals_list, multihit_reads) if currentStage < resumeStage and not fileExists(prep_info.kept_reads[0],40): die("Error: prepared reads file missing, cannot resume!") min_read_len = prep_info.min_len[0] if prep_info.min_len[1] > 0 and min_read_len > prep_info.min_len[1]: min_read_len = prep_info.min_len[1] if min_read_len < 20: th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped in too many places") max_read_len=max(prep_info.max_len[0], prep_info.max_len[1]) seed_len=params.read_params.seed_length if seed_len: #if 
read len was explicitly given seed_len = max(seed_len, min_read_len) #can't be smaller than minimum length observed else: seed_len = max_read_len params.read_params.seed_length=seed_len # turn off integer-quals if params.read_params.integer_quals: params.read_params.integer_quals = False input_reads = prep_info.kept_reads[:] mappings = spliced_alignment(params, bwt_idx_prefix, sam_header_filename, ref_fasta, params.read_params.seed_length, params.segment_length, input_reads, user_supplied_juncs, user_supplied_insertions, user_supplied_deletions) setRunStage(_stage_tophat_reports) compile_reports(params, sam_header_filename, ref_fasta, mappings, input_reads, params.gff_annotation) setRunStage(_stage_alldone) if not params.system_params.keep_tmp: try: s=tmp_dir.rstrip('/') rmtree(s, True) except OSError: pass #th_logp("Warning: couldn't remove all temporary files in "+tmp_dir) finish_time = datetime.now() duration = finish_time - start_time th_logp("-----------------------------------------------") th_log("A summary of the alignment counts can be found in %salign_summary.txt" % output_dir); th_log("Run complete: %s elapsed" % formatTD(duration)) except Usage, err: th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg)) th_logp(" for detailed help see http://tophat.cbcb.umd.edu/manual.html") return 2 if __name__ == "__main__": sys.exit(main()) tophat-2.0.9/src/GHash.hh0000644000175000017500000004063712157116165013672 0ustar toortoor/******************************************************************************** * Hash table class template (char* based) * *********************************************************************************/ #ifndef GHash_HH #define GHash_HH #include "GBase.h" /** * This class maintains a fast-access hash table of entities * indexed by a character string (essentially, maps strings to pointers) */ template class GHash { protected: struct GHashEntry { char* key; // Key string bool keyalloc; //shared key flag (to not free the key chars) int hash; // Hash value of key pointer data; // Data bool mark; // Entry is marked }; GHashEntry* hash; // Hash int fCapacity; // table size int fCount; // number of valid entries int fCurrentEntry; char* lastkeyptr; //pointer to last key string added //---------- Raw data retrieval (including empty entries // Return key at position pos. const char* Key(uint pos) const { return hash[pos].key; } // return data OBJ* at given position OBJ* Data(uint pos) const { return (OBJ*) hash[pos].data; } // Return mark flag of entry at position pos. bool Mark(uint pos) const { return hash[pos].mark; } // Return position of first filled slot, or >= fCapacity int First() const; // Return position of last filled slot or -1 int Last() const; // Return position of next filled slot in hash table // or a value greater than or equal to fCapacity if no filled // slot was found int Next(int pos) const; //Return position of previous filled slot in hash table //or a -1 if no filled slot was found int Prev(int pos) const; private: GHash(const GHash&); GHash &operator=(const GHash&); GFreeProc* fFreeProc; //procedure to free item data protected: public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } public: GHash(GFreeProc* freeProc); // constructs of an empty hash GHash(bool doFree=true); // constructs of an empty hash (free the item objects) void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { fFreeProc=(doFree)? 
&DefaultFreeProc : NULL; } int Capacity() const { return fCapacity; } // table's size, including the empty slots. void Resize(int m); // Resize the table to the given size. int Count() const { return fCount; }// the total number of entries in the table. // Insert a new entry into the table given key and mark. // If there is already an entry with that key, leave it unchanged, const OBJ* Add(const char* ky, const OBJ* ptr=NULL, bool mrk=false); //same as Add, but the key pointer is stored directly, no string duplicate //is made (shared-key-Add) const OBJ* shkAdd(const char* ky, const OBJ* ptr, bool mrk=false); // Replace data at key, if the entry's mark is less than // or equal to the given mark. If there was no existing entry, // a new entry is inserted with the given mark. OBJ* Replace(const char* ky, const OBJ* ptr, bool mrk=false); // Remove a given key and its data OBJ* Remove(const char* ky); // Find data OBJ* given key. OBJ* Find(const char* ky, char** keyptr=NULL); bool hasKey(const char* ky); char* getLastKey() { return lastkeyptr; } OBJ* operator[](const char* ky) { return Find(ky); } void startIterate(); //iterator-like initialization char* NextKey(); //returns next valid key in the table (NULL if no more) OBJ* NextData(); //returns next valid hash[].data OBJ* NextData(char*& nextkey); //returns next valid hash[].data //or NULL if no more //nextkey is SET to the corresponding key GHashEntry* NextEntry() { //returns a pointer to a GHashEntry register int pos=fCurrentEntry; while (pos GHash::GHash(GFreeProc* freeProc) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; fFreeProc=freeProc; lastkeyptr=NULL; for (uint i=0; i GHash::GHash(bool doFree) { GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); fCurrentEntry=-1; lastkeyptr=NULL; fFreeProc = (doFree)?&DefaultFreeProc : NULL; for (uint i=0; i void GHash::Resize(int m){ register int i,n,p,x,h; GHashEntry *k; GASSERT(fCount<=fCapacity); if(m>2)>m) n>>=1; // Shrink until n/4 <= m while((n>>1)>1)); GASSERT(DEF_HASH_SIZE<=n); if(n!=fCapacity){ GASSERT(m<=n); GMALLOC(k, sizeof(GHashEntry)*n); for(i=0; i const OBJ* GHash::Add(const char* ky, const OBJ* pdata,bool mrk){ register int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount const OBJ* GHash::shkAdd(const char* ky, const OBJ* pdata,bool mrk){ register int p,i,x,h,n; if(!ky) GError("GHash::insert: NULL key argument.\n"); GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Replace(const char* ky,const OBJ* pdata, bool mrk){ register int p,i,x,h,n; if(!ky){ GError("GHash::replace: NULL key argument.\n"); } GASSERT(fCount=(MAX_LOAD*fCapacity)) Resize(fCount); GASSERT(fCount OBJ* GHash::Remove(const char* ky){ register int p,x,h,n; if(!ky){ GError("GHash::remove: NULL key argument.\n"); } if(0 bool GHash::hasKey(const char* ky) { register int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if(0 OBJ* GHash::Find(const char* ky, char** keyptr){ register int p,x,h,n; if(!ky){ GError("GHash::find: NULL key argument.\n"); } if(0 void GHash::startIterate() {// initialize a key iterator; call fCurrentEntry=0; } template char* GHash::NextKey() { register int pos=fCurrentEntry; while (pos OBJ* GHash::NextData() { register int pos=fCurrentEntry; while (pos OBJ* GHash::NextData(char* &nextkey) { register int pos=fCurrentEntry; while (pos int GHash::First() const { register int pos=0; while(pos int GHash::Last() const { register int pos=fCapacity-1; 
while(0<=pos){ if(0<=hash[pos].hash) break; pos--; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Find next valid entry template int GHash::Next(int pos) const { GASSERT(0<=pos && pos int GHash::Prev(int pos) const { GASSERT(0<=pos && pos= 0){ if(0<=hash[pos].hash) break; } GASSERT(pos<0 || 0<=hash[pos].hash); return pos; } // Remove all template void GHash::Clear(){ register int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); GMALLOC(hash, sizeof(GHashEntry)*DEF_HASH_SIZE); //reinitialize it for (i=0; i=0){ uint len=strlen(hash[i].key); store << len; store << hash[i].mark; store.save(hash[i].key,len); } } } // Load data void GHash::Load(Stream& store){ Object::load(store); store >> fCapacity; store >> fCount; for(int i=0; i> hash[i].hash; if(hash[i].hash>=0){ uint len; store >> len; store >> hash[i].mark; GMALLOC(hash[i].key,len+1); store.load(hash[i].key,len); hash[i].key[len]='\0'; } } } */ // Destroy table template GHash::~GHash(){ register int i; for(i=0; i=0){ if (hash[i].keyalloc) GFREE((hash[i].key)); if (FREEDATA) (*fFreeProc)(hash[i].data); } } GFREE(hash); } #endif tophat-2.0.9/src/GVec.hh0000644000175000017500000006326212157340451013520 0ustar toortoor//--------------------------------------------------------------------------- /* Sortable collection of pointers to objects */ #ifndef _GVec_HH #define _GVec_HH #include "GBase.h" #define GVEC_INDEX_ERR "GVec error: invalid index: %d\n" #if defined(NDEBUG) || defined(NODEBUG) || defined(_NDEBUG) || defined(NO_DEBUG) #define TEST_INDEX(x) #else #define TEST_INDEX(x) \ if (x<0 || x>=fCount) GError(GVEC_INDEX_ERR, x) #endif #define GVEC_CAPACITY_ERR "GVec error: invalid capacity: %d\n" #define GVEC_COUNT_ERR "GVec error: invalid count: %d\n" #define MAXLISTSIZE INT_MAX-1 #define FREEDATA (fFreeProc!=NULL) template struct IsPrimitiveType { enum { VAL = 0 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; /* template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; template<> struct IsPrimitiveType { enum { VAL = 1 }; }; */ template int DefLTCompareProc(const pointer p1, const pointer p2) { const OBJ& o1 = *((OBJ*) p1); const OBJ& o2 = *((OBJ*) p2); if (o1 < o2) return -1; else return ((o2 < o1) ? 
1 : 0 ); } //basic template for array of objects; //so it doesn't require comparison operators to be defined template class GVec { protected: OBJ* fArray; int fCount; int fCapacity; void qSort(int L, int R, GCompareProc* cmpFunc); public: GVec(int init_capacity=2); GVec(int init_count, const OBJ init_val); GVec(GVec& array); //copy constructor const GVec& operator=(GVec& array); //copy operator virtual ~GVec(); void Insert(int idx, OBJ item) { Insert(idx, &item); } void Insert(int idx, OBJ* item); void idxInsert(int idx, OBJ& item) { Insert(idx, &item); } void Grow(); void Grow(int idx, OBJ& item); //grow and add/insert item copy void Reverse(); //WARNING: will break the sort order if SORTED! int Add(OBJ* item); // simply append to the end of fArray, reallocating as needed int Add(OBJ& item) { return Add(&item); } int cAdd(OBJ item) { return Add(&item); } //all these will CREATE a new OBJ and COPY to it // // using OBJ copy operator= // -- stack/queue usage: //int Push(OBJ& item) { return Add(&item); } int Push(OBJ& item) { return Add(&item); } int cPush(OBJ item) { return Add(&item); } OBJ Pop();// Stack use; removes and returns a copy of the last item OBJ Shift(); //Queue use: removes and returns a copy of the first item void Add(GVec& list); //append copies of all items from another list OBJ& Get(int idx) { TEST_INDEX(idx); return fArray[idx]; } inline OBJ& operator[](int i) { TEST_INDEX(i); return fArray[i]; } OBJ& Last() { TEST_INDEX(fCount-1); return fArray[fCount-1]; } OBJ& First() { TEST_INDEX(0); return fArray[0]; } void Clear(); void Delete(int index); void Replace(int idx, OBJ& item); //Put, use operator= to copy void Exchange(int idx1, int idx2); void Swap(int idx1, int idx2) { Exchange(idx1, idx2); } int Capacity() { return fCapacity; } //this will reject identical items in sorted lists only! void setCapacity(int NewCapacity); int Count() { return fCount; } void setCount(int NewCount); // will trim or expand the array as needed void setCount(int NewCount, OBJ* v); //same as setCount() but new objects are set to v void setCount(int NewCount, OBJ v); void Resize(int NewCount) { setCount(NewCount); } //void Resize(int NewCount, OBJ* v) { setCount(NewCount, v); } void Resize(int NewCount, OBJ v) { setCount(NewCount, &v); } //void Move(int curidx, int newidx); bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } void Sort(GCompareProc* cmpFunc); void Sort(); }; //---- template for dynamic array of object pointers //---- it's faster than GVec and has item deallocation awareness template class GPVec { protected: OBJ** fList; //pointer to an array of pointers to objects int fCount; //total number of entries in list int fCapacity; //current allocated size GFreeProc* fFreeProc; //useful for deleting objects //--- void Expand(); void Grow(); void Grow(int idx, OBJ* newitem); void qSort(int L, int R, GCompareProc* cmpFunc); public: static void DefaultFreeProc(pointer item) { delete (OBJ*)item; } virtual ~GPVec(); GPVec(int init_capacity=2, bool free_elements=true); //also the default constructor GPVec(bool free_elements); GPVec(GPVec& list); //copy constructor? GPVec(GPVec* list); //kind of a copy constructor const GPVec& operator=(GPVec& list); OBJ* Get(int i); OBJ* operator[](int i) { return this->Get(i); } void Reverse(); //reverse pointer array; WARNING: will break the sort order if sorted! void freeItem(int idx); //calls fFreeProc (or DefaultFreeProc) on fList[idx] and sets NULL there, doesn't pack! //it will free even if fFreeProc is NULL! 
void setFreeItem(GFreeProc *freeProc) { fFreeProc=freeProc; } void setFreeItem(bool doFree) { if (doFree) fFreeProc=DefaultFreeProc; else fFreeProc=NULL; } // -- stack usage: int Push(OBJ* item) { return Add(item); } OBJ* Pop();// Stack use; removes and returns last item,but does NOT FREE it OBJ* Shift(); //Queue use: removes and returns first item, but does NOT FREE it void deallocate_item(OBJ*& item); //forcefully call fFreeProc or delete on item void Clear(); void Exchange(int idx1, int idx2); void Swap(int idx1, int idx2) { Exchange(idx1, idx2); } OBJ* First() { return (fCount>0)?fList[0]:NULL; } OBJ* Last() { return (fCount>0)?fList[fCount-1]:NULL;} bool isEmpty() { return fCount==0; } bool notEmpty() { return fCount>0; } int Capacity() { return fCapacity; } int Count() { return fCount; } void setCapacity(int NewCapacity); void setCount(int NewCount); //the same as setCapacity() but the new item range is filled with NULLs int Add(OBJ* item); //simply append the pointer copy void Add(GPVec& list); //add all pointers from another list void Insert(int idx, OBJ* item); void Move(int curidx, int newidx); void Put(int idx, OBJ* item); void Pack(); void Delete(int index); //also frees the item if fFreeProc!=NULL, and shifts the successor items void Forget(int idx); //simply places a NULL at fList[idx], nothing else int RemovePtr(pointer item); //always use linear search to find the pointer! calls Delete() if found int IndexOf(pointer item); //a linear search for pointer address! void Sort(GCompareProc* cmpFunc); void Sort(); }; //-------------------- TEMPLATE IMPLEMENTATION------------------------------- template GVec::GVec(int init_capacity) { fCount=0; fCapacity=0; fArray=NULL; setCapacity(init_capacity); } template GVec::GVec(int init_count, const OBJ init_val) { fCount=0; fCapacity=0; fArray=NULL; setCapacity(init_count); fCount = init_count; for (int i=0;i GVec::GVec(GVec& array) { //copy constructor this->fCount=array.fCount; this->fCapacity=array.fCapacity; this->fArray=NULL; if (this->fCapacity>0) { if (IsPrimitiveType::VAL) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); memcpy(fArray, array.fArray, fCount*sizeof(OBJ)); } else { fArray=new OBJ[this->fCapacity]; //]() // uses OBJ operator= for (int i=0;ifCount;i++) fArray[i]=array[i]; } } this->fCount=array.fCount; } template const GVec& GVec::operator=(GVec& array) { if (&array==this) return *this; Clear(); fCapacity=array.fCapacity; fCount=array.fCount; if (fCapacity>0) { if (IsPrimitiveType::VAL) { GMALLOC(fArray, fCapacity*sizeof(OBJ)); memcpy(fArray, array.fArray, fCount*sizeof(OBJ)); } else { fArray=new OBJ[this->fCapacity]; // ]() // uses OBJ operator= for (int i=0;i GVec::~GVec() { this->Clear(); } template void GVec::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: NewCapacity MUST be > fCount //if you want to shrink it use Resize() or setCount() if (NewCapacity!=fCapacity) { if (NewCapacity==0) { if (IsPrimitiveType::VAL) { GFREE(fArray); } else { delete[] fArray; fArray=NULL; } } else { if (IsPrimitiveType::VAL) { GREALLOC(fArray, NewCapacity*sizeof(OBJ)); } else { OBJ* oldArray=fArray; //fArray=new OBJ[NewCapacity](); fArray=new OBJ[NewCapacity]; for (int i=0;ifCount;i++) { fArray[i] = oldArray[i]; }// we need operator= here //wouldn't be faster to use memcpy instead? 
//memcpy(fArray, oldArray, fCount*sizeof(OBJ)); if (oldArray) delete[] oldArray; } } fCapacity=NewCapacity; } } template void GVec::Clear() { fCount=0; if (IsPrimitiveType::VAL) { GFREE(fArray); } else { delete[] fArray; fArray=NULL; } fCapacity=0; } template void GVec::Grow() { int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; setCapacity(fCapacity + delta); } template void GVec::Reverse() { int l=0; int r=fCount-1; OBJ c; while (l void GVec::Grow(int idx, OBJ& item) { int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity >= MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range //if (NewCapacity!=fCapacity) { if (idx==fCount) { //append item //GREALLOC(fArray, NewCapacity*sizeof(OBJ)); setCapacity(NewCapacity); fArray[idx]=item; } else { //insert item at idx OBJ* newList; if (IsPrimitiveType::VAL) { GMALLOC(newList, NewCapacity*sizeof(OBJ)); //copy data before idx memcpy(&newList[0],&fArray[0], idx*sizeof(OBJ)); newList[idx]=item; //copy data after idx memmove(&newList[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ)); //..shouldn't do this: memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ)); //data copied: GFREE(fArray); } else { newList=new OBJ[NewCapacity]; //]() // operator= required! for (int i=0;i int GVec::Add(OBJ* item) { if (item==NULL) return -1; if (fCount==fCapacity) Grow(); fArray[fCount] = *item; //OBJ::operator= must copy OBJ properly! fCount++; return fCount-1; } template void GVec::Add(GVec& list) { if (list.Count()==0) return; //simply copy setCapacity(fCapacity+list.fCount); if (IsPrimitiveType::VAL) { memcpy( &fArray[fCount], list.fArray, list.fCount*sizeof(OBJ)); } else { for (int i=0;i OBJ GVec::Pop() { if (fCount<=0) GError("Error: invalid GVec::Pop() operation!\n"); fCount--; //OBJ o(fArray[fCount]); //copy constructor //o=fList[fCount]; //fArray[fCount]=NULL; return fArray[fCount]; //copy of the last element (copy constructor called) } //Queue usage: template OBJ GVec::Shift() { if (fCount<=0) GError("Error: invalid GVec::Shift() operation!\n"); fCount--; OBJ o(fArray[0]); //copy constructor if (fCount>0) memmove(&fArray[0], &fArray[1], (fCount)*sizeof(OBJ)); //fList[fCount]=NULL; //not that it matters.. 
return o; } template void GVec::Insert(int idx, OBJ* item) { //idx must be the new position this new item must have //so the allowed range is [0..fCount] //the old idx item all the above will be shifted to idx+1 if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { //need to resize the array Grow(idx, *item); //expand and also copy/move data and insert the new item return; } //move data around to make room for the new item if (idx::VAL) { memmove(&fArray[idx+1],&fArray[idx], (fCount-idx)*sizeof(OBJ)); } else { for (int i=fCount; i>idx; i--) { fArray[i]=fArray[i-1]; } } } fArray[idx]=*item; fCount++; } /*template void GVec::Move(int curidx, int newidx) { //swap if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ tmp=fArray[curidx]; //copy constructor here fArray[curidx]=fArray[newidx]; fArray[newidx]=tmp; }*/ template void GVec::Replace(int idx, OBJ& item) { TEST_INDEX(idx); fArray[idx]=item; } template void GVec::Exchange(int idx1, int idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ item=fArray[idx1]; fArray[idx1]=fArray[idx2]; fArray[idx2]=item; } template void GVec::Delete(int index) { TEST_INDEX(index); fCount--; if (IsPrimitiveType::VAL) { if (index void GVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); //if (NewCount > fCapacity) setCapacity(NewCount); while(NewCount > fCapacity) Grow(); fCount = NewCount; //new items will be populated by the default object constructor(!) } template void GVec::setCount(int NewCount, OBJ* v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); while (NewCount > fCapacity) Grow(); if (NewCount>fCount) { for (int i=fCount;i void GVec::setCount(int NewCount, OBJ v) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); while (NewCount > fCapacity) Grow(); if (NewCount>fCount) { for (int i=fCount;i void GVec::qSort(int l, int r, GCompareProc* cmpFunc) { int i, j; OBJ p,t; do { i = l; j = r; p = this->fArray[(l + r) >> 1]; do { while (cmpFunc(&(this->fArray[i]), &p) < 0) i++; while (cmpFunc(&(this->fArray[j]), &p) > 0) j--; if (i <= j) { t = this->fArray[i]; this->fArray[i] = this->fArray[j]; this->fArray[j] = t; i++; j--; } } while (i <= j); if (l < j) qSort(l, j, cmpFunc); l = i; } while (i < r); } template void GVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fArray!=NULL && this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //*=> GPVec implementation template GPVec::GPVec(GPVec& list) { //copy constructor fCount=list.fCount; fCapacity=list.fCapacity; fList=NULL; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); } fFreeProc=list.fFreeProc; fCount=list.fCount; memcpy(fList, list.fList, fCount*sizeof(OBJ*)); //for (int i=0;i GPVec::GPVec(GPVec* plist) { //another copy constructor fCount=0; fCapacity=plist->fCapacity; fList=NULL; if (fCapacity>0) { GMALLOC(fList, fCapacity*sizeof(OBJ*)); } fFreeProc=plist->fFreeProc; fCount=plist->fCount; memcpy(fList, plist->fList, fCount*sizeof(OBJ*)); //for (int i=0;ifCount;i++) Add(plist->Get(i)); } template const GPVec& GPVec::operator=(GPVec& list) { if (&list!=this) { Clear(); fFreeProc=list.fFreeProc; //Attention: the object *POINTERS* are copied, // but the actual object content is NOT duplicated 
//for (int i=0;i void GPVec::Add(GPVec& list) { if (list.Count()==0) return; //simply copy the pointers! -- the objects will be shared setCapacity(fCapacity+list.fCount); memcpy( & (fList[fCount]), list.fList, list.fCount*sizeof(OBJ*)); fCount+=list.fCount; } template void GPVec::Reverse() { int l=0; int r=fCount-1; OBJ* c; while (l GPVec::GPVec(int init_capacity, bool free_elements) { fCount=0; fCapacity=0; fList=NULL; fFreeProc=(free_elements) ? DefaultFreeProc : NULL; if (init_capacity>0) setCapacity(init_capacity); } template GPVec::GPVec(bool free_elements) { fCount=0; fCapacity=0; fList=NULL; fFreeProc=(free_elements) ? DefaultFreeProc : NULL; } template GPVec::~GPVec() { this->Clear();//this will free the items if fFreeProc is defined } template void GPVec::setCapacity(int NewCapacity) { if (NewCapacity < fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range if (NewCapacity!=fCapacity) { if (NewCapacity==0) { GFREE(fList); } else { GREALLOC(fList, NewCapacity*sizeof(OBJ*)); } fCapacity=NewCapacity; } } template void GPVec::deallocate_item(OBJ* &item) { if (item==NULL) return; if (FREEDATA) { (*fFreeProc)(item); item=NULL; } else { delete item; item=NULL; } } template void GPVec::Clear() { if (FREEDATA) { for (int i=0; i void GPVec::Exchange(int idx1, int idx2) { TEST_INDEX(idx1); TEST_INDEX(idx2); OBJ* item=fList[idx1]; fList[idx1]=fList[idx2]; fList[idx2]=item; } template void GPVec::Expand() { if (fCount==fCapacity) Grow(); //return this; } template OBJ* GPVec::Get(int idx) { TEST_INDEX(idx); return fList[idx]; } template void GPVec::Grow() { /* int delta; if (fCapacity > 64 ) { delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4); } else { delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; } */ int delta = (fCapacity>8) ? (fCapacity>>2) : 1; setCapacity(fCapacity + delta); } template void GPVec::Grow(int idx, OBJ* newitem) { /* int delta; if (fCapacity > 64 ) { delta = (fCapacity > 0xFFF) ? 0x100 : (fCapacity>>4); } else { delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; } */ int delta = (fCapacity>8) ? (fCapacity>>2) : 1 ; int NewCapacity=fCapacity+delta; if (NewCapacity <= fCount || NewCapacity > MAXLISTSIZE) GError(GVEC_CAPACITY_ERR, NewCapacity); //error: capacity not within range //if (NewCapacity!=fCapacity) { /*if (NewCapacity==0) { GFREE(fList); } else {//add the new item */ if (idx==fCount) { GREALLOC(fList, NewCapacity*sizeof(OBJ*)); fList[idx]=newitem; } else { OBJ** newList; GMALLOC(newList, NewCapacity*sizeof(OBJ*)); //copy data before idx memcpy(&newList[0],&fList[0], idx*sizeof(OBJ*)); newList[idx]=newitem; //copy data after idx memmove(&newList[idx+1],&fList[idx], (fCount-idx)*sizeof(OBJ*)); memset(&newList[fCount+1], 0, (NewCapacity-fCount-1)*sizeof(OBJ*)); //data copied: GFREE(fList); fList=newList; } fCount++; fCapacity=NewCapacity; } template int GPVec::IndexOf(pointer item) { int result=-1; for (int i=0;i int GPVec::Add(OBJ* item) { int result; if (item==NULL) return -1; result = fCount; if (result==fCapacity) this->Grow(); fList[result]=item; fCount++; return fCount-1; } template void GPVec::Insert(int idx, OBJ* item) { //idx can be [0..fCount] so an item can be actually added if (idx<0 || idx>fCount) GError(GVEC_INDEX_ERR, idx); if (fCount==fCapacity) { Grow(idx, item); return; } if (idx void GPVec::Move(int curidx, int newidx) { //s //BE_UNSORTED; //cannot do that in a sorted list! 
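// Note: the range check below rejects every call where curidx != newidx,
// so as written Move() can only succeed as a no-op (curidx == newidx with
// newidx < fCount); any call that would actually relocate an element aborts
// with GVEC_INDEX_ERR. The commented-out GVec::Move() earlier in this file
// uses the same condition.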
if (curidx!=newidx || newidx>=fCount) GError(GVEC_INDEX_ERR, newidx); OBJ* p; p=Get(curidx); //this is a delete: fCount--; if (curidx void GPVec::Put(int idx, OBJ* item) { //WARNING: this will never free the replaced item! TEST_INDEX(idx); fList[idx]=item; } template void GPVec::Forget(int idx) { TEST_INDEX(idx); fList[idx]=NULL; //user should free that somewhere else } template void GPVec::freeItem(int idx) { TEST_INDEX(idx); if (fFreeProc!=NULL) { (*fFreeProc)(fList[idx]); } else this->DefaultFreeProc(fList[idx]); fList[idx]=NULL; } template void GPVec::Delete(int index) { TEST_INDEX(index); if (fFreeProc!=NULL && fList[index]!=NULL) { (*fFreeProc)(fList[index]); //freeItem } fList[index]=NULL; fCount--; if (index OBJ* GPVec::Pop() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[fCount]; fList[fCount]=NULL; return o; } //Queue usage: template OBJ* GPVec::Shift() { if (fCount<=0) return NULL; fCount--; OBJ* o=fList[0]; if (fCount>0) memmove(&fList[0], &fList[1], (fCount)*sizeof(OBJ*)); fList[fCount]=NULL; //not that it matters.. return o; } //linear search for the pointer address template int GPVec::RemovePtr(pointer item) { if (item==NULL) return -1; for (int i=0;i void GPVec::Pack() { for (int i=fCount-1; i>=0; i--) if (fList[i]==NULL) Delete(i); //shift rest of fList content accordingly } template void GPVec::setCount(int NewCount) { if (NewCount<0 || NewCount > MAXLISTSIZE) GError(GVEC_COUNT_ERR, NewCount); if (NewCount > fCapacity) setCapacity(NewCount); if (NewCount > fCount) //pad with NULL pointers memset(& fList[fCount], 0, (NewCount - fCount) * sizeof(OBJ*)); fCount = NewCount; } template void GPVec::qSort(int L, int R, GCompareProc* cmpFunc) { int I, J; OBJ* P; OBJ* T; do { I = L; J = R; P = this->fList[(L + R) >> 1]; do { while (cmpFunc(this->fList[I], P) < 0) I++; while (cmpFunc(this->fList[J], P) > 0) J--; if (I <= J) { T = this->fList[I]; this->fList[I] = this->fList[J]; this->fList[J] = T; I++; J--; } } while (I <= J); if (L < J) qSort(L, J, cmpFunc); L = I; } while (I < R); } template void GPVec::Sort(GCompareProc* cmpFunc) { if (cmpFunc==NULL) { GMessage("Warning: NULL compare function given, useless Sort() call.\n"); return; } if (this->fList!=NULL && this->fCount>0) qSort(0, this->fCount-1, cmpFunc); } template void GPVec::Sort() { GCompareProc* cmpFunc = DefLTCompareProc; Sort(cmpFunc); } //--------------------------------------------------------------------------- #endif tophat-2.0.9/src/map2gtf.h0000644000175000017500000000665412157116165014071 0ustar toortoor/* * Author: Harold Pimentel * Contact: http://cs.berkeley.edu/~pimentel * Date: June 10, 2011 */ #ifndef _MAP2GTF_H_ #define _MAP2GTF_H_ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bwt_map.h" #include "common.h" //#include "gff.h" #include "GVec.hh" #define MAX_READ_NAME_LEN 2048 //simplified version of GffObj //parsed from a simple text line struct GffTranscript: public GSeg { GVec exons; int numID; //numeric ID in tlst std::string gffID; std::string refID; char strand; GffTranscript():exons(1), numID(-1), gffID(), refID(), strand(0) { } string& getRefName() { return refID; } GffTranscript(const std::string& tline); }; /* * XXX: This class currently assumes someone used the script in TopHat to map * the reads already. It also depends on that same format. 
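 *
 * As a rough sketch of the coordinate conversion performed by this class
 * (the general idea only, not necessarily the exact algorithm in the
 * corresponding .cpp file): for a forward-strand transcript with exons
 * 100-200 and 500-650 (1-based, inclusive), exon 1 covers transcript offsets
 * 0..100, so an alignment starting at transcript offset 120 lies
 * 120-101 = 19 bases into exon 2 and therefore starts at genomic position
 * 500 + 19 = 519. Alignments that span an exon boundary typically get the
 * corresponding intron inserted as an N gap in the genomic CIGAR.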
*/ class TranscriptomeHit; class Map2GTF { public: Map2GTF(const std::string& gtf_fname, const std::string& sam_fname); ~Map2GTF(); // Write out to a BAM file bool next_read_hits(std::vector& hits, size_t& num_hits, long& read_id); void convert_coords(const std::string& out_fname, const std::string& sam_header); bool trans_to_genomic_coords(TranscriptomeHit& hit); private: //GffReader gtfReader_; GPVec transcripts; map tidx_to_t; std::string gtf_fname_; std::string in_fname_; //FILE* gtf_fhandle_; //actually a tlst handle std::ifstream tlststream; samfile_t* in_fhandle_; bam_header_t* in_sam_header_; map ref_to_id_; bam_header_t* out_sam_header_; ReadTable readTable_; RefSequenceTable refSeqTable_; Map2GTF(); // Don't want anyone calling the constructor w/o options }; class TranscriptomeHit { public: bam1_t* hit; GffTranscript* trans; TranscriptomeHit(bam1_t* h = NULL, GffTranscript* t=NULL): //GffObj* t = NULL) hit(h), trans(t) { } bool operator==(const TranscriptomeHit& th) const { if (hit->core.tid != th.hit->core.tid) return false; if (hit->core.pos != th.hit->core.pos) return false; if (hit->core.n_cigar != th.hit->core.n_cigar) return false; for (int i = 0; i < hit->core.n_cigar; ++i) { if (bam1_cigar(hit)[i] != bam1_cigar(th.hit)[i]) return false; } return true; } bool operator<(const TranscriptomeHit& th) const { if (hit->core.tid != th.hit->core.tid) return hit->core.tid < th.hit->core.tid; if (hit->core.pos != th.hit->core.pos) return hit->core.pos < th.hit->core.pos; if (hit->core.n_cigar != th.hit->core.n_cigar) return hit->core.n_cigar < th.hit->core.n_cigar; for (int i = 0; i < hit->core.n_cigar; ++i) { if (bam1_cigar(hit)[i] != bam1_cigar(th.hit)[i]) return bam1_cigar(hit)[i] < bam1_cigar(th.hit)[i]; } return false; } }; //bool get_read_start(GList* exon_list, size_t gtf_start, bool get_read_start(GVec& exon_list, size_t gtf_start, size_t& genome_start, int& exon_idx); void print_trans(GffTranscript* trans, const bam1_t* in, size_t rem_len, size_t match_len, size_t cur_pos, size_t start_pos); #endif /* _MAP2GTF_H_ */ tophat-2.0.9/src/GBase.h0000644000175000017500000003207412157116165013505 0ustar toortoor#ifndef G_BASE_DEFINED #define G_BASE_DEFINED #ifndef _POSIX_SOURCE //mostly for MinGW #define _POSIX_SOURCE #endif #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #if defined __WIN32__ || defined WIN32 || defined _WIN32 || defined _WIN32_ #ifndef __WIN32__ #define __WIN32__ #endif #include #include #define CHPATHSEP '\\' #undef off_t #define off_t int64_t #ifndef popen #define popen _popen #endif #ifndef fseeko #ifdef _fseeki64 #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin) #else /* #define _DEFINE_WIN32_FSEEKO int fseeko(FILE *stream, off_t offset, int whence); */ #define fseeko fseek #endif #endif #ifndef ftello #ifdef _ftelli64 #define ftello(stream) _ftelli64(stream) #else /* #define _DEFINE_WIN32_FTELLO off_t ftello(FILE *stream); */ #define ftello ftell #endif #endif #else #define CHPATHSEP '/' #include #endif #ifndef fseeko #define fseeko fseek #endif #ifndef ftello #define ftello ftell #endif #ifdef DEBUG #undef NDEBUG #endif typedef int32_t int32; typedef uint32_t uint32; typedef int16_t int16; typedef uint16_t uint16; typedef unsigned char uchar; typedef unsigned char byte; #ifndef MAXUINT #define MAXUINT ((unsigned int)-1) #endif #ifndef MAXINT #define MAXINT INT_MAX #endif #ifndef MAX_UINT #define MAX_UINT ((unsigned int)-1) #endif #ifndef MAX_INT #define MAX_INT 
INT_MAX #endif typedef int64_t int64; typedef uint64_t uint64; /****************************************************************************/ #ifndef EXIT_FAILURE #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #define EXIT_SUCCESS 0 #endif /****************************************************************************/ #define ERR_ALLOC "Error allocating memory.\n" //------------------- // Debug helpers #ifndef NDEBUG #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__)) #ifdef TRACE #define GTRACE(exp) (GMessage exp) #else #define GTRACE(exp) ((void)0) #endif #else #define GASSERT(exp) ((void)0) #define GTRACE(exp) ((void)0) #endif #define GERROR(exp) (GError exp) /********************************** Macros ***********************************/ // Abolute value #define GABS(val) (((val)>=0)?(val):-(val)) // Min and Max #define GMAX(a,b) (((a)>(b))?(a):(b)) #define GMIN(a,b) (((a)>(b))?(b):(a)) // Min of three #define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z)) // Max of three #define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z)) // Return minimum and maximum of a, b #define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a))) // Clamp value x to range [lo..hi] #define GCLAMP(lo,x,hi) ((x)<(lo)?(lo):((x)>(hi)?(hi):(x))) typedef void* pointer; typedef unsigned int uint; typedef int GCompareProc(const pointer item1, const pointer item2); typedef long GFStoreProc(const pointer item1, FILE* fstorage); //for serialization typedef pointer GFLoadProc(FILE* fstorage); //for deserialization typedef void GFreeProc(pointer item); //usually just delete, //but may also support structures with embedded dynamic members #define GMALLOC(ptr,size) if (!GMalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GCALLOC(ptr,size) if (!GCalloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GREALLOC(ptr,size) if (!GRealloc((pointer*)(&ptr),size)) \ GError(ERR_ALLOC) #define GFREE(ptr) GFree((pointer*)(&ptr)) inline char* strMin(char *arg1, char *arg2) { return (strcmp(arg1, arg2) < 0)? arg1 : arg2; } inline char* strMax(char *arg1, char *arg2) { return (strcmp(arg2, arg1) < 0)? arg1 : arg2; } inline int iround(double x) { return (int)floor(x + 0.5); } /****************************************************************************/ inline int Gintcmp(int a, int b) { //return (a>b)? 
1 : ((a==b)?0:-1); return a-b; } int Gstrcmp(const char* a, const char* b, int n=-1); //same as strcmp but doesn't crash on NULL pointers int Gstricmp(const char* a, const char* b, int n=-1); //basic swap template function template void Gswap(T& lhs, T& rhs) { //register T tmp=lhs; T tmp=lhs; //requires copy operator lhs=rhs; rhs=tmp; } /**************** Memory management ***************************/ bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory bool GRealloc(pointer* ptr,unsigned long size); // Resize memory void GFree(pointer* ptr); // Free memory, resets ptr to NULL //int saprintf(char **retp, const char *fmt, ...); void GError(const char* format,...); // Error routine (aborts program) void GMessage(const char* format,...);// Log message to stderr // Assert failed routine:- usually not called directly but through GASSERT void GAssert(const char* expression, const char* filename, unsigned int lineno); // ****************** string manipulation ************************* char *Gstrdup(const char* str); //duplicate a string by allocating a copy for it and returning it char* Gstrdup(const char* sfrom, const char* sto); //same as GStrdup, but with an early termination (e.g. on delimiter) char* Gsubstr(const char* str, char* from, char* to=NULL); //extracts a substring, allocating it, including boundaries (from/to) int strsplit(char* str, char** fields, int maxfields, const char* delim); int strsplit(char* str, char** fields, int maxfields, const char delim); int strsplit(char* str, char** fields, int maxfields); //splits by tab or space char* replaceStr(char* &str, char* newvalue); //conversion: to Lower/Upper case // creating a new string: char* upCase(const char* str); char* loCase(const char* str); // changing string in place: char* strlower(char * str); char* strupper(char * str); //strstr but for memory zones: scans a memory region //for a substring: void* Gmemscan(void *mem, unsigned int len, void *part, unsigned int partlen); // test if a char is in a string: bool chrInStr(char c, const char* str); char* rstrchr(char* str, char ch); /* returns a pointer to the rightmost occurence of ch in str - like rindex for platforms missing it*/ char* strchrs(const char* s, const char* chrs); //strchr but with a set of chars instead of only one char* rstrfind(const char* str, const char *substr); // like rindex() but for strings; right side version of strstr() char* reverseChars(char* str, int slen=0); //in place reversal of string char* rstrstr(const char* rstart, const char *lend, const char* substr); /*the reversed, rightside equivalent of strstr: starts searching from right end (rstart), going back to left end (lend) and returns a pointer to the last (right) matching character in str */ char* strifind(const char* str, const char* substr); // the case insensitive version of strstr -- finding a string within a strin //Determines if a string begins with a given prefix //(returns false when any of the params is NULL, // but true when prefix is '' (empty string)!) 
bool startsWith(const char* s, const char* prefix); bool endsWith(const char* s, const char* suffix); //Note: returns true if suffix is empty string, but false if it's NULL // ELF hash function for strings int strhash(const char* str); //---- generic base GSeg : genomic segment (interval) -- // coordinates are considered 1-based (so 0 is invalid) class GSeg { public: uint start; //starte) { start=e;end=s; } else { start=s;end=e; } } //check for overlap with other segment uint len() { return end-start+1; } bool overlap(GSeg* d) { //return startstart ? (d->start<=end) : (start<=d->end); return (start<=d->end && end>=d->start); } bool overlap(GSeg& d) { //return start=d.start); } bool overlap(GSeg& d, int fuzz) { //return start=d.start); } bool overlap(uint s, uint e) { if (s>e) { Gswap(s,e); } //return start=s); } //return the length of overlap between two segments int overlapLen(GSeg* r) { if (startstart) { if (r->start>end) return 0; return (r->end>end) ? end-r->start+1 : r->end-r->start+1; } else { //r->start<=start if (start>r->end) return 0; return (r->endend-start+1 : end-start+1; } } int overlapLen(uint rstart, uint rend) { if (rstart>rend) { Gswap(rstart,rend); } if (startend) return 0; return (rend>end) ? end-rstart+1 : rend-rstart+1; } else { //rstart<=start if (start>rend) return 0; return (rendstart && end==s->end); uint sd = (start>s->start) ? start-s->start : s->start-start; uint ed = (end>s->end) ? end-s->end : s->end-end; return (sd<=fuzz && ed<=fuzz); } //comparison operators required for sorting bool operator==(GSeg& d){ return (start==d.start && end==d.end); } bool operator<(GSeg& d){ return (start==d.start)?(end0) pushed=true; } // "undo" the last getLine request // so the next call will in fact return the same line GLineReader(const char* fname) { FILE* f=fopen(fname, "rb"); if (f==NULL) GError("Error opening file '%s'!\n",fname); closeFile=true; init(f); } GLineReader(FILE* stream=NULL, off_t fpos=0) { closeFile=false; init(stream,fpos); } void init(FILE* stream, off_t fpos=0) { len=0; isEOF=false; allocated=1024; GMALLOC(buf,allocated); lcount=0; buf[0]=0; file=stream; filepos=fpos; pushed=false; } ~GLineReader() { GFREE(buf); if (closeFile) fclose(file); } }; /* extended fgets() - to read one full line from a file and update the file position correctly ! buf will be reallocated as necessary, to fit the whole line */ char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL); //print int/values nicely formatted in 3-digit groups char* commaprint(uint64 n); /*********************** File management functions *********************/ // removes the last part (file or directory name) of a full path // WARNING: this is a destructive operation for the given string! void delFileName(char* filepath); // returns a pointer to the last file or directory name in a full path const char* getFileName(const char* filepath); // returns a pointer to the file "extension" part in a filename const char* getFileExt(const char* filepath); int fileExists(const char* fname); //returns 0 if file entry doesn't exist // 1 if it's a directory // 2 if it's a regular file // 3 otherwise (?) 
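// A small worked example for the GSeg interval helpers defined earlier in
// this file (an illustrative sketch, not part of the original API): with
// 1-based inclusive coordinates, segments [10,50] and [40,80] overlap over
// 40..50, i.e. 50-40+1 = 11 bases.
inline bool GSeg_overlap_example() { //illustrative only
  GSeg a(10,50);
  GSeg b(40,80);
  return a.overlap(&b) && a.overlapLen(40,80)==11; //both hold: true and 11
}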
int64 fileSize(const char* fpath); //write a formatted fasta record, fasta formatted void writeFasta(FILE *fw, const char* seqid, const char* descr, const char* seq, int linelen=60, int seqlen=0); //parses the next number found in a string at the current position //until a non-digit (and not a '.', 'e','E','-','+') is encountered; //updates the char* pointer to be after the last digit parsed bool parseNumber(char* &p, double& v); bool parseDouble(char* &p, double& v); //just an alias for parseNumber bool parseInt(char* &p, int& i); bool parseUInt(char* &p, uint& i); bool parseHex(char* &p, uint& i); #endif /* G_BASE_DEFINED */ tophat-2.0.9/src/tophat-fusion-post0000755000175000017500000032036612160105020016046 0ustar toortoor#!/usr/bin/env python """ tophat-fusion-post Created by Daehwan Kim on 2011-05-05. Copyright (c) 2011 Daehwan Kim. All rights reserved. """ import sys, getopt, warnings import os, subprocess, errno import copy import string, re import random from datetime import datetime, date, time import math use_message = ''' TopHat-Fusion Usage: tophat-fusion-post [options] Options: -v/--version -o/--output-dir [ default: ./tophatfusion_out ] --num_fusion_reads [ default: 3 ] --num_fusion_pairs [ default: 2 ] --num_fusion_both [ default: 5 ] --fusion-read-mismatches [ default: 2 ] --fusion-multireads [ default: 2 ] --non-human -p/--num-threads [ default: 1 ] --skip-fusion-kmer --skip-filter-fusion --skip-blast --skip-read-dist --skip-html --tex-table ''' class Usage(Exception): def __init__(self, msg): self.msg = msg output_dir = "./tophatfusion_out/" logging_dir = output_dir + "logs/" tmp_dir = output_dir + "tmp/" class TopHatFusionParams: def __init__(self, keep_tmp = ""): self.keep_tmp = keep_tmp self.num_fusion_reads = 3 self.num_fusion_pairs = 2 self.num_fusion_both = 0 self.fusion_read_mismatches = 2 self.fusion_multireads = 2 self.is_human = True self.num_threads = 1 self.skip_fusion_kmer = False self.skip_filter_fusion = False self.skip_blast = False self.skip_read_dist = False self.skip_html = False self.tex_table = False def check(self): if False: die("Error: arg to --num-threads must be greater than 0") def parse_options(self, argv): try: opts, args = getopt.getopt(argv[1:], "hp:o:", ["version", "help", "output-dir=", "num-fusion-reads=", "num-fusion-pairs=", "num-fusion-both=", "fusion-read-mismatches=", "fusion-multireads=", "non-human", "num-threads=", "skip-fusion-kmer", "skip-filter-fusion", "skip-blast", "skip-read-dist", "skip-html", "tex-table"]) except getopt.error, msg: raise Usage(msg) for option, value in opts: if option in ("-v", "--version"): print "TopHat v%s" % (get_version()) sys.exit(0) if option in ("-h", "--help"): raise Usage(use_message) if option == "--num-fusion-reads": self.num_fusion_reads = int(value) if option == "--num-fusion-pairs": self.num_fusion_pairs = int(value) if option == "--num-fusion-both": self.num_fusion_both = int(value) if option == "--fusion-read-mismatches": self.fusion_read_mismatches = int(value) if option == "--fusion-multireads": self.fusion_multireads = int(value) if option == "--non-human": self.is_human = False if option in ("-p", "--num-threads"): self.num_threads = int(value) if option in ("-o", "--output-dir"): global output_dir, logging_dir, tmp_dir output_dir = value + "/" logging_dir = output_dir + "logs/" tmp_dir = output_dir + "tmp/" if option == "--skip-fusion-kmer": self.skip_fusion_kmer = True if option == "--skip-filter-fusion": self.skip_filter_fusion = True if option == "--skip-blast": self.skip_blast = 
True if option == "--skip-read-dist": self.skip_read_dist = True if option == "--skip-html": self.skip_html = True if option == "--tex-table": self.tex_table = True if len(args) < 1: raise Usage(use_message) return args # Returns the current time in a nice format def right_now(): curr_time = datetime.now() return curr_time.strftime("%c") def reverse_complement(seq): result = "" for nt in seq: base = nt if nt == 'A': base = 'T' elif nt == 'a': base = 't' elif nt == 'C': base = 'G' elif nt == 'c': base = 'g' elif nt == 'G': base = 'C' elif nt == 'g': base = 'c' elif nt == 'T': base = 'A' elif nt == 't': base = 'a' result = base + result return result def get_chromosome_order(bowtie_index, chrs): bowtie_header_cmd = ['bowtie', '--sam', bowtie_index, '/dev/null'] bowtie_header_filename = tmp_dir + "sam_header" subprocess.call(bowtie_header_cmd, stdout=open(bowtie_header_filename, "w"), stderr=open("/dev/null")) bowtie_header_file = open(bowtie_header_filename) for line in bowtie_header_file: index = string.find(line, "SN:") if index != -1: SN = line[index+3:].split() chrs.append(SN[0]) bowtie_header_file.close() # Ensures that the output, logging, and temp directories are present. If not, # they are created def prepare_output_dir(): if os.path.exists(output_dir): pass else: os.mkdir(output_dir) if os.path.exists(logging_dir): pass else: os.mkdir(logging_dir) if os.path.exists(tmp_dir): pass else: try: os.makedirs(tmp_dir) except OSError, o: die("\nError creating directory %s (%s)" % (tmp_dir, o)) def check_samples(): sample_list_filename = output_dir + 'sample_list.txt' prev_list, curr_list = [], [] if os.path.exists(sample_list_filename): sample_list = open(sample_list_filename, 'r') for line in sample_list: prev_list.append(line[:-1]) sample_list.close() for dir in sorted(os.listdir('.')): if string.find(dir, "tophat_") != 0: continue fusion = dir + "/fusions.out" if not os.path.exists(fusion): continue curr_list.append(dir[7:]) if prev_list != curr_list: sample_list = open(sample_list_filename, 'w') for sample in curr_list: print >> sample_list, sample sample_list.close() return True else: return False def map_fusion_kmer(bwt_idx_prefix, params, sample_update = False): def get_fusion_seq(): seq_dic = {} for dir in sorted(os.listdir('.')): if string.find(dir, "tophat_") != 0: continue fusion = dir + "/fusions.out" if not os.path.exists(fusion): continue fusion_file = open(fusion, 'r') fusion_file.readline() for line in fusion_file: left_seq, right_seq = line[:-1].split('\t@\t')[2:4] left_seq = left_seq.split(' ')[0] right_seq = right_seq.split(' ')[1] if len(left_seq) < 23 or len(right_seq) < 23: continue seq_dic[left_seq[-23:]] = 1 seq_dic[right_seq[:23]] = 1 fusion_file.close() fusion_seq_fa = open(output_dir + "fusion_seq.fa", 'w') for seq in seq_dic.keys(): print >> fusion_seq_fa, ">%s" % seq print >> fusion_seq_fa, seq fusion_seq_fa.close() def convert_bowtie(): bwt_dic = {} bwtout_file = open(output_dir + "fusion_seq.bwtout", 'r') for line in bwtout_file: seq, temp, chr, coord = line[:-1].split('\t')[0:4] if seq in bwt_dic: bwt_dic[seq].append(chr + ":" + coord) else: bwt_dic[seq] = [chr + ":" + coord] bwtout_file.close() kmer_map = open(fusion_kmer_file_name, 'w') for seq, chrs in bwt_dic.items(): print >> kmer_map, "%s\t%s" % (seq, ','.join(chrs)) kmer_map.close() print >> sys.stderr, "[%s] Extracting 23-mer around fusions and mapping them using Bowtie" % right_now() if sample_update: print >> sys.stderr, "\tsamples updated" fusion_kmer_file_name = output_dir + "fusion_seq.map" if 
not os.path.exists(fusion_kmer_file_name) or \ sample_update: get_fusion_seq() cmd = ['bowtie', '-p', '8', '-a', '-n', '3', '-m', '100', bwt_idx_prefix, '-f', '%sfusion_seq.fa' % output_dir] subprocess.call(cmd, stdout=open(output_dir + 'fusion_seq.bwtout', 'w'), stderr=open('/dev/null', 'w')) convert_bowtie() def filter_fusion(bwt_idx_prefix, params): chrs = [] get_chromosome_order(bwt_idx_prefix, chrs) chr_order = {} for chr in chrs: chr_order[chr] = len(chr_order) def filter_fusion_impl(fusion, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list): def gene_exists(gene_list, chr, coord, dir, is_left): min = 0 max = len(gene_list) - 1 while max - min >= 0: mid = (min + max) / 2 gene = gene_list[mid] ref_chr = gene[1] if chr != ref_chr: if chr_order[chr] < chr_order[ref_chr]: max = mid - 1 else: min = mid + 1 continue left_coord = gene[2] right_coord = gene[3] if coord >= left_coord and coord <= right_coord: left_coords = gene[5] right_coords = gene[6] sense = gene[7] belong = False where = "outside" for i in range(len(left_coords)): # gives some relax! relax = 3 left = int(left_coords[i]) - 1 right = int(right_coords[i]) - 1 if coord <= right + relax: if coord < left - relax: where = "intron%d(%d-%d)" % (i, int(right_coords[i-1]), left - 1) else: if ((is_left and dir == "f") or (not is_left and dir == "r")) and abs(coord - right) <= relax: belong = True if ((is_left and dir == "r") or (not is_left and dir == "f")) and abs(coord - left) <= relax: belong = True where = "exon%d(%d-%d)" % (i + 1, left, right) break return [gene[0], gene[4], where, belong, sense] elif coord < left_coord: max = mid - 1 else: min = mid + 1 return ["N/A", "N/A", "N/A", False, "N/A"] def how_diff(first, second): seq_len = len(first) min_value = 10000 prev, curr = [0 for i in range(seq_len)], [0 for i in range(seq_len)] for j in range(seq_len): for i in range(seq_len): value = 10000 if first[i] == second[j]: match = 0 else: match = 1 # right if i == 0: value = j * 2 + match elif j > 0: value = prev[i] + 2 temp_value = 10000 # down if j == 0: temp_value = i * 2 + match elif i > 0: temp_value = curr[i-1] + 2 if temp_value < value: value = temp_value # match if i > 0 and j > 0: temp_value = prev[i-1] + match if temp_value < value: value = temp_value curr[i] = value if (i == seq_len - 1 or j == seq_len - 1) and value < min_value: min_value = value prev, curr = curr, prev return min_value kmer_len = len(seq_chr_dic.keys()[0]) sample_name = fusion.split("/")[0][len("tophat_"):] data = os.getcwd().split('/')[-1] fusion_file = open(fusion, 'r') fusion_file.readline() for line in fusion_file: info, sim, left_seq_org, right_seq_org, left_dist, right_dist, pair_list = line[:-1].split('\t@\t')[:7] info = info.split('\t') sim = sim.split(' ') left_seq = left_seq_org.replace(' ', '') right_seq = right_seq_org.replace(' ', '') num_reads = int(info[4]) num_pair_ends = int(info[5]) num_pair_ends_fusion = int(info[6]) num_pair_ends_both = int(num_pair_ends + num_pair_ends_fusion * 0.5) num_unsupport_reads = int(info[7]) left_ext = int(info[8]) right_ext = int(info[9]) sym = float(info[10]) half_len = len(left_seq) / 2 chr1, chr2 = info[0].split('-')[:2] coord1, coord2 = int(info[1]), int(info[2]) dir = info[3] if string.find(sample_name, "single") != -1: single = True else: single = False if string.find(data, "maher") != -1: extent = min(10, num_reads) if single: if left_ext < 25 + extent * 2 or right_ext < 25 + extent * 2: continue else: if left_ext < 14 + extent or right_ext < 14 + extent: continue else: if left_ext < 16 or 
right_ext < 16: continue both = num_reads + num_pair_ends_both all = both if num_pair_ends > num_reads * 50: continue if num_reads < params.num_fusion_reads or \ num_pair_ends < params.num_fusion_pairs or \ both < params.num_fusion_both: continue if (chr1 != chr2 and num_unsupport_reads > num_reads) or \ (chr1 == chr2 and num_unsupport_reads > all + num_pair_ends + 5): continue pairs = [] if num_pair_ends >= 1: pairs = pair_list.strip().split() left, right = pairs[0].split(':') if abs(int(left)) + abs(int(right)) > 2000: continue pairs = pairs[:200] # are the sequences around the breakpoint different enough? if int(sim[0]) < 8: continue # is the reads distributed symmetrically? if sym >= 22 + max(0, 6 - num_reads): continue max_intron_len = 100000 if chr1 == chr2 and dir == "ff": coord_dif = coord2 - coord1 if coord_dif > 0 and coord_dif < max_intron_len: continue if not left_seq[half_len-kmer_len:half_len] in seq_chr_dic or not right_seq[half_len:half_len+kmer_len] in seq_chr_dic: continue left_chrs = seq_chr_dic[left_seq[half_len-kmer_len:half_len]] right_chrs = seq_chr_dic[right_seq[half_len:half_len+kmer_len]] if chr1 == chr2: max_intron_len = min(max_intron_len, abs(coord1 - coord2) * 9 / 10) same = False for chr_coord in left_chrs: chr, coord = chr_coord.split(':') coord = int(coord) if chr == chr2 and abs(coord - coord2) < max_intron_len: same = True break if same: continue for chr_coord in right_chrs: chr, coord = chr_coord.split(':') coord = int(coord) if chr == chr1 and abs(coord - coord1) < max_intron_len: same = True break if same: continue def find_gene(chr, coord, one_dir, is_left): result = [] for gene_list in [refGene_list, ensGene_list]: result.append(gene_exists(gene_list, chr, coord, one_dir, is_left)) if result[0][0] == "N/A": return result[1] + result[1][:2] else: return result[0] + result[1][:2] dir = info[3] gene1, gene1_name, gene1_where, gene1_belong, gene1_sense, ens_gene1, ens_gene1_name = find_gene(chr1, coord1, dir[0], True) gene2, gene2_name, gene2_where, gene2_belong, gene2_sense, ens_gene2, ens_gene2_name = find_gene(chr2, coord2, dir[1], False) if gene1_name == gene2_name or ens_gene1_name == ens_gene2_name or ens_gene1 == ens_gene2: continue if gene1 == "N/A" or gene2 == "N/A" or (string.find(gene1, "ENS") == 0 and string.find(gene2, "ENS") == 0): continue left_diff = how_diff(left_seq[half_len - 20:half_len], right_seq[half_len - 20:half_len]) if left_diff <= 8: continue right_diff = how_diff(left_seq[half_len:half_len+20], right_seq[half_len:half_len+20]) if right_diff <= 8: continue if left_diff + right_diff < 20: continue left_dist = left_dist.strip().split(' ') right_dist = right_dist.strip().split(' ') for i in range(len(left_dist)): left_dist[i] = '%d' % min(9, int(left_dist[i])) right_dist[i] ='%d' % min(9, int(right_dist[i])) swap = False if (dir == 'ff' and gene1_sense == '-' and gene2_sense == '-') or \ (dir == 'rr' and gene1_sense == '+' and gene2_sense == '+') or \ (dir == 'fr' and gene1_sense == '-' and gene2_sense == '+') or \ (dir == 'rf' and gene1_sense == '+' and gene2_sense == '-'): swap = True if swap: if dir == 'ff': dir = 'rr' elif dir == 'rr': dir = 'ff' info[0] = "%s-%s" % (chr2, chr1) info[1:3] = info[1:3][::-1] info[3] = dir info[8:10] = info[8:10][::-1] left_seq_org, right_seq_org = reverse_complement(right_seq_org), reverse_complement(left_seq_org) left_dist, right_dist = right_dist, left_dist gene1_name, gene1_where, gene1_sense, gene2_name, gene2_where, gene2_sense = \ gene2_name, gene2_where, gene2_sense, gene1_name, 
gene1_where, gene1_sense for j in range(len(pairs)): pair = pairs[j].split(':') pairs[j] = ':'.join(pair[::-1]) # check if this is due to trans-splicing. if gene1_sense in "+-" and gene2_sense in "+-": if ((dir == 'ff' or dir == 'rr') and gene1_sense != gene2_sense) or \ (dir == 'fr' and (gene1_sense != '+' or gene2_sense != '-')) or \ (dir == 'rf' and (gene1_sense != '-' or gene2_sense != '+')): # print >> sys.stderr, "fusion due to trans-splicing", info # print >> sys.stderr, gene1, gene1_name, gene1_where, gene1_sense # print >> sys.stderr, gene2, gene2_name, gene2_where, gene2_sense None fusion_gene = [] fusion_gene.append(sample_name + ' ' + ' '.join(info[:10])) fusion_gene.append(left_seq_org) fusion_gene.append(right_seq_org) fusion_gene.append("%s %s" % (''.join(left_dist[::-1]), ''.join(right_dist))) fusion_gene.append("%s %s %s %s" % (gene1_name, gene1_where, gene2_name, gene2_where)) fusion_gene.append(" ".join(pairs)) fusion_gene_list.append(fusion_gene) fusion_file.close() print >> sys.stderr, "[%s] Filtering fusions" % right_now() seq_chr_dic = {} seq_chr_file = open(output_dir + "fusion_seq.map", 'r') for line in seq_chr_file: seq, chrs = line[:-1].split('\t') chrs = chrs.split(',') seq_chr_dic[seq] = chrs seq_chr_file.close() re_mir = re.compile(r'^(MIR)') def read_genes(gene_file_name, offset = 1, id = -4): gene_list, temp_gene_list = [], [] if not os.path.exists(gene_file_name): return gene_list gene_file = open(gene_file_name, 'r') for line in gene_file: line = line[:-1].split('\t')[offset:] num_exons = int(line[7]) left_coords = line[8].split(',')[:num_exons] right_coords = line[9].split(',')[:num_exons] if line[1] in chr_order and not re_mir.findall(line[id]): temp_gene_list.append([line[0], line[1], int(line[3]), int(line[4]), line[id], left_coords, right_coords, line[2]]) gene_file.close() def my_cmp(a, b): if a[1] != b[1]: if chr_order[a[1]] < chr_order[b[1]]: return -1 else: return 1 else: if a[2] != b[2]: if a[2] < b[2]: return -1 else: return 1 else: if a[3] > b[3]: return -1 else: return 1 temp_gene_list = sorted(temp_gene_list, cmp=my_cmp) gene_list = [] if len(temp_gene_list) >= 1: gene_list.append(temp_gene_list[0]) # remove overlapping genes, that is, the longest genes are preferred for i in range(1, len(temp_gene_list)): gene1 = temp_gene_list[i-1] gene2 = temp_gene_list[i] if gene1[1] == gene2[1] and gene1[3] >= gene2[3]: continue gene_list.append(gene2) return gene_list refGene_list = read_genes("refGene.txt") ensGene_list = read_genes("ensGene.txt") fusion_gene_list = [] for file in sorted(os.listdir(".")): if string.find(file, "tophat_") != 0: continue fusion_file = file + "/fusions.out" if not os.path.exists(fusion_file): continue print >> sys.stderr, "\tProcessing:", fusion_file filter_fusion_impl(fusion_file, refGene_list, ensGene_list, seq_chr_dic, fusion_gene_list) fusion_out_file = output_dir + "potential_fusion.txt" print >> sys.stderr, '\t%d fusions are output in %s' % (len(fusion_gene_list), fusion_out_file) output_file = open(fusion_out_file, 'w') for fusion_gene in fusion_gene_list: for line in fusion_gene: print >> output_file, line output_file.close() def parallel_work(pids, work): child = -1 for i in range(len(pids)): if pids[i] == 0: child = i break while child == -1: status = os.waitpid(0, 0) for i in range(len(pids)): if status[0] == pids[i]: child = i pids[i] = 0 break child_id = os.fork() if child_id == 0: work() os._exit(os.EX_OK) else: # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id) pids[child] = child_id def 
wait_pids(pids): for pid in pids: if pid > 0: os.waitpid(pid, 0) def do_blast(params): print >> sys.stderr, "[%s] Blasting 50-mers around fusions" % right_now() file_name = output_dir + "potential_fusion.txt" blast_genomic_out = output_dir + "blast_genomic" blast_nt_out = output_dir + "blast_nt" if not os.path.exists(blast_genomic_out): os.system("mkdir %s" % blast_genomic_out); if not os.path.exists(blast_nt_out): os.system("mkdir %s" % blast_nt_out) pids = [0 for i in range(params.num_threads)] count = 0 line_no = 0 check_list = [] output_list = [] output = "" file = open(file_name, 'r') for line in file: if line_no % 6 == 0: count += 1 if line_no % 6 == 1: left_seq = line[:-1].split(" ")[0] if line_no % 6 == 2: right_seq = line[:-1].split(" ")[1] if line_no % 6 == 4: def blast(database, seq, outdir): file_name = "%s/%s" % (outdir, seq) if os.path.exists(file_name): return blast_cmd = "echo %s | blastn -db %s -evalue 1e-10 -word_size 28" # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-10 -W 28" output = os.popen(blast_cmd % (seq, database)).read() if str.find(output, "No hits found") != -1: blast_cmd = "echo %s | blastn -db %s -evalue 1e-5" # blast_cmd = "echo %s | blastall -p blastn -d %s -e 1e-5" output = os.popen(blast_cmd % (seq, database)).read() pos1 = str.find(output, ">ref|") pos2 = str.find(output, "Database: ", pos1) if pos1 != -1 and pos1 < pos2: output = output[pos1:pos2].rstrip() else: output = "" file = open(file_name, "w") file.write(output) file.close() seq = left_seq + right_seq def work(): if params.is_human: blast_genomic = "blast/human_genomic" else: blast_genomic = "blast/other_genomic" blast_nt = "blast/nt" blast(blast_genomic, left_seq, blast_genomic_out) blast(blast_genomic, right_seq, blast_genomic_out) blast(blast_genomic, seq, blast_genomic_out) blast(blast_nt, left_seq, blast_nt_out) blast(blast_nt, right_seq, blast_nt_out) blast(blast_nt, seq, blast_nt_out) if not os.path.exists(output_dir + "blast_nt/" + seq ): print >> sys.stderr, "\t%d. 
%s" % (count, line[:-1]) if params.num_threads <= 1: work() else: parallel_work(pids, work) line_no += 1 if params.num_threads > 1: wait_pids(pids) def read_dist(params): def alignments_region(sample_name, bam, alignments_list): def output_region(sample_name, fusion, reads): chr_ref, pos1_ref, pos2_ref, dir_ref = fusion chr1_ref, chr2_ref = chr_ref.split('-') if len(reads) <= 0: return alignments_file_name = "%s/%s_%s_%d_%d_%s" % (alignments_dir, sample_name, chr_ref, pos1_ref, pos2_ref, dir_ref) alignments_file = open(alignments_file_name, "w") def my_cmp(a, b): if a[3] and not b[3]: return -1 elif not a[3] and b[3]: return 1 if a[3]: if dir_ref[0] == "f": return a[4] - b[4] else: return b[4] - a[4] else: if dir_ref[1] == "f": return a[4] - b[4] else: return b[4] - a[4] reads = sorted(reads, cmp=my_cmp) begin_pos = reads[0][4] dist_to_breakpoint = abs(pos1_ref - begin_pos) for read in reads: seq_pos = 0 read_id, chr1, chr2, before, left_pos, right_pos, cigars, seq, qual, mismatch, left = read[:-1] space_len = 0 if before: space_len = abs(left_pos - begin_pos) else: space_len = abs(pos1_ref - begin_pos) + 2 if dir_ref[1] == "f": space_len += (left_pos - pos2_ref) else: space_len += (pos2_ref - left_pos) qual_str = seq_str = ' ' * space_len saw_fusion = False for cigar in cigars: length = int(cigar[:-1]) cigar_op = cigar[-1] if cigar_op in "Mm": seq_str += seq[seq_pos:seq_pos + length] qual_str += qual[seq_pos:seq_pos + length] seq_pos += length if cigar_op in "Nn": seq_str += ('|' * length) qual_str += ('|' * length) if cigar_op in "Dd": seq_str += ("D" * length) qual_str += ("D" * length) if cigar_op in "Ii": seq_pos += length if cigar_op in "F": seq_str += " " qual_str += " " cigar_str = ''.join(cigars) prefix_str = '%s %s %d %d %s' % (chr1, chr2, left_pos, right_pos, cigar_str) prefix_str += ' ' * (60 - len(prefix_str)) L = "L" if not left: L = "R" begin, end = max(0, dist_to_breakpoint - within), dist_to_breakpoint + within output = '%s %s\n' % (prefix_str, seq_str[begin:end]) alignments_file.write(output) alignments_file.close() cigar_re = re.compile('\d+\w') within = 300 old_chr = "" reads_list = [[] for i in range(len(alignments_list))] reads_compress_list = [[1, 0] for i in range(len(alignments_list))] cmd = ['samtools', 'view', '-h', bam] popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=open('/dev/null')) contigs = {} for line in popen.stdout: if line[0] == '@': if line[1:3] == 'SQ': line = line[:-1].split('\t') contig = line[1].split(':')[1] if contig not in contigs: contigs[contig] = len(contigs) continue line = line[:-1].split('\t') read_id, chr1, left_pos, cigar_str, seq, qual = line[0], line[2], int(line[3]) - 1, line[5], line[9], line[10] if chr1 != old_chr: old_chr = chr1 # print "%s - %s" % (sample_name, chr1) secondary_fusion_alignment = False num_hits = 0 for field in line[10:]: if field[:5] == "XF:Z:": if field[5] == '2': secondary_fusion_alignment = True chr1, left_pos, cigar_str, seq, qual = field[7:].split() left_pos = int(left_pos) - 1 if field[:5] == "NM:i:": mismatch = int(field.split(':')[-1]) if field[:5] == "NH:i:": num_hits = int(field.split(':')[-1]) if num_hits > params.fusion_multireads: continue if secondary_fusion_alignment: continue flag = int(line[1]) left = (flag & 64 != 0) antisense = (flag & 16 != 0) if '-' in chr1: chr1, chr2 = chr1.split('-') else: chr2 = chr1 right_pos = left_pos cigars = cigar_re.findall(cigar_str) dir = "ff" saw_fusion = False for cigar in cigars: length = int(cigar[:-1]) cigar_op = cigar[-1] if cigar_op in "MDN": 
right_pos += length if saw_fusion: dir = dir[0] + "f" else: dir = "f" + dir[1] if cigar_op in "mdn": right_pos -= length if saw_fusion: dir = dir[0] + "r" else: dir = "r" + dir[1] if cigar_op in "F": if dir[0] == "f": pos1 = right_pos - 1 else: pos1 = right_pos + 1 pos2 = right_pos = length - 1 saw_fusion = True if cigar_op in "iIdD": mismatch -= length if not saw_fusion: report_idx = -1 for i in range(len(alignments_list)): chr_ref, pos1_ref, pos2_ref = alignments_list[i][:3] chr1_ref, chr2_ref = chr_ref.split('-') """ # a fusion point can be given like "chr9-chr1" if (contigs[chr1_ref] < contigs[chr1] or (contigs[chr1_ref] == contigs[chr1] and left_pos - pos1_ref > 1000000)) and \ (contigs[chr2_ref] < contigs[chr1] or (contigs[chr2_ref] == contigs[chr1] and left_pos - pos2_ref > 1000000)): report_idx = i else: break """ if report_idx >= 0: for j in range(report_idx+1): output_region(sample_name, alignments_list[j], reads_list[j]) alignments_list = alignments_list[report_idx+1:] reads_list = reads_list[report_idx+1:] if len(alignments_list) <= 0: break fusion_read = saw_fusion if mismatch > params.fusion_read_mismatches: continue for i in range(len(alignments_list)): chr_ref, pos1_ref, pos2_ref, dir_ref = alignments_list[i] chr1_ref, chr2_ref = chr_ref.split('-') def reverse_read(fusion_read, dir, antisense, left_pos, right_pos, fusion_left, seq, qual, cigars): if dir == 'ff' or not fusion_read: left_pos, right_pos = right_pos - 1, left_pos - 1 elif dir == 'fr': left_pos, right_pos = right_pos + 1, left_pos - 1 elif dir == 'rf': left_pos, right_pos = right_pos - 1, left_pos + 1 elif dir == 'rr': left_pos, right_pos = right_pos + 1, left_pos + 1 reversed_cigars = [] cigars.reverse() for i in range(len(cigars)): cigar = cigars[i] opcode = cigar[-1] if opcode == 'F': reversed_cigars.append("%dF" % fusion_left) else: if str.islower(opcode): opcode = str.upper(opcode) else: opcode = str.lower(opcode) cigar = cigar[:-1] + opcode reversed_cigars.append(cigar) if dir == 'ff': dir = 'rr' elif dir == 'rr': dir = 'ff' return dir, antisense == False, left_pos, right_pos, reverse_complement(seq), qual[::-1], reversed_cigars if fusion_read and chr1 == chr2_ref and chr2 == chr1_ref and pos1 == pos2_ref and pos2 == pos1_ref: dir, antisense, left_pos, right_pos, seq, qual, cigars = \ reverse_read(fusion_read, dir, antisense, left_pos, right_pos, pos1, seq, qual, cigars) chr1, chr2 = chr2, chr1 pos1, pos2 = pos2, pos1 if (fusion_read and chr1 == chr1_ref and chr2 == chr2_ref) or \ (not fusion_read and (chr1 == chr1_ref or chr1 == chr2_ref)): if not fusion_read: do_not_continue = False if chr1 == chr1_ref: if abs(left_pos - pos1_ref) <= 10000 or abs(right_pos - pos1_ref) <= 10000: do_not_continue = True if chr1 == chr2_ref: if abs(left_pos - pos2_ref) <= 10000 or abs(right_pos - pos2_ref) <= 10000: do_not_continue = True if not do_not_continue: continue seq_temp = seq qual_temp = qual cigars_temp = cigars left_pos_temp, right_pos_temp = left_pos, right_pos before = True if fusion_read: if dir != dir_ref: continue if pos1 != pos1_ref or pos2 != pos2_ref: continue else: do_not_continue = False if chr1 == chr1_ref: if dir_ref[0] == "f" and not (right_pos - pos1_ref >= 10 or pos1_ref - right_pos >= within): do_not_continue = True if dir_ref[0] == "r" and not (pos1_ref - left_pos >= 10 or left_pos - pos1_ref >= within): do_not_continue = True if chr1 == chr2_ref: if dir_ref[1] == "f" and not (pos2_ref - left_pos >= 10 or left_pos - pos2_ref >= within): do_not_continue = True before = False if dir_ref[1] == "r" 
and not (right_pos - pos2_ref >= 10 or pos2_ref - right_pos >= within): do_not_continue = True before = False if not do_not_continue: continue if (before and dir_ref[0] == "r") or (not before and dir_ref[1] == "r"): dir, antisense, left_pos_temp, right_pos_temp, seq_temp, qual_temp, cigars_temp = \ reverse_read(fusion_read, dir, antisense, left_pos_temp, right_pos_temp, 0, seq_temp, qual_temp, cigars_temp) reads_list[i].append([read_id, chr1, chr2, before, left_pos_temp, right_pos_temp, cigars_temp, seq_temp, qual_temp, mismatch, left, line]) reads_compress_list[i][1] += 1 for i in range(len(alignments_list)): output_region(sample_name, alignments_list[i], reads_list[i]) print >> sys.stderr, "[%s] Generating read distributions around fusions" % right_now() file_name = output_dir + "potential_fusion.txt" file = open(file_name, 'r') alignments_dir = output_dir + "read_alignments" alignments_list = {} line_no = 0 for line in file: if line_no % 6 == 0: temp_list = line[:-1].split(' ') sample_name = temp_list[0] if not sample_name in alignments_list: alignments_list[sample_name] = [] if not os.path.exists("%s/%s_%s" % (alignments_dir, sample_name, "_".join(temp_list[1:5]))): alignments_list[sample_name].append(' '.join(temp_list[1:5])) line_no += 1 file.close() if not os.path.exists(alignments_dir): os.system("mkdir %s" % alignments_dir) pids = [0 for i in range(params.num_threads)] for sample_name, list in alignments_list.items(): bam_file_name = 'tophat_%s/accepted_hits.bam' % sample_name if not os.path.exists(bam_file_name): continue increment = 50 for i in range((len(list) + increment - 1) / increment): temp_list = list[i*increment:(i+1)*increment] print >> sys.stderr, '\t%s (%d-%d)' % (sample_name, i*increment + 1, min((i+1)*increment, len(list))) alignments_list = [] for fusion in temp_list: print >> sys.stderr, '\t\t%s' % fusion fusion = fusion.split() alignments_list.append([fusion[0], int(fusion[1]), int(fusion[2]), fusion[3]]) def work(): if len(alignments_list) > 0: alignments_region(sample_name, bam_file_name, alignments_list) if params.num_threads <= 1: work() else: parallel_work(pids, work) if params.num_threads > 1: wait_pids(pids) def generate_html(params): def blast_output(database, seq): blast_output_filename = "%s/%s" % (database, seq) if os.path.exists(blast_output_filename): file = open(blast_output_filename, "r") return file.read() + "\n" return "" def read_fusion_genes(fusion_gene_list): sample_names = [] sample_list_filename = output_dir + 'sample_list.txt' if not os.path.exists(sample_list_filename): return sample_list_file = open(sample_list_filename, "r") for line in sample_list_file: sample_names.append(line[:-1]) sample_list_file.close() for sample_name in sample_names: sample_isoform_filename = "tophat_" + sample_name + "/transfuse.txt" if not os.path.exists(sample_isoform_filename): continue fusion_gene_begin_index = len(fusion_gene_list) sample_isoform_file = open(sample_isoform_filename, "r") for line in sample_isoform_file: line = line[:-1].lstrip() fields = line.split('\t') def read_coords(value_dic, value_str): chr1, chr2 = "", "" if ' ' in value_str: chr1, chr2 = value_str.split(' ') else: chr1 = value_str chr1, chr1_coord = chr1.split(':') chr1_left, chr1_right = chr1_coord.split('-') chr1_left, chr1_right = int(chr1_left), int(chr1_right) value_dic["chr1"] = chr1 value_dic["chr1_left"] = chr1_left value_dic["chr1_right"] = chr1_right if chr2: chr2, chr2_coord = chr2.split(':') chr2_left, chr2_right = chr2_coord.split('-') chr2_left, chr2_right = 
int(chr2_left), int(chr2_right) value_dic["chr2"] = chr2 value_dic["chr2_left"] = chr2_left value_dic["chr2_right"] = chr2_right def read_values(value_dic, value_str): values = fields[-1].split(';') for value in values: value = value.strip() if not value: continue name, value = value.split(' ') value_dic[name] = value[1:-1] type = fields[0] if type == "gene": value_dic = {} read_coords(value_dic, fields[1]) read_values(value_dic, fields[-1]) fusion_gene_list.append([sample_name, value_dic, []]) elif type == "transcript": value_dic = {} read_coords(value_dic, fields[1]) read_values(value_dic, fields[-1]) # daehwan - for debugging purposes if float(value_dic["FPKM"]) >= 1.0: skip_transcript = False curr_gene = fusion_gene_list[-1] transcript_list = curr_gene[-1] transcript_list.append([value_dic, []]) else: skip_transcript = True elif type in ["exon", "fusion"]: if skip_transcript: continue curr_gene = fusion_gene_list[-1] transcript_list = curr_gene[-1] curr_transcript = transcript_list[-1] exon_list = curr_transcript[-1] if type == "exon": chr, chr_coord = fields[1].split(':') chr_left, chr_right = chr_coord.split('-') chr_left, chr_right = int(chr_left), int(chr_right) exon_list.append(["exon", chr, chr_left, chr_right]) else: fusion_left, fusion_right = fields[1].split(' ') chr1, chr1_pos = fusion_left.split(':') chr2, chr2_pos = fusion_right.split(':') chr1_pos, chr2_pos = int(chr1_pos), int(chr2_pos) exon_list.append(["fusion", chr1, chr1_pos, chr2, chr2_pos]) transcript_coverage_dic = {} sample_isoform_cov_filename = "tophat_" + sample_name + "/transfuse_cov.txt" if os.path.exists(sample_isoform_cov_filename): sample_isoform_cov_file = open(sample_isoform_cov_filename, "r") for line in sample_isoform_cov_file: if line[0] == "C": transcript_id = line.split("\t")[0] transcript_coverage_dic[transcript_id] = [] else: last_value = line.strip().split()[-1] base_cov_float = float(last_value) if math.isnan(base_cov_float): base_cov_float = 0.1 base_cov = int(base_cov_float + 0.99) transcript_coverage_dic[transcript_id].append(base_cov) sample_isoform_cov_file.close() for fusion_gene in fusion_gene_list[fusion_gene_begin_index:]: sample_name, gene_value_dic, transcripts = fusion_gene max_base_cov = 0 saw_fusion_transcript = False for transcript in transcripts: transcript_value_dic, exons = transcript transcript_id = transcript_value_dic["transcript_id"] if transcript_id in transcript_coverage_dic: transcript_coverage = transcript_coverage_dic[transcript_id] else: transcript_coverage = None saw_fusion = False exon_offset = 0 for exon in exons: if exon[0] == "fusion": saw_fusion = True continue exon_length = exon[3] - exon[2] + 1 if transcript_coverage: coverage = transcript_coverage[exon_offset:exon_offset + exon_length] else: coverage = [0 for z in range(exon_length)] exon.append(coverage) for base_cov in exon[-1]: if base_cov > max_base_cov: max_base_cov = base_cov exon_offset += exon_length transcript_value_dic["is_fusion_transcript"] = saw_fusion if saw_fusion: saw_fusion_transcript = True if not saw_fusion and saw_fusion_transcript: transcript_value_dic["is_after_fusion"] = True else: transcript_value_dic["is_after_fusion"] = False for transcript in transcripts: transcript_value_dic, exons = transcript transcript_value_dic["max_base_cov"] = max_base_cov sample_isoform_file.close() def find_fusion_gene(fusion_gene_list, sample_name, chr1, chr1_coord, chr2, chr2_coord): # slow linear search through fusion_gene_list fusion_gene_info = "" for fusion_gene in fusion_gene_list: _sample_name = 
fusion_gene[0] value_dic = fusion_gene[1] _chr1, _chr1_left, _chr1_right, _chr2, _chr2_left, _chr2_right = \ value_dic["chr1"], value_dic["chr1_left"], value_dic["chr1_right"], \ value_dic["chr2"], value_dic["chr2_left"], value_dic["chr2_right"] if sample_name != _sample_name: continue if chr1 != _chr1 or chr2 != _chr2: continue if chr1_coord < _chr1_left - 2 or chr1_coord > _chr1_right + 2: continue if chr2_coord < _chr2_left -2 or chr2_coord > _chr2_right + 2: continue transcripts = fusion_gene[-1] for transcript in transcripts: exons = transcript[-1] for exon in exons: if exon[0] != "fusion": continue if chr1_coord == exon[2] and chr2_coord == exon[4]: return fusion_gene return None def read_fusion_list(fusion_list): re_find_chromosomes = re.compile(r'Homo sapiens chromosome (\d+|[XY])') re_find_identities = re.compile(r'Identities = (\d+)\/\d+ \((\d+)%\)') re_find_exon = re.compile(r'exon\d+\((\d+-\d+)\)') line_no = 0 do_not_add = False output = {} file_name = output_dir + "potential_fusion.txt" file = open(file_name, 'r') for line in file: if line_no % 6 == 0: if output and not do_not_add: fusion_list.append(output) do_not_add = False temp_list = line[:-1].split(' ') sample_name = temp_list[0] chr = temp_list[1] chr1, chr2 = chr.split('-') left = int(temp_list[2]) right = int(temp_list[3]) dir = temp_list[4] output = {} output["sample_name"] = sample_name output["chr"] = chr output["chr1"] = chr1 output["chr2"] = chr2 output["left_coord"] = left output["right_coord"] = right output["dir"] = dir output["stats"] = temp_list[5:] elif line_no % 6 == 1: output["left_seq"] = line[:-1].split(" ")[0] elif line_no % 6 == 2: output["right_seq"] = line[:-1].split(" ")[1] left_seq = output["left_seq"] right_seq = output["right_seq"] seq = left_seq + right_seq temp_output = blast_output(output_dir + "blast_genomic", left_seq) left_chromosomes = set(re_find_chromosomes.findall(temp_output)) temp_output = blast_output(output_dir + "blast_genomic", right_seq) right_chromosomes = set(re_find_chromosomes.findall(temp_output)) if string.find(output["sample_name"], "single") == -1: chr1, chr2 = output["chr"].split("-") if chr1 != chr2 and left_chromosomes & right_chromosomes: do_not_add = True temp_output = blast_output(output_dir + "blast_genomic", seq) + blast_output(output_dir + "blast_nt", seq) for identity in re_find_identities.findall(temp_output): query = int(identity[0]) percent = int(identity[1]) if query + percent > 160: do_not_add = True break elif line_no % 6 == 3: output["depth"] = line[:-1] elif line_no % 6 == 4: output["gene"] = line[:-1] temp = output["gene"].split() output["gene1"], output["gene2"] = temp[0], temp[2] else: pair_str = line.strip() if len(pair_str) > 0: pairs = pair_str.split(" ") else: pairs = [] output["pair_coords"] = pairs read_output = "" read_output_file = output_dir + "read_alignments/%s_%s_%d_%d_%s" % (output["sample_name"], output["chr"], output["left_coord"], output["right_coord"], output["dir"]) if not do_not_add and os.path.exists(read_output_file): read_output = open(read_output_file, "r").read() def unique(list, left, right): color_len = 300 result, lcolor, rcolor = [], [0 for i in range(color_len)], [0 for i in range(color_len)] keys = [] F_start, F_end = 1000000, 0 for item in list: key = item.split(" ") if len(key) < 6: continue chr1, chr2, pos1, pos2, cigars = key[:5] pos1, pos2 = int(pos1), int(pos2) def color(list, left, right): for i in range(left, right): if i >= len(list): break list[i] += 1 if 'F' in cigars: dist1 = abs(pos1 - left) dist2 = abs(pos2 
- right) color(lcolor, 0, dist1) color(rcolor, 0, dist2) elif abs(pos2 - left) < color_len: color(lcolor, abs(pos2 - left), abs(pos1 - left)) elif abs(pos1 - right) < color_len: color(rcolor, abs(pos1 - right), abs(pos2 - right)) if "F" in key[4] or len(keys) == 0 or key[:2] != keys[-1][:2] or abs(int(key[2]) - int(keys[-1][2])) >= 3: keys.append(key[:3]) result.append(item) if "F" in key[4]: index = len(result) - 1 if index < F_start: F_start = index if F_end < index: F_end = index if F_end == 0: F_start, F_end = 0, len(result) temp_result = result[max(0, F_start - 50):(F_end+50)] result = [] for i in range(len(temp_result)): item = temp_result[i].split(" ") if i == 0: trim = 0 for ch in item[40:-1]: trim += 1 item = item[:40] + item[40 + trim:] result.append(" ".join(item)) return result, lcolor, rcolor if len(read_output) < 1024 * 1024 * 1024: read_output = read_output.split("\n") read_output, lcolor, rcolor = unique(read_output, output["left_coord"], output["right_coord"]) def stat(lcolor, rcolor, dist): lcount, lsum, rcount, rsum = 1, 0, 1, 0 lgap, lpass, rgap, rpass = 0, False, 0, False for i in range(dist): if lcolor[i] > 0: lcount += 1 lsum += lcolor[i] if lgap > 0: lpass = True else: if not lpass: lgap += 1 if rcolor[i] > 0: rcount += 1 rsum += rcolor[i] if rgap > 0: rpass = True else: if not rpass: rgap += 1 if not lpass: lgap = 0 if not rpass: rgap = 0 return lcount, lsum / lcount, lgap, rcount, rsum / rcount, rgap temp = output["gene"].split() gene1_loc, gene2_loc = temp[1], temp[3] def find_exon_len(gene_loc, coord, dir, is_left): exon_len = 1000000 for loc in re_find_exon.findall(gene_loc): start, end = loc.split('-') start, end = int(start), int(end) if (is_left and dir == 'f') or (not is_left and dir == 'r'): exon_len = coord - start + 1 else: exon_len = end - coord + 1 return exon_len return exon_len lcount_min, rcount_min, diff_max = 150, 150, 120 lcount_exon_min = find_exon_len(gene1_loc, output["left_coord"], output["dir"][0], True) rcount_exon_min = find_exon_len(gene2_loc, output["right_coord"], output["dir"][1], False) lcount_min = min(lcount_min, lcount_exon_min - 20) rcount_min = min(rcount_min, rcount_exon_min - 20) diff_max = min(diff_max, abs(lcount_min - rcount_min) + 20) if lcount_exon_min < 1000 and rcount_exon_min < 1000: diff_max = max(diff_max, abs(lcount_exon_min - rcount_exon_min) + 20) lcount, lavg, lgap, rcount, ravg, rgap = stat(lcolor, rcolor, len(lcolor)) if lcount <= lcount_min or rcount <= rcount_min or lgap / lcount > 0.1 or rgap / rcount > 0.1: if abs(min(lcount, lcount_exon_min) - min(rcount, rcount_exon_min)) > diff_max or lcount < 60 or rcount < 60: do_not_add = True def derivation(color, length, avg): der = 0.0 for i in range(length): diff = 1.0 - float(color[i]) / float(max(1, avg)) der += diff * diff return math.sqrt(der / max(1, length)) output["read_output"] = read_output output["lcount"] = lcount output["lavg"] = lavg output["lgap"] = lgap output["lder"] = lder = derivation(lcolor, lcount_min, lavg) output["rcount"] = rcount output["ravg"] = ravg output["rgap"] = rgap output["rder"] = rder = derivation(rcolor, rcount_min, ravg) stats = output["stats"] num_read = int(stats[0]) pair = int(stats[1]) pair_fusion = int(stats[2]) anti_read = int(stats[3]) anti_pair = int(stats[4]) pair_coords = output["pair_coords"] dist = 1000000 if len(pair_coords) > 0: pair = 0 for pair_coord in pair_coords: pair_coord = pair_coord.split(":") temp_dist = abs(int(pair_coord[0])) + abs(int(pair_coord[1])) if temp_dist < dist: dist = temp_dist if temp_dist < 
2000: pair += 1 anti_read += 0.5 if pair == 0: rate = num_read / anti_read else: rate = pair / anti_read fusion_gene = find_fusion_gene(fusion_gene_list, \ output["sample_name"], \ output["chr1"], output["left_coord"], \ output["chr2"], output["right_coord"]) output["assembled"] = False fusion_dic = {} left_normal_transcript, right_normal_transcript = 0, 0 if fusion_gene: sample_name, gene_value_dic, transcripts = fusion_gene for transcript in transcripts: transcript_value_dic, exons = transcript if not transcript_value_dic["is_fusion_transcript"]: if transcript_value_dic["is_after_fusion"]: right_normal_transcript = 1 else: left_normal_transcript = 1 continue for exon in exons: if exon[0] == "fusion": fusion_item = "%d-%d" % (exon[2], exon[4]) if output["left_coord"] == exon[2] and output["right_coord"] == exon[4]: assembled = True output["assembled"] = True if fusion_item not in fusion_dic: fusion_dic[fusion_item] = 0 fusion_dic[fusion_item] += 1 num_fusions = 0 num_alternative_splicing_with_same_fusion = 0 for key, value in fusion_dic.items(): num_fusions += 1 num_alternative_splicing_with_same_fusion += (value - 1) # lcount and rcount both are in [0, 300] max_avg = 300 score = lcount + rcount + (min(max_avg, lavg) + min(max_avg, ravg)) - abs(lcount - rcount) \ - min(max_avg, abs(lavg - ravg)) - (lgap + rgap) - (lder + rder) * max_avg - min(dist, 1000) \ + rate if output["assembled"]: score += 300 + min(num_fusions - 1, 5) * 200 + min(num_alternative_splicing_with_same_fusion, 5) * 100 \ + left_normal_transcript * 50 + right_normal_transcript * 50 output["score"] = score output["test"] = "lcount: %d, lavg: %d, lgap: %d, lder: %f, rcount: %d, ravg: %d, rgap: %d, rder: %f, dist: %d, score: %f num_fusions: %d num_alternative_splicing_with_same_fusion: %d left_normal_transcript: %d right_normal_transcript: %d" % \ (lcount, lavg, lgap, lder, rcount, ravg, rgap, rder, dist, score, num_fusions, num_alternative_splicing_with_same_fusion, left_normal_transcript, right_normal_transcript) else: read_output = [r"too many - not shown"] if not "read_output" in output: output["read_output"] = "" output["lcount"] = 0 output["lavg"] = 0 output["lgap"] = 0 output["lder"] = 0 output["rcount"] = 0 output["ravg"] = 0 output["rgap"] = 0 output["rder"] = 0 output["score"] = 0 output["assembled"] = False line_no += 1 if output and not do_not_add: fusion_list.append(output) file.close() def cluster_fusion(fusion_list, fusion_gene_list, cluster_list): cluster_dist = 500000 cluster_temp_list = [] parent = [i for i in range(len(fusion_list))] for i in range(len(fusion_list)): fusion = fusion_list[i] cluster = {} cluster["index"] = [i] cluster["chr"] = fusion["chr"] cluster["left1"] = cluster["left2"] = fusion["left_coord"] cluster["right1"] = cluster["right2"] = fusion["right_coord"] cluster["dir"] = fusion["dir"] cluster_temp_list.append(cluster) def parent_index(parent, i): parent_i = parent[i] if i == parent_i: return i else: parent[i] = parent[parent_i] return parent_index(parent, parent[i]) for i in range(len(fusion_list) - 1): parent_i = parent_index(parent, i) for j in range(i+1, len(fusion_list)): parent_j = parent_index(parent, j) if parent_i == parent_j: continue cluster_i = cluster_temp_list[parent_i] cluster_j = cluster_temp_list[parent_j] if cluster_i["chr"] != cluster_j["chr"] or cluster_i["dir"] != cluster_j["dir"]: continue i_left1 = cluster_i["left1"] i_left2 = cluster_i["left2"] j_left1 = cluster_j["left1"] j_left2 = cluster_j["left2"] if abs(i_left1 - j_left1) > cluster_dist or abs(i_left2 - 
j_left2) > cluster_dist or \ abs(i_left2 - j_left1) > cluster_dist or abs(i_left1 - j_left2) > cluster_dist: continue i_right1 = cluster_i["right1"] i_right2 = cluster_i["right2"] j_right1 = cluster_j["right1"] j_right2 = cluster_j["right2"] if abs(i_right1 - j_right1) > cluster_dist or abs(i_right2 - j_right2) > cluster_dist or\ abs(i_right2 - j_right1) > cluster_dist or abs(i_right1 - j_right2) > cluster_dist: continue cluster_i["left1"] = min(i_left1, j_left1) cluster_i["left2"] = max(i_left2, j_left2) cluster_i["right1"] = min(i_right1, j_right1) cluster_i["right2"] = max(i_right2, j_right2) parent[j] = i cluster_i["index"].extend(cluster_j["index"]) cluster_temp_list2 = [] for i in range(len(parent)): if i == parent[i]: cluster_temp_list2.append(cluster_temp_list[i]) def num_samples(indices): samples = [] for i in indices: sample = fusion_list[i]["sample_name"] if not sample in samples: samples.append(sample) return len(samples) def cmp(a, b): def pair_count(indices): final_score = -1000000.0 for index in indices: score = fusion_list[index]["score"] if score > final_score: final_score = score return int(final_score) a_indices = a["index"] b_indices = b["index"] a_gene, b_gene = 0, 0 def known_genes(indices): num = 0 for index in indices: temp_num = 0 fusion = fusion_list[index] if fusion["gene1"] != "N/A": temp_num += 1 if fusion["gene2"] != "N/A": temp_num += 1 if temp_num > num: num = temp_num return num a_gene = known_genes(a_indices) b_gene = known_genes(b_indices) if a_gene != b_gene: return b_gene - a_gene a_num_sample, b_num_sample = num_samples(a_indices), num_samples(b_indices) a_score = pair_count(a_indices) b_score = pair_count(b_indices) return b_score - a_score cluster_temp_list = sorted(cluster_temp_list2, cmp=cmp) max_num_fusions = 500 for i in range(min(max_num_fusions, len(cluster_temp_list))): do_not_add = False indices = cluster_temp_list[i]["index"] if num_samples(indices) > 5: do_not_add = True if not do_not_add: def cmp(a, b): return int(fusion_list[b]["score"] - fusion_list[a]["score"]) cluster_temp_list[i]["index"] = sorted(cluster_temp_list[i]["index"], cmp=cmp) cluster_list.append(cluster_temp_list[i]) def generate_html_impl(fusion_list, cluster_list, fusion_gene_list): html_file_name = output_dir + 'result.html' txt_file_name = output_dir + 'result.txt' html_file = open(html_file_name, 'w') txt_file = open(txt_file_name, 'w') html_prev = [] javascript = [] html_body = [] html_post = [] def cmp(i, j): i_left = fusion_list[i]["left_coord"] j_left = fusion_list[j]["left_coord"] if i_left == j_left: return i - j else: return i_left - j_left indices_list = [] for c in cluster_list: indices_list += c["index"] indices_list = sorted(indices_list) print >> sys.stderr, "\tnum of fusions: %d" % len(cluster_list) if params.tex_table: tex_table_file_name = output_dir + 'table.tex' tex_table_file = open(tex_table_file_name, 'w') print >> tex_table_file, r'\documentclass{article}' print >> tex_table_file, r'\usepackage{graphicx}' print >> tex_table_file, r'\begin{document}' print >> tex_table_file, r'\pagestyle{empty}' print >> tex_table_file, r"\center{\scalebox{0.7}{" print >> tex_table_file, r"\begin{tabular}{| c | c | c | c | c | c | c |}" print >> tex_table_file, r"\hline" print >> tex_table_file, r"SAMPLE ID & Fusion genes (left-right) & Chromosomes (left-right) & " + \ r"5$'$ position & 3$'$ position & Spanning reads & Spanning pairs \\" html_prev.append(r'') html_prev.append(r'') html_prev.append(r'result') html_prev.append(r'') html_prev.append(r'') 
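    # Aside: a minimal, self-contained sketch (not used anywhere in this script) of the
    # union-find with path compression that cluster_fusion()/parent_index() above rely on
    # to merge fusion calls whose breakpoints lie within cluster_dist of each other.
    # The names uf_find/uf_union are illustrative only and do not exist elsewhere here.
    def uf_find(parent, i):
        # walk up to the root, pointing each visited node at its grandparent (path compression)
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i
    def uf_union(parent, i, j):
        root_i, root_j = uf_find(parent, i), uf_find(parent, j)
        if root_i != root_j:
            parent[root_j] = root_i  # merge the two clusters by attaching one root to the other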
html_prev.append(r'') html_prev.append(r'') html_prev.append(r'') html_prev.append(r'') html_prev.append(r'') html_body.append(r'') html_post.append(r'


Candidate fusion list

') html_post.append(r'The following tables show fusion candidates where fusions are grouped based on their genomic locations (table description).
') num_fusions = 0 for c in range(len(cluster_list)): cluster = cluster_list[c] indices = cluster["index"] num_fusions += len(indices) top_index = indices[0] indices = sorted(indices, cmp=cmp) chr1, chr2 = cluster["chr"].split('-') if params.tex_table: print >> tex_table_file, r'\hline' html_post.append(r'


') html_post.append(r'%d. %s %s' % (c+1, cluster["chr"], cluster["dir"])) html_post.append(r'') for i in indices: fusion = fusion_list[i] sample_name = fusion["sample_name"] # sample_name = string.split(sample_name, '_')[0] output = "" stats = fusion["stats"] stats = [int(stat) for stat in stats] if stats[1] > 0: pair_support_html = r"\htmlref{%d}{pair_%d}" % (stats[1], i) else: pair_support_html = r"0" if params.tex_table: if sample_name == "lane1": sample_name2 = "thyroid" else: sample_name2 = "testes" print >> tex_table_file, r'%s & %s & %s & %d & %d & %d & %d \\' % \ (sample_name2, fusion["gene1"] + "-" + fusion["gene2"], chr1[3:] + "-" + chr2[3:], fusion["left_coord"], fusion["right_coord"], stats[0], stats[1] + stats[2]) print >> txt_file, '%s\t%s\t%s\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%.2f' % \ (sample_name, fusion["gene1"], chr1, fusion["left_coord"], fusion["gene2"], chr2, fusion["right_coord"], stats[0], stats[1], stats[2], fusion["score"]) html_post.append(r'' % (i, sample_name)) html_post.append(r'' % fusion["gene1"]) html_post.append(r'' % chr1) html_post.append(r'' % fusion["left_coord"]) html_post.append(r'' % fusion["gene2"]) html_post.append(r'' % chr2) html_post.append(r'' % fusion["right_coord"]) html_post.append(r'' % (i, stats[0])) html_post.append(r'' % (i, stats[1])) html_post.append(r'' % stats[2]) # daehwan - for debugging purposes # html_post.append(r'' % fusion["score"]) # html_post.append(r'' % fusion["test"]) fusion_gene = find_fusion_gene(fusion_gene_list, sample_name, chr1, fusion["left_coord"], chr2, fusion["right_coord"]) fusion["fusion_gene"] = fusion_gene if fusion_gene and fusion["assembled"]: html_post.append(r'' % i) html_post.append(r'') html_post.append(r'
%s%s%s%d%s%s%d%d%d%d%.2f%sisoforms
') """ # sample html_post.append(r'


sample list

') html_post.append(r'') html_post.append(r'') html_post.append(r'') html_post.append(r'') html_post.append(r'') if os.path.exists("sample_info.txt"): sample_file = open("sample_info.txt", "r") for line in sample_file: line = line[:-1].split("\t") html_post.append(r'' % line[0]) html_post.append(r'' % line[1]) html_post.append(r'' % line[2]) html_post.append(r'' % line[3]) sample_file.close() html_post.append(r'
sample namefragment lengthread length# of fragments
%s%s%s%s
') """ # description html_post.append(r'


table description

') html_post.append(r'1. Sample name in which a fusion is identified
') html_post.append(r'2. Gene on the "left" side of the fusion
') html_post.append(r'3. Chromosome ID on the left
') html_post.append(r'4. Coordinates on the left
') html_post.append(r'5. Gene on the "right" side
') html_post.append(r'6. Chromosome ID on the right
') html_post.append(r'7. Coordinates on the right
') html_post.append(r'8. Number of spanning reads - this is the number of reads that span a fusion point all on their own. In other words, the read itself has a fusion break point within it.
') html_post.append(r'9. Number of spanning mate pairs - this is the number of pairs of reads where one read maps entirely on the left and the other read maps entirely on the right of the fusion break point. Neither read is split, so these pairs are not counted at all in (8).
') html_post.append(r'10. Number of spanning mate pairs where one end spans a fusion (reads spanning the fusion by only a few bases are included).
') html_post.append(r'If you follow the 9th column, it shows coordinates "number1:number2" where one end is located at a distance of "number1" bases from the left genomic coordinate of a fusion and "number2" is similarly defined.') html_post.append(r'


') fusion_gene_drawn = [False for i in range(len(cluster_list))] fusion_gene_indices = [] for i in indices_list: cluster_index = -1 for c in range(len(cluster_list)): cluster_indices = cluster_list[c]["index"] if i in cluster_indices: cluster_index = c break fusion = fusion_list[i] sample_name = fusion["sample_name"] chr = fusion["chr"] left = fusion["left_coord"] right = fusion["right_coord"] dir = fusion["dir"] chr1, chr2 = chr.split("-") left_seq = fusion["left_seq"] right_seq = fusion["right_seq"] seq = left_seq + right_seq pair_coords = fusion["pair_coords"] stats = fusion["stats"] stats = [int(stat) for stat in stats] html_post.append(r'' % i) html_post.append(r'' % (sample_name)) html_post.append(r'' % fusion["gene1"]) html_post.append(r'' % chr1) html_post.append(r'' % fusion["left_coord"]) html_post.append(r'' % fusion["gene2"]) html_post.append(r'' % chr2) html_post.append(r'' % fusion["right_coord"]) html_post.append(r'' % (i, stats[0])) html_post.append(r'' % (i, stats[1])) html_post.append(r'' % stats[2]) html_post.append(r'') dir_str = "" if dir[0] == "f": dir_str += "-------->" else: dir_str += "<--------" dir_str += " " if dir[1] == "f": dir_str += "-------->" else: dir_str += "<--------" html_post.append(r'' % dir_str) html_post.append(r'

%s

%s

%s

%d

%s

%s

%d

%d

%d

%d

%s

') html_post.append(r'

') """ html_post.append(r'
')
            html_post.append(r'%s %s' % (left_seq, right_seq))
            html_post.append(r'%s' % fusion["depth"])
            html_post.append(r'%s' % fusion["gene"])
            html_post.append(r'
') """ fusion_gene = fusion["fusion_gene"] if fusion_gene: fusion_gene_indices.append([i, cluster_index]) fg_value_dic, transcripts = fusion_gene[1], fusion_gene[2] html_post.append(r'

%s-%s

' % (i, fusion["gene1"], fusion["gene2"])) if not fusion_gene_drawn[cluster_index]: javascript.append(r'') fusion_gene_drawn[cluster_index] = True left_blast_genomic = blast_output(output_dir + "blast_genomic", left_seq) right_blast_genomic = blast_output(output_dir + "blast_genomic", right_seq) blast_genomic = blast_output(output_dir + "blast_genomic", seq) left_blast_nt = blast_output(output_dir + "blast_nt", left_seq) right_blast_nt = blast_output(output_dir + "blast_nt", right_seq) blast_nt = blast_output(output_dir + "blast_nt", seq) html_post.append(r'') html_post.append(r'
Left flanking sequenceRight flanking sequence
%s%s
' % (left_seq, right_seq)) html_post.append(r'

blast search - genome

') html_post.append(r'

left flanking sequence - %s

' % left_seq) html_post.append(r'
%s
' % left_blast_genomic) html_post.append(r'

right flanking sequence - %s

' % right_seq) html_post.append(r'
%s
' % right_blast_genomic) html_post.append(r'

blast search - nt

') html_post.append(r'

left flanking sequence - %s

' % left_seq) html_post.append(r'
%s
' % left_blast_nt) html_post.append(r'

right flanking sequence - %s

' % right_seq) html_post.append(r'
%s
' % right_blast_nt) html_post.append(r'


reads

' % i) read_output = fusion["read_output"] html_post.append(r'
%s
' % '\n'.join(read_output)) html_post.append(r'


%d pairs

' % (i, len(pair_coords))) html_post.append(r'
%s
' % '\n'.join(pair_coords)) html_post.append(r'


') html_post.append(r'') html_post.append(r'') fusion_gene_draw_str = "" for index in fusion_gene_indices: isoform_index, cluster_index = index fusion_gene_draw_str += "fusion_gene_draw%d(%d);" % (cluster_index, isoform_index) html_body.append(r'' % fusion_gene_draw_str) for line in html_prev + javascript + html_body + html_post: print >> html_file, line html_file.close() txt_file.close() if params.tex_table: print >> tex_table_file, r"\hline" print >> tex_table_file, r"\end{tabular}}}" print >> tex_table_file, r'\end{document}' tex_table_file.close() os.system("pdflatex --output-directory=%s %s" % (output_dir, tex_table_file_name)) print >> sys.stderr, "[%s] Reporting final fusion candidates in html format" % right_now() # Cufflinks-Fusion fusion_gene_list = [] read_fusion_genes(fusion_gene_list) fusion_list = [] read_fusion_list(fusion_list) cluster_list = [] cluster_fusion(fusion_list, fusion_gene_list, cluster_list) generate_html_impl(fusion_list, cluster_list, fusion_gene_list) # Format a DateTime as a pretty string. # FIXME: Currently doesn't support days! def formatTD(td): hours = td.seconds // 3600 minutes = (td.seconds % 3600) // 60 seconds = td.seconds % 60 return '%02d:%02d:%02d' % (hours, minutes, seconds) # Generate a new temporary filename in the user's tmp directory def tmp_name(): tmp_root = tmp_dir if os.path.exists(tmp_root): pass else: os.mkdir(tmp_root) return tmp_root + os.tmpnam().split('/')[-1] def die(msg=None): if msg is not None: print >> sys.stderr, msg sys.exit(1) # rough equivalent of the 'which' command to find external programs # (current script path is tested first, then PATH envvar) def which(program): def is_executable(fpath): return os.path.exists(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_executable(program): return program else: progpath = os.path.join(bin_dir, program) if is_executable(progpath): return progpath for path in os.environ["PATH"].split(os.pathsep): progpath = os.path.join(path, program) if is_executable(progpath): return progpath return None def die(msg=None): if msg is not None: print >> sys.stderr, msg sys.exit(1) def prog_path(program): progpath = which(program) if progpath == None: die("Error locating program: "+program) return progpath def get_version(): return "2.0.9" def main(argv=None): warnings.filterwarnings("ignore", "tmpnam is a potential security risk") # Initialize default parameter values params = TopHatFusionParams() try: if argv is None: argv = sys.argv args = params.parse_options(argv) params.check() bwt_idx_prefix = args[0] print >> sys.stderr, "[%s] Beginning TopHat-Fusion post-processing run (v%s)" % (right_now(), get_version()) print >> sys.stderr, "-----------------------------------------------" start_time = datetime.now() prepare_output_dir() sample_updated = check_samples() if not params.skip_fusion_kmer: map_fusion_kmer(bwt_idx_prefix, params, sample_updated) if not params.skip_filter_fusion: filter_fusion(bwt_idx_prefix, params) if not params.skip_blast: do_blast(params) if not params.skip_read_dist: read_dist(params) if not params.skip_html: generate_html(params) global run_log run_log = open(logging_dir + "run.log", "w", 0) global run_cmd run_cmd = " ".join(argv) print >> run_log, run_cmd finish_time = datetime.now() duration = finish_time - start_time print >> sys.stderr,"-----------------------------------------------" print >> sys.stderr, "[%s] Run complete [%s elapsed]" % (right_now(), formatTD(duration)) except Usage, err: print >> sys.stderr, 
sys.argv[0].split("/")[-1] + ": " + str(err.msg) print >> sys.stderr, "\tfor detailed help see http://tophat-fusion.sourceforge.net/manual.html" return 2 if __name__ == "__main__": sys.exit(main()) tophat-2.0.9/src/FastaTools.h0000644000175000017500000000305112157116165014574 0ustar toortoor// // FastaTools.h // TopHat // // Created by Harold Pimentel on 10/27/11. // #ifndef TopHat_FastaTools_h #define TopHat_FastaTools_h #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include "common.h" /* #define LINE_BUF_SIZE 1024 #define ID_BUF_SIZE 1024 #define DESC_BUF_SIZE 1024 */ struct FastaRecord { // The identifier after ">" std::string id_; // The description after the identifier std::string desc_; // The sequence std::string seq_; void clear() { id_.clear(); desc_.clear(); seq_.clear(); } }; class FastaReader { public: FastaReader(); FastaReader(std::string fname); ~FastaReader(); void init(std::string fname); bool good() const; bool next(FastaRecord& rec); private: std::string fname_; std::string prev_line_; std::ifstream ifstream_; //char line_buf_[LINE_BUF_SIZE]; //char id_buf_[ID_BUF_SIZE]; //char desc_buf_[DESC_BUF_SIZE]; std::string line_buf_; //std::string id_buf_; //std::string desc_buf_; // variable to check if stream is primed (has already been initialized) bool isPrimed_; }; class FastaWriter { public: FastaWriter(); FastaWriter(std::string fname); ~FastaWriter(); void init(std::string fname); void write(FastaRecord& rec, size_t column_size = 60); private: std::string fname_; std::ofstream ofstream_; bool isPrimed_; }; #endif tophat-2.0.9/src/fusions.cpp0000755000175000017500000006705212150706377014557 0ustar toortoor/* * fusions.cpp * TopHat */ #ifdef HAVE_CONFIG_H #include #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "junctions.h" #include "fusions.h" #include "fragments.h" #include "wiggles.h" #include "tokenize.h" #include "reads.h" #include "inserts.h" // daehwan - replace this with that of SeqAn int difference(const string& first, const string& second) { int len = seqan::length(first); if (len != (int)seqan::length(second)) return 0; int min_value = 10000; short value1[1024] = {0,}; short value2[1024] = {0,}; short* curr = value1; short* prev = value2; for (int j = 0; j < len; ++j) { for (int i = 0; i < len; ++i) { int value = 10000; int match = first[i] == second[j] ? 0 : 1; // right if (i == 0) value = j * 2 + match; else if (j > 0) value = prev[i] + 2; int temp_value = 10000; // down if (j == 0) temp_value = i * 2 + match; else if (i > 0) temp_value = curr[i-1] + 2; if (temp_value < value) value = temp_value; // match if (i > 0 && j > 0) temp_value = prev[i-1] + match; if (temp_value < value) value = temp_value; curr[i] = value; if ((i == len - 1 || j == len - 1) && value < min_value) min_value = value; } short* temp = prev; prev = curr; curr = temp; } return min_value; } /** * Add fusions from an alignment to an FusionSet. * This will look for fusion in the alignment specified by bh. * @param bh The bowtie hit to be used to specify alignment infromation. * @param fusions The FusionSet that will be updated with the insertion information from the alignment. 
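 * @param rt Reference sequence table used to pull the genomic sequence flanking each
 *   fusion breakpoint (inferred from the function body below).
 * @param update_stat When true, per-fusion statistics (flanking sequences, per-base
 *   support counts, left/right extensions) are accumulated; when false, a reversed
 *   twin record is also inserted so that contradicting reads can be detected later
 *   (inferred from the function body below).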
*/ void fusions_from_alignment(const BowtieHit& bh, FusionSet& fusions, RefSequenceTable& rt, bool update_stat) { RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id()); RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2()); if (!ref_str1 || !ref_str2) return; vector new_fusions; fusions_from_spliced_hit(bh, new_fusions); for(size_t i = 0; i < new_fusions.size(); ++i) { Fusion fusion = new_fusions[i]; const vector& cigars = bh.cigar(); // daehwan - for debugging purposes #if 0 if (fusion.left == 99880270) { fprintf(stderr, "daehwan read_id(%d): %d-%d|%d-%d\n", bh.insert_id(), fusion.refid1, fusion.refid2, fusion.left, fusion.right); } #endif /* * Assume read is in the same direction as fusion. */ // Find the position of Fusion. size_t fusion_pos = 0; for (; fusion_pos < cigars.size(); ++fusion_pos) { CigarOpCode opcode = cigars[fusion_pos].opcode; if (opcode == FUSION_FF || opcode == FUSION_FR || opcode == FUSION_RF || opcode == FUSION_RR) break; } if (fusion_pos <= 0 || fusion_pos + 1 >= cigars.size()) continue; // For left bases, size_t left_pos = 0; for (int j = (int)fusion_pos - 1; j >= 0; --j) { const CigarOp& cigar = cigars[j]; switch (cigar.opcode) { case MATCH: case mATCH: case REF_SKIP: case rEF_SKIP: case DEL: case dEL: left_pos += cigar.length; break; default: break; } } // For right bases, size_t right_pos = 0; for (size_t j = fusion_pos + 1; j < cigars.size(); ++j) { const CigarOp& cigar = cigars[j]; switch (cigar.opcode) { case MATCH: case mATCH: case REF_SKIP: case rEF_SKIP: case DEL: case dEL: right_pos += cigar.length; break; default: break; } } if (left_pos < fusion_anchor_length || right_pos < fusion_anchor_length) continue; FusionSet::iterator itr = fusions.find(fusion); if (itr != fusions.end()) { itr->second.count += 1; } else { assert(fusion.refid1 != 0xFFFFFFFF); FusionStat fusionStat; fusionStat.count = 1; fusions[fusion] = fusionStat; itr = fusions.find(fusion); if (!update_stat) { /* * make a reversed fusion. * this is necessary to detect reads that contradict the fusion. 
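	   * (The reversed record below is keyed by the partner reference/coordinate, and
	   * unsupport_fusions() maps it back to the original entry, so reads crossing
	   * either side of the breakpoint can be counted against the fusion.)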
*/ FusionStat fusionStat_rev; fusionStat_rev.count = 1; fusionStat_rev.reversed = true; Fusion fusion_rev(fusion.refid2, fusion.refid1, fusion.right, fusion.left, fusion.dir); fusions[fusion_rev] = fusionStat_rev; } } if (update_stat) { if (itr->second.chr1_seq.length() <= 0) { size_t len = 100; size_t half_len = len / 2; size_t increase = 20; if (fusion.left >= half_len && fusion.left + half_len <= seqan::length(*ref_str1) && fusion.right >= half_len && fusion.right + half_len <= seqan::length(*ref_str2)) { seqan::Dna5String left, right; if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR) { left = seqan::infix(*ref_str1, fusion.left - half_len, fusion.left + half_len); seqan::reverseComplement(left); } else left = seqan::infix(*ref_str1, fusion.left - half_len + 1, fusion.left + half_len + 1); if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR) { right = seqan::infix(*ref_str2, fusion.right - half_len + 1, fusion.right + half_len + 1); seqan::reverseComplement(right); } else right = seqan::infix(*ref_str2, fusion.right - half_len, fusion.right + half_len); itr->second.chr1_seq = DnaString_to_string(left); itr->second.chr2_seq = DnaString_to_string(right); for (size_t j = 0; j < 5; ++j) { size_t pos = (5 - j - 1) * increase / 2; const string& left_sub = itr->second.chr1_seq.substr(pos, (j+1) * increase); const string& right_sub = itr->second.chr2_seq.substr(pos, (j+1) * increase); itr->second.diffs.push_back(difference(left_sub, right_sub)); } } } assert (bh.ref_id() == itr->first.refid1); itr->second.left_ext = max((size_t)itr->second.left_ext, left_pos); itr->second.right_ext = max((size_t)itr->second.right_ext, right_pos); for (size_t k = 0; k < left_pos && k < itr->second.left_bases.size(); ++k) { ++(itr->second.left_bases[k]); } for (size_t k = 0; k < right_pos && k < itr->second.right_bases.size(); ++k) { ++(itr->second.right_bases[k]); } } } } void unsupport_fusions(const BowtieHit& bh, FusionSet& fusions, const FusionSet& fusions_ref) { if (bh.fusion_opcode() != FUSION_NOTHING || bh.is_spliced() || bh.read_len() < 40) return; FusionSet::const_iterator lb, ub; uint32_t left = bh.left() + 20; uint32_t right = bh.right() - 20; lb = fusions_ref.upper_bound(Fusion(0u, 0u, left, 0)); ub = fusions_ref.lower_bound(Fusion(0xffffffffu, 0xffffffffu, right, 0xffffffffu)); while (lb != ub && lb != fusions_ref.end()) { if (lb->first.refid1 == bh.ref_id()) { // daehwan #if 0 // MCF-7 RPS6KB1 17:57970443-58027925:1 TMEM49 17:57784863-57917950:1 if ((lb->first.left == 57992061 && lb->first.right == 57917126) || (lb->first.left == 57917126 && lb->first.right == 57992061)) { const char* dir_str = "ff"; if (lb->first.dir == FUSION_FR) dir_str = "fr"; else if (lb->first.dir == FUSION_RF) dir_str = "rf"; cout << "fusion: " << lb->first.left << "-" << lb->first.right << endl; cout << dir_str << endl; cout << bh.insert_id() << ": " << bh.left() << "-" << bh.right() << "\t"; cout << print_cigar(bh.cigar()) << " " << (int)bh.edit_dist() << endl; cout << bh.seq() << endl; cout << bh.qual() << endl; cout << endl; } #endif FusionSet::iterator itr; if (lb->second.reversed) itr = fusions.find(Fusion(lb->first.refid2, lb->first.refid1, lb->first.right, lb->first.left, lb->first.dir)); else itr = fusions.find(lb->first); if (itr == fusions.end()) { FusionStat fusionStat; fusionStat.unsupport_count = 1; fusions[lb->first] = fusionStat; } else ++(itr->second.unsupport_count); } ++lb; } } /** */ void print_fusions(FILE* fusions_out, FusionSet& fusions, RefSequenceTable& ref_sequences) { // 
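	// Output format, inferred from the fprintf calls below -- one tab-separated record
	// per fusion: chr1-chr2, left coord, right coord, dir (ff/fr/rf/rr), spanning reads,
	// supporting pairs, pairs with a fusion-spanning end, contradicting reads,
	// left/right extension lengths, and a symmetry score, followed by "@"-separated
	// blocks (self-similarity diffs, flanking sequences, per-base support, pair coords).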
fprintf(fusions_out, "track name=fusions description=\"TopHat fusions\"\n"); vector vFusion; for (FusionSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr) { vFusion.push_back(itr->first); } sort(vFusion.begin(), vFusion.end()); for (size_t i = 0; i < vFusion.size(); ++i) { FusionSet::iterator itr = fusions.find(vFusion[i]); int counts = itr->second.count; if (counts <= 0 || itr->second.reversed) continue; const char* dir = ""; if (itr->first.dir == FUSION_FF) dir = "ff"; else if(itr->first.dir == FUSION_FR) dir = "fr"; else if(itr->first.dir == FUSION_RF) dir = "rf"; else dir = "rr"; assert (itr->second.left_bases.size() == itr->second.right_bases.size()); float symm = 0.0f; for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i) { float term = ((int)itr->second.left_bases[i] - (int)itr->second.right_bases[i]) / (float)counts; symm += (term * term); } fprintf(fusions_out, "%s-%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.6f", ref_sequences.get_name(itr->first.refid1), ref_sequences.get_name(itr->first.refid2), itr->first.left, itr->first.right, dir, counts, itr->second.pair_count, itr->second.pair_count_fusion, itr->second.unsupport_count, itr->second.left_ext, itr->second.right_ext, symm); fprintf(fusions_out, "\t@\t"); for (uint32_t i = 0; i < itr->second.diffs.size(); ++i) { fprintf(fusions_out, "%d ", itr->second.diffs[i]); } fprintf(fusions_out, "\t@\t"); uint32_t half_length = itr->second.chr1_seq.length() / 2; fprintf(fusions_out, "%s %s\t@\t", itr->second.chr1_seq.substr(0, half_length).c_str(), itr->second.chr1_seq.substr(half_length).c_str()); fprintf(fusions_out, "%s %s\t@\t", itr->second.chr2_seq.substr(0, half_length).c_str(), itr->second.chr2_seq.substr(half_length).c_str()); for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i) { fprintf(fusions_out, "%d ", itr->second.left_bases[i]); } fprintf(fusions_out, "\t@\t"); for (uint32_t i = 0; i < itr->second.right_bases.size(); ++i) { fprintf(fusions_out, "%d ", itr->second.right_bases[i]); } fprintf(fusions_out, "\t@\t"); sort(itr->second.vPairSupport.begin(), itr->second.vPairSupport.end()); for (uint32_t i = 0; i < min((size_t)200, itr->second.vPairSupport.size()); ++i) { fprintf(fusions_out, "%d:%d ", itr->second.vPairSupport[i].ldist, itr->second.vPairSupport[i].rdist); } fprintf(fusions_out, "\n"); } } /** * Extract a list of fusions from a bowtie hit. * Given a bowtie hit, extract a vector of insertions. * @param bh The bowtie hit to use for alignment information. * @param insertions Used to store the resultant vector of insertions. 
*/ void fusions_from_spliced_hit(const BowtieHit& bh, vector& fusions, bool auto_sort) { const vector& cigar = bh.cigar(); unsigned int positionInGenome = bh.left(); for(size_t c = 0; c < cigar.size(); ++c) { Fusion fusion; switch(cigar[c].opcode) { case REF_SKIP: case MATCH: case DEL: positionInGenome += cigar[c].length; break; case rEF_SKIP: case mATCH: case dEL: positionInGenome -= cigar[c].length; break; case FUSION_FF: case FUSION_FR: case FUSION_RF: case FUSION_RR: fusion.dir = cigar[c].opcode; if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR) positionInGenome = positionInGenome + 1; else positionInGenome = positionInGenome - 1; if (bh.ref_id() < bh.ref_id2() || (bh.ref_id() == bh.ref_id2() && positionInGenome < cigar[c].length) || !auto_sort) { fusion.refid1 = bh.ref_id(); fusion.refid2 = bh.ref_id2(); fusion.left = positionInGenome; fusion.right = cigar[c].length; } else { assert (auto_sort); fusion.refid1 = bh.ref_id2(); fusion.refid2 = bh.ref_id(); fusion.left = cigar[c].length; fusion.right = positionInGenome; } fusions.push_back(fusion); break; default: break; } } } void pair_support(const vector >& best_hits, FusionSet& fusions, FusionSet& fusions_ref) { if (best_hits.size() > fusion_multipairs) return; for (size_t i = 0; i < best_hits.size(); ++i) { const BowtieHit& lh = best_hits[i].first; const BowtieHit& rh = best_hits[i].second; bool left_fusionSpanned = lh.fusion_opcode() != FUSION_NOTHING; bool right_fusionSpanned = rh.fusion_opcode() != FUSION_NOTHING; if (left_fusionSpanned && right_fusionSpanned) continue; bool fusionSpanned = left_fusionSpanned || right_fusionSpanned; bool fusion_leftSide = false; uint32_t ref_id1 = lh.ref_id2(); uint32_t ref_id2 = rh.ref_id(); uint32_t dir = FUSION_FF; if (fusionSpanned) { if (left_fusionSpanned) dir = lh.fusion_opcode(); else dir = rh.fusion_opcode(); } else { if (!lh.antisense_align() && !rh.antisense_align()) dir = FUSION_FR; else if (lh.antisense_align() && rh.antisense_align()) dir = FUSION_RF; else { if (lh.ref_id() == rh.ref_id()) { if ((lh.antisense_align() && lh.left() > rh.left()) || (!lh.antisense_align() && lh.left() < rh.left())) dir = FUSION_FF; else dir = FUSION_RR; } else { if ((lh.antisense_align() && lh.ref_id() > rh.ref_id()) || (!lh.antisense_align() && lh.ref_id() < rh.ref_id())) dir = FUSION_FF; else dir = FUSION_RR; } } } FusionSet::iterator lb, ub; bool unsupport = false; // int inner_dist = max_report_intron_length * 2; const int range = min((int)fusion_min_dist, 10000); int inner_dist = range; int outer_dist = range * 2; int max_dist = range * 2; int left1 = 0, left2 = 0, right1 = 0, right2 = 0; if (fusionSpanned) { vector new_fusions; if (left_fusionSpanned) fusions_from_spliced_hit(lh, new_fusions); else fusions_from_spliced_hit(rh, new_fusions); Fusion& fusion = new_fusions[0]; dir = fusion.dir; ref_id1 = fusion.refid1; ref_id2 = fusion.refid2; if (left_fusionSpanned && lh.ref_id() != ref_id2 && lh.ref_id2() != ref_id2) unsupport = true; if (right_fusionSpanned && rh.ref_id() != ref_id1 && rh.ref_id2() != ref_id1) unsupport = true; int temp1, temp2; const BowtieHit* fusionHit; const BowtieHit* otherHit; if (left_fusionSpanned) { fusionHit = &lh; otherHit = &rh; } else { fusionHit = &rh; otherHit = &lh; } // check the normal hit (otherHit) is on the left hand side of the fusion. 
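	  // (Same reference: decided by comparing coordinates, with the comparison depending
	  // on the fusion direction FF/FR/RF/RR; different references: otherHit is taken to be
	  // on the left when it shares the reference of the fusion-spanning hit's ref_id() side.
	  // See the conditionals immediately below.)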
if (ref_id1 == ref_id2) { if (dir == FUSION_FF || dir == FUSION_FR) fusion_leftSide = otherHit->left() < fusionHit->left(); else if (dir == FUSION_RF) fusion_leftSide = otherHit->left() < fusionHit->right(); else fusion_leftSide = otherHit->right() > fusionHit->left(); } else fusion_leftSide = fusionHit->ref_id() == otherHit->ref_id(); // ***** // daehwan - make sure what '+' and '-' really mean for FF, FR, RF, RR cases // ***** if ((dir != FUSION_RF && dir != FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || otherHit->antisense_align())) unsupport = true; if ((dir != FUSION_FR && dir != FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || !otherHit->antisense_align())) unsupport = true; if ((dir == FUSION_FR || dir == FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || otherHit->antisense_align())) unsupport = true; if ((dir == FUSION_RF || dir == FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || !otherHit->antisense_align())) unsupport = true; temp1 = otherHit->left(); temp2 = otherHit->right(); if ((fusion_leftSide && dir == FUSION_RF) || (!fusion_leftSide && dir != FUSION_FR)) { temp2 = temp1 + inner_dist; if (temp1 > outer_dist) temp1 = temp1 - outer_dist; else temp1 = 0; } else { if (temp2 >= inner_dist) temp1 = temp2 - inner_dist; else temp1 = 0; temp2 = temp2 + outer_dist; } if (fusion_leftSide) { left1 = temp1; left2 = temp2; } else { right1 = temp1; right2 = temp2; } lb = fusions_ref.find(fusion); ub = fusions_ref.end(); // daehwan - debug #if 0 if (fusion.left == 6994359 && fusion.right == 17581683) { cout << "daehwan - test - pair_with_fusion: " << lh.insert_id() << endl; cout << "edit dist: " << (int)lh.edit_dist() << ", " << (int)rh.edit_dist() << endl; const char* dir_str = "ff"; if (dir == FUSION_FR) dir_str = "fr"; else if (dir == FUSION_RF) dir_str = "rf"; else if (dir == FUSION_RR) dir_str = "rr"; cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl; cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl; cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl; cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl; cout << rh.ref_id2() << ": " << print_cigar(rh.cigar()) << endl; cout << "found: " << (lb == ub ? "no" : "yes") << endl; cout << "unsupport: " << (unsupport ? "yes" : "no") << endl; cout << "fusion_left: " << (fusion_leftSide ? 
"yes" : "no") << endl; cout << endl; } #endif } else { if (dir == FUSION_FF) { if (lh.antisense_align()) { if (rh.right() >= inner_dist) right1 = rh.right() - inner_dist; right2 = right1 + outer_dist; left2 = lh.left() + inner_dist; if (left2 > outer_dist) left1 = left2 - outer_dist; } else { if (lh.right() >= inner_dist) left1 = lh.right() - inner_dist; left2 = left1 + outer_dist; right2 = rh.left() + inner_dist; if (right2 > outer_dist) right1 = right2 - outer_dist; } } else if (dir == FUSION_FR) { if (lh.right() >= inner_dist) left1 = lh.right() - inner_dist; left2 = left1 + outer_dist; if (rh.right() >= inner_dist) right1 = rh.right() - inner_dist; right2 = right1 + outer_dist; } else if (dir == FUSION_RF) { left2 = lh.left() + inner_dist; right2 = rh.left() + inner_dist; if (left2 > outer_dist) left1 = left2 - outer_dist; if (right2 > outer_dist) right1 = right2 - outer_dist; } else // if (dir == FUSION_RR) { if (lh.antisense_align()) { left2 = lh.left() + inner_dist; if (left2 > outer_dist) left1 = left2 - outer_dist; if (rh.right() >= inner_dist) right1 = rh.right() - inner_dist; right2 = right1 + outer_dist; } else { if (lh.right() >= inner_dist) left1 = lh.right() - inner_dist; left2 = left1 + outer_dist; right2 = rh.left() + inner_dist; if (right2 > outer_dist) right1 = right2 - outer_dist; } } // daehwan - debug #if 0 if (fusion.left == 6994359 && fusion.right == 17581683) { const char* dir_str = "ff"; if (dir == FUSION_FR) dir_str = "fr"; else if (dir == FUSION_RF) dir_str = "rf"; else if (dir == FUSION_RR) dir_str = "rr"; cout << "paired-end from two chromosomes" << endl; cout << "insert id: " << lh.insert_id() << endl; cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl; cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl; cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl; cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl; cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl; cout << "left: " << left1 << "-" << left2 << endl; cout << "right: " << right1 << "-" << right2 << endl; cout << endl; } #endif if (ref_id1 > ref_id2 || (ref_id1 == ref_id2 && lh.left() > rh.left())) { uint32_t temp = ref_id1; ref_id1 = ref_id2; ref_id2 = temp; temp = left1; left1 = right1; right1 = temp; temp = left2; left2 = right2; right2 = temp; } lb = fusions_ref.upper_bound(Fusion(ref_id1, ref_id2, left1, right1)); ub = fusions_ref.lower_bound(Fusion(ref_id1, ref_id2, left2, right2)); // daehwan #if 0 static const uint32_t chr_id1 = RefSequenceTable::hash_string("chr2"); static const uint32_t chr_id2 = RefSequenceTable::hash_string("chr3"); if ((lh.ref_id() == chr_id1 && rh.ref_id() == chr_id2) || (lh.ref_id() == chr_id2 && rh.ref_id() == chr_id1)) { // KPL-4 BSG 19:571325-583492:1 NFIX 19:13106584-13209610:1 // const uint32_t left1 = 571325, right1 = 583492, left2 = 13106584, right2 = 13209610; // SK-BR-3 DHX35 20:37590942-37668366:1 ITCH 20:32951041-33099198:1 // const uint32_t left1 = 37590942, right1 = 37668366, left2 = 32951041, right2 = 33099198; // SK-BR-3 NFS1 20:34220262-34287287:-1 PREX1 20:47240790-47444420:-1 // const uint32_t left1 = 34220262, right1 = 34287287, left2 = 47240790, right2 = 47444420; // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1 uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986; if (lh.ref_id() != chr_id1) { uint32_t temp = left1; left1 = left2; left2 = temp; temp = right1; right1 = right2; 
right2 = temp; } if ((lh.left() >= left1 && lh.left() <= right1 && rh.left() >= left2 && rh.left() <= right2) || (lh.left() >= left2 && lh.left() <= right2 && rh.left() >= left1 && rh.left() <= right1)) { for (size_t t = 0; t < left.size(); ++t) { const BowtieHit& lh = left[t]; const BowtieHit& rh = right[t]; const char* dir_str = "ff"; if (dir == FUSION_FR) dir_str = "fr"; else if (dir == FUSION_RF) dir_str = "rf"; else if (dir == FUSION_RR) dir_str = "rr"; cout << "paired-end from two chromosomes" << endl; cout << "insert id: " << lh.insert_id() << endl; cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl; cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl; cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl; cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl; cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl; cout << "left: " << left1 << "-" << left2 << endl; cout << "right: " << right1 << "-" << right2 << endl; cout << endl; } } } #endif } while (lb != ub && lb != fusions_ref.end()) { if (lb->first.dir == dir && lb->first.refid1 == ref_id1 && lb->first.refid2 == ref_id2 && ((!fusionSpanned && (int)lb->first.right >= right1 && (int)lb->first.right <= right2) || fusionSpanned) && !lb->second.reversed) { int dist = 0, left_dist = 0, right_dist = 0; // daehwan - check this out if (!fusionSpanned || fusion_leftSide) { if (dir == FUSION_RF || dir == FUSION_RR) left_dist = (left2 - inner_dist) - (int)lb->first.left; else left_dist = (int)lb->first.left - (left1 + inner_dist); } if (!fusionSpanned || !fusion_leftSide) { if (dir == FUSION_FR || dir == FUSION_RR) right_dist = (int)lb->first.right - (right1 + inner_dist); else right_dist = (right2 - inner_dist) - (int)lb->first.right; } dist = abs(left_dist) + abs(right_dist); // daehwan - fix this later if (dist < 0 && fusionSpanned) unsupport = true; bool pass = dist < max_dist; // daehwan #if 0 // if(!fusionSpanned) if (pass && !unsupport) //if (lb->first.left == 6994359 && lb->first.right == 17581683) { const char* dir_str = "ff"; if (dir == FUSION_FR) dir_str = "fr"; else if (dir == FUSION_RF) dir_str = "rf"; else if (dir == FUSION_RR) dir_str = "rr"; cout << "insert id: " << lh.insert_id() << endl; cout << "left: " << lh.left() << " " << print_cigar(lh.cigar()) << endl; cout << "right: " << rh.left() << " " << print_cigar(rh.cigar()) << endl; cout << dir_str << endl; cout << "dist: " << dist << endl; cout << lb->first.refid1 << " " << lb->first.refid2 << endl; cout << lb->first.left << "-" << lb->first.right << endl; cout << "unsupport: " << (unsupport ? "yes" : "no") << endl; cout << "pass: " << (pass ? 
"yes" : "no") << endl; cout << "ids: " << lh.insert_id() << " : " << rh.insert_id() << endl; cout << "left dist: " << left_dist << "\tright dist: " << right_dist << endl; cout << endl << endl; } #endif FusionSet::iterator itr = fusions.find(lb->first); if (itr != fusions.end()) { if (unsupport) ++(itr->second.unsupport_count_pair); else if (pass) { if (fusionSpanned) ++(itr->second.pair_count_fusion); else { itr->second.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist)); ++(itr->second.pair_count); if (itr->second.vPairSupport.size() >= 300) { sort(itr->second.vPairSupport.begin(), itr->second.vPairSupport.end()); itr->second.vPairSupport.erase(itr->second.vPairSupport.begin() + 200, itr->second.vPairSupport.end()); } } } } else { FusionStat fusionStat; if (unsupport) fusionStat.unsupport_count_pair = 1; else if (pass) { if (fusionSpanned) fusionStat.pair_count_fusion = 1; else { fusionStat.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist)); fusionStat.pair_count = 1; } } fusions[lb->first] = fusionStat; } } if (fusionSpanned) break; ++lb; } } } void merge_with(FusionSimpleSet& fusions, const FusionSimpleSet& other_fusions) { for (FusionSimpleSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr) { FusionSimpleSet::iterator itr = fusions.find(other_itr->first); if (itr != fusions.end()) { FusionSimpleStat& curr = itr->second; curr.merge_with(other_itr->second); } else { fusions[other_itr->first] = other_itr->second; } } } void merge_with(FusionSet& fusions, const FusionSet& other_fusions) { for (FusionSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr) { FusionSet::iterator itr = fusions.find(other_itr->first); if (itr != fusions.end()) { FusionStat& curr = itr->second; curr.merge_with(other_itr->second); } else { fusions[other_itr->first] = other_itr->second; } } } tophat-2.0.9/src/gtf_juncs.cpp0000644000175000017500000000700012122334361015020 0ustar toortoor/* * gff_juncs.cpp * TopHat * * Created by Cole Trapnell on 1/15/09. * Copyright 2009 Cole Trapnell. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #else #define PACKAGE_VERSION "INTERNAL" #define SVN_REVISION "XXX" #endif #include #include #include #include #include "gff.h" #include "common.h" #include "bwt_map.h" using namespace std; void print_usage() { fprintf(stderr, "Usage: gtf_juncs \n"); } void read_transcripts(FILE* f, GffReader& gffr) { //assume gffr was just created but not initialized gffr.init(f, true, true); //(gffile, mRNA-only, sortByLoc) gffr.showWarnings(verbose); //(keepAttr, mergeCloseExons, noExonAttr) gffr.readAll(false, true, true); //now all parsed GffObjs are in gffr.gflst, grouped by genomic sequence } uint32_t get_junctions_from_gff(FILE* ref_mRNA_file, RefSequenceTable& rt) { GffReader gff_reader(ref_mRNA_file, true); //only recognizable transcript features, sort them by locus if (ref_mRNA_file) { read_transcripts(ref_mRNA_file, gff_reader); } set > > uniq_juncs; //if any ref data was loaded int last_gseqid=-1; const char* gseqname=NULL; for (int i=0;iGFF_MAX_LOCUS)) { //if (verbose) GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", rna.getID(), tlen); continue; } if (rna.isDiscarded()) { //discarded generic "gene" or "locus" features with no other detailed subfeatures continue; } if (rna.exons.Count()==0) { //if (verbose) // GMessage("Warning: %s %s found without exon segments (adding default exon).\n",rna.getFeatureName(), rna.getID()); rna.addExon(rna.start,rna.end); } if (rna.gseq_id!=last_gseqid) { gseqname=rna.getGSeqName(); rt.get_id(gseqname, NULL, 0); last_gseqid=rna.gseq_id; } for (int e = 1; e < rna.exons.Count(); ++e) { GffExon& ex = *(rna.exons[e]); GffExon& prex = *(rna.exons[e-1]); if(uniq_juncs.insert(make_pair(gseqname, make_pair(prex.end - 1, ex.start - 1))).second) { fprintf(stdout, "%s\t%d\t%d\t%c\n", gseqname, prex.end-1, ex.start-1, rna.strand); } } } //for each loaded GFF record return uniq_juncs.size(); } int main(int argc, char** argv) { int parse_ret = parse_options(argc, argv, print_usage); if (parse_ret) return parse_ret; if(optind >= argc) { print_usage(); return 2; } string gtf_filename = argv[optind++]; //GFF_database gff_db; if (gtf_filename == "") { print_usage(); exit(2); } FILE* ref_gtf = fopen(gtf_filename.c_str(), "r"); if (!ref_gtf) { fprintf (stderr, "Error: could not open GTF file %s for reading\n", gtf_filename.c_str()); exit(1); } fprintf(stderr, "gtf_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); fprintf(stderr, "---------------------------\n"); //gff_db.from_file(gff_filename); // gff_db.sort_entries(); // RefSequenceTable rt(true); uint32_t num_juncs_reported = get_junctions_from_gff(ref_gtf, rt); //uint32_t num_juncs_reported = 0; fprintf(stderr, "Extracted %u junctions from %s\n", num_juncs_reported, gtf_filename.c_str()); if (!num_juncs_reported) return 1; return 0; } tophat-2.0.9/src/fragments.cpp0000644000175000017500000000247012122334362015033 0ustar toortoor/* * fragments.cpp * TopHat * * Created by Cole Trapnell on 1/14/09. * Copyright 2009 Cole Trapnell. All rights reserved. 
* */ #ifdef HAVE_CONFIG_H #include #endif #include #include "bwt_map.h" #include "fragments.h" void best_fragment_mappings(uint64_t refid, const string& name, HitList& hits_in_ref, ReadTable& it, BestFragmentAlignmentTable& best_status_for_fragments) { return; #if 0 for (size_t i = 0; i < hits_in_ref.size(); ++i) { BowtieHit& h1 = hits_in_ref[i]; uint64_t fragment_id = h1.insert_id(); uint32_t obs_order = it.observation_order(fragment_id); JunctionSet dummy; FragmentAlignmentGrade s(h1, dummy); pair >& fragment_best = best_status_for_fragments[obs_order]; FragmentAlignmentGrade& current = fragment_best.first; // Is the new status better than the current best one? if (current < s) { fragment_best.second.clear(); current = s; fragment_best.second.push_back(&h1); } else if (! (s < current)) // is it just as good? { fragment_best.second.push_back(&h1); } } #endif } bool valid_fragment_alignment(const FragmentAlignmentGrade& g, const FragmentAlignment& a) { // stub return true; } tophat-2.0.9/src/contig_to_chr_coords0000755000175000017500000001102712122334360016455 0ustar toortoor#!/usr/bin/env python # encoding: utf-8 """ contig_to_chr_coords.py Created by Cole Trapnell on 2008-09-05. Copyright (c) 2008 Cole Trapnell. All rights reserved. """ import sys import getopt help_message = ''' Takes the NCBI seq_contig file and maps contig coords to whole chromosome coords in a GTF, GFF, or BED file contig_to_chr_coords.py ''' class Usage(Exception): def __init__(self, msg): self.msg = msg def main(argv=None): if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt(argv[1:], "ho:vbg", ["help", "output=", "bed", "gff"]) except getopt.error, msg: raise Usage(msg) arg_is_splice = False arg_is_gff = False # option processing for option, value in opts: if option == "-v": verbose = True if option in ("-h", "--help"): raise Usage(help_message) if option in ("-o", "--output"): output = value if option in ("-b", "--bed"): arg_is_splice = True if option in ("-g", "--gff"): arg_is_gff = True if (arg_is_splice == False and arg_is_gff == False) or (arg_is_splice == True and arg_is_gff == True): print >> sys.stderr, "Error: please specify either -b or -g, but not both" raise Usage(help_message) if len(args) < 1: raise Usage(help_message) contig_to_chr_file = open(args[0]) contigs = {} for line in contig_to_chr_file.readlines(): if line[0] == "#": continue line = line.strip() cols = line.split('\t') if len(cols) < 9: continue chromosome = cols[1] group = cols[8] feature_name = cols[5] if not feature_name in ["start", "end"]: contigs[feature_name] = (chromosome, int(cols[2])) #print feature_name, chromosome, int(cols[2]) if arg_is_gff: gff_file = open(args[1]) lines = gff_file.readlines() print lines[0], for line in lines[1:]: line = line.strip() cols = line.split('\t') if len(cols) < 8: continue contig = cols[0] chr_fields = contig.split('|') refseq_id = chr_fields[3] contig = contigs.get(refseq_id) chr_name = contig[0] pipe_idx = chr_name.find('|') if pipe_idx != -1: chr_name = chr_name[:pipe_idx] if contig != None: #print line left_pos = contig[1] + int(cols[3]) right_pos = contig[1] + int(cols[4]) print "chr%s\tTopHat\tisland\t%d\t%d\t%s\t.\t.\t%s" % (chr_name, left_pos, right_pos, cols[5],cols[8]) #print >>sys.stderr, "%s\t%d\t%d\t%s\t%s\t%s\t%s" % (contig[0], left_pos, right_pos,cols[3],cols[6],cols[0],cols[1]) if arg_is_splice: splice_file = open(args[1]) lines = splice_file.readlines() print lines[0], for line in lines[1:]: line = line.strip() cols = line.split('\t') contig = cols[0] chr_fields 
= contig.split('|') refseq_id = chr_fields[3] contig = contigs.get(refseq_id) chr_name = contig[0] pipe_idx = chr_name.find('|') if pipe_idx != -1: chr_name = chr_name[:pipe_idx] if contig != None: #print line left_pos = contig[1] + int(cols[1]) right_pos = contig[1] + int(cols[2]) print "chr%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t255,0,0\t2\t1,1\t%s" % (chr_name, left_pos, right_pos, cols[3],cols[5],left_pos, right_pos,cols[11]) #print >>sys.stderr, "%s\t%d\t%d\t%s\t%s\t%s\t%s" % (contig[0], left_pos, right_pos,cols[3],cols[6],cols[0],cols[1]) except Usage, err: print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) print >> sys.stderr, "\t for help use --help" return 2 if __name__ == "__main__": sys.exit(main()) tophat-2.0.9/src/align_status.h0000755000175000017500000000175712122334361015220 0ustar toortoor#ifndef ALIGN_STATUS_H #define ALIGN_STATUS_H #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #include #include #include #include #include #include #include "common.h" #include "bwt_map.h" #include "junctions.h" #include "insertions.h" #include "deletions.h" #include "fusions.h" using namespace std; class Coverage; /** */ struct AlignStatus { public: /** * Is there an alignment? */ int _alignment_score; public: AlignStatus(); AlignStatus(const BowtieHit& bh, const JunctionSet& gtf_junctions, const JunctionSet& junctions, const InsertionSet& insertions, const DeletionSet& deletions, const FusionSet& fusions, const Coverage& coverage); bool operator<(const AlignStatus& rhs) const; bool operator==(const AlignStatus& rhs) const; bool operator!=(const AlignStatus& rhs) const; }; #endif tophat-2.0.9/THANKS0000644000175000017500000000216212122334411012456 0ustar toortoorTopHat THANKS file TopHat was originally written by Cole Trapnell, with substantial creative input from Lior Pachter and Steven Salzberg. Many people further contributed to TopHat by reporting problems, suggesting various improvements, donating test data or submitting actual code. Here is a list of these people. Help me keep it complete and exempt of errors. Robert Bradley rbradley@math.berkeley.edu Steven Brenner brenner@compbio.berkeley.edu Angela Brooks angelabrooks@berkeley.edu Mark Diekhans markd@soe.ucsc.edu Mike Duff moduff@gmail.com Steffen Durinck sdurinck@illumina.com Matthew Fitzgibbon mfitzgib@fhcrc.org Brenton Graveley graveley@neuron.uchc.edu Kasper Hansen khansen@stat.berkeley.edu Rachel Harte hartera@soe.ucsc.edu Ben Langmead langmead@cs.umd.edu Richard McCombie mccombie@cshl.edu Ali Mortazavi alim@caltech.edu Adam Phillippy amp@umiacs.umd.edu Mike Schatz mschatz@umiacs.umd.edu Gary Schroth gschroth@illumina.com Gavin Sherlock sherlock@genome.stanford.edu Diane Trout diane@caltech.edu Svilen Tzonev stzonev@illumina.com Jeltje Van Baren jeltje@wustl.edu Barbara Wold wold@caltech.edu