pax_global_header00006660000000000000000000000064151064217260014516gustar00rootroot0000000000000052 comment=241ed02bc1a5d567ecf486de7d84bb74db0068d2 python-markdownify-1.2.2/000077500000000000000000000000001510642172600153715ustar00rootroot00000000000000python-markdownify-1.2.2/.github/000077500000000000000000000000001510642172600167315ustar00rootroot00000000000000python-markdownify-1.2.2/.github/workflows/000077500000000000000000000000001510642172600207665ustar00rootroot00000000000000python-markdownify-1.2.2/.github/workflows/python-app.yml000066400000000000000000000023621510642172600236130ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Python application on: push: branches: [ develop ] pull_request: branches: [ develop ] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python 3.8 uses: actions/setup-python@v2 with: python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip pip install --upgrade setuptools setuptools_scm wheel build tox - name: Lint and test run: | tox - name: Build run: | python -m build -nwsx . types: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 uses: actions/setup-python@v2 with: python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4 - name: Check types run: | mypy . 
mypy --strict tests/types.py python-markdownify-1.2.2/.github/workflows/python-publish.yml000066400000000000000000000015651510642172600245050ustar00rootroot00000000000000# This workflow will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: Upload Python Package on: release: types: [created] jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v2 with: python-version: '3.8' - name: Install dependencies run: | python -m pip install --upgrade pip pip install --upgrade setuptools setuptools_scm wheel build twine - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python -m build -nwsx . twine upload dist/* python-markdownify-1.2.2/.gitignore000066400000000000000000000001501510642172600173550ustar00rootroot00000000000000*.pyc *.egg .eggs/ *.egg-info/ .DS_Store /.env /dist /MANIFEST /venv build/ .vscode/settings.json .tox/ python-markdownify-1.2.2/LICENSE000066400000000000000000000020731510642172600164000ustar00rootroot00000000000000The MIT License (MIT) Copyright 2012-2018 Matthew Tretter Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. python-markdownify-1.2.2/MANIFEST.in000066400000000000000000000000371510642172600171270ustar00rootroot00000000000000include README.rst prune tests python-markdownify-1.2.2/README.rst000066400000000000000000000213561510642172600170670ustar00rootroot00000000000000|build| |version| |license| |downloads| .. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop :alt: GitHub Workflow Status :target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22 .. |version| image:: https://img.shields.io/pypi/v/markdownify :alt: Pypi version :target: https://pypi.org/project/markdownify/ .. |license| image:: https://img.shields.io/pypi/l/markdownify :alt: License :target: https://github.com/matthewwithanm/python-markdownify/blob/develop/LICENSE .. |downloads| image:: https://pepy.tech/badge/markdownify :alt: Pypi Downloads :target: https://pepy.tech/project/markdownify Installation ============ ``pip install markdownify`` Usage ===== Convert some HTML to Markdown: .. code:: python from markdownify import markdownify as md md('Yay GitHub') # > '**Yay** [GitHub](http://github.com)' Specify tags to exclude: .. code:: python from markdownify import markdownify as md md('Yay GitHub', strip=['a']) # > '**Yay** GitHub' \...or specify the tags you want to include: .. 
code:: python from markdownify import markdownify as md md('Yay GitHub', convert=['b']) # > '**Yay** GitHub' Options ======= Markdownify supports the following options: strip A list of tags to strip. This option can't be used with the ``convert`` option. convert A list of tags to convert. This option can't be used with the ``strip`` option. autolinks A boolean indicating whether the "automatic link" style should be used when a ``a`` tag's contents match its href. Defaults to ``True``. default_title A boolean to enable setting the title of a link to its href, if no title is given. Defaults to ``False``. heading_style Defines how headings should be converted. Accepted values are ``ATX``, ``ATX_CLOSED``, ``SETEXT``, and ``UNDERLINED`` (which is an alias for ``SETEXT``). Defaults to ``UNDERLINED``. bullets An iterable (string, list, or tuple) of bullet styles to be used. If the iterable only contains one item, it will be used regardless of how deeply lists are nested. Otherwise, the bullet will alternate based on nesting level. Defaults to ``'*+-'``. strong_em_symbol In markdown, both ``*`` and ``_`` are used to encode **strong** or *emphasized* texts. Either of these symbols can be chosen by the options ``ASTERISK`` (default) or ``UNDERSCORE`` respectively. sub_symbol, sup_symbol Define the chars that surround ```` and ```` text. Defaults to an empty string, because this is non-standard behavior. Could be something like ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts with ``<`` and ends with ``>``, it is treated as an HTML tag and a ``/`` is inserted after the ``<`` in the string used after the text; this allows specifying ```` to use raw HTML in the output for subscripts, for example. newline_style Defines the style of marking linebreaks (``
``) in markdown. The default value ``SPACES`` of this option will adopt the usual two spaces and a newline, while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash and a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. code_language Defines the language that should be assumed for all ``
`` sections.
  Useful, if all code on a page is in the same programming language and
  should be annotated with `````python`` or similar.
  Defaults to ``''`` (empty string) and can be any string.

code_language_callback
  When the HTML code contains ``pre`` tags that in some way provide the code
  language, for example as class, this callback can be used to extract the
  language from the tag and prefix it to the converted ``pre`` tag.
  The callback gets one single argument, a BeautifulSoup object, and returns
  a string containing the code language, or ``None``.
  An example to use the class name as code language could be::

    def callback(el):
        return el['class'][0] if el.has_attr('class') else None

  Defaults to ``None``.

escape_asterisks
  If set to ``False``, do not escape ``*`` to ``\*`` in text.
  Defaults to ``True``.

escape_underscores
  If set to ``False``, do not escape ``_`` to ``\_`` in text.
  Defaults to ``True``.

escape_misc
  If set to ``True``, escape miscellaneous punctuation characters
  that sometimes have Markdown significance in text.
  Defaults to ``False``.

keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
  headlines or table cells. If some inline images should be converted to
  markdown images instead, this option can be set to a list of parent tags
  that should be allowed to contain inline images, for example ``['td']``.
  Defaults to an empty list.

table_infer_header
  Controls handling of tables with no header row (as indicated by ``<thead>``
  or ``<th>``). When set to ``True``, the first body row is used as the header row.
  Defaults to ``False``, which leaves the header row empty.

wrap, wrap_width
  If ``wrap`` is set to ``True``, all text paragraphs are wrapped at
  ``wrap_width`` characters. Defaults to ``False`` and ``80``.
  Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
  A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.

strip_document
  Controls whether leading and/or trailing separation newlines are removed from
  the final converted document. Supported values are ``LSTRIP`` (leading),
  ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
  within the document are unaffected.
  Defaults to ``STRIP``.

strip_pre
  Controls whether leading/trailing blank lines are removed from ``<pre>``
  tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
  ``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
  Defaults to ``STRIP``.

bs4_options
  Specify additional configuration options for the ``BeautifulSoup`` object
  used to interpret the HTML markup. String and list values (such as ``lxml``
  or ``html5lib``) are treated as ``features`` arguments to control parser
  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
  are treated as full kwargs to be used for the BeautifulSoup constructor,
  allowing specification of any parameter. For parameter details, see the
  `BeautifulSoup`_ documentation.

.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.


Converting BeautifulSoup objects
================================

.. code:: python

    from markdownify import MarkdownConverter

    # Create shorthand method for conversion
    def md(soup, **options):
        return MarkdownConverter(**options).convert_soup(soup)


Creating Custom Converters
==========================

If you have a special use case that calls for a special conversion, you can
always inherit from ``MarkdownConverter`` and override the method you want to
change.
The function that handles an HTML tag named ``abc`` is called
``convert_abc(self, el, text, parent_tags)`` and returns a string
containing the converted HTML tag.
The ``MarkdownConverter`` object will handle the conversion based on the
function names:

.. code:: python

    from markdownify import MarkdownConverter

    class ImageBlockConverter(MarkdownConverter):
        """
        Create a custom MarkdownConverter that adds two newlines after an image
        """
        def convert_img(self, el, text, parent_tags):
            return super().convert_img(el, text, parent_tags) + '\n\n'

    # Create shorthand method for conversion
    def md(html, **options):
        return ImageBlockConverter(**options).convert(html)

.. code:: python

    from markdownify import MarkdownConverter

    class IgnoreParagraphsConverter(MarkdownConverter):
        """
        Create a custom MarkdownConverter that ignores paragraphs
        """
        def convert_p(self, el, text, parent_tags):
            return ''

    # Create shorthand method for conversion
    def md(html, **options):
        return IgnoreParagraphsConverter(**options).convert(html)


Command Line Interface
======================

Use ``markdownify example.html > example.md`` or pipe input from stdin
(``cat example.html | markdownify > example.md``).
Call ``markdownify -h`` to see all available options.
They are the same as listed above and take the same arguments.


Development
===========

To run tests and the linter run ``pip install tox`` once, then ``tox``.
python-markdownify-1.2.2/markdownify/000077500000000000000000000000001510642172600177235ustar00rootroot00000000000000python-markdownify-1.2.2/markdownify/__init__.py000066400000000000000000000732351510642172600220460ustar00rootroot00000000000000from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
from textwrap import fill
import re
import six


# General-purpose regex patterns

# Matches a 'convert_hN' name and captures the digits N
re_convert_heading = re.compile(r'convert_h(\d+)')
# Matches every line (including empty ones) and captures its content;
# used with .sub() to prefix or indent each line of a text block
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
# A run of horizontal whitespace (tabs and spaces only)
re_whitespace = re.compile(r'[\t ]+')
# A run of any of tab, space, carriage return, or newline
re_all_whitespace = re.compile(r'[\t \r\n]+')
# A whitespace run that contains at least one line break
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
# Matches an 'hN' tag name and captures the digits N
re_html_heading = re.compile(r'h(\d+)')
# One leading line that holds only spaces, including its newline
re_pre_lstrip1 = re.compile(r'^ *\n')
# One trailing newline plus any spaces after it.
# NOTE: '$' also matches just before a final newline, so an unbounded
# .sub() with this pattern can match twice on text ending in '\n\n'.
re_pre_rstrip1 = re.compile(r'\n *$')
# All leading space/newline characters up to and including the last newline
# of that run
re_pre_lstrip = re.compile(r'^[ \n]*\n')
# All trailing space/newline characters
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names:
# each '[', ']', ':' or '-' in the tag name is replaced
re_make_convert_fn_name = re.compile(r'[\[\]:-]')

# Extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)

# Escape miscellaneous special Markdown characters
# (any one of: ] \ & < ` [ > ~ = + |)
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')

# Escape sequence of one or more consecutive '-', preceded
# and followed by whitespace or start/end of fragment, as it
# might be confused with an underline of a header, or with a
# list marker
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')

# Escape sequence of up to six consecutive '#', preceded
# and followed by whitespace or start/end of fragment, as
# it might be confused with an ATX heading
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')

# Escape '.' or ')' preceded by up to nine digits, as it might be
# confused with a list item
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')

# Heading styles (values for the 'heading_style' option)
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED  # alias: both names select the same style

# Newline style (values for the 'newline_style' option)
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style (values for the 'strong_em_symbol' option)
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles (values for 'strip_document' and 'strip_pre')
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'


def strip1_pre(text):
    """Strip at most one leading and one trailing blank line from <pre> text.

    A leading line is removed when it holds only spaces. A trailing newline is
    removed together with any spaces that follow it.

    Args:
        text: The text content of a ``<pre>`` element.

    Returns:
        The text with at most one blank line removed from each end.
    """
    text = re_pre_lstrip1.sub('', text, count=1)
    # '$' matches both at the very end and just before a final newline, so an
    # unbounded sub() can match twice on text ending in two newlines and strip
    # both of them. Limit the substitution to a single match.
    text = re_pre_rstrip1.sub('', text, count=1)
    return text


def strip_pre(text):
    """Strip all leading and trailing blank lines from <pre> text.

    Leading spaces/newlines are removed up to and including the last newline
    of the leading run; all trailing spaces/newlines are removed.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)


def chomp(text):
    """
    Split the text of an inline tag (b, a, em, ...) into a
    ``(prefix, suffix, stripped_text)`` tuple.

    ``prefix`` / ``suffix`` are a single space when the text starts / ends
    with a space, and an empty string otherwise; ``stripped_text`` is the text
    with surrounding whitespace removed. Callers put the markup around the
    stripped text and re-attach the spaces outside it, which prevents
    conversions like
        <b> foo</b> => ** foo**
    """
    stripped = text.strip()
    # Slicing (rather than indexing) keeps the empty string safe.
    leading = ' ' if text[:1] == ' ' else ''
    trailing = ' ' if text[-1:] == ' ' else ''
    return (leading, trailing, stripped)


def abstract_inline_conversion(markup_fn):
    """
    This abstracts all simple inline tags like b, em, del, ...
    Returns a function that wraps the chomped text in a pair of the string
    that is returned by markup_fn, with '/' inserted in the string used after
    the text if it looks like an HTML tag. markup_fn is necessary to allow for
    references to self.strong_em_symbol etc.
    """
    def implementation(self, el, text, parent_tags):
        markup_prefix = markup_fn(self)
        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
            markup_suffix = '), ignore adjacent whitespace elements.
                    return True
                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
                    # Outside block elements (including 
), ignore adjacent whitespace elements.
                    return True
                else:
                    return False
            elif el is None:
                return True
            else:
                raise ValueError('Unexpected element type: %s' % type(el))

        children_to_convert = [el for el in node.children if not _can_ignore(el)]

        # Create a copy of this tag's parent context, then update it to include this tag
        # to propagate down into the children.
        parent_tags_for_children = set(parent_tags)
        parent_tags_for_children.add(node.name)

        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
        if (
            re_html_heading.match(node.name) is not None  # headings
            or node.name in {'td', 'th'}  # table cells
        ):
            parent_tags_for_children.add('_inline')

        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
        if node.name in {'pre', 'code', 'kbd', 'samp'}:
            parent_tags_for_children.add('_noformat')

        # Convert the children elements into a list of result strings.
        child_strings = [
            self.process_element(el, parent_tags=parent_tags_for_children)
            for el in children_to_convert
        ]

        # Remove empty string values.
        child_strings = [s for s in child_strings if s]

        # Collapse newlines at child element boundaries, if needed.
        if node.name == 'pre' or node.find_parent('pre'):
            # Inside 
 blocks, do not collapse newlines.
            pass
        else:
            # Collapse newlines at child element boundaries.
            updated_child_strings = ['']  # so the first lookback works
            for child_string in child_strings:
                # Separate the leading/trailing newlines from the content.
                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()

                # If the last child had trailing newlines and this child has leading newlines,
                # use the larger newline count, limited to 2.
                if updated_child_strings[-1] and leading_nl:
                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
                    leading_nl = '\n' * num_newlines

                # Add the results to the updated child string list.
                updated_child_strings.extend([leading_nl, content, trailing_nl])

            child_strings = updated_child_strings

        # Join all child text strings into a single string.
        text = ''.join(child_strings)

        # apply this tag's final conversion function
        convert_fn = self.get_conv_fn_cached(node.name)
        if convert_fn is not None:
            text = convert_fn(node, text, parent_tags=parent_tags)

        return text

    def convert__document_(self, el, text, parent_tags):
        """Apply document-level formatting for the BeautifulSoup root (node.name == "[document]").

        Removes leading and/or trailing newlines from the converted text
        according to the ``strip_document`` option. Raises ``ValueError`` for
        an unsupported option value.
        """
        mode = self.options['strip_document']

        if mode is None:
            # leave leading and trailing separation newlines as-is
            return text
        if mode == LSTRIP:
            return text.lstrip('\n')  # drop leading separation newlines only
        if mode == RSTRIP:
            return text.rstrip('\n')  # drop trailing separation newlines only
        if mode == STRIP:
            return text.strip('\n')  # drop separation newlines on both ends

        raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])

    def process_text(self, el, parent_tags=None):
        """Convert a text node into Markdown text.

        Whitespace is normalized unless a 'pre' parent is present, special
        characters are escaped unless a '_noformat' parent is present, and
        whitespace at block-level boundaries is trimmed.
        """
        # A top-level call has no ancestors: start from an empty context.
        if parent_tags is None:
            parent_tags = set()

        text = six.text_type(el) or ''

        # Outside preformatted content, normalize whitespace.
        if 'pre' not in parent_tags:
            if self.options['wrap']:
                # Wrapping reflows the text, so line breaks become spaces too.
                text = re_all_whitespace.sub(' ', text)
            else:
                # Keep one newline per line-break run, then squeeze the
                # remaining runs of tabs/spaces.
                text = re_whitespace.sub(' ', re_newline_whitespace.sub('\n', text))

        # Outside preformatted/code content, escape Markdown-significant characters.
        if '_noformat' not in parent_tags:
            text = self.escape(text, parent_tags)

        # Trim leading whitespace at the start of, or just after, a
        # block-level element.
        trim_leading = (
            should_remove_whitespace_outside(el.previous_sibling)
            or (should_remove_whitespace_inside(el.parent)
                and not el.previous_sibling)
        )
        if trim_leading:
            text = text.lstrip(' \t\r\n')

        # Trim trailing whitespace at the end of, or just before, a
        # block-level element.
        # NOTE(review): this uses a bare rstrip() (all whitespace) while the
        # leading side strips only ' \t\r\n' -- confirm the asymmetry is intended.
        trim_trailing = (
            should_remove_whitespace_outside(el.next_sibling)
            or (should_remove_whitespace_inside(el.parent)
                and not el.next_sibling)
        )
        if trim_trailing:
            text = text.rstrip()

        return text

    def get_conv_fn_cached(self, tag_name):
        """Return the conversion function for a tag name, memoized per converter.

        The first lookup for a tag name is delegated to ``get_conv_fn`` and its
        result (including ``None``) is stored in ``self.convert_fn_cache``.
        """
        cache = self.convert_fn_cache
        if tag_name in cache:
            return cache[tag_name]

        # Cache miss: resolve once and remember the answer.
        cache[tag_name] = self.get_conv_fn(tag_name)
        return cache[tag_name]

    def get_conv_fn(self, tag_name):
        """Find the conversion function for a tag name, or return None.

        Resolution order: the strip/convert options may exclude the tag; then
        an explicitly defined ``convert_<tag>`` attribute is used; finally any
        ``hN`` heading tag falls back to ``convert_hN``.
        """
        tag_name = tag_name.lower()

        # Tags excluded by the strip/convert options get no conversion function.
        if not self.should_convert_tag(tag_name):
            return None

        # An explicitly defined conversion method for this tag name wins.
        method_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
        explicit_fn = getattr(self, method_name, None)
        if explicit_fn:
            return explicit_fn

        # Any heading tag is routed to convert_hN() with its level bound.
        heading = re_html_heading.match(tag_name)
        if heading is None:
            # No conversion function was found
            return None

        level = int(heading.group(1))  # the N of <hN>

        def convert_heading(el, text, parent_tags):
            return self.convert_hN(level, el, text, parent_tags)

        return convert_heading

    def should_convert_tag(self, tag):
        """Decide from the strip/convert options whether a tag name is converted.

        A non-None ``strip`` option takes precedence: the tag is converted
        unless it is listed. Otherwise a non-None ``convert`` option acts as an
        allow-list. With neither option set, every tag is converted.
        """
        excluded = self.options['strip']
        included = self.options['convert']

        if excluded is not None:
            return tag not in excluded
        if included is not None:
            return tag in included
        return True

    def escape(self, text, parent_tags):
        """Escape Markdown-significant characters in text.

        Which escapes are applied is controlled by the ``escape_misc``,
        ``escape_asterisks`` and ``escape_underscores`` options. Empty or
        missing text yields an empty string.
        """
        if not text:
            return ''

        options = self.options

        if options['escape_misc']:
            # Applied in this order: single characters first, then the
            # context-sensitive dash, hash and list-item sequences.
            misc_rules = (
                (re_escape_misc_chars, r'\\\1'),
                (re_escape_misc_dash_sequences, r'\1\\\2'),
                (re_escape_misc_hashes, r'\1\\\2'),
                (re_escape_misc_list_items, r'\1\\\2'),
            )
            for pattern, replacement in misc_rules:
                text = pattern.sub(replacement, text)

        if options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if options['escape_underscores']:
            text = text.replace('_', r'\_')

        return text

    def underline(self, text, pad_char):
        """Format text as an underlined heading block.

        The text is right-stripped and followed by a line of ``pad_char``
        repeated to the same length, with blank lines around the block.
        Empty or missing text yields an empty string.
        """
        heading = (text or '').rstrip()
        if not heading:
            return ''
        rule = pad_char * len(heading)
        return '\n\n%s\n%s\n\n' % (heading, rule)

    def convert_a(self, el, text, parent_tags):
        """Convert an ``<a>`` element to a Markdown link.

        Args:
            el: The anchor element; its ``href`` and ``title`` attributes are read.
            text: The already-converted text of the element's children.
            parent_tags: Names of the enclosing tags and pseudo-tags.

        Returns:
            - ``text`` unchanged inside a '_noformat' context;
            - an empty string when the link text is empty after stripping;
            - the plain link text when the element has no ``href``;
            - ``<href>`` when the ``autolinks`` option applies (the text equals
              the href, there is no title, and ``default_title`` is off);
            - otherwise ``[text](href "title")``.

            Whitespace stripped from the edges of the link text is re-attached
            outside the markup in every non-empty case, so the link never gets
            glued to neighbouring words.
        """
        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')

        if not href:
            # No link target: emit only the text, but keep its surrounding
            # spaces (previously they were dropped here).
            return '%s%s%s' % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax; keep the chomped whitespace around it just like
            # the regular link form below does.
            return '%s<%s>%s' % (prefix, href, suffix)

        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix)

    # <b>: the markup string is the strong/em symbol doubled
    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, parent_tags):
        """Convert a ``<blockquote>`` element to '>'-prefixed lines.

        In an '_inline' context the stripped text is returned padded with one
        space on each side. Empty content yields a single newline.
        """
        body = (text or '').strip(' \t\r\n')

        # early exits: inline context, or nothing to quote
        if '_inline' in parent_tags:
            return ' ' + body + ' '
        if not body:
            return "\n"

        # Prefix each line with the blockquote marker; empty lines get a bare
        # '>' so no trailing space is emitted.
        quoted = re_line_with_content.sub(
            lambda m: '> ' + m.group(1) if m.group(1) else '>',
            body,
        )

        return '\n' + quoted + '\n\n'

    def convert_br(self, el, text, parent_tags):
        """Convert a ``<br>`` element to a Markdown line break.

        In an '_inline' context a single space is returned. Otherwise the
        ``newline_style`` option selects a backslash-newline or the
        two-spaces-newline form.
        """
        if '_inline' in parent_tags:
            return ' '

        use_backslash = self.options['newline_style'].lower() == BACKSLASH
        return '\\\n' if use_backslash else '  \n'

    def convert_code(self, el, text, parent_tags):
        """Convert an inline code element to a backtick-delimited code span.

        Inside a '_noformat' context the text is returned unchanged. Text that
        is empty after stripping yields an empty string.
        """
        if '_noformat' in parent_tags:
            return text

        prefix, suffix, text = chomp(text)
        if not text:
            return ''

        # The delimiter must be one backtick longer than the longest backtick
        # run that occurs inside the text.
        longest_run = max(map(len, re_backtick_runs.findall(text)), default=0)
        delimiter = '`' * (longest_run + 1)

        # When the text itself contains backticks, pad it with spaces so its
        # backticks cannot merge with the delimiter.
        body = " " + text + " " if longest_run > 0 else text

        return ''.join((prefix, delimiter, body, delimiter, suffix))

    # <del>: the markup string is always '~~', independent of options
    convert_del = abstract_inline_conversion(lambda self: '~~')

    def convert_div(self, el, text, parent_tags):
        """Convert a ``<div>``-like container to a paragraph-separated block.

        In an '_inline' context the stripped text is padded with one space on
        each side. Otherwise the stripped text is surrounded by blank lines,
        or an empty string is returned when nothing is left.
        """
        stripped = text.strip()
        if '_inline' in parent_tags:
            return ' ' + stripped + ' '
        if not stripped:
            return ''
        return '\n\n%s\n\n' % stripped

    # <article> is converted with the same function as <div>
    convert_article = convert_div

    # <section> is converted with the same function as <div>
    convert_section = convert_div

    # <em>: the markup string is a single strong/em symbol
    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    # <kbd> is converted with the same function as <code>
    convert_kbd = convert_code

    def convert_dd(self, el, text, parent_tags):
        """Convert a ``<dd>`` element to a ':'-marked, indented definition.

        In an '_inline' context the stripped text is padded with one space on
        each side. Empty content yields a single newline.
        """
        content = (text or '').strip()
        if '_inline' in parent_tags:
            return ' ' + content + ' '
        if not content:
            return '\n'

        # Indent every non-empty line by four spaces; empty lines stay empty.
        indented = re_line_with_content.sub(
            lambda m: '    ' + m.group(1) if m.group(1) else '',
            content,
        )

        # Put the definition marker into the first column of the first line's
        # indent.
        return ':' + indented[1:] + '\n'

    # definition lists are formatted as follows:
    #   https://pandoc.org/MANUAL.html#definition-lists
    #   https://michelf.ca/projects/php-markdown/extra/#def-list
    # The <dl> container itself is converted with the same function as <div>.
    convert_dl = convert_div

    def convert_dt(self, el, text, parent_tags):
        # remove newlines from term text
        text = (text or '').strip()
        text = re_all_whitespace.sub(' ', text)
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # TODO - format consecutive 
elements as directly adjacent lines): # https://michelf.ca/projects/php-markdown/extra/#def-list return '\n\n%s\n' % text def convert_hN(self, n, el, text, parent_tags): # convert_hN() converts tags, where N is any integer if '_inline' in parent_tags: return text # Markdown does not support heading depths of n > 6 n = max(1, min(6, n)) style = self.options['heading_style'].lower() text = text.strip() if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) text = re_all_whitespace.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: return '\n\n%s %s %s\n\n' % (hashes, text, hashes) return '\n\n%s %s\n\n' % (hashes, text) def convert_hr(self, el, text, parent_tags): return '\n\n---\n\n' convert_i = convert_em def convert_img(self, el, text, parent_tags): alt = el.attrs.get('alt', None) or '' src = el.attrs.get('src', None) or '' title = el.attrs.get('title', None) or '' title_part = ' "%s"' % title.replace('"', r'\"') if title else '' if ('_inline' in parent_tags and el.parent.name not in self.options['keep_inline_images_in']): return alt return '![%s](%s%s)' % (alt, src, title_part) def convert_video(self, el, text, parent_tags): if ('_inline' in parent_tags and el.parent.name not in self.options['keep_inline_images_in']): return text src = el.attrs.get('src', None) or '' if not src: sources = el.find_all('source', attrs={'src': True}) if sources: src = sources[0].attrs.get('src', None) or '' poster = el.attrs.get('poster', None) or '' if src and poster: return '[![%s](%s)](%s)' % (text, poster, src) if src: return '[%s](%s)' % (text, src) if poster: return '![%s](%s)' % (text, poster) return text def convert_list(self, el, text, parent_tags): # Converting a list to inline is undefined. # Ignoring inline conversion parents for list. 
before_paragraph = False next_sibling = _next_block_content_sibling(el) if next_sibling and next_sibling.name not in ['ul', 'ol']: before_paragraph = True if 'li' in parent_tags: # remove trailing newline if we're in a nested list return '\n' + text.rstrip() return '\n\n' + text + ('\n' if before_paragraph else '') convert_ul = convert_list convert_ol = convert_list def convert_li(self, el, text, parent_tags): # handle some early-exit scenarios text = (text or '').strip() if not text: return "\n" # determine list item bullet character to use parent = el.parent if parent is not None and parent.name == 'ol': if parent.get("start") and str(parent.get("start")).isnumeric(): start = int(parent.get("start")) else: start = 1 bullet = '%s.' % (start + len(el.find_previous_siblings('li'))) else: depth = -1 while el: if el.name == 'ul': depth += 1 el = el.parent bullets = self.options['bullets'] bullet = bullets[depth % len(bullets)] bullet = bullet + ' ' bullet_width = len(bullet) bullet_indent = ' ' * bullet_width # indent content lines by bullet width def _indent_for_li(match): line_content = match.group(1) return bullet_indent + line_content if line_content else '' text = re_line_with_content.sub(_indent_for_li, text) # insert bullet into first-line indent whitespace text = bullet + text[bullet_width:] return '%s\n' % text def convert_p(self, el, text, parent_tags): if '_inline' in parent_tags: return ' ' + text.strip(' \t\r\n') + ' ' text = text.strip(' \t\r\n') if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been # replaced by spaces. if self.options['wrap_width'] is not None: lines = text.split('\n') new_lines = [] for line in lines: line = line.lstrip(' \t\r\n') line_no_trailing = line.rstrip() trailing = line[len(line_no_trailing):] line = fill(line, width=self.options['wrap_width'], break_long_words=False, break_on_hyphens=False) new_lines.append(line + trailing) text = '\n'.join(new_lines) return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, parent_tags): if not text: return '' code_language = self.options['code_language'] if self.options['code_language_callback']: code_language = self.options['code_language_callback'](el) or code_language if self.options['strip_pre'] == STRIP: text = strip_pre(text) # remove all leading/trailing newlines elif self.options['strip_pre'] == STRIP_ONE: text = strip1_pre(text) # remove one leading/trailing newline elif self.options['strip_pre'] is None: pass # leave leading and trailing newlines as-is else: raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre']) return '\n\n```%s\n%s\n```\n\n' % (code_language, text) def convert_q(self, el, text, parent_tags): return '"' + text + '"' def convert_script(self, el, text, parent_tags): return '' def convert_style(self, el, text, parent_tags): return '' convert_s = convert_del convert_strong = convert_b convert_samp = convert_code convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol']) convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) def convert_table(self, el, text, parent_tags): return '\n\n' + text.strip() + '\n\n' def convert_caption(self, el, text, parent_tags): return text.strip() + '\n\n' def convert_figcaption(self, el, text, parent_tags): return '\n\n' + text.strip() + '\n\n' def convert_td(self, el, text, parent_tags): colspan = 1 if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = max(1, min(1000, int(el['colspan']))) 
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_th(self, el, text, parent_tags): colspan = 1 if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = max(1, min(1000, int(el['colspan']))) return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_tr(self, el, text, parent_tags): cells = el.find_all(['td', 'th']) is_first_row = el.find_previous_sibling() is None is_headrow = ( all([cell.name == 'th' for cell in cells]) or (el.parent.name == 'thead' # avoid multiple tr in thead and len(el.parent.find_all('tr')) == 1) ) is_head_row_missing = ( (is_first_row and not el.parent.name == 'tbody') or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) ) overline = '' underline = '' full_colspan = 0 for cell in cells: if 'colspan' in cell.attrs and cell['colspan'].isdigit(): full_colspan += max(1, min(1000, int(cell['colspan']))) else: full_colspan += 1 if ((is_headrow or (is_head_row_missing and self.options['table_infer_header'])) and is_first_row): # first row and: # - is headline or # - headline is missing and header inference is enabled # print headline underline underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' elif ((is_head_row_missing and not self.options['table_infer_header']) or (is_first_row and (el.parent.name == 'table' or (el.parent.name == 'tbody' and not el.parent.find_previous_sibling())))): # headline is missing and header inference is disabled or: # first row, not headline, and: # - the parent is table or # - the parent is tbody at the beginning of a table. 
# print empty headline above this row overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n' overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' return overline + '|' + text + '\n' + underline def markdownify(html, **options): return MarkdownConverter(**options).convert(html) python-markdownify-1.2.2/markdownify/__init__.pyi000066400000000000000000000040611510642172600222060ustar00rootroot00000000000000from _typeshed import Incomplete from typing import Callable, Union ATX: str ATX_CLOSED: str UNDERLINED: str SETEXT = UNDERLINED SPACES: str BACKSLASH: str ASTERISK: str UNDERSCORE: str LSTRIP: str RSTRIP: str STRIP: str STRIP_ONE: str def markdownify( html: str, autolinks: bool = ..., bs4_options: str = ..., bullets: str = ..., code_language: str = ..., code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ..., convert: Union[list[str], None] = ..., default_title: bool = ..., escape_asterisks: bool = ..., escape_underscores: bool = ..., escape_misc: bool = ..., heading_style: str = ..., keep_inline_images_in: list[str] = ..., newline_style: str = ..., strip: Union[list[str], None] = ..., strip_document: Union[str, None] = ..., strip_pre: str = ..., strong_em_symbol: str = ..., sub_symbol: str = ..., sup_symbol: str = ..., table_infer_header: bool = ..., wrap: bool = ..., wrap_width: int = ..., ) -> str: ... 
class MarkdownConverter: def __init__( self, autolinks: bool = ..., bs4_options: str = ..., bullets: str = ..., code_language: str = ..., code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ..., convert: Union[list[str], None] = ..., default_title: bool = ..., escape_asterisks: bool = ..., escape_underscores: bool = ..., escape_misc: bool = ..., heading_style: str = ..., keep_inline_images_in: list[str] = ..., newline_style: str = ..., strip: Union[list[str], None] = ..., strip_document: Union[str, None] = ..., strip_pre: str = ..., strong_em_symbol: str = ..., sub_symbol: str = ..., sup_symbol: str = ..., table_infer_header: bool = ..., wrap: bool = ..., wrap_width: int = ..., ) -> None: ... def convert(self, html: str) -> str: ... def convert_soup(self, soup: Incomplete) -> str: ... python-markdownify-1.2.2/markdownify/main.py000077500000000000000000000106621510642172600212310ustar00rootroot00000000000000#!/usr/bin/env python import argparse import sys from markdownify import markdownify, ATX, ATX_CLOSED, UNDERLINED, \ SPACES, BACKSLASH, ASTERISK, UNDERSCORE def main(argv=sys.argv[1:]): parser = argparse.ArgumentParser( prog='markdownify', description='Converts html to markdown.', ) parser.add_argument('html', nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="The html file to convert. Defaults to STDIN if not " "provided.") parser.add_argument('-s', '--strip', nargs='*', help="A list of tags to strip. This option can't be used with " "the --convert option.") parser.add_argument('-c', '--convert', nargs='*', help="A list of tags to convert. 
This option can't be used with " "the --strip option.") parser.add_argument('-a', '--autolinks', action='store_true', help="A boolean indicating whether the 'automatic link' style " "should be used when a 'a' tag's contents match its href.") parser.add_argument('--default-title', action='store_false', help="A boolean to enable setting the title of a link to its " "href, if no title is given.") parser.add_argument('--heading-style', default=UNDERLINED, choices=(ATX, ATX_CLOSED, UNDERLINED), help="Defines how headings should be converted.") parser.add_argument('-b', '--bullets', default='*+-', help="A string of bullet styles to use; the bullet will " "alternate based on nesting level.") parser.add_argument('--strong-em-symbol', default=ASTERISK, choices=(ASTERISK, UNDERSCORE), help="Use * or _ to convert strong and italics text"), parser.add_argument('--sub-symbol', default='', help="Define the chars that surround ''.") parser.add_argument('--sup-symbol', default='', help="Define the chars that surround ''.") parser.add_argument('--newline-style', default=SPACES, choices=(SPACES, BACKSLASH), help="Defines the style of
conversions: two spaces " "or backslash at the and of the line thet should break.") parser.add_argument('--code-language', default='', help="Defines the language that should be assumed for all " "'
' sections.")
    parser.add_argument('--no-escape-asterisks', dest='escape_asterisks',
                        action='store_false',
                        help="Do not escape '*' to '\\*' in text.")
    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                        action='store_false',
                        help="Do not escape '_' to '\\_' in text.")
    parser.add_argument('-i', '--keep-inline-images-in',
                        default=[],
                        nargs='*',
                        help="Images are converted to their alt-text when the images are "
                        "located inside headlines or table cells. If some inline images "
                        "should be converted to markdown images instead, this option can "
                        "be set to a list of parent tags that should be allowed to "
                        "contain inline images.")
    parser.add_argument('--table-infer-header', dest='table_infer_header',
                        action='store_true',
                        help="When a table has no header row (as indicated by '' "
                        "or ''), use the first body row as the header row.")
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
    parser.add_argument('--bs4-options',
                        default='html.parser',
                        help="Specifies the parser that BeautifulSoup should use to parse "
                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
                             "'html5lib'.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))


if __name__ == '__main__':
    main()
python-markdownify-1.2.2/markdownify/py.typed000066400000000000000000000000011510642172600214110ustar00rootroot00000000000000
python-markdownify-1.2.2/pyproject.toml000066400000000000000000000023731510642172600203120ustar00rootroot00000000000000[build-system]
requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"]
build-backend = "setuptools.build_meta"

[project]
name = "markdownify"
version = "1.2.2"
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
description = "Convert HTML to markdown."
readme = "README.rst"
classifiers = [
    "Environment :: Web Environment",
    "Framework :: Django",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 2.5",
    "Programming Language :: Python :: 2.6",
    "Programming Language :: Python :: 2.7",
    "Programming Language :: Python :: 3.6",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Topic :: Utilities",
]
dependencies = [
    "beautifulsoup4>=4.9,<5",
    "six>=1.15,<2"
]

[project.urls]
Homepage = "http://github.com/matthewwithanm/python-markdownify"
Download = "http://github.com/matthewwithanm/python-markdownify/tarball/master"

[project.scripts]
markdownify = "markdownify.main:main"

[tool.setuptools]
zip-safe = false
include-package-data = true

[tool.setuptools.packages.find]
include = ["markdownify", "markdownify.*"]
namespaces = false

[tool.setuptools_scm]
python-markdownify-1.2.2/shell.nix000066400000000000000000000003221510642172600172150ustar00rootroot00000000000000{ pkgs ? import  {} }:
pkgs.mkShell {
  name = "python-shell";
  buildInputs = with pkgs; [
    python38
    python38Packages.tox
    python38Packages.setuptools
    python38Packages.virtualenv
  ];
}
python-markdownify-1.2.2/tests/000077500000000000000000000000001510642172600165335ustar00rootroot00000000000000python-markdownify-1.2.2/tests/__init__.py000066400000000000000000000000001510642172600206320ustar00rootroot00000000000000python-markdownify-1.2.2/tests/test_advanced.py000066400000000000000000000023411510642172600217110ustar00rootroot00000000000000from .utils import md


def test_chomp():
    """Whitespace just inside a bold tag ends up outside the ``**`` markers.

    NOTE(review): the ``<b>`` markup in the inputs was lost in extraction and
    has been restored from the expected outputs; confirm against upstream.
    """
    assert md(' <b></b> ') == '  '
    assert md(' <b> </b> ') == '  '
    assert md(' <b>  </b> ') == '  '
    assert md(' <b>   </b> ') == '  '
    assert md(' <b>s </b> ') == ' **s**  '
    assert md(' <b> s</b> ') == '  **s** '
    assert md(' <b> s </b> ') == '  **s**  '
    assert md(' <b>  s  </b> ') == '  **s**  '


def test_nested():
    text = md('

This is an example link.

') assert text == '\n\nThis is an [example link](http://example.com/).\n\n' def test_ignore_comments(): text = md("") assert text == "" def test_ignore_comments_with_other_tags(): text = md("example link") assert text == "[example link](http://example.com/)" def test_code_with_tricky_content(): assert md('>') == "`>`" assert md('/home/username') == "`/home/`**username**" assert md('First line blah blah
blah blah
second line') \ == "First line `blah blah \nblah blah` second line" def test_special_tags(): assert md('') == '' assert md('') == 'foobar' python-markdownify-1.2.2/tests/test_args.py000066400000000000000000000036031510642172600211020ustar00rootroot00000000000000""" Test whitelisting/blacklisting of specific tags. """ from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE from .utils import md def test_strip(): text = md('Some Text', strip=['a']) assert text == 'Some Text' def test_do_not_strip(): text = md('Some Text', strip=[]) assert text == '[Some Text](https://github.com/matthewwithanm)' def test_convert(): text = md('Some Text', convert=['a']) assert text == '[Some Text](https://github.com/matthewwithanm)' def test_do_not_convert(): text = md('Some Text', convert=[]) assert text == 'Some Text' def test_strip_document(): assert markdownify("

Hello

") == "Hello" # test default of STRIP assert markdownify("

Hello

", strip_document=LSTRIP) == "Hello\n\n" assert markdownify("

Hello

", strip_document=RSTRIP) == "\n\nHello" assert markdownify("

Hello

", strip_document=STRIP) == "Hello" assert markdownify("

Hello

", strip_document=None) == "\n\nHello\n\n" def test_strip_pre(): assert markdownify("
  \n  \n  Hello  \n  \n  
") == "```\n Hello\n```" assert markdownify("
  \n  \n  Hello  \n  \n  
", strip_pre=STRIP) == "```\n Hello\n```" assert markdownify("
  \n  \n  Hello  \n  \n  
", strip_pre=STRIP_ONE) == "```\n \n Hello \n \n```" assert markdownify("
  \n  \n  Hello  \n  \n  
", strip_pre=None) == "```\n \n \n Hello \n \n \n```" def bs4_options(): assert markdownify("

Hello

", bs4_options="html.parser") == "Hello" assert markdownify("

Hello

", bs4_options=["html.parser"]) == "Hello" assert markdownify("

Hello

", bs4_options={"features": "html.parser"}) == "Hello" python-markdownify-1.2.2/tests/test_basic.py000066400000000000000000000004451510642172600212300ustar00rootroot00000000000000from .utils import md def test_single_tag(): assert md('Hello') == 'Hello' def test_soup(): assert md('
Hello
') == '\n\nHello\n\n' def test_whitespace(): assert md(' a b \t\t c ') == ' a b c ' assert md(' a b \n\n c ') == ' a b\nc ' python-markdownify-1.2.2/tests/test_conversions.py000066400000000000000000000454411510642172600225240ustar00rootroot00000000000000from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE from .utils import md def inline_tests(tag, markup): # test template for different inline tags assert md(f'<{tag}>Hello') == f'{markup}Hello{markup}' assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' assert md(f'foo<{tag}> Hello bar') == f'foo {markup}Hello{markup} bar' assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' assert md(f'foo <{tag}> bar') in ['foo bar', 'foo bar'] # Either is OK def test_a(): assert md('Google') == '[Google](https://google.com)' assert md('https://google.com') == '' assert md('https://community.kde.org/Get_Involved') == '' assert md('https://community.kde.org/Get_Involved', autolinks=False) == '[https://community.kde.org/Get\\_Involved](https://community.kde.org/Get_Involved)' def test_a_spaces(): assert md('foo Google bar') == 'foo [Google](http://google.com) bar' assert md('foo Google bar') == 'foo [Google](http://google.com) bar' assert md('foo Google bar') == 'foo [Google](http://google.com) bar' assert md('foo bar') == 'foo bar' def test_a_with_title(): text = md('Google') assert text == r'[Google](http://google.com "The \"Goog\"")' assert md('https://google.com', default_title=True) == '[https://google.com](https://google.com "https://google.com")' def test_a_shortcut(): text = md('http://google.com') assert text == '' def test_a_no_autolinks(): assert md('https://google.com', autolinks=False) == '[https://google.com](https://google.com)' def test_a_in_code(): assert md('Google') == '`Google`' assert md('
Google
') == '\n\n```\nGoogle\n```\n\n' def test_b(): assert md('Hello') == '**Hello**' def test_b_spaces(): assert md('foo Hello bar') == 'foo **Hello** bar' assert md('foo Hello bar') == 'foo **Hello** bar' assert md('foo Hello bar') == 'foo **Hello** bar' assert md('foo bar') == 'foo bar' def test_blockquote(): assert md('
Hello
') == '\n> Hello\n\n' assert md('
\nHello\n
') == '\n> Hello\n\n' assert md('
 Hello
') == '\n> \u00a0Hello\n\n' def test_blockquote_with_nested_paragraph(): assert md('

Hello

') == '\n> Hello\n\n' assert md('

Hello

Hello again

') == '\n> Hello\n>\n> Hello again\n\n' def test_blockquote_with_paragraph(): assert md('
Hello

handsome

') == '\n> Hello\n\nhandsome\n\n' def test_blockquote_nested(): text = md('
And she was like
Hello
') assert text == '\n> And she was like\n> > Hello\n\n' def test_br(): assert md('a
b
c') == 'a \nb \nc' assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' assert md('

foo
bar

', heading_style=ATX) == '\n\n# foo bar\n\n' assert md('foo
bar', heading_style=ATX) == ' foo bar |' def test_code(): inline_tests('code', '`') assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' assert md('this should\t\tnormalize') == '`this should normalize`' assert md('this should\t\tnormalize') == '`this should normalize`' assert md('foobarbaz') == '`foobarbaz`' assert md('foobarbaz') == '`foobarbaz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foo bar baz') == '`foo bar baz`' assert md('foobarbaz', sup_symbol='^') == '`foobarbaz`' assert md('foobarbaz', sub_symbol='^') == '`foobarbaz`' assert md('foo`bar`baz') == 'foo`` `bar` ``baz' assert md('foo``bar``baz') == 'foo``` ``bar`` ```baz' assert md('foo `bar` baz') == 'foo `` `bar` `` baz' def test_dl(): assert md('
term
definition
') == '\n\nterm\n: definition\n\n' assert md('

te

rm

definition
') == '\n\nte rm\n: definition\n\n' assert md('
term

definition-p1

definition-p2

') == '\n\nterm\n: definition-p1\n\n definition-p2\n\n' assert md('
term

definition 1

definition 2

') == '\n\nterm\n: definition 1\n: definition 2\n\n' assert md('
term 1
definition 1
term 2
definition 2
') == '\n\nterm 1\n: definition 1\n\nterm 2\n: definition 2\n\n' assert md('
term

line 1

line 2

') == '\n\nterm\n: > line 1\n >\n > line 2\n\n' assert md('
term
  1. 1

    • 2a
    • 2b
  2. 3

') == '\n\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n\n' def test_del(): inline_tests('del', '~~') def test_div_section_article(): for tag in ['div', 'section', 'article']: assert md(f'<{tag}>456') == '\n\n456\n\n' assert md(f'123<{tag}>456789') == '123\n\n456\n\n789' assert md(f'123<{tag}>\n 456 \n789') == '123\n\n456\n\n789' assert md(f'123<{tag}>

456

789') == '123\n\n456\n\n789' assert md(f'123<{tag}>\n

456

\n789') == '123\n\n456\n\n789' assert md(f'123<{tag}>
4 5 6
789') == '123\n\n```\n4 5 6\n```\n\n789' assert md(f'123<{tag}>\n
4 5 6
\n789') == '123\n\n```\n4 5 6\n```\n\n789' assert md(f'123<{tag}>4\n5\n6789') == '123\n\n4\n5\n6\n\n789' assert md(f'123<{tag}>\n4\n5\n6\n789') == '123\n\n4\n5\n6\n\n789' assert md(f'123<{tag}>\n

\n4\n5\n6\n

\n789') == '123\n\n4\n5\n6\n\n789' assert md(f'<{tag}>

title

body', heading_style=ATX) == '\n\n# title\n\nbody\n\n' def test_em(): inline_tests('em', '*') def test_figcaption(): assert (md("TEXT
\nCaption\n
SPAN
") == "TEXT\n\nCaption\n\nSPAN") assert (md("
SPAN
\nCaption\n
TEXT") == "SPAN\n\nCaption\n\nTEXT") def test_header_with_space(): assert md('

\n\nHello

') == '\n\n### Hello\n\n' assert md('

Hello\n\n\nWorld

') == '\n\n### Hello World\n\n' assert md('

\n\nHello

') == '\n\n#### Hello\n\n' assert md('
\n\nHello
') == '\n\n##### Hello\n\n' assert md('
\n\nHello\n\n
') == '\n\n##### Hello\n\n' assert md('
\n\nHello \n\n
') == '\n\n##### Hello\n\n' def test_h1(): assert md('

Hello

') == '\n\nHello\n=====\n\n' def test_h2(): assert md('

Hello

') == '\n\nHello\n-----\n\n' def test_hn(): assert md('

Hello

') == '\n\n### Hello\n\n' assert md('

Hello

') == '\n\n#### Hello\n\n' assert md('
Hello
') == '\n\n##### Hello\n\n' assert md('
Hello
') == '\n\n###### Hello\n\n' assert md('Hello') == md('
Hello
') assert md('Hello') == md('

Hello

') assert md('Hello') == md('Hello') def test_hn_chained(): assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n\n# First\n\n## Second\n\n### Third\n\n' assert md('X

First

', heading_style=ATX) == 'X\n\n# First\n\n' assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n\n# First #\n\n' assert md('X

First

') == 'X\n\nFirst\n=====\n\n' def test_hn_nested_tag_heading_style(): assert md('

A

P

C

', heading_style=ATX_CLOSED) == '\n\n# A P C #\n\n' assert md('

A

P

C

', heading_style=ATX) == '\n\n# A P C\n\n' def test_hn_nested_simple_tag(): tag_to_markdown = [ ("strong", "**strong**"), ("b", "**b**"), ("em", "*em*"), ("i", "*i*"), ("p", "p"), ("a", "a"), ("div", "div"), ("blockquote", "blockquote"), ] for tag, markdown in tag_to_markdown: assert md('

A <' + tag + '>' + tag + ' B

') == '\n\n### A ' + markdown + ' B\n\n' assert md('

A
B

', heading_style=ATX) == '\n\n### A B\n\n' # Nested lists not supported # assert md('

A
  • li1
  • l2

', heading_style=ATX) == '\n### A li1 li2 B\n\n' def test_hn_nested_img(): image_attributes_to_markdown = [ ("", "", ""), ("alt='Alt Text'", "Alt Text", ""), ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: assert md('

A B

') == '\n\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' assert md('

A B

', keep_inline_images_in=['h3']) == '\n\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' def test_hn_atx_headings(): assert md('

Hello

', heading_style=ATX) == '\n\n# Hello\n\n' assert md('

Hello

', heading_style=ATX) == '\n\n## Hello\n\n' def test_hn_atx_closed_headings(): assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n# Hello #\n\n' assert md('

Hello

', heading_style=ATX_CLOSED) == '\n\n## Hello ##\n\n' def test_hn_newlines(): assert md("

H1-1

TEXT

H2-2

TEXT

H1-2

TEXT", heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT' assert md('

H1-1

\n

TEXT

\n

H2-2

\n

TEXT

\n

H1-2

\n

TEXT

', heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n' def test_head(): assert md('head') == 'head' def test_hr(): assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n---\n\nWorld\n\n' def test_i(): assert md('Hello') == '*Hello*' def test_img(): assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' assert md('Alt text') == '![Alt text](/path/to/img.jpg)' def test_video(): assert md('') == '[![text](/path/to/img.jpg)](/path/to/video.mp4)' assert md('') == '[text](/path/to/video.mp4)' assert md('') == '[text](/path/to/video.mp4)' assert md('') == '![text](/path/to/img.jpg)' assert md('') == 'text' def test_kbd(): inline_tests('kbd', '`') def test_p(): assert md('

hello

') == '\n\nhello\n\n' assert md("

hello

") == "\n\nhello\n\n" assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=None) == '\n\n123456789 123456789\n\n' assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' assert md('

 x y

', wrap=True, wrap_width=80) == '\n\n\u00a0x y\n\n' def test_pre(): assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' assert md('
test\n    foo\nbar
') == '\n\n```\ntest\n foo\nbar\n```\n\n' assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' assert md('
*this_should_not_escape*
') == '\n\n```\n*this_should_not_escape*\n```\n\n' assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' assert md('
\t\tthis  should\t\tnot  normalize
') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbaz
') == '\n\n```\nfoo\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
', sup_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' assert md('foo
bar
baz', sub_symbol='^') == 'foo\n\n```\nbar\n```\n\nbaz' assert md("

foo

\n
bar
\n

baz

", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz" def test_q(): assert md('foo quote bar') == 'foo "quote" bar' assert md('foo quote bar') == 'foo "quote" bar' def test_script(): assert md('foo bar') == 'foo bar' def test_style(): assert md('foo bar') == 'foo bar' def test_s(): inline_tests('s', '~~') def test_samp(): inline_tests('samp', '`') def test_strong(): assert md('Hello') == '**Hello**' def test_strong_em_symbol(): assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' def test_sub(): assert md('foo') == 'foo' assert md('foo', sub_symbol='~') == '~foo~' assert md('foo', sub_symbol='') == 'foo' def test_sup(): assert md('foo') == 'foo' assert md('foo', sup_symbol='^') == '^foo^' assert md('foo', sup_symbol='') == 'foo' def test_lang(): assert md('
test\n    foo\nbar
', code_language='python') == '\n\n```python\ntest\n foo\nbar\n```\n\n' assert md('
test\n    foo\nbar
', code_language='javascript') == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' def test_lang_callback(): def callback(el): return el['class'][0] if el.has_attr('class') else None assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```python\ntest\n foo\nbar\n```\n\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' assert md('
test\n    foo\nbar
', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' def test_spaces(): assert md('

a b

c d

') == '\n\na b\n\nc d\n\n' assert md('

a

') == '\n\n*a*\n\n' assert md('test

again

') == 'test\n\nagain\n\n' assert md('test
text
after') == 'test\n> text\n\nafter' assert md('
  1. x
  2. y
') == '\n\n1. x\n2. y\n' assert md('
  • x
  • y
  • ') == '\n\n* x\n* y\n' assert md('test
     foo 
    bar') == 'test\n\n```\n foo\n```\n\nbar' python-markdownify-1.2.2/tests/test_custom_converter.py000066400000000000000000000031631510642172600235500ustar00rootroot00000000000000from markdownify import MarkdownConverter from bs4 import BeautifulSoup class UnitTestConverter(MarkdownConverter): """ Create a custom MarkdownConverter for unit tests """ def convert_img(self, el, text, parent_tags): """Add two newlines after an image""" return super().convert_img(el, text, parent_tags) + '\n\n' def convert_custom_tag(self, el, text, parent_tags): """Ensure conversion function is found for tags with special characters in name""" return "convert_custom_tag(): %s" % text def convert_h1(self, el, text, parent_tags): """Ensure explicit heading conversion function is used""" return "convert_h1: %s" % (text) def convert_hN(self, n, el, text, parent_tags): """Ensure general heading conversion function is used""" return "convert_hN(%d): %s" % (n, text) def test_custom_conversion_functions(): # Create shorthand method for conversion def md(html, **options): return UnitTestConverter(**options).convert(html) assert md('Alt texttext') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext' assert md('Alt texttext') == '![Alt text](/path/to/img.jpg)\n\ntext' assert md("text") == "convert_custom_tag(): text" assert md("

    text

    ") == "convert_h1: text" assert md("

    text

    ") == "convert_hN(3): text" def test_soup(): html = 'test' soup = BeautifulSoup(html, 'html.parser') assert MarkdownConverter().convert_soup(soup) == '**test**' python-markdownify-1.2.2/tests/test_escaping.py000066400000000000000000000065361510642172600217470ustar00rootroot00000000000000import warnings from bs4 import MarkupResemblesLocatorWarning from .utils import md def test_asterisks(): assert md('*hey*dude*') == r'\*hey\*dude\*' assert md('*hey*dude*', escape_asterisks=False) == r'*hey*dude*' def test_underscore(): assert md('_hey_dude_') == r'\_hey\_dude\_' assert md('_hey_dude_', escape_underscores=False) == r'_hey_dude_' def test_xml_entities(): assert md('&', escape_misc=True) == r'\&' def test_named_entities(): assert md('»') == u'\xbb' def test_hexadecimal_entities(): # This looks to be a bug in BeautifulSoup (fixed in bs4) that we have to work around. assert md(''') == '\x27' def test_single_escaping_entities(): assert md('&amp;', escape_misc=True) == r'\&' def test_misc(): # ignore the bs4 warning that "1.2" or "*" looks like a filename warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) assert md('\\*', escape_misc=True) == r'\\\*' assert md('<foo>', escape_misc=True) == r'\' assert md('# foo', escape_misc=True) == r'\# foo' assert md('#5', escape_misc=True) == r'#5' assert md('5#', escape_misc=True) == '5#' assert md('####### foo', escape_misc=True) == r'####### foo' assert md('> foo', escape_misc=True) == r'\> foo' assert md('~~foo~~', escape_misc=True) == r'\~\~foo\~\~' assert md('foo\n===\n', escape_misc=True) == 'foo\n\\=\\=\\=\n' assert md('---\n', escape_misc=True) == '\\---\n' assert md('- test', escape_misc=True) == r'\- test' assert md('x - y', escape_misc=True) == r'x \- y' assert md('test-case', escape_misc=True) == 'test-case' assert md('x-', escape_misc=True) == 'x-' assert md('-y', escape_misc=True) == '-y' assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n' assert md('`x`', escape_misc=True) == 
r'\`x\`' assert md('[text](notalink)', escape_misc=True) == r'\[text\](notalink)' assert md('text]', escape_misc=True) == r'[text\]](link)' assert md('[text]', escape_misc=True) == r'[\[text\]](link)' assert md('1. x', escape_misc=True) == r'1\. x' # assert md('1. x', escape_misc=True) == r'1\. x' assert md('1. x', escape_misc=True) == r'1\. x' assert md(' 1. x', escape_misc=True) == r' 1\. x' assert md('123456789. x', escape_misc=True) == r'123456789\. x' assert md('1234567890. x', escape_misc=True) == r'1234567890. x' assert md('A1. x', escape_misc=True) == r'A1. x' assert md('1.2', escape_misc=True) == r'1.2' assert md('not a number. x', escape_misc=True) == r'not a number. x' assert md('1) x', escape_misc=True) == r'1\) x' # assert md('1) x', escape_misc=True) == r'1\) x' assert md('1) x', escape_misc=True) == r'1\) x' assert md(' 1) x', escape_misc=True) == r' 1\) x' assert md('123456789) x', escape_misc=True) == r'123456789\) x' assert md('1234567890) x', escape_misc=True) == r'1234567890) x' assert md('(1) x', escape_misc=True) == r'(1) x' assert md('A1) x', escape_misc=True) == r'A1) x' assert md('1)x', escape_misc=True) == r'1)x' assert md('not a number) x', escape_misc=True) == r'not a number) x' assert md('|not table|', escape_misc=True) == r'\|not table\|' assert md(r'\ <foo> &amp; | ` `', escape_misc=False) == r'\ & | ` `' python-markdownify-1.2.2/tests/test_lists.py000066400000000000000000000055251510642172600213110ustar00rootroot00000000000000from .utils import md nested_uls = """
    • 1
      • a
        • I
        • II
        • III
      • b
      • c
    • 2
    • 3
    """ nested_ols = """
    1. 1
      1. a
        1. I
        2. II
        3. III
      2. b
      3. c
    2. 2
    3. 3
""" def test_ol(): assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n3. a\n4. b\n' assert md('foo
  1. a
  2. b
bar') == 'foo\n\n3. a\n4. b\n\nbar' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. first para

    second para

  2. third para

    fourth para

') == '\n\n1234. first para\n\n second para\n1235. third para\n\n fourth para\n' def test_nested_ols(): assert md(nested_ols) == '\n\n1. 1\n 1. a\n 1. I\n 2. II\n 3. III\n 2. b\n 3. c\n2. 2\n3. 3\n' def test_ul(): assert md('
  • a
  • b
') == '\n\n* a\n* b\n' assert md("""
  • a
  • b
  • c
""") == '\n\n* a\n* b\n* c\n' assert md('
  • first para

    second para

  • third para

    fourth para

') == '\n\n* first para\n\n second para\n* third para\n\n fourth para\n' def test_inline_ul(): assert md('

foo

  • a
  • b

bar

') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n' assert md('foo
  • bar
baz') == 'foo\n\n* bar\n\nbaz' def test_nested_uls(): """ Nested ULs should alternate bullet characters. """ assert md(nested_uls) == '\n\n* 1\n + a\n - I\n - II\n - III\n + b\n + c\n* 2\n* 3\n' def test_bullets(): assert md(nested_uls, bullets='-') == '\n\n- 1\n - a\n - I\n - II\n - III\n - b\n - c\n- 2\n- 3\n' def test_li_text(): assert md('
  • foo bar
  • foo bar
  • foo bar space.
') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' python-markdownify-1.2.2/tests/test_tables.py000066400000000000000000000233761510642172600214310ustar00rootroot00000000000000from .utils import md table = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_with_html_content = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_with_paragraphs = """
Firstname

Lastname

Age

Jill

Smith

50

Eve Jackson 94
""" table_with_linebreaks = """
Firstname Lastname Age
Jill Smith Jackson 50
Eve Jackson Smith 94
""" table_with_header_column = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_head_body = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_head_body_missing_head = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_head_body_multiple_head = """
Creator Editor Server
Operator Manager Engineer
Bob Oliver Tom
Thomas Lucas Ethan
""" table_missing_text = """
Lastname Age
Jill 50
Eve Jackson 94
""" table_missing_head = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_body = """
Firstname Lastname Age
Jill Smith 50
Eve Jackson 94
""" table_with_caption = """TEXT
Caption
Firstname Lastname Age
""" table_with_colspan = """
Name Age
Jill Smith 50
Eve Jackson 94
""" table_with_undefined_colspan = """
Name Age
Jill Smith
""" table_with_colspan_missing_head = """
Name Age
Jill Smith 50
Eve Jackson 94
""" def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body_multiple_head) == '\n\n| | | |\n| --- | --- | --- |\n| Creator | Editor | Server |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n' assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- 
|\n| Jill | Smith |\n\n' assert md(table_with_colspan_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Name | | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' def test_table_infer_header(): assert md(table, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_linebreaks, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body_multiple_head, table_infer_header=True) == '\n\n| Creator | Editor | Server |\n| --- | --- | --- |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' assert md(table_head_body_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_text, table_infer_header=True) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_body, table_infer_header=True) == 
'\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' assert md(table_with_colspan_missing_head, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' python-markdownify-1.2.2/tests/types.py000066400000000000000000000040641510642172600202550ustar00rootroot00000000000000from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter from bs4 import BeautifulSoup from typing import Union markdownify("

Hello

") == "Hello" # test default of STRIP markdownify("

Hello

", strip_document=LSTRIP) == "Hello\n\n" markdownify("

Hello

", strip_document=RSTRIP) == "\n\nHello" markdownify("

Hello

", strip_document=STRIP) == "Hello" markdownify("

Hello

", strip_document=None) == "\n\nHello\n\n" # default options MarkdownConverter( autolinks=True, bs4_options='html.parser', bullets='*+-', code_language='', code_language_callback=None, convert=None, default_title=False, escape_asterisks=True, escape_underscores=True, escape_misc=False, heading_style=UNDERLINED, keep_inline_images_in=[], newline_style=SPACES, strip=None, strip_document=STRIP, strip_pre=STRIP, strong_em_symbol=ASTERISK, sub_symbol='', sup_symbol='', table_infer_header=False, wrap=False, wrap_width=80, ).convert("") # custom options MarkdownConverter( strip_document=None, bullets="-", escape_asterisks=True, escape_underscores=True, escape_misc=True, autolinks=True, default_title=True, newline_style=BACKSLASH, sup_symbol='^', sub_symbol='^', keep_inline_images_in=['h3'], wrap=True, wrap_width=80, strong_em_symbol=UNDERSCORE, code_language='python', code_language_callback=None ).convert("") html = 'test' soup = BeautifulSoup(html, 'html.parser') MarkdownConverter().convert_soup(soup) == '**test**' def callback(el: BeautifulSoup) -> Union[str, None]: return el['class'][0] if el.has_attr('class') else None MarkdownConverter(code_language_callback=callback).convert("") MarkdownConverter(code_language_callback=lambda el: None).convert("") markdownify('
test\n    foo\nbar
', code_language_callback=callback) markdownify('
test\n    foo\nbar
', code_language_callback=lambda el: None) python-markdownify-1.2.2/tests/utils.py000066400000000000000000000004441510642172600202470ustar00rootroot00000000000000from markdownify import MarkdownConverter # for unit testing, disable document-level stripping by default so that # separation newlines are included in testing def md(html, **options): options = {"strip_document": None, **options} return MarkdownConverter(**options).convert(html) python-markdownify-1.2.2/tox.ini000066400000000000000000000003231510642172600167020ustar00rootroot00000000000000[tox] envlist = py38 [testenv] passenv = PYTHONPATH deps = pytest==8 flake8 restructuredtext_lint Pygments commands = pytest flake8 --ignore=E501,W503 markdownify tests restructuredtext-lint README.rst