tex-fmt-0.5.2/000077500000000000000000000000001473573253500131205ustar00rootroot00000000000000tex-fmt-0.5.2/.gitattributes000066400000000000000000000001241473573253500160100ustar00rootroot00000000000000tests/** linguist-vendored completion/** linguist-vendored man/** linguist-vendored tex-fmt-0.5.2/.github/000077500000000000000000000000001473573253500144605ustar00rootroot00000000000000tex-fmt-0.5.2/.github/workflows/000077500000000000000000000000001473573253500165155ustar00rootroot00000000000000tex-fmt-0.5.2/.github/workflows/ci.yml000066400000000000000000000032641473573253500176400ustar00rootroot00000000000000name: "CI" on: pull_request: branches: - main - develop push: branches: - main - develop workflow_dispatch: jobs: test: name: Cargo test (${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [windows-latest, macos-latest, ubuntu-latest] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - name: Test run: cargo test format: name: Cargo format (${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: fail-fast: true matrix: os: [windows-latest, macos-latest, ubuntu-latest] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - name: Format run: cargo fmt --check cross: name: Cargo cross build (${{ matrix.target }}) runs-on: ubuntu-latest strategy: fail-fast: false matrix: target: - aarch64-unknown-linux-gnu - x86_64-unknown-linux-musl steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - run: cargo install cross - name: Build run: cross build --target ${{ matrix.target }} nix: name: Nix build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: cachix/install-nix-action@v25 with: github_access_token: ${{ secrets.GITHUB_TOKEN }} nix_path: nixpkgs=channel:nixos-unstable - uses: DeterminateSystems/magic-nix-cache-action@v2 - run: nix build - run: nix flake check --all-systems tex-fmt-0.5.2/.github/workflows/publish.yml000066400000000000000000000060531473573253500207120ustar00rootroot00000000000000name: "Publish" on: release: types: [published] workflow_dispatch: jobs: build: name: Build (${{ matrix.archive }}) runs-on: ${{ matrix.os }} strategy: matrix: include: - os: windows-latest target: x86_64-pc-windows-msvc program: cargo archive: tex-fmt-x86_64-windows.zip - os: windows-latest target: i686-pc-windows-msvc program: cargo archive: tex-fmt-i686-windows.zip - os: windows-latest target: aarch64-pc-windows-msvc program: cargo archive: tex-fmt-aarch64-windows.zip - os: macos-latest target: x86_64-apple-darwin program: cargo archive: tex-fmt-x86_64-macos.tar.gz - os: macos-latest target: aarch64-apple-darwin program: cargo archive: tex-fmt-aarch64-macos.tar.gz - os: ubuntu-latest target: x86_64-unknown-linux-gnu program: cargo archive: tex-fmt-x86_64-linux.tar.gz - os: ubuntu-latest target: aarch64-unknown-linux-gnu program: cross archive: tex-fmt-aarch64-linux.tar.gz - os: ubuntu-latest target: armv7-unknown-linux-gnueabihf program: cross archive: tex-fmt-armv7hf-linux.tar.gz - os: ubuntu-latest target: x86_64-unknown-linux-musl program: cargo archive: tex-fmt-x86_64-alpine.tar.gz steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target }} - name: Install cross if: ${{ matrix.program == 'cross' }} run: cargo install cross - name: Build run: ${{ matrix.program }} build --target ${{ matrix.target }} --all-features --release --locked - name: Compress 
(windows) if: ${{ contains(matrix.os, 'windows') }} run: ${{ format('Compress-Archive target/{0}/release/tex-fmt.exe {1}', matrix.target, matrix.archive) }} - name: Compress (macos) if: ${{ contains(matrix.os, 'macos') }} run: ${{ format('gtar -czvf {1} -C target/{0}/release tex-fmt', matrix.target, matrix.archive) }} - name: Compress (linux) if: ${{ contains(matrix.os, 'ubuntu') }} run: ${{ format('tar -czvf {1} -C target/{0}/release tex-fmt', matrix.target, matrix.archive) }} - name: Upload binary archive uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} path: ${{ matrix.archive }} github: name: GitHub archive upload needs: [build] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 - name: Publish binaries run: | gh release upload ${{ github.ref_name }} $(find . -iname tex-fmt*.zip) gh release upload ${{ github.ref_name }} $(find . -iname tex-fmt*.tar.gz) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} tex-fmt-0.5.2/.gitignore000066400000000000000000000001621473573253500151070ustar00rootroot00000000000000/debug/ /target/ **/*.rs.bk *.pdb /result *.html *.log flamegraph.svg perf.data* *.csv *.pdf *.png cachegrind.out tex-fmt-0.5.2/.rustfmt.toml000066400000000000000000000000171473573253500155750ustar00rootroot00000000000000max_width = 80 tex-fmt-0.5.2/Cargo.lock000066400000000000000000000444011473573253500150300ustar00rootroot00000000000000# This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "aho-corasick" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "anstream" version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", "windows-sys 0.59.0", ] [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bitflags" version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 
[[package]] name = "clap" version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", ] [[package]] name = "clap_complete" version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac2e663e3e3bed2d32d065a8404024dad306e699a04263ec59919529f803aee9" dependencies = [ "clap", ] [[package]] name = "clap_lex" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "clap_mangen" version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbae9cbfdc5d4fa8711c09bd7b83f644cb48281ac35bf97af3e47b0675864bdf" dependencies = [ "clap", "roff", ] [[package]] name = "colorchoice" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "colored" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", "windows-sys 0.59.0", ] [[package]] name = "dirs" version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" dependencies = [ "dirs-sys", ] [[package]] name = "dirs-sys" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" dependencies = [ "libc", "option-ext", "redox_users", "windows-sys 0.48.0", ] [[package]] name = "env_filter" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", "regex", ] [[package]] name = "env_logger" version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" dependencies = [ "anstream", "anstyle", "env_filter", "humantime", "log", ] [[package]] name = "equivalent" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "getrandom" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "indexmap" version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown", ] [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libredox" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags", "libc", ] [[package]] name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "merge" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10bbef93abb1da61525bbc45eeaff6473a41907d19f8f9aa5168d214e10693e9" dependencies = [ "merge_derive", "num-traits", ] [[package]] name = "merge_derive" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209d075476da2e63b4b29e72a2ef627b840589588e71400a25e3565c4f849d07" dependencies = [ "proc-macro-error", "proc-macro2", "quote", "syn 1.0.109", ] [[package]] name = "num-traits" version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "option-ext" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "proc-macro-error" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", "syn 1.0.109", "version_check", ] [[package]] name = "proc-macro-error-attr" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", "version_check", ] [[package]] name = "proc-macro2" version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] [[package]] name = "redox_users" version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", "thiserror", ] [[package]] name = "regex" version = "1.11.1" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "roff" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88f8660c1ff60292143c98d08fc6e2f654d722db50410e3f3797d40baaf9d8f3" [[package]] name = "serde" version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", "syn 2.0.93", ] [[package]] name = "serde_spanned" version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" dependencies = [ "serde", ] [[package]] name = "similar" version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "syn" version = "2.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tex-fmt" version = "0.5.2" dependencies = [ "clap", "clap_complete", "clap_mangen", "colored", "dirs", "env_logger", "lazy_static", "log", "merge", "regex", "similar", "toml", ] [[package]] name = "thiserror" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", "syn 2.0.93", ] [[package]] name = "toml" version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" dependencies = [ "serde", "serde_spanned", "toml_datetime", "toml_edit", ] [[package]] name = "toml_datetime" version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" dependencies = [ "serde", ] [[package]] name = "toml_edit" version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "serde", "serde_spanned", "toml_datetime", "winnow", ] [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets 0.48.5", ] [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets 0.52.6", ] [[package]] name = "windows-targets" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm 0.48.5", "windows_aarch64_msvc 0.48.5", "windows_i686_gnu 0.48.5", "windows_i686_msvc 0.48.5", "windows_x86_64_gnu 0.48.5", "windows_x86_64_gnullvm 0.48.5", "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" 
[[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] tex-fmt-0.5.2/Cargo.toml000066400000000000000000000015321473573253500150510ustar00rootroot00000000000000[package] name = "tex-fmt" version = "0.5.2" authors = ["William George Underwood"] license = "MIT" repository = "https://github.com/WGUNDERWOOD/tex-fmt" edition = "2021" description = "LaTeX formatter written in Rust" keywords = ["latex", "formatter"] categories = ["command-line-utilities", "development-tools"] exclude = ["tests/*", "extra/*", "*.nix", ".github/*", "completion/*", "man/*"] [dependencies] clap = { version = "4.5.23", features = ["cargo"] } clap_complete = "4.5.40" clap_mangen = "0.2.24" colored = "2.2.0" dirs = "5.0.1" env_logger = "0.11.6" lazy_static = "1.5.0" log = "0.4.22" merge = "0.1.0" regex = "1.11.1" similar = "2.6.0" toml = "0.8.19" [features] shellinstall = [] [build-dependencies] clap = { version = "4.5.23", features = ["cargo"] } clap_complete = "4.5.40" clap_mangen = "0.2.24" [profile.release] codegen-units = 1 tex-fmt-0.5.2/LICENSE000066400000000000000000000020711473573253500141250ustar00rootroot00000000000000MIT License Copyright (c) 2024 William George Underwood Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, 
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

tex-fmt-0.5.2/NEWS.md

# v0.5.2

- Fix critical bug with config files missing the lists field.
- Trim trailing newlines.

# v0.5.1

- Custom list environments can be passed using the `lists` option in the config file.
- Allow `verbosity = "info"` in the config file.
- Fixed a bug with configuration values being incorrectly reset.

# v0.5.0

Version v0.5.0 is a major release, including breaking changes and substantial new features.

## Changes to existing CLI options

- The option to disable line wrapping has been changed from `--keep` to `--nowrap`.
- The option to set the number of characters used per indentation level has been changed from `--tab` to `--tabsize`.
- The option to set the maximum line length for wrapping has been changed from `--wrap` to `--wraplen`.
- See below for information on the new `--config`, `--noconfig`, `--man`, `--completion`, and `--args` flags.

## Configuration file support

Configuring tex-fmt can now be achieved using a configuration file as well as CLI arguments. The configuration file can be read from a user-specified path with `--config <path>`, from the current working directory, from the root of the current git repository, or from the user's configuration directory, in order of decreasing priority. Arguments passed on the command line will always override those specified in configuration files. Configuration files can be disabled by passing `--noconfig`. An example configuration file is sketched below.

## Man pages

Man pages can be generated using the `--man` flag. Pre-built man pages are also available for download from the GitHub repository.

## Shell completion

Completion files for popular shells, including bash, fish, zsh, elvish and PowerShell, can be generated using the `--completion <shell>` flag. Pre-built completion scripts are also available for download from the GitHub repository.
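As a concrete illustration of the configuration file support described above, a minimal `tex-fmt.toml` might look as follows. The key names are those accepted by the config parser (see `src/config.rs`); the values, and the `mylist` environment, are purely illustrative.

``` toml
# tex-fmt.toml (illustrative values, not recommended defaults)
check = false       # check formatting only, do not modify files
print = false       # print to stdout instead of modifying files
wrap = true         # wrap long lines
wraplen = 80        # line length for wrapping
tabsize = 2         # characters per indentation level
tabchar = "space"   # "space" or "tab"
verbosity = "warn"  # "error", "warn", "info" or "trace"
lists = ["mylist"]  # extra list environments formatted like itemize
```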
## Minor changes

- Arguments passed to tex-fmt can be inspected by passing `--args`
- Fixed bug with `\itemsep` matching the `\item` pattern
- Added last non-indented line number to "Indent did not return to zero" error messages
- Removed LTO optimization to improve compile time with minimal effect on run time
- If duplicate file names are provided, they are now removed before formatting
- Added LLF to the list of existing tools
- Changed order of options in help dialogs

# v0.4.7

- Fix bug with `--stdin` adding newlines at EOF
- Fix logic for ignoring verbatim environments
- Ensure sectioning commands begin on new lines
- Various performance improvements
- Add NEWS.md for release notes
- Ensure all test files successfully compile to PDFs
- Better documentation of options in README.md

# v0.4.6

- Added `--wrap` flag to choose line length for wrapping
- Significant changes to central formatting logic to reduce memory allocations
- Treat comment environments as verbatim
- Improved performance with finding comments in source code

# v0.4.5

- Added `--usetabs` to use tabs instead of spaces for indentation
- Fixed a bug with unicode graphemes and comment handling
- Main function now returns `std::process::ExitCode` for a cleaner exit
- Reduced memory allocation in comment handling logic
- Reduced memory allocation when indenting lines
- Caching of pattern matches reduces number of regex searches

# v0.4.4

- Added `--tab` flag for variable tab size [default: 2]
- Fixed bug with incorrect line numbers being printed
- Fixed bug with quadratic complexity of comment checking
- Added Arch User Repository support
- Added VS Code support
- Improved performance by moving environment checking inside main loop
- Upgraded Cargo dependencies
- Included LTO optimization on the release build

# v0.4.3

- Switch output text coloring to the `colored` crate.
- Add `--stdin` flag to read input from stdin (and output to stdout).

# v0.4.2

- Added `--quiet` flag to suppress warning messages
- Allow `tex-fmt main` for `tex-fmt main.tex`
- Internal documentation
- Improved performance
- Added more Clippy lints

# v0.4.1

- Added binary archives to GitHub release

# v0.4.0

## Breaking change

The logic for line wrapping has been changed. Previously, for lines longer than 80 characters, we would break the line at suitable points into chunks of no more than 80 characters. Then another round of indenting was applied, and this would often push the length back over 80 characters. A subsequent round of wrapping was therefore required, and often led to the creation of very short lines (#6).

The new approach is to take lines longer than 80 characters and remove the first segment up to 70 characters, pushing the resulting two lines back onto the queue. When indenting is then reapplied, the lines typically do not go over 80 characters unless the indentation is very deep. However, some lines may now be truncated to 70 characters rather than 80.
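To make the new behaviour concrete, the following is a minimal Rust sketch of this queue-based wrapping. It is illustrative only and not the actual tex-fmt implementation: ASCII input is assumed, and `find_break` is a naive stand-in for the real break-point search.

``` rust
use std::collections::VecDeque;

/// Lines longer than this get wrapped.
const WRAP_LEN: usize = 80;
/// The first segment split off is at most this long.
const WRAP_MIN: usize = 70;

/// Stand-in for the real break-point search: the byte index of the
/// last space at or before `limit` (ASCII input assumed for brevity).
fn find_break(line: &str, limit: usize) -> Option<usize> {
    line[..=limit.min(line.len() - 1)].rfind(' ')
}

/// Wrap long lines by splitting off a head of at most WRAP_MIN
/// characters and pushing both halves back onto the queue.
fn wrap_lines(lines: Vec<String>) -> Vec<String> {
    let mut queue: VecDeque<String> = lines.into();
    let mut done = Vec::new();
    while let Some(line) = queue.pop_front() {
        if line.len() <= WRAP_LEN {
            done.push(line);
        } else if let Some(i) = find_break(&line, WRAP_MIN) {
            // Requeue the remainder, then the head, so the short head
            // is emitted next and the remainder may be split again.
            queue.push_front(line[i + 1..].to_string());
            queue.push_front(line[..i].to_string());
        } else {
            // No suitable break point: leave the long line as it is.
            done.push(line);
        }
    }
    done
}
```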
## Other updates

- Added a `--keep` flag to disable line wrapping (#10)
- Improved the central algorithm to avoid multiple passes and improve run-time performance (#7)
- Only write the file to disk if the formatting returns a different string, to avoid unnecessary editing of modification times

# v0.3.1

- Updated README
- Added project logo

# v0.3.0

- Added a `--check` flag to check if file is correctly formatted
- Fixed bug with line wrapping giving up early
- Shell scripts verified with shellcheck
- Refactored variable names
- Some performance improvements

# v0.2.2

Bump version number

# v0.2.1

Bump version number

# v0.2.0

Bump version number

tex-fmt-0.5.2/README.md

# tex-fmt

[image: tex-fmt logo]

[![CI](https://github.com/wgunderwood/tex-fmt/actions/workflows/ci.yml/badge.svg)](https://github.com/wgunderwood/tex-fmt/actions/workflows/ci.yml)
[![crates.io](https://img.shields.io/crates/v/tex-fmt?logo=rust)](https://crates.io/crates/tex-fmt)
[![Packaging status](https://repology.org/badge/tiny-repos/tex-fmt.svg)](https://repology.org/project/tex-fmt/versions)
[![license: MIT](https://shields.io/badge/license-MIT-blue.svg)](https://mit-license.org/)

An extremely fast LaTeX formatter written in Rust.
**Input**

``` tex
\documentclass{article}
\begin{document}
\begin{itemize}
\item Lists with items
over multiple lines
\end{itemize}
\begin{equation}
E = m c^2
\end{equation}
\end{document}
```

**Output**

``` tex
\documentclass{article}
\begin{document}
\begin{itemize}
  \item Lists with items
    over multiple lines
\end{itemize}
\begin{equation}
  E = m c^2
\end{equation}
\end{document}
```
- ⚡ Extremely fast run-time performance
- 🔧 Minimal configuration required
- 📟 Command-line interface
- 📜 Handles LaTeX file types `.tex`, `.bib`, `.cls`, and `.sty`
- 🦀 Written entirely in safe Rust

## Installation

### Cargo

Install the [stable release](https://crates.io/crates/tex-fmt) with

``` shell
cargo install tex-fmt
```

Install from [GitHub](https://github.com/WGUNDERWOOD/tex-fmt) with

```shell
cargo install --git "https://github.com/wgunderwood/tex-fmt"
```

### Nix

Install from [nixpkgs](https://search.nixos.org/packages?channel=unstable&query=tex-fmt) into a temporary shell with

``` shell
nix-shell -p tex-fmt
```

Build from source using flakes with

``` shell
nix build "github:wgunderwood/tex-fmt"
```

Add to your NixOS installation with

```nix
environment.systemPackages = [
  pkgs.tex-fmt
];
```

### Arch Linux

Install from the [Arch User Repository](https://aur.archlinux.org/packages/tex-fmt). For example, using the [yay](https://github.com/Jguer/yay) AUR helper:

``` shell
yay -S tex-fmt
```

### Homebrew

Install using [Homebrew](https://formulae.brew.sh/formula/tex-fmt) with

```shell
brew install tex-fmt
```

### Binary download

Binaries for various platforms are available on the GitHub [releases](https://github.com/WGUNDERWOOD/tex-fmt/releases) page.

### Visual Studio Code

Integration with VS Code is provided by the [LaTeX Workshop](https://github.com/James-Yu/LaTeX-Workshop) extension. You will need to first install tex-fmt through one of the above methods.

## Usage

The most commonly used options are given below. For a full list, see the [options](https://github.com/WGUNDERWOOD/tex-fmt?tab=readme-ov-file#options) section below.

``` shell
tex-fmt file.tex           # format file.tex and overwrite
tex-fmt --check file.tex   # check if file.tex is correctly formatted
tex-fmt --print file.tex   # format file.tex and print to stdout
tex-fmt --nowrap file.tex  # do not wrap long lines
tex-fmt --stdin            # read from stdin and print to stdout
tex-fmt --help             # view help information
```

### Configuration

Options can also be read from a configuration file, which will be read from the following locations, in order of decreasing priority.

- A named config file passed as `tex-fmt --config <path>`
- A file named `tex-fmt.toml` in the current working directory
- A file named `tex-fmt.toml` in the root directory of the current git repository
- A file named `tex-fmt.toml` in a subdirectory titled `tex-fmt/` in the user's configuration directory
  - Linux: `~/.config/tex-fmt/tex-fmt.toml`
  - macOS: `/Users/<user>/Library/Application Support/tex-fmt/tex-fmt.toml`
  - Windows: `C:\Users\<user>\AppData\Roaming\tex-fmt\tex-fmt.toml`

Arguments passed on the command line will always override those specified in configuration files. An example configuration file is available at [tex-fmt.toml](https://github.com/WGUNDERWOOD/tex-fmt/blob/main/tex-fmt.toml). To ignore all config files, use the `--noconfig` flag.

Note for contributors: this repository's configuration file will be automatically applied if tex-fmt is run from within the repository. Use `--noconfig` or `--config <path>` to avoid this.

### Disabling the formatter

Ending a source line with `% tex-fmt: skip` disables formatting for that line. To disable the formatter for a block, use `% tex-fmt: off` and `% tex-fmt: on`.
``` tex
\documentclass{article}
\begin{document}
This line is skipped % tex-fmt: skip
% tex-fmt: off
These lines are also
not formatted or wrapped
% tex-fmt: on
\end{document}
```

Verbatim environments including `verbatim`, `Verbatim`, `lstlisting` and `minted` are automatically skipped.

### Shell completion

Shell completion scripts can be generated at run-time using the `--completion <shell>` flag. See the [completion](https://github.com/WGUNDERWOOD/tex-fmt/tree/main/completion) directory for more details.

### Man page

A man page can be generated at run-time using the `--man` flag. See the [man](https://github.com/WGUNDERWOOD/tex-fmt/tree/main/man) directory for more details.

## Performance

When formatting all of the test cases, tex-fmt is over a thousand times faster than latexindent.

| **Files** | **Lines** | **Size** | **tex-fmt** | **latexindent** | **latexindent -m** |
| --- | --- | --- | --- | --- | --- |
| 51 | 94k | 3.5M | **0.055s** | 106s [x1927] | 127s [x2309] |

## Contribution

Please feel free to open an issue or submit a pull request, including as much information as you can. Documentation of internals can be accessed by cloning this repository and running `cargo doc`. Alternatively, you can [Buy Me a Coffee](https://buymeacoffee.com/wgunderwood)!

## Limitations

- Semantic parsing of LaTeX code not conducted
- No linting or correction of syntax errors
- Compliance with existing formatting guidelines not guaranteed
- No spelling or grammar checking

## Existing tools

- [latexindent](https://github.com/cmhughes/latexindent.pl). Perl script, many configuration options, slow on large files
- [LaTeXTidy](http://bfc.sfsu.edu/cgi-bin/hsu.pl?LaTeX_Tidy). Perl script, download links seem to be broken
- [latex-pretty](https://c.albert-thompson.com/latex-pretty/). Browser-based, uses latexindent as the backend
- [latexformat.com](https://latexformat.com/). Browser-based
- [texpretty](http://ftp.math.utah.edu/pub/texpretty/). C program which works sometimes and appears to be fast
- [latex-editor](https://latex-editor.pages.dev/formatter/). Browser-based
- [LaTeXFmt](https://github.com/engeljh/vim-latexfmt). Vim plugin, does not apply indentation
- [latex-formatter](https://github.com/nfode/latex-formatter). Visual Studio plugin, uses latexindent as the backend
- [LLF](https://repo.or.cz/llf.git). Lua script, many configuration options

## Options

The following command-line options are offered by tex-fmt.
| Option         | Alias | Default | Description |
| -------------- | ----- | ------- | --- |
| `--check`      | `-c`  |         | Check formatting, do not modify files |
| `--print`      | `-p`  |         | Print to stdout, do not modify files |
| `--nowrap`     | `-n`  |         | Do not wrap long lines |
| `--wraplen`    | `-l`  | `80`    | Line length for wrapping |
| `--tabsize`    | `-t`  | `2`     | Number of characters to use as tab size |
| `--usetabs`    |       |         | Use tabs instead of spaces for indentation |
| `--stdin`      | `-s`  |         | Process stdin as a single file, output to stdout |
| `--config`     |       |         | Path to config file |
| `--noconfig`   |       |         | Do not read any config file |
| `--lists`      |       |         | Extra list environments to be formatted as `itemize` |
| `--verbose`    | `-v`  |         | Show info messages |
| `--quiet`      | `-q`  |         | Hide warning messages |
| `--trace`      |       |         | Show trace messages |
| `--completion` |       |         | Generate a shell completion script |
| `--man`        |       |         | Generate a man page |
| `--args`       |       |         | View arguments passed to tex-fmt |
| `--help`       | `-h`  |         | Print help |
| `--version`    | `-V`  |         | Print version |

tex-fmt-0.5.2/build.rs

use clap::ValueEnum;
use clap_complete::{generate_to, Shell};
use std::env::var_os;
use std::fs::create_dir;
use std::io::Error;
use std::path::Path;

include!("src/command.rs");

fn main() -> Result<(), Error> {
    println!("cargo::rerun-if-changed=src/");
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=Cargo.toml");
    if std::env::var("CARGO_FEATURE_SHELLINSTALL").is_ok() {
        println!("cargo::warning=shellinstall");
        build_completion()?;
        build_man()?;
    }
    Ok(())
}

fn build_completion() -> Result<(), Error> {
    let outdir = match var_os("CARGO_MANIFEST_DIR") {
        None => return Ok(()),
        Some(outdir) => Path::new(&outdir).join("completion/"),
    };
    if !outdir.exists() {
        create_dir(&outdir).unwrap();
    }
    let mut command = get_cli_command();
    for &shell in Shell::value_variants() {
        generate_to(shell, &mut command, "tex-fmt", &outdir)?;
    }
    Ok(())
}

fn build_man() -> Result<(), Error> {
    let outdir = match var_os("CARGO_MANIFEST_DIR") {
        None => return Ok(()),
        Some(outdir) => Path::new(&outdir).join("man/"),
    };
    if !outdir.exists() {
        create_dir(&outdir).unwrap();
    }
    let command = get_cli_command();
    let man = clap_mangen::Man::new(command);
    let mut buffer: Vec<u8> = Default::default();
    man.render(&mut buffer)?;
    std::fs::write(outdir.join("tex-fmt.1"), buffer)?;
    Ok(())
}

tex-fmt-0.5.2/default.nix

{pkgs ?
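# Descriptive note: nixpkgs is taken from NIX_PATH when this file is
# called directly; flake.nix instead pins the nixos-24.11 channel.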
  import <nixpkgs> {}}: let
  manifest = (pkgs.lib.importTOML ./Cargo.toml).package;
in
  pkgs.rustPlatform.buildRustPackage rec {
    pname = manifest.name;
    version = manifest.version;
    cargoLock.lockFile = ./Cargo.lock;
    src = pkgs.lib.cleanSource ./.;
  }

tex-fmt-0.5.2/extra/

tex-fmt-0.5.2/extra/binary.sh

#!/usr/bin/env bash
echo "Testing binary"
DIR="$(mktemp -d)"
cp -r ../tests/* "$DIR"
cargo build --release

# run tex-fmt
../target/release/tex-fmt "$DIR/source"/* "$DIR/target"/*

# tex-fmt agrees with target files
for file in ../tests/source/*; do
    f=$(basename "$file")
    diff ../"tests/target/$f" "$DIR/source/$f" | diff-so-fancy
    diff ../"tests/target/$f" "$DIR/target/$f" | diff-so-fancy
done

tex-fmt-0.5.2/extra/card.py

from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# start plot
(fig, ax) = plt.subplots(figsize=(10, 5))
plt.xticks([])
plt.yticks([])
for side in ["bottom", "top", "left", "right"]:
    ax.spines[side].set_color("#FFFFFF00")

# colors
col_dark = "#191924"
col_yellow = "#eed858"
col_light = "#faf7e5"
outer_col = col_yellow
inner_col = col_light
text_col = col_dark

# outer box
w = 200
h = 100
xs_outer = [w/2, w, w, 0, 0, w/2]
ys_outer = [0, 0, h, h, 0, 0]
plt.fill(xs_outer, ys_outer, c=outer_col, lw=1, zorder=1)

# inner box
dw = 23
dh = 20
xs_inner = [w/2, w-dw, w-dw, dw, dw, w/2]
ys_inner = [dh, dh, h-dh, h-dh, dh, dh]
plt.plot(xs_inner, ys_inner, c=inner_col, lw=30, zorder=2)
plt.fill(xs_inner, ys_inner, c=inner_col, lw=0)

# logo
img = Image.open("logo.png").resize((900, 900))
fig.figimage(img, 2210, 540)

# text
fontfamily = "Roboto Slab"
fonts = fm.findSystemFonts(fontpaths=None, fontext='ttf')
[fm.fontManager.addfont(f) for f in fonts if fontfamily.split()[0] in f]
fontsize = 16
plt.text(31, 50, "An extremely fast La\nformatter written in Rust.",
         fontsize=fontsize, ha="left", va="center", fontweight="light",
         c=text_col, fontfamily=fontfamily, fontstyle="normal")
plt.text(92.6, 53.53, "T", fontsize=fontsize, ha="left", va="center",
         fontweight="light", c=text_col, fontfamily=fontfamily,
         fontstyle="normal")
plt.text(96.55, 53.53, "eX", fontsize=fontsize, ha="left", va="center",
         fontweight="light", c=text_col, fontfamily=fontfamily,
         fontstyle="normal")

# save
plt.savefig("card.svg", dpi=400, transparent=True)
plt.close("all")

tex-fmt-0.5.2/extra/latex.sh

#!/usr/bin/env bash
echo "Checking latex PDFs agree"
DIR="$(mktemp -d)"
cp -r ../tests/* "$DIR"
echo "$DIR"
cd "$DIR" || exit
echo
for file in ./source/*.tex; do
    f=$(basename "$file" .tex)
    echo "Running latex for $f.tex"
    (cd ./source && latexmk -pdflua "$f.tex")
    (cd ./target && latexmk -pdflua "$f.tex")
    (cd ./source && pdftotext -q "$f.pdf")
    (cd ./target && pdftotext -q "$f.pdf")
done
echo
for file in ./source/*.tex; do
    f=$(basename "$file" .tex)
    echo "Checking PDF for $f.tex"
    diff -u "source/$f.txt" "target/$f.txt" | diff-so-fancy
done
echo "$DIR"

tex-fmt-0.5.2/extra/logo.py

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# start plot
(fig, ax) = plt.subplots(figsize=(5, 5))
plt.xticks([])
plt.yticks([])
for side in ["bottom", "top", "left", "right"]:
    ax.spines[side].set_color("#FFFFFF00")

# colors
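# brand palette: col_dark, col_yellow and col_light match card.py;
# col_orange is specific to the logo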
col_dark = "#191924"
col_orange = "#e5652e"
col_yellow = "#eed858"
col_light = "#faf7e5"
outer_col = col_orange
inner_col = col_dark
text_col = col_light
line_col = col_yellow

# outer box
lw = 24
xs_outer = [0.5, 1, 1, 0, 0, 0.5]
ys_outer = [0, 0, 1, 1, 0, 0]
plt.plot(xs_outer, ys_outer, c=outer_col, lw=lw, zorder=0)
plt.fill(xs_outer, ys_outer, c=outer_col, lw=0)

# inner box
eps = 0.05
xs_inner = [0.5, 1-eps, 1-eps, eps, eps, 0.5]
ys_inner = [eps, eps, 1-eps, 1-eps, eps, eps]
plt.plot(xs_inner, ys_inner, c=inner_col, lw=0.6*lw, zorder=2)
plt.fill(xs_inner, ys_inner, c=inner_col, lw=0)

# line
eps = 0.125
plt.plot([0.5, eps, 1-eps, 0.5], [0.485] * 4, lw=5, c=col_yellow)

# text
fontfamily = "Bungee"
fonts = fm.findSystemFonts(fontpaths=None, fontext='ttf')
[fm.fontManager.addfont(f) for f in fonts if fontfamily.split()[0] in f]
fontsize = 100
plt.text(0.5, 0.72, "TEX", fontsize=fontsize, ha="center", va="center",
         fontweight="light", c=text_col, fontfamily=fontfamily,
         fontstyle="normal")
fontsize = 96
plt.text(0.496, 0.25, "FMT", fontsize=fontsize, ha="center", va="center",
         fontweight="light", c=text_col, fontfamily=fontfamily,
         fontstyle="normal")

# save
plt.savefig("logo.svg", dpi=1000, transparent=True)
plt.close("all")

tex-fmt-0.5.2/extra/logo.svg

[SVG image: tex-fmt logo, generated with Matplotlib v3.8.4; markup omitted]

tex-fmt-0.5.2/extra/perf.sh

#!/usr/bin/env bash
echo "Getting performance metrics"
DIR="$(mktemp -d)"
cp -r ../tests/* "$DIR"
cargo build --release

calc(){ awk "BEGIN { print ""$*"" }"; }

echo
echo -n "Test files: $(find "$DIR"/*/* | wc -l) files, "
echo -n "$(wc -l --total=only "$DIR"/source/* "$DIR"/target/*) lines, "
du -hs "$DIR" | cut -f 1
echo

# tex-fmt
TEXFMTFILE="hyperfine-tex-fmt.csv"
hyperfine --warmup 10 \
    --min-runs 20 \
    --export-csv $TEXFMTFILE \
    --command-name "tex-fmt" \
    --prepare "cp -r ../tests/* $DIR" \
    "../target/release/tex-fmt $DIR/source/* $DIR/target/*"

# latexindent
LATEXINDENTFILE="hyperfine-latexindent.csv"
hyperfine --warmup 0 \
    --export-csv $LATEXINDENTFILE \
    --runs 1 \
    --command-name "latexindent" \
    --prepare "cp -r ../tests/* $DIR" \
    "latexindent $DIR/source/* $DIR/target/*"

# latexindent -m
LATEXINDENTMFILE="hyperfine-latexindent-m.csv"
hyperfine --warmup 0 \
    --export-csv $LATEXINDENTMFILE \
    --runs 1 \
    --command-name "latexindent -m" \
    --prepare "cp -r ../tests/* $DIR" \
    "latexindent -m $DIR/source/* $DIR/target/*"

# print results
TEXFMT=$(cat $TEXFMTFILE | tail -n 1 | cut -d "," -f 2)
echo "tex-fmt: ${TEXFMT}s"
LATEXINDENT=$(cat $LATEXINDENTFILE | tail -n 1 | cut -d "," -f 2)
LATEXINDENTTIMES=$(calc "$LATEXINDENT"/"$TEXFMT")
echo "latexindent: ${LATEXINDENT}s, x$LATEXINDENTTIMES"
LATEXINDENTM=$(cat $LATEXINDENTMFILE | tail -n 1 | cut -d "," -f 2)
LATEXINDENTMTIMES=$(calc "$LATEXINDENTM"/"$TEXFMT")
echo "latexindent -m: ${LATEXINDENTM}s, x$LATEXINDENTMTIMES"

tex-fmt-0.5.2/extra/prof.sh

#!/usr/bin/env bash
echo "Making flamegraph profile"
DIR="$(mktemp -d)"
cp -r ../tests/* "$DIR"
CARGO_PROFILE_RELEASE_DEBUG=true cargo build --release
BIN="../target/release/tex-fmt"
echo
echo -n "Test files: $(find "$DIR"/*/* | wc -l) files, "
echo -n "$(wc -l --total=only "$DIR"/source/* "$DIR"/target/*) lines, "
du -hs "$DIR" | cut -f 1
echo
flamegraph -F 10000 -- "$BIN" "$DIR/source/"* "$DIR/target/"*
tex-fmt-0.5.2/flake.lock000066400000000000000000000027261473573253500150630ustar00rootroot00000000000000{ "nodes": { "flake-utils": { "inputs": { "systems": "systems" }, "locked": { "lastModified": 1731533236, "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", "owner": "numtide", "repo": "flake-utils", "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", "type": "github" }, "original": { "owner": "numtide", "repo": "flake-utils", "type": "github" } }, "nixpkgs": { "locked": { "lastModified": 1735264675, "narHash": "sha256-MgdXpeX2GuJbtlBrH9EdsUeWl/yXEubyvxM1G+yO4Ak=", "owner": "nixos", "repo": "nixpkgs", "rev": "d49da4c08359e3c39c4e27c74ac7ac9b70085966", "type": "github" }, "original": { "owner": "nixos", "ref": "nixos-24.11", "repo": "nixpkgs", "type": "github" } }, "root": { "inputs": { "flake-utils": "flake-utils", "nixpkgs": "nixpkgs" } }, "systems": { "locked": { "lastModified": 1681028828, "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", "owner": "nix-systems", "repo": "default", "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", "type": "github" }, "original": { "owner": "nix-systems", "repo": "default", "type": "github" } } }, "root": "root", "version": 7 } tex-fmt-0.5.2/flake.nix000066400000000000000000000011701473573253500147210ustar00rootroot00000000000000{ description = "LaTeX formatter written in Rust"; inputs = { nixpkgs.url = "github:nixos/nixpkgs/nixos-24.11"; flake-utils.url = "github:numtide/flake-utils"; }; outputs = { self, nixpkgs, flake-utils, }: flake-utils.lib.eachDefaultSystem ( system: let pkgs = import nixpkgs {inherit system;}; in { packages = { default = pkgs.callPackage ./default.nix {inherit pkgs;}; }; devShells = { default = pkgs.callPackage ./shell.nix {inherit pkgs;}; }; } ) // { overlays.default = import ./overlay.nix; }; } tex-fmt-0.5.2/justfile000066400000000000000000000020301473573253500146630ustar00rootroot00000000000000default: test doc clippy format shellcheck shellinstall all: default prof perf binary logo latex alias b := build alias d := doc alias t := test alias l := latex alias c := clippy alias f := format build: @cargo build -r test: @cargo test doc: @cargo doc shellinstall: @cargo build --features shellinstall testignored: @cargo test -- --ignored clippy: @cargo clippy -r && cargo shear format: @cargo fmt @alejandra -q . latex: @cd extra && bash latex.sh perf: @cd extra && bash perf.sh prof: @cd extra && bash prof.sh binary: @cd extra && bash binary.sh upgrade: @cargo upgrade && cargo update shellcheck: @shellcheck extra/*.sh nix: @nix flake update todo: @rg -g '!justfile' todo logo: @cd extra && python logo.py @cd extra && magick -background none logo.svg -resize 5000x5000 logo.png @cd extra && python card.py @cd extra && magick -background none card.svg -resize 1280x640\! 
card.png @cd extra && inkscape -w 2560 -h 1280 card.svg -o card.png @cd extra && rm -f logo.png card.svg tex-fmt-0.5.2/notes.org000066400000000000000000000034041473573253500147620ustar00rootroot00000000000000#+title: tex-fmt * Tasks ** TODO Add to Nix home-manager with empty config file ** TODO Indicate which args are CLI and which are config ** TODO Add link to treefmt-nix in README * Options and documentation ** Args struct ** OptionArgs struct ** Implement Default, Display, from for Args ** CLI command ** CLI args parser function ** Config args parser function ** Args resolver ** GitHub README * Release process ** Update release notes *** git log --oneline --no-merges vX.X.X..main *** Write in NEWS.md ** Update version number in Cargo.toml ** Update Nix flake and lock *** Check for new NixOS version (nixos-XX.XX) for flake.nix *** just nix ** Update Rust version *** just upgrade ** Run tests *** just *** just perf *** Update performance results in README.md ** Push to GitHub and check tests pass ** Create a git tag *** git tag vX.X.X *** git push --tags ** Publish to crates.io with cargo publish *** Pass --allow-dirty if notes.org has changed ** Publish GitHub release with notes from NEWS.md *** GitHub binaries published automatically with actions ** Publish in nixpkgs *** Check out master branch of nixpkgs fork *** git fetch upstream *** git rebase upstream/master *** git fetch *** git push --force-with-lease origin master *** git branch -d update-tex-fmt *** git switch --create update-tex-fmt upstream/master *** nvim pkgs/by-name/te/tex-fmt/package.nix *** Update version and invalidate src.hash and cargoHash *** nix-build -A tex-fmt *** Fix both hashes, get a successful build *** git add pkgs/by-name/te/tex-fmt/package.nix *** git commit -m "tex-fmt: X.X.X -> Y.Y.Y" *** git push --set-upstream origin HEAD *** Go to GitHub and create a pull request *** Submit pull request and check relevant boxes ** Tidy repository *** Commit any new changes to NEWS.md or notes.org tex-fmt-0.5.2/overlay.nix000066400000000000000000000000741473573253500153220ustar00rootroot00000000000000_: prev: { tex-fmt = prev.callPackage ./default.nix {}; } tex-fmt-0.5.2/shell.nix000066400000000000000000000010641473573253500147500ustar00rootroot00000000000000{pkgs ? import {}}: pkgs.mkShell { inputsFrom = [(pkgs.callPackage ./default.nix {})]; buildInputs = let python = pkgs.python3.withPackages (ps: with ps; [ grip matplotlib pillow ]); in [ pkgs.alejandra pkgs.bacon pkgs.cacert pkgs.cargo-edit pkgs.cargo-flamegraph pkgs.cargo-shear pkgs.clippy pkgs.diff-so-fancy pkgs.gh pkgs.hyperfine pkgs.poppler_utils pkgs.ripgrep pkgs.rustfmt pkgs.shellcheck pkgs.texlive.combined.scheme-full python ]; } tex-fmt-0.5.2/src/000077500000000000000000000000001473573253500137075ustar00rootroot00000000000000tex-fmt-0.5.2/src/args.rs000066400000000000000000000164231473573253500152170ustar00rootroot00000000000000//! 
Main arguments use crate::cli::*; use crate::config::*; use crate::logging::*; use crate::Log; use colored::Colorize; use log::Level; use log::LevelFilter; use merge::Merge; use std::fmt; use std::path::PathBuf; /// Arguments passed to tex-fmt #[derive(Debug)] pub struct Args { /// Check formatting, do not modify files pub check: bool, /// Print to stdout, do not modify files pub print: bool, /// Wrap long lines pub wrap: bool, /// Maximum allowed line length pub wraplen: u8, /// Wrap lines longer than this pub wrapmin: u8, /// Number of characters to use as tab size pub tabsize: u8, /// Characters to use for indentation pub tabchar: TabChar, /// Read from stdin and output to stdout pub stdin: bool, /// Path to config file pub config: Option, /// Extra list environments pub lists: Vec, /// Verbosity level for log messages pub verbosity: LevelFilter, /// Print arguments and exit pub arguments: bool, /// List of files to be formatted pub files: Vec, } /// Arguments using Options to track CLI/config file/default values #[derive(Clone, Debug, Merge)] #[allow(clippy::missing_docs_in_private_items)] pub struct OptionArgs { pub check: Option, pub print: Option, pub wrap: Option, pub wraplen: Option, pub wrapmin: Option, pub tabsize: Option, pub tabchar: Option, pub stdin: Option, pub config: Option, pub noconfig: Option, #[merge(strategy = merge::vec::append)] pub lists: Vec, pub verbosity: Option, pub arguments: Option, #[merge(strategy = merge::vec::append)] pub files: Vec, } /// Character to use for indentation #[derive(Clone, Debug, PartialEq, Eq)] #[allow(clippy::missing_docs_in_private_items)] pub enum TabChar { Tab, Space, } impl fmt::Display for TabChar { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::Tab => write!(f, "tab"), Self::Space => write!(f, "space"), } } } impl Default for OptionArgs { fn default() -> Self { Self { check: Some(false), print: Some(false), wrap: Some(true), wraplen: Some(80), wrapmin: Some(70), tabsize: Some(2), tabchar: Some(TabChar::Space), stdin: Some(false), config: None, noconfig: Some(false), lists: vec![ "itemize", "enumerate", "description", "inlineroman", "inventory", ] .into_iter() .map(std::borrow::ToOwned::to_owned) .collect(), verbosity: Some(LevelFilter::Warn), arguments: Some(false), files: vec![], } } } /// Get all arguments from CLI, config file, and defaults, and merge them pub fn get_args() -> Args { let mut args = get_cli_args(); let config_args = get_config_args(&args); if let Some(c) = config_args { args.merge(c); } args.merge(OptionArgs::default()); Args::from(args) } impl Args { /// Construct concrete arguments from optional arguments fn from(args: OptionArgs) -> Self { Self { check: args.check.unwrap(), print: args.print.unwrap(), wrap: args.wrap.unwrap(), wraplen: args.wraplen.unwrap(), wrapmin: args.wrapmin.unwrap(), tabsize: args.tabsize.unwrap(), tabchar: args.tabchar.unwrap(), stdin: args.stdin.unwrap(), config: args.config, lists: args.lists, verbosity: args.verbosity.unwrap(), arguments: args.arguments.unwrap(), files: args.files, } } /// Resolve conflicting arguments pub fn resolve(&mut self, logs: &mut Vec) -> u8 { let mut exit_code = 0; // stdin implies print self.print |= self.stdin; // Set wrapmin self.wrapmin = if self.wraplen >= 50 { self.wraplen - 10 } else { self.wraplen }; // Check files are passed if no --stdin if !self.stdin && self.files.is_empty() { record_file_log( logs, Level::Error, "", "No files specified. 
Provide filenames or pass --stdin.", ); exit_code = 1; } // Check no files are passed if --stdin if self.stdin && !self.files.is_empty() { record_file_log( logs, Level::Error, "", "Do not provide file name(s) when using --stdin.", ); exit_code = 1; } // Remove duplicate list environments self.lists.dedup(); // Remove duplicate files self.files.dedup(); // Print arguments and exit if self.arguments { println!("{self}"); std::process::exit(0); } exit_code } } impl Default for Args { fn default() -> Self { Self::from(OptionArgs::default()) } } /// Print a field from `Args` fn display_arg_line( f: &mut fmt::Formatter, name: &str, value: &str, ) -> fmt::Result { let width = 20; let name_fmt = format!("{}{}", name.bold(), ":"); write!(f, "\n {name_fmt: fmt::Result { write!(f, "{}", "tex-fmt".magenta().bold())?; display_arg_line(f, "check", &self.check.to_string())?; display_arg_line(f, "print", &self.print.to_string())?; display_arg_line(f, "wrap", &self.wrap.to_string())?; display_arg_line(f, "wraplen", &self.wraplen.to_string())?; display_arg_line(f, "wrapmin", &self.wrapmin.to_string())?; display_arg_line(f, "tabsize", &self.tabsize.to_string())?; display_arg_line(f, "tabchar", &self.tabchar.to_string())?; display_arg_line(f, "stdin", &self.stdin.to_string())?; match &self.config { None => display_arg_line(f, "config", "None")?, Some(c) => display_arg_line(f, "config", &c.display().to_string())?, } display_arg_line( f, "verbosity", &self.verbosity.to_string().to_lowercase(), )?; if !self.lists.is_empty() { display_arg_line(f, "lists", &self.lists[0])?; for file in &self.lists[1..] { write!( f, "\n {:` fn get_flag(arg_matches: &ArgMatches, flag: &str) -> Option { if arg_matches.get_flag(flag) { Some(true) } else { None } } /// Parse CLI arguments into `OptionArgs` struct pub fn get_cli_args() -> OptionArgs { let mut command = get_cli_command(); let arg_matches = command.clone().get_matches(); // Generate completions and exit if let Some(shell) = arg_matches.get_one::("completion") { generate(*shell, &mut command, "tex-fmt", &mut io::stdout()); std::process::exit(0); } // Generate man page and exit if arg_matches.get_flag("man") { let man = Man::new(command); man.render(&mut io::stdout()).unwrap(); std::process::exit(0); } let wrap: Option = if arg_matches.get_flag("nowrap") { Some(false) } else { None }; let tabchar = if arg_matches.get_flag("usetabs") { Some(TabChar::Tab) } else { None }; let verbosity = if arg_matches.get_flag("trace") { Some(LevelFilter::Trace) } else if arg_matches.get_flag("verbose") { Some(LevelFilter::Info) } else if arg_matches.get_flag("quiet") { Some(LevelFilter::Error) } else { None }; let args = OptionArgs { check: get_flag(&arg_matches, "check"), print: get_flag(&arg_matches, "print"), wrap, wraplen: arg_matches.get_one::("wraplen").copied(), wrapmin: None, tabsize: arg_matches.get_one::("tabsize").copied(), tabchar, stdin: get_flag(&arg_matches, "stdin"), config: arg_matches.get_one::("config").cloned(), noconfig: get_flag(&arg_matches, "noconfig"), lists: vec![], verbosity, arguments: get_flag(&arg_matches, "args"), files: arg_matches .get_many::("files") .unwrap_or_default() .map(ToOwned::to_owned) .collect::>(), }; args } tex-fmt-0.5.2/src/command.rs000066400000000000000000000070361473573253500157010ustar00rootroot00000000000000use clap::{value_parser, Command, Arg, ArgAction}; use ArgAction::{Append, SetTrue}; use std::path::PathBuf; /// Construct the CLI command #[allow(clippy::too_many_lines)] fn get_cli_command() -> Command { Command::new("tex-fmt") 
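// Descriptive note: build.rs includes this file via
// include!("src/command.rs"), so the same Command definition drives the
// run-time CLI and the build-time generation of completion scripts and
// man pages (with the shellinstall feature).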
        .author("William George Underwood, wg.underwood13@gmail.com")
        .about(clap::crate_description!())
        .version(clap::crate_version!())
        .before_help(format!("tex-fmt {}", clap::crate_version!()))
        .arg(
            Arg::new("check")
                .short('c')
                .long("check")
                .action(SetTrue)
                .help("Check formatting, do not modify files"),
        )
        .arg(
            Arg::new("print")
                .short('p')
                .long("print")
                .action(SetTrue)
                .help("Print to stdout, do not modify files"),
        )
        .arg(
            Arg::new("nowrap")
                .short('n')
                .long("nowrap")
                .action(SetTrue)
                .help("Do not wrap long lines"),
        )
        .arg(
            Arg::new("wraplen")
                .short('l')
                .long("wraplen")
                .value_parser(value_parser!(u8))
                .help("Line length for wrapping [default: 80]"),
        )
        .arg(
            Arg::new("tabsize")
                .short('t')
                .long("tabsize")
                .value_parser(value_parser!(u8))
                .help("Number of characters to use as tab size [default: 2]"),
        )
        .arg(
            Arg::new("usetabs")
                .long("usetabs")
                .action(SetTrue)
                .help("Use tabs instead of spaces for indentation"),
        )
        .arg(
            Arg::new("stdin")
                .short('s')
                .long("stdin")
                .action(SetTrue)
                .help("Process stdin as a single file, output to stdout"),
        )
        .arg(
            Arg::new("config")
                .long("config")
                .value_parser(value_parser!(PathBuf))
                .help("Path to configuration file")
        )
        .arg(
            Arg::new("noconfig")
                .long("noconfig")
                .action(SetTrue)
                .help("Do not read any config file"),
        )
        .arg(
            Arg::new("verbose")
                .short('v')
                .long("verbose")
                .action(SetTrue)
                .help("Show info messages"),
        )
        .arg(
            Arg::new("quiet")
                .short('q')
                .long("quiet")
                .action(SetTrue)
                .help("Hide warning messages"),
        )
        .arg(
            Arg::new("trace")
                .long("trace")
                .action(SetTrue)
                .help("Show trace messages"),
        )
        .arg(
            Arg::new("completion")
                .long("completion")
                .value_parser(value_parser!(Shell))
                .value_name("shell")
                .help("Generate shell completion script")
        )
        .arg(
            Arg::new("man")
                .long("man")
                .action(SetTrue)
                .help("Generate man page"),
        )
        .arg(
            Arg::new("args")
                .long("args")
                .action(SetTrue)
                .help("Print arguments passed to tex-fmt and exit"),
        )
        .arg(
            Arg::new("files")
                .action(Append)
                .help("List of files to be formatted"),
        )
}
tex-fmt-0.5.2/src/comments.rs000066400000000000000000000012541473573253500161040ustar00rootroot00000000000000
//! Utilities for finding, extracting and removing LaTeX comments

use crate::format::*;

/// Find the location where a comment begins in a line
pub fn find_comment_index(line: &str, pattern: &Pattern) -> Option<usize> {
    // often there is no '%' so check this first
    if pattern.contains_comment {
        let mut prev_c = ' ';
        for (i, c) in line.char_indices() {
            if c == '%' && prev_c != '\\' {
                return Some(i);
            }
            prev_c = c;
        }
    }
    None
}

/// Remove a comment from the end of a line
pub fn remove_comment(line: &str, comment: Option<usize>) -> &str {
    comment.map_or_else(|| line, |c| &line[0..c])
}
tex-fmt-0.5.2/src/config.rs000066400000000000000000000073611473573253500155310ustar00rootroot00000000000000//! Read arguments from a config file
use crate::args::*;
use dirs::config_dir;
use log::LevelFilter;
use std::env::current_dir;
use std::fs::{metadata, read_to_string};
use std::path::PathBuf;
use toml::Table;

/// Config file name
const CONFIG: &str = "tex-fmt.toml";

/// Try finding a config file in various sources
fn resolve_config_path(args: &OptionArgs) -> Option<PathBuf> {
    // Do not read config file
    if args.noconfig == Some(true) {
        return None;
    };
    // Named path passed as cli arg
    if args.config.is_some() {
        return args.config.clone();
    };
    // Config file in current directory
    if let Ok(mut config) = current_dir() {
        config.push(CONFIG);
        if config.exists() {
            return Some(config);
        };
    }
    // Config file at git repository root
    if let Some(mut config) = find_git_root() {
        config.push(CONFIG);
        if config.exists() {
            return Some(config);
        };
    }
    // Config file in user home config directory
    if let Some(mut config) = config_dir() {
        config.push("tex-fmt");
        config.push(CONFIG);
        if config.exists() {
            return Some(config);
        };
    }
    None
}

/// Get the git repository root directory
fn find_git_root() -> Option<PathBuf> {
    let mut depth = 0;
    let mut current_dir = current_dir().unwrap();
    while depth < 100 {
        depth += 1;
        if metadata(current_dir.join(".git"))
            .map(|m| m.is_dir())
            .unwrap_or(false)
        {
            return Some(current_dir);
        }
        if !current_dir.pop() {
            break;
        }
    }
    None
}

/// Parse arguments from a config file path
pub fn get_config_args(args: &OptionArgs) -> Option<OptionArgs> {
    let config_path = resolve_config_path(args);
    #[allow(clippy::question_mark)]
    if config_path.is_none() {
        return None;
    };
    let config_string = config_path
        .clone()
        .unwrap()
        .into_os_string()
        .into_string()
        .unwrap();
    let config = read_to_string(config_path.clone().unwrap()).unwrap();
    let config = config.parse::<Table>().unwrap_or_else(|_| {
        panic!("Failed to read config file at {config_string}")
    });
    let verbosity = match config.get("verbosity").map(|x| x.as_str().unwrap())
    {
        Some("error" | "quiet") => Some(LevelFilter::Error),
        Some("warn") => Some(LevelFilter::Warn),
        Some("info" | "verbose") => Some(LevelFilter::Info),
        Some("trace") => Some(LevelFilter::Trace),
        _ => None,
    };
    let tabchar = match config.get("tabchar").map(|x| x.as_str().unwrap()) {
        Some("tab") => Some(TabChar::Tab),
        Some("space") => Some(TabChar::Space),
        _ => None,
    };
    let args = OptionArgs {
        check: config.get("check").map(|x| x.as_bool().unwrap()),
        print: config.get("print").map(|x| x.as_bool().unwrap()),
        wrap: config.get("wrap").map(|x| x.as_bool().unwrap()),
        wraplen: config
            .get("wraplen")
            .map(|x| x.as_integer().unwrap().try_into().unwrap()),
        wrapmin: config
            .get("wrapmin")
            .map(|x| x.as_integer().unwrap().try_into().unwrap()),
        tabsize: config
            .get("tabsize")
            .map(|x| x.as_integer().unwrap().try_into().unwrap()),
        tabchar,
        stdin: config.get("stdin").map(|x| x.as_bool().unwrap()),
        config: config_path,
        noconfig: None,
        lists: config
            .get("lists")
            .and_then(|v| v.as_array())
            .unwrap_or(&vec![])
            .iter()
            .filter_map(|v| v.as_str().map(String::from))
            .collect(),
        verbosity,
        arguments: None,
        files: vec![],
    };
    Some(args)
}
tex-fmt-0.5.2/src/format.rs000066400000000000000000000216111473573253500155460ustar00rootroot00000000000000//! Core methodology for formatting a file
use crate::args::*;
use crate::ignore::*;
use crate::indent::*;
use crate::logging::*;
use crate::read::*;
use crate::regexes::{ENV_BEGIN, ENV_END, ITEM, RE_SPLITTING};
use crate::subs::*;
use crate::verbatim::*;
use crate::wrap::*;
use crate::write::*;
use crate::LINE_END;
use log::Level::{Info, Warn};
use std::iter::zip;

/// Central function to format a file
pub fn format_file(
    old_text: &str,
    file: &str,
    args: &Args,
    logs: &mut Vec<Log>,
) -> String {
    record_file_log(logs, Info, file, "Formatting started.");
    // Clean the source file and zip its lines with line numbers
    let old_text = clean_text(old_text, args);
    let mut old_lines = zip(1.., old_text.lines());
    // Initialise
    let mut state = State::new();
    let mut queue: Vec<(usize, String)> = vec![];
    let mut new_text = String::with_capacity(2 * old_text.len());
    // Select the character used for indentation.
    let indent_char = match args.tabchar {
        TabChar::Tab => "\t",
        TabChar::Space => " ",
    };
    // Get any extra environments to be indented as lists
    let lists_begin: Vec<String> = args
        .lists
        .iter()
        .map(|l| format!("\\begin{{{l}}}"))
        .collect();
    let lists_end: Vec<String> =
        args.lists.iter().map(|l| format!("\\end{{{l}}}")).collect();

    loop {
        if let Some((linum_old, mut line)) = queue.pop() {
            // Read the patterns present on this line.
            let pattern = Pattern::new(&line);
            // Temporary state for working on this line.
            let mut temp_state = state.clone();
            // Update the state with the line number from the queue.
            temp_state.linum_old = linum_old;
            // If the line should not be ignored ...
            if !set_ignore_and_report(
                &line,
                &mut temp_state,
                logs,
                file,
                &pattern,
            ) {
                // Check if the line should be split because of a pattern
                // that should begin on a new line.
                if needs_split(&line, &pattern) {
                    // Split the line into two ...
                    let (this_line, next_line) =
                        split_line(&line, &temp_state, file, args, logs);
                    // ... and queue the second part for formatting.
                    queue.push((linum_old, next_line.to_string()));
                    line = this_line.to_string();
                }
                // Calculate the indent based on the current state
                // and the patterns in the line.
                let indent = calculate_indent(
                    &line,
                    &mut temp_state,
                    logs,
                    file,
                    args,
                    &pattern,
                    &lists_begin,
                    &lists_end,
                );
                #[allow(clippy::cast_possible_wrap)]
                let indent_length =
                    usize::try_from(indent.visual * args.tabsize as i8)
                        .expect("Visual indent is non-negative.");
                // Wrap the line before applying the indent, and loop back
                // if the line needed wrapping.
                if needs_wrap(line.trim_start(), indent_length, args) {
                    let wrapped_lines = apply_wrap(
                        line.trim_start(),
                        indent_length,
                        &temp_state,
                        file,
                        args,
                        logs,
                        &pattern,
                    );
                    if let Some([this_line, next_line_start, next_line]) =
                        wrapped_lines
                    {
                        queue.push((
                            linum_old,
                            [next_line_start, next_line].concat(),
                        ));
                        queue.push((linum_old, this_line.to_string()));
                        continue;
                    }
                }
                // Lastly, apply the indent if the line didn't need wrapping.
                line = apply_indent(&line, &indent, args, indent_char);
            }
            // Add line to new text
            state = temp_state;
            new_text.push_str(&line);
            new_text.push_str(LINE_END);
            state.linum_new += 1;
        } else if let Some((linum_old, line)) = old_lines.next() {
            queue.push((linum_old, line.to_string()));
        } else {
            break;
        }
    }

    if !indents_return_to_zero(&state) {
        let msg = format!(
            "Indent does not return to zero. Last non-indented line is line {}",
            state.linum_last_zero_indent
        );
        record_file_log(logs, Warn, file, &msg);
    }

    new_text = remove_trailing_spaces(&new_text);
    new_text = remove_trailing_blank_lines(&new_text);
    record_file_log(logs, Info, file, "Formatting complete.");
    new_text
}

/// Sets the `ignore` and `verbatim` flags in the given [State] based on
/// `line` and returns whether `line` should be ignored by formatting.
fn set_ignore_and_report(
    line: &str,
    temp_state: &mut State,
    logs: &mut Vec<Log>,
    file: &str,
    pattern: &Pattern,
) -> bool {
    temp_state.ignore = get_ignore(line, temp_state, logs, file, true);
    temp_state.verbatim =
        get_verbatim(line, temp_state, logs, file, true, pattern);
    temp_state.verbatim.visual || temp_state.ignore.visual
}

/// Cleans the given text by removing extra line breaks and trailing spaces,
/// and also tabs if they shouldn't be used.
fn clean_text(text: &str, args: &Args) -> String {
    let mut text = remove_extra_newlines(text);
    if args.tabchar != TabChar::Tab {
        text = remove_tabs(&text, args);
    }
    text = remove_trailing_spaces(&text);
    text
}

/// Information on the current state during formatting
#[derive(Clone, Debug)]
pub struct State {
    /// Corresponding line number in the original file
    pub linum_old: usize,
    /// Corresponding line number in the formatted file
    pub linum_new: usize,
    /// Ignored status of the current line
    pub ignore: Ignore,
    /// Indentation status of the current line
    pub indent: Indent,
    /// Verbatim status of the current line
    pub verbatim: Verbatim,
    /// Line number in the new file of the last non-indented line
    pub linum_last_zero_indent: usize,
}

impl State {
    /// Construct a new default state
    pub const fn new() -> Self {
        Self {
            linum_old: 1,
            linum_new: 1,
            ignore: Ignore::new(),
            indent: Indent::new(),
            verbatim: Verbatim::new(),
            linum_last_zero_indent: 1,
        }
    }
}

/// Record whether a line contains certain patterns to avoid recomputing
pub struct Pattern {
    /// Whether a begin environment pattern is present
    pub contains_env_begin: bool,
    /// Whether an end environment pattern is present
    pub contains_env_end: bool,
    /// Whether an item pattern is present
    pub contains_item: bool,
    /// Whether a splitting pattern is present
    pub contains_splitting: bool,
    /// Whether a comment is present
    pub contains_comment: bool,
}

impl Pattern {
    /// Check if a string contains patterns
    pub fn new(s: &str) -> Self {
        // If splitting does not match, most patterns are not present
        if RE_SPLITTING.is_match(s) {
            Self {
                contains_env_begin: s.contains(ENV_BEGIN),
                contains_env_end: s.contains(ENV_END),
                contains_item: s.contains(ITEM),
                contains_splitting: true,
                contains_comment: s.contains('%'),
            }
        } else {
            Self {
                contains_env_begin: false,
                contains_env_end: false,
                contains_item: false,
                contains_splitting: false,
                contains_comment: s.contains('%'),
            }
        }
    }
}

/// Ensure that the indentation returns to zero at the end of the file
const fn indents_return_to_zero(state: &State) -> bool {
    state.indent.actual == 0
}

/// Run tex-fmt with the provided arguments
pub fn run(args: &Args, logs: &mut Vec<Log>) -> u8 {
    let mut exit_code = 0;
    if args.stdin {
        if let Some((file, text)) = read_stdin(logs) {
            let new_text = format_file(&text, &file, args, logs);
            exit_code = process_output(args, &file, &text, &new_text, logs);
        } else {
            exit_code = 1;
        }
    } else {
        for file in &args.files {
            if let Some((file, text)) = read(file, logs) {
                let new_text = format_file(&text, &file, args, logs);
                exit_code =
                    process_output(args, &file, &text, &new_text, logs);
            } else {
                exit_code = 1;
            }
        }
    }
    exit_code
}
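// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the crate): driving `format_file` directly
// with default arguments. The input string and the expected two-space indent
// under `itemize` are assumptions based on `OptionArgs::default()` above.
//
//     let args = Args::default();
//     let mut logs = Vec::<Log>::new();
//     let input = "\\begin{itemize}\n\\item a\n\\end{itemize}\n";
//     let output = format_file(input, "example.tex", &args, &mut logs);
//     assert!(output.contains("  \\item a"));
// ---------------------------------------------------------------------------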
tex-fmt-0.5.2/src/ignore.rs000066400000000000000000000042501473573253500155410ustar00rootroot00000000000000
//! Utilities for ignoring/skipping source lines

use crate::format::*;
use crate::logging::*;
use log::Level::Warn;

/// Information on the ignored state of a line
#[derive(Clone, Debug)]
pub struct Ignore {
    /// Whether the line is in an ignore block
    pub actual: bool,
    /// Whether the line should be ignored/skipped
    pub visual: bool,
}

impl Ignore {
    /// Construct a new ignore state
    pub const fn new() -> Self {
        Self {
            actual: false,
            visual: false,
        }
    }
}

/// Determine whether a line should be ignored
pub fn get_ignore(
    line: &str,
    state: &State,
    logs: &mut Vec<Log>,
    file: &str,
    warn: bool,
) -> Ignore {
    let skip = contains_ignore_skip(line);
    let begin = contains_ignore_begin(line);
    let end = contains_ignore_end(line);
    let actual: bool;
    let visual: bool;

    if skip {
        actual = state.ignore.actual;
        visual = true;
    } else if begin {
        actual = true;
        visual = true;
        if warn && state.ignore.actual {
            record_line_log(
                logs,
                Warn,
                file,
                state.linum_new,
                state.linum_old,
                line,
                "Cannot begin ignore block:",
            );
        }
    } else if end {
        actual = false;
        visual = true;
        if warn && !state.ignore.actual {
            record_line_log(
                logs,
                Warn,
                file,
                state.linum_new,
                state.linum_old,
                line,
                "No ignore block to end.",
            );
        }
    } else {
        actual = state.ignore.actual;
        visual = state.ignore.actual;
    }
    Ignore { actual, visual }
}

/// Check if a line contains a skip directive
fn contains_ignore_skip(line: &str) -> bool {
    line.ends_with("% tex-fmt: skip")
}

/// Check if a line contains the start of an ignore block
fn contains_ignore_begin(line: &str) -> bool {
    line.ends_with("% tex-fmt: off")
}

/// Check if a line contains the end of an ignore block
fn contains_ignore_end(line: &str) -> bool {
    line.ends_with("% tex-fmt: on")
}
tex-fmt-0.5.2/src/indent.rs000066400000000000000000000147131473573253500155440ustar00rootroot00000000000000
//! Utilities for indenting source lines

use crate::args::*;
use crate::comments::*;
use crate::format::*;
use crate::logging::*;
use crate::regexes::*;
use core::cmp::max;
use log::Level;
use log::LevelFilter;

/// Opening delimiters
const OPENS: [char; 3] = ['{', '(', '['];
/// Closing delimiters
const CLOSES: [char; 3] = ['}', ')', ']'];

/// Information on the indentation state of a line
#[derive(Debug, Clone)]
pub struct Indent {
    /// The indentation level of a line
    pub actual: i8,
    /// The visual indentation level of a line
    pub visual: i8,
}

impl Indent {
    /// Construct a new indentation state
    pub const fn new() -> Self {
        Self {
            actual: 0,
            visual: 0,
        }
    }
}

/// Calculate total indentation change due to the current line
fn get_diff(
    line: &str,
    pattern: &Pattern,
    lists_begin: &[String],
    lists_end: &[String],
) -> i8 {
    // list environments get double indents
    let mut diff: i8 = 0;
    // other environments get single indents
    if pattern.contains_env_begin && line.contains(ENV_BEGIN) {
        // documents get no global indentation
        if line.contains(DOC_BEGIN) {
            return 0;
        };
        diff += 1;
        diff += i8::from(lists_begin.iter().any(|r| line.contains(r)));
    } else if pattern.contains_env_end && line.contains(ENV_END) {
        // documents get no global indentation
        if line.contains(DOC_END) {
            return 0;
        };
        diff -= 1;
        diff -= i8::from(lists_end.iter().any(|r| line.contains(r)));
    };
    // indent for delimiters
    diff += line
        .chars()
        .map(|x| i8::from(OPENS.contains(&x)) - i8::from(CLOSES.contains(&x)))
        .sum::<i8>();
    diff
}

/// Calculate dedentation for the current line
fn get_back(
    line: &str,
    pattern: &Pattern,
    state: &State,
    lists_end: &[String],
) -> i8 {
    // Only need to dedent if indentation is present
    if state.indent.actual == 0 {
        return 0;
    }
    let mut back: i8 = 0;
    if pattern.contains_env_end && line.contains(ENV_END) {
        // documents get no global indentation
        if line.contains(DOC_END) {
            return 0;
        };
        // list environments get double indents for indenting items
        for r in lists_end {
            if line.contains(r) {
                return 2;
            };
        }
        // other environments get single indents
        back = 1;
    } else if pattern.contains_item && line.contains(ITEM) {
        // deindent items to make the rest of item environment appear indented
        back += 1;
    };
    // Dedent delimiters
    let mut cumul: i8 = back;
    for c in line.chars() {
        cumul -= i8::from(OPENS.contains(&c));
        cumul += i8::from(CLOSES.contains(&c));
        back = max(cumul, back);
    }
    back
}

/// Calculate indentation properties of the current line
fn get_indent(
    line: &str,
    prev_indent: &Indent,
    pattern: &Pattern,
    state: &State,
    lists_begin: &[String],
    lists_end: &[String],
) -> Indent {
    let diff = get_diff(line, pattern, lists_begin, lists_end);
    let back = get_back(line, pattern, state, lists_end);
    let actual = prev_indent.actual + diff;
    let visual = prev_indent.actual - back;
    Indent { actual, visual }
}

/// Calculates the indent for `line` based on its contents.
/// This function saves the calculated [Indent], which might be
/// negative, to the given [State], and then ensures that the returned
/// [Indent] is non-negative.
#[allow(clippy::too_many_arguments)]
pub fn calculate_indent(
    line: &str,
    state: &mut State,
    logs: &mut Vec<Log>,
    file: &str,
    args: &Args,
    pattern: &Pattern,
    lists_begin: &[String],
    lists_end: &[String],
) -> Indent {
    // Calculate the new indent by first removing the comment from the line
    // (if there is one) to ignore diffs from characters in there.
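    // Illustrative example (not from the original source): on the line
    //     \begin{align} % ((((
    // the trailing comment is stripped first, so the unmatched parentheses
    // inside it do not change the indent; only \begin{align} adds one level.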
    let comment_index = find_comment_index(line, pattern);
    let line_strip = remove_comment(line, comment_index);
    let mut indent = get_indent(
        line_strip,
        &state.indent,
        pattern,
        state,
        lists_begin,
        lists_end,
    );
    // Record the indent to the logs.
    if args.verbosity == LevelFilter::Trace {
        record_line_log(
            logs,
            Level::Trace,
            file,
            state.linum_new,
            state.linum_old,
            line,
            &format!(
                "Indent: actual = {}, visual = {}:",
                indent.actual, indent.visual
            ),
        );
    }
    // Save the indent to the state. Note, this indent might be negative;
    // it is saved without correction so that this is
    // not forgotten for the next iterations.
    state.indent = indent.clone();
    // Update the last zero-indented line for use in error messages.
    if indent.visual == 0 && state.linum_new > state.linum_last_zero_indent {
        state.linum_last_zero_indent = state.linum_new;
    }
    // However, we can't negatively indent a line.
    // So we log the negative indent and reset the values to 0.
    if (indent.visual < 0) || (indent.actual < 0) {
        record_line_log(
            logs,
            Level::Warn,
            file,
            state.linum_new,
            state.linum_old,
            line,
            "Indent is negative.",
        );
        indent.actual = indent.actual.max(0);
        indent.visual = indent.visual.max(0);
    }
    indent
}

/// Apply the given indentation to a line
pub fn apply_indent(
    line: &str,
    indent: &Indent,
    args: &Args,
    indent_char: &str,
) -> String {
    let first_non_whitespace = line.chars().position(|c| !c.is_whitespace());
    // If line is blank, return an empty line
    if first_non_whitespace.is_none() {
        return String::new();
    }
    // If line is correctly indented, return it directly
    #[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
    let n_indent_chars = (indent.visual * args.tabsize as i8) as usize;
    if first_non_whitespace == Some(n_indent_chars) {
        return line.into();
    }
    // Otherwise, allocate enough memory to fit line with the added
    // indentation and insert the appropriate string slices
    let trimmed_line = line.trim_start();
    let mut new_line =
        String::with_capacity(trimmed_line.len() + n_indent_chars);
    for idx in 0..n_indent_chars {
        new_line.insert_str(idx, indent_char);
    }
    new_line.insert_str(n_indent_chars, trimmed_line);
    new_line
}
tex-fmt-0.5.2/src/logging.rs000066400000000000000000000104641473573253500157100ustar00rootroot00000000000000
//! Utilities for logging

use colored::{Color, Colorize};
use env_logger::Builder;
use log::Level;
use log::Level::{Debug, Error, Info, Trace, Warn};
use log::LevelFilter;
use std::cmp::Reverse;
use std::io::Write;
use std::path::Path;
use std::time::Instant;

/// Holds a log entry
#[derive(Debug)]
pub struct Log {
    /// Log entry level
    pub level: Level,
    /// Time when the entry was logged
    pub time: Instant,
    /// File name associated with the entry
    pub file: String,
    /// Line number in the formatted file
    pub linum_new: Option<usize>,
    /// Line number in the original file
    pub linum_old: Option<usize>,
    /// Line content
    pub line: Option<String>,
    /// Entry-specific message
    pub message: String,
}

/// Append a log to the logs list
fn record_log(
    logs: &mut Vec<Log>,
    level: Level,
    file: &str,
    linum_new: Option<usize>,
    linum_old: Option<usize>,
    line: Option<String>,
    message: &str,
) {
    let log = Log {
        level,
        time: Instant::now(),
        file: file.to_string(),
        linum_new,
        linum_old,
        line,
        message: message.to_string(),
    };
    logs.push(log);
}

/// Append a file log to the logs list
pub fn record_file_log(
    logs: &mut Vec<Log>,
    level: Level,
    file: &str,
    message: &str,
) {
    record_log(logs, level, file, None, None, None, message);
}

/// Append a line log to the logs list
pub fn record_line_log(
    logs: &mut Vec<Log>,
    level: Level,
    file: &str,
    linum_new: usize,
    linum_old: usize,
    line: &str,
    message: &str,
) {
    record_log(
        logs,
        level,
        file,
        Some(linum_new),
        Some(linum_old),
        Some(line.to_string()),
        message,
    );
}

/// Get the color of a log level
const fn get_log_color(log_level: Level) -> Color {
    match log_level {
        Info => Color::Cyan,
        Warn => Color::Yellow,
        Error => Color::Red,
        Trace => Color::Green,
        Debug => panic!(),
    }
}

/// Start the logger
pub fn init_logger(level_filter: LevelFilter) {
    Builder::new()
        .filter_level(level_filter)
        .format(|buf, record| {
            writeln!(
                buf,
                "{}: {}",
                record
                    .level()
                    .to_string()
                    .color(get_log_color(record.level()))
                    .bold(),
                record.args()
            )
        })
        .init();
}

/// Display all of the logs collected
pub fn print_logs(logs: &mut Vec<Log>) {
    logs.sort_by_key(|l| {
        (
            l.level,
            l.linum_new,
            l.linum_old,
            l.message.clone(),
            Reverse(l.time),
        )
    });
    logs.dedup_by(|a, b| {
        (
            a.level,
            &a.file,
            a.linum_new,
            a.linum_old,
            &a.line,
            &a.message,
        ) == (
            b.level,
            &b.file,
            b.linum_new,
            b.linum_old,
            &b.line,
            &b.message,
        )
    });
    logs.sort_by_key(|l| l.time);
    for log in logs {
        let linum_new = log
            .linum_new
            .map_or_else(String::new, |i| format!("Line {i} "));
        let linum_old = log
            .linum_old
            .map_or_else(String::new, |i| format!("({i}). "));
        let line = log
            .line
            .as_ref()
            .map_or_else(String::new, |l| l.trim_start().to_string());
        let log_string = format!(
            "{} {}: {}{}{} {}",
            "tex-fmt".magenta().bold(),
            match log.file.as_str() {
                "" | "<stdin>" => "<stdin>".blue().bold(),
                _ => Path::new(&log.file)
                    .file_name()
                    .unwrap()
                    .to_str()
                    .unwrap()
                    .blue()
                    .bold(),
            },
            linum_new.white().bold(),
            linum_old.white().bold(),
            log.message.yellow().bold(),
            line,
        );
        match log.level {
            Error => log::error!("{}", log_string),
            Warn => log::warn!("{}", log_string),
            Info => log::info!("{}", log_string),
            Trace => log::trace!("{}", log_string),
            Debug => panic!(),
        }
    }
}
tex-fmt-0.5.2/src/main.rs000066400000000000000000000022151473573253500152010ustar00rootroot00000000000000
//! tex-fmt
//! An extremely fast LaTeX formatter written in Rust

#![warn(missing_docs)]
#![warn(clippy::nursery)]
#![warn(clippy::cargo)]
#![warn(clippy::missing_docs_in_private_items)]
#![warn(clippy::pedantic)]
#![allow(clippy::wildcard_imports)]
#![allow(clippy::multiple_crate_versions)]
#![allow(clippy::struct_excessive_bools)]
#![allow(clippy::module_name_repetitions)]

use std::fs;
use std::process::ExitCode;

mod args;
mod cli;
mod comments;
mod config;
mod format;
mod ignore;
mod indent;
mod logging;
mod read;
mod regexes;
mod subs;
mod verbatim;
mod wrap;
mod write;

use crate::args::*;
use crate::format::*;
use crate::logging::*;

#[cfg(test)]
mod tests;

#[cfg(target_family = "unix")]
/// Line ending for unix
const LINE_END: &str = "\n";

#[cfg(target_family = "windows")]
/// Line ending for Windows
const LINE_END: &str = "\r\n";

fn main() -> ExitCode {
    let mut args = get_args();
    init_logger(args.verbosity);
    let mut logs = Vec::<Log>::new();
    let mut exit_code = args.resolve(&mut logs);
    if exit_code == 0 {
        exit_code = run(&args, &mut logs);
    }
    print_logs(&mut logs);
    ExitCode::from(exit_code)
}
tex-fmt-0.5.2/src/read.rs000066400000000000000000000027431473573253500151760ustar00rootroot00000000000000
//! Utilities for reading files

use crate::logging::*;
use crate::regexes::*;
use log::Level::{Error, Trace};
use std::fs;
use std::io::Read;

/// Add a missing extension and read the file
pub fn read(file: &str, logs: &mut Vec<Log>) -> Option<(String, String)> {
    // check if file has an accepted extension
    let has_ext = EXTENSIONS.iter().any(|e| file.ends_with(e));
    // if no valid extension, try adding .tex
    let mut new_file = file.to_owned();
    if !has_ext {
        new_file.push_str(".tex");
    };
    if let Ok(text) = fs::read_to_string(&new_file) {
        return Some((new_file, text));
    }
    if has_ext {
        record_file_log(logs, Error, file, "Could not open file.");
    } else {
        record_file_log(logs, Error, file, "File type invalid.");
    }
    None
}

/// Attempt to read from stdin, return filename `<stdin>` and text
pub fn read_stdin(logs: &mut Vec<Log>) -> Option<(String, String)> {
    let mut text = String::new();
    match std::io::stdin().read_to_string(&mut text) {
        Ok(bytes) => {
            record_file_log(
                logs,
                Trace,
                "<stdin>",
                &format!("Read {bytes} bytes."),
            );
            Some((String::from("<stdin>"), text))
        }
        Err(e) => {
            record_file_log(
                logs,
                Error,
                "<stdin>",
                &format!("Could not read from stdin: {e}"),
            );
            None
        }
    }
}
tex-fmt-0.5.2/src/regexes.rs000066400000000000000000000046561473573253500157300ustar00rootroot00000000000000
//! Regexes and matching utilities

use crate::LINE_END;
use lazy_static::lazy_static;
use regex::Regex;

/// Match a LaTeX \item
pub const ITEM: &str = "\\item";
/// Match a LaTeX \begin{document}
pub const DOC_BEGIN: &str = "\\begin{document}";
/// Match a LaTeX \end{document}
pub const DOC_END: &str = "\\end{document}";
/// Match a LaTeX \begin{...}
pub const ENV_BEGIN: &str = "\\begin{";
/// Match a LaTeX \end{...}
pub const ENV_END: &str = "\\end{";

/// Acceptable LaTeX file extensions
pub const EXTENSIONS: [&str; 4] = [".tex", ".bib", ".sty", ".cls"];

/// Names of LaTeX verbatim environments
const VERBATIMS: [&str; 5] =
    ["verbatim", "Verbatim", "lstlisting", "minted", "comment"];

/// Regex matches for sectioning commands
const SPLITTING: [&str; 6] = [
    r"\\begin\{",
    r"\\end\{",
    r"\\item(?:$|[^a-zA-Z])",
    r"\\(?:sub){0,2}section\*?\{",
    r"\\chapter\*?\{",
    r"\\part\*?\{",
];

// Regexes
lazy_static! {
    // A static `String` which is a valid regex to match any one of the
    // [`SPLITTING`] patterns.
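    // For illustration (assuming the SPLITTING patterns above), the built
    // string has the shape
    //     (\\begin\{|\\end\{|\\item(?:$|[^a-zA-Z])|...)
    // i.e. a single alternation wrapped in one capturing group.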
    pub static ref SPLITTING_STRING: String =
        ["(", SPLITTING.join("|").as_str(), ")"].concat();
    pub static ref RE_NEWLINES: Regex =
        Regex::new(&format!(r"{LINE_END}{LINE_END}({LINE_END})+")).unwrap();
    pub static ref RE_TRAIL: Regex =
        Regex::new(&format!(r" +{LINE_END}")).unwrap();
    pub static ref VERBATIMS_BEGIN: Vec<String> = VERBATIMS
        .iter()
        .map(|l| format!("\\begin{{{l}}}"))
        .collect();
    pub static ref VERBATIMS_END: Vec<String> =
        VERBATIMS.iter().map(|l| format!("\\end{{{l}}}")).collect();
    // Regex that matches splitting commands
    pub static ref RE_SPLITTING: Regex = Regex::new(
        SPLITTING_STRING.as_str()
    )
    .unwrap();
    // Matches splitting commands with non-whitespace characters before it.
    pub static ref RE_SPLITTING_SHARED_LINE: Regex = Regex::new(
        [r"(:?\S.*?)", "(:?", SPLITTING_STRING.as_str(), ".*)"]
            .concat().as_str()
    )
    .unwrap();
    // Matches any splitting command with non-whitespace
    // characters before it, catches the previous text in a group called
    // "prev" and captures the command itself and the remaining text
    // in a group called "env".
    pub static ref RE_SPLITTING_SHARED_LINE_CAPTURE: Regex = Regex::new(
        [r"(?P<prev>\S.*?)", "(?P<env>", SPLITTING_STRING.as_str(), ".*)"]
            .concat().as_str()
    )
    .unwrap();
}
tex-fmt-0.5.2/src/subs.rs000066400000000000000000000060501473573253500152320ustar00rootroot00000000000000
//! Utilities for performing text substitutions

use crate::args::*;
use crate::comments::*;
use crate::format::*;
use crate::logging::*;
use crate::regexes::*;
use crate::LINE_END;
use log::Level;
use log::LevelFilter;

/// Remove multiple line breaks
pub fn remove_extra_newlines(text: &str) -> String {
    let double_line_end = format!("{LINE_END}{LINE_END}");
    RE_NEWLINES.replace_all(text, double_line_end).to_string()
}

/// Replace tabs with spaces
pub fn remove_tabs(text: &str, args: &Args) -> String {
    let replace = (0..args.tabsize).map(|_| " ").collect::<String>();
    text.replace('\t', &replace)
}

/// Remove trailing spaces from line endings
pub fn remove_trailing_spaces(text: &str) -> String {
    RE_TRAIL.replace_all(text, LINE_END).to_string()
}

/// Remove trailing blank lines from file
pub fn remove_trailing_blank_lines(text: &str) -> String {
    let mut new_text = text.trim_end().to_string();
    new_text.push_str(LINE_END);
    new_text
}

/// Check if line contains content which should be split onto a new line
pub fn needs_split(line: &str, pattern: &Pattern) -> bool {
    // Check if we should format this line and if we've matched an environment.
    let contains_splittable_env = pattern.contains_splitting
        && RE_SPLITTING_SHARED_LINE.is_match(line);
    // If we're not ignoring and we've matched an environment ...
    if contains_splittable_env {
        // ... return `true` if the comment index is `None`
        // (which implies the split point must be in text), otherwise
        // compare the index of the comment with the split point.
        find_comment_index(line, pattern).map_or(true, |comment_index| {
            if RE_SPLITTING_SHARED_LINE_CAPTURE
                .captures(line)
                .unwrap() // Matched split point so no panic.
                .get(2)
                .unwrap() // Regex has 4 groups so index 2 is in bounds.
                .start()
                > comment_index
            {
                // If split point is past the comment index, don't split.
                false
            } else {
                // Otherwise, split point is before comment and we do split.
                true
            }
        })
    } else {
        // If ignoring or didn't match an environment, don't need a new line.
        false
    }
}

/// Ensure lines are split correctly.
///
/// Returns a tuple containing:
/// 1. a reference to the line that was given, shortened because of the split
/// 2. a reference to the part of the line that was split
pub fn split_line<'a>(
    line: &'a str,
    state: &State,
    file: &str,
    args: &Args,
    logs: &mut Vec<Log>,
) -> (&'a str, &'a str) {
    let captures = RE_SPLITTING_SHARED_LINE_CAPTURE.captures(line).unwrap();
    let (line, [prev, rest, _]) = captures.extract();
    if args.verbosity == LevelFilter::Trace {
        record_line_log(
            logs,
            Level::Trace,
            file,
            state.linum_new,
            state.linum_old,
            line,
            "Placing environment on new line.",
        );
    }
    (prev, rest)
}
tex-fmt-0.5.2/src/tests.rs000066400000000000000000000065301473573253500154230ustar00rootroot00000000000000
use crate::args::*;
use crate::format_file;
use crate::fs;
use crate::logging::*;
use colored::Colorize;
use similar::{ChangeTag, TextDiff};

fn test_file(source_file: &str, target_file: &str) -> bool {
    let args = Args::default();
    let mut logs = Vec::<Log>::new();
    let source_text = fs::read_to_string(source_file).unwrap();
    let target_text = fs::read_to_string(target_file).unwrap();
    let fmt_source_text =
        format_file(&source_text, source_file, &args, &mut logs);
    if fmt_source_text != target_text {
        println!(
            "{} {} -> {}",
            "fail".red().bold(),
            source_file.yellow().bold(),
            target_file.yellow().bold()
        );
        let diff = TextDiff::from_lines(&fmt_source_text, &target_text);
        for change in diff.iter_all_changes() {
            match change.tag() {
                ChangeTag::Delete => print!(
                    "{} {}",
                    format!("@ {:>3}:", change.old_index().unwrap())
                        .blue()
                        .bold(),
                    format!("- {change}").red().bold(),
                ),
                ChangeTag::Insert => print!(
                    "{} {}",
                    format!("@ {:>3}:", change.new_index().unwrap())
                        .blue()
                        .bold(),
                    format!("+ {change}").green().bold(),
                ),
                ChangeTag::Equal => {}
            };
        }
    }
    fmt_source_text == target_text
}

fn read_files_from_dir(dir: &str) -> Vec<String> {
    fs::read_dir(dir)
        .unwrap()
        .map(|f| f.unwrap().file_name().into_string().unwrap())
        .collect()
}

#[test]
fn test_source() {
    let source_files = read_files_from_dir("./tests/source/");
    for file in source_files {
        if !test_file(
            &format!("tests/source/{file}"),
            &format!("tests/target/{file}"),
        ) {
            panic!("Failed in {file}");
        }
    }
}

#[test]
fn test_target() {
    let target_files = read_files_from_dir("./tests/target/");
    let mut fail = false;
    for file in target_files {
        if !test_file(
            &format!("tests/target/{file}"),
            &format!("tests/target/{file}"),
        ) {
            fail = true;
        }
    }
    assert!(!fail, "Some tests failed");
}

#[test]
#[ignore]
fn test_short() {
    let files = vec![
        //"brackets.tex",
        //"cam-thesis.cls",
        //"comments.tex",
        //"cv.tex",
        //"document.tex",
        // "environment_lines.tex",
        //"heavy_wrap.tex",
        //"higher_categories_thesis.bib",
        //"higher_categories_thesis.tex",
        //"ignore.tex",
        //"lists.tex",
        //"masters_dissertation.tex",
        //"ociamthesis.cls",
        //"phd_dissertation.tex",
        //"phd_dissertation_refs.bib",
        //"puthesis.cls",
        //"quiver.sty",
        //"readme.tex",
        //"sections.tex",
        "short_document.tex",
        //"tikz_network.sty",
        //"unicode.tex",
        //"verbatim.tex",
        //"wgu-cv.cls",
        //"wrap.tex",
    ];
    let mut fail = false;
    for file in files {
        if !test_file(
            &format!("tests/source/{file}"),
            &format!("tests/target/{file}"),
        ) {
            fail = true;
        }
    }
    assert!(!fail, "Some tests failed");
}
tex-fmt-0.5.2/src/verbatim.rs000066400000000000000000000030121473573253500160620ustar00rootroot00000000000000
//! Utilities for ignoring verbatim environments

use crate::format::*;
use crate::logging::*;
use crate::regexes::*;
use log::Level::Warn;

/// Information on the verbatim state of a line
#[derive(Clone, Debug)]
pub struct Verbatim {
    /// The verbatim depth of a line
    pub actual: i8,
    /// Whether the line is in a verbatim environment
    pub visual: bool,
}

impl Verbatim {
    /// Construct a new verbatim state
    pub const fn new() -> Self {
        Self {
            actual: 0,
            visual: false,
        }
    }
}

/// Determine whether a line is in a verbatim environment
pub fn get_verbatim(
    line: &str,
    state: &State,
    logs: &mut Vec<Log>,
    file: &str,
    warn: bool,
    pattern: &Pattern,
) -> Verbatim {
    let diff = get_verbatim_diff(line, pattern);
    let actual = state.verbatim.actual + diff;
    let visual = actual > 0 || state.verbatim.actual > 0;
    if warn && (actual < 0) {
        record_line_log(
            logs,
            Warn,
            file,
            state.linum_new,
            state.linum_old,
            line,
            "Verbatim count is negative.",
        );
    }
    Verbatim { actual, visual }
}

/// Calculate total verbatim depth change
fn get_verbatim_diff(line: &str, pattern: &Pattern) -> i8 {
    if pattern.contains_env_begin
        && VERBATIMS_BEGIN.iter().any(|r| line.contains(r))
    {
        1
    } else if pattern.contains_env_end
        && VERBATIMS_END.iter().any(|r| line.contains(r))
    {
        -1
    } else {
        0
    }
}
tex-fmt-0.5.2/src/wrap.rs000066400000000000000000000052241473573253500152310ustar00rootroot00000000000000
//! Utilities for wrapping long lines

use crate::args::*;
use crate::comments::*;
use crate::format::*;
use crate::logging::*;
use log::Level;
use log::LevelFilter;

/// String slice to start wrapped text lines
pub const TEXT_LINE_START: &str = "";
/// String slice to start wrapped comment lines
pub const COMMENT_LINE_START: &str = "% ";

/// Check if a line needs wrapping
pub fn needs_wrap(line: &str, indent_length: usize, args: &Args) -> bool {
    args.wrap && (line.chars().count() + indent_length > args.wraplen.into())
}

/// Find the best place to break a long line
fn find_wrap_point(
    line: &str,
    indent_length: usize,
    args: &Args,
) -> Option<usize> {
    let mut wrap_point: Option<usize> = None;
    let mut after_char = false;
    let mut prev_char: Option<char> = None;
    let mut line_width = 0;
    let wrap_boundary = usize::from(args.wrapmin) - indent_length;
    // Return *byte* index rather than *char* index.
    for (i, c) in line.char_indices() {
        line_width += 1;
        if line_width > wrap_boundary && wrap_point.is_some() {
            break;
        }
        if c == ' ' && prev_char != Some('\\') {
            if after_char {
                wrap_point = Some(i);
            }
        } else if c != '%' {
            after_char = true;
        }
        prev_char = Some(c);
    }
    wrap_point
}

/// Wrap a long line into a short prefix and a suffix
pub fn apply_wrap<'a>(
    line: &'a str,
    indent_length: usize,
    state: &State,
    file: &str,
    args: &Args,
    logs: &mut Vec<Log>,
    pattern: &Pattern,
) -> Option<[&'a str; 3]> {
    if args.verbosity == LevelFilter::Trace {
        record_line_log(
            logs,
            Level::Trace,
            file,
            state.linum_new,
            state.linum_old,
            line,
            "Wrapping long line.",
        );
    }
    let wrap_point = find_wrap_point(line, indent_length, args);
    let comment_index = find_comment_index(line, pattern);
    match wrap_point {
        Some(p) if p <= args.wraplen.into() => {}
        _ => {
            record_line_log(
                logs,
                Level::Warn,
                file,
                state.linum_new,
                state.linum_old,
                line,
                "Line cannot be wrapped.",
            );
        }
    };
    wrap_point.map(|p| {
        let this_line = &line[0..p];
        let next_line_start = comment_index.map_or("", |c| {
            if p > c {
                COMMENT_LINE_START
            } else {
                TEXT_LINE_START
            }
        });
        let next_line = &line[p + 1..];
        [this_line, next_line_start, next_line]
    })
}
tex-fmt-0.5.2/src/write.rs000066400000000000000000000014201473573253500154060ustar00rootroot00000000000000
//! Utilities for writing formatted files

use crate::args::*;
use crate::fs;
use crate::logging::*;
use log::Level::Error;
use std::path;

/// Write a formatted file to disk
fn write_file(file: &str, text: &str) {
    let filepath = path::Path::new(&file).canonicalize().unwrap();
    fs::write(filepath, text).expect("Could not write the file");
}

/// Handle the newly formatted file
pub fn process_output(
    args: &Args,
    file: &str,
    text: &str,
    new_text: &str,
    logs: &mut Vec<Log>,
) -> u8 {
    if args.print {
        print!("{}", &new_text);
    } else if args.check && text != new_text {
        record_file_log(logs, Error, file, "Incorrect formatting.");
        return 1;
    } else if text != new_text {
        write_file(file, new_text);
    }
    0
}
tex-fmt-0.5.2/tests/000077500000000000000000000000001473573253500142625ustar00rootroot00000000000000tex-fmt-0.5.2/tests/source/000077500000000000000000000000001473573253500155625ustar00rootroot00000000000000tex-fmt-0.5.2/tests/source/brackets.tex000066400000000000000000000014331473573253500201030ustar00rootroot00000000000000\documentclass{article} \begin{document} Matching brackets on a line do nothing (like this). Matching brackets on two lines also do nothing (like this longer example). Matching brackets on three lines get an indent (like this much much longer example right here on these lines). Matching brackets on more lines also get an indent (like this much much much much much longer example here). The brackets could start at the beginning of the line (so maybe they look like this). [They could be any shape of bracket] {Even braces get the same indents too} What about equations? They are the same: $(1 + 2 + 3)$ $(1 + 2 + 3 + 4 + 5 + 7 + 8 + 9)$ And the dollars can go anywhere as expected: $ (1 + 2 + 3 + 4 + 5 + 7 + 8 + 9) $ Note that dollars themselves are not indented \end{document} tex-fmt-0.5.2/tests/source/cam-thesis.cls000066400000000000000000000374251473573253500203310ustar00rootroot00000000000000%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Class ``cam-thesis'' %% %% Version: v0.2 %% Authors: Jean Martina, Rok Strnisa, Matej Urbas %% Date: 30/07/2008 %% %% Copyright (c) 2008-2012, Rok Strniša, Jean Martina, Matej Urbas %% License: Simplified BSD License %% License file: ./License %% Original License URL: http://www.freebsd.org/copyright/freebsd-license.html %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% General guidelines on which this class is based: %% %% http://www.cl.cam.ac.uk/local/phd/typography/ %% http://www.admin.cam.ac.uk/offices/gradstud/exams/submission/phd/format.html %% %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Class identification. %% %%%%% \NeedsTeXFormat{LaTeX2e} \ProvidesClass{cam-thesis}[2012/04/12 University of Cambridge thesis class] \typeout{} \typeout{***********************************************} \typeout{***********************************************} \typeout{} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% General Cambridge guidelines.
%% %% LIMIT: 60k words (including tables and footnotes, excluding appendices, bib, %% photos, diagrams); title and section headings should be capitalized as normal %% sentences; citations should include authors' initials, and page numbers (if %% possible); double-sided printing is permissible for the soft bound version; %% however, single-sided is required for the text of the final, hard bound %% library copy (diagrams on facing pages are acceptable); always make it %% possible to create the ps file as well (required for technical reports). %% %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Package options (see README.md for a list of options with descriptions). %% %% These options can be provided within square brackets of the `documentclass' %% command. %% %%%%% % techreport - formats the thesis as a technical report. \newif\ifcam@techreport\cam@techreportfalse \DeclareOption{techreport}{\cam@techreporttrue} % times - tells the class to use the times font. \newif\ifcam@times\cam@timesfalse \DeclareOption{times}{\cam@timestrue} % glossary - puts the glossary (after the TOC). % \newif\ifcam@glossary\cam@glossaryfalse \DeclareOption{glossary}{\cam@glossarytrue} % index - puts the index at the end of the thesis. % \newif\ifcam@index\cam@indexfalse \DeclareOption{withindex}{\cam@indextrue} % 1st year report - omits abstract/declaration % \newif\ifcam@firstyr\cam@firstyrfalse \DeclareOption{firstyr}{\cam@firstyrtrue} % 2nd year report - omits declaration % \newif\ifcam@secondyr\cam@secondyrfalse \DeclareOption{secondyr}{\cam@secondyrtrue} % backrefs - add back references % \newif\ifcam@backrefs\cam@backrefsfalse \DeclareOption{backrefs}{\cam@backrefstrue} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Using report class as base. %% %%%%% \PassOptionsToClass{a4paper,12pt,twoside,openright}{report} \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions\relax \LoadClass{report} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% For consistent vertical spacing %% %%%%% \raggedbottom %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Additional packages, and their options. %% %%%%% \RequirePackage{graphicx} % Required for the UC Logo (on the title page) \RequirePackage{calc} % Used for calculating margins and laying out the title page % Create the index \ifcam@index \RequirePackage{makeidx} \makeindex \newcommand{\printthesisindex}{% \cleardoublepage% \phantomsection% \addcontentsline{toc}{chapter}{Index}% \printindex} \fi % Create the glossary \ifcam@glossary \RequirePackage{glossaries} \makeglossaries% \newcommand{\printthesisglossary}{\printglossary[nonumberlist]} \newcommand{\cam@printthesisglossary}{% \cleardoublepage% \pagestyle{empty}% \renewcommand{\glossarypreamble}{\thispagestyle{empty}}% \printthesisglossary% } \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Page margins (suitable for J.S. Wilson & Son). %% %%%%% \newlength{\cam@topmargin} \newlength{\cam@bottommargin} \newlength{\cam@oddmargin} \newlength{\cam@evenmargin} %% Calculate and set the margins properly (with parameters that actually have %% some meaning for everyday thesis-writers). %% %% @param 1 odd side margin (inner margin). %% @param 2 even side margin (outer margin). %% @param 3 top margin. %% @param 4 bottom margin. 
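%% Example (illustrative, not from the original class): with the default book
%% margins used below, \cam@calcpaperdims{30mm}{20mm}{20mm}{20mm}, an A4 page
%% gives a text block of 160mm x 243mm (210-30-20 wide, 297-20-1.7*20 high).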
\DeclareRobustCommand{\cam@calcpaperdims}[4]{% % MARGINS % 'Top margin' is the distance between the top of the text and the top of the page. % 'Bottom margin' is the distance between the bottom of the footer (the page number) and the bottom of the page. \setlength{\cam@oddmargin}{#1} % inner margin \setlength{\cam@evenmargin}{#2} % outer margin \setlength{\cam@topmargin}{#3} % top margin (the distance from the top of the page to the top of the body text -- the header is located between) \setlength{\cam@bottommargin}{#4} % bottom margin (the distance from the bottom of the page to the bottom of the body text -- the footer is located between) % Horizontal spacing \setlength{\textwidth}{\paperwidth-\cam@oddmargin-\cam@evenmargin} % text takes the remaining width (210 - inner - outer) \setlength{\oddsidemargin}{\cam@oddmargin-1in} % Counter the LaTeX 1in margin \setlength{\evensidemargin}{\cam@evenmargin-1in} % Counter the LaTeX 1in margin \setlength{\marginparwidth}{\cam@evenmargin-8mm} % the margin only has 'outer' space available, so we have to make it a bit thinner. \setlength{\marginparsep}{3mm} % Vertical spacing \setlength{\headheight}{5mm} % The height of the box where the heading text lives \setlength{\headsep}{5mm} % The distance between the heading and the top of the text \setlength{\topmargin}{\cam@topmargin-\headheight-\headsep-1in} % Counter the LaTeX 1in margin \setlength{\textheight}{\paperheight-\cam@topmargin-1.7\cam@bottommargin} % text takes the remaining height (297 - top margin - bottom margin) \setlength{\footskip}{.7\cam@bottommargin} % The distance from the bottom of the text to the bottom of the footer } \ifcam@techreport \cam@calcpaperdims{25mm}{25mm}{20mm}{20mm} \else \cam@calcpaperdims{30mm}{20mm}{20mm}{20mm} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Variable definitions and default values: these variables should be defined by %% the user (somewhere in the preamble). For example, to put the abstract into %% the thesis, the thesis writer should type the following somewhere in the %% preamble (before the `\begin{document}` or `\frontmatter` commands are %% called): %% %% \abstract{This is my abstract.} %% %% See below (in the comments starting with 'DOCVAR: ') for a list of all % variables %% the thesis writer is expected to use. %% %%%%% % DOCVAR: abstract (The text that will be inserted into the abstract of the % thesis.) \newcommand{\@abstract}{} \renewcommand{\abstract}[1]{\renewcommand{\@abstract}{#1}} % DOCVAR: acknowledgements (The text that will be inserted into the % acknowledgments of the thesis.) \newcommand{\@acknowledgements}{} \newcommand{\acknowledgements}[1]{\renewcommand{\@acknowledgements}{#1}} % DOCVAR: college (The name of the thesis writer's college, which will appear % just below their name.) \newcommand{\@college}{} \newcommand{\college}[1]{\renewcommand{\@college}{#1}} % DOCVAR: keywords (These keywords will appear in the PDF meta-information % called `pdfkeywords`.) \newcommand{\@keywords}{} \newcommand{\keywords}[1]{\renewcommand{\@keywords}{#1}} % DOCVAR: subjectline (This subject will appear in the PDF meta-information % called `pdfsubject`.) \newcommand{\@subjectline}{} \newcommand{\subjectline}[1]{\renewcommand{\@subjectline}{#1}} % DOCVAR: submissiondate (The date of the submission of this thesis. If the % submission date is provided, it will be printed on the title page--within the % `submissionnotice` by default. 
Note that the thesis writer can provide their % own `submissionnotice`, in which case it is up to them whether they will use % this date in their notice.) \newif\ifcam@submissiondate\cam@submissiondatefalse \newcommand{\@submissiondate}{} \newcommand{\submissiondate}[1]{% \renewcommand{\@submissiondate}{#1}\cam@submissiondatetrue} % DOCVAR: submissionnotice (The submission notice is shown on the bottom of the % title page.) \newcommand{\@submissionnotice}{% \ifcam@firstyr First year report submitted \else \ifcam@secondyr Second year report submitted \else This dissertation is submitted \fi \fi \ifcam@submissiondate on \@submissiondate{} \fi \ifcam@firstyr in partial fulfilment of the requirements \fi \ifcam@secondyr in partial fulfilment of the requirements \fi for the degree of Doctor of Philosophy% } \newcommand{\submissionnotice}[1]{\renewcommand{\@submissionnotice}{#1}} % DOCVAR: collegeshield (The name of the file that contains the image of the % college's shield. If `collegeshield' is provided, it will be included in the % title page (just below the author's name and above the name of the college). \newif\ifcam@collegeshield\cam@collegeshieldfalse \newcommand{\@collegeshield}{} \newcommand{\collegeshield}[1]{% \renewcommand{\@collegeshield}{#1}\cam@collegeshieldtrue} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Chapter and section numbering %% \setcounter{secnumdepth}{3} \setcounter{tocdepth}{3} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Front matter %% %% - outside and inside front cover %% - title leaf %% Do not include the date of make! %% Institution + department. %% Names of referees. (optional) %% Degree. %% Date of submission and defense. (optional) %% Place and date of publication and publishers (and other info by them). %%%%% \newcommand{\frontmatter}{ \pagestyle{empty} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Title page components %% %%%%% \ifcam@techreport % Technical report mustn't have the custom title page (a standard one will be % prepended by the editor, see http://www.cl.cam.ac.uk/techreports/submission.html). \else % The boxes below are all that will be displayed on the title page. They are % used to calculate exactly how much space should be left between them % (vertically). 
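% (Illustrative note: each box below is typeset into an \lrbox save box so
% its size can be measured with \totalheightof when computing
% \cam@titlepagevspace, the vertical space left over on the title page.)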
%% LOGO box \newlength{\cam@logorightnudge} \setlength{\cam@logorightnudge}{-0.5\paperwidth+12mm} \newsavebox{\cam@logo} \begin{lrbox}{\cam@logo} \hspace*{\cam@logorightnudge} %\includegraphics[width=73mm]{CollegeShields/CUni} \end{lrbox} %% THESIS TITLE box \newsavebox{\cam@title} \begin{lrbox}{\cam@title} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} \Huge% \ifcam@times\else% \bfseries% \fi% {\@title{}}% \ifcam@firstyr\\% {\vspace{5mm}\emph{\LARGE PhD Proposal}}% \fi% \ifcam@secondyr\\% {\vspace{5mm}\emph{\LARGE Dissertation Schedule}}% \fi \end{center} \end{minipage} \end{lrbox} %% COLLEGESHIELD box (optional): \ifcam@collegeshield% \newsavebox{\cam@collegeshieldbox} \begin{lrbox}{\cam@collegeshieldbox} \includegraphics[height=20mm]{\@collegeshield} \end{lrbox} \fi %% AUTHOR&COLLEGE box \newsavebox{\cam@authorcollege} \begin{lrbox}{\cam@authorcollege} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} {\large \@author{}~\\[1ex]} \ifcam@collegeshield% \vspace{2mm}{\usebox{\cam@collegeshieldbox}}\\ \fi \@college{} \end{center} \end{minipage} \end{lrbox} %% SUBMISSION NOTICE box \newsavebox{\cam@submitnotice} \begin{lrbox}{\cam@submitnotice} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} \@submissionnotice{} \end{center} \end{minipage} \end{lrbox} % Now calculate the exact free vertical space \newlength{\cam@titlepagevspace} \setlength{\cam@titlepagevspace}{\textheight% -\totalheightof{\usebox{\cam@logo}}% -\totalheightof{\usebox{\cam@submitnotice}}% -\totalheightof{\usebox{\cam@authorcollege}}% -\totalheightof{\usebox{\cam@title}}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Title Page: Put the components (logo, title, author, college and submit %% notice) onto the title page. %% %%%%% \begin{center} ~\vspace{.02\cam@titlepagevspace}\\ {\usebox{\cam@logo}}\\ \vspace{.28\cam@titlepagevspace} {\usebox{\cam@title}}\\ \vspace{.23\cam@titlepagevspace} {\usebox{\cam@authorcollege}}\\ \null\vfill {\usebox{\cam@submitnotice}} \end{center} \hypersetup{pdfsubject={\@subjectline},pdfkeywords={\@keywords}} \fi % Epigraph on odd page. (optional) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Declaration %% %%%%% \ifcam@techreport % Technical report doesn't need the declaration % (see http://www.cl.cam.ac.uk/techreports/submission.html). \else {\ifcam@firstyr % First and second yr report don't need the declaration \else \ifcam@secondyr % \else \chapter*{Declaration} \thispagestyle{empty} This dissertation is the result of my own work and includes nothing which is the outcome of work done in collaboration except as declared in the Preface and specified in the text. It is not substantially the same as any that I have submitted, or am concurrently submitting, for a degree or diploma or other qualification at the University of Cambridge or any other University or similar institution except as declared in the Preface and specified in the text. I further state that no substantial part of my dissertation has already been submitted, or is being concurrently submitted, for any such degree, diploma or other qualification at the University of Cambridge or any other University or similar institution except as declared in the Preface and specified in the text. This dissertation does not exceed the prescribed limit of 60\,000 words. 
% Leaving some space for the signature: \vspace{15mm} \begin{flushright} \@author{}\\ \@date{}\\ \end{flushright} \vfill \fi \fi} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Abstract %% %%%%% \ifcam@techreport% \setcounter{page}{3} \fi \ifcam@firstyr % First yr report doesn't need a standalone abstract \else \chapter*{Abstract} \thispagestyle{empty} % Cambridge thesis submission guidelines require the title and author be in the abstract. % For more info see https://www.cambridgestudents.cam.ac.uk/your-course/examinations/graduate-exam-information/after-examination/degree-approval-and-1 % tex-fmt: skip \textbf{\large \@title} \par\vspace{0.3cm} \noindent\textit{\@author} \par\vspace{0.6cm} \@abstract{} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Acknowledgements %% %%%%% \ifcam@firstyr % First and second yr report don't need the acknowledgements \else {\ifcam@secondyr % \else \chapter*{Acknowledgements} \thispagestyle{empty} \@acknowledgements{} \fi} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Table of contents, figures, symbols and glossary. %% %%%%% % The following command prevents the page number to be displayed on the first % page of the TOC. \addtocontents{toc}{\protect\thispagestyle{empty}} \pagestyle{empty} \tableofcontents{} \ifcam@glossary% \cam@printthesisglossary \fi \cleardoublepage \pagestyle{plain} } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Backrefs %% %%%%% \ifcam@backrefs \RequirePackage[hyperpageref]{backref} \renewcommand*{\backref}[1]{} \renewcommand*{\backrefalt}[4]{% \ifcase #1 % \or {\footnotesize Cited on page #2.}% \else {\footnotesize Cited on pages #2.}% \fi } \fi %%%%% EOF: cam-thesis.cls tex-fmt-0.5.2/tests/source/comments.tex000066400000000000000000000010061473573253500201260ustar00rootroot00000000000000\documentclass{article} \begin{document} % Comments should be indented along with other text (these parentheses make the middle line here % and this comment aligns with the text indented as usual) % Comments do not directly affect indenting, % so they can contain arbitrary brackets (((( % which may not match. % Similarly they might contain \begin{align} unmatched % environment tags. 
This is a percent sign \% and not a comment Some lines might have both \% percents % and comments \end{align} \end{document} tex-fmt-0.5.2/tests/source/cv.tex000066400000000000000000000142011473573253500167120ustar00rootroot00000000000000% !TeX program = lualatex \documentclass{wgu-cv} \yourname{William G Underwood} \youraddress{ ORFE Department, Sherrerd Hall, Charlton Street, Princeton, NJ 08544, USA } \youremail{wgu2@princeton.edu} \yourwebsite{wgunderwood.github.io} \begin{document} \maketitle \section{Employment} \subsection{Postdoctoral Research Associate in Statistics} {Jul 2024 -- Jul 2026} \subsubsection{University of Cambridge} \begin{itemize} \item Advisor: Richard Samworth, Department of Pure Mathematics and Mathematical Statistics \item Funding: European Research Council Advanced Grant 101019498 \end{itemize} \subsection{Assistant in Instruction} {Sep 2020 -- May 2024} \subsubsection{Princeton University} \begin{itemize} \item ORF 499: Senior Thesis, Spring 2024 \item ORF 498: Senior Independent Research Foundations, Fall 2023 \item SML 201: Introduction to Data Science, Fall 2023 \item ORF 363: Computing and Optimization, Spring 2023, Fall 2020 \item ORF 524: Statistical Theory and Methods, Fall 2022, Fall 2021 \item ORF 526: Probability Theory, Fall 2022 \item ORF 245: Fundamentals of Statistics, Spring 2021 \end{itemize} \section{Education} \subsection{PhD in Operations Research \& Financial Engineering} {Sep 2019 -- May 2024} \subsubsection{Princeton University} \begin{itemize} \item Dissertation: Estimation and Inference in Modern Nonparametric Statistics \item Advisor: Matias Cattaneo, Department of Operations Research \& Financial Engineering \end{itemize} \subsection{MA in Operations Research \& Financial Engineering} {Sep 2019 -- Sep 2021} \subsubsection{Princeton University} \subsection{MMath in Mathematics \& Statistics} {Oct 2015 -- Jun 2019} \subsubsection{University of Oxford} \begin{itemize} \item Dissertation: Motif-Based Spectral Clustering of Weighted Directed Networks \item Supervisor: Mihai Cucuringu, Department of Statistics \end{itemize} \section{Research \& publications} \subsection{Articles}{} \begin{itemize} \item Uniform inference for kernel density estimators with dyadic data, with M D Cattaneo and Y Feng. \emph{Journal of the American Statistical Association}, forthcoming, 2024. \arxiv{2201.05967}. \item Motif-based spectral clustering of weighted directed networks, with A Elliott and M Cucuringu. \emph{Applied Network Science}, 5(62), 2020. \arxiv{2004.01293}. \item Simple Poisson PCA: an algorithm for (sparse) feature extraction with simultaneous dimension determination, with L Smallman and A Artemiou. \emph{Computational Statistics}, 35:559--577, 2019. \end{itemize} \subsection{Preprints}{} \begin{itemize} \item Inference with Mondrian random forests, with M D Cattaneo and J M Klusowski, 2023. \\ \arxiv{2310.09702}. \item Yurinskii's coupling for martingales, with M D Cattaneo and R P Masini. \emph{Annals of Statistics}, reject and resubmit, 2023. \arxiv{2210.00362}. \end{itemize} \pagebreak \subsection{Works in progress}{} \begin{itemize} \item Higher-order extensions to the Lindeberg method, with M D Cattaneo and R P Masini. \item Adaptive Mondrian random forests, with M D Cattaneo, R Chandak and J M Klusowski. 
\end{itemize} \subsection{Presentations}{} \begin{itemize} \item Statistics Seminar, University of Pittsburgh, February 2024 \item Statistics Seminar, University of Illinois, January 2024 \item Statistics Seminar, University of Michigan, January 2024 \item PhD Poster Session, Two Sigma Investments, July 2023 \item Research Symposium, Two Sigma Investments, June 2022 \item Statistics Laboratory, Princeton University, September 2021 \end{itemize} \subsection{Software}{} \begin{itemize} \item MondrianForests: Mondrian random forests in Julia, 2023. \\ \github{wgunderwood/MondrianForests.jl} \item DyadicKDE: dyadic kernel density estimation in Julia, 2022. \\ \github{wgunderwood/DyadicKDE.jl} \item motifcluster: motif-based spectral clustering in R, Python and Julia, 2020. \\ \github{wgunderwood/motifcluster} \end{itemize} \section{Awards \& funding} \vspace{-0.22cm} \begin{itemize} \item School of Engineering and Applied Science Award for Excellence, Princeton University \hfill 2022% \item Francis Robbins Upton Fellowship in Engineering, Princeton University \hfill 2019% \item Royal Statistical Society Prize, Royal Statistical Society \& University of Oxford \hfill 2019% \item Gibbs Statistics Prize, University of Oxford \hfill 2019% \item James Fund for Mathematics Research Grant, St John's College, University of Oxford \hfill 2017% \item Casberd Scholarship, St John's College, University of Oxford \hfill 2016% \end{itemize} \section{Professional experience} \subsection{Quantitative Research Intern} {Jun 2023 -- Aug 2023} \subsubsection{Two Sigma Investments} \vspace{-0.20cm} \subsection{Machine Learning Consultant} {Oct 2018 -- Nov 2018} \subsubsection{Mercury Digital Assets} \vspace{-0.18cm} \subsection{Educational Consultant} {Feb 2018 -- Sep 2018} \subsubsection{Polaris \& Dawn} \vspace{-0.20cm} \subsection{Premium Tutor} {Feb 2016 -- Oct 2018} \subsubsection{MyTutor} \vspace{-0.20cm} \subsection{Statistics \& Machine Learning Researcher} {Aug 2017 -- Sep 2017} \subsubsection{Cardiff University} \vspace{-0.20cm} \subsection{Data Science Intern} {Jun 2017 -- Aug 2017} \subsubsection{Rolls-Royce} \vspace{-0.20cm} \subsection{Peer review}{} \emph{Econometric Theory, Journal of the American Statistical Association, Journal of Business \& Economic Statistics, Journal of Causal Inference, Journal of Econometrics, Operations Research.} \section{References} \vspace{-0.22cm} \begin{itemize} \item Matias Cattaneo, Professor, ORFE, Princeton University \item Jason Klusowski, Assistant Professor, ORFE, Princeton University \item Jianqing Fan, Professor, ORFE, Princeton University \item Ricardo Masini, Assistant Professor, Statistics, University of California, Davis \end{itemize} \end{document} tex-fmt-0.5.2/tests/source/document.tex000066400000000000000000000001461473573253500201230ustar00rootroot00000000000000\documentclass{article} \begin{document} Documents should not be globally indented. 
\end{document} tex-fmt-0.5.2/tests/source/environment_lines.tex000066400000000000000000000013501473573253500220410ustar00rootroot00000000000000\documentclass{article} \begin{document} \newenvironment{env1}{}{} \newenvironment{env2}{}{} \newenvironment{env3}{}{} \newenvironment{env4}{}{} % environments on separate lines \begin{env1} \begin{env2} \end{env2} \end{env1} % environments on shared lines \begin{env1}\begin{env2} \end{env2}\end{env1} % environments on shared lines with spaces \begin{env1} \begin{env2} \end{env2} \end{env1} % environments all on same line \begin{env1}\begin{env2}\end{env2}\end{env1} % with a comment \begin{env1} % environments with extra brackets \begin{env1}(a)(b \begin{env2}[c{d}e] \end{env2}[f]g)\end{env1} % environments and a long line \begin{env1}\begin{env2}\begin{env3}\begin{env4}\end{env4}\end{env3}\end{env2}\end{env1} \end{document} tex-fmt-0.5.2/tests/source/heavy_wrap.tex000066400000000000000000000012641473573253500204540ustar00rootroot00000000000000\documentclass{article} \usepackage{amsmath} \usepackage{amsthm} \newtheorem{definition}{Definition} \begin{document} \begin{definition} \begin{definition} \begin{definition} Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. \end{definition} \end{definition} \end{definition} \end{document} tex-fmt-0.5.2/tests/source/higher_categories_thesis.bib000066400000000000000000000542321473573253500233000ustar00rootroot00000000000000@software{alex_rice_2024_10964565, author = {Alex Rice}, title = {Agda formalisation of Catt}, month = apr, year = 2024, publisher = {Zenodo}, version = {thesis}, doi = {10.5281/zenodo.10964565}, url = {https://github.com/alexarice/catt-agda/tree/thesis} } @software{alex_rice_2024_10964705, author = {Alex Rice}, title = {Semistrict Catt implementation}, month = apr, year = 2024, publisher = {Zenodo}, version = {thesis}, doi = {10.5281/zenodo.10966141}, url = {https://github.com/alexarice/catt-strict/tree/thesis} } @software{sd-visualiser, author = {Hu, Nick and Rice, Alex and Tataru, Calin}, title = {\textsf{sd-visualiser}}, year = 2024, url = {https://github.com/sd-visualiser/sd-visualiser} } @unpublished{andrastalk, title= {Efficient Evaluation with Controlled Definition Unfolding}, author = {András Kovács}, year = {2024}, note= {Workshop on the Implementation of Type Systems}, URL= {https://popl24.sigplan.org/details/wits-2024-papers/8/Efficient-Evaluation-with-Controlled-Definition-Unfolding}, % tex-fmt: skip } @inbook{selinger2011survey, title = {A Survey of Graphical Languages for Monoidal Categories}, DOI = {10.1007/978-3-642-12821-9_4}, booktitle = {New Structures for Physics}, publisher = {Springer Berlin Heidelberg}, author = {Selinger, Peter}, year = {2011}, pages = {289-–355}, isbn="978-3-642-12821-9", doi="10.1007/978-3-642-12821-9_4" } @article{forest2022unifying, title={Unifying notions of pasting diagrams}, author={Forest, Simon}, journal={Higher Structures}, volume={6}, number={1}, pages={1--79}, year={2022}, doi={10.21136/HS.2022.01} } @unpublished{makkai2005word, title={The word problem for computads}, author={Makkai, Michael}, 
note={\url{https://www.math.mcgill.ca/makkai/WordProblem/WordProblemCombined.pdf}}, % tex-fmt: skip year={2005} } @phdthesis{forest2021computational, title={Computational descriptions of higher categories}, author={Forest, Simon}, year={2021}, school={Institut Polytechnique de Paris} } @unpublished{douglas2016internal, title={Internal bicategories}, author={Christopher L. Douglas and André G. Henriques}, year={2016}, eprint={1206.4284}, archivePrefix={arXiv}, primaryClass={math.CT} } @book{leinster2004higher, title={Higher operads, higher categories}, author={Leinster, Tom}, volume={298}, year={2004}, publisher={Cambridge University Press} } @unpublished{simpson1998homotopy, title={Homotopy types of strict 3-groupoids}, author={Carlos Simpson}, year={1998}, eprint={math/9810059}, archivePrefix={arXiv}, primaryClass={math.CT} } @incollection {joyal2006weak, AUTHOR = {Joyal, Andr\'{e} and Kock, Joachim}, TITLE = {Weak units and homotopy 3-types}, BOOKTITLE = {Categories in algebra, geometry and mathematical physics}, SERIES = {Contemp. Math.}, VOLUME = {431}, PAGES = {257--276}, PUBLISHER = {Amer. Math. Soc., Providence, RI}, YEAR = {2007}, ISBN = {978-0-8218-3970-6}, DOI = {10.1090/conm/431/08277}, URL = {https://doi.org/10.1090/conm/431/08277}, } @inproceedings{10.1145/237721.237728, author = {Jim, Trevor}, title = {What are principal typings and what are they good for?}, year = {1996}, isbn = {0897917693}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://doi.org/10.1145/237721.237728}, doi = {10.1145/237721.237728}, abstract = {We demonstrate the pragmatic value of the principal typing property, a property distinct from ML's principal type property, by studying a type system with principal typings. The type system is based on rank 2 intersection types and is closely related to ML. Its principal typing property provides elegant support for separate compilation, including "smartest recompilation" and incremental type inference. Moreover, it motivates a new rule for typing recursive definitions that can type some interesting examples of polymorphic recursion.}, booktitle = {Proceedings of the 23rd ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages}, pages = {42–53}, numpages = {12}, location = {St. Petersburg Beach, Florida, USA}, series = {POPL '96} } @article{10.1145/3450952, author = {Dunfield, Jana and Krishnaswami, Neel}, title = {Bidirectional Typing}, year = {2021}, issue_date = {June 2022}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {54}, number = {5}, issn = {0360-0300}, url = {https://doi.org/10.1145/3450952}, doi = {10.1145/3450952}, abstract = {Bidirectional typing combines two modes of typing: type checking, which checks that a program satisfies a known type, and type synthesis, which determines a type from the program. Using checking enables bidirectional typing to support features for which inference is undecidable; using synthesis enables bidirectional typing to avoid the large annotation burden of explicitly typed languages. In addition, bidirectional typing improves error locality. We highlight the design principles that underlie bidirectional type systems, survey the development of bidirectional typing from the prehistoric period before Pierce and Turner’s local type inference to the present day, and provide guidance for future investigations.}, journal = {ACM Comput. 
Surv.}, month = {5}, articleno = {98}, numpages = {38}, keywords = {Type checking, type inference} } @article{abel2013normalization, title={Normalization by evaluation: Dependent types and impredicativity}, author={Abel, Andreas}, journal={Habilitation. Ludwig-Maximilians-Universit{\"a}t M{\"u}nchen}, year={2013} } @article{gratzer2019implementing, title={Implementing a modal dependent type theory}, author={Gratzer, Daniel and Sterling, Jonathan and Birkedal, Lars}, journal={Proceedings of the ACM on Programming Languages}, volume={3}, number={ICFP}, pages={1--29}, year={2019}, publisher={ACM New York, NY, USA}, doi = {10.1145/3341711} } @unpublished{hadzihasanovic2019representable, title={Representable diagrammatic sets as a model of weak higher categories}, author={Amar Hadzihasanovic}, year={2019}, eprint={1909.07639}, archivePrefix={arXiv}, primaryClass={math.CT} } @inproceedings{reutter2019high, title={High-level methods for homotopy construction in associative n-categories}, author={Reutter, David and Vicary, Jamie}, booktitle={Proceedings of the 34th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--13}, year={2019}, doi={10.1109/LICS.2019.8785895} } @unpublished{corbyn2024homotopy, title={\textsf{homotopy.io}: a proof assistant for finitely-presented globular $n$-categories}, author={Nathan Corbyn and Lukas Heidemann and Nick Hu and Chiara Sarti and Calin Tataru and Jamie Vicary}, year={2024}, eprint={2402.13179}, archivePrefix={arXiv}, primaryClass={cs.LO} } @unpublished{tataru2024theory, title={The theory and applications of anticolimits}, author={Calin Tataru and Jamie Vicary}, year={2024}, eprint={2401.17076}, archivePrefix={arXiv}, primaryClass={math.CT} } @incollection{MARTINLOF197573, title = {An Intuitionistic Theory of Types: Predicative Part}, editor = {H.E. Rose and J.C. Shepherdson}, series = {Studies in Logic and the Foundations of Mathematics}, publisher = {Elsevier}, volume = {80}, pages = {73-118}, year = {1975}, booktitle = {Logic Colloquium '73}, issn = {0049-237X}, doi = {https://doi.org/10.1016/S0049-237X(08)71945-1}, url = {https://www.sciencedirect.com/science/article/pii/S0049237X08719451}, author = {Per Martin-Löf}, abstract = {Publisher Summary The theory of types is intended to be a full-scale system for formalizing intuitionistic mathematics as developed. The language of the theory is richer than the languages of traditional intuitionistic systems in permitting proofs to appear as parts of propositions so that the propositions of the theory can express properties of proofs. There are axioms for universes that link the generation of objects and types and play somewhat the same role for the present theory as does the replacement axiom for Zermelo–Fraenkel set theory. The present theory is based on a strongly impredicative axiom that there is a type of all types in symbols. This axiom has to be abandoned, however, after it has been shown to lead to a contraction. This chapter discusses Normalization theorem, which can be strengthened in two ways: it can be made to cover open terms and it can be proved that every reduction sequence starting from an arbitrary term leads to a unique normal term after a finite number of steps. 
The definition of the notion of convertibility and the proof that an arbitrary term is convertible can no longer be separated because the type symbols and the terms are generated simultaneously.} } @article{lumsdaine2010weak, title = {Weak omega-categories from intensional type theory}, volume = {Volume 6, Issue 3}, ISSN = {1860-5974}, url = {http://dx.doi.org/10.2168/LMCS-6(3:24)2010}, DOI = {10.2168/lmcs-6(3:24)2010}, journal = {Logical Methods in Computer Science}, publisher = {Centre pour la Communication Scientifique Directe (CCSD)}, author = {Lumsdaine, Peter LeFanu}, year = {2010}, month = sep } @article{garner2011types, title={Types are weak omega-groupoids}, author={Garner, Richard and van den Berg, Benno}, journal={Proceedings of the London Mathematical Society}, volume={102}, number={2}, pages={370--394}, year={2010}, publisher={London Mathematical Society}, DOI = {10.1112/plms/pdq026} } @unpublished{dorn2021framed, title={Framed combinatorial topology}, author={Christoph Dorn and Christopher L. Douglas}, year={2021}, eprint={2112.14700}, archivePrefix={arXiv}, primaryClass={math.GT} } @unpublished{heidemann2023framed, title={Framed Combinatorial Topology with Labels in $\infty$-Categories}, author={Lukas Heidemann}, year={2023}, eprint={2305.06288}, archivePrefix={arXiv}, primaryClass={math.AT} } @article{eckmann1962group, title={Group-like structures in general categories I multiplications and comultiplications}, author={Eckmann, Beno and Hilton, Peter J}, journal={Mathematische Annalen}, volume={145}, number={3}, pages={227--255}, year={1962} } @phdthesis{brunerie2016homotopy, title={On the homotopy groups of spheres in homotopy type theory}, author={Brunerie, Guillaume}, year={2016}, school={Universit{\'e} Nice Sophia Antipolis} } @unpublished{shulman2019all, title={All $(\infty,1)$-toposes have strict univalent universes}, author={Michael Shulman}, year={2019}, eprint={1904.07004}, archivePrefix={arXiv}, primaryClass={math.AT} } @Book{hottbook, author = {The {Univalent Foundations Program}}, title = {Homotopy Type Theory: Univalent Foundations of Mathematics}, publisher = {\url{https://homotopytypetheory.org/book}}, address = {Institute for Advanced Study}, year = 2013} @incollection {hofmannstreicher, AUTHOR = {Hofmann, Martin and Streicher, Thomas}, TITLE = {The groupoid interpretation of type theory}, BOOKTITLE = {Twenty-five years of constructive type theory}, VOLUME = {36}, PAGES = {83--111}, PUBLISHER = {Oxford University Press}, YEAR = {1998}, DOI = {10.1093/oso/9780198501275.003.0008}, } @inproceedings{heidemann2022zigzag, title={Zigzag normalisation for associative n-categories}, author={Heidemann, Lukas and Reutter, David and Vicary, Jamie}, booktitle={Proceedings of the 37th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--13}, year={2022}, doi = {10.1145/3531130.3533352} } @article{Batanin2013, author = {Michael Batanin and Denis-Charles Cisinski and Mark Weber}, title = {Multitensor lifting and strictly unital higher category theory}, year = {2013}, journal = {Theory and Applications of Categories}, volume = 28, pages = {804--856} } @phdthesis{dorn2018associative, title={Associative n-categories}, author={Dorn, C}, year={2018}, school={University of Oxford} } @article {joyalcoherence, AUTHOR = {Joyal, Andr\'{e} and Kock, Joachim}, TITLE = {Coherence for weak units}, JOURNAL = {Documenta Mathematica}, VOLUME = {18}, YEAR = {2013}, PAGES = {71--110}, ISSN = {1431-0635,1431-0643}, } @incollection {cheng2007periodic, AUTHOR = {Cheng, 
Eugenia and Gurski, Nick}, TITLE = {The periodic table of {$n$}-categories for low dimensions {I}. {D}egenerate categories and degenerate bicategories}, BOOKTITLE = {Categories in algebra, geometry and mathematical physics}, SERIES = {Contemp. Math.}, VOLUME = {431}, PAGES = {143--164}, PUBLISHER = {Amer. Math. Soc., Providence, RI}, YEAR = {2007}, ISBN = {978-0-8218-3970-6}, DOI = {10.1090/conm/431/08270}, URL = {https://doi.org/10.1090/conm/431/08270} } @unpublished{cheng2007periodic2, title={The periodic table of $n$-categories for low dimensions II: degenerate tricategories}, author={Eugenia Cheng and Nick Gurski}, year={2007}, eprint={0706.2307}, archivePrefix={arXiv}, primaryClass={math.CT} } @article{Baez1995, title = {Higher-dimensional algebra and topological quantum field theory}, volume = {36}, ISSN = {1089-7658}, url = {http://dx.doi.org/10.1063/1.531236}, DOI = {10.1063/1.531236}, number = {11}, journal = {Journal of Mathematical Physics}, publisher = {AIP Publishing}, author = {Baez, John C. and Dolan, James}, year = {1995}, month = nov, pages = {6073–6105} } @BOOK{Heunen2019-jt, title = "Categories for quantum theory", author = "Heunen, Chris and Vicary, Jamie", publisher = "Oxford University Press", series = "Oxford Graduate Texts in Mathematics", month = nov, year = 2019, address = "London, England", doi = {10.1093/oso/9780198739623.001.0001} } @article{Barr1991, title = {*-Autonomous categories and linear logic}, volume = {1}, ISSN = {1469-8072}, url = {http://dx.doi.org/10.1017/S0960129500001274}, DOI = {10.1017/s0960129500001274}, number = {2}, journal = {Mathematical Structures in Computer Science}, publisher = {Cambridge University Press (CUP)}, author = {Barr, Michael}, year = {1991}, month = jul, pages = {159–178} } @book{riehl2022elements, title={Elements of \(\infty\)-Category Theory}, author={Riehl, Emily and Verity, Dominic}, volume={194}, year={2022}, publisher={Cambridge University Press}, DOI = {10.1017/9781108936880} } @article{Street2012, title = {Monoidal categories in, and linking, geometry and algebra}, volume = {19}, ISSN = {1370-1444}, url = {http://dx.doi.org/10.36045/bbms/1354031551}, DOI = {10.36045/bbms/1354031551}, number = {5}, journal = {Bulletin of the Belgian Mathematical Society - Simon Stevin}, publisher = {The Belgian Mathematical Society}, author = {Street, Ross}, year = {2012}, month = dec } @article{mellies2009categorical, title={Categorical semantics of linear logic}, author={Mellies, Paul-Andr{\'e}}, journal={Panoramas et syntheses}, volume={27}, pages={15--215}, year={2009} } @inproceedings{ghani2018compositional, title={Compositional game theory}, author={Ghani, Neil and Hedges, Jules and Winschel, Viktor and Zahn, Philipp}, booktitle={Proceedings of the 33rd annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={472--481}, year={2018}, doi = {10.1145/3209108.3209165} } @book{Bourbaki2016, title = {Topologie algébrique}, ISBN = {9783662493618}, url = {http://dx.doi.org/10.1007/978-3-662-49361-8}, DOI = {10.1007/978-3-662-49361-8}, publisher = {Springer Berlin Heidelberg}, author = {Bourbaki, N.}, year = {2016} } @article{Weber2004, title = {Generic Morphisms, Parametric Representations and Weakly Cartesian Monads.}, author = {Weber, Mark}, date = {2004}, journaltitle = {Theory and Applications of Categories}, volume = {13}, pages = {191--234}, publisher = {{Mount Allison University, Department of Mathematics and Computer Science, Sackville}}, url = {http://eudml.org/doc/124614}, langid = {english}, keywords = 
{braiding,centre,descent,endofunctor,generic morphism,higher category theory,monad,operand,parametric representation,pseudofunctor} } @article{lipparini16, title={An infinite natural sum}, author={Lipparini, Paolo}, journal={Mathematical Logic Quarterly}, DOI = {10.1002/malq.201500017}, volume={62}, number={3}, pages={249--257}, year={2016}, publisher={Wiley Online Library} } @article{newman1942theories, title={On theories with a combinatorial definition of equivalence}, author={Newman, Maxwell and Herman, Alexander}, journal={Annals of mathematics}, pages={223--243}, year={1942}, publisher={JSTOR} } @unpublished{maltsiniotis2010grothendieck, title={Grothendieck $\infty$-groupoids, and still another definition of $\infty$-categories}, author={Georges Maltsiniotis}, year={2010}, eprint={1009.2331}, archivePrefix={arXiv}, primaryClass={math.CT} } @unpublished{leinster2001survey, title={A Survey of Definitions of n-Category}, author={Tom Leinster}, year={2001}, eprint={math/0107188}, archivePrefix={arXiv}, primaryClass={math.CT} } @inproceedings{finster2017type, title={A type-theoretical definition of weak $\omega$-categories}, author={Finster, Eric and Mimram, Samuel}, booktitle={Proceedings of the 32nd Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2017}, doi={10.1109/LICS.2017.8005124} } @phdthesis{Ara, author={Dimitri Ara}, title={Sur les $\infty$-groupoides de {G}rothendieck et une variante $\infty$-cat\'egorique}, school={Universit\'e Paris Diderot}, year={2010} } @unpublished{PursuingStacks, author={Alexander Grothendieck}, year=1983, title={Pursuing stacks} } @phdthesis{gurski2006algebraic, title={An algebraic theory of tricategories}, author={Gurski, Michael Nicholas}, year={2006}, school={University of Chicago, Department of Mathematics} } @inproceedings{bar2017data, title={Data structures for quasistrict higher categories}, author={Bar, Krzysztof and Vicary, Jamie}, booktitle={Proceedings of the 32nd Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2017}, doi={10.1109/LICS.2017.8005147} } @book{gordon1995coherence, title={Coherence for tricategories}, author={Gordon, Robert and Power, Anthony John and Street, Ross}, volume={558}, year={1995}, publisher={American Mathematical Soc.} } @inproceedings{finster2022type, title={A type theory for strictly unital ∞-categories}, author={Finster, Eric and Reutter, David and Vicary, Jamie and Rice, Alex}, booktitle={Proceedings of the 37th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2022}, doi = {10.1145/3531130.3533363} } @inproceedings{finster2023strictly, title={A Syntax for Strictly Associative and Unital ∞-categories}, author={Finster, Eric and Rice, Alex and Vicary, Jamie}, booktitle={Proceedings of the 39th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pubstate={forthcoming}, year={2024} } @InProceedings{cwf, author="Dybjer, Peter", editor="Berardi, Stefano and Coppo, Mario", title="Internal type theory", booktitle="Types for Proofs and Programs", year="1996", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="120--134", abstract="We introduce categories with families as a new notion of model for a basic framework of dependent types. This notion is close to ordinary syntax and yet has a clean categorical description. We also present categories with families as a generalized algebraic theory. 
Then we define categories with families formally in Martin-L{\"o}f's intensional intuitionistic type theory. Finally, we discuss the coherence problem for these internal categories with families.", isbn="978-3-540-70722-6" } @article{batanin1998computads, title={Computads for finitary monads on globular sets}, author={Batanin, Michael A}, journal={Contemporary Mathematics}, volume={230}, pages={37--58}, year={1998}, issn = {0271-4132}, publisher={American Mathematical Society} } @article{street1976limits, title={Limits indexed by category-valued 2-functors}, author={Street, Ross}, journal={Journal of Pure and Applied Algebra}, volume={8}, number={2}, pages={149--181}, year={1976}, publisher={Elsevier}, doi={10.1016/0022-4049(76)90013-X} } @article{burroni1993higher, title={Higher-dimensional word problems with applications to equational logic}, author={Burroni, Albert}, journal={Theoretical computer science}, volume={115}, number={1}, pages={43--62}, year={1993}, publisher={Elsevier} } @unpublished{dean2022computads, title={Computads for weak $\omega$-categories as an inductive type}, author={Christopher J. Dean and Eric Finster and Ioannis Markakis and David Reutter and Jamie Vicary}, year={2024}, eprint={2208.08719}, archivePrefix={arXiv}, primaryClass={math.CT} } @unpublished{benjamin2021globular, title={Globular weak $\omega$-categories as models of a type theory}, author={Thibaut Benjamin and Eric Finster and Samuel Mimram}, year={2024}, eprint={2106.04475}, archivePrefix={arXiv}, primaryClass={cs.LO} } @phdthesis{benjamin2020type, title={A type theoretic approach to weak w-categories and related higher structures}, author={Benjamin, Thibaut}, year={2020}, school={Institut polytechnique de Paris} } @unpublished{benjamin2024duamity, title={Opposites of weak $\omega$-categories and the suspension and hom adjunction}, author={Thibaut Benjamin and Ioannis Markakis}, year={2024}, eprint={2402.01611}, archivePrefix={arXiv}, primaryClass={math.CT} } @article{batanin1998monoidal, title={Monoidal globular categories as a natural environment for the theory of weak n-categories}, author={Batanin, Michael A}, journal={Advances in Mathematics}, volume={136}, number={1}, pages={39--103}, year={1998}, publisher={Academic Press} } tex-fmt-0.5.2/tests/source/higher_categories_thesis.tex000066400000000000000000025544251473573253500233570ustar00rootroot00000000000000\documentclass{cam-thesis} \usepackage[english]{babel} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{csquotes} %\usepackage{microtype} \usepackage[ttscale=.75]{libertine} \usepackage{dsfont} \usepackage[parfill]{parskip} % Set nicer (= less bold, less vertical spacing) mathcal font \usepackage[cal=cm]{mathalpha} % % Set up the headers and footers % \usepackage{fancyhdr} % \usepackage{ifthen} % \pagestyle{fancy} % \fancyhf{} % % Use ifthenelse to work around the fact that we wish to have alternate headers % % but a onesided document % \fancyhead[R]{\ifthenelse{\isodd{\value{page}}}{% % \thepage\hfill\textsc{\nouppercase\leftmark}}{}} % \fancyhead[L]{\ifthenelse{\isodd{\value{page}}}{}{% % \textsc{\nouppercase\rightmark}\hfill\thepage}} % \fancyfoot{} % % Remove page numbers on the first page of a chapter % \fancypagestyle{plain}{% % \renewcommand{\headrulewidth}{0pt}% % \fancyhf{}% % } % See the excellent biblatex documentation for more information \usepackage[ backend=biber,% style=alphabetic,% block=ragged,% backref=false,% useprefix=true,% maxnames=8,% minnames=7,% minalphanames=3,% maxalphanames=4,% url=false, 
eprint=true, backrefstyle=two]% {biblatex} \renewcommand{\subtitlepunct}{\addcolon\addspace} % \DefineBibliographyStrings{english}{% % bibliography = {References}, } % Enumerations and tables \usepackage{calc} \usepackage[shortlabels]{enumitem} % \setlist{nosep} \setlist[description]{font={\textnormal},labelindent=\parindent} \usepackage{booktabs} \usepackage{longtable} \usepackage[width=.8\textwidth]{caption} \captionsetup[table]{skip=1em} % Math packages \usepackage{mathtools} \usepackage{savesym} \usepackage{amsmath} \savesymbol{openbox} \usepackage{amsthm} \usepackage{thmtools} \savesymbol{Bbbk} \usepackage{amssymb} \usepackage{stmaryrd} \usepackage{bm} % \usepackage{mathabx} % % tocbibind allows us to have the toc in the toc % \usepackage[notbib,notindex]{tocbibind} % % Supposedly it should also allow us to have the index and the bibliography in % % the toc, but it has some bugs (e.g. displaying the right page number in the % % toc, but getting the wrong link with hyperref), so we disable those options % % here and use corresponding separate options for the index, index of symbols % % (nomenclature) and bibliography instead. % % % % The whole is rather finicky and it is somehow crucial that tocbibind is loaded % % *before* imakeidx. % \usepackage{imakeidx} % \makeindex[intoc,columns=2] % \usepackage[refpage,intoc,noprefix]{nomencl} % % Set fixed width so that descriptions in the index of symbols are aligned. % \setlength{\nomlabelwidth}{5cm} % \renewcommand{\nomname}{Index of symbols} % % Make page numbers links % \renewcommand*{\pagedeclaration}[1]{\unskip, \hyperpage{#1}} % \makenomenclature% % Used in hyperref's setup, and must be loaded before tikz-cd. \usepackage[dvipsnames]{xcolor} \definecolor{Diag1}{RGB}{0,0,255} \definecolor{Diag2}{RGB}{255,0,0} \usepackage[most]{tcolorbox} \usepackage{tikz-cd} \usepackage[ colorlinks=true % Remove the boxes , linktocpage=true % Make page numbers (not section titles) links in ToC , linkcolor=NavyBlue % Colour for internal links , citecolor=Green % Colour for bibliographical citations , urlcolor=BrickRed % Colour for (external) urls ]{hyperref} \usepackage[noabbrev,capitalise]{cleveref} \newcommand{\creflastconjunction}{, and\nobreakspace} \creflabelformat{equation}{#2\textup{#1}#3} % Write Equation x.y.z instead of Equation (x.y.z) \Crefname{judgement}{Judgement}{Judgements} \Crefname{diagram}{Diagram}{Diagrams} \Crefname{rule}{Rule}{Rules} % Label tables just like equations, theorems, definitions, etc. % % NB: This can be confusing if LaTeX does not place the table at the point of % writing (e.g. for lack of space)! 
\numberwithin{equation}{section} % Colours are as in Andrej Bauer's notes on realizability: % https://github.com/andrejbauer/notes-on-realizability \colorlet{ShadeOfPurple}{blue!5!white} \colorlet{ShadeOfYellow}{yellow!5!white} \colorlet{ShadeOfGreen} {green!5!white} \colorlet{ShadeOfBrown} {brown!10!white} % Add a blue for principles \colorlet{ShadeOfBlue}{cyan!5!white} % But we also shade proofs \colorlet{ShadeOfGray} {gray!10!white} \declaretheorem[sibling=equation]{theorem} \declaretheorem[sibling=theorem]{lemma} \declaretheorem[sibling=theorem]{proposition} \declaretheorem[sibling=theorem]{corollary} \declaretheorem[sibling=theorem,style=definition]{definition} \declaretheorem[sibling=theorem,style=remark]{example} \declaretheorem[sibling=theorem,style=remark]{remark} \declaretheorem[style=definition,name=Guiding principle for groupoids,numbered=no]{principle-groupoid} \declaretheorem[style=definition,name=Guiding principle for categories,numbered=no]{principle-category} % Now we set the shading using the tcolorbox package. % % The related thmtools' option "shaded" and the package mdframed seem to have % issues: the former does not allow for page breaks in shaded environments and % the latter puts double spacing between two shaded environments. % % Since tcolorbox puts stuff inside a minipage or \parbox (according to this % stackexchange answer: https://tex.stackexchange.com/a/250170), new % paragraphs aren't indented. We can fix this by grabbing the parindent % value and passing it to tcbset. \newlength{\normalparindent} \AtBeginDocument{\setlength{\normalparindent}{\parindent}} \newlength{\normalparskip} \AtBeginDocument{\setlength{\normalparskip}{\parskip}} \tcbset{shadedenv/.style={ colback={#1}, frame hidden, enhanced, breakable, boxsep=0pt, left=2mm, right=2mm, % LaTeX thinks this is too wide (as becomes clear from the many "Overfull % \hbox" warnings, but optically it looks spot on. 
add to width=1.1mm, enlarge left by=-0.6mm, before upper={\setlength{\parindent}{\normalparindent}% \setlength{\parskip}{\normalparskip}} }} \newcommand{\setenvcolor}[2]{% \tcolorboxenvironment{#1}{shadedenv={#2}} \addtotheorempreheadhook[#1]{\tikzcdset{background color=#2}} } % \setenvcolor{theorem}{ShadeOfPurple} \setenvcolor{lemma}{ShadeOfPurple} \setenvcolor{proposition}{ShadeOfPurple} \setenvcolor{corollary}{ShadeOfPurple} \setenvcolor{definition}{ShadeOfYellow} \setenvcolor{example}{ShadeOfGreen} \setenvcolor{remark}{ShadeOfBrown} \setenvcolor{principle-groupoid}{ShadeOfBlue} \setenvcolor{principle-category}{ShadeOfBlue} \setenvcolor{proof}{ShadeOfGray} \declaretheorem[sibling=theorem,style=remark,numbered=no]{claim} \usepackage{xspace} \usepackage{quiver} \usetikzlibrary{nfold, backgrounds, decorations.pathmorphing, positioning} \tikzcdset{column sep/smaller/.initial=0em} \tikzcdset{arrow style = tikz, diagrams={>=stealth}} \tikzcdset{Rightarrow/.append style ={nfold}} \usepackage{adjustbox} \usepackage{cellspace} \usepackage{makecell} \setlength\cellspacetoplimit{5pt} \setlength\cellspacebottomlimit{5pt} \newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}} \usepackage{ebproof} \usepackage{mathpartir} \usepackage{subcaption} \usepackage{float} \usepackage{afterpage} \usepackage{listings} \lstdefinestyle{cattstyle}{ keywordstyle=\color{Diag1}, keywordstyle=[2]\color{Diag2}, basicstyle=\ttfamily, breaklines=true, keepspaces=true, belowskip=0pt, } \lstset{style=cattstyle} \lstdefinelanguage{Catt}{ keywords=[1]{def,normalise,assert,size,in}, keywords=[2]{coh,comp,id} } \usepackage{fontspec} \usepackage{fancyvrb} %\setmonofont[Scale=0.8]{Hack Nerd Font Mono} \hfuzz=1.5pt \def\su{\textsf{su}\xspace} \def\sua{\textsf{sua}\xspace} \def\sa{\textsf{sa}\xspace} \def\Catt{\textsc{Catt}\xspace} \def\Cattsua{\textsc{Catt}\textsubscript{\sua}\xspace} \def\Cattsu{\textsc{Catt}\textsubscript{\su}\xspace} \def\Cattsa{\textsc{Catt}\textsubscript{\sa}\xspace} \def\Cattr{\textsc{Catt}\textsubscript{\(\mathcal{R}\)}\xspace} \def\Group{\textsf{Group}\xspace} \def\Reg{\textsf{Reg}\xspace} \def\Std{\textsf{Std}\xspace} \def\dr{\textsf{dr}\xspace} \def\ecr{\textsf{ecr}\xspace} \def\prune{\textsf{prune}\xspace} \def\insert{\textsf{insert}\xspace} \newcommand\id{\ensuremath{\mathsf{id}}} \newcommand\proj{\ensuremath{\mathsf{proj}}} \newcommand*{\Coh}[3]{\ensuremath\mathsf{Coh}_{(#1\,;\,#2)}[#3]} \newcommand*{\SCoh}[3]{\ensuremath\mathsf{SCoh}_{(#1\,;\,#2)}[#3]} \newcommand*{\Ctx}{\ensuremath{\mathsf{Ctx}}} \newcommand*{\Tree}{\ensuremath{\mathsf{Tree}}} \newcommand*{\Sub}{\ensuremath{\mathsf{Sub}}} \newcommand*{\Type}{\ensuremath{\mathsf{Type}}} \newcommand*{\SType}{\ensuremath{\mathsf{SType}}} \newcommand*{\Term}{\ensuremath{\mathsf{Term}}} \newcommand*{\STerm}{\ensuremath{\mathsf{STerm}}} \newcommand*{\arr}[3]{{#1 \to_{#2} #3}} \newcommand*{\sub}[1]{\ensuremath{\llbracket #1 \rrbracket}} \newcommand*{\bound}[2]{\ensuremath{\partial_{#1}({#2})}} \newcommand*{\bdry}[3]{\ensuremath{\partial_{#1}^{#2}({#3})}} \newcommand*{\incbd}[3]{\ensuremath{\delta_{#1}^{#2}({#3})}} \newcommand*{\incbdpath}[3]{\ensuremath{\mathrm{I}_{#1}^{#2}({#3})}} \newcommand*{\stdcoh}[2]{\mathcal{C}_{#1}^{#2}} \newcommand*{\stdty}[2]{\mathcal{U}_{#1}^{#2}} \newcommand*{\stdtm}[2]{\mathcal{T}_{#1}^{#2}} \newcommand*{\stdlbl}[2]{\mathcal{L}_{#1}^{#2}} \newcommand*{\unrestrict}{\mathop\downarrow} \newcommand*{\unrestrictfull}{\mathop{\downarrow\downarrow}} \newcommand*{\restrict}{\mathop\uparrow} \newcommand*{\Dyck}{\mathsf{Dyck}} 
\newcommand*{\Peak}{\mathsf{Peak}} \newcommand*{\Path}{\mathsf{Path}} \newcommand*{\MaxPath}{\mathsf{MaxPath}} \newcommand*{\SPath}{\mathsf{SPath}} \newcommand*{\SOther}{\mathsf{SOther}} \newcommand*{\Inc}{\mathsf{Inc}} \newcommand*{\UDPeak}{\Updownarrow_{\mathsf{pk}}} \newcommand*{\UpPeak}{\Uparrow_{\mathsf{pk}}} \newcommand*{\DownPeak}{\Downarrow_{\mathsf{pk}}} \newcommand*{\eval}{\mathsf{eval}} \renewcommand*{\quote}{\mathsf{quote}} \newcommand*{\red}{\rightsquigarrow} \newcommand*{\redr}{\rightsquigarrow_{\mathcal{R}}} \newcommand*{\redrts}{\leftrightsquigarrow_{\mathcal{R}}} \DeclareMathOperator{\doubleplus}{+\kern-1ex+} \newcommand\emp{{[\kern3pt]}} \newcommand*{\insertion}[3]{\ensuremath{#1\mathop{\mathord{\ll}_{#2}}#3}} \newcommand*{\insertionprime}[3]{\ensuremath{#1\mathop{\mathord{\ll'}_{#2}}#3}} \renewcommand*{\th}{\ensuremath{\mathsf{th}}} \newcommand*{\bh}{\ensuremath{\mathsf{bh}}} \newcommand*{\lh}{\ensuremath{\mathsf{lh}}} \newcommand*{\+}{\mathbin{\#}} \DeclareMathOperator*{\bighash}{\text{\LARGE \(\+\)}} \renewcommand*{\sc}{\ensuremath{\mathsf{sc}}} \newcommand*{\U}{\mathbf{U}} \DeclareMathOperator{\FV}{FV} \DeclareMathOperator{\DC}{DC} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Supp}{Supp} \DeclareMathOperator{\replace}{replace} \DeclareMathOperator{\drop}{drop} \DeclareMathOperator{\ty}{Ty} \DeclareMathOperator{\tm}{Tm} \DeclareMathOperator{\wk}{wk} \DeclareMathOperator{\src}{src} \DeclareMathOperator{\tgt}{tgt} \DeclareMathOperator{\base}{base} \DeclareMathOperator{\N}{N} \DeclareMathOperator{\inc}{inc} \DeclareMathOperator{\fst}{fst} \DeclareMathOperator{\snd}{snd} \DeclareMathOperator{\dep}{dep} \DeclareMathOperator{\len}{len} \DeclareMathOperator{\ext}{ext} \makeatletter \providecommand{\leftsquigarrow}{% \mathrel{\mathpalette\reflect@squig\relax}% } \newcommand{\reflect@squig}[2]{% \reflectbox{$\m@th#1\rightsquigarrow$}% } \makeatother \newcommand{\olsi}[1]{\,\overline{\!{#1}}} % overline short italic \newcommand*{\module}[1]{% \href{https://alexarice.github.io/catt-agda/#1.html}{#1}} \newcommand*{\funcn}[3]{% \href{https://alexarice.github.io/catt-agda/#1.html\##2}{#3}} \newcommand*{\func}[2]{\funcn{#1}{#2}{#2}} \newlist{lemmaenum}{enumerate}{1} % should only occur inside lemma env. \setlist[lemmaenum]{label=(\roman*),ref=\thelemma(\roman*)} \crefalias{lemmaenumi}{lemma} \addbibresource{higher_categories_thesis.bib} \title{A type-theoretic approach to semistrict higher categories} %% The full name of the author (e.g.: James Smith): \author{Alex Rice} %% College affiliation: \college{Darwin College} %% College shield: %\collegeshield{CollegeShields/Darwin} %% Submission date [optional]: \submissiondate{18\textsuperscript{th} April 2024} %% Declaration date: \date{18\textsuperscript{th} April 2024} %% PDF meta-info: \subjectline{Computer Science} \keywords{category theory, higher category theory, type theory} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Abstract: %% \abstract{% Weak \(\infty\)-categories are known to be more expressive than their strict counterparts, but are more difficult to work with, as constructions in such a category involve the manipulation of explicit coherence data. This motivates the search for definitions of semistrict \(\infty\)-categories, where some, but not all, of the operations have been strictified. 
We introduce a general framework for adding definitional equality to the type
theory \Catt, a type theory introduced by \citeauthor{finster2017type} whose
models correspond to globular weak \(\infty\)-categories. Adding equality to
this theory causes the models to exhibit \emph{semistrict} behaviour,
trivialising some operations while leaving others weak. The framework consists
of a generalisation of \Catt extended with an equality relation generated by an
arbitrary set of equality rules \(\mathcal{R}\), which we name \Cattr. We study
this framework in detail, formalising much of its metatheory in the proof
assistant Agda, and studying how certain operations of \Catt behave in the
presence of definitional equality.
The main contribution of this thesis is to introduce two type theories,
\Cattsu and \Cattsua, which are instances of this general framework. \Cattsu,
short for \Catt with strict units, is a variant of \Catt where the unitor
isomorphisms trivialise to identities. It is primarily generated by a reduction
we call \emph{pruning}, which removes identities from composites, simplifying
their structure. \Cattsua, which stands for \Catt with strict units and
associators, trivialises both the associativity and unitality operations of
\Catt, and is generated by a generalisation of pruning called \emph{insertion}.
Insertion merges multiple composites into a single operation, flattening the
structure of terms in the theory.
Further, we provide reduction systems that generate the equality of \Cattsu and
\Cattsua respectively, and prove that these reduction systems are strongly
terminating and confluent. We therefore prove that the equality, and hence
typechecking, of both theories is decidable. This is used to give an
implementation of these type theories, which uses an approach inspired by
normalisation by evaluation to efficiently find normal forms for terms. We
further introduce a bidirectional typechecking algorithm used by the
implementation which allows for terms to be defined in a convenient syntax
where many arguments can be left implicit.
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Acknowledgements:
%%
\acknowledgements{%
I would firstly like to thank everyone that I have collaborated with over the
course of my PhD, both for their contributions to the work that appears in this
thesis and for their contributions to my development as a researcher. I would
especially like to thank my supervisor, Jamie Vicary, whose guidance throughout
was invaluable, for keeping my research on track despite the disruptions caused
by the pandemic during the first years of my PhD.
I would also like to thank all the friends who have been with me at any point
in this journey. I particularly want to show my appreciation (and apologise) to
everyone who was bombarded with technical questions throughout the writing up
of this text; I thoroughly enjoyed our discussions on correct typesetting and
use of the English language.
Lastly, I would like to thank my family for supporting me throughout my entire
education. I would not have made it to this point without them.
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Contents:
%%
\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Title page, abstract, declaration etc.:
%% - the title page (is automatically omitted in the technical report mode).
\frontmatter{}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Thesis body:
%%
\chapter*{Introduction}
\addcontentsline{toc}{chapter}{Introduction}
The study of higher-dimensional structures is becoming more prevalent in both
mathematics and computer science.
\emph{Higher categories}~\cite{leinster2004higher,riehl2022elements}, a broad
term for many different generalisations of categories which capture these
higher-dimensional ideas, are a central tool for studying these structures. The
``higher'' nature of these categories typically corresponds to the existence of
morphisms whose source and target may be other morphisms, instead of just
objects. A common method of organising this data is by giving a set of
\(n\)-cells for each \(n \in \mathbb{N}\). A \(0\)-cell then corresponds to the
objects of an ordinary category, and the source and target of an
\((n+1)\)-cell are given by \(n\)-cells.
These higher categories come in many forms, and have been organised into a
periodic table of categories~\cite{cheng2007periodic,cheng2007periodic2}. Of
particular interest are the \((n,k)\)-categories for
\(n,k \in \mathbb{N} \cup \{\infty\}\), higher categories which contain
\(m\)-cells for \(m \leq n\), and whose \(m\)-cells are invertible for
\(m < k\). In mathematics, the study of \((\infty,0)\)-categories, known as
\(\infty\)-groupoids, is motivated by the study of the homotopy structure of
topological spaces~\cite{Bourbaki2016}, where \(n\)-cells are given by paths in
the topological space, with higher cells taking the form of homotopies between
lower cells. In computer science, many applications have been found for
\((n,n)\)-categories for smaller \(n\), more commonly referred to as
\(n\)-categories, including quantum computing~\cite{Heunen2019-jt},
logic~\cite{Barr1991,mellies2009categorical}, physics~\cite{Baez1995}, and game
theory~\cite{ghani2018compositional}, among others~\cite{Street2012}.
The composition of \(1\)-cells in an \(n\)-category functions identically to
the composition of morphisms in a \(1\)-category; two morphisms
\(f : x \to y\) and \(g : y \to z\) can be composed to form a \(1\)-cell
\(f * g : x \to z\).
However, there are two distinct ways of composing \(2\)-cells, depicted by the diagrams below: % https://q.uiver.app/#q=WzAsNSxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbNSwwLCJcXGJ1bGxldCJdLFs2LDAsIlxcYnVsbGV0Il0sWzAsMSwiIiwwLHsiY3VydmUiOi01fV0sWzAsMSwiIiwyLHsiY3VydmUiOjV9XSxbMCwxXSxbMiwzLCIiLDEseyJjdXJ2ZSI6LTN9XSxbMiwzLCIiLDEseyJjdXJ2ZSI6M31dLFszLDQsIiIsMSx7ImN1cnZlIjotM31dLFszLDQsIiIsMSx7ImN1cnZlIjozfV0sWzcsNSwiXFxhbHBoYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNiw3LCJcXGJldGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzksOCwiXFxnYW1tYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTEsMTAsIlxcZGVsdGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-5, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=18pt}, from=1-5, to=1-6] \arrow[""{name=5, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-6, to=1-7] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=18pt}, from=1-6, to=1-7] \arrow["\beta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\alpha"', shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\gamma"', shorten <=5pt, shorten >=5pt, Rightarrow, from=4, to=3] \arrow["\delta"', shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \end{tikzcd} \] These diagrams mirror the concept of commutative diagrams for \(1\)-categories, where spaces in the commutative diagram representing an equality have been replaced by \(2\)-cell arrows. The first of these composites composes two \(2\)-cells \(\alpha\) and \(\beta\) along a shared \(1\)-cell boundary creating the vertical composite \(\alpha \star_1 \beta\). The second composes the \(2\)-cells \(\gamma\) and \(\delta\) along a \(0\)-cell boundary and creates the horizontal composite \(\gamma \star_0 \delta\). In higher dimensions, the pattern continues of having \(n\) distinct ways of composing two \(n\)-cells. For each \(n\)-cell, there is also an identity \((n+1)\)-cell. Similarly to \(1\)-categories, \(n\)-categories must satisfy various laws concerning their operations. These can be roughly organised into 3 groups: \begin{itemize} \item Associativity laws: Each of the composition operations in an \(n\)-category is associative. \item Unitality laws: The identity morphisms are a left and right unit for the appropriate composition operations. \item Interchange laws: These laws govern the relation between different compositions on the same cells. 
For any four \(2\)-cells that form the following diagram: % https://q.uiver.app/#q=WzAsNCxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzYsMF0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6NX1dLFswLDFdLFsxLDMsIiIsMix7ImN1cnZlIjotNX1dLFsxLDMsIiIsMix7ImN1cnZlIjo1fV0sWzEsM10sWzYsNCwiXFxiZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs1LDYsIlxcYWxwaGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzgsOSwiXFxnYW1tYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOSw3LCJcXGRlbHRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[\begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=30pt}, from=1-3, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-3, to=1-5] \arrow["\beta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\alpha"', shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\gamma"', shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["\delta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \end{tikzcd}\] the first of the interchange laws states that two composites below are related: \[ (\alpha \star_1 \beta) \star_0 (\gamma \star_1 \delta) \simeq (\alpha \star_0 \gamma) \star_1 (\beta \star_0 \delta)\] \end{itemize} These laws can be combined to create non-trivial emergent behaviour in a form not seen in the theory of \(1\)-categories. One critical example of this is known as the \emph{Eckmann-Hilton} argument~\cite{eckmann1962group}, which states that the composition of two scalars, morphisms from the identity to the identity, commute. The argument proceeds by moving the two scalars around each other, as depicted in \cref{fig:eh}. This crucially uses both the interchange and unitality laws. 
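Written out equationally, the chain of isomorphisms depicted in \cref{fig:eh}
takes the following form. This is only a sketch: we write \(\id\) for the
identity \(2\)-cell on the relevant identity \(1\)-cell, label each step with
the law it uses, and suppress the coherence data relating the steps:
\begin{align*}
\alpha \star_1 \beta
&\simeq (\alpha \star_0 \id) \star_1 (\id \star_0 \beta)
&& \text{unitality}\\
&\simeq (\alpha \star_1 \id) \star_0 (\id \star_1 \beta)
&& \text{interchange}\\
&\simeq \alpha \star_0 \beta
&& \text{unitality}\\
&\simeq (\id \star_1 \alpha) \star_0 (\beta \star_1 \id)
&& \text{unitality}\\
&\simeq (\id \star_0 \beta) \star_1 (\alpha \star_0 \id)
&& \text{interchange}\\
&\simeq \beta \star_1 \alpha
&& \text{unitality}
\end{align*}
As we will see shortly, in a fully weak globular setting the first step cannot
even be stated in this form, as its two sides have different boundaries; the
sketch should therefore be read in a setting where the unit laws hold strictly
enough for each line to be well-formed.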
\newsavebox{\ehalpha} \savebox{\ehalpha}{\adjustbox{scale=0.8}{ \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=10pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=10pt}, from=1-2, to=1-3] \arrow["\alpha"', color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\id"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}} \newsavebox{\ehbeta} \savebox{\ehbeta}{\adjustbox{scale=0.8}{ \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=10pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=10pt}, from=1-2, to=1-3] \arrow["\id"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\beta"', color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}} \newsavebox{\ehlefttop} \savebox{\ehlefttop}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7),, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\alpha", color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehrighttop} \savebox{\ehrighttop}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\beta", color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehleftbot} \savebox{\ehleftbot}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7),, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\alpha", color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehrightbot} \savebox{\ehrightbot}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, 
anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\beta", color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \begin{figure}[ht] \centering \[ \begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \&\& \bullet \& \simeq \& \bullet \&\&\&\&\& \bullet \& \simeq \& \bullet \&\&\& \bullet \&\&\& \bullet \\ \\ \&\&\&\&\&\&\&\&\&\&\&\&\&\& \simeq \\ \\ \bullet \&\& \bullet \& \simeq \& \bullet \&\&\&\&\& \bullet \& \simeq \& \bullet \&\&\& \bullet \&\&\& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "\id", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "\id"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "\id"{description}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, draw=none, controls=+(90:1.8) and +(90:1.8), from=1-5, to=1-10] \arrow[""{name=4, anchor=center, inner sep=0}, draw=none, controls=+(90:-1.8) and +(90:-1.8), from=1-5, to=1-10] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-5, to=1-10] \arrow[""{name=6, anchor=center, inner sep=0}, draw=none, controls=+(90:1.8) and +(90:1.8), from=5-5, to=5-10] \arrow[""{name=7, anchor=center, inner sep=0}, draw=none, controls=+(90:-1.8) and +(90:-1.8), from=5-5, to=5-10] \arrow[""{name=8, anchor=center, inner sep=0}, from=5-5, to=5-10] \arrow[""{name=9, anchor=center, inner sep=0}, "\id", curve={height=-24pt}, from=5-1, to=5-3] \arrow[""{name=10, anchor=center, inner sep=0}, "\id"', curve={height=24pt}, from=5-1, to=5-3] \arrow[""{name=11, anchor=center, inner sep=0}, "\id"{description}, from=5-1, to=5-3] \arrow[""{name=12, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-12, to=1-15] \arrow[""{name=13, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-12, to=1-15] \arrow[""{name=14, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-15, to=1-18] \arrow[""{name=15, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-15, to=1-18] \arrow[""{name=16, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=5-12, to=5-15] \arrow[""{name=17, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=5-12, to=5-15] \arrow[""{name=18, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=5-15, to=5-18] \arrow[""{name=19, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=5-15, to=5-18] \arrow["\alpha"', color=Diag1, shorten <=3pt, shorten >=5pt, Rightarrow, from=1, to=2] \arrow["\beta"', color=Diag2, shorten <=5pt, shorten >=3pt, Rightarrow, from=2, to=0] \arrow["\beta"', color=Diag2, shorten <=3pt, shorten >=5pt, Rightarrow, from=10, to=11] \arrow["\alpha"', color=Diag1, shorten <=5pt, shorten >=3pt, Rightarrow, from=11, to=9] \arrow["\usebox{\ehalpha}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=5] \arrow["\usebox{\ehbeta}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=3] \arrow["\usebox{\ehbeta}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=7, to=8] \arrow["\usebox{\ehalpha}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=8, to=6] 
\arrow["\usebox{\ehlefttop}"{description,inner sep = 0,xshift = -1.3pt, yshift = 0.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=13, to=12] \arrow["\usebox{\ehrighttop}"{description,inner sep = 0,xshift = -1.3pt,yshift = 0.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=15, to=14] \arrow["\usebox{\ehleftbot}"{description,inner sep = 0,xshift = -1.3pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=17, to=16] \arrow["\usebox{\ehrightbot}"{description,inner sep = 0,xshift = -1.3pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=19, to=18] \arrow[controls=+(90:1.8) and +(90:1.8), from=1-5, to=1-10] \arrow[controls=+(90:-1.8) and +(90:-1.8), from=1-5, to=1-10] \arrow[controls=+(90:1.8) and +(90:1.8), from=5-5, to=5-10] \arrow[controls=+(90:-1.8) and +(90:-1.8), from=5-5, to=5-10] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-12, to=1-15] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-12, to=1-15] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-15, to=1-18] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-15, to=1-18] \arrow[controls=+(80:1.5) and +(100:1.5), from=5-12, to=5-15] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=5-12, to=5-15] \arrow[controls=+(80:1.5) and +(100:1.5), from=5-15, to=5-18] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=5-15, to=5-18] \end{tikzcd} \] \caption{The Eckmann-Hilton argument.} \label{fig:eh} \end{figure} \paragraph{Semistrict higher categories} While we have given the types of laws that must hold in \(n\)-categories, we have not yet stated the full nature of these laws. By taking each of these laws to hold up to equality, one obtains the notion of a \emph{strict} \(n\)-category. It is often the case in category theory that equality is the incorrect notion by which to compare objects, with the coarser relation of isomorphism being preferable. In the presence of higher-dimensional cells, arrows themselves can be compared up to isomorphism. This allows the laws for an \(n\)-category to be stated with isomorphism replacing equality, giving rise to the notion of \emph{weak} \(n\)-category. In such a weak \(n\)-category, each law is given by a set of isomorphisms, which are given as part of the data of the category. For the associativity law of three \(1\)-cells \(f\), \(g\), and \(h\), an invertible \(2\)-cell known as the \emph{associator} must be given, which takes the following form: \[ \alpha_{f,g,h} : (f * g) * h \to f * (g * h)\] Similarly, the unit laws for a \(1\)-cell \(f\) are given by the \emph{left unitor} \(\lambda_f\) and the \emph{right unitor} \(\rho_f\) which take the following form: \[ \lambda_f : \id * f \to f \qquad \rho_f : f * \id \to f\] Whereas two morphisms being equal is a property of those morphisms, an isomorphism between the same morphisms is a form of data, and the choice of isomorphism may not be unique. Weak higher categories therefore contain higher \emph{coherence laws} which govern the interaction of these isomorphisms. These coherence laws can also be given as isomorphisms instead of equalities, and must satisfy their own coherence laws, leading to a tower of coherence laws. The amount of data needed to define an \(n\)-category therefore increases exponentially as \(n\) increases. In addition to the difficulty in defining a weak \(n\)-category, it is also more difficult to give proofs in a weak environment, due to the bureaucracy of working around the various coherence isomorphisms. Consider the proof of Eckmann-Hilton given in \cref{fig:eh}. 
In a weak environment, we would hope to be able to simply replace each equality by the appropriate isomorphism; however, doing so for the first equality in the proof would require us to give an isomorphism:
\[ \alpha \cong \alpha * \id\]
Each side of this isomorphism has a different source and target, and hence no such isomorphism can be given in the globular setting used in this thesis. A full proof of Eckmann-Hilton is still possible but far more involved. Weak categories are a more general notion than their strict counterparts, with every strict \(n\)-category generating a corresponding weak category by letting every coherence isomorphism be given by the identity morphism. For \(2\)-categories, the converse also holds: every weak \(2\)-category is equivalent to a strict \(2\)-category, allowing proofs for weak \(2\)-categories to be given by instead proving the same property for strict \(2\)-categories. This is no longer the case for \(n\)-categories where \(n \geq 3\). It was shown by \citeauthor{simpson1998homotopy}~\cite{simpson1998homotopy} that strict \(n\)-categories do not model the homotopy structure of all topological spaces, with the topological space \(S^2\) having no interpretation. More concretely, we consider the morphism \(\mathsf{EH}_{\alpha,\beta} : \alpha \star_1 \beta \to \beta \star_1 \alpha\) generated by the Eckmann-Hilton argument for scalars \(\alpha\) and \(\beta\). In a strict \(3\)-category, this morphism is given by the identity and so:
\[ \mathsf{EH}_{\alpha,\beta} \star_2 \mathsf{EH}_{\beta,\alpha} = \id\]
This equality does not hold in a general weak \(3\)-category (even up to isomorphism), contradicting the claim that each weak \(3\)-category is equivalent to a strict \(3\)-category. This motivates the search for semistrict definitions of \(n\)-category: definitions where some operations are strict, yet which do not lose the expressivity of weak \(n\)-categories. For \(3\)-categories, two such definitions have been proposed:
\begin{itemize}
\item \citeauthor{joyal2006weak}~\cite{joyal2006weak,joyalcoherence} define a monoidal \(2\)-category (which can be viewed as a \(3\)-category with a single \(0\)-cell) which only has weak units and unitors, and is otherwise strict. They prove that all braided monoidal categories (weak \(3\)-categories with a unique \(0\)-cell and unique \(1\)-cell) can be interpreted in this setting as the category of endomorphisms on the weak unit morphism.
\item Gray-categories are a form of semistrict \(3\)-categories for which all structure is strict except the interchanger, the isomorphism witnessing the interchange law. \citeauthor{gordon1995coherence}~\cite{gordon1995coherence} prove that every weak \(3\)-category is equivalent to a Gray-category.
\end{itemize}
It is non-trivial even to define such a notion of semistrict \(n\)-category for \(n > 3\), let alone prove that it loses no expressivity over its weak counterpart. Simpson conjectures~\cite{simpson1998homotopy} that having only the unit laws weak is sufficient to model all homotopy groupoids, that is, \(\infty\)-groupoids arising from the homotopy of topological spaces, though it is unclear if such a definition has been given. \citeauthor{hadzihasanovic2019representable}~%
\cite{hadzihasanovic2019representable} defines weak higher categories based on \emph{diagrammatic sets}.
It could be argued that such a definition can model strict interchange, though the classes of diagrams that can be composed in this theory are restricted to those that are \emph{spherical}, which disallows horizontal composites in the form stated above and makes comparison difficult. \citeauthor{Batanin2013}~\cite{Batanin2013} define a notion of \(\infty\)-category with strict units based on the language of operads.
% A key axiom in this theory is \emph{disc reduction} which states that composites trivialise over certain configurations of cells known as discs.
Definitions of semistrict \(n\)-categories which are strictly unital and associative have also been proposed, primarily inspired by the graphical language of \emph{string diagrams}. \citeauthor{bar2017data}~\cite{bar2017data} define \emph{quasi-strict \(4\)-categories}, where the associativity and unitality laws hold strictly. \citeauthor{dorn2018associative}~\cite{dorn2018associative} defines \emph{associative \(n\)-categories}: a definition of strictly associative and unital \(n\)-category similarly based on geometric principles. Associative \(n\)-categories are further studied by Heidemann, Reutter, Tataru, and Vicary~\cite{reutter2019high,heidemann2022zigzag,tataru2024theory}, work which has recently led to the construction of the graphical proof assistant \textsf{homotopy.io}~\cite{corbyn2024homotopy} for manipulating higher-dimensional string diagrams. Similarly to the case for diagrammatic sets, the composition operations in these theories have a different form to those of strict \(n\)-categories, making comparison difficult. The connection between these definitions and geometry is studied by \citeauthor{dorn2021framed}~\cite{dorn2021framed} and \citeauthor{heidemann2023framed}~\cite{heidemann2023framed}.
\paragraph{Type theory and higher categories}
Deep links exist between higher category theory and type theory. The identity type in Martin-Löf type theory (\textsc{Mltt})~\cite{MARTINLOF197573} naturally leads to higher-dimensional structure; the identity type \(s =_A t\) can be formed for any two terms \(s\) and \(t\) of type \(A\), but this construction can be iterated since the identity type is a type itself, leading to higher identity types \(p =_{s =_A t} q\) for \(p, q: s =_A t\). Operations on this type are generated by the J-rule, an induction principle for the identity type. Independent proofs by \citeauthor{lumsdaine2010weak}~\cite{lumsdaine2010weak} and \citeauthor{garner2011types}~\cite{garner2011types} show that the J-rule is sufficient to equip identity types with the appropriate operations to form a weak \(\infty\)-groupoid. Terms of the identity type \(s =_A t\) correspond to witnesses of the fact that \(s\) and \(t\) are equal, or can even be viewed as proofs of the equality. The treatment of these proofs as objects of study in their own right is known as \emph{proof relevance}. Although the axiom of uniqueness of identity proofs (UIP), which states that any two terms of the identity type are themselves equal, is consistent with \textsc{Mltt}, it was shown by \citeauthor{hofmannstreicher}~\cite{hofmannstreicher} that it is not provable; they constructed a model of \textsc{Mltt} where types are interpreted as \(1\)-groupoids, and identity types are non-trivial. The \(\infty\)-groupoidal nature of \textsc{Mltt} is embraced in Homotopy type theory (\textsc{Hott})~\cite{hottbook}, where types are interpreted as topological spaces.
The key component of \textsc{Hott}, the \emph{univalence axiom}, which is incompatible with UIP, states that the identities between types are given by equivalences between these types, which need not be unique. The models of \textsc{Hott} are equipped with more structure than is present in an \(\infty\)-groupoid, and are given by \(\infty\)-toposes~\cite{shulman2019all}. In the appendices of his thesis~\cite{brunerie2016homotopy}, \citeauthor{brunerie2016homotopy} defines a type theory for \(\infty\)-groupoids by removing all structure from \textsc{Mltt} which does not concern the identity type. This theory constructs the identity type similarly to \textsc{Mltt}, but replaces the J-rule with a rule stating that all terms over \emph{contractible contexts} are equal. \citeauthor{finster2017type} further refine this idea to produce the type theory \Catt~\cite{finster2017type}, a type theory for weak \(\infty\)-categories, using techniques from a definition of weak \(\infty\)-categories due to \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck}, which is itself based on an earlier definition of \(\infty\)-groupoids given by \citeauthor{PursuingStacks}~\cite{PursuingStacks}. It was later shown~\cite{benjamin2021globular} that type-theoretic models of \Catt coincide with the \(\infty\)-categories defined by \citeauthor{maltsiniotis2010grothendieck}. The type theory \Catt is unusual in having no computation or equality rules. In the current work we leverage this to define new notions of semistrict \(\infty\)-category, by adding definitional equality to \Catt. This equality unifies certain terms, which correspond to operations in a weak \(\infty\)-category, causing the semistrict behaviour of the resulting theories. This thesis develops a framework for working with equality relations in \Catt, and uses this to define two new type theories, \Cattsu and \Cattsua:
\begin{itemize}
\item \Cattsu is a version of \Catt which is strictly unital. It is primarily generated by the \emph{pruning} reduction, a computation rule which removes unnecessary identities from more complex terms.
\item \Cattsua is \Catt with strict unitors and associators. In this theory, pruning is replaced by a more general reduction which we call \emph{insertion}, which merges multiple composites into a single composite, flattening the structure of terms in the theory. We claim that the models of \Cattsua give the first algebraic definition of an \(\infty\)-category in which the unitality and associativity laws hold strictly.
\end{itemize}
The majority of the technical content of this thesis is concerned with proving standard metatheoretic properties of these type theories. This includes defining a notion of computation for each theory, given by demonstrating the existence of a confluent and terminating reduction system, which allows these theories to be implemented. This is used to produce interpreters for both theories, allowing complex constructions to be checked mechanically. We demonstrate the utility of this by formalising a proof of the \emph{syllepsis}, a \(5\)-dimensional term witnessing a commutativity property of the Eckmann-Hilton argument.
\clearpage
\paragraph{Overview}
We now give an overview of the content contained in each of the following chapters of the thesis.
\begin{itemize}
\item \cref{sec:background} gives an introduction to \(\infty\)-category theory. It defines strict \(\infty\)-categories and goes on to present the definition of weak \(\infty\)-categories due to Maltsiniotis.
The chapter ends by giving a definition of the type theory \Catt, as defined by \citeauthor{finster2017type}, and describing some preliminary well-known constructions in \Catt.
\item \cref{cha:gener-pres-catt} introduces a general framework for studying variants of \Catt with definitional equality relations generated from a set of rules \(\mathcal{R}\), which we name \Cattr. The chapter also states various properties concerning the metatheory of \Cattr, including specifying conditions on the set of equality rules \(\mathcal{R}\) under which the theory is well-behaved. The description of \Catt in this chapter is comprehensive and self-contained, although it lacks some of the exposition of the previous chapter. The type theory \Cattr is accompanied by an Agda formalisation, which is introduced in this chapter.
\item \cref{sec:operations-catt} takes an arbitrary well-behaved variant of \Cattr and explores various constructions that can be formed in this setting. The primary purpose of this chapter is to introduce the \emph{pruning operation}, which is done in \cref{sec:pruning}, and the \emph{insertion operation}, which is introduced in \cref{sec:insertion}. \cref{sec:trees,sec:structured-terms} build up theory about a certain class of contexts represented by trees, and terms that appear in these contexts. This theory is vital for a complete understanding of insertion.
\item In \cref{cha:cattstrict}, the type theories \Cattsu and \Cattsua are finally defined in \cref{sec:cattsu,sec:cattsua} respectively, as variants of the framework \Cattr. Preliminary results about both theories are proved, primarily by compiling results that have been stated in the previous two chapters. The main technical contribution of this chapter involves giving reduction systems for both theories, and giving proofs that these reduction systems are strongly terminating and globally confluent, hence making equality in these theories decidable. In \cref{sec:towards-nbe}, the decidability of equality is used to implement a typechecker for both theories \Cattsu and \Cattsua. The typechecker uses \emph{normalisation by evaluation} (NbE) to reduce terms to a canonical form where they can be checked for equality. The section discusses the interaction of NbE with \Catt, as well as the limitations of this approach in this setting. \cref{sec:models} discusses some properties of the models of these type theories, introducing a technique which we call \emph{rehydration}, which ``pads out'' terms of the semistrict theory with the necessary coherences to produce a term of \Catt which is equivalent to the original term. Rehydration can be seen as a conservativity result for the semistrict theories introduced at the start of the chapter. A proof of rehydration is given for the restricted case of terms over a certain class of contexts known as ps-contexts. This partial rehydration result is sufficient to determine that the semistrictness defined by \Cattsu and \Cattsua is a property: a model of \Catt can be a model of \Cattsu or \Cattsua in at most one way. We further explore some obstructions to rehydration in a generic context. The thesis ends with a discussion of further variants of \Catt and other options for future work.
\end{itemize}
Although results of later chapters depend on definitions and results of the preceding chapters, a linear reading of this thesis is not essential.
A reader who is already familiar with the type theory \Catt can safely skip \cref{sec:background}, and a reader who is only interested in the type theory \Cattsu could read \cref{cha:gener-pres-catt} followed by \cref{sec:pruning,sec:cattsu}. Similarly, a reader only interested in \Cattsua can ignore any content on the pruning construction. \cref{sec:towards-nbe} may be of interest to a reader who is purely interested in the type-theoretic techniques used, and not the type theory \Catt itself.
\paragraph{Statement of authorship}
The type theory \Cattsu was originally developed in collaboration with Eric Finster, David Reutter, and Jamie Vicary, and was presented by the author at the Logic in Computer Science conference in 2022~\cite{finster2022type}. \Cattsua will be presented at Logic in Computer Science 2024~\cite{finster2023strictly} and was developed in collaboration with Eric Finster and Jamie Vicary. The author claims the development of the framework \Cattr and its accompanying Agda formalisation as an individual contribution, as well as the implementation of \Cattsu and \Cattsua which appears in \cref{sec:towards-nbe}.
\chapter{Background}
\label{sec:background}
We begin with an overview of the important concepts required for the rest of the thesis. Throughout, we will assume knowledge of various basic concepts from computer science, as well as a basic knowledge of category theory (including functor categories, presheaves, and (co)limits) and type theory. The primary purpose of the following sections is to introduce weak \(\infty\)-categories. While there are many differing definitions of \(\infty\)-categories (see \cite{leinster2001survey}), we focus here on models of the type theory \Catt~\cite{finster2017type}, which are known to be equivalent to a definition of \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck} based on an earlier definition by \citeauthor{PursuingStacks}~\cite{PursuingStacks}, which we introduce in \cref{sec:weak}. In \cref{sec:type-theory-catt}, we define the type theory \Catt, similarly to how it was originally defined. This section additionally serves as a place to introduce various syntax and notations which will be used throughout the rest of the thesis.
\section{Higher categories}
\label{sec:higher-categories}
A higher category is a generalisation of the ordinary notion of a category to allow higher-dimensional structure. This manifests in the form of allowing arrows or morphisms to have their source or target be another morphism instead of an object. In this thesis, we are primarily concerned with \(\infty\)-categories, which are equipped with the notion of an \(n\)-cell for each \(n \in \mathbb{N}\), where each \((n+1)\)-cell has a source and target \(n\)-cell. The role of objects is played by the \(0\)-cells, with \(1\)-cells as the morphisms between these objects. For \(0\)-cells \(x\) and \(y\), a \(1\)-cell \(f\) with source \(x\) and target \(y\) will be drawn as:
\[ \begin{tikzcd} x & y \arrow["f", from=1-1, to=1-2] \end{tikzcd} \]
or may be written as \(f: x \to y\). Two cells are \emph{parallel} if they have the same source and target. Between any two parallel \(n\)-cells \(f\) and \(g\), we have a set of \((n+1)\)-cells between them.
A \(2\)-cell \(\alpha : f \to g\) may be drawn as:
\[ \begin{tikzcd} x & y \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-1, to=1-2] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd} \]
A \(3\)-cell \(\gamma\) between parallel \(2\)-cells \(\alpha\) and \(\beta\) could be drawn as:
\[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f", curve={height=-15pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"', curve={height=15pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "\alpha", shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow[""{name=3, anchor=center, inner sep=0}, "\beta"', shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\gamma", shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=2, to=3] \end{tikzcd} \]
Just as in ordinary \(1\)-category theory, we expect to be able to compose morphisms whose boundaries are compatible. For \(1\)-cells, nothing has changed: given \(1\)-cells \(f: x \to y\) and \(g : y \to z\), we form the composition \(f * g\):
\[ \begin{tikzcd} x & y & z \arrow[from=1-1, to=1-2, "f"] \arrow[from=1-2, to=1-3, "g"] \end{tikzcd} \]
which has source \(x\) and target \(z\). We pause here to note that composition will be given in ``diagrammatic order'' throughout the whole thesis, which is the opposite of the order of function composition yet the same as the order of the arrows as drawn above. This is chosen as it will be common for us to draw higher-dimensional arrows in a diagram, and rare for us to consider categories where the higher arrows are given by functions. In an attempt to avoid confusion, we use an asterisk (\(*\)) to represent composition of arrows or cells in a higher category, and will use a circle (\(\circ\)) only for function composition. In two dimensions, there is no longer a unique composition operation. For \(2\)-cells \(\alpha : f \to g\) and \(\beta : g \to h\), the composite \(\alpha *_1 \beta\) can be formed as before:
% https://q.uiver.app/#q=WzAsMixbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzAsMSwiZiIsMCx7ImN1cnZlIjotNH1dLFswLDEsImgiLDIseyJjdXJ2ZSI6NH1dLFswLDEsImciLDFdLFsyLDQsIlxcYWxwaGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzQsMywiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \]
We refer to this composition as \emph{vertical composition}.
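Note that the vertical composite inherits the outermost boundary of its arguments: for \(\alpha : f \to g\) and \(\beta : g \to h\) as drawn above, we have
\[ \alpha *_1 \beta : f \to h\]
agreeing with the boundary formulas given in the definition of a strict \(\infty\)-category below.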
The cells \(\gamma : i \to j\) and \(\delta : k \to l\) can also be composed in the following way: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTN9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6M31dLFsxLDIsIiIsMix7ImN1cnZlIjotM31dLFsxLDIsIiIsMix7ImN1cnZlIjozfV0sWzMsNCwiXFxhbHBoYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNSw2LCJcXGJldGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "j", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "i"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "l", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "k"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\gamma", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\delta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \] This composition is called the \emph{horizontal composition}, and is written \(\gamma *_0 \delta\). The subscript refers to the dimension of the shared boundary in the composition, with the \(1\)-cell \(g\) being the shared boundary in the vertical composition example and the \(0\)-cell \(y\) being the shared boundary in the horizontal composition example. The dimension of this shared boundary is the \emph{codimension} of the composition. This pattern continues with \(3\)-cells, which can be composed at codimension \(0\), \(1\), or \(2\), as depicted below: % https://q.uiver.app/#q=WzAsNyxbMiwwLCJcXGJ1bGxldCJdLFswLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbNSwwLCJcXGJ1bGxldCJdLFs2LDAsIlxcYnVsbGV0Il0sWzcsMCwiXFxidWxsZXQiXSxbOCwwLCJcXGJ1bGxldCJdLFsxLDAsIiIsMCx7ImN1cnZlIjotM31dLFsxLDAsIiIsMix7ImN1cnZlIjozfV0sWzIsMywiIiwwLHsiY3VydmUiOi00fV0sWzIsMywiIiwyLHsiY3VydmUiOjR9XSxbMiwzXSxbNCw1LCIiLDAseyJjdXJ2ZSI6LTN9XSxbNCw1LCIiLDIseyJjdXJ2ZSI6M31dLFs1LDYsIiIsMix7ImN1cnZlIjotM31dLFs1LDYsIiIsMix7ImN1cnZlIjozfV0sWzgsNywiIiwyLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDcsIiIsMCx7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDcsIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTAsMTEsIiIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTAsMTEsIiIsMCx7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMSw5LCIiLDEseyJvZmZzZXQiOi00LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzExLDksIiIsMSx7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMywxMiwiIiwyLHsib2Zmc2V0IjotMywic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMywxMiwiIiwwLHsib2Zmc2V0IjozLCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE1LDE0LCIiLDIseyJvZmZzZXQiOi0zLCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE1LDE0LCIiLDAseyJvZmZzZXQiOjMsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTYsMTgsIlxcZ2FtbWEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE4LDE3LCJcXGRlbHRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyMSwyMiwiXFxnYW1tYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTksMjAsIlxcZGVsdGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzIzLDI0LCJcXGdhbW1hIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyNSwyNiwiXFxkZWx0YSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet & \bullet && \bullet & \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, 
curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-24pt}, from=1-4, to=1-6] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=24pt}, from=1-4, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, from=1-4, to=1-6] \arrow[""{name=5, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-7, to=1-8] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=18pt}, from=1-7, to=1-8] \arrow[""{name=7, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-8, to=1-9] \arrow[""{name=8, anchor=center, inner sep=0}, curve={height=18pt}, from=1-8, to=1-9] \arrow[""{name=9, anchor=center, inner sep=0}, shift left=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=10, anchor=center, inner sep=0}, shift right=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=11, anchor=center, inner sep=0}, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=12, anchor=center, inner sep=0}, shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow[""{name=13, anchor=center, inner sep=0}, shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow[""{name=14, anchor=center, inner sep=0}, shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \arrow[""{name=15, anchor=center, inner sep=0}, shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \arrow[""{name=16, anchor=center, inner sep=0}, shift left=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \arrow[""{name=17, anchor=center, inner sep=0}, shift right=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \arrow[""{name=18, anchor=center, inner sep=0}, shift left=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=8, to=7] \arrow[""{name=19, anchor=center, inner sep=0}, shift right=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=8, to=7] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=9, to=11] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=11, to=10] \arrow["", shorten <=3pt, shorten >=3pt, Rightarrow, nfold=3, from=14, to=15] \arrow["", shorten <=3pt, shorten >=3pt, Rightarrow, nfold=3, from=12, to=13] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=16, to=17] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=18, to=19] \end{tikzcd} \] where the unlabelled arrows and objects (which are written \(\bullet\)) are assumed to represent arbitrary potentially-distinct cells. For every \(n\)-cell \(x\), there is an \((n+1)\)-cell \(\id(x) : x \to x\), called the \emph{identity morphism}. Similarly to 1-categories, \(\infty\)-categories need to satisfy certain laws, which fall into 3 groups: associativity, unitality, and interchange. These laws can hold strictly, meaning that they hold up to equality, or weakly, meaning that they hold up to a higher-dimensional isomorphism. We delay the discussion of weak \(\infty\)-categories to \cref{sec:weak}, and begin with the discussion of strict \(\infty\)-categories. In these strict categories, associativity laws are the same as for 1-categories, only now a law is needed for each composition (in every dimension and codimension). Unitality is again similar to the case for 1-categories, except we again need unitality laws for each composition. We note that for lower-codimensional compositions, an iterated identity is needed. 
For example, given a \(2\)-cell \(\alpha : f \to g\), the appropriate equation for left unitality of horizontal composition is:
\[ \id(\id(x)) *_0 \alpha = \alpha \]
In general, for a unit to be cancelled, it must be iterated a number of times equal to the difference between the dimension and codimension of the composition. Interchange laws do not appear in 1-categories, and specify how compositions of different dimensions interact. The first interchange law states that for suitable \(2\)-cells \(\alpha\), \(\beta\), \(\gamma\), and \(\delta\):
\[ (\alpha *_0 \gamma) *_1 (\beta *_0 \delta) = (\alpha *_1 \beta) *_0 (\gamma *_1 \delta)\]
This can be diagrammatically depicted as:
\newsavebox{\innertop}
\savebox{\innertop}{ \adjustbox{scale=0.8}{\begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=12pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}}
\newsavebox{\innerbot}
\savebox{\innerbot}{ \adjustbox{scale=0.8}{\begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=12pt}, from=1-2, to=1-3] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\delta", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}}
\newsavebox{\innerleft}
\savebox{\innerleft}{ \adjustbox{scale=1}{\begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7), from=1-1, to=1-2] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}}
\newsavebox{\innerright}
\savebox{\innerright}{ \adjustbox{scale=1}{\begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7), from=1-1, to=1-2] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\delta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}}
\[ \begin{tikzcd}[column sep=small] \bullet &&&&& \bullet & {=} & \bullet &&& \bullet &&& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, from=1-1, to=1-6] \arrow[""{name=1, anchor=center, inner sep=0}, draw=none, controls=+(90:2) and +(90:2), from=1-1, to=1-6] \arrow[""{name=2, anchor=center, inner sep=0}, draw=none, controls=+(90:-2) and +(90:-2), from=1-1, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-8, to=1-11] \arrow[""{name=5, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-8, to=1-11] \arrow[""{name=6, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-11, to=1-14] \arrow[""{name=8, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-11, to=1-14] \arrow["\usebox{\innertop}"{description, inner sep = 0,xshift = -1.2pt}, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\usebox{\innerbot}"{description, inner sep = 0,xshift = -1.2pt}, shorten <=4pt, shorten >=4pt, Rightarrow, from=0, to=1] \arrow[""{name=1, anchor=center, inner sep=0}, controls=+(90:2) and +(90:2), from=1-1, to=1-6] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(90:-2) and +(90:-2), from=1-1, to=1-6] \arrow["\usebox{\innerleft}"{description, inner sep = 0,xshift = -1.3pt}, shorten <=2pt, shorten >=2pt, Rightarrow, from=5, to=4] \arrow["\usebox{\innerright}"{description, inner sep = 0,xshift = -1.3pt}, shorten <=2pt, shorten >=2pt, Rightarrow, from=8, to=6] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-8, to=1-11] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-8, to=1-11] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-11, to=1-14] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-11, to=1-14] \end{tikzcd} \]
There are also interchange laws for the interaction of composition and identities: a composition of two identities is the same as an identity on the composite of the underlying cells. The \(\infty\)-categories that we study in this thesis will be globular, meaning that their cells form a globular set. A globular set can be seen as a natural extension of the data of a category, whose data can be arranged into the following diagram:
% https://q.uiver.app/#q=WzAsMixbMCwwLCJZIl0sWzEsMCwiWCJdLFswLDEsInMiLDAseyJvZmZzZXQiOi0xfV0sWzAsMSwidCIsMix7Im9mZnNldCI6MX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} M & O \arrow["s", shift left, from=1-1, to=1-2] \arrow["t"', shift right, from=1-1, to=1-2] \end{tikzcd} \]
where \(O\) is the set of objects, \(M\) is the set of all morphisms, and \(s\) and \(t\) are functions assigning each morphism to its source and target object respectively. \(2\)-cells can be added to this diagram in a natural way:
% https://q.uiver.app/#q=WzAsMyxbMSwwLCJDXzEiXSxbMiwwLCJDXzAiXSxbMCwwLCJDXzIiXSxbMCwxLCJzXzAiLDAseyJvZmZzZXQiOi0xfV0sWzAsMSwidF8wIiwyLHsib2Zmc2V0IjoxfV0sWzIsMCwic18xIiwwLHsib2Zmc2V0IjotMX1dLFsyLDAsInRfMSIsMix7Im9mZnNldCI6MX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} {C_2} & {C_1} & {C_0} \arrow["{s_0}", shift left, from=1-2, to=1-3] \arrow["{t_0}"', shift right, from=1-2, to=1-3] \arrow["{s_1}", shift left, from=1-1, to=1-2] \arrow["{t_1}"', shift right, from=1-1, to=1-2] \end{tikzcd} \]
In a globular set, the source and target of any cell must be parallel, meaning they share the same source and target. This condition is imposed by the \emph{globularity conditions}. Adding these and iterating the process leads to the following definition.
\begin{definition}
The category of globes \(\mathbf{G}\) has objects given by the natural numbers and morphisms generated from \(\mathbf{s}_n, \mathbf{t}_n : n \to n + 1\) quotiented by the \emph{globularity conditions}:
\begin{align*}
\mathbf{s}_{n+1} \circ \mathbf{s}_n &= \mathbf{t}_{n+1} \circ \mathbf{s}_n\\
\mathbf{s}_{n+1} \circ \mathbf{t}_n &= \mathbf{t}_{n+1} \circ \mathbf{t}_n
\end{align*}
The category of globular sets \(\mathbf{Glob}\) is the presheaf category \([\mathbf{G}^{\mathrm{op}}, \mathbf{Set}]\).
\end{definition}
Unwrapping this definition, a globular set \(G\) consists of sets \(G(n)\) for each \(n \in \mathbb{N}\), with source and target maps \(s_n, t_n : G(n+1) \to G(n)\), forming the following diagram:
\[ \begin{tikzcd} \cdots & {G(3)} & {G(2)} & {G(1)} & {G(0)} \arrow["{s_0}", shift left, from=1-4, to=1-5] \arrow["{t_0}"', shift right, from=1-4, to=1-5] \arrow["{s_1}", shift left, from=1-3, to=1-4] \arrow["{t_1}"', shift right, from=1-3, to=1-4] \arrow["{t_2}"', shift right, from=1-2, to=1-3] \arrow["{s_2}", shift left, from=1-2, to=1-3] \arrow[shift right, from=1-1, to=1-2] \arrow[shift left, from=1-1, to=1-2] \end{tikzcd} \]
and satisfying the globularity conditions. A morphism of globular sets \(F : G \to H\) is a collection of functions \(G(n) \to H(n)\) which commute with the source and target maps. Given a globular set \(G\), we will call the elements of \(G(n)\) the \(n\)-cells and write \(f : x \to y\) for an \((n+1)\)-cell \(f\) where \(s_n(f) = x\) and \(t_n(f) = y\). We further define the \(n\)-boundary operators \(\delta_n^-\) and \(\delta_n^+\), which take the source or target respectively of an \((n+k)\)-cell \(k\) times, returning an \(n\)-cell.
\begin{example}
\label{ex:disc}
The \(n\)-disc \(D^n\) is a finite globular set given by \(Y(n)\), where \(Y\) is the Yoneda embedding \(\mathbf{G} \to \mathbf{Glob}\). \(D^n\) has no \(k\)-cells for \(k > n\), a single \(n\)-cell \(d_n\), and two \(m\)-cells \(d_m^-\) and \(d_m^+\) for \(m < n\). Every \((m+1)\)-cell of \(D^n\) has source \(d_m^-\) and target \(d_m^+\). The first few discs are depicted in \cref{fig:discs}. The Yoneda lemma tells us that a map of globular sets \(D^n \to G\) is the same as an \(n\)-cell of \(G\). For an \(n\)-cell \(x\) of \(G\), we let \(\{x\}\) be the unique map \(D^n \to G\) which sends \(d_n\) to \(x\).
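Since \(\{x\}\) must commute with the source and target maps, it necessarily sends the boundary cells of the disc to the boundaries of \(x\); explicitly, for \(m < n\):
\[ \{x\}(d_m^-) = \delta_m^-(x) \qquad \{x\}(d_m^+) = \delta_m^+(x)\]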
\end{example}
\begin{figure}[h]
\centering
\begin{tabular}{P{3cm} P{3cm} P{3cm} P{4cm}}
\(D^0\)&\(D^1\)&\(D^2\)&\(D^3\)\\
{\begin{tikzcd} d_0 \end{tikzcd} }&{\begin{tikzcd}[ampersand replacement=\&] d_0^- \& d_0^+ \arrow[from=1-1, to=1-2, "d_1"] \end{tikzcd} }&{\begin{tikzcd}[ampersand replacement=\&] d_0^- \& d_0^+ \arrow[""{name=0, anchor=center, inner sep=0}, "d_1^+", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "d_1^-"', curve={height=18pt}, from=1-1, to=1-2] \arrow["d_2"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd} }&{\begin{tikzcd}[ampersand replacement=\&] d_0^- \&\& d_0^+ \arrow[""{name=0, anchor=center, inner sep=0}, "d_1^+", curve={height=-25pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "d_1^-"', curve={height=25pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "d_2^-", shift left=12pt,Rightarrow, shorten <=5pt, shorten >=5pt, from=1,to=0] \arrow[""{name=3, anchor=center, inner sep=0}, "d_2^+"', shift right=12pt,Rightarrow, shorten <=5pt, shorten >=5pt, from=1,to=0] \arrow["d_3", Rightarrow, nfold = 3, shorten <=3pt, shorten >=3pt,from=2,to=3] \end{tikzcd}}
\end{tabular}
\caption{The first disc globular sets.}
\label{fig:discs}
\end{figure}
\begin{remark}
Globular sets are not the only natural extension of the data of a 1-category. The form of this data in a definition of a higher category is referred to as the \emph{shape} of the cells. Notable alternatives to globular sets include simplicial sets, opetopic sets, and cubical sets.
\end{remark}
We can now give the definition of a strict \(\infty\)-category.
\begin{definition}
A \emph{strict \(\infty\)-category} is a globular set \(G\) with the following operations:
\begin{itemize}
\item For \(m < n\), a composition \(*_m\) taking \(n\)-cells \(f\) and \(g\) with \(\delta_m^+(f) = \delta_m^-(g)\) and producing an \(n\)-cell \(f *_m g\) with:
\begin{align*}
s(f *_m g) &= \begin{cases*} s(f)&\text{if \(m = n - 1\)}\\ s(f) *_m s(g)&\text{otherwise} \end{cases*}\\
t(f *_m g) &= \begin{cases*} t(g)&\text{if \(m = n - 1\)}\\ t(f) *_m t(g)&\text{otherwise} \end{cases*}
\end{align*}
\item For any \(n\)-cell \(x\), an identity \((n+1)\)-cell \(\id(x) : x \to x\).
\end{itemize}
and satisfying the following equalities:
\begin{itemize}
\item Associativity: Given \(m < n\) and \(n\)-cells \(f\), \(g\), and \(h\) with \(\delta_m^+(f) = \delta_m^-(g)\) and \(\delta_m^+(g) = \delta_m^-(h)\):
\[ (f *_m g) *_m h = f *_m (g *_m h) \]
\item Unitality: Given \(m < n\) and an \(n\)-cell \(f\):
\begin{align*}
\id^{n-m}(\delta_m^-(f)) *_m f &= f\\
f *_m \id^{n-m}(\delta_m^+(f)) &= f
\end{align*}
\item Composition interchange: Let \(o < m < n\) and let \(\alpha\), \(\beta\), \(\gamma\), and \(\delta\) be \(n\)-cells with
\[\delta_m^+(\alpha) = \delta_m^-(\beta)\qquad \delta_m^+(\gamma) = \delta_m^-(\delta)\qquad \delta_o^+(\alpha) = \delta_o^-(\gamma)\]
Then:
\[(\alpha *_o \gamma) *_m (\beta *_o \delta) = (\alpha *_m \beta) *_o (\gamma *_m \delta)\]
\item Identity interchange: Let \(m < n\) and \(f\) and \(g\) be \(n\)-cells with \(\delta_m^+(f) = \delta_m^-(g)\). Then:
\[\id(f) *_m \id(g) = \id(f *_m g)\]
\end{itemize}
A morphism of \(\infty\)-categories is a morphism of the underlying globular sets which preserves composition and identities.
\end{definition}
There is a clear forgetful functor from the category of strict \(\infty\)-categories to the category of globular sets, which has a left adjoint given by taking the free strict \(\infty\)-category over a globular set. We end this section with an example of a non-trivial application of the axioms of an \(\infty\)-category, known as the Eckmann-Hilton argument. The argument shows that any two scalars (morphisms from the identity to the identity) commute.
\begin{proposition}[Eckmann-Hilton]
\label{prop:eh}
Let \(x\) be an \(n\)-cell in an \(\infty\)-category and let \(\alpha\) and \(\beta\) be \((n+2)\)-cells with source and target \(\id(x)\). Then \(\alpha *_{n+1} \beta = \beta *_{n+1} \alpha\).
\end{proposition}
\begin{proof}
The cells \(\alpha\) and \(\beta\) can be manoeuvred around each other as follows:
\begin{align*}
&\phantom{{}={}} \alpha *_{n+1} \beta \\
&= (\alpha *_n i) *_{n+1} (i *_n \beta)&\text{Unitality}\\
&= (\alpha *_{n+1} i) *_n (i *_{n+1} \beta)&\text{Interchange}\\
&= \alpha *_n \beta &\text{Unitality}\\
&= (i *_{n+1} \alpha) *_n (\beta *_{n+1} i)&\text{Unitality}\\
&= (i *_n \beta) *_{n+1} (\alpha *_n i)&\text{Interchange}\\
&= \beta *_{n+1} \alpha&\text{Unitality}
\end{align*}
where \(i = \id(\id(x))\).
\end{proof}
We give a more graphical representation of the proof in \cref{fig:eh}, which appeared in the introduction. In this proof, \(\alpha\) is moved round to the left of \(\beta\), though we could equally have moved it round to the right; the choice made was arbitrary.
\subsection{Pasting diagrams}
\label{sec:pasting-diagrams}
The definition of \(\infty\)-categories given in the previous section is close in spirit to the ordinary definitions of 1-categories and clearly demonstrates the different families of axioms present. However, we will see in \cref{sec:weak} that these sorts of definitions do not scale well to our eventual setting of weak higher categories. There is a special class of (finite) globular sets known as \emph{pasting diagrams}, sometimes called \emph{pasting schemes}. The elements of the free strict \(\infty\)-category on a globular set \(G\) can instead be represented by a pasting diagram equipped with a map into \(G\). To do this, it must be possible to obtain a canonical composite from each pasting diagram. Informally, we can define an \(n\)-dimensional pasting diagram to be a finite globular set which admits a unique full composite of dimension \(n\), where a full composite of a globular set \(G\) is an element of the free \(\infty\)-category over \(G\) which uses all the maximal elements. This serves as the primary intuition for the role of pasting diagrams. Pasting diagrams were used directly by \citeauthor{batanin1998monoidal}~\cite{batanin1998monoidal} to give a definition of weak \(\infty\)-categories, and will be pivotal in \cref{sec:weak} to define the variety of \(\infty\)-categories that \Catt is based on. A more in-depth discussion of pasting diagrams, representations of free strict \(\infty\)-categories using them, and their use in the definition of weak \(\infty\)-categories can be found in \citetitle{leinster2004higher}~\cite{leinster2004higher}. Before giving a more formal definition of pasting diagrams, we explore some examples and non-examples. In contrast to \citeauthor{leinster2004higher}, we consider pasting diagrams as a full subcategory of globular sets, rather than a separate category with a function sending each pasting diagram to a globular set.
The disc globular sets introduced in \cref{ex:disc} are all examples of pasting diagrams. The unique ``composite'' of these globular sets is just given by their maximal element, noting that we allow a single cell in our informal definition of composite. The uniqueness of this is trivial, as the only possible operations we could apply are compositions with units, which give the same cell under the laws of an \(\infty\)-category. The diagrams used to graphically represent our composition operations (of which we recall three below) are also pasting diagrams.
\[ \begin{tikzcd} x & y & z \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \end{tikzcd} \qquad \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \qquad \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \]
The composite of these diagrams is just the composite of the two maximal cells with the appropriate codimension. We can also consider composites which are not binary composites of two cells of equal dimension. For example, the following globular set is a pasting diagram:
\[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h", from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \end{tikzcd} \]
with a composite given by \(\alpha *_0 \id(h)\). This operation is fairly common (in fact we have already seen it in \cref{prop:eh}) and is known as \emph{whiskering}. In this case we would say that the composite is given by the right whiskering of \(\alpha\) with \(h\). The 1-dimensional pasting diagrams are all given by chains of 1-cells of the form:
\[x_0 \overset{f_0}\to x_1 \overset{f_1}\to x_2 \overset{f_2}\to \cdots \overset{f_n}\to x_{n+1}\]
There are multiple ways to form a composite over these diagrams by repeated binary composition; however, these all have the same result due to associativity.
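For instance, with \(n = 2\) the two possible bracketings of the chain \(x_0 \overset{f_0}\to x_1 \overset{f_1}\to x_2 \overset{f_2}\to x_3\) agree by a single application of the associativity law:
\[ (f_0 *_0 f_1) *_0 f_2 = f_0 *_0 (f_1 *_0 f_2)\]
and a straightforward induction extends this to chains of any length.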
Lastly we look at the following diagram, where all the \(0\)-cells and \(1\)-cells are assumed to be distinct: \[ \begin{tikzcd}[column sep = large] \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-2] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=30pt}, from=1-2, to=1-3] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["\alpha", shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\beta", shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\gamma", shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["\delta", shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \end{tikzcd} \] We get a composite given by \((\alpha *_1 \beta) *_0 (\gamma *_1 \delta)\). The uniqueness of this composite is due to the interchange law. Non-examples of pasting diagrams roughly fall into two groups: those that do not admit a composite, and those that admit many distinct composites. The following three globular sets fail to admit a composite (the last is drawn in a box to emphasise that \(z\) is part of the same globular set as \(x\), \(y\), \(f\), \(g\), and \(\alpha\)): \[ \begin{tikzcd}[column sep=large, row sep = small] & y \\ x \\ & z \arrow["f", pos=0.6, from=2-1, to=1-2] \arrow["g"', pos=0.6, from=2-1, to=3-2] \end{tikzcd} \qquad \begin{tikzcd}[column sep=large] x & y \arrow["f", curve={height=-12pt}, from=1-1, to=1-2] \arrow["g"', curve={height=12pt}, from=1-1, to=1-2] \end{tikzcd} \qquad \fbox{% \begin{tikzcd}[column sep=scriptsize, ampersand replacement = \&] x \&\& y \& z \arrow[""{name=0, anchor=center, inner sep=0}, "f", curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"', curve={height=18pt}, from=1-1, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \end{tikzcd}} \] The globular set with a single \(0\)-cell \(x\), and a single \(1\)-cell \(f : x \to x\) has too many composites: \(f\) and \(f *_0 f\) need not be equal in an \(\infty\)-category. To describe the free \(\infty\)-category in terms of pasting diagrams we need to be able to extract a composite from a pasting diagram, and construct a pasting diagram from an arbitrary composite. Each pasting diagram having a unique composite solves the former issue. To be able to construct a pasting diagram from a composite, we wish to equip our set of pasting diagrams itself with the structure of an \(\infty\)-category. We therefore need our pasting diagrams to have a notion of boundary and a notion of composition. A natural candidate for composition is given by colimits, as \(\mathbf{Glob}\) has all colimits due to being a presheaf category, and so it is sufficient for our class of pasting diagrams to be closed under these specific colimits. In fact, it is sufficient to contain a class of colimits known as \emph{globular sums}. \begin{definition} A globular category is a category \(\mathcal{C}\), equipped with a disc functor \(D : \mathbf{G} \to \mathcal{C}\), specifying certain objects as discs in the category. 
A \emph{globular sum} is a colimit of a diagram of the form:
\[ \begin{tikzcd}[column sep = tiny, row sep = tiny] {D(i_0)} && {D(i_1)} && {D(i_2)} && {D(i_n)} && {D(i_{n+1})} \\ &&&&& \cdots \\ & {D(j_0)} && {D(j_1)} &&&& {D(j_n)} \arrow["{f_0}", from=3-2, to=1-1] \arrow["{g_0}"', from=3-2, to=1-3] \arrow["{f_n}", from=3-8, to=1-7] \arrow["{g_n}"', from=3-8, to=1-9] \arrow["{f_1}", from=3-4, to=1-3] \arrow["{g_1}"', from=3-4, to=1-5] \end{tikzcd} \]
where each morphism \(f_i\) is a composite of source maps (\(D(\mathbf{s}_n)\) for some \(n\)) and each morphism \(g_i\) is a composite of target maps (\(D(\mathbf{t}_n)\) for some \(n\)). Since the maps \(f_i\) and \(g_i\) are then uniquely determined, we may write such a globular sum as:
\[ D(i_0) \amalg_{D(j_0)} D(i_1) \amalg_{D(j_1)} D(i_2) \cdots D(i_n) \amalg_{D(j_n)} D(i_{n+1})\]
A \emph{globular extension} is a globular category where all globular sums exist, and a morphism of globular extensions is a functor of the underlying categories commuting with the disc functors and preserving globular sums.
\end{definition}
We can now give our first definition of a pasting diagram.
\begin{definition}
The category \(\mathbf{Glob}\) is a globular category with the functor \(\mathbf{G} \to \mathbf{Glob}\) given by the Yoneda embedding. The category of \emph{pasting diagrams}, \(\mathbf{Pd}\), is the full subcategory containing the globular sets which are globular sums. The boundary of an \((n+1)\)-dimensional pasting diagram is given by replacing each instance of \(D^{n+1}\) by \(D^n\) in its globular sum representation. There are two canonical maps including the boundary into the original pasting diagram, whose images give the source and target of the pasting diagram.
\end{definition}
The category of pasting diagrams clearly forms a globular category, with the functor \(\mathbf{G} \to \mathbf{Pd}\) sending \(n\) to \(D^n\). It is a globular extension and is in fact the universal globular extension; it is initial in the category of globular extensions~\cite{Ara}. We finish this section with one larger example.
\begin{example}
The following depicts a \(2\)-dimensional pasting diagram.
\[ \begin{tikzcd} x & y & z & w \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h"', from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "k", curve={height=-24pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "i"', curve={height=24pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "j"{description}, from=1-3, to=1-4] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \end{tikzcd} \] This has the following globular sum decomposition: % https://q.uiver.app/#q=WzAsMTMsWzAsMCwieCJdLFsyLDAsInkiXSxbOCwwLCJ6Il0sWzEwLDAsInciXSxbMywxLCJ5Il0sWzQsMCwieSJdLFs2LDAsInoiXSxbNywxLCJ6Il0sWzksMF0sWzEwLDEsInoiXSxbMTIsMSwidyJdLFsxMiwwLCJ6Il0sWzE0LDAsInciXSxbMCwxLCJnIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiZiIsMix7ImN1cnZlIjozfV0sWzIsMywiaSIsMix7ImN1cnZlIjo0fV0sWzIsMywiaiIsMV0sWzUsNiwiaCIsMl0sWzQsMSwiIiwyLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzQsNSwiIiwxLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzksMTAsImoiLDFdLFsxMSwxMiwiaiIsMV0sWzExLDEyLCJrIiwxLHsiY3VydmUiOi00fV0sWzcsNiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzcsMiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzE0LDEzLCJcXGFscGhhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxNSwxNiwiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyMSwyMiwiXFxnYW1tYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMjAsMywiIiwxLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwfSwibGV2ZWwiOjEsInN0eWxlIjp7ImJvZHkiOnsibmFtZSI6ImRhc2hlZCJ9fX1dLFsyMCwxMSwiIiwxLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwfSwibGV2ZWwiOjEsInN0eWxlIjp7ImJvZHkiOnsibmFtZSI6ImRhc2hlZCJ9fX1dXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep=small, row sep = small] x && y && y && z && z & {} & w && z && w \\ &&& y &&&& z &&& z && w \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "i"', curve={height=24pt}, from=1-9, to=1-11] \arrow[""{name=3, anchor=center, inner sep=0}, "j"{description}, from=1-9, to=1-11] \arrow["h"', from=1-5, to=1-7] \arrow[dashed, from=2-4, to=1-3] \arrow[dashed, from=2-4, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "j"{description}, from=2-11, to=2-13] \arrow[""{name=5, anchor=center, inner sep=0}, "j"{description}, from=1-13, to=1-15] \arrow[""{name=6, anchor=center, inner sep=0}, "k"{description}, curve={height=-24pt}, from=1-13, to=1-15] \arrow[dashed, from=2-8, to=1-7] \arrow[dashed, from=2-8, to=1-9] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=3] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=6] \arrow[shorten <=6pt, dashed, from=4, to=1-11] \arrow[shorten <=6pt, dashed, from=4, to=1-13] \end{tikzcd} \] The source and target of the diagram are given by the isomorphic pasting diagrams: \[ \begin{tikzcd} x & y & z & w \arrow["f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h", from=1-2, to=1-3] \arrow["i"', curve={height=24pt}, from=1-3, to=1-4] \end{tikzcd} \qquad\text{and}\qquad \begin{tikzcd} x & y & z & w \arrow["g", curve={height=-18pt}, from=1-1, to=1-2] 
\arrow["h", from=1-2, to=1-3] \arrow["k", curve={height=-24pt}, from=1-3, to=1-4] \end{tikzcd} \] \end{example} \subsection{Weak higher categories} \label{sec:weak} The \(\infty\)-categories we have defined so far have all been strict \(\infty\)-categories, meaning that the laws are required to hold up to equality. In ordinary \(1\)-category theory, isomorphism is usually preferred over equality for comparing objects. Similarly, when we have access to higher-dimensional arrows, it follows that we can also consider isomorphisms between morphisms, and therefore consider laws such as associativity up to isomorphism instead of equality. Topological spaces provide one of the primary examples for where it is useful to consider weak laws. Given a topological space \(X\), we can define a globular set of paths and homotopies. Let the \(0\)-cells be given by points \(x\) of the topological space, let morphisms from \(x\) to \(y\) be given as paths \(I \to X\) (where \(I\) is the topological interval \([0,1]\)) which send \(0\) to \(x\) and \(1\) to \(y\), and let higher cells be given by homotopies. The natural composition of two paths \(p\) and \(q\) is the following path: \[ (p * q)(i) = \begin{cases*} p(2i)&when \(i < 0.5\)\\ q(2i-1)&when \(i \geq 0.5\) \end{cases*} \] which effectively lines up the paths end to end. Given \(3\) paths \(p\), \(q\), and \(r\), the compositions \((p * q) * r\) and \(p * (q * r)\) are not identical but are equal up to homotopy, meaning the two compositions are isomorphic. Therefore, in this case the composition \(p * q\) does not form a strict \(\infty\)-category structure, but rather a weak structure. \paragraph{Weak 2-categories} We start our exploration of weak higher categories by considering the lower dimension case of bicategories (weak \(2\)-categories). Here, interchange must still be given by a strict equality, as there are no non-trivial \(3\)-cells in a \(2\)-category. However, associativity and unitality can be given by isomorphisms known as associators and unitors: \begin{align*} \alpha_{f,g,h} &: (f *_0 g) *_0 h \to f *_0 (g *_0 h)\\ \lambda_f &: \id(x) *_0 f \to f\\ \rho_f &: f *_0 \id(y) \to f \end{align*} for \(f : x \to y\), \(g : y \to z\), and \(h : z \to w\). \begin{example} \label{ex:spans} All strict 2-categories are also bicategories. The bicategory of spans is an example of a bicategory which is not strict. Starting with a category \(\mathcal{C}\) equipped with chosen pullbacks, we define the bicategory of spans over \(\mathcal{C}\) to be: \begin{itemize} \item Objects are the same as \(\mathcal{C}\) \item Morphisms \(A\) to \(B\) are spans \(A \leftarrow C \to B\). \item A 2-morphism from \(A \leftarrow C \to B\) to \(A \leftarrow C' \to B\) is a morphism \(C \to C'\) such that the following diagram commutes: \[ \begin{tikzcd}[row sep = small] & C \\ A && B \\ & {C'} \arrow[from=1-2, to=3-2] \arrow[from=3-2, to=2-1] \arrow[from=1-2, to=2-1] \arrow[from=1-2, to=2-3] \arrow[from=3-2, to=2-3] \end{tikzcd} \] \item Compositions and identities of 2-morphisms is given by composition and identities of the underlying morphisms in \(\mathcal{C}\). \item The identity on an object \(A\) is the span \(A \leftarrow A \to A\). 
\item Given spans \(A \leftarrow D \to B\) and \(B \leftarrow E \to C\), their composite is given by the pullback: \[ \begin{tikzcd}[row sep=small] && {D \times_B E} \\ & D && E \\ A && B && C \arrow[from=2-2, to=3-1] \arrow[from=2-2, to=3-3] \arrow[from=2-4, to=3-3] \arrow[from=2-4, to=3-5] \arrow[from=1-3, to=2-2] \arrow[from=1-3, to=2-4] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=-45}, draw=none, from=1-3, to=3-3] \end{tikzcd} \] \item Associators and unitors are given by the universal property of the pullback. \end{itemize} \end{example} In general, there could be many possible isomorphisms between \((f * g) * h\) and \(f * (g * h)\), and we require that the chosen morphisms satisfy certain compatibility properties. The first is that each of the associator, left unitor, and right unitor should be a natural isomorphism. The second is a property known as \emph{coherence}, saying that any two parallel morphisms built purely from naturality moves, associators, and unitors must be equal. For bicategories it is sufficient to give two coherence laws: the triangle equality and pentagon equality. The triangle equality identifies two ways of cancelling the identity in the composite \(f * \id * g\), giving a compatibility between the left and right unitors. It is given by the following commutative diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCIoZiBcXHN0YXIgXFxpZCkgXFxzdGFyIGciXSxbMiwwLCJmIFxcc3RhciAoXFxpZCBcXHN0YXIgZykiXSxbMSwxLCJmIFxcc3RhciBnIl0sWzAsMSwiXFxhbHBoYV97ZixcXGlkLGd9Il0sWzAsMiwiXFxyaG9fZiBcXHN0YXJfMCBcXGlkKGcpIiwyXSxbMSwyLCJcXGlkKGYpXFxzdGFyXzBcXGxhbWJkYV9nIl1d % tex-fmt: skip \[ \begin{tikzcd} {(f * \id) * g} && {f * (\id * g)} \\ & {f * g} \arrow["{\alpha_{f,\id,g}}", from=1-1, to=1-3] \arrow["{\rho_f *_0 \id(g)}"', from=1-1, to=2-2] \arrow["{\id(f)*_0\lambda_g}", from=1-3, to=2-2] \end{tikzcd} \] The pentagon equation identifies two ways of associating \(((f * g) * h) * k\) to \(f * (g * (h * k))\). It is given by the diagram below: % https://q.uiver.app/#q=WzAsNSxbMSwzLCIoZiBcXHN0YXIgKGcgXFxzdGFyIGgpKSBcXHN0YXIgayJdLFswLDEsIigoZiBcXHN0YXIgZykgXFxzdGFyIGgpIFxcc3RhciBrIl0sWzIsMCwiKGYgXFxzdGFyIGcpIFxcc3RhciAoaCBcXHN0YXIgaykiXSxbNCwxLCJmIFxcc3RhciAoZyBcXHN0YXIgKGggXFxzdGFyIGspKSJdLFszLDMsImYgXFxzdGFyICgoZyBcXHN0YXIgaCkgXFxzdGFyIGspIl0sWzEsMiwiXFxhbHBoYV97ZiBcXHN0YXIgZyxoLGt9Il0sWzIsMywiXFxhbHBoYV97ZixnLGhcXHN0YXIga30iXSxbMSwwLCJcXGFscGhhX3tmLGcsaH0gXFxzdGFyXzAgXFxpZChrKSIsMl0sWzAsNCwiXFxhbHBoYV97ZixnXFxzdGFyIGgsa30iLDJdLFs0LDMsIlxcaWQoZilcXHN0YXJfMCBcXGFscGhhX3tnLGgsa30iLDJdXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep = -1.5em] && {(f * g) * (h * k)} \\ {((f * g) * h) * k} &&&& {f * (g * (h * k))} \\ \\ & {(f * (g * h)) * k} && {f * ((g * h) * k)} \arrow["{\alpha_{f * g,h,k}}", from=2-1, to=1-3] \arrow["{\alpha_{f,g,h* k}}", from=1-3, to=2-5] \arrow["{\alpha_{f,g,h} *_0 \id(k)}"', from=2-1, to=4-2] \arrow["{\alpha_{f,g* h,k}}"', from=4-2, to=4-4] \arrow["{\id(f)*_0 \alpha_{g,h,k}}"', from=4-4, to=2-5] \end{tikzcd} \] Surprisingly, these two equations are enough to give full coherence. For the example of spans from \cref{ex:spans}, these two equations follow from the uniqueness of the universal morphism. \paragraph{Weak \(\infty\)-categories} To move from weak \(2\)-categories to weak \(3\)-categories, new coherence cells for interchangers are added to replace the interchanger equalities, and new equalities must be added to specify the interaction between the interchangers and other coherence morphisms. 
Furthermore, the triangle and pentagon equations from \(2\)-categories will become isomorphisms in a weak \(3\)-category, causing more coherence equations to be added. As we move up in dimension, the number of coherence morphisms and equalities required increases exponentially. A bicategory has 11 operations (1-identity, 2-identity, 1-composition, vertical composition, horizontal composition, left unitor (and inverse), right unitor (and inverse), and associator (and inverse)), whereas a fully weak tricategory already has around 51 operations~\cite{gurski2006algebraic}. These numbers are obtained by unwrapping various subdefinitions and should be treated as approximate. Comparisons between the size of partially weak definitions can be found in~\cite{bar2017data}. Because of this complexity, we look for more uniform ways to represent the operations and axioms of an \(\infty\)-category. In this thesis, we will work with the type theory \Catt, which is based on a definition of \(\infty\)-categories due to \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck}, which is itself based on a definition of \(\infty\)-groupoids by \citeauthor{PursuingStacks}~\cite{PursuingStacks}. We will sketch the ideas behind these definitions here, and give a definition of \Catt in \cref{sec:type-theory-catt}. The key insight behind Grothendieck's definition is that pasting diagrams should be weakly contractible, instead of containing a unique composite. Whereas in a strict \(\infty\)-category each pasting diagram effectively has a single composite, in a weak \(\infty\)-category there can be many operations over a pasting diagram. These operations are assembled into a globular extension called a \emph{coherator}. A weak \(\infty\)-groupoid is then a presheaf on this coherator for which the opposite functor preserves globular sums (alternatively, the dual notion of globular product could be defined, and such a presheaf could be asked to preserve globular products). The objects of a coherator are given by pasting diagrams, with \(D^n\) being sent to the \(n\)-cells of the category and other pasting diagrams being sent to composable sets of cells (as determined by the preservation of globular sums). Operations over a pasting diagram \(P\) in the coherator are given by morphisms \(D^n \to P\). When we take a presheaf over this, we obtain a function that takes a \(P\)-shaped collection of cells to a single \(n\)-cell. Operations can be precomposed with source and target maps \(D^{n-1} \to D^n\) to get the source and target of an operation. To build the coherator, we start by taking the category of pasting diagrams. The ``operations'' of this category consist solely of the inclusions of discs into pasting diagrams, which correspond to picking a single element from the pasting diagram. Other operations are then built using the following guiding principle. \begin{principle-groupoid} Let \(f\) and \(g\) be two parallel operations over a pasting diagram \(P\). Then there is an operation \(h\) over \(P\) with source \(f\) and target \(g\). \end{principle-groupoid} We define a pair of operations \(f,g : D^n \to X\) to be \emph{parallel} if \(n = 0\), or if \(n > 0\) and both \(f \circ \mathbf{s}_{n-1} = g \circ \mathbf{s}_{n-1}\) and \(f \circ \mathbf{t}_{n-1} = g \circ \mathbf{t}_{n-1}\). A \emph{lift} for such a pair of parallel operations is an operation \(h : D^{n+1} \to X\) such that \(h \circ \mathbf{s}_{n} = f\) and \(h \circ \mathbf{t}_n = g\).
Closing under this principle then amounts to inductively adding lifts for all parallel operations, while ensuring that the category remains a globular extension. We start with some basic operations. Consider the pasting diagram \(A = D^1 \amalg D^1\) given by: \[ \begin{tikzcd} x & y & z \arrow["a", from=1-1, to=1-2] \arrow["b", from=1-2, to=1-3] \end{tikzcd} \] Our rule now tells us that since \(x\) and \(z\) are elements of \(A\), there should be an operation returning a cell with source \(x\) and target \(z\), namely the composition of \(a\) and \(b\). In the language of coherators, there are operations \(f, g : D^0 \to A\), where \(f\) includes into the source of the first disc of \(A\), and \(g\) includes into the target of the second disc of \(A\). These are trivially parallel, and so there exists a lift \(h : D^1 \to A\), giving 1-composition. Similarly, if we take the pasting diagram with a single \(0\)-cell \(x\) and no other cells, then applying our rule with \(f,g\) both being the operation returning the element \(x\) produces an operation with source and target \(x\), the identity on \(x\). We can generate more complicated operations with this principle. Consider the pasting diagram \(B\): \[ \begin{tikzcd} x & y & z & w \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \arrow["h", from=1-3, to=1-4] \end{tikzcd} \] We already know the coherator contains 1-composition, and using composition and the universal property of globular sums, we can generate operations realising the compound composites \((f * g) * h\) and \(f * (g * h)\). The principle then gives us an operation returning the \(2\)-cell \((f * g) * h \to f * (g * h)\), which is of course the associator. This one principle allows us to generate all the structure we need, as well as structure that is arguably unnecessary, such as ternary compositions that did not appear in the definition of a bicategory. Unfortunately, as we have already mentioned, Grothendieck's definition is for \(\infty\)-groupoids, where everything is invertible, instead of the fully general \(\infty\)-categories we want to study in this thesis. This can be seen by taking the pasting diagram \(C\): \[ \begin{tikzcd} x & y \arrow["f", from=1-1, to=1-2] \end{tikzcd} \] and applying the rule with \(f\) returning \(y\) and \(g\) returning \(x\), giving an operation that returns a \(1\)-cell \(f^{-1} : y \to x\), the inverse of \(f\). The rule as we have stated it is too powerful. Maltsiniotis' definition provides a solution to this problem by giving a more refined version of the principle. Whereas Grothendieck's definition treats all operations as coherences, Maltsiniotis' definition splits operations into two classes: compositions and equivalences. Both classes are obtained by restricting the classes of parallel operations that admit lifts. We begin by defining what it means for an operation to be algebraic: \begin{definition} Let \(\mathcal{C}\) be a globular extension for which the canonical functor \(P : \mathbf{Pd} \to \mathcal{C}\) is faithful and the identity on objects. Then an operation \(f : D^n \to X\) in \(\mathcal{C}\) is \emph{algebraic} if whenever \(f = P(g) \circ f'\), we have \(g = \id\). \end{definition} Intuitively, an operation is algebraic when it does not factor through any proper inclusion. Algebraicity is equivalent to requiring that an operation makes use of all the locally maximal elements of the pasting diagram, those elements which do not appear in the source or target of a higher-dimensional element of the diagram.
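To illustrate this condition, here is a sketch using the diagram \(B\) above (our own example, not taken from the cited definitions). The operation \(D^1 \to B\) that picks out the single cell \(g\) is precisely \(P(\iota)\) for the proper inclusion \(\iota : D^1 \to B\) of the middle disc, so it factors as \(P(\iota) \circ \id_{D^1}\) with \(\iota \neq \id\), and is therefore not algebraic. By contrast, the ternary composite operation \(D^1 \to B\) generated above makes use of all three locally maximal cells \(f\), \(g\), and \(h\), so it cannot factor through any proper inclusion and is algebraic.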
Equivalences contain the various invertible laws of our \(\infty\)-categories such as associators, unitors, identities, and interchangers. For two operations \(f,g : D^n \to X\) to admit a lift under the rule for equivalences, they must both be algebraic. This gives the following rule: \begin{principle-category}[Equivalences] Let \(f\) and \(g\) be two parallel operations over a pasting diagram \(P\). If both \(f\) and \(g\) use all locally maximal variables of \(P\), then there is an operation over \(P\) with source \(f\) and target \(g\). \end{principle-category} Clearly any operations generated by this principle are invertible, as the extra condition imposed is symmetric. For compositions, we introduce the following asymmetric principle, recalling that pasting diagrams are equipped with source and target inclusions, and letting \(\partial^-(P)\) and \(\partial^+(P)\) be the images of these inclusions: \begin{principle-category}[Composites] Let \(f\) and \(g\) be parallel operations over a (non-singleton) pasting diagram \(P\) such that \(f\) uses all locally maximal cells of \(\partial^-(P)\) and no cells outside of \(\partial^-(P)\), and \(g\) uses all locally maximal cells of \(\partial^+(P)\) and no cells outside of \(\partial^+(P)\). Then there is an operation over \(P\) with source \(f\) and target \(g\). \end{principle-category} The condition required to form a composite can be expressed by the operation \(f : D^n \to P\) factoring into an algebraic map composed with the source inclusion into \(P\), and similarly for \(g\) with the target inclusion. It can be easily checked that the inverse operation given above does not satisfy the criteria for being an equivalence or composite. As with Grothendieck's definition, a coherator can be made by closing the globular extension of pasting diagrams under these restricted principles, and then weak \(\infty\)-categories can be defined to be presheaves on this coherator such that the opposite functor preserves globular sums. \begin{remark} We have claimed that a coherator can be formed by closing under adding lifts to parallel operations, though this is not precise and there are actually multiple ways of performing this closure that lead to different coherators. For example, one could add the lift for 1-composition twice, to get two distinct 1-composition operations, as long as one also added a lift between these now parallel operations. Grothendieck gives a general schema for producing coherators, and conjectures that any two coherators give rise to equivalent models of \(\infty\)-categories. \end{remark} We now turn our attention back to the proof of Eckmann-Hilton from \cref{fig:eh}. Given a \(0\)-cell \(x\) and two scalars \(\alpha, \beta : \id(x) \to \id(x)\), we expect the Eckmann-Hilton argument to give us an isomorphism in a weak higher category, rather than the equality obtained in the strict case. In fact, equalities 2, 3, and 4 in the proof can be immediately replaced by isomorphisms (interchangers and unitors). The first and last equalities, however, are more problematic: although at first we may believe that there should exist some horizontal unitor isomorphism, upon closer inspection the two compositions do not even have the same boundary and so are not parallel. The composition \(\alpha *_1 \beta\) has source and target \(\id(x)\), whereas the source of \(\alpha *_0 \id(\id(x))\) is \(\id(x) *_0 \id(x)\).
To recover the proof in a weak setting, the intermediate composites must be composed with unitors so that they all have source and target \(\id(x)\). To give equivalences for the first and last step, these unitors must be moved around with naturality moves, and at a critical point the isomorphism \(\lambda_{\id(x)} \simeq \rho_{\id(x)}\) is required. Multiple full proofs of Eckmann-Hilton will be given in \cref{sec:examples}. The proof of Eckmann-Hilton is vastly simpler in the strict case, mainly due to the presence of the equation \(\id(x) *_0 \id(x) = \id(x)\). \subsection{Computads} \label{sec:computads} A free group is generated by a set, and a free category is generated by a directed graph, so it is natural to ask what the generating data for a free \(\infty\)-category is. We have already seen that a free \(\infty\)-category can be generated by a globular set, but free \(\infty\)-categories can also be generated by data that does not form a globular set. Consider the minimum data needed to state the Eckmann-Hilton principle (see \cref{fig:eh} or \cref{prop:eh}). We require a single \(0\)-cell \(x\), and two \(2\)-cells \(\alpha, \beta : \id(x) \to \id(x)\). This data does not form a globular set as, for example, the source of the \(2\)-cell \(\alpha\) is not in the generating data, but is rather an operation applied to the data. We could try to remedy this by adding a new \(1\)-cell \(f\) to the data to represent \(\id(x)\), but then the connection between \(\id(x)\) and \(f\) would be lost, and \(f\) and \(\id(x)\) would be distinct in any free \(\infty\)-category generated from this data. The correct generating data for an \(\infty\)-category is a \emph{computad}. A version for 2-categories was introduced by \citeauthor{street1976limits}~\cite{street1976limits}, which allows a generating \(2\)-cell to have a composite or identity as its source or target. These were extended to strict \(\infty\)-categories by \citeauthor{burroni1993higher}~\cite{burroni1993higher} and weak \(\infty\)-categories by \citeauthor{batanin1998computads}~\cite{batanin1998computads}, which allow the source and target of an \(n\)-cell to be any \((n-1)\)-cell of the free \(\infty\)-category generated by the lower-dimensional data. A modern approach to computads for weak \(\infty\)-categories is given by \citeauthor{dean2022computads}~\cite{dean2022computads}, which avoids much of the complexity of globular operads, relying only on (mutual) structural induction. This definition of a computad is much closer in style to (and is indeed inspired by) the type theory \Catt, which we review in \cref{sec:type-theory-catt}. \section{The type theory \Catt} \label{sec:type-theory-catt} In this section we give an overview of the dependent type theory \Catt~\cite{finster2017type}. \Catt serves as a definition of weak \(\infty\)-categories, by defining a weak \(\infty\)-category to be a model of the type theory (e.g.\ using categories with families~\cite{cwf}). In \cref{cha:gener-pres-catt}, we give a more general and comprehensive presentation of \Catt, allowing the addition of equality relations to the type theory, pre-empting \cref{cha:cattstrict}. In contrast, this section presents a version of \Catt closer to the one found in the literature, and compares its various constructions to the ideas introduced in \cref{sec:weak}. \subsection{Syntax of \Catt} \label{sec:syntax-catt} \Catt has 4 classes of syntax: contexts, terms, types, and substitutions.
\begin{itemize} \item Contexts contain a list of variables with an associated type. We can consider contexts as finite computads, the generating data for a weak \(\infty\)-category (see \cref{sec:computads}). It is alternatively valid to consider contexts in \Catt as finitely generated \(\infty\)-categories. The set of contexts contains all finite globular sets (and hence all pasting diagrams). \item Terms over a context \(\Gamma\) correspond to the operations from \cref{sec:weak}. Terms can either be a variable, which corresponds to the operations which pick a single cell out of a globular set, or those generated by the unique constructor \(\mathsf{Coh}\), which correspond to the operations generated by lifting. A term over a context \(\Gamma\) can also be seen as an element of the free \(\infty\)-category generated from \(\Gamma\). \item Types over a context \(\Gamma\) consist of a collection of terms over the same context, and contain the boundary information for a term. Types either take the form of the constructor \(\star\), the type of \(0\)-cells (which have no boundary data), or an arrow type \(\arr s A t\), where \(s\) and \(t\) are terms giving the source and target of the boundary and the type \(A\) gives lower-dimensional boundary information. This can be viewed as a directed version of the equality type \(s =_A t\) from Martin-L\"of type theory. \item Substitutions from a context \(\Gamma\) to a context \(\Delta\) are a mapping from variables of \(\Gamma\) to terms of \(\Delta\). These play the role of functors between the \(\infty\)-categories generated by \(\Gamma\) and \(\Delta\) and are also syntactically crucial for forming compound composites in the theory. \end{itemize} \begin{figure}[ht] \centering \begin{tabular}{Sc Sc} { \begin{prooftree} \hypo{\phantom{\Term}} \infer1{\emptyset : \Ctx} \end{prooftree} } & { \begin{prooftree} \hypo{\Gamma : \Ctx} \hypo{A : \Type_\Gamma} \infer2{\Gamma, (x : A) : \Ctx} \end{prooftree}} \\ { \begin{prooftree} \hypo{\phantom{\Term}} \infer1{\langle \rangle : \emptyset \to \Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{\sigma : \Delta \to \Gamma} \hypo{t : \Term_\Gamma} \hypo{A : \Type_\Delta} \infer3{\langle \sigma , t \rangle : \Delta, (x : A) \to \Gamma} \end{prooftree} } \\ { \begin{prooftree} \hypo{\phantom{\Type}} \infer1{\star : \Type_\Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{A : \Type_\Gamma} \hypo{s : \Term_\Gamma} \hypo{t : \Term_\Gamma} \infer3{\arr s A t : \Type_\Gamma} \end{prooftree} } \\ { \begin{prooftree} \hypo{x \in \Var(\Gamma)\vphantom{\Type}} \infer1{x : \Term_\Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{\Delta : \Ctx} \hypo{A : \Type_\Delta} \hypo{\sigma : \Delta \to \Gamma} \infer3{\Coh \Delta A \sigma : \Term_\Gamma} \end{prooftree} } \end{tabular} \vspace{-5pt} \caption{Syntax constructions in \Catt.} \label{fig:syntax} \end{figure} The rules for constructing each piece of syntax are given in \cref{fig:syntax}. To simplify the notation, we may avoid writing substitutions in a fully nested fashion, writing \(\langle \sigma , s , t \rangle\) instead of \(\langle \langle \sigma, s \rangle, t \rangle\), or \(\langle s \rangle\) instead of \(\langle \langle \rangle, s \rangle\). We may also omit the subscript in the arrow type. In contrast to the original paper on \Catt, we fibre terms, types, and substitutions over contexts, allowing us to avoid any problems with substitution only extending to a partial operation on terms.
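As a first example of this syntax (our own unfolding, with variable names chosen to match the disc contexts used later in this section), the disc globular set \(D^2\) from \cref{ex:disc} is represented by the context: \[ (d_0^- : \star),\ (d_0^+ : \star),\ (d_1^- : \arr {d_0^-} \star {d_0^+}),\ (d_1^+ : \arr {d_0^-} \star {d_0^+}),\ (d_2 : \arr {d_1^-} {\arr {d_0^-} \star {d_0^+}} {d_1^+}) \] Note how the arrow types nest: the type of \(d_2\) records its source \(d_1^-\) and target \(d_1^+\), together with their common lower-dimensional boundary.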
We write \(\Ctx\) for the set of contexts, \(\Term_\Gamma\) for the set of terms in a context \(\Gamma\), \(\Type_\Gamma\) for the set of types in a context \(\Gamma\), and write \(\sigma : \Delta \to \Gamma\) when \(\sigma\) is a substitution taking variables of \(\Delta\) to terms of \(\Gamma\). In the literature, substitutions are often written as going in the opposite direction. We emphasise here that the direction of our substitution morphisms agrees with the direction of the function from variables to terms, the direction of the induced functor between the \(\infty\)-categories freely generated from the domain and codomain contexts, and the direction of arrows in a Grothendieck coherator. We write \(\equiv\) for \emph{syntactic equality}, up to renaming of variables and \(\alpha\)-equivalence. The various pieces of syntax will be considered as equal up to this relation, which can be achieved by using a de Bruijn index representation of the syntax, as we present in \cref{cha:gener-pres-catt} for the formalisation. However, we continue to use named variables in the prose of the thesis to aid readability, assuming that all variables in a context are always distinct. We contrast this with the equality symbol, \(=\), which will represent the equality derived from extra equality rules we have placed on \Catt in \cref{sec:catt-with-equality}, and will be referred to as \emph{definitional equality}. The action of a substitution \(\sigma : \Delta \to \Gamma\) can be extended from variables to all terms \(t \in \Term_\Delta\), types \(A \in \Type_\Delta\), and substitutions \(\tau : \Theta \to \Delta\) by mutual recursion: \begin{align*} x \sub \sigma &= t&\text{if }(x \mapsto t) \in \sigma\\ \Coh \Theta A \tau \sub \sigma &= \Coh \Theta A {\tau \bullet \sigma}\\ \star \sub \sigma &= \star\\ \arr s A t \sub \sigma &= \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}\\ \langle \rangle \bullet \sigma &= \langle \rangle\\ \langle \tau , t \rangle \bullet \sigma &= \langle \tau \bullet \sigma , t \sub \sigma \rangle \end{align*} For every context \(\Gamma\), there is an identity substitution \(\id_\Gamma\), which sends every variable to itself; together with the composition of substitutions above, this gives a category of contexts and substitutions. The coherence constructor \(\Coh \Delta A \sigma\) allows us to construct lifts between parallel operations over pasting diagrams. The context \(\Delta\) plays the role of the pasting diagram. The type \(A\) will always be of the form \(\arr s B t\), and the terms \(s\) and \(t\) play the role of the parallel operations (with the type \(\arr s B t\) being well-formed ensuring that \(s\) and \(t\) are parallel). The substitution \(\sigma : \Delta \to \Gamma\) holds the data of a set of arguments to the coherence, allowing compound composites/operations to be formed and taking the role of composition of morphisms in the coherator. We next define the free variables of each piece of syntax. These will be used to encode the condition of an operation being algebraic from the theory of non-invertible coherators. Let \(\Var(\Gamma)\) denote the variables of \(\Gamma\). For a term \(t \in \Term_\Gamma\), a type \(A \in \Type_\Gamma\), and a substitution \(\sigma : \Delta \to \Gamma\) we define their free variables \(\FV(t), \FV(A), \FV(\sigma) \subseteq \Var(\Gamma)\) by mutual recursion.
\begin{align*} \FV(x) &= \{x\} &\text{if \(x\) is a variable}\\ \FV(\Coh \Delta A \sigma) &= \FV(\sigma)\\ \FV(\star) &= \{\}\\ \FV(\arr s A t) &= \FV(s) \cup \FV(A) \cup \FV(t)\\ \FV(\langle \rangle) &= \{\}\\ \FV(\langle \sigma , t \rangle) &= \FV(\sigma) \cup \FV(t) \end{align*} The free variables of a term are often the wrong notion to use for testing algebraicity. For example, in the context \(D^1\), the term \(d_1\) has free variables \(\{d_1\}\), whereas the unary composite of \(d_1\), \(\Coh {D^1} {\arr {d_0^-} \star {d_0^+}} {\id_{D^1}}\), has free variables \(\{d_0^-,d_0^+,d_1\}\). To remedy this, the original paper considers \(\FV(t) \cup \FV(A)\), for a term \(t\) of type \(A\). In this thesis we instead define the support of each piece of syntax as a purely syntactic construction. \begin{definition} Fix a context \(\Gamma\). The subset \(V \subseteq \Var(\Gamma)\) is \emph{downwards closed} if for all \((x : A) \in \Gamma\) we have: \[x \in V \implies \FV(A) \subseteq V\] The downwards closure of a set \(V\) in a context \(\Gamma\), \(\DC_\Gamma(V)\), can be defined by induction on the context: \begin{align*} \DC_\emptyset(\emptyset) &= \emptyset\\ \DC_{\Gamma, x : A}(V) &= \begin{cases*} \DC_\Gamma(V)&if \(x \not\in V\)\\ \{x\} \cup \DC_\Gamma(V \cup \FV(A))&if \(x \in V\) \end{cases*} \end{align*} The support of a term, type, or substitution is then defined as the downwards closure of its free variables: \[ \Supp(t) = \DC_\Gamma(\FV(t))\qquad \Supp(A) = \DC_\Gamma(\FV(A))\qquad \Supp(\sigma) = \DC_\Gamma(\FV(\sigma)) \] for terms \(t \in \Term_\Gamma\), types \(A \in \Type_\Gamma\), and substitutions \(\sigma : \Delta \to \Gamma\). \end{definition} We will see later (\cref{item:supp-tm-char-2}) that for a well-formed term \(t\) of type \(A\), the support of \(t\) is equal to \(\FV(t) \cup \FV(A)\), and that \(\Supp(A) = \FV(A)\) for well-formed types. Modifying \Catt to use the support operation therefore does not change the theory. We lastly define the \emph{dimension} of types, contexts, and terms. For types this is defined recursively: \[ \dim(\star) = 0 \qquad \dim(\arr s A t) = 1 + \dim(A) \] For contexts, we define \(\dim(\Gamma)\) to be the maximum of the dimensions of the types in \(\Gamma\). For coherences \(\Coh \Gamma A \sigma\), the dimension is given by \(\dim(A)\), and for variables the dimension is given by the dimension of the associated type in the context. \subsection{Ps-contexts} \label{sec:ps-contexts} We need to be able to describe pasting diagrams within the theory \Catt. As contexts model globular sets, it is natural to treat pasting diagrams as a subset of contexts. We will build pasting diagrams by iteratively attaching discs to a context, which is done by introducing the judgements: \[ \Delta \vdash_{\mathsf{ps}} x : A \qquad \text{and}\qquad \Delta \vdash_{\mathsf{ps}} \] If the first judgement holds, then \(\Delta\) is a pasting diagram to which a disc can be attached at the variable \(x\), called a \emph{dangling variable}, which has type \(A\). The contexts \(\Delta\) for which the second judgement holds are fully formed pasting diagrams, which we call \emph{ps-contexts} (short for pasting scheme contexts). The rules for these judgements are given in \cref{fig:ps-context}. We note that these rules do not just specify which globular sets are pasting diagrams, but they also specify an ordering on the elements of the pasting diagram, ensuring that there is a unique ps-context for each pasting diagram.
For example, the following judgement holds: \begin{equation} \label[judgement]{judg:ps} (x : \star), (y : \star), (f : \arr x \star y), (z: \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} \end{equation} However, the context: \[(y : \star), (z : \star), (g : \arr y \star z), (x : \star), (f : \arr x \star y)\] represents the same globular set but is not a ps-context. \begin{figure}[ht] \centering \begin{mathpar} \inferrule{ }{(x : \star) \vdash_{\mathsf{ps}} x : \star} {(\textsc{pss})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : A}{\Gamma, (y : A), (f : \arr x A y) \vdash_{\mathsf{ps}} f : \arr x A y} {(\textsc{pse})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \arr s A t}{\Gamma \vdash_{\mathsf{ps}} t : A} {(\textsc{psd})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \star}{\Gamma \vdash_{\mathsf{ps}}} {(\textsc{ps})} \end{mathpar} \caption{Rules for ps-contexts.} \label{fig:ps-context} \end{figure} \begin{example} \Cref{judg:ps} is given by the following derivation: \[\begin{prooftree} \hypo{ } \infer1[(\textsc{pss})]{(x : \star) \vdash_{\mathsf{ps}} x : \star} \infer1[(\textsc{pse})]{(x : \star), (y : \star), (f : \arr x \star y) \vdash_{\mathsf{ps}} f : \arr x \star y} \infer1[(\textsc{psd})]{(x : \star), (y : \star), (f : \arr x \star y) \vdash_{\mathsf{ps}} y : \star} \infer1[(\textsc{pse})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} g : \arr y \star z} \infer1[(\textsc{psd})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} z : \star} \infer1[(\textsc{ps})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}}} \end{prooftree}\] Each application of (\textsc{pse}) adds two fresh variables to the context: a new variable \(y\), and an arrow \(f : \arr x A y\) from the current dangling variable \(x\) to \(y\). The rule (\textsc{psd}) encodes that if we can attach a variable to \(f : x \to y\), then we can also attach a variable to \(y\). The rule (\textsc{ps}) forces as many (\textsc{psd}) rules to be applied as possible before completing the derivation, ensuring that derivations of ps-contexts are unique. \end{example} We now state the following theorem, which follows immediately from~\cite[Theorem~53]{benjamin2021globular}. \begin{theorem} The set of ps-contexts is in bijection with the set of pasting diagrams. \end{theorem} In order to use ps-contexts as our notion of pasting diagram, we need to be able to identify the source and target variables of each ps-context. This will be done by specifying the dimension \(i\) source and target of each ps-context. More precisely, for each ps-context \(\Gamma\) and \(i \in \mathbb{N}\), we define a ps-context \(\bound i \Gamma\) and subcontext inclusions: \[ \incbd i - \Gamma : \bound i \Gamma \to \Gamma \qquad \text{and}\qquad \incbd i + \Gamma : \bound i \Gamma \to \Gamma\] Intuitively, the context \(\bound i \Gamma\) can be constructed by removing any variables of dimension greater than \(i\) from \(\Gamma\), and quotienting the dimension \(i\) variables by the (symmetric transitive closure of the) relation \(x \sim y\) if there exists an \(f : x \to y\). The inclusions then send this quotiented variable to the variable appearing first in the equivalence class for the source inclusion, and the variable appearing last in the class for the target inclusion.
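As a worked instance of this intuition (our own computation, anticipating the recursive definition given next): writing \(\Gamma\) for the ps-context of \cref{judg:ps}, the variables \(x\), \(y\), and \(z\) are all identified in dimension \(0\), and so \(\bound 0 \Gamma = (x : \star)\), with \(\incbd 0 - \Gamma = \langle x \rangle\) picking out the first variable of the equivalence class and \(\incbd 0 + \Gamma = \langle z \rangle\) picking out the last.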
These contexts and substitutions can be defined by recursion on the context \(\Gamma\): \begin{align*} \bound i {(x : \star)} &= {(x : \star)}\\ \bound i {\Gamma, (y : A), (f : \arr x A y)} &= \begin{cases*} \bound i \Gamma&if \(i \leq \dim(A)\)\\ \bound i \Gamma, (y : A), (f : \arr x A y)&otherwise \end{cases*}\\ \incbd i \epsilon {(x : \star)} &= \langle x \rangle\\ \incbd i \epsilon {\Gamma, (y : A) , (f : \arr x A y)} &= \begin{cases*} \mathrlap{\incbd i \epsilon \Gamma}{\phantom{\bound i \Gamma, (y : A), (f : \arr x A y)}}&if \(i < \dim(A)\)\\ \incbd i - \Gamma&if \(i = \dim(A)\) and \(\epsilon = -\)\\ \replace(\incbd i + \Gamma, y)&if \(i = \dim(A)\) and \(\epsilon = +\)\\ \langle \incbd i \epsilon \Gamma, y, f \rangle &otherwise \end{cases*} \end{align*} where \(\epsilon \in \{-,+\}\) and \(\replace(\langle \sigma, s \rangle, t) = \langle \sigma, t \rangle\). As it will be common to take the boundary of \(\Gamma\) at the dimension below the dimension of \(\Gamma\) itself, we write \[\incbd {} \epsilon \Gamma = \incbd {\dim(\Gamma) - 1} \epsilon \Gamma\] when \(\dim(\Gamma)\) is not zero. In the original \Catt paper, these inclusion substitutions are not given and instead the source and target variables are given directly as subcontexts. It can be easily checked that the free variables of the inclusions are equal to these subcontexts, and that the free variable sets of these inclusions are downwards closed. It is known, e.g.\ from~\cite[Lemma~55]{benjamin2021globular}, that these constructions agree with the constructions of the source and target pasting diagrams in \cref{sec:pasting-diagrams}. We state the following well-known result (see~\cite{finster2017type}) about isomorphisms between ps-contexts. \begin{proposition} \label{prop:ps-context-iso} Let \(\Gamma\) and \(\Delta\) be ps-contexts and suppose \(\sigma : \Gamma \to \Delta\) is an isomorphism. Then \(\Gamma \equiv \Delta\) and \(\sigma\) is the identity substitution. \end{proposition} \subsection{Typing for \Catt} \label{sec:typing-catt} We now have all the prerequisites in place to state the typing rules for \Catt. These take the form of 4 judgements (not including the judgements for ps-contexts introduced in \cref{sec:ps-contexts}): \begin{alignat*}{2} &\Gamma \vdash&\qquad&\text{\(\Gamma \in \Ctx\) is a well-formed context.}\\ &\Gamma \vdash A&&\text{\(A \in \Type_\Gamma\) is a well-formed type in context \(\Gamma\).}\\ &\Gamma \vdash t : A &&\text{\(t \in \Term_\Gamma\) is a well-formed term of type \(A \in \Type_\Gamma\).}\\ &\Gamma \vdash \sigma : \Delta &&\text{\(\sigma : \Delta \to \Gamma\) is a well-formed substitution.} \end{alignat*} The typing rules for these judgements are then given in \cref{fig:catt-typing}. As most of these are standard, we draw attention to a couple of the key rules. The rule for arrow types ensures that both the source and target of the arrow themselves have the same type, namely the one given in the subscript of the arrow. This effectively ensures the globular nature of the type theory, as given a term \(f : \arr s {\arr x A y} t\), both the source of the source and source of the target are \(x\), and both the target of the source and target of the target are \(y\).
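To spell out this globularity at the level of derivations (our own unfolding of the rules in \cref{fig:catt-typing}): a derivation of \(\Gamma \vdash \arr s {\arr x A y} t\) contains derivations of \(\Gamma \vdash s : \arr x A y\) and \(\Gamma \vdash t : \arr x A y\), and the derivation of \(\Gamma \vdash \arr x A y\) in turn contains derivations of \(\Gamma \vdash x : A\) and \(\Gamma \vdash y : A\), so the iterated sources and targets of \(s\) and \(t\) agree on the nose, with no separate globularity conditions needing to be imposed.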
\begin{figure}[ht] \centering \begin{mathpar} \inferrule{ }{\emptyset \vdash} \and \inferrule{\Gamma \vdash\\ \Gamma \vdash A}{\Gamma, (x : A) \vdash} \and \inferrule{ }{\Gamma \vdash \star} \and \inferrule{\Gamma \vdash s : A \\ \Gamma \vdash A \\ \Gamma \vdash t : A}{\Gamma \vdash \arr s A t} \\ \inferrule{ }{\Gamma \vdash \langle\rangle : \emptyset} \and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash t : A\sub\sigma}{\Gamma \vdash \langle \sigma , t \rangle : \Delta, (x : A)} \and \inferrule{(x : A) \in \Gamma}{\Gamma \vdash x : A} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\\dim(\Delta) \neq 0\\\Supp(s) = \Supp(\incbd {} - \Delta)\\\Supp(t) = \Supp(\incbd {} + \Delta)}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\\Supp(s) = \Supp(t) = \Var(\Delta)}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \end{mathpar} \caption{Typing rules for \Catt.} \label{fig:catt-typing} \end{figure} There are two rules given for typing coherences, corresponding to the two guiding principles for categories from \cref{sec:weak}. The first rule allows composites to be typed and the second allows equivalences to be typed. In both, the ps-context \(\Delta\) corresponds to the pasting diagram \(P\), the terms \(s\) and \(t\) correspond to the operations \(f\) and \(g\) over \(P\) (with the judgement \(\Delta \vdash \arr s A t\) enforcing that they are parallel), and the conditions involving support give the remaining side conditions. By a straightforward mutual induction we can prove that application of substitution to terms, types, and other substitutions preserves typing. Therefore, the \emph{syntactic category} of \Catt can be formed, which contains well-formed contexts as objects and well-formed substitutions between these contexts as morphisms; by an abuse of notation we also call this category \textsf{Catt}. There is a full subcategory \(\mathsf{Catt}^{\mathsf{ps}}\), which contains only those contexts which are ps-contexts. \begin{theorem} The category \(\mathsf{Catt}^{\mathsf{ps}}\) is a coherator for \(\infty\)-categories. \end{theorem} \begin{proof} Follows from \cite[Theorem~73]{benjamin2021globular}, noting that the opposite convention for substitution is used in that paper. \end{proof} Thus, we immediately get that a presheaf over \(\mathsf{Catt}^{\mathsf{ps}}\) which preserves globular products is an \(\infty\)-category (using the Maltsiniotis definition). Further, presheaves of this form are equivalent to type-theoretic models of \Catt by \cite[Theorem~88]{benjamin2021globular}, meaning type-theoretic models of \Catt are \(\infty\)-categories. \subsection{Basic constructions} \label{sec:basic-constructions} We now introduce some basic categorical operations in order to give some early examples of terms in \Catt. Suppose we have terms \(a : \arr s \star t\) and \(b : \arr t \star u\) in some context \(\Gamma\). Then the ps-context \[ \Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] from \cref{judg:ps} can be used to form the 1-composite: \[ a *_0 b = \Coh \Delta {\arr x \star z} {\langle s, t, a, u, b \rangle}\] It is often not necessary to give all the terms in a substitution, especially when the substitution is from a pasting diagram (or more generally a globular set).
In these cases it is sufficient to give terms for the \emph{locally maximal} variables of the context, those that do not appear as the source or target of another variable. For \(\Delta\), the locally maximal variables are \(f\) and \(g\), and so it suffices to give the substitution above as \(\langle a , b \rangle\), with the rest of the terms being inferable. The disc contexts \(D^n\) can be formed in \Catt as the analogue of the disc globular sets given in \cref{ex:disc} and satisfy the property that a substitution from a disc context \(D^n\) contains the same data as a term and an \(n\)-dimensional type. Given a term \(t\) of type \(A\) in context \(\Gamma\), we write this substitution \(\{A,t\} : D^{\dim(A)} \to \Gamma\). All disc contexts are ps-contexts. Using these, the identity can be formed on a term \(t\) of type \(A\) in \(\Gamma\): \[\id(A,t) = \Coh {D^n} {\arr {d_n} {} {d_n}} {\{A, t\}}\] where \(\dim(A) = n\), which is typed using the rule for equivalences. The structure of this term changes for different values of \(n\), and we will relate these different terms in \cref{sec:suspension}. As before, the non-locally maximal elements of a substitution can be inferred, and so we may write \(\id(t)\) or \(\{t\}\) when the type \(A\) is inferable. In \Catt, all types are inferable, though later when we consider semistrict variations of \Catt it may be necessary to specify the exact type we are using up to syntactic equality. \paragraph{Standard coherences} The composite and identity above form part of a more general collection of coherences, which we call \emph{standard coherences}. \begin{definition} Given a pasting diagram \(\Delta\), we mutually define for all \(n\) the \emph{standard coherence} \(\stdcoh\Delta n\), the \emph{standard term} \(\stdtm \Delta n\), and the \emph{standard type} \(\stdty \Delta n\): \begin{alignat*}{2} &\stdcoh \Delta n &&= \Coh \Delta {\stdty \Delta n} {\id_\Delta}\\ &\stdtm \Delta n &&= \begin{cases} d_n &\text{when \(\Delta\) is the disc \(D^n\)}\\ \stdcoh \Delta n &\text{otherwise} \end{cases}\\ &\stdty \Delta 0 &&= \star\\ &\stdty \Delta {n+1} &&= \arr {\stdtm {\bound n \Delta} n \sub {\incbd n - \Delta}} {\stdty \Delta n} {\stdtm {\bound n \Delta} n \sub {\incbd n + \Delta}} \end{alignat*} The standard type takes the standard term over each boundary of \(\Delta\), includes these back into \(\Delta\), and assembles them into a type. When \(n = \dim(\Delta)\) we will refer to the standard coherence as the \emph{standard composite}. \end{definition} Intuitively, the standard coherence \(\stdcoh \Delta n\) is the canonical composite in dimension \(n\) of the pasting diagram \(\Delta\). A type is needed to form this coherence, for which the standard type \(\stdty \Delta n\) is used. The standard term \(\stdtm \Delta n\) is used as a variant of the standard coherence which special-cases disc contexts. This avoids the standard type containing unary composites and allows standard composites (of non-disc contexts) to be normal forms of the reduction systems that will be described in \cref{cha:cattstrict}. It is immediate that the composite of \(1\)-cells \(a *_0 b\) is given by \(\stdcoh \Delta 1\sub{\langle a , b \rangle}\) and the identity on a term \(t\) of dimension \(n\) is given by \(\stdcoh {D^n} {n+1}\sub{\{t\}}\). This construction can be used to generate all the composites in the definition of a strict \(\infty\)-category.
For example, the vertical composite of \(2\)-cells is the standard composite over the context given by the diagram: \[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] and the horizontal composite of \(2\)-cells is the standard composite over: \[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \] Note that the standard type over the above diagram has source \(f * h\) and target \(g * i\), which are themselves standard composites, demonstrating the mutually recursive behaviour of these constructions. \begin{remark} Above we gave two ps-contexts by drawing a diagram of the globular set that they represent. Ps-contexts fix the order in which variables occur, and as such the mapping from ps-contexts to globular sets is injective. The use of diagrams to define ps-contexts is therefore unambiguous. \end{remark} \paragraph{Further examples} The substitution component of a coherence allows operations to be combined into compound operations. Consider the (ps-)context given by the following diagram: \[\Gamma = \begin{tikzcd} s & t & u & v \arrow["a", from=1-1, to=1-2] \arrow["b", from=1-2, to=1-3] \arrow["c", from=1-3, to=1-4] \end{tikzcd} \] There are (at least) 3 ways to compose together the elements of this context. We could take the unbiased ternary composite \(a * b * c = \stdcoh \Gamma 1\sub{\langle a, b, c\rangle}\), but could also construct either biased composite: \begin{align*} (a * b) * c &= \stdcoh \Delta 1\sub{\langle \stdcoh \Delta 1\sub{\langle a,b\rangle}, c\rangle}\\ a * (b * c) &= \stdcoh \Delta 1\sub{\langle a, \stdcoh \Delta 1\sub{\langle b, c\rangle}\rangle} \end{align*} Using the equivalence typing rule, we can relate these biased composites with the following term: \[ \alpha_{a,b,c} = \Coh \Gamma {\arr {(a * b) * c} {} {a * (b * c)}} {\id_\Gamma}\] which is the associator. Similarly, for a term \(f : \arr x \star y\), unitors can be formed over the disc context \(D^1\) using the equivalence rule: \begin{align*} \lambda_f &= \Coh {D^1} {\arr {\id(d_0^-) * d_1} {} {d_1}} {\{f\}}\\ \rho_f &= \Coh {D^1} {\arr {d_1 * \id(d_0^+)} {} {d_1}} {\{f\}} \end{align*} The remainder of the operations for a 2-category can be defined similarly, as each displays the equivalence of two terms built over a pasting diagram. We observe that both the unitors and associator (as well as any coherence typed with the equivalence rule) are trivially invertible. \subsection{Suspension} \label{sec:suspension} To end this section, we introduce the meta-operation of \emph{suspension}, as described for \Catt by \citeauthor{benjamin2020type}~\cite{benjamin2020type}.
Suspension takes any piece of syntax as input and produces a piece of syntax of dimension one higher. It can be used as an aid to defining operations in \Catt, but will also form a key part of the formal development of the constructions described in \cref{sec:operations-catt}. Suspension is inspired by the identically named operation on topological spaces. Given a topological space \(X\), its suspension \(\Sigma X\) is formed by quotienting the space \(X \times [0,1]\) by the relation that identifies all points of the form \((x,0)\) for \(x \in X\), and separately identifies all points of the form \((x,1)\) for \(x \in X\). The suspension of a space \(X\) can alternatively be viewed as the space containing two distinguished points \(N\) and \(S\), and a path from \(N\) to \(S\) for each point \(x \in X\). The names \(N\) and \(S\) stand for north and south, as the suspension of a circle can be visualised as a globe, with \(N\) and \(S\) being the north and south pole and each of the paths between them being a meridian. A similar operation can be applied to globular sets. Given a globular set \(G\), its suspension \(\Sigma G\) is obtained by shifting the dimension of every \(n\)-cell up by one (making it into an \((n+1)\)-cell), adding two new \(0\)-cells \(N\) and \(S\), and letting the source of every \(1\)-cell be \(N\) and the target be \(S\). The globularity conditions for this construction can be quickly verified. This construction extends to all computads~\cite{benjamin2024duamity}, and can be defined in \Catt by mutually defining the operation on contexts, types, terms, and substitutions. \begin{definition} For contexts \(\Gamma \in \Ctx\), types \(A \in \Type_\Gamma\), terms \(t \in \Term_\Gamma\), and substitutions \(\sigma : \Delta \to \Gamma\), we define their \emph{suspensions} \(\Sigma(\Gamma) \in \Ctx\), \(\Sigma(A) \in \Type_{\Sigma(\Gamma)}\), \(\Sigma(t)\in \Term_{\Sigma(\Gamma)}\), and \(\Sigma(\sigma) : \Sigma(\Delta) \to \Sigma(\Gamma)\) by mutual recursion. \begin{align*} \Sigma (\emptyset) &= (N : \star), (S : \star) &\Sigma (\Gamma, (x : A)) &= \Sigma \Gamma, (x : \Sigma A)\\ \Sigma (\star) &= \arr N \star S &\Sigma (\arr s A t) &= \arr {\Sigma s} {\Sigma A} {\Sigma t}\\ \Sigma(\langle \rangle) &= \langle N, S \rangle &\Sigma(\langle \sigma, t \rangle) &= \langle \Sigma(\sigma), \Sigma(t) \rangle\\ \Sigma (x) &= x &\Sigma (\Coh \Delta A \sigma) &= \Coh {\Sigma(\Delta)} {\Sigma(A)} {\Sigma(\sigma)} \end{align*} where \(x\) is a variable of \(\Gamma\). \end{definition} The dimension shift of suspension is driven by the cases for types, especially the case for the base type \(\star\), which returns a type of dimension \(1\), namely \(\arr N \star S\), using the two new variables \(N\) and \(S\). We note that the suspension of any ps-context is also a ps-context, and in general the suspension of any piece of well-formed \Catt syntax is again well-formed. These results are given in \cite[Section~3.2]{benjamin2020type}, but will be proved in \cref{sec:ruleset} in more generality. We can now investigate the action of suspension on the operations we have already defined. Take the context: \[ (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] used in \cref{sec:basic-constructions} to generate 1-composition.
Applying suspension to this context gives: \[ \begin{tikzcd} N && S \arrow[""{name=0, anchor=center, inner sep=0}, "x"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "z", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "y"{description}, from=1-1, to=1-3] \arrow["f"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["g"', shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] which is the context used to generate vertical 2-composition. Furthermore, applying suspension directly to the 1-composition operation yields the vertical 2-composition operation. The suspension of each disc context \(D^n\) is (up to \(\alpha\)-renaming) \(D^{n+1}\). It can be checked that applying suspension to the identity operation for \(n\)-dimensional terms returns the identity operation for \((n+1)\)-dimensional terms. Repeating this logic, all identity operations can be obtained as iterated suspensions of the identity for \(0\)-cells. The following more general result about standard coherences holds: \begin{proposition} The following syntactic equalities hold: \[\Sigma(\stdcoh \Delta n) = \stdcoh {\Sigma(\Delta)} {n+1}\qquad \Sigma(\stdtm \Delta n) = \stdtm {\Sigma(\Delta)} {n+1}\qquad \Sigma(\stdty \Delta n) = \stdty {\Sigma(\Delta)} {n+1}\] for all ps-contexts \(\Delta\) and \(n \in \mathbb{N}\). \end{proposition} The proof of these results is delayed to \cref{sec:operations-catt}, where we will have more tools for dealing with these constructions. \chapter{A formalised presentation of \Catt with equality} \label{cha:gener-pres-catt} The main purpose of this chapter will be to define the family of type theories \Cattr, which extend the base type theory \Catt with a specified set \(\mathcal{R}\) of equality rules. These equality rules equate various terms of the theory, which unifies the corresponding operations in their models, allowing us in \cref{cha:cattstrict} to generate type theories that model semistrict categories, categories where some but not all structure is strictified. This chapter will also introduce the Agda formalisation~\cite{alex_rice_2024_10964565} which accompanies this thesis, which compiles with Agda v2.6.4 and standard library v2.0. The formalisation implements the syntax and typing judgements of \Cattr, and contains proofs of most results in this chapter and \cref{sec:operations-catt}. By formalising \Cattr, instead of the more specific type theories \Cattsu and \Cattsua introduced in \cref{sec:cattsu,sec:cattsua}, many results need only be formalised once and can be applied to both type theories. This also allows these results to be applied to any future type theories of this form. A dependency graph of the formalisation is given in \cref{fig:dep-graph}, and an online version of this graph can be found at \url{https://alexarice.github.io/catt-agda/dep-graph.svg} for which each node is a clickable link to an HTML version of the code. This graph was generated by processing the dependency graph output of Agda with the tool \textsf{sd-visualiser}~\cite{sd-visualiser}. \section{Extended substitution} \label{sec:extend-subst} \Cattr uses the same syntax as \Catt with one exception. In \Cattr we make a natural generalisation to substitutions, which will allow more operations to be defined for working with the suspension operation introduced in \cref{sec:suspension}.
Unfortunately, the full utility of this generalisation will not be realised until \cref{sec:structured-terms}, but we choose to introduce it here as it forms a core part of the syntax, and requires little modification to the rules of the type theory. We recall that the suspension operation \(\Sigma\) acts on contexts, substitutions, types, and terms. Given a substitution \(\sigma : \Delta \to \Gamma\), its suspension \(\Sigma(\sigma)\) has domain \(\Sigma(\Delta)\) and codomain \(\Sigma(\Gamma)\). When we define trees and tree labellings in \cref{sec:operations-catt}, which will be used to define the insertion operation in \cref{sec:insertion}, we will need to be able to define substitutions from suspended contexts to arbitrary contexts. More generally, we would like to be able to describe substitutions of the form: \[ \Sigma^n(\Delta) \to \Gamma\] where \(\Sigma^n(\Delta)\) is the operation that applies suspension \(n\) times to \(\Delta\). Consider the data contained in a substitution \(\tau : \Sigma(\Delta) \to \Gamma\). There are two terms \(N \sub \tau\) and \(S \sub \tau\) of type \(\star\), and then a term for each variable of \(\Delta\). Temporarily ignoring the typing conditions for substitutions, we see that the data is equivalent to a substitution from \(\Delta\) to \(\Gamma\) and two additional terms. If we now consider a substitution \(\tau : \Sigma(\Sigma(\Delta)) \to \Gamma\), we notice that there is a term in \(\Gamma\) for each variable of \(\Delta\), as well as two terms \(s = N \sub \tau\) and \(t = S \sub \tau\) for the outer suspension and terms \(u = N' \sub \tau\) and \(v = S' \sub \tau\) for the inner suspension. As before, the terms \(s\) and \(t\) should have type \(\star\), but the terms \(u\) and \(v\) should have type \(\arr s \star t\). We note that this is the exact condition needed for \(\arr u {\arr s \star t} v\) to be a well-formed type. This motivates the notion of an \emph{extended substitution}, which is obtained by equipping a substitution with a type. We have not yet determined the typing conditions required on the substitution part of these extended substitutions. We return to the example of a substitution \(\tau : \Sigma^2(\Delta) \to \Gamma\), and suppose that \(\Delta\) has a variable \(x\) of type \(\star\). In \(\Sigma^2(\Delta)\), \(x\) has the type \(\arr {N'} {\arr N \star S} {S'}\), and so \(x\) should be sent to a term of type \(\arr u {\arr s \star t} v\), the type portion of the extended substitution. In a substitution \(\sigma : \Delta \to \Gamma\), \(x\) would be sent to a term of type \(\star \sub \sigma\), which suggests that \(\star \sub \sigma\) should be redefined to send \(\star\) to the type part of the extended substitution. This one change to the application of substitution to types is sufficient to generalise from substitutions to extended substitutions. An extended substitution \(\sigma : \Delta \to \Gamma\) then has the following intuition: the substitution part specifies where each variable in \(\Delta\) should be sent, and the type part specifies where the base type \(\star\) should be sent. The other cases for the application of substitution extend this to all terms, types, and (extended) substitutions as before. The extended substitution \(\sigma\) then represents a standard substitution from \(\Sigma^n(\Delta)\) to \(\Gamma\), where \(n\) is the dimension of the type part of \(\sigma\). Hence, a regular substitution can be recovered as an extended substitution with type part \(\star\).
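As a minimal concrete instance of this correspondence (our own example, anticipating the constructors introduced in the next section), take \(\Delta = (x : \star)\), so that \(\Sigma(\Delta) = (N : \star), (S : \star), (x : \arr N \star S)\). A substitution \(\tau : \Sigma(\Delta) \to \Gamma\) sending \(N \mapsto s\), \(S \mapsto t\), and \(x \mapsto a\) then corresponds to the extended substitution from \(\Delta\) to \(\Gamma\) with type part \(\arr s \star t\) which sends \(x \mapsto a\); the term \(a\) must have type \(\arr s \star t\), which is exactly the type that \(\star\) is now sent to.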
We modify the syntax of \Catt as follows, and will refer to these extended substitutions simply as substitutions, as extended substitutions are a direct generalisation of substitutions, and the notion of substitution is still recoverable by setting the type part to \(\star\): \begin{itemize} \item Substitutions will now be fibred over a type in their codomain context, which we will write \(\sigma : \arr \Delta A \Gamma\) where \(A \in \Type_\Gamma\). We note that this allows us to specify that \(\sigma\) is a regular substitution by writing \(\sigma : \arr \Delta \star \Gamma\). \item The constructor \(\langle\rangle\) is removed, and is replaced by the constructor \(\langle A \rangle : \arr \emptyset A \Gamma\), where \(A \in \Type_\Gamma\). Adding a term to a substitution preserves the type of the substitution. As before we may write a substitution \(\langle \langle \langle A \rangle, s \rangle, t \rangle\) as \(\langle A , s, t\rangle\). We let \(\FV(\langle A \rangle) = \FV(A)\). \item An operation \(\ty(\sigma)\) is introduced that returns the type portion of a substitution. For \(\sigma : \arr \Delta A \Gamma\), we have \(\ty(\sigma) = A\). \item Coherences \(\Coh \Delta A \sigma \in \Term_\Gamma\) are restricted so that \(\sigma\) is a regular substitution. In other words, \(\ty(\sigma)\) must be \(\star\) for \(\sigma\) to appear in a coherence. While this condition could be dropped, it is convenient to keep the same operations as \Catt. \end{itemize} To witness the equivalence of extended substitutions \(\Delta \to \Gamma\) and regular substitutions \(\Sigma^n(\Delta) \to \Gamma\), we introduce new operations. \begin{definition} For a substitution \(\sigma : \arr {\Delta} {\arr s A t} \Gamma\), we define its \emph{unrestriction}: \[\unrestrict\sigma : \arr {\Sigma(\Delta)} A \Gamma\] by induction on the length of \(\Delta\): \begin{align*} \unrestrict \langle \arr s A t \rangle &= \langle A, s, t \rangle\\ \unrestrict \langle \sigma' , u \rangle &= \langle \unrestrict \sigma' , u \rangle \end{align*} The unrestrict operation simply moves two terms from the type part of the substitution into the main body of the substitution. \end{definition} To define the second operation, we first need to specify the changes to the application of substitution: \begin{itemize} \item The composition of substitutions takes substitutions \(\sigma : \arr \Theta A \Delta\) and \(\tau : \arr \Delta B \Gamma\) to a substitution \(\sigma \bullet \tau : \arr \Theta {A \sub \tau} \Gamma\). \item For a substitution \(\sigma : \arr \Delta A \Gamma\), we define \(\star \sub{\sigma} = A\). \item As the substitution in a coherence must have type \(\star\), we define the application of an extended substitution \(\tau : \arr \Delta {\arr s A t} \Gamma\) to a coherence as: \[ \Coh \Theta B \sigma \sub \tau = \Coh {\Sigma(\Theta)} {\Sigma(B)} {\Sigma(\sigma)} \sub {\unrestrict \tau}\] The case for applying a regular substitution to a coherence remains unchanged. \end{itemize} We can now define an inverse to the unrestriction operation.
\begin{definition} For a substitution \(\sigma : \arr {\Sigma(\Delta)} A \Gamma\), its \emph{restriction} \[ \restrict \sigma : \arr \Delta {\arr {N \sub \sigma} A {S \sub \sigma}} \Gamma \] is defined by induction on the length of \(\Delta\): \begin{align*} \restrict \langle A, s, t \rangle &= \langle \arr s A t \rangle\\ \restrict \langle \sigma', u \rangle &= \langle \restrict \sigma', u \rangle \end{align*} Inversely to the unrestrict operation, the restrict operation moves two terms into the type part of the substitution. \end{definition} As restriction and unrestriction cancel each other, the suspension of the substitution \(\sigma : \arr \Delta \star \Gamma\) can be factored into \((\unrestrict \circ (\restrict \circ \Sigma)) (\sigma)\). We observe that the second part of this composition, \(\restrict \circ \Sigma\), is the operation that simply applies the suspension to each term in the substitution as well as to the type of the substitution. This motivates the final definition of this section. \begin{definition} Let the \emph{restricted suspension} of a substitution \(\sigma : \arr \Delta A \Gamma\) be a substitution \[\Sigma'(\sigma) : \arr \Delta {\Sigma(A)} {\Sigma(\Gamma)}\] defined inductively by the equations: \begin{align*} \Sigma'(\langle A \rangle) &= \langle \Sigma(A)\rangle \\ \Sigma'(\langle \sigma' , t \rangle) &= \langle \Sigma'(\sigma'), \Sigma(t) \rangle \end{align*} The suspension of a substitution \(\tau : \arr \Delta \star \Gamma\) can be defined by \(\Sigma(\tau) = \unrestrict\Sigma'(\tau)\). \end{definition} For the rest of the thesis and the formalisation, the suspension on a substitution is defined as the composition of unrestriction and restricted suspension. \section[\texorpdfstring{\Cattr}{Cattr}: \Catt with equality]{\boldmath\texorpdfstring{\Cattr}{Cattr}: \Catt with equality} \label{sec:catt-with-equality} This section will define the type theory \Cattr, a variation of \Catt with specified equality rules. This section, in addition to the following sections in this chapter, will be used to motivate certain choices in the formalisation. All the preliminary definitions as well as syntax, typing, and equality rules are assembled in \cref{fig:cattr}. \subsection{Syntax} \label{sec:syntax} The syntax of \Cattr is based on the syntax of \Catt with the changes specified in \cref{sec:extend-subst}. This creates a dependency chain: the base syntax must be defined before suspension, and suspension must be defined before the application of substitution. In the formalisation these are defined in the following files: \begin{itemize} \item The core syntax is defined in \module{Catt.Syntax.Base}. \item Suspension is defined in \module{Catt.Suspension}. \item Other syntactic operations are defined in \module{Catt.Syntax}, which re-exports the core syntax. \end{itemize} To avoid any issues with \(\alpha\)-equivalence, especially as we have terms that contain contexts, we work with de Bruijn indices throughout the formalisation. This means that a context is simply a vector of types, that is, a fixed-length list, which we present with a nicer syntax. Variables are then simply bounded natural numbers, represented by the sets \(\mathsf{Fin}_n\), where \(\mathsf{Fin}_n\) is the set \(\{0,\dots,n-1\}\). Given a context \(A , B , C\), the variables over this context are simply \(\mathsf{var\ 0}\), which has type \(C\), \(\mathsf{var\ 1}\), which has type \(B\), and \(\mathsf{var\ 2}\), with type \(A\).
We note that \(3\) is not in \(\mathsf{Fin}_3\), and so \(\mathsf{var\ 3}\) is not a term of this context. Hence, we do not need to deal with unknown variables when applying substitutions. We will still make use of variable names in this text to aid readability, and will ignore any potential problems that could arise from this, knowing that the results are formalised in a setting where they do not appear. The formalisation also differs from the presentation in this text in the way that the various notions of syntax are fibred. We fibre contexts by a natural number representing their length, and then fibre terms, types, and substitutions over these lengths instead of fibring them over the contexts. We then get the following four syntactic classes defined as mutually inductive families, where \(\mathcal{U}\) is a type universe: \[ \funcn{Catt.Syntax.Base}{Ctx}{\Ctx} : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Ty}\Type : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Tm}\Term : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Sub}\Sub : (n\ m : \mathbb{N}) \to \Type_m \to \mathcal{U}\] This decision was made purely for convenience: by fibring over natural numbers instead of contexts, we sometimes avoid the need to provide more explicit arguments to syntactic constructions. It comes with the drawback that the context must be provided for certain operations, such as the support of a piece of syntax, or the dimension of a term. One place an explicit argument can be avoided is when defining the weakening of a piece of syntax, an operation witnessing that for a piece of syntax living in a context \(\Gamma\), there is a copy living in \(\Gamma , A\) for any \(A\). These operations are defined in \module{Catt.Syntax} and take the following form, where we re-use the name \(\wk\) here as an abuse of notation: \[ \funcn{Catt.Syntax}{wk-tm}{\wk} : \Term_{\Gamma} \to \Term_{\Gamma, A}\quad\funcn{Catt.Syntax}{wk-ty}{\wk} : \Type_{\Gamma} \to \Type_{\Gamma, A}\quad \funcn{Catt.Syntax}{wk-sub}{\wk} : (\arr \Gamma B \Delta) \to (\arr {\Gamma} {\wk(B)} {\Delta, A}) \] If terms are fibred over contexts then this type \(A\) must often be specified, though with the fibring over context length this is no longer necessary. When using de Bruijn indices, this operation is no longer the identity on terms, as each variable must be incremented due to the index in a variable counting from the end of the context. One might ask why de Bruijn levels (which index from the start of the context) were not used instead, but this would not solve our problem as \(\mathsf{Fin}_n\) is not a subtype of \(\mathsf{Fin}_{n+1}\) in Agda. Furthermore, using de Bruijn levels would cause the substitution application introduced in \cref{sec:syntax-catt} (and expanded in \cref{sec:extend-subst}) to compute poorly, due to the way substitutions are defined. The definition of weakening is given in \cref{fig:wk}. Weakening can be used to give a short inductive definition of the identity substitution, a substitution \(\id_\Gamma : \Gamma \to \Gamma\) which sends every variable to itself. On the inductive case \(\id_{\Gamma, (x : A)}\), it is clear that the variable \(x\) should be sent to \(x\), but the constructor for substitutions also requires a substitution \(\Gamma \to \Gamma, (x : A)\). This can be obtained by weakening a recursive call to the identity on \(\Gamma\).
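As a small worked example of this definition in the de Bruijn representation, writing \((\star, \star)\) for a context of two \(0\)-dimensional variables, and using \(\id_{(\star)} = \langle \star, \mathsf{var\ 0} \rangle\), we can compute: \begin{align*} \id_{(\star,\star)} &= \langle \wk(\id_{(\star)}), \mathsf{var\ 0} \rangle\\ &= \langle \wk(\langle \star, \mathsf{var\ 0} \rangle), \mathsf{var\ 0} \rangle\\ &= \langle \star, \mathsf{var\ 1}, \mathsf{var\ 0} \rangle \end{align*} where the weakening step increments the index of the variable produced by the recursive call, so that each variable is still sent to itself in the extended context.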
Similarly, an inclusion \(\Gamma \to \Gamma, (x : A)\) can be defined as \(\wk(\id_\Gamma)\), and applying this substitution is the same operation as weakening. To begin proving syntactic properties of \Cattr, we need a notion of syntactic equality. This will be written \(\Gamma \equiv \Delta\) for contexts \(\Gamma\) and \(\Delta\), and similarly for terms \(s\) and \(t\), types \(A\) and \(B\), and substitutions \(\sigma\) and \(\tau\). It is given by \(\alpha\)-equivalence, and so we would hope that the formalisation could leverage the use of de Bruijn indices to use the in-built equality type for syntactic equality. This is, however, too restrictive: there will be many times when we want to compare two terms of differing context length (in practice this context length will be propositionally equal, instead of definitionally equal). Therefore, four syntactic equality relations are defined mutually inductively on the constructors of each piece of syntax in \module{Catt.Syntax.Properties}. These definitions can easily be heterogeneous, allowing two terms \(s : \Term_n\) and \(t : \Term_m\) to be compared. Unfortunately, using these comes at the cost of large amounts of boilerplate, as these inductively defined equalities do not come equipped with the J-rule, and so it must be manually proved that each operation respects syntactic equality. An example of such a function is \funcn{Catt.Syntax.Properties}{wk-tm-≃}{wk-tm-\(\simeq\)}, which states that the weakenings of two syntactically equal terms are syntactically equal. \module{Catt.Syntax.Properties} contains many of the basic properties about the syntax of \Cattr, including: \begin{itemize} \item Syntactic equality is decidable. \item Syntactic equality is propositional: there is at most one proof of \(s \equiv t\). \item Functoriality of suspension. \item Interaction of weakening with substitution application. We have \(\wk(s) \sub {\langle \sigma , t \rangle} \equiv s \sub \sigma\) and \(s \sub {\wk(\sigma)} \equiv \wk(s \sub \sigma)\) and equivalent lemmas for the application of substitution to types and substitutions. \end{itemize} It also contains the following proposition. \begin{proposition} \label{prop:categorical} Application of substitution is associative and unital with respect to the identity substitution. More precisely, given substitutions \(\sigma : \arr \Theta A \Delta\) and \(\tau : \arr \Delta B \Gamma\), the following equalities hold: \begin{mathpar} A \sub \sigma \sub \tau \equiv A \sub {\sigma \bullet \tau} \and A \sub \id_\Theta \equiv A\\ t \sub \sigma \sub \tau \equiv t \sub {\sigma \bullet \tau} \and t \sub \id_\Theta \equiv t\\ (\mu \bullet \sigma) \bullet \tau \equiv \mu \bullet (\sigma \bullet \tau) \and \mu \bullet \id_\Theta \equiv \mu \and \id_\Xi \bullet \mu \equiv \mu \end{mathpar} for types \(A \in \Type_\Theta\), terms \(t \in \Term_\Theta\), and substitutions \(\mu : \arr \Xi C \Theta\). \end{proposition} \begin{proof} The last equation is a simple induction on \(\mu\) (and the context \(\Xi\)). Both the unitality equations and associativity equations, as with the vast majority of syntactic proofs, are given by mutual induction on types, terms, and substitutions. The only difficult case is: \[ \Coh \Theta C \mu \sub \sigma \sub \tau \equiv \Coh \Theta C \mu \sub {\sigma \bullet \tau} \] where the type part of \(\sigma: \arr \Theta A \Delta\) or \(\tau : \arr \Delta B \Gamma\) is not \(\star\).
First suppose \(B = \arr s {B'} t\) but \(A = \star\): \begin{align*} \Coh \Theta C \mu \sub \sigma \sub \tau &\equiv \Coh \Theta C {\mu \bullet \sigma} \sub \tau\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu \bullet \sigma)} \sub {\unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu) \bullet \Sigma(\sigma)} \sub {\unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\Sigma(\sigma) \bullet \unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict (\sigma \bullet \tau)}\\ &\equiv \Coh {\Theta} {C} {\mu} \sub {\sigma \bullet \tau} \end{align*} where the second to last line is given by the property \[\unrestrict (\sigma \bullet \tau) \equiv \Sigma(\sigma) \bullet \unrestrict \tau\] which holds for all \(\sigma : \arr \Theta \star \Delta\) and is proven in \funcn{Catt.Syntax.Properties}{↓-comp}{\textsf{\(\downarrow\)-comp}}, and the line before is given by the inductive hypothesis. If instead we had \(A = \arr s {A'} t\), then: \begin{align*} \Coh \Theta C \mu \sub \sigma \sub \tau &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict \sigma} \sub \tau\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict \sigma \bullet \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict (\sigma \bullet \tau)}\\ &\equiv \Coh \Theta C \mu \sub {\sigma \bullet \tau} \end{align*} where we use the inductive hypothesis after applying the equality \[ \unrestrict (\sigma \bullet \tau) \equiv \unrestrict \sigma \bullet \tau \] which holds for all \(\sigma : \arr \Theta {\arr s {A'} t} \Delta\) by \funcn{Catt.Syntax.Properties}{↓-comp-higher}% {\textsf{\(\downarrow\)-comp-higher}}. \end{proof} This proposition proves that the syntax of \Cattr forms a category, which we will not name as we will work instead with the subcategory containing well-formed contexts and substitutions, introduced in the following sections. \paragraph{Discs} We finish our discussion of the syntax of \Cattr by giving formal definitions of disc and sphere contexts, some constructions on these, and their properties. This will allow these to be used as examples in following sections, and pre-empts the use of discs in the first two equality rules that we will introduce, disc removal and endo-coherence removal. We begin with the definitions of discs, spheres, and sphere types, which can be found in \module{Catt.Discs} as \func{Catt.Discs}{Disc}, \func{Catt.Discs}{Sphere}, and \func{Catt.Discs}{sphere-type}. We write the sphere type as \(U^n\), which is intentionally close to the notation of the standard type \(\mathcal{U}_\Delta^n\), as it will turn out that these coincide. \begin{definition} We mutually define the disc contexts \(D^n\), sphere contexts \(S^n\), and sphere type \(U^n \in \Type_{S^n}\). \begin{mathpar} D^n = S^n , (d_n^- : U^n) \and S^0 = \emptyset \and S^{n+1} = D^n , (d_n^+ : \wk(U^n)) \\ U^0 = \star \and U^{n+1} = \arr {d_n^-} {\wk(\wk(U^{n}))} {d_n^+} \end{mathpar} We will sometimes refer to the last variable of \(D^n\) as \(d_n\) instead of \(d_n^-\), given that there is no \(d_n^+\) in the context. \end{definition} We also characterise the substitutions from a sphere or disc. These are given by \func{Catt.Discs}{sub-from-sphere} and \func{Catt.Discs}{sub-from-disc} in the formalisation. \begin{definition} Let \(A : \Type_\Gamma\) be a type and suppose \(n = \dim(A)\).
Define the substitution \(\{A\} : S^n \to \Gamma\) inductively by: \[ \{\star\} = \langle \star \rangle \qquad \{\arr s A t\} = \langle \{ A \}, s, t \rangle\] Further, given a term \(t : \Term_\Gamma\), define the substitution \(\{A,t\} : D^n \to \Gamma\) by \(\{A, t\} = \langle \{A\}, t \rangle\). \end{definition} In \module{Catt.Discs.Properties}, various facts about these constructions are proved which we list below. \begin{lemma} \label{lem:disc-prop} The following hold: \begin{lemmaenum} \item \label{item:disc-prop-dim}\(\dim(D^n) = \dim(U^n) = n\) and \(\dim(S^n) = \max(n - 1, 0)\). \item \label{item:disc-prop-susp} \(\Sigma(D^n) \equiv D^{n+1}\), \(\Sigma(S^n) \equiv S^{n+1}\), and \(\Sigma(U^n) \equiv U^{n+1}\). \item \label{item:disc-prop-wk} \(\{\wk(A)\} \equiv \wk(\{A\})\) and \(\{\wk(A), \wk(t)\} \equiv \wk(\{A,t\})\). \item \label{item:disc-prop-sub-susp} \(\{\Sigma(A)\} \equiv \Sigma(\{A\})\) and \(\{\Sigma(A),\Sigma(t)\} \equiv \Sigma(\{A,t\})\). \item \label{item:disc-prop-sub-sub} \(\{A \sub \sigma\} \equiv \{A\} \bullet \sigma\) and \(\{A \sub \sigma,t \sub \sigma\} \equiv \{A,t\}\bullet \sigma\). \item \label{item:disc-prop-sub-from} \(U^n \sub{\{A\}} \equiv A\) and hence \(\wk(U^n)\sub{\{A,t\}} \equiv A\). \item For \(\tau : S^n \to \Gamma\), \(\tau \equiv \{U^n \sub \tau\}\). \item For \(\tau : D^n \to \Gamma\), \(\tau \equiv \{\wk(U^n) \sub \tau, d_n \sub \tau\}\). \end{lemmaenum} for all \(n \in \mathbb{N}\) and appropriate \(A\), \(t\), and \(\sigma\). \end{lemma} The last two statements finish the characterisation of substitutions from spheres and discs: all such substitutions are of the form \(\{A\}\) or \(\{A,t\}\) respectively. In \module{Catt.Discs.Pasting}, it is shown that \(D^n\) is a ps-context for each \(n\). Therefore, as in \cref{sec:basic-constructions}, the identity on a term \(t\) of type \(A\) can be defined as: \[ \id(A,t) = \Coh {D^n} {\arr {d_n} {\wk(U^n)} {d_n}} {\{A,t\}} \] where \(n = \dim(A)\). Many properties of identity terms can be easily derived from \cref{lem:disc-prop}. \subsection{Typing and equality} \label{sec:typing-equality} The typing rules for \Cattr differ from those of \Catt in three key ways: \begin{enumerate} \item The fixed conditions on the support of the types in a coherence have been replaced by a set of operations \(\mathcal{O}\). Instead of having two typing rules for coherences, one for equivalences and one for composites, we simply have one typing rule and specify that a coherence \(\Coh \Delta {\arr s A t} \sigma\) can be well-formed when: \[ (\Delta, \Supp(s), \Supp(t)) \in \mathcal{O} \] This will be further motivated and explained in \cref{sec:support}. \item A definitional equality is added to the system, generated by a set of equality rules \(\mathcal{R}\) which specifies pairs of terms which should be equated. The equality takes the form of three new judgements: \begin{alignat*}{2} &\Gamma \vdash A = B&\qquad&\text{\(A, B \in \Type_\Gamma\) are equal in context \(\Gamma\).}\\ &\Gamma \vdash s = t &&\text{\(s, t \in \Term_\Gamma\) are equal in context \(\Gamma\).}\\ &\Gamma \vdash \tau = \sigma &&\text{\(\tau : \Theta \to \Gamma\) and \(\sigma : \Delta \to \Gamma\) are equal.} \end{alignat*} These judgements are all mutually defined (and are in fact mutually defined with the typing judgements). We may sometimes abbreviate these judgements to \(A = B\), \(s = t\), and \(\tau = \sigma\) when the contexts of each piece of syntax are clear.
\item The typing rules are adjusted to account for this definitional equality, via the addition of a conversion rule. \end{enumerate} The conversion rule is the only additional typing rule that must be added to \Cattr, and takes the following form: \begin{mathpar} \inferrule {\Gamma \vdash s : A \and \Gamma \vdash A = B}{\Gamma \vdash s : B}\textsc{conv} \end{mathpar} allowing the type of any term to vary up to the definitional equality. This rule accounts for all the semistrict behaviour in the theories we introduce in \cref{cha:cattstrict}. By adding this rule, and allowing the type of a term to vary up to definitional equality instead of syntactic equality, we allow more terms in the theory to become composable. Suppose we have terms \(f : x \to y\) and \(g : y' \to z\). In \Catt, we would not be able to form the vertical composition of these terms, as \(y\) and \(y'\) are not the same. If we now suppose that \(\Gamma \vdash y = y'\), then it will follow that \(\Gamma \vdash (x \to y) = (x \to y')\), and so using the conversion rule we get: \begin{mathpar} \inferrule{\inferrule*{\Gamma \vdash f : x \to y \and \inferrule*{\Gamma \vdash y = y'}{\Gamma \vdash (x \to y) = (x \to y')}}{\Gamma \vdash f : x \to y'} \and \Gamma \vdash g : y' \to z}{\Gamma \vdash f * g : x \to z} \end{mathpar} We remark that adding definitional equality does not simply quotient the terms of the theory, but also allows new terms to be well-formed as above. The definitional equality judgements are given by the rules in \cref{fig:equality} and appear in the formalisation alongside the typing rules in \module{Catt.Typing}. These are generated by the set of \emph{equality rules} \(\mathcal{R}\), which is a set of triples of the form \((\Gamma, s, t)\) where \(\Gamma\) is a context and \(s,t \in \Term_\Gamma\). The key inference rule for equality is then: \begin{mathpar} \inferrule{\Gamma \vdash s : A \and (\Gamma,s,t) \in \mathcal{R}}{\Gamma \vdash s = t}\textsc{rule} \end{mathpar} which says that if a triple \((\Gamma, s, t)\) is in \(\mathcal{R}\), then \(\Gamma \vdash s = t\) holds whenever \(s\) is well-formed in \(\Gamma\). The typing prerequisite forces the definitions of equality and typing to be mutually defined, and ensures that we only apply our equality rules to well-behaved terms. We note the asymmetry of this rule, in that only the left-hand side is required to be well-formed. Every rule introduced in this thesis will take the form of some reduction from the left-hand side to the right-hand side, and we will be able to prove that typing for the right-hand side follows from typing for the left-hand side for every equality we consider. The converse may not hold in general, necessitating the condition on the left-hand side. This is similar to \(\beta\)-reduction in the \(\lambda\)-calculus, where an untypeable term can reduce to a term that is typeable in the simply typed \(\lambda\)-calculus. The remaining inference rules for equality simply close the relation under each constructor, as well as under reflexivity, symmetry, and transitivity. It is only necessary to give symmetry and transitivity rules for terms, and a reflexivity rule for variables, with these properties following for the other judgements by simple induction. \begin{lemma} The definitional equality relations on terms, types, and substitutions are equivalence relations, for any \(\mathcal{R}\). \end{lemma} \begin{proof} Proofs of these are found in \module{Catt.Typing.Properties.Base}. \end{proof} It is also possible to prove that each term has a canonical type.
\begin{definition} The \emph{canonical type} of a term \(t : \Term_\Gamma\), \(\ty(t)\), is defined by a case split on \(t\). If \(t\) is a variable then the canonical type is the corresponding type in the context \(\Gamma\). Otherwise, if \(t \equiv \Coh \Delta A \sigma\) then the canonical type is \(A \sub \sigma\). \end{definition} This can be used to show that the type of a well-formed term is unique up to definitional equality, and is equal to this canonical type. \begin{lemma} \label{lem:ty-unique} If \(\Gamma \vdash s : A\), then \(\Gamma \vdash s : \ty(s)\) and \(\Gamma \vdash A = \ty(s)\). Further, if \(\Gamma \vdash s : A\) and \(\Gamma \vdash s : B\) then \(\Gamma \vdash A = B\). \end{lemma} \begin{proof} We prove the first part by induction on the derivation \(\Gamma \vdash s : A\). If the derivation is derived from the conversion rule applied to \(\Gamma \vdash s : B\) and \(\Gamma \vdash A = B\), then by the inductive hypothesis we have \(\Gamma \vdash s : \ty(s)\) and \(\Gamma \vdash B = \ty(s)\). By transitivity, we obtain \(\Gamma \vdash A = \ty(s)\) as required. The second part follows directly from applying the first part to both derivations. \end{proof} Using the canonical type, we can define the canonical identity on a term. \begin{definition} \label{def:canonical-id} Given a term \(t : \Term_\Gamma\), let its \emph{canonical identity} be given by: \[ \id(t) \equiv \id(\ty(t), t)\] This construction can be iterated, and we say that a term is an \emph{iterated canonical identity} if it is of the form \(\id^k(t)\) for some \(k\). \end{definition} There is not much more that can be proved about the definitional equality at this point without knowing more about the rule set \(\mathcal{R}\). In \cref{sec:ruleset}, certain conditions will be imposed on the set of equality rules, which will allow further lemmas to be proved in large generality. \paragraph{Disc removal} We now give our first example of an equality rule, \emph{disc removal}. Disc removal removes unary composites, replacing them with the underlying term. We recall that for every \(n\), there exists the \(n\)-dimensional disc context \(D^n\), and that given a term \(t \in \Term_\Gamma\) and \(n\)-dimensional type \(A \in \Type_\Gamma\), there exists a substitution \(\{A,t\} : D^n \to \Gamma\). The unary composite of a term \(t\) of type \(A\) of dimension \(n\) is then the coherence: \[\Coh {D^n} {\wk(U^n)} {\{A,t\}}\] Disc removal equates this with the term \(t\), making the following rule admissible: \begin{mathpar} \inferrule{\Gamma \vdash t : A \\ \Gamma \vdash A}{\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} = t}\textsc{dr} \end{mathpar} with the removal of the disc coherence giving the name to this equality rule. Assembling disc removal into a rule set \(\mathcal{R}\) is simple, as it is possible to give a purely syntactic condition with no need to refer to typing. \begin{definition} The \emph{disc removal rule set}, \dr, is the set consisting of the triples: \[ (\Gamma, \Coh {D^n} {\wk(U^n)} {\{A,t\}}, t) \] for each context \(\Gamma\), type \(A : \Type_\Gamma\), and term \(t : \Term_\Gamma\), where \(n = \dim(A)\). A set of rules \(\mathcal{R}\) \emph{contains disc removal} if \(\dr \subseteq \mathcal{R}\). Further, we say that \(\mathcal{R}\) \emph{has disc removal} if the rule \textsc{dr} holds in the generated theory. \end{definition} The inference rule \textsc{dr} follows from \textsc{rule} and the typing properties about discs which will be given in \cref{sec:ruleset}.
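As a concrete example of such a triple, suppose \(\Gamma\) contains variables \(x, y : \star\) and \(f : \arr x \star y\), and let \(A = \arr x \star y\), so that \(n = 1\) and \(\{A, f\} = \langle \star, x, y, f \rangle\). The set \dr then contains the triple \[ (\Gamma, \Coh {D^1} {\wk(U^1)} {\langle \star, x, y, f \rangle}, f) \] equating the unary composite of \(f\) with \(f\) itself, an equality which holds in the generated theory whenever this unary composite is well-formed in \(\Gamma\).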
We draw attention to the typing premise of \textsc{rule}. If we know that the unary composite of a term \(t\) is well-formed, then it follows that \(t\) itself must have been well-formed, but we cannot infer that the term \(\Coh {D^n} {\wk(U^n)} {\{A,t\}}\) is well-formed from \(t\) being well-formed. In particular, knowing that \(t\) is well-formed does not constrain \(A\) at all, as nothing in the triple forces the given type \(A\) to be a type for \(t\). We must therefore include an additional typing premise if we want to avoid well-formed and non-well-formed terms being equated. \afterpage{% \clearpage% flush all other floats \ifodd\value{page} \else% \expandafter\afterpage% put it on the next page if this one is odd \fi {% \begin{figure}[hbtp] \centering \fbox{% \begin{subfigure}{0.47\textwidth} \begin{mathpar} \inferrule{ }{\star : \Type_\Gamma} \and \inferrule{x \in \Var(\Gamma)} {x : \Term_\Gamma} \and \inferrule{A : \Type_\Gamma}{\langle A \rangle : \arr \emptyset A \Gamma} \and \inferrule{ }{\emptyset : \Ctx} \and \inferrule{\Gamma : \Ctx \\ A : \Type_\Gamma}{\Gamma, (x : A) : \Ctx} \and \inferrule{\sigma : \arr \Delta A \Gamma \\ t : \Term_\Gamma \\ B : \Type_\Delta}{\langle \sigma , t \rangle : \arr {\Delta, (x : B)} A \Gamma} \and \inferrule{A : \Type_\Gamma \\ s : \Term_\Gamma \\ t : \Term_\Gamma} {\arr s A t : \Type_\Gamma} \and \inferrule{\\\\\Delta : \Ctx \\ A : \Type_\Delta \\ \sigma : \arr \Delta \star \Gamma}{\Coh \Delta A \sigma : \Term_\Gamma} \end{mathpar} \caption{Syntax.} \end{subfigure}} \hfill \fbox{% \begin{subfigure}{0.49\textwidth} \begin{mathpar} \inferrule{ }{\emptyset \vdash} \and \inferrule{\Gamma \vdash\\ \Gamma \vdash A}{\Gamma, (x : A) \vdash} \and \inferrule{ }{\Gamma \vdash \star} \and \inferrule{\Gamma \vdash s : A \\ \Gamma \vdash A \\ \Gamma \vdash t : A}{\Gamma \vdash \arr s A t} \and \inferrule{\Gamma \vdash A}{\Gamma \vdash \langle A \rangle : \emptyset} \and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash t : A\sub\sigma}{\Gamma \vdash \langle \sigma , t \rangle : \Delta, (x : A)} \and \inferrule{(x : A) \in \Gamma}{\Gamma \vdash x : A} \and \inferrule{\Gamma \vdash t : A\\ \Gamma \vdash A = B}{\Gamma \vdash t : B} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\(\Delta, \Supp(s), \Supp(t)) \in \mathcal{O}}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \end{mathpar} \caption{Typing.} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{0.9852\textwidth} \begin{mathpar} \inferrule{\Gamma \vdash s : A \\ (\Gamma, s, t) \in \mathcal{R}}{\Gamma \vdash s = t}\textsc{rule} \and \inferrule{x \in \Var(\Gamma)}{\Gamma \vdash x = x} \and \inferrule{\Gamma \vdash s = t}{\Gamma \vdash t = s} \and \inferrule{\Gamma \vdash s = t \\ \Gamma \vdash t = u}{\Gamma \vdash s = u} \and \inferrule{\Delta \vdash A = B \\ \Gamma \vdash \sigma = \tau}{\Gamma \vdash \Coh \Delta A \sigma = \Coh \Delta B \tau} \and \inferrule{ }{\Gamma \vdash \star = \star} \and \inferrule{\Gamma \vdash s = s' \\ \Gamma \vdash t = t' \\ \Gamma \vdash A = A'}{\Gamma \vdash \arr s A t = \arr {s'} {A'} {t'}}\and \inferrule{\Gamma \vdash A = B}{\Gamma \vdash \langle A \rangle = \langle B \rangle}\and \inferrule{\Gamma \vdash \sigma = \tau \\ \Gamma \vdash s = t}{\Gamma \vdash \langle \sigma, s \rangle = \langle \tau, t \rangle} \end{mathpar} \caption{Equality.} \label{fig:equality} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{0.47\textwidth} \vspace{3.7pt} \begin{mathpar}
\inferrule{ }{(x : \star) \vdash_{\mathsf{ps}} x : \star} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : A}{\Gamma, (y : A), (f : \arr x A y) \vdash_{\mathsf{ps}} f : \arr x A y} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \arr s A t}{\Gamma \vdash_{\mathsf{ps}} t : A} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \star}{\Gamma \vdash_{\mathsf{ps}}} \end{mathpar} \caption{Ps-contexts.} \end{subfigure}} \hfill \fbox{% \begin{subfigure}{0.49\textwidth} \begin{mathpar} \FV(\star) = \{\} \and \FV(\langle A \rangle) = \FV(A) \\ \FV(x) = \{x\} \text{ for }x \in \Var \\ \FV(\Coh \Delta A \sigma) = \FV(\sigma) \\ \FV(\arr s A t) = \FV(s) \cup \FV(A) \cup \FV(t) \\ \FV(\langle \sigma , t \rangle) = \FV(\sigma) \cup \FV(t) \end{mathpar} \caption{Free variables.} \end{subfigure}} \caption{\Cattr: syntax, typing, and operations.} \label{fig:cattr} \end{figure} \begin{figure} \ContinuedFloat \fbox{% \begin{subfigure}{1\textwidth} \begin{align*} \DC_\emptyset(\emptyset) &= \emptyset\\ \DC_{\Gamma, x : A}(V) &= \begin{cases*} \DC_\Gamma(V)&if \(x \not\in V\)\\ \{x\} \cup \DC_\Gamma(V \setminus \{x\} \cup \FV(A))&if \(x \in V\)\\ \end{cases*}\\ \Supp(t) &= \DC_\Gamma(\FV(t))\text{ for }t \in \Term_\Gamma\\ \Supp(A) &= \DC_\Gamma(\FV(A))\text{ for }A \in \Type_\Gamma\\ \Supp(\sigma) &= \DC_\Gamma(\FV(\sigma))\text{ for }\sigma : \arr {\Delta} A \Gamma \end{align*} \caption{Support.} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{\textwidth} \begin{align*} x \sub \sigma &= t\text{ if }(x \mapsto t) \in \sigma\\ \Coh \Theta A \tau \sub \sigma &= \begin{cases*} \Coh \Theta A {\tau \bullet \sigma}&if \(\dim(\ty(\sigma)) = 0\)\\ \Coh {\Sigma(\Theta)} {\Sigma(A)} {\Sigma(\tau)} \sub {\unrestrict\sigma}&otherwise \end{cases*} \\ \star \sub \sigma &= \ty(\sigma)\\ (\arr s A t) \sub \sigma &= \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}\\ \langle A \rangle \bullet \sigma &= \langle A \sub \sigma \rangle\\ \langle \tau , t \rangle \bullet \sigma &= \langle \tau \bullet \sigma , t \sub \sigma \rangle \end{align*} \caption{Substitution application.} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{0.475\textwidth} \begin{align*} \Sigma (\emptyset) &= (N : \star), (S : \star)\\ \Sigma (\Gamma, (x : A)) &= \Sigma \Gamma, (x : \Sigma A)\\ \Sigma (\star) &= \arr N \star S\\ \Sigma (\arr s A t) &= \arr {\Sigma s} {\Sigma A} {\Sigma t}\\ \Sigma (x) &= x\\ \Sigma (\Coh \Delta A \sigma) &= \Coh {\Sigma(\Delta)} {\Sigma(A)} {\Sigma(\sigma)}\\ \Sigma(\sigma) &= \unrestrict(\Sigma'(\sigma))\\[7.25pt] \Sigma'(\langle A \rangle) &= \langle \Sigma(A) \rangle\\ \Sigma'(\langle \sigma, t \rangle) &= \langle \Sigma'(\sigma), \Sigma(t) \rangle\\ \unrestrict\langle \arr s A t \rangle &= \langle A , s , t \rangle\\ \unrestrict\langle \sigma, t \rangle &= \langle \unrestrict \sigma, t \rangle \end{align*} \caption{Suspension.} \end{subfigure}} \hfill \begin{subfigure}{0.49\textwidth} \fbox{% \begin{subfigure}{1\textwidth} \begin{align*} \wk(\star) &= \star\\ \wk(\arr s A t) &= \arr {\wk(s)} {\wk(A)} {\wk(t)}\\ \wk(x) &= x\\ \wk(\Coh \Delta A \sigma) &= \Coh \Delta A {\wk(\sigma)}\\ \wk(\langle A \rangle) &= \langle \wk(A) \rangle\\ \wk(\langle \sigma, t \rangle) &= \langle \wk(\sigma), \wk(t) \rangle \end{align*} \caption{Weakening.} \label{fig:wk} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{1\textwidth} \begin{align*} \id_\emptyset &= \langle \star \rangle\\ \id_{\Gamma, (x : A)} &= \langle \wk(\id_\Gamma), x \rangle \end{align*} \caption{Identity substitution.} \end{subfigure}} \end{subfigure} \caption{\Cattr: syntax, typing, and
operations.} \end{figure} }% } \section[The set of operations \texorpdfstring{\(\mathcal{O}\)}{O}]{The set of operations \texorpdfstring{\boldmath\(\mathcal{O}\)}{O}} \label{sec:support} In \cref{sec:typing-equality}, we introduced a set of operations \(\mathcal{O}\), which allows us to vary the operations available in the theory, much like the set \(\mathcal{R}\) allows us to vary the equality rules of the theory. The set \(\mathcal{O}\) replaces the conditions on the support of the type contained in a coherence, and consists of a set of triples of a context \(\Delta\), along with two sets \(U,V \subseteq \Var(\Delta)\). A certain type \(\arr s A t : \Type_\Delta\) is permitted to appear in a coherence exactly when \((\Delta , \Supp(s), \Supp(t))\) is an element of \(\mathcal{O}\). There are two key advantages to setting up the theory this way. \begin{itemize} \item A clear separation is introduced in the metatheory and formalisation between properties that are specific to the support conditions in \Catt and those that are independent of the specific support conditions present. \item The results in the following sections can be proven generically for different variants of \Catt. \end{itemize} In particular, the main utility we extract in this thesis is the ability to define groupoidal versions of the various semistrict theories we define in \cref{cha:cattstrict}. By letting \(\mathcal{O}\) consist of all possible triples, the support condition is effectively removed, producing a version of \Catt closer to Grothendieck's definition of \(\infty\)-groupoid (see \cref{sec:weak}). \subsection{Operation sets} \label{sec:operation-sets} As previously mentioned, an operation set \(\mathcal{O}\) consists of a collection of triples of a context \(\Delta\) and two subsets of the variables of \(\Delta\). We call a subset of the variables of a context a \emph{variable set}. In the formalisation, these variable sets are given as a list of booleans, one boolean for each variable of the context. These are given in \module{Catt.Support}, which also contains many constructions on them, including unions of these sets, subset relations, and the free variables of each piece of syntax. The variable sets of \(\Delta\) form a lattice with top element \(\Var(\Delta)\) and bottom element \(\emptyset\). The free variable constructions commute with weakening, as is proved in \module{Catt.Support.Properties} by mutual induction. We recall the function \(\DC\) on these variable sets, given by \func{Catt.Support}{DC} in the formalisation, which produces the downwards closure of a variable set. This admits the following properties: \begin{proposition} \(\DC\) is an idempotent join-semilattice homomorphism. It preserves binary joins (unions), subset inclusions, and the top and bottom elements of the lattice. \end{proposition} We further define the application of a substitution to a variable set below. \begin{definition} Given a variable set \(V\) of \(\Delta\) and (regular) substitution \(\sigma : \Delta \to \Gamma\), we define the application of \(\sigma\) to \(V\), written \(V \sub \sigma\), to be a variable set of \(\Gamma\) given by: \begin{align*} V \sub {\langle \star \rangle} &= \emptyset\\ V \sub {\langle \sigma , t \rangle} &= \begin{cases*} (V \setminus \{x\}) \sub \sigma \cup \FV(t)&if \(x \in V\)\\ V \sub \sigma &otherwise \end{cases*} \end{align*} where \(x\) is assumed to be the last variable of \(\Delta\) in the second case.
\end{definition} We note that when representing variable sets as a list of booleans, these definitions are given by simple inductions on the length of the context. These constructions admit the following properties. \begin{proposition} \label{prop:vs-sub} Let \(\Delta\) be a context. Then the function taking a variable set \(V\) of \(\Delta\) to \(V \sub \sigma\) is a join-semilattice homomorphism for any substitution \(\sigma : \Delta \to \Gamma\). Further, for a term \(t : \Term_\Delta\), a type \(A : \Type_\Delta\), or a substitution \(\tau : \arr \Theta A \Delta\), the following equalities hold: \begin{align*} \FV(t \sub \sigma) &= \FV(t) \sub \sigma \\ \FV(A \sub \sigma) &= \FV(A) \sub \sigma \\ \FV(\tau \bullet \sigma) &= \FV(\tau) \sub \sigma \end{align*} and hence \(\Var(\Delta) \sub \sigma = \FV(\id_\Delta) \sub \sigma = \FV(\id_\Delta \bullet \sigma) = \FV(\sigma)\). For any variable set \(V \subseteq \Var(\Theta)\) we have: \[ V \sub {\id_\Theta} = V \qquad V \sub {\tau \bullet \sigma} = V \sub \tau \sub \sigma \] for \(\tau : \Theta \to \Delta\) and \(\sigma : \Delta \to \Gamma\). \end{proposition} \begin{proof} All proofs proceed by induction on the length of the context \(\Delta\) and are given in \module{Catt.Support.Properties}. \end{proof} An operation set is then a subset of: \[ \Sigma_{\Delta : \Ctx} \mathcal{P}(\Var(\Delta)) \times \mathcal{P}(\Var(\Delta)) \] In the formalisation this is defined in \module{Catt.Ops} to be a function from a context and two variable sets of that context to a universe. \begin{remark} The definition of an operation set in the formalisation deviates from the presentation given here, as the version in the formalisation is proof relevant. The proof relevant definition allows us to give any type as the type of witnesses that a certain triple appears in \(\mathcal{O}\), including a type containing many distinct witnesses. If we wished to recover a definition closer to the classical set-based definition, we could enforce that this function has a universe of propositions as its codomain, instead of a universe of types, and use propositional truncations to define various versions of \(\mathcal{O}\). This is, however, unnecessary for any of the proofs appearing in this thesis, hence the choice of the proof relevant definition for simplicity. A similar observation will apply to the definition of equality rule sets introduced in \cref{sec:ruleset}. \end{remark} We can now introduce our first operation set, the operation set for groupoidal operations, which imposes no support conditions and allows all operations. \begin{definition} We define the \emph{groupoidal operation set} \(\Group\) as: \[ \Group = \{ (\Delta, U, V) \mid \Delta : \Ctx, U \subseteq \Var(\Delta), V \subseteq \Var(\Delta) \} \] We will refer to \Cattr with the operation set \(\Group\) as \emph{groupoidal \Cattr} or \emph{groupoidal \Catt} (when \(\mathcal{R} = \emptyset\)). \end{definition} To recover the standard definition of \Catt, we must define the boundary sets of a pasting diagram. In \cref{sec:typing-catt}, these are given as the free variables of the boundary inclusion substitutions of pasting diagrams. Here we will instead give a direct definition of the variable sets corresponding to the free variables of the substitutions, delaying the definition of boundary inclusions of pasting diagrams until \cref{sec:trees}. \begin{definition} Let \(\Delta\) be a ps-context.
Define the \(n\)-boundary variable sets \(\bdry n - \Delta\) and \(\bdry n + \Delta\) by induction on \(\Delta\): \begin{align*} \bdry n \epsilon {(x : \star)} &= \{ x \}\\ \bdry n \epsilon {\Gamma, (y : A) , (f : \arr x A y)} &= \begin{cases*} \bdry n \epsilon \Gamma&if \(n < \dim(A)\)\\ \bdry n - \Gamma&if \(n = \dim(A)\) and \(\epsilon = -\)\\ (\bdry n + \Gamma \cup \{ y \}) \setminus \{x\}&if \(n = \dim(A)\) and \(\epsilon = +\)\\ \bdry n \epsilon \Gamma \cup \{ y , f \}&otherwise \end{cases*} \end{align*} These boundary sets appear in the formalisation as \func{Catt.Support}{pd-bd-vs}. \end{definition} The following lemma is immediate: \begin{lemma} \label{lem:bdry-full} If \(n \geq \dim(\Delta)\), then \(\bdry n \epsilon \Delta = \Var(\Delta)\). \end{lemma} \begin{proof} A simple induction on the definition. A formalised proof appears as \func{Catt.Support.Properties}{pd-bd-vs-full} in the module \module{Catt.Support.Properties}. \end{proof} With this definition we can introduce the regular operation set, which recovers the regular support conditions used in the definition of \Catt. \begin{definition} The \emph{regular operation set} \Reg is defined to be: \[ \Reg = \{ (\Delta, \Var(\Delta), \Var(\Delta)) \mid \Delta \vdash_{\mathsf{ps}} \} \cup \{ (\Delta, \bdry {\dim(\Delta)-1} - \Delta, \bdry {\dim(\Delta)-1} + \Delta) \mid \Delta \vdash_{\mathsf{ps}} \} \] The first component allows equivalences to be well-formed, and the second gives the support condition for composites. \end{definition} The regular operation set has a more standard presentation. \begin{proposition} \label{prop:std-op} Let the set \Std of standard operations be defined as: \[ \Std = \{ (\Delta, \bdry n - \Delta, \bdry n + \Delta) \mid \Delta \vdash_{\mathsf{ps}} , n \geq \dim(\Delta) - 1 \} \] Then \(\Std = \Reg\). \end{proposition} \begin{proof} Suppose \((\Delta, U, V) \in \Reg\). If \(U = \bdry {\dim(\Delta) - 1} - \Delta\) and \(V = \bdry {\dim(\Delta) - 1} + \Delta\), then \((\Delta , U ,V)\) is trivially in \Std by letting \(n = \dim(\Delta) - 1\). If instead \(U = V = \Var(\Delta)\), then \((\Delta, U , V) \in \Std\) by letting \(n = \dim(\Delta)\) and applying \cref{lem:bdry-full}. Conversely, assume \((\Delta, U, V) \in \Std\). Then there is \(n \geq \dim(\Delta) - 1\) with \(U = \bdry n - \Delta\) and \(V = \bdry n + \Delta\). If \(n = \dim(\Delta) - 1\) then \((\Delta, U ,V)\) is trivially in \(\Reg\), and otherwise by \cref{lem:bdry-full} we have \(U = V = \Var(\Delta)\), and so \((\Delta,U,V)\) is again an element of \Reg. Hence, \(\Reg = \Std\). \end{proof} This more uniform presentation is sometimes easier to work with, and will be used to prove properties of \Reg in \cref{sec:operation-properties}. \begin{remark} By letting \(\mathcal{O} = \emptyset\), we recover the type theory \textsf{GSeTT}~\cite{benjamin2021globular}, a type theory for globular sets. \end{remark} It would be possible to generalise the notion of operation set presented here by instead letting the set \(\mathcal{O}\) consist of triples \((\Delta, s,t)\) where \(s\) and \(t\) are terms over \(\Delta\) instead of variable sets over \(\Delta\). This would allow more control over which operations were allowed in the theory. As an example, we would be able to restrict the class of composites to contain only the standard composites, or even further restrict it to binary composites. This is, however, unnecessary for presenting the regular and groupoidal versions of \Cattr.
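Returning to the regular operation set, a small worked example may be helpful. Consider the ps-context \(\Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)\) of dimension \(1\). Unfolding the definition of the boundary variable sets gives: \[ \bdry 0 - \Delta = \{x\} \qquad \bdry 0 + \Delta = (\{y\} \cup \{z\}) \setminus \{y\} = \{z\} \] and so the triple \((\Delta, \{x\}, \{z\})\) is an element of \Reg. This is exactly the triple that permits the binary composite of \(f\) and \(g\), a coherence over a type \(\arr s \star t\) with \(\Supp(s) = \{x\}\) and \(\Supp(t) = \{z\}\).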
By only allowing the set of available operations to be specified up to the support of the contained terms, it is possible to show that a coherence being an operation is closed under equality, by proving that equality preserves the support of a term. \subsection{Operation properties} \label{sec:operation-properties} Currently, our set of operations is completely unconstrained, and so we are limited in the constructions that can be made in \Cattr. We therefore constrain these sets in two ways. The first enforces that our set of operations is closed under suspension, for which we need to be able to suspend variable sets. This is defined in the formalisation as \func{Catt.Suspension.Support}{susp-vs}. \begin{definition} Let \(\Delta\) be a context. The suspension of a variable set \(V\) over \(\Delta\) is defined to be: \[ \Sigma(V) = \{ N , S \} \cup V \] which is a variable set over \(\Sigma(\Delta)\). \end{definition} The suspension of a variable set commutes with taking the support of a piece of syntax, as shown in the next lemma. \begin{lemma} \label{lem:susp-vs-prop} The following equalities hold: \[ \Supp(\Sigma(s)) = \Sigma(\Supp(s)) \qquad \Supp(\Sigma(A)) = \Sigma(\Supp(A)) \qquad \Supp(\Sigma(\sigma)) = \Sigma(\Supp(\sigma)) \] for term \(s : \Term_\Gamma\), type \(A : \Type_\Gamma\), and substitution \(\sigma : \arr \Delta \star \Gamma\). \end{lemma} \begin{proof} All equalities hold by a mutual induction on terms, types, and substitutions, with a secondary induction on the context \(\Gamma\) for the case of the variables and the base type \(\star\). These calculations are given in \module{Catt.Suspension.Support}. \end{proof} We can then define our first property on operation sets. \begin{definition} An operation set \(\mathcal{O}\) is \emph{suspendable} if: \[ (\Delta, U, V) \in \mathcal{O} \implies (\Sigma(\Delta), \Sigma(U), \Sigma(V)) \in \mathcal{O} \] for all \(\Delta : \Ctx\) and \(U, V \subseteq \Var(\Delta)\). \end{definition} The groupoidal operation set is trivially suspendable. To show that the regular operation set is suspendable, we prove the following proposition. \begin{proposition} Let \(\Delta\) be a ps-context. Then: \[\Sigma(\bdry n \epsilon \Delta) = \bdry {n + 1} {\epsilon} {\Sigma(\Delta)}\] for \(n \in \mathbb{N}\) and \(\epsilon \in \{-,+\}\). \end{proposition} \begin{proof} We proceed by induction on \(\Delta\). First suppose \(\Delta = (x : \star)\). We then have: \[ \Sigma(\bdry n \epsilon {(x : \star)}) = \Sigma(\{x\}) = \{N,S,x\} = \bdry {n + 1} {\epsilon} {\Sigma((x: \star))} \] Now suppose that \(\Delta = \Delta', (y : A), (f : \arr x A y)\). We split into cases on \(n\), \(\dim(A)\), and \(\epsilon\): \begin{itemize} \item If \(n < \dim(A)\) then \begin{align*} \Sigma(\bdry n \epsilon \Delta) &= \Sigma(\bdry n \epsilon {\Delta'})\\ &= \bdry {n + 1} {\epsilon} {\Sigma(\Delta')} &\text{by inductive hypothesis}\\ &= \bdry {n + 1} {\epsilon} {\Sigma(\Delta)} &\text{as }n + 1 < \dim(\Sigma(A))\\ \intertext{ \item If \(n = \dim(A)\) and \(\epsilon = -\) then the proof is similar to the preceding case.
\item If \(n = \dim(A)\) and \(\epsilon = +\) then: } \Sigma(\bdry n + \Delta) &= \Sigma((\bdry n + {\Delta'} \cup \{y\}) \setminus \{x\})\\ &= (\Sigma(\bdry n + {\Delta'}) \cup \{y\}) \setminus \{x\} \\ &= (\bdry {n+1} + {\Sigma(\Delta')} \cup \{y\}) \setminus \{x\} &\text{by inductive hypothesis}\\ &= \bdry {n+1} + {\Sigma(\Delta)} &\text{as }n + 1 = \dim(\Sigma(A))\\ \intertext{\item If \(n > \dim(A)\) then} \Sigma(\bdry n \epsilon \Delta) &= \Sigma((\bdry n \epsilon {\Delta'}) \cup \{y,f\})\\ &= \Sigma(\bdry n \epsilon {\Delta'}) \cup \{y,f\} \\ &= \bdry {n+1} \epsilon {\Sigma(\Delta')} \cup \{y, f\} &\text{by inductive hypothesis}\\ &= \bdry {n+1} \epsilon {\Sigma(\Delta)} &\text{as }n + 1 > \dim(\Sigma(A)) \end{align*} \end{itemize} Hence, the desired equality holds in all cases. \end{proof} \begin{corollary} The regular operation set is suspendable. \end{corollary} \begin{proof} By \cref{prop:std-op}, it suffices to show that the standard operation set is suspendable, which is clear from the above proposition. \end{proof} The second restriction we put on operation sets is that there are enough operations to create the standard coherences presented in \cref{sec:basic-constructions}. \begin{definition} An operation set \(\mathcal{O}\) \emph{contains the standard operations} if \(\Std \subseteq \mathcal{O}\). \end{definition} The groupoidal operation set clearly contains the standard operations, and the regular operation set also does due to \cref{prop:std-op}. The empty operation set does not contain the standard operations. We end this section with the following proposition about the support of terms in a disc. \begin{proposition} For \(n \in \mathbb{N}\) the following two equations hold: \[ \bdry n - {D^{n+1}} = \Var(S^n) \cup \{d_n^-\} = \Var(D^n) \qquad \bdry n + {D^{n+1}} = \Var(S^n) \cup \{d_n^+\}\] Further, the following equations hold: \[\FV(U^n) = \Var(S^n) \qquad \Supp(d_n^-) = \Var(D^n) = \bdry n - {D^{n+1}} \qquad \Supp(d_n^+) = \bdry n + {D^{n+1}} \] again for any \(n \in \mathbb{N}\). \end{proposition} \begin{proof} The first equations follow by a simple case analysis, using that \(\bdry n - {D^n} = \Var(D^n)\) by \cref{lem:bdry-full,item:disc-prop-dim}. The free variables of \(U^n\) are easily calculated inductively, and the support of \(d_n^-\) and \(d_n^+\) are easy to compute using the first parts of the proposition, and that \(\FV(U^n) \subseteq \Supp(d_n^-)\) and \(\FV(U^n) \subseteq \Supp(d_n^+)\) as the support of a term is downwards closed. These proofs are formalised in \module{Catt.Discs.Support}. \end{proof} \begin{corollary} \label{cor:disc-op} Both \((D^{n+1}, \Supp(d_n^-), \Supp(d_n^+))\) and \((D^n, \Supp(d_n), \Supp(d_n))\) are in \(\Std\) for each \(n\). \end{corollary} \section[The set of equality rules \texorpdfstring{\(\mathcal{R}\)}{R}]{The set of equality rules \texorpdfstring{\boldmath\(\mathcal{R}\)}{R}} \label{sec:ruleset} In \Cattr, the definitional equality relation is generated by a set of rules \(\mathcal{R}\) formed of triples containing a context and two terms in the context which should be made equal. In this section we discuss some operations on these equality sets and properties that they may have. \begin{remark} In the formalisation the set of equality rules is defined similarly to the set of operations \(\mathcal{O}\). It is defined as a function that takes a context and two terms over that context and returns a type. It is therefore proof relevant in the same way as the operation sets. \end{remark} The equality rule sets inherit some operations and relations just by being sets.
We can easily form the empty equality set, which allows us to recover the weak type theory \Catt, and given two equality sets we can take their union, to get a type theory with equalities from both sets (we note that the equality generated by a union is in general coarser than the union of the equalities generated by the individual sets). To aid readability when reasoning about typing and equality with multiple distinct rule sets, we may subscript the turnstile symbol in various judgements with the set of equality rules being used. For example, we may write the judgement for typing of a term \(t\) in the type theory generated from rules \(\mathcal{R}\) as \[ \Gamma \vdash_{\mathcal{R}} t : A \] and the corresponding judgement for the equality of two terms \(s\) and \(t\) as \[ \Gamma \vdash_{\mathcal{R}} s = t \] Equality rule sets can also be subsets of each other, leading to the following lemma. \begin{lemma} \label{lem:subset-lem} Let \(\mathcal{R}\) and \(\mathcal{S}\) be two equality rule sets and suppose that \[ \Gamma \vdash_{\mathcal{S}} s = t\] for all \((\Gamma,s,t) \in \mathcal{R}\) with \(\Gamma \vdash_{\mathcal{S}} s : A\) for some \(A : \Type_\Gamma\). Then the following inference rules hold: \begin{mathpar} \inferrule{\Gamma \vdash_{\mathcal{R}}}{\Gamma \vdash_{\mathcal{S}}} \and \inferrule{\Gamma \vdash_{\mathcal{R}} t : A}{\Gamma \vdash_{\mathcal{S}} t : A} \and \inferrule{\Gamma \vdash_{\mathcal{R}} A}{\Gamma \vdash_{\mathcal{S}} A} \and \inferrule{\Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{S}} \sigma : \Delta} \\ \inferrule{\Gamma \vdash_{\mathcal{R}} s = t}{\Gamma \vdash_{\mathcal{S}} s = t} \and \inferrule{\Gamma \vdash_{\mathcal{R}} A = B}{\Gamma \vdash_{\mathcal{S}} A = B} \and \inferrule{\Gamma \vdash_{\mathcal{R}} \sigma = \tau}{\Gamma \vdash_{\mathcal{S}} \sigma = \tau} \end{mathpar} In particular these inference rules hold when \(\mathcal{R} \subseteq \mathcal{S}\). \end{lemma} \begin{proof} Follows from a simple induction. Details are given in the formalisation in module \module{Catt.Typing.Rule.Properties}. \end{proof} \begin{corollary} \label{cor:catt-to-r} Any context, term, type, or substitution that is well-formed in \Catt is also well-formed in \Cattr, for any equality set \(\mathcal{R}\). \end{corollary} Furthermore, we can immediately show that the application of a well-formed substitution to a piece of syntax that is well-formed in \Catt is again well-formed. \begin{lemma} \label{lem:sub-catt} Let \(\mathcal{R}\) be any equality rule set. Then the following inference rules hold for \(\sigma : \arr \Delta \star \Gamma\): \begin{mathpar} \inferrule{\Delta \vdash_\emptyset A \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} A \sub \sigma }\and \inferrule{\Delta \vdash_\emptyset s : A \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} s \sub \sigma : A \sub \sigma } \and \inferrule{\Delta \vdash_\emptyset \tau : \Theta \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} \tau \bullet \sigma : \Theta } \end{mathpar} where the judgements with a subscript empty set are judgements in the theory generated by the empty rule set (judgements in \Catt). \end{lemma} \begin{proof} Follows immediately from a mutual induction, using that any equality in \Catt is syntactic. The proof is formalised in \module{Catt.Typing.Properties.Base}.
\end{proof} An arbitrary set \(\mathcal{R}\) places very few restrictions on the equality relation it generates, and hence on the terms that are well-formed because of it. A rule set \(\mathcal{R}\) could identify terms of different types, or identify two different variables (or even identify all variables or terms). This makes it difficult to prove much about the theory generated by an arbitrary set \(\mathcal{R}\). To this end, we introduce certain conditions that these equality rule sets can satisfy. The first three of these conditions put certain closure properties on the set of rules \(\mathcal{R}\), and each allows various constructions to be well-formed. We call theories that satisfy these three properties \emph{tame theories} and introduce these in \cref{sec:tame-theories}. In \cref{sec:further-conditions}, we introduce two more conditions which take the form of a property that the generated equality must satisfy. By introducing these conditions, we can prove various metatheoretic properties about \Cattr in a modular and generic way. This will allow the re-use of many constructions and proofs about the properties of these constructions in \cref{cha:cattstrict}, where two distinct type theories for semistrict \(\infty\)-categories are given. In the following subsections, we will also show that the rule set for disc removal satisfies all these conditions. For all these conditions, we will have that if the condition holds on \(\mathcal{R}\) and on \(\mathcal{S}\) then it also holds on \(\mathcal{R}\cup \mathcal{S}\), and so these conditions can be proved individually for each rule set that is introduced. Further, the empty set will satisfy all of these conditions vacuously, and so all proofs and constructions in this section apply to \Catt. \subsection{Tame theories} \label{sec:tame-theories} Here we introduce the three core conditions on the equality rule set \(\mathcal{R}\) which we expect to hold for any reasonable choice of rule set: \begin{itemize} \item The \emph{weakening condition}, which allows weakening to be well-formed. \item The \emph{suspension condition}, which allows suspension to be well-formed. \item The \emph{substitution condition}, which implies that the application of substitution to terms, types, and other substitutions (as substitution composition) preserves typing and equality. \end{itemize} We call an equality rule set \emph{tame} if it satisfies all three of these conditions, and call the corresponding theory \Cattr a \emph{tame theory}. \paragraph{Weakening condition} For the weakening operation to be well-formed, meaning that the weakening of a well-formed piece of syntax is itself well-formed, the following closure property must hold on the set of rules \(\mathcal{R}\). \begin{definition} A set of rules \(\mathcal{R}\) satisfies the \emph{weakening condition} if for all \((\Gamma,s,t) \in \mathcal{R}\) we have: \[ ((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R} \] for all \(A : \Type_\Gamma\). \end{definition} The following proposition is immediately provable by mutual induction on typing and equality. Its proof is given in \module{Catt.Typing.Properties.Weakening}. \begin{proposition} Let \(\mathcal{R}\) satisfy the weakening condition. Then the following inference rules are admissible in \Cattr.
\begin{mathpar} \inferrule{\Gamma \vdash B}{\Gamma, (x : A) \vdash \wk(B)} \and \inferrule{\Gamma \vdash s : B}{\Gamma, (x : A) \vdash \wk(s) : \wk(B)} \and \inferrule{\Gamma \vdash \sigma : \Delta}{\Gamma, (x : A) \vdash \wk(\sigma) : \Delta} \end{mathpar} for types \(A,B : \Type_\Gamma\), term \(s : \Term_\Gamma\) and substitution \(\sigma : \arr \Delta C \Gamma\). \end{proposition} \begin{corollary} \label{cor:id-sub-ty} If \(\mathcal{R}\) satisfies the weakening condition then: \[ \Gamma \vdash \id_\Gamma : \Gamma \] for any \(\Gamma : \Ctx\). \end{corollary} Using only the above proposition we can immediately prove typing properties for several constructions using discs. \begin{lemma} \label{lem:disc-typing} Suppose the weakening condition holds. Then the following judgements hold: \[ S^n \vdash U^n \qquad S^n \vdash \qquad D^n \vdash \] for all \(n \in \mathbb{N}\). Further, the following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash A \\ n = \dim(A)} {\Gamma \vdash \{A\} : S^n} \and \inferrule{\Gamma \vdash A \\ n = \dim(A) \\ \Gamma \vdash s : A} { \Gamma \vdash \{A,s\} : D^n} \\ \inferrule{\Gamma \vdash \{A\} : S^n}{\Gamma \vdash A} \and \inferrule{\Gamma \vdash \{A,s\} : D^n}{\Gamma \vdash A} \and \inferrule{\Gamma \vdash \{A,s\} : D^n}{\Gamma \vdash s : A} \end{mathpar} for \(A : \Type_\Gamma\) and \(s : \Term_\Gamma\). \end{lemma} \begin{proof} The first three typing judgements follow from a simple mutual induction, making use of the typing of weakening. We prove that \(\Gamma \vdash \{A\} : S^n\) by induction on \(n\) and \(A\). The base case is trivial. For the inductive step we assume that \(\Gamma \vdash \arr s A t\), with \(n = \dim(A)\), and want to show that: \[ \Gamma \vdash \langle \{A\},s ,t \rangle : S^n, (d_n^- : U^n), (d_n^+ : \wk(U^n)) \] The judgement \(\Gamma \vdash \{A\} : S^n\) holds by inductive hypothesis, and so it remains to show that the following two judgements hold: \[ \Gamma \vdash s : U^n \sub {\{A\}} \qquad \Gamma \vdash t : \wk(U^n)\sub{\langle\{A\}, s\rangle} \] As \(\Gamma \vdash \arr s A t\), we know (by case analysis on the typing derivation) that \(\Gamma \vdash s : A\) and \(\Gamma \vdash t : A\). These judgements are sufficient to finish the proof, since \(A \equiv U^n \sub {\{A\}} \equiv \wk(U^n) \sub {\langle \{A\}, s \rangle}\) by \cref{item:disc-prop-sub-from} and the interaction of weakening with substitution application. To show that \(\Gamma \vdash A\) follows from \(\Gamma \vdash \{A\} : S^n\), we instead show that \(\Gamma \vdash U^n \sub {\{A\}}\), leveraging that typing is invariant under syntactic equality. The typing of \(U^n \sub {\{A\}}\) follows from \(U^n\) being well-formed in \Catt (as it is well-formed in any theory with the weakening property), and \cref{lem:sub-catt}. The second to last inference rule follows trivially from the preceding one. For the last rule, we get that \(\Gamma \vdash s : U^n\sub{\{A\}}\) by case analysis on \(\Gamma \vdash \{A,s\} : D^n\), and so we are finished by the invariance of typing rules under syntactic equality. \end{proof} If we further have that the set of operations includes the standard operations then we get the following corollary. \begin{corollary} \label{cor:id-typing} Suppose that \(\mathcal{O}\) contains the standard operations in addition to \(\mathcal{R}\) satisfying the weakening condition.
Then the following are equivalent: \begin{itemize} \item \(\Gamma \vdash A\) and \(\Gamma \vdash t : A\), \item There exists some \(B: \Type_\Gamma\) such that \(\Gamma \vdash \id(A,t) : B\), \item \(\Gamma \vdash \id(A,t) : \arr t A t\). \end{itemize} If we further have that \(\dim(A) \neq 0\) then the following two conditions are also equivalent: \begin{itemize} \item There exists some \(B: \Type_\Gamma\) such that \(\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B\), \item \(\Gamma \vdash \Coh{D^n} {\wk(U^n)} {\{A,t\}} : A\). \end{itemize} where \(n = \dim(A)\). \end{corollary} \begin{proof} The proof follows from \cref{lem:disc-typing,item:disc-prop-sub-from,cor:disc-op}. \end{proof} We end this discussion with the following \lcnamecref{prop:dr-weak}. \begin{proposition} \label{prop:dr-weak} The set \dr satisfies the weakening condition. \end{proposition} \begin{proof} It suffices to show that, for all \(\Gamma : \Ctx\), \(A, B : \Type_\Gamma\), and \(t : \Term_\Gamma\), we have: \[ ((\Gamma, (x : B)), \Coh {D^n} {\wk(U^n)} {\wk(\{A,t\})}, \wk(t)) \in \dr \] when \(n = \dim(A)\). By \cref{item:disc-prop-wk}, \(\wk(\{A,t\}) \equiv \{ \wk(A), \wk(t)\}\) and so the triple above is clearly contained in \dr. \end{proof} The semistrict type theories \Cattsu and \Cattsua (which will be introduced in \cref{sec:cattsu,sec:cattsua}) will be generated by equality rule sets that are the union of multiple smaller rule sets (including disc removal). Since the weakening condition is clearly preserved under unions, we will be able to show that the rule sets generating \Cattsu and \Cattsua satisfy the weakening condition by showing that it is satisfied by each individual component. \paragraph{Suspension condition} For suspension, we introduce the following condition, which is similar to the corresponding condition for weakening. \begin{definition} A set of equality rules \(\mathcal{R}\) satisfies the \emph{suspension condition} if \[ (\Sigma(\Gamma), \Sigma(s), \Sigma(t)) \in \mathcal{R} \] for all \((\Gamma,s,t) \in \mathcal{R}\). \end{definition} If the set of operations \(\mathcal{O}\) is suspendable, then this condition is sufficient to show that the suspension of a well-formed piece of syntax is well-formed. \begin{proposition} Suppose \(\mathcal{O}\) is suspendable and \(\mathcal{R}\) satisfies the suspension condition. Then the following inference rules are admissible for \(\Gamma, \Delta, \Delta' : \Ctx\), \(A,B,C,D : \Type_\Gamma\), \(s,t : \Term_\Gamma\), \(\sigma : \arr \Delta C \Gamma\), and \(\tau : \arr {\Delta'} D \Gamma\).
\begin{mathpar} \inferrule{\Gamma \vdash}{\Sigma(\Gamma) \vdash}\and \inferrule{\Gamma \vdash A}{\Sigma(\Gamma) \vdash \Sigma(A)}\and \inferrule{\Gamma \vdash s : A}{\Sigma(\Gamma) \vdash \Sigma(s) : \Sigma(A)}\and \inferrule{\Gamma \vdash \sigma : \Delta}{\Sigma(\Gamma) \vdash \Sigma'(\sigma) : \Delta}\\ \inferrule{\Gamma \vdash A = B}{\Sigma(\Gamma) \vdash \Sigma(A) = \Sigma(B)}\and \inferrule{\Gamma \vdash s = t}{\Sigma(\Gamma) \vdash \Sigma(s) = \Sigma(t)}\and \inferrule{\Gamma \vdash \sigma = \tau}{\Sigma(\Gamma) \vdash \Sigma'(\sigma) = \Sigma'(\tau)} \end{mathpar} For all \(\mu : \arr \Delta {\arr s A t} \Gamma\) and \(\mu' : \arr {\Delta'} {\arr {s'} {A'} {t'}} {\Gamma}\) the following two rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash \mu : \Delta}{\Gamma \vdash \unrestrict \mu : \Sigma(\Delta)} \and \inferrule{\Gamma \vdash \mu = \mu'}{\Gamma \vdash \unrestrict \mu = \unrestrict \mu'} \end{mathpar} and so the inference rules \begin{mathpar} \inferrule{\Gamma \vdash \sigma : \Delta}{\Sigma(\Gamma) \vdash \Sigma(\sigma) : \Sigma(\Delta)} \and \inferrule{\Gamma \vdash \sigma = \tau} {\Sigma(\Gamma) \vdash \Sigma(\sigma) = \Sigma(\tau)} \end{mathpar} hold for \(\sigma : \arr \Delta \star \Gamma\) and \(\tau : \arr {\Delta'} \star \Gamma\). \end{proposition} \begin{proof} The rules concerning the unrestriction operation follow by simple induction on the typing judgement or equality in the premise, and in fact do not need the suspension condition. The remainder of the rules follow from a routine mutual induction on all typing and equality rules, which can be found in \module{Catt.Suspension.Typing}. The suspendability of the operation set is used for the case involving the typing rule for coherences, which also makes use of \cref{lem:susp-vs-prop}. In this case, the functoriality of suspension is used to show that the coherence has the correct type. The suspension condition is used for the \textsc{rule} constructor of the equality relation on terms. \end{proof} Similarly to the weakening condition, the suspension condition is closed under unions of rule sets, and we can show it is satisfied by \dr, with a similar proof to the proof for weakening. \begin{proposition} \label{prop:dr-susp} The set \dr satisfies the suspension condition. \end{proposition} \begin{proof} It is sufficient to prove that, for all \(\Gamma : \Ctx\), \(A : \Type_\Gamma\), and \(t : \Term_\Gamma\), we have: \[(\Sigma(\Gamma), \Coh {\Sigma(D^n)} {\Sigma(\wk(U^n))} {\Sigma(\{A,t\})}, \Sigma(t)) \in \dr\] when \(n = \dim(A)\). By \cref{item:disc-prop-susp}, we get that \(\Sigma(D^n) \equiv D^{n+1}\) and \(\Sigma(\wk(U^n)) \equiv \wk(\Sigma(U^n)) \equiv \wk(U^{n+1})\). By \cref{item:disc-prop-sub-susp}, \(\Sigma(\{A,t\}) \equiv \{\Sigma(A),\Sigma(t)\}\). Therefore, it is sufficient to show that: \[(\Sigma(\Gamma), \Coh {D^{n+1}} {\wk(U^{n+1})} {\{\Sigma(A),\Sigma(t)\}}, \Sigma(t)) \in \dr\] which is clear as \(\dim(\Sigma(A)) = \dim(A) + 1 = n+1\). \end{proof} \paragraph{Substitution condition} The substitution condition takes a slightly different form to the previous two conditions. Instead of requiring that the rule set is closed under the application of arbitrary substitutions, we only ensure that it is closed under well-formed substitutions.
This will not prevent us from proving that typing is closed under the application of substitutions, but will be critical in proving that the supported rules construction, which will be given in \cref{def:rule-with-supp} and is used for proving the support condition, satisfies the substitution condition. \begin{definition} An equality rule set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-substitution condition} if: \[ (\Gamma, s \sub \sigma, t\sub \sigma) \in \mathcal{R} \] whenever \((\Delta, s, t) \in \mathcal{R}\) and \(\sigma : \arr \Delta \star \Gamma\) with \(\Gamma \vdash_{\mathcal{R}'} \sigma : \Delta\). We say the set \(\mathcal{R}\) satisfies the \emph{substitution condition} if it satisfies the \(\mathcal{R}\)-substitution condition. \end{definition} We make two comments about this definition: \begin{itemize} \item We only close under substitutions with type part \(\star\). It will still be possible to prove that typing is preserved by arbitrary (well-formed) substitutions when this condition is combined with the suspension condition. \item We introduce a second rule set \(\mathcal{R}'\) in the definition, which is only used for the typing premise of the substitution \(\sigma\). The reason for this is that the substitution condition is not closed under unions, and so we will instead prove that certain rule sets satisfy the \(\mathcal{R}'\)-substitution condition for an arbitrary \(\mathcal{R}'\), a condition which is closed under unions. \end{itemize} The substitution condition allows us to give the next proposition. \begin{proposition} \label{prop:sub-prop-1} Suppose \(\mathcal{R}\) satisfies the substitution condition. For any \(\sigma : \arr \Delta \star \Gamma\), the following rules are admissible: \begin{mathpar} \inferrule{\Delta \vdash A \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash A \sub \sigma}\and \inferrule{\Delta \vdash s : A \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash s \sub \sigma : A \sub \sigma}\and \inferrule{\Delta \vdash \tau : \Theta \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \tau \bullet \sigma : \Theta}\\ \inferrule{\Delta \vdash A = B\\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash A \sub \sigma = B \sub \sigma}\and \inferrule{\Delta \vdash s = t \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash s \sub \sigma = t \sub \sigma}\and \inferrule{\Delta \vdash \tau = \mu \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \tau \bullet \sigma = \mu \bullet \sigma} \end{mathpar} If \(\mathcal{R}\) additionally satisfies the suspension condition, then all the above rules are admissible for any substitution \(\sigma : \arr \Delta B \Gamma\). \end{proposition} \begin{proof} The proof for a non-extended substitution is given by another routine mutual induction in \module{Catt.Typing.Properties.Substitution}. For an arbitrary substitution \(\sigma : \arr \Delta B \Gamma\), we also proceed by mutual induction, but for the application of the substitution to an equality of terms \(s\) and \(t\) we further split on \(B\). If \(B = \star\), then the proof for non-extended substitutions can be used. Otherwise, we have: \begin{align*} s \sub \sigma &\equiv \Sigma(s) \sub {\unrestrict \sigma}\\ &= \Sigma(t) \sub {\unrestrict \sigma}\\ &\equiv t \sub \sigma \end{align*} with the non-syntactic equality following from the preservation of equality by suspension and the inductive hypothesis. The proofs that the extended versions of these rules are admissible are found in \module{Catt.Typing.Properties.Substitution.Suspended}.
\end{proof} We also prove that application of substitution respects equality in its second argument, which does not in fact need the substitution condition. This is also proved by a simple mutual induction in \module{Catt.Typing.Properties.Substitution}. \begin{proposition} \label{prop:sub-prop-2} The following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash s \sub \sigma = s \sub \tau}\and \inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash A \sub \sigma = A \sub \tau}\and \inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash \mu \bullet \sigma = \mu \bullet \tau} \end{mathpar} for substitutions \(\sigma : \arr \Delta B \Gamma\), \(\tau : \arr \Delta C \Gamma\), and \(\mu : \arr \Theta D \Delta\), term \(s : \Term_\Delta\), and type \(A : \Type_\Delta\). \end{proposition} This allows us to define a category of well-formed syntax in \Cattr, which is well-defined by the two preceding propositions. \begin{definition} Suppose \(\mathcal{R}\) satisfies the substitution and weakening conditions. Then we can define the \emph{syntactic category} of \Cattr, which by an abuse of notation we call \(\mathsf{Catt}_{\mathcal{R}}\), to have: \begin{itemize} \item Objects given by contexts \(\Gamma\) where \(\Gamma \vdash\). \item Morphisms \(\Delta \to \Gamma\) given by substitutions \(\sigma : \arr \Delta \star \Gamma\) where \(\Gamma \vdash \sigma : \Delta\), quotiented by the relation which equates substitutions \(\sigma\) and \(\tau\) when \(\Gamma \vdash \sigma = \tau\). \item The identity morphism \(\Gamma \to \Gamma\) given by \(\id_\Gamma\). \item Composition given by \(\tau \circ \sigma = \sigma \bullet \tau\). \end{itemize} By \cref{cor:id-sub-ty}, the identity substitution is a well-defined morphism, and the above two propositions prove that composition is well-defined. Composition satisfies associativity and unitality by \cref{prop:categorical}. \end{definition} By taking the weakening of the identity substitution \(\id_\Gamma : \Gamma \to \Gamma\), we get a substitution: \[ \proj_{\Gamma} = \wk(\id_\Gamma) : \Gamma \to \Gamma, (x : A)\] which includes \(\Gamma\) into \(\Gamma, x : A\). It can be checked (and is given by \func{Catt.Syntax.Properties}{apply-project-is-wk-tm} in the formalisation) that applying this substitution to a term is the same operation as weakening the term. Using this, the following can be proved: \begin{lemma} Suppose \(\mathcal{R}\) satisfies the substitution condition. Then it also satisfies the weakening condition. \end{lemma} \begin{proof} For \((\Gamma,s,t) \in \mathcal{R}\) and \(A : \Type_\Gamma\), we must prove that: \[ ((\Gamma,(x:A)),\wk(s),\wk(t)) \equiv ((\Gamma, (x : A)), s \sub {\proj_{\Gamma}}, t \sub {\proj_{\Gamma}}) \in \mathcal{R} \] which will follow from the substitution condition if it can be proved that \[ \Gamma, x : A \vdash_{\mathcal{R}} \proj_\Gamma : \Gamma \] holds. This judgement is easy to derive when \(\mathcal{R}\) satisfies the weakening condition, but this is what we are trying to prove. Instead, since \(\emptyset\) trivially satisfies the weakening condition, \(\proj_\Gamma\) is well-formed in \Catt, and so the derivation above follows from \cref{cor:catt-to-r}. \end{proof} We lastly show that \dr also satisfies the substitution condition. \begin{proposition} \label{prop:dr-sub} The set \dr satisfies the \(\mathcal{R}\)-substitution condition for any equality set \(\mathcal{R}\).
\end{proposition} \begin{proof} The proof is similar to \cref{prop:dr-weak,prop:dr-susp}, and follows from the equality \(\{A,t\} \bullet \sigma \equiv \{A \sub \sigma, t \sub \sigma\}\) which holds by \cref{item:disc-prop-sub-sub}. \end{proof} \begin{remark} The proof of the substitution condition for \dr makes no use of the typing of \(\sigma\). In fact, this premise is only needed for the supported rules construction, which will be given in \cref{def:rule-with-supp}. \end{remark} \paragraph{Tameness} We can now define tameness. \begin{definition} An equality rule set \(\mathcal{R}\) is \emph{tame} if it satisfies the weakening, substitution, and suspension conditions. An operation set \(\mathcal{O}\) is \emph{tame} if it is suspendable and contains the standard operations. A theory generated by \(\mathcal{R}\) and \(\mathcal{O}\) is \emph{tame} if both \(\mathcal{R}\) and \(\mathcal{O}\) are. \end{definition} \begin{proposition} The set \dr is tame. \end{proposition} In the formalisation, each module is parameterised by the various conditions that the module needs, and where possible we avoid assuming unnecessary conditions. Given that every theory we will consider in this thesis is tame, and that it is hard to imagine a sensible theory that is not tame, the argument could be made that the effort put into making distinctions between these conditions is wasted or at least unnecessary. The case for including the weakening condition is especially unconvincing, as it is implied by the substitution condition, which likely holds in any theory of significant interest. It is however included here as it is used in the formalisation, where its introduction is an artefact of the natural progression of this research. With this in mind, from \cref{sec:operations-catt} we will assume that the theory we are working over is tame, and build a library of constructions and results that work in any tame theory, even when some results may not need all the conditions above. Since we have limited use for proving properties about theories that do not satisfy the substitution condition, we could have instead enforced that all theories respect substitution by adding a constructor to the (term) equality relation that takes an equality \(\Delta \vdash s = t\) and a typing derivation \(\Gamma \vdash \sigma : \Delta\) to an equality \(\Gamma \vdash s \sub \sigma = t \sub \sigma\). This may remove some overhead of setting up the weakening and substitution conditions. It would also allow more minimal equality rule sets to be given, as a rule set such as disc removal could be given by \[ \{(D^n, \Coh {D^n} {\wk(U^n)} {\id_{D^n}}, d_n) \mid n \in \mathbb{N}\} \] On the other hand, including the extra constructor would effectively add an extra case to each inductive proof, and it is less clear how to minimise some of the equality rules that will be introduced in \cref{sec:operations-catt}. Taking either approach would likely lead to a similar development of the theory. \subsection{Further conditions} \label{sec:further-conditions} Knowing that the theory we are working in is tame will be sufficient for giving most of the constructions and proofs in \cref{sec:operations-catt}. Here we introduce some extra conditions that instead serve to aid in the proof of metatheoretic properties of the generated theory. These conditions take the form of predicates on each rule in the equality rule sets, rather than being closure properties as the conditions for tameness were.
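To make this distinction concrete, the following Haskell sketch (a hypothetical toy model, not the Agda formalisation; the type parameters merely stand in for the contexts and terms of \Cattr) models rule sets as predicates on triples and shows the two shapes of condition side by side: the tameness conditions ask that membership is closed under a syntax operation, while the conditions of this subsection ask that every rule satisfies a predicate. Conditions of either shape can be verified one component at a time when rule sets are combined by unions, since membership in a union comes from one of its parts; for the substitution and support conditions the predicate itself mentions the generated theory, which is exactly why the \(\mathcal{R}'\)-parameterised variants are used to recover this modularity.
\begin{verbatim}
-- A toy model of equality rule sets (hypothetical; not the Agda
-- formalisation). `c` and `t` stand in for contexts and terms.
type Rule c t = (c, t, t)
type RuleSet c t = Rule c t -> Bool

-- Unions of rule sets are pointwise disjunction.
unionR :: RuleSet c t -> RuleSet c t -> RuleSet c t
unionR rs ss r = rs r || ss r

-- Closure shape (weakening, suspension): membership is preserved
-- by a syntax operation `op`, checked here at a single rule.
closedAt :: (Rule c t -> Rule c t) -> RuleSet c t -> Rule c t -> Bool
closedAt op rs r = not (rs r) || rs (op r)

-- Predicate shape (support, preservation): every rule in the set
-- satisfies a property `ok`, checked here at a single rule.
holdsAt :: (Rule c t -> Bool) -> RuleSet c t -> Rule c t -> Bool
holdsAt ok rs r = not (rs r) || ok r

main :: IO ()
main = do
  -- A toy rule set over contexts-as-Int and terms-as-String,
  -- with a toy "weakening" that extends the context.
  let rs (n, s, t) = n >= (0 :: Int) && s == (t :: String)
      op (n, s, t) = (n + 1, s, t)
  print (closedAt op rs (0, "a", "a"))             -- True
  print (holdsAt (\(_, s, t) -> s == t) rs (0, "a", "a"))  -- True
\end{verbatim}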
\paragraph{Support condition} The support of a term plays a central role in classifying the operations of the theory (see \cref{sec:support}). Although it is known that support is respected by syntactic equality, we have not yet shown it is preserved by definitional equality. The following condition allows this to be proved. \begin{definition} A set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-support condition} for an equality set \(\mathcal{R}'\) when: \[ \Gamma \vdash_{\mathcal{R}'} s : A \implies \Supp(s) = \Supp(t) \] for each \((\Gamma,s,t) \in \mathcal{R}\) and \(A : \Type_\Gamma\). A set \(\mathcal{R}\) satisfies the \emph{support condition} if it satisfies the \(\mathcal{R}\)-support condition. \end{definition} The use of support instead of free variables in this definition is critical, as we do not expect the free variables of a piece of syntax to be preserved by equality in general. As an example, we would like to have the equality: \[ D^1 \vdash \Coh {D^1} {\wk(U^1)} {\id_{D^1}} = d_1 \] given by disc removal, yet the free variables of each side are not equal (though the support of each side is). We also draw attention to the typing premise. Without this, the left-hand side of each equality rule is too unconstrained (at least with how the equality rules are currently presented), and this condition would fail to hold for the equality sets we introduce in this thesis. Having this typing premise come from a separate rule set \(\mathcal{R}'\) allows the support condition to be preserved by unions of equality sets, similar to the substitution condition. From the support condition, we immediately get the following proposition, proved by mutual induction. \begin{proposition} \label{prop:supp-prop} Let \(\mathcal{R}\) satisfy the support condition. Then the following rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash s = t}{\Supp(s) = \Supp(t)}\and \inferrule{\Gamma \vdash A = B}{\Supp(A) = \Supp(B)}\and \inferrule{\Gamma \vdash \sigma = \tau}{\Supp(\sigma) = \Supp(\tau)} \end{mathpar} for \(s,t: \Term_\Gamma\), \(A,B : \Type_\Gamma\) and substitutions \(\sigma : \arr \Delta A \Gamma\) and \(\tau : \arr \Theta B \Gamma\). \end{proposition} In traditional presentations of \Catt, \(\FV(t) \cup \FV(A)\) is used instead of \(\Supp(t)\) for a term \(t\) of type \(A\). Equipped with the support condition, we can now show that these are the same. \begin{lemma} The following hold when \(\mathcal{R}\) satisfies the support condition: \begin{lemmaenum} \item \(\Supp(A) = \FV(A)\) when \(\Gamma \vdash A\), \item \label{item:supp-sub-char} \(\Supp(\sigma) = \FV(\sigma)\) when \(\Gamma \vdash \sigma : \Delta\), \item \label{item:supp-tm-char-1} \(\Supp(t) = \Supp(A) \cup \FV(t)\) when \(\Gamma \vdash t : A\), \item \label{item:supp-tm-char-2} \(\Supp(t) = \FV(A) \cup \FV(t) = \Supp(A) \cup \Supp(t)\) when \(\Gamma \vdash t : A\) and \(\Gamma \vdash A\). \end{lemmaenum} \end{lemma} \begin{proof} All properties are proven by a single mutual induction on the typing derivations in the premises. \begin{enumerate}[(i)] \item Suppose \(\Gamma \vdash A\). If \(A \equiv \star\) then \(\Supp(A) = \FV(A) = \emptyset\). Otherwise, suppose \(A \equiv \arr s B t\).
Then we have that \(\Gamma \vdash B\), \(\Gamma \vdash s : B\), and \(\Gamma \vdash t : B\) and so: \begin{align*} \Supp(A) &= \Supp(B) \cup \Supp(s) \cup \Supp(t)\\ &= \FV(B) \cup (\FV(B) \cup \FV(s)) \cup (\FV(B) \cup \FV(t))&(*)\\ &= \FV(B) \cup \FV(s) \cup \FV(t)\\ &= \FV(A) \end{align*} where the equality \((*)\) is derived from the inductive hypothesis for (i) applied to \(B\) and the inductive hypothesis for (iv) applied to \(s\) and \(t\). \item Suppose \(\Gamma \vdash \sigma : \Delta\). If \(\sigma \equiv \langle A \rangle\) then \(\Gamma \vdash A\) and so: \[\Supp(\sigma) = \Supp(A) = \FV(A) = \FV(\sigma)\] If instead \(\sigma \equiv \langle \tau, t \rangle\) and \(\Delta = \Theta, (x : A)\) then \(\Gamma \vdash \tau : \Theta\) and \(\Gamma \vdash t : A \sub \tau\) and so: \begin{align*} \Supp(\sigma) &= \Supp(\tau) \cup \Supp(t)\\ &= \Supp(\tau) \cup (\Supp(A \sub \tau) \cup \FV(t))&(*)\\ &= \DC_\Gamma(\FV(\tau) \cup \FV(A \sub \tau)) \cup \FV(t)\\ &= \Supp(\tau) \cup \FV(t)&\text{as }\FV(A \sub \tau) \subseteq \FV(\tau)\\ &= \FV(\tau) \cup \FV(t)&(\dagger)\\ &= \FV(\sigma) \end{align*} where the equality \((*)\) is derived from the inductive hypothesis for (iii) applied to \(t\) and the equality \((\dagger)\) is derived from the inductive hypothesis for (ii) applied to \(\tau\). \item Suppose \(\Gamma \vdash t : A\). We then split on the constructor used for the typing derivation. If the derivation is the result of a conversion rule applied to \(\Gamma \vdash t : B\) and \(\Gamma \vdash A = B\), then the inductive hypothesis gives \(\Supp(t) = \Supp(B) \cup \FV(t)\) and \cref{prop:supp-prop} gives \(\Supp(A) = \Supp(B)\) and so \(\Supp(t) = \Supp(A) \cup \FV(t)\) as required. If the derivation is derived from the typing rule for variables, then a simple induction on the context \(\Gamma\), using that \(\Supp(\wk(A)) = \Supp(A)\), gives the required result. If the derivation is given by the typing rule for coherences then \(t \equiv \Coh \Delta B \sigma\), \(\Gamma \vdash \sigma : \Delta\), and \(A \equiv B \sub \sigma\). Therefore, \begin{align*} \Supp(t) &= \Supp(\sigma)\\ &= \DC_\Gamma(\FV(B \sub \sigma) \cup \FV(\sigma))&\text{as }\FV(B \sub \sigma) \subseteq \FV(\sigma)\\ &= \Supp(A) \cup \Supp(\sigma)\\ &= \Supp(A) \cup \FV(\sigma)&(*)\\ &= \Supp(A) \cup \FV(t) \end{align*} where the equality \((*)\) is the result of applying the inductive hypothesis for (ii) to \(\sigma\). \item If \(\Gamma \vdash t : A\) and \(\Gamma \vdash A\) then: \[ \Supp(t) = \Supp(A) \cup \FV(t) = \FV(A) \cup \FV(t) \] trivially follows from (i) and (iii) and: \[ \Supp(t) = \DC_\Gamma(\Supp(t)) = \DC_\Gamma(\FV(A) \cup \FV(t)) = \Supp(A) \cup \Supp(t) \] with the first equality resulting from the idempotency of the downwards closure operator. \end{enumerate} This proof is formalised in \module{Catt.Typing.Properties.Support}. \end{proof} \begin{corollary} \label{cor:dc-sub} Let \(\mathcal{R}\) satisfy the support condition and suppose \(\Gamma \vdash \sigma : \Delta\). Then the following equality holds: \[ \DC_\Gamma(V \sub \sigma) = \DC_\Delta(V) \sub \sigma \] for all \(V \subseteq \Var(\Delta)\); downwards closure commutes with the application of \(\sigma\) to variable sets. \end{corollary} \begin{proof} Proceed by induction on \(\Delta\). If \(\Delta \equiv \emptyset\) then the equation is trivial. Therefore, assume \(\Delta \equiv \Theta, (x : A)\) and so \(\sigma \equiv \langle \tau , t \rangle\) with \(\Gamma \vdash \tau : \Theta\) and \(\Gamma \vdash t : A \sub \tau\) by case analysis.
We now split on whether \(x \in V\). If \(x \not\in V\) then \(\DC_\Gamma(V \sub \sigma) = \DC_\Gamma(V \sub \tau) = \DC_\Theta(V) \sub \tau = \DC_\Delta(V) \sub \sigma\), with the second equality due to the inductive hypothesis and the last holding as \(\DC_\Delta(V) = \DC_\Theta(V)\) when \(x \not\in V\). Otherwise, \(x \in V\) and so letting \(U = V \setminus \{x\}\) we get the equality: \begin{align*} \DC_\Gamma(V \sub \sigma) &= \DC_\Gamma(U \sub \tau \cup \FV(t))\\ &= \DC_\Gamma(U \sub \tau) \cup \Supp(t)\\ &= \DC_\Gamma(U \sub \tau) \cup \Supp(A \sub \tau) \cup \FV(t)&(\dagger)\\ &= \DC_\Gamma(U \sub \tau) \cup \DC_\Gamma(\FV(A) \sub \tau) \cup \FV(t) \\ &= \DC_\Gamma(U \sub \tau \cup \FV(A) \sub \tau) \cup \FV(t)\\ &= \DC_\Gamma((U \cup \FV(A)) \sub \tau) \cup \FV(t)\\ &= \DC_\Theta(U \cup \FV(A)) \sub \tau \cup \FV(t)&(*)\\ &= (\{x\} \cup \DC_\Theta(U \cup \FV(A))) \sub \sigma \\ &= \DC_\Delta(V) \sub \sigma \end{align*} where equality \((*)\) is by inductive hypothesis and equality \((\dagger)\) is by \cref{item:supp-tm-char-1}. \end{proof} Unfortunately, proving that the support condition holds for most equality rule sets is not as trivial as the proofs for the tameness properties. Consider the case for disc removal, which gives rise to the equality \[ \Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} = t \] To prove the support condition for this case we need to show that: \[ \Supp(\{A,t\}) = \Supp(t) \] where we can assume that \(\Gamma \vdash t : A\). Intuitively this should hold, as the support of a substitution should be equal to the union of the supports of its locally maximal arguments, and if the derivation \(\Gamma \vdash t : A\) held in \Catt, we would be able to prove this. However, this proof (and intuition) relies on the derivation \(\Gamma \vdash_{\mathcal{R}} t : A\) holding in a theory generated by \(\mathcal{R}\) where \(\mathcal{R}\) already satisfies the support condition, without which the typing derivation offers little utility. We therefore introduce a proof strategy for showing that the support condition holds. The key insight of this strategy is to prove by induction that every equality and every typing derivation in the system is well-behaved with respect to support. Then, for the case of an equality \(\Gamma \vdash s = t\) arising from a rule \((\Gamma, s, t)\), we have \(\Gamma \vdash s : A\) as a premise and so by inductive hypothesis can assume that this typing derivation is well-behaved with respect to support. We formalise this with the following definition, called the \emph{supported rules} construction: \begin{definition} \label{def:rule-with-supp} Let \(\mathcal{R}\) be some equality rule set. The \emph{supported rules} construction applied to \(\mathcal{R}\) produces the equality rule set \(\mathcal{R}_{\mathsf{S}}\), given by: \[ \mathcal{R}_{\mathsf{S}} = \{ (\Gamma, s, t) \in \mathcal{R} \mid \Supp(s) = \Supp(t)\} \] The rule set \(\mathcal{R}_{\mathsf{S}}\) satisfies the support condition by construction. \end{definition} The proof strategy then proceeds as follows: to prove that \(\mathcal{R}\) satisfies the support condition, we instead prove that \(\mathcal{R}\) satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition, leveraging that \(\mathcal{R}_\mathsf{S}\) itself satisfies the support condition. The proof is then completed by the following lemma: \begin{lemma} \label{lem:proof-strat-supp} Let \(\mathcal{R}\) be an equality rule set that satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition.
Then the following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash_\mathcal{R} A}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} A}\and \inferrule{\Gamma \vdash_\mathcal{R} s : A}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} s : A}\and \inferrule{\Gamma \vdash_\mathcal{R} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} \sigma : \Delta}\and \inferrule{\Gamma \vdash_\mathcal{R} A = B}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} A = B}\and \inferrule{\Gamma \vdash_\mathcal{R} s = t}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} s = t}\and \inferrule{\Gamma \vdash_\mathcal{R} \sigma = \tau}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} \sigma = \tau} \end{mathpar} and hence \(\mathcal{R}\) satisfies the support condition. \end{lemma} \begin{proof} The inference rules are all proven using a mutual induction on all typing and equality rules, using that \(\mathcal{R}\) satisfies the \(\mathcal{R}_\mathsf{S}\)-support condition in the case where the equality \(\Gamma \vdash s = t\) is derived from a rule \((\Gamma, s, t) \in \mathcal{R}\). This induction is formalised in \module{Catt.Support.Typing}. The set \(\mathcal{R}\) then satisfies the support condition since, if \((\Gamma,s,t) \in \mathcal{R}\) and \(\Gamma \vdash_{\mathcal{R}} s : A\), then \(\Gamma \vdash_{\mathcal{R}_{\mathsf{S}}} s : A\) holds by the first part of the lemma, and so \(\Supp(s) = \Supp(t)\) as \(\mathcal{R}\) is already known to satisfy the \(\mathcal{R}_{\mathsf{S}}\)-support condition. \end{proof} \begin{remark} The original motivation for parameterising \Catt by an arbitrary set of equality rules \(\mathcal{R}\) was not to share proofs between \Cattsu and \Cattsua but was to be able to state the supported rules construction. \end{remark} To be able to prove that \(\mathcal{R}\) satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition, we will commonly need to know that \(\mathcal{R}_{\mathsf{S}}\) satisfies various tameness conditions, which are given by the next lemma. \begin{lemma} \label{lem:supp-sat-conds} Let \(\mathcal{R}\) be any equality set. Then \(\mathcal{R}_{\mathsf{S}}\) satisfies the weakening, suspension, and substitution conditions whenever \(\mathcal{R}\) satisfies the corresponding condition. \end{lemma} \begin{proof} Let \((\Gamma, s, t) \in \mathcal{R}\) be an arbitrary rule. To show \(\mathcal{R}_{\mathsf{S}}\) satisfies the weakening condition we need to show that: \[ (\Gamma, s, t) \in \mathcal{R}_{\mathsf{S}} \implies ((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R}_{\mathsf{S}} \] for all \(A : \Type_\Gamma\). By assumption, \(((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R}\) and by the premise of the implication we have \(\Supp(s) = \Supp(t)\). From this it follows that \(\Supp(\wk(s)) = \Supp(\wk(t))\) and so the conclusion of the implication holds. The case for suspension is similar except we need to use the equality: \[ \Supp(\Sigma(s)) = \Sigma(\Supp(s)) = \Sigma(\Supp(t)) = \Supp(\Sigma(t)) \] derived from \cref{lem:susp-vs-prop} and \(\Supp(s) = \Supp(t)\) from the premise of the implication. For the substitution condition we need to show that: \[ \Supp(s) = \Supp(t) \implies \Supp(s \sub \sigma) = \Supp(t \sub \sigma) \] under the assumption that \(\Delta \vdash_{\mathcal{R}_\mathsf{S}} \sigma : \Gamma\). Since \(\mathcal{R}_\mathsf{S}\) satisfies the support condition, we can use \cref{cor:dc-sub} to get: \[ \Supp(s \sub \sigma) = \DC_\Delta(\FV(s) \sub \sigma) = \Supp(s) \sub \sigma = \Supp(t) \sub \sigma = \DC_\Delta(\FV(t) \sub \sigma) = \Supp(t \sub \sigma) \] as required.
\end{proof} We now prove the appropriate support condition for disc removal. \begin{proposition} \label{prop:dr-supp} Let \(\mathcal{R}\) satisfy the support and weakening conditions. Then the set \(\dr\) satisfies the \(\mathcal{R}\)-support condition. \end{proposition} \begin{proof} It is sufficient to prove that, given \(t : \Term_\Gamma\), \(A : \Type_\Gamma\), \(B : \Type_\Gamma\), and \(n = \dim(A)\), we have: \[\Gamma \vdash_{\mathcal{R}} \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B \implies \Supp(\{A,t\}) = \Supp(t) \] Assume the premise of the implication. Then \(\Gamma \vdash_{\mathcal{R}} \{A,t\} : D^n\) by case analysis on the typing derivation and so \(\Gamma \vdash_{\mathcal{R}} A\) and \(\Gamma \vdash_{\mathcal{R}} t : A\) by \cref{lem:disc-typing} as \(\mathcal{R}\) satisfies the weakening condition. By a simple induction, it can be shown that \(\Supp(\{A,t\}) = \Supp(A) \cup \Supp(t)\). By \cref{item:supp-tm-char-2} we have \(\Supp(t) = \Supp(A) \cup \Supp(t)\) as \(\mathcal{R}\) satisfies the support condition and so \(\Supp(\{A,t\}) = \Supp(t)\) as required. \end{proof} \paragraph{Preservation condition} Our last condition allows us to prove preservation, the property that typing is preserved by equality. \begin{definition} A set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-preservation condition} for an equality set \(\mathcal{R}'\) when: \[ \Gamma \vdash_{\mathcal{R}'} s : A \implies \Gamma \vdash_{\mathcal{R}'} t : A \] for each \((\Gamma, s, t) \in \mathcal{R}\) and \(A : \Type_\Gamma\). The set \(\mathcal{R}\) satisfies the \emph{preservation condition} if it satisfies the \(\mathcal{R}\)-preservation condition. \end{definition} When a rule set \(\mathcal{R}\) has all the properties presented in this section, we are able to show preservation for the generated theory. \begin{proposition} Let \(\mathcal{R}\) be tame and additionally satisfy the support and preservation conditions. Then the following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash A\\ \Gamma \vdash A = B}{\Gamma \vdash B}\and \inferrule{\Gamma \vdash s : A\\ \Gamma \vdash s = t \\ \Gamma \vdash A = B}{\Gamma \vdash t : B}\and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash \sigma = \tau}{\Gamma \vdash \tau : \Delta} \end{mathpar} for \(A, B : \Type_\Gamma\), \(s,t : \Term_\Gamma\), \(\sigma : \arr \Delta A \Gamma\), and \(\tau : \arr \Delta B \Gamma\). \end{proposition} \begin{proof} We prove the following bidirectional versions of the inference rules by mutual induction on the equality derivation: \begin{alignat*}{5} &\Gamma \vdash A = B &&\implies (\Gamma \vdash A \iff \Gamma \vdash B)\\ &\Gamma \vdash s = t &&\implies (\forall A.\ \Gamma \vdash s : A \iff \Gamma \vdash t : A)\\ &\Gamma \vdash \sigma = \tau &&\implies (\Gamma \vdash \sigma : \Delta \iff \Gamma \vdash \tau : \Delta) \end{alignat*} which imply the inference rules of the proposition are admissible (using the conversion rule for the second rule). The only non-trivial cases are those for the statement for terms, and so we split on the equality derivation \(\Gamma \vdash s = t\). The cases for reflexivity on variables and for transitivity are trivial. The case for symmetry follows from the symmetry of the ``if and only if'' relation. Now suppose the equality is of the form \(\Coh \Delta A \sigma = \Coh \Delta B \tau\) and is derived from the equality rule for coherences from equalities \(\Delta \vdash A = B\) and \(\Gamma \vdash \sigma = \tau\). We prove the first direction, with the second following symmetrically.
We therefore assume we have a typing derivation \(\Gamma \vdash \Coh \Delta A \sigma : C\), and will induct on this derivation to construct a derivation of \(\Gamma \vdash \Coh \Delta B \tau : C\). \begin{itemize} \item If the derivation is constructed with the conversion rule from \(\Gamma \vdash \Coh \Delta A \sigma : D\) and \(\Gamma \vdash D = C\), then we get a derivation \(\Gamma \vdash \Coh \Delta B \tau : D\) by inductive hypothesis and can apply the conversion rule to get a derivation \(\Gamma \vdash \Coh \Delta B \tau : C\). \item If instead the derivation is constructed with the coherence rule then \(C \equiv A \sub \sigma\) and \(A \equiv \arr s {A'} t\) and therefore \(B \equiv \arr {u} {B'} {v}\) with \(\Delta \vdash s = u\) and \(\Delta \vdash t = v\). We also have that \(\Delta \vdash_{\mathsf{ps}}\), \((\Delta, \Supp(s), \Supp(t)) \in \mathcal{O}\), \(\Delta \vdash A\), and \(\Gamma \vdash \sigma : \Delta\). By the inductive hypothesis on the equality, we have \(\Delta \vdash B\) and \(\Gamma \vdash \tau : \Delta\). By \cref{prop:supp-prop}, \(\Supp(s) = \Supp(u)\) and \(\Supp(t) = \Supp(v)\) and so \((\Delta, \Supp(u), \Supp(v)) \in \mathcal{O}\). Hence, by the coherence rule we have \(\Gamma \vdash \Coh \Delta B \tau : B \sub \tau\). By \cref{prop:sub-prop-1,prop:sub-prop-2}, \(\Gamma \vdash A \sub \sigma = B \sub \tau\) and so by the conversion rule we obtain a derivation \(\Gamma \vdash \Coh \Delta B \tau : C\). \end{itemize} Finally, suppose the equality is derived from \textsc{rule}, such that \((\Gamma, s,t) \in \mathcal{R}\) and \(\Gamma \vdash s : A\). If \(\Gamma \vdash s : B\), then the preservation condition gives a derivation \(\Gamma \vdash t : B\). Conversely, if \(\Gamma \vdash t : B\), then it suffices to show that \(\Gamma \vdash A = B\), as the conversion rule applied to the derivation \(\Gamma \vdash s : A\) then gives \(\Gamma \vdash s : B\). By applying the preservation condition to the derivation \(\Gamma \vdash s : A\), we get a derivation \(\Gamma \vdash t : A\), and so by \cref{lem:ty-unique} we have \(\Gamma \vdash A = B\), completing the proof. \end{proof} As with the other conditions, we end this section by showing that \dr satisfies the preservation condition. \begin{proposition} \label{prop:dr-preserve} Suppose \(\mathcal{R}\) satisfies the weakening condition, and the set of operations \(\mathcal{O}\) contains the standard operations. Then \dr satisfies the \(\mathcal{R}\)-preservation condition. \end{proposition} \begin{proof} Take \((\Gamma, \Coh {D^n} {\wk(U^n)} {\{A,t\}}, t) \in \dr\) and suppose \(\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B\). Then by \cref{lem:ty-unique}: \[\Gamma \vdash B = \wk(U^n) \sub {\{A,t\}} \equiv A\] By \cref{lem:disc-typing}, \(\Gamma \vdash t : A\) and so by the conversion rule \(\Gamma \vdash t : B\) as required. \end{proof} \subsection{Endo-coherence removal} \label{sec:ecr} We conclude this chapter with a second example of a family of equality rules called \emph{endo-coherence removal}. As suggested by the name, these equalities simplify a class of terms known as endo-coherences. \begin{definition} An \emph{endo-coherence} is a coherence term \(\Coh \Delta {\arr s A s} \sigma\). \end{definition} If we consider the ps-context: \[ \Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] then we see that there are two distinct endo-coherences with source and target \(f * g\): the identity on \(f * g\) and the ``fake identity'' \(\Coh \Delta {f*g \to f*g} {\id_\Delta}\).
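For instance, instantiating the rule \textsc{ecr} stated below with \(\sigma = \id_\Delta\) equates this fake identity with the canonical one: \[ \Delta \vdash \Coh \Delta {f*g \to f*g} {\id_\Delta} = \id(\arr x \star z, f * g) \] where \(\arr x \star z\) is the type of \(f * g\), so that the right-hand side is exactly the identity on \(f * g\). The side condition \(\Supp(f * g) = \Var(\Delta)\) required by the rule holds, as the support of \(f * g\) contains every variable of \(\Delta\).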
In the type theories \Cattsu and \Cattsua introduced in \cref{sec:cattsu,sec:cattsua}, identities will be privileged, and these fake identities will be reduced to the true identity. More generally, for each term \(t\) there is a canonical endo-coherence with source and target \(t\), the identity on \(t\). Endo-coherence removal simplifies any other endo-coherence on that term to an identity. It makes the following rule admissible: \begin{mathpar} \inferrule{\Delta \vdash_{\mathsf{ps}} \\ \Delta \vdash A \\ \Delta \vdash s : A \\ \Supp(s) = \Var(\Delta) \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \Coh \Delta {\arr s A s} \sigma = \id(A \sub \sigma,s \sub \sigma)}\textsc{ecr} \end{mathpar} Endo-coherence removal can be assembled into the following equality rule set. \begin{definition} The \emph{endo-coherence removal set}, \ecr, is the set consisting of the triples: \[ (\Gamma, \Coh \Delta {\arr s A s} \sigma, \id(A\sub \sigma, s \sub \sigma)) \] for contexts \(\Gamma\), ps-contexts \(\Delta\), types \(A : \Type_\Delta\), terms \(s : \Term_\Delta\) with \(\Supp(s) = \Var(\Delta)\), and substitutions \(\sigma : \arr \Delta \star \Gamma\). A set of rules \(\mathcal{R}\) \emph{contains endo-coherence removal} if \(\ecr \subseteq \mathcal{R}\). We say that \(\mathcal{R}\) \emph{has endo-coherence removal} if the rule \textsc{ecr} holds in the generated theory. \end{definition} The set \ecr satisfies all the conditions introduced in this chapter, as proven in the next proposition, which concludes this chapter. \begin{proposition} \label{prop:ecr-props} Suppose the set of operations \(\mathcal{O}\) contains the standard operations. Then the set \ecr satisfies the following properties: \begin{lemmaenum} \item The set \ecr satisfies the weakening condition. \item The set \ecr satisfies the suspension condition. \item The set \ecr satisfies the \(\mathcal{R}\)-substitution condition, for any equality set \(\mathcal{R}\). \item \label{item:ecr-supp} The set \ecr satisfies the \(\mathcal{R}\)-support condition, for any equality set \(\mathcal{R}\) satisfying the support condition. \item \label{item:ecr-preserve} The set \ecr satisfies the \(\mathcal{R}\)-preservation condition, for any equality set \(\mathcal{R}\) satisfying the weakening and substitution conditions. \end{lemmaenum} \end{proposition} \begin{proof} Suppose \((\Gamma, \Coh \Delta {\arr s A s} \sigma, \id(A \sub \sigma, s \sub \sigma)) \in \ecr\). To show that the substitution condition holds, we suppose that \(\tau : \arr \Gamma \star \Theta\), and then must prove that: \[ (\Theta, \Coh \Delta {\arr s A s} {\sigma \bullet \tau}, \id(A \sub \sigma, s \sub \sigma) \sub \tau) \in \ecr \] It is immediate that: \[ (\Theta, \Coh \Delta {\arr s A s} {\sigma \bullet \tau}, \id(A \sub {\sigma \bullet \tau}, s \sub {\sigma \bullet \tau})) \in \ecr \] and so it suffices to prove that \(\id(A \sub \sigma, s \sub \sigma) \sub \tau \equiv \id(A \sub {\sigma \bullet \tau},s \sub {\sigma \bullet \tau})\), but this follows from \cref{item:disc-prop-sub-sub,prop:categorical}. The weakening condition then follows from the substitution condition.
For the suspension condition, it must be shown that: \[ (\Sigma(\Gamma), \Coh {\Sigma(\Delta)} {\arr {\Sigma(s)} {\Sigma(A)} {\Sigma(s)}} {\Sigma(\sigma)}, \Sigma(\id(A\sub \sigma, s \sub \sigma))) \in \ecr \] and so it suffices to show that \(\Supp(\Sigma(s)) = \Var(\Sigma(\Delta))\), which follows from \(\Supp(\Sigma(s)) = \Sigma(\Supp(s)) = \Sigma(\Var(\Delta)) = \Var(\Sigma(\Delta))\), and \[ \Sigma(\id(A \sub \sigma, s \sub \sigma)) \equiv \id(\Sigma(A) \sub {\Sigma(\sigma)}, \Sigma(s) \sub {\Sigma(\sigma)}) \] which follows from the functoriality of suspension and \cref{item:disc-prop-sub-susp,item:disc-prop-susp}. For the support condition, assume that \(\Gamma \vdash_{\mathcal{R}} \Coh \Delta {\arr s A s} \sigma : B\) for some \(B : \Type_\Gamma\) and that \(\mathcal{R}\) satisfies the support condition. Then: \begin{align*} \Supp(\Coh \Delta {\arr s A s} \sigma) &= \Supp(\sigma)\\ &= \FV(\sigma)&\text{by \cref{item:supp-sub-char}}\\ &= \Var(\Delta) \sub \sigma \\ &= \Supp(s) \sub \sigma&\text{by assumption}\\ &= (\Supp(A) \cup \Supp(s)) \sub \sigma &\text{by \cref{item:supp-tm-char-2}}\\ &= \DC_\Delta(\FV(A) \cup \FV(s)) \sub \sigma\\ &= \DC_\Gamma(\FV(A) \sub \sigma \cup \FV(s) \sub \sigma)&\text{by \cref{cor:dc-sub}}\\ &= \DC_\Gamma(\FV(A \sub \sigma) \cup \FV(s \sub \sigma))&\text{by \cref{prop:vs-sub}}\\ &= \Supp(A \sub \sigma) \cup \Supp(s \sub \sigma)\\ &= \Supp(\id(A \sub \sigma, s \sub \sigma)) \end{align*} as required. Lastly, for the preservation condition, let \(\mathcal{R}\) satisfy the weakening and substitution conditions, and assume \(\Gamma \vdash \Coh \Delta {\arr s A s} {\sigma} : B\). By deconstructing the typing derivation, we must have that \(\Delta \vdash A\), \(\Delta \vdash s : A\), and \(\Gamma \vdash \sigma : \Delta\). Therefore, by \cref{prop:sub-prop-1}, \(\Gamma \vdash A \sub \sigma\) and \(\Gamma \vdash s \sub \sigma : A \sub \sigma\). Hence, by \cref{cor:id-typing}, \(\Gamma \vdash \id(A \sub \sigma, s \sub \sigma) : (\arr s A s) \sub \sigma\). It remains to prove that \(\Gamma \vdash (\arr s A s) \sub \sigma = B\), but this is immediate from \cref{lem:ty-unique}, applied to the derivation \(\Gamma \vdash \Coh \Delta {\arr s A s} \sigma : B\). \end{proof}
\begin{figure}[t]
\centering
%\includegraphics[height=\textheight - 25pt]{test.pdf}
\caption{Dependency graph of Agda formalisation.}
\label{fig:dep-graph}
\end{figure}
\chapter{Constructions in \texorpdfstring{\boldmath\Cattr}{Cattr}} \label{sec:operations-catt} This chapter will investigate some more involved constructions that can be given in the type theory \Cattr. These constructions will be central to defining the reductions that underpin the type theories \Cattsu and \Cattsua which appear in \cref{cha:cattstrict}. We will give a definition of each construction, describe under what conditions it is well-formed, and state various properties describing the behaviour of the construction and its interaction with other constructions. For this chapter we will assume that we are working in a tame theory, as described in \cref{sec:tame-theories}. This means that all proofs in this chapter will hold in any variant of \Cattr such that the equality set \(\mathcal{R}\) satisfies the weakening, substitution, and suspension conditions, and the set of operations \(\mathcal{O}\) is suspendable and contains the standard operations. We will also use all the relevant proofs from \cref{sec:catt-with-equality}, without explaining exactly what condition of the set \(\mathcal{R}\) is being used.
The formalisation is commonly more specific when specifying which conditions are necessary for each module, for example omitting the suspension condition when it is not needed for a specific construction, but for the body of this text we ignore these distinctions and simply assume that every theory we work with will be tame, as will be the case for all theories introduced in \cref{cha:cattstrict}. This chapter builds up to the following two constructions, which can be viewed as meta-operations on \Cattr. \begin{itemize} \item The \emph{pruning} operation will be introduced in \cref{sec:pruning} and is the main component of the type theory \Cattsu, defined in \cref{sec:cattsu}, a type theory for strictly unital \(\infty\)-categories. Pruning removes unnecessary identities from a term, simplifying the resulting term in the process. \item The \emph{insertion} operation will be introduced in \cref{sec:insertion}. It powers the type theory \Cattsua, a type theory for strictly unital and associative \(\infty\)-categories. Insertion merges certain arguments to a coherence into the body of the coherence itself, effectively ``inserting'' the argument into the head term. It can be viewed as a generalisation of pruning, but is a more complex construction. \end{itemize} Both pruning and insertion perform more radical modifications to the structure of a term than disc removal and endo-coherence removal, the equality rules we have seen so far. Pruning and insertion modify the pasting diagram in the coherence at the head of the term they act on. In this chapter, more combinatorial descriptions of pasting diagrams will be introduced to enable the pasting diagrams involved in these constructions to be constructed by induction. The pruning construction identifies locally maximal arguments of a coherence that are syntactically identities, and removes such an argument from the term, while also removing the component of the pasting diagram in the coherence which corresponds to it. Pruning could be applied to the term \(f * g * \id\), a ternary composite, to remove the identity argument and convert the ternary composite to a binary composite, returning the term \(f*g\). Insertion does not simply remove parts of a term, but flattens its structure, moving data from a locally maximal argument into the head term. The motivating example for insertion is the term \(f * (g * h)\), a binary composite where one of the locally maximal arguments is itself a binary composite. Under insertion, the inner composite \(g * h\) is merged with the outer binary composite to form a single ternary composite \(f * g * h\). When a locally maximal argument is an identity, it will always be insertable, and the result of inserting the identity into the head term will be similar to pruning the same argument, motivating the viewpoint that insertion is a generalisation of pruning. At the end of this chapter, this relationship will be made precise. Insertion again performs more radical changes to the head coherence of the term than pruning, and needs to be able to merge two pasting diagrams into one along a locally maximal argument. The operation on pasting diagrams is best understood as an operation on \emph{trees}, an alternative characterisation of pasting diagrams which will be introduced in \cref{sec:trees}. Although the definition of these trees is simple, to be able to use them effectively we must be able to describe their relationship to the \Catt contexts they represent.
It will also be necessary to describe the morphisms between these trees, which correspond to substitutions between the underlying contexts, and the composition of such morphisms. Certain constructions on trees do not compute nicely on the raw syntax of \Catt. We therefore introduce a new notion of \emph{structured term}, an alternative syntax for \Catt which allows more complex representations of terms over contexts derived from trees. Structured terms effectively retain more information about how they are constructed, allowing constructions to compute on them in ways that are not possible on the raw syntax of \Catt. This representation of terms will be crucial in the formalisation, as it aids the proof assistant in simplifying various constructions. These structured terms are defined in \cref{sec:structured-terms}. Finally, \cref{sec:insertion} defines the constructions used in the insertion operation, using the structured syntax from the preceding section. In this section, many properties of insertion are stated, including a universal property that it satisfies. \section{Pruning} \label{sec:pruning} Pruning drives the strictly unital behaviour of \Cattsu. Unitality in \(\infty\)-categories is the property that the identity acts as a unit with respect to composition, so that composing with the unit is equivalent to the original term. If an \(\infty\)-category is strictly unital, then it exhibits this behaviour up to equality rather than equivalence. For \Catt, strict unitality means that a composition containing an identity as one of its arguments should be definitionally equal to the term with this argument removed. Pruning is the operation that removes such an argument from a composition, taking a term such as \(f * g * \id\) to \(f * g\), or \(\id * f\) to the unary composite on \(f\). In the presence of strict units, it is also desirable to simplify the higher-dimensional data that witnessed the (weak) unitality in \Catt. For example, the left unitor on \(f\), given by the term: \[ \Coh {(x : \star), (y : \star), (f : \arr x \star y)} {\arr {\id(x) * f} {} {f}} {\id} \] which witnesses that composing on the left with an identity is equivalent to the original term, can be simplified to the identity on \(f\), and the triangle equations which govern the coherence laws for the unitors can also trivialise. For this reason, pruning is defined to be able to apply to any term which has an identity as a locally maximal argument. We review the definition of a locally maximal argument below. \begin{definition} In a context \(\Gamma\), a \emph{locally maximal variable} is a variable \(x\) of \(\Gamma\) that does not appear in the source or target of any other variable of \(\Gamma\). Equivalently, \(x\) is locally maximal when: \[ x \not\in \Supp(y) \] for any \(y \in \Var(\Gamma)\) with \(y \neq x\). Given a substitution \(\sigma : \Delta \to \Gamma\), a \emph{locally maximal argument} of \(\sigma\) is a term \(x \sub \sigma\) where \(x\) is a locally maximal variable of \(\Delta\).
\end{definition} \begin{example} \label{ex:lm} Consider the pasting diagram given by the following diagram:
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzIsMCwieSJdLFszLDAsInoiXSxbMCwxLCJmIiwwLHsiY3VydmUiOi01fV0sWzAsMSwiaCIsMix7ImN1cnZlIjo1fV0sWzAsMSwiZyIsMV0sWzEsMiwiaiJdLFszLDUsIlxcYWxwaGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNCwiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} x && y & z \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["j", from=1-3, to=1-4] \arrow["\alpha", shorten <=3pt, shorten >=5pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=5pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] which corresponds to the \Catt context (written to highlight the dimension of each term): \begin{alignat*}{3} \Theta ={} &(x : \star),\\ &(y : \star),{}&&(f : x \to y),\\ &&&(g : x \to y),{}&&(\alpha : f \to g),\\ &&&(h : x \to y),&&(\beta : g \to h),\\ &(z : \star),&&(j : y \to z) \end{alignat*} The locally maximal variables of \(\Theta\) are \(\alpha\), \(\beta\), and \(j\). Note that \(j\) is locally maximal, despite not being of maximal dimension in the context. Pruning the context \(\Theta\) along the locally maximal variable \(\alpha\) removes the variables \(\alpha\) and \(g\) from the context, and must amend the type of \(\beta\) so that its source is \(f\). \end{example} To perform the pruning construction, we start with a coherence term \(\Coh \Delta A \sigma : \Term_\Gamma\), and assume that some locally maximal argument of \(\sigma\) is an identity, that is, \(x \sub \sigma \equiv \id(B,t)\) for some locally maximal variable \(x\), type \(B : \Type_\Gamma\), and term \(t : \Term_\Gamma\). We then construct the following: \begin{itemize} \item A new pasting diagram \(\Delta \sslash x\), corresponding to \(\Delta\) with the variable \(x\) and its target removed. \item A new set of arguments \(\sigma \sslash x\), consisting of the same terms as \(\sigma\) except those corresponding to \(x\) and its target. \item A projection substitution \(\pi_x : \Delta \to \Delta \sslash x\), from which a type \(A \sub {\pi_x} : \Type_{\Delta \sslash x}\) can be obtained. This projection sends \(x\) to the identity on its source, the target of \(x\) to the source of \(x\), and every other variable to itself. \end{itemize} We note that the source and target of the locally maximal variable \(x\) are well-defined, as \(x\) must be sent by \(\sigma\) to an identity, which cannot be zero-dimensional. \subsection{Dyck words} To be able to easily reason about the structures involved in pruning, we wish to define them by induction. To do this we introduce a different presentation of pasting diagrams called \emph{Dyck words}, which have a simpler inductive structure. Dyck words more directly encode the structure of the pasting diagram, and will allow us to give an inductive characterisation of the locally maximal variables of the associated context. \begin{definition} The set \(\Dyck_d\) of \emph{Dyck words} of trailing dimension \(d\) consists of lists of ``up'' and ``down'' moves, formed according to the following rules.
\begin{mathpar} \inferrule{ }{\circleddash : \Dyck_0} \and \inferrule{d : \mathbb{N} \\\mathcal{D} : \Dyck_d}{\mathcal{D} \Uparrow : \Dyck_{d + 1}} \and \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_{d + 1}}{\mathcal{D} \Downarrow : \Dyck_d} \end{mathpar} In any prefix of a Dyck word \(\mathcal{D} : \Dyck_d\), the number of ``up'' moves (given by constructor \(\Uparrow\)) must be greater than or equal to the number of ``down'' moves (given by constructor \(\Downarrow\)). The trailing dimension \(d\) is the number of ``up'' moves minus the number of ``down'' moves in the whole word. \end{definition} Dyck words can be given a visual interpretation as a \emph{mountain diagram}. To obtain such a diagram we start on the left-hand side, and draw a continuous line by drawing an upwards sloping segment for each \(\Uparrow\) in the word, and a downwards sloping segment for each \(\Downarrow\) in the word. An example of such a diagram is given in \cref{fig:mountain}. \begin{figure}[ht] \centering \[\begin{tikzcd}[column sep = small, cells={inner sep = 0}, arrows={no head}] && \bullet && \bullet \\ & \bullet && \bullet && \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet \arrow[from=3-1, to=2-2] \arrow[from=2-2, to=1-3] \arrow[from=1-3, to=2-4] \arrow[from=2-4, to=1-5] \arrow[from=1-5, to=2-6] \arrow[from=2-6, to=3-7] \arrow[from=3-7, to=2-8] \arrow[from=2-8, to=3-9] \end{tikzcd}\] \caption[Mountain diagram]{Mountain diagram for \(\circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \Uparrow\, \Downarrow\, : \Dyck_0\).} \label{fig:mountain} \end{figure} The rules \(\circleddash\), \(\Uparrow\), and \(\Downarrow\) directly correspond to the rules \textsc{pss}, \textsc{pse}, and \textsc{psd} that generate the typing judgement for ps-contexts. From a Dyck word, we can directly construct this context by induction. \begin{definition} For a Dyck word \(\mathcal{D} : \Dyck_d\), its associated context \(\lfloor \mathcal{D} \rfloor\), associated type \(\ty_{\mathcal{D}} : \Type_{\lfloor \mathcal{D} \rfloor}\), and associated term \(\tm_{\mathcal{D}} : \Term_{\lfloor \mathcal{D} \rfloor}\) are defined by mutual induction on \(\mathcal{D}\): \begin{align*} \lfloor \circleddash \rfloor &= (x : \star)\\ \lfloor \mathcal{D} \Uparrow \rfloor &= \lfloor \mathcal{D} \rfloor, (y_{\mathcal{D}} : \ty_{\mathcal{D}}), (f_{\mathcal{D}} : \arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y_{\mathcal{D}}})\\ \lfloor \mathcal{D} \Downarrow \rfloor &= \lfloor \mathcal{D} \rfloor\\[10pt] \ty_{\circleddash} &= \star\\ \ty_{\mathcal{D}\Uparrow} &= \arr{\wk(\wk(\tm_{\mathcal{D}}))} {\wk(\wk(\ty_{\mathcal{D}}))} {y_{\mathcal{D}}}\\ \ty_{\mathcal{D} \Downarrow} &= \base(\ty_{\mathcal{D}})&\text{where }\base(\arr s A t) = A\\[10pt] \tm_{\circleddash} &= x\\ \tm_{\mathcal{D}\Uparrow} &= f_{\mathcal{D}}\\ \tm_{\mathcal{D}\Downarrow} &= \tgt(\ty_{\mathcal{D}})&\text{where }\tgt(\arr s A t) = t \end{align*} The variable names given here are used to avoid ambiguity in the definition. As we consider contexts up to \(\alpha\)-equality, we may freely change these variable names. The \(\tgt\) and \(\base\) operations are well-defined here as it may be checked by a simple induction that \(\dim(\ty_{\mathcal{D}}) = d\) for \(\mathcal{D} : \Dyck_d\), ensuring that we only apply \(\tgt\) and \(\base\) to types of strictly positive dimension. \end{definition} The tight correspondence between the rules used to construct Dyck words and ps-contexts allows an easy proof that the contexts associated to Dyck words are in fact pasting diagrams.
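Before giving this proof, we pause to fix intuitions with a small executable sketch of Dyck words. The Haskell sketch below is hypothetical (the formalisation indexes Dyck words by their trailing dimension, whereas here it is merely computed), representing a word as a list of moves read left to right; it checks the prefix condition, computes the trailing dimension, and counts the peaks of the mountain diagram, which will be characterised inductively in the next definition.
\begin{verbatim}
-- A toy model of Dyck words (hypothetical; not the Agda formalisation).
data Move = U | D deriving (Eq, Show)

type DyckWord = [Move] -- moves read left to right

-- Trailing dimension: "up" moves minus "down" moves.
trailingDim :: DyckWord -> Int
trailingDim w = length [m | m <- w, m == U] - length [m | m <- w, m == D]

-- Well-formedness: no prefix contains more "down" than "up" moves.
wellFormed :: DyckWord -> Bool
wellFormed w = all (>= 0) (scanl height 0 w)
  where height h U = h + 1
        height h D = h - 1

-- Peaks: an "up" move immediately followed by a "down" move.
peaks :: DyckWord -> Int
peaks w = length [() | (U, D) <- zip w (drop 1 w)]

-- The word whose mountain diagram is drawn above.
example :: DyckWord
example = [U, U, D, U, D, D, U, D]

main :: IO ()
main = print (wellFormed example, trailingDim example, peaks example)
-- Output: (True,0,3) -- a valid word of trailing dimension 0 whose
-- three peaks match the locally maximal variables of the context Theta.
\end{verbatim}
With these intuitions in place, we return to the typing of the associated constructions.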
\begin{lemma} \label{lem:dyck-typing}
For a Dyck word \(\mathcal{D} : \Dyck_d\), its associated context, type, and term are all well-formed: \[ \lfloor \mathcal{D} \rfloor \vdash \qquad \lfloor \mathcal{D} \rfloor \vdash \ty_{\mathcal{D}} \qquad \lfloor \mathcal{D} \rfloor \vdash \tm_{\mathcal{D}} : \ty_{\mathcal{D}} \] In addition to being a well-formed context, the context associated to a Dyck word is a ps-context; the following judgement holds: \[ \lfloor \mathcal{D} \rfloor \vdash_{\mathsf{ps}} \tm_{\mathcal{D}} : \ty_{\mathcal{D}} \] and so if \(\mathcal{D} : \Dyck_0\), we have \(\lfloor \mathcal{D} \rfloor \vdash_{\mathsf{ps}}\). Further, all ps-contexts are the associated context of a Dyck word.
\end{lemma}
\begin{proof}
Due to the similarity of the rules for ps-contexts and Dyck words, this follows quickly from simple inductions, which are given in the formalisation. The proofs for the typing judgements appear in \module{Catt.Dyck.Typing} and the proofs for the ps-context judgements appear in \module{Catt.Dyck.Pasting}.
\end{proof}
The locally maximal variables in the context associated to a Dyck word correspond exactly to the points in the word where there is an upwards move followed immediately by a downwards move, creating a peak in the mountain diagram. These peaks can be given an inductive characterisation.
\begin{definition}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word. A \emph{peak} of \(\mathcal{D}\), written \(p : \Peak_{\mathcal{D}}\), is inductively defined by the following rules:
\begin{mathpar} \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_d}{\mathcal{D} \UDPeak : \Peak_{\mathcal{D} \Uparrow\,\Downarrow}}\and \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_d \\ p : \Peak_{\mathcal{D}}}{p \UpPeak : \Peak_{\mathcal{D} \Uparrow}}\and \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_{d+1} \\ p : \Peak_{\mathcal{D}}}{p \DownPeak : \Peak_{\mathcal{D} \Downarrow}} \end{mathpar}
From each peak \(p : \Peak_{\mathcal{D}}\), a term \(\lfloor p \rfloor\) of \(\lfloor \mathcal{D} \rfloor\) can be inductively defined by: \[ \lfloor \mathcal{D} \UDPeak \rfloor = f_\mathcal{D} \qquad \lfloor p \UpPeak \rfloor = \wk(\wk \lfloor p \rfloor) \qquad \lfloor p \DownPeak \rfloor = \lfloor p \rfloor\] The term \(\lfloor p \rfloor\) is a locally maximal variable of \(\lfloor \mathcal{D} \rfloor\).
\end{definition}
\begin{example} \label{ex:dyck-peaks}
Recall the ps-context \(\Theta\) from \cref{ex:lm}. This context is the associated context of the Dyck word: \[ \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \Uparrow\, \Downarrow\] for which the mountain diagram is given in \cref{fig:mountain}. The three locally maximal variables \(\alpha\), \(\beta\), and \(j\) correspond to the peaks: \[ \circleddash \Uparrow\, \UDPeak\, \UpPeak\, \DownPeak\, \DownPeak\, \UpPeak\, \DownPeak \qquad \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \UDPeak\, \DownPeak\, \UpPeak\, \DownPeak \qquad \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \UDPeak \] which themselves correspond to the three peaks of the mountain diagram, with the height of each peak corresponding to the dimension of each locally maximal variable.
\end{example}
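Continuing the Haskell sketch from above (again our own illustration, with invented names), peaks can be represented as positions in a word and enumerated directly:
\begin{verbatim}
-- Peaks, mirroring the three rules: UDHere addresses a trailing
-- Up-then-Down pair, while UpThere and DownThere skip a trailing move.
data Peak = UDHere | UpThere Peak | DownThere Peak
  deriving Show

-- All peaks of a word.
peaks :: Dyck -> [Peak]
peaks Done     = []
peaks (Up d)   = map UpThere (peaks d)
peaks (Down d) = [UDHere | isUp d] ++ map DownThere (peaks d)
  where isUp (Up _) = True
        isUp _      = False

-- ghci> length (peaks mountain)
-- 3
-- matching the three locally maximal variables of the example above.
\end{verbatim}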
All disc contexts are pasting diagrams, and hence each is the associated context of a Dyck word.
\begin{definition}
Let \(\mathcal{D}^n\) be the Dyck word with \(n\) upwards moves followed by \(n\) downwards moves. The equality \( \lfloor \mathcal{D}^n \rfloor \equiv D^n\) follows from a trivial induction. If \(n > 0\), there is a unique peak of \(\mathcal{D}^n\), with associated term \(d_n\).
\end{definition}
We lastly show that a Dyck word can be suspended, which is expected as ps-contexts are closed under suspension. The various constructions associated to a suspended Dyck word are equal to the same constructions on the unsuspended Dyck word.
\begin{lemma}
Dyck words are closed under suspension. We define the suspension of a Dyck word \(\mathcal{D} : \Dyck_d\) to be the Dyck word \(\Sigma(\mathcal{D}) : \Dyck_{d+1}\) obtained by inserting an additional up move at the start of the word, or which can alternatively be defined inductively by: \[ \Sigma(\circleddash) = \circleddash \Uparrow \qquad \Sigma(\mathcal{D}\Uparrow) = \Sigma(\mathcal{D})\Uparrow \qquad \Sigma(\mathcal{D}\Downarrow) = \Sigma(\mathcal{D})\Downarrow \] The following equalities hold: \[ \lfloor \Sigma(\mathcal{D}) \rfloor = \Sigma(\lfloor \mathcal{D} \rfloor) \qquad \ty_{\Sigma(\mathcal{D})} = \Sigma(\ty_{\mathcal{D}}) \qquad \tm_{\Sigma(\mathcal{D})} = \Sigma(\tm_{\mathcal{D}}) \] for each Dyck word \(\mathcal{D}\). For each peak \(p : \Peak_{\mathcal{D}}\), there is an associated peak \(\Sigma(p) : \Peak_{\Sigma(\mathcal{D})}\) which is defined similarly.
\end{lemma}
\begin{proof}
These properties are all proved by straightforward induction on \(\mathcal{D}\). The formalised proofs appear in \module{Catt.Dyck.Properties}.
\end{proof}
The Dyck words presented in this section can be viewed as a more direct syntax for pasting contexts, on which induction can easily be performed. For this reason, most of the properties of Dyck words follow from routine inductions, and hence are relegated to the formalisation. The key contribution of this section is the characterisation of locally maximal variables as peaks, which have an easy inductive definition due to the simplicity of Dyck words.
\begin{remark}
All locally maximal variables of ps-contexts are identified with peaks, except for the unique variable of the singleton context. This discrepancy will make no difference for pruning, as a \(0\)-dimensional variable could never have been sent to an identity and so would never have been a candidate for pruning.
\end{remark}
\subsection{The pruning construction} \label{sec:prune-construction}
Equipped with Dyck words, and a classification of locally maximal variables as peaks, we are now able to define each of the constructions used in the pruning operation.
\begin{definition}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word, and \(p : \Peak_{\mathcal{D}}\) be a peak of \(\mathcal{D}\).
The pruned Dyck word \(\mathcal{D} \sslash p : \Dyck_d\) and substitution \(\pi_p : \lfloor \mathcal{D}\rfloor \to \lfloor \mathcal{D} \sslash p \rfloor\) are then defined by induction on the peak \(p\) by the following equations:
\begin{align*} \mathcal{D} \Uparrow\, \Downarrow \sslash \mathcal{D} \UDPeak &= \mathcal{D}\\ \mathcal{D} \Uparrow \sslash p \UpPeak &= (\mathcal{D} \sslash p) \Uparrow \\ \mathcal{D} \Downarrow \sslash p \DownPeak &= (\mathcal{D} \sslash p) \Downarrow \\[10pt] \pi_{\mathcal{D}\UDPeak} &= \langle \id_{\lfloor \mathcal{D} \rfloor} , \tm_{\mathcal{D}}, \id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}}) \rangle\\ \pi_{p \UpPeak} &= \langle \wk(\wk(\pi_p)) , y_{\mathcal{D}}, f_{\mathcal{D}} \rangle\\ \pi_{p \DownPeak} &= \pi_p\\ \intertext{If we further have a substitution \(\sigma : \arr {\lfloor \mathcal{D} \rfloor} \star \Gamma\) for some context \(\Gamma\), then the pruned substitution \(\sigma \sslash p : \arr {\lfloor \mathcal{D} \sslash p \rfloor} \star \Gamma\) can be formed:} \langle \sigma, s, t \rangle \sslash \mathcal{D}\UDPeak &= \sigma \\ \langle \sigma, s, t \rangle \sslash p \UpPeak &= \langle \sigma \sslash p, s, t \rangle \\ \sigma \sslash p \DownPeak &= \sigma \sslash p \end{align*}
\end{definition}
Each peak in a Dyck word corresponds to a consecutive upwards move and downwards move. Pruning this peak corresponds to removing these two moves, which does not change the trailing dimension of the Dyck word. The effect on the mountain diagram representation can be seen in \cref{fig:prune}.
\begin{figure}[ht]
\centering
% https://q.uiver.app/#q=WzAsMTcsWzAsMiwiXFxidWxsZXQiXSxbMSwxLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMSwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFs1LDEsIlxcYnVsbGV0Il0sWzYsMiwiXFxidWxsZXQiXSxbNywxLCJcXGJ1bGxldCJdLFs4LDIsIlxcYnVsbGV0Il0sWzksMSwiXFxyaWdodHNxdWlnYXJyb3ciXSxbMTAsMiwiXFxidWxsZXQiXSxbMTEsMSwiXFxidWxsZXQiXSxbMTIsMCwiXFxidWxsZXQiXSxbMTMsMSwiXFxidWxsZXQiXSxbMTQsMiwiXFxidWxsZXQiXSxbMTUsMSwiXFxidWxsZXQiXSxbMTYsMiwiXFxidWxsZXQiXSxbMCwxXSxbMSwyLCIiLDAseyJjb2xvdXIiOlswLDYwLDYwXX1dLFsyLDMsIiIsMCx7ImNvbG91ciI6WzAsNjAsNjBdfV0sWzMsNF0sWzQsNV0sWzUsNl0sWzYsN10sWzcsOF0sWzEwLDExXSxbMTEsMTJdLFsxMiwxM10sWzEzLDE0XSxbMTQsMTVdLFsxNSwxNl1d
% tex-fmt: skip
\[\begin{tikzcd}[column sep = small, cells={inner sep = 0}, arrows={no head}] && |[color={rgb,255:red,204;green,0;blue,14}]|\bullet && \bullet &&&&&&&& \bullet \\ & \bullet && \bullet && \bullet && \bullet && \rightsquigarrow && \bullet && \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet && \bullet &&&& \bullet && \bullet \arrow[from=3-1, to=2-2] \arrow[color={Diag2}, from=2-2, to=1-3] \arrow[color={Diag2}, from=1-3, to=2-4] \arrow[from=2-4, to=1-5] \arrow[from=1-5, to=2-6] \arrow[from=2-6, to=3-7] \arrow[from=3-7, to=2-8] \arrow[from=2-8, to=3-9] \arrow[from=3-11, to=2-12] \arrow[from=2-12, to=1-13] \arrow[from=1-13, to=2-14] \arrow[from=2-14, to=3-15] \arrow[from=3-15, to=2-16] \arrow[from=2-16, to=3-17] \end{tikzcd}\]
\caption[Pruning]{Pruning of peak \(\circleddash \Uparrow\, \UDPeak\, \UpPeak\, \DownPeak\, \DownPeak\, \UpPeak\, \DownPeak\).}
\label{fig:prune}
\end{figure}
When a peak is pruned, the locally maximal variable and its target are removed from the associated context. The substitution \(\pi_{\mathcal{D} \UDPeak}\) simply maps these two variables to \(\id(\ty_{\mathcal{D}},\tm_{\mathcal{D}})\) and \(\tm_{\mathcal{D}}\) respectively, where the Dyck term \(\tm_{\mathcal{D}}\) is the source of the locally maximal variable. Pruning a substitution simply removes the terms corresponding to the removed variables in the associated context.
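At the level of raw words, the first three equations above have a direct rendering in the running Haskell sketch; the substitution component \(\pi_p\) is not modelled.
\begin{verbatim}
-- Pruning a peak removes its Up-then-Down pair; Nothing signals a
-- peak that does not address a valid position in the word.
prune :: Dyck -> Peak -> Maybe Dyck
prune (Down (Up d)) UDHere        = Just d
prune (Up d)        (UpThere p)   = Up   <$> prune d p
prune (Down d)      (DownThere p) = Down <$> prune d p
prune _             _             = Nothing

-- ghci> prune mountain UDHere    -- prune the final peak of the word
-- Just (Down (Down (Up (Down (Up (Up Done))))))
\end{verbatim}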
\begin{example}
Let \(\Gamma = (x : \star), (f : \arr x \star x)\) and consider the term \(f * \id(x)\), which is given by: \[ \Coh {(a : \star), (b : \star), (c : a \to b), (d : \star), (e : b \to d)} {a \to d} {\langle x, x, f, x, \id(\star,x) \rangle} \] The context in this coherence is the associated context of the Dyck word \(\circleddash \Uparrow\,\Downarrow\,\Uparrow\,\Downarrow\), which has a peak \(\circleddash \Uparrow\,\Downarrow\,\UDPeak\) corresponding to the locally maximal variable \(e\). Since \(e\) is sent to an identity by the substitution, pruning can be applied to get:
\begin{align*} \circleddash \Uparrow\,\Downarrow\,\Uparrow\,\Downarrow \sslash \circleddash \Uparrow\,\Downarrow\,\UDPeak &= \circleddash \Uparrow\, \Downarrow\\ \pi_{\circleddash \Uparrow\,\Downarrow\,\UDPeak} &= \langle a, b, c, b, \id(\star,b) \rangle\\ \langle x, x, f, x, \id(\star,x) \rangle \sslash \circleddash \Uparrow\,\Downarrow\,\UDPeak &= \langle x,x,f\rangle \end{align*}
This results in the term: \[ \Coh {(a : \star), (b : \star), (c : a \to b)} {(a \to d) \sub {\langle a, b, c, b, \id(\star,b) \rangle} } {\langle x, x, f \rangle} \equiv \Coh {(a : \star), (b : \star), (c : a \to b)} {(a \to b)} {\langle x, x, f \rangle} \] which is the unary composite of \(f\). In the presence of disc removal, this term could further simplify to the variable \(f\).
\end{example}
With these constructions, we can define the pruning rule.
\begin{definition}
A term \(t\) \emph{is an identity} if \(t \equiv \id(A,s)\) for some type \(A\) and some term \(s\). The \emph{pruning rule set}, \prune, is the set consisting of the triples: \[ (\Gamma, \Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma, \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}) \] for each Dyck word \(\mathcal{D} : \Dyck_0\), peak \(p : \Peak_{\mathcal{D}}\), type \(A : \Type_{\lfloor \mathcal{D} \rfloor}\), and substitution \(\sigma : \arr {\lfloor \mathcal{D} \rfloor} \star \Gamma\) where \(\lfloor p \rfloor \sub \sigma\) is an identity. A set of rules \(\mathcal{R}\) \emph{contains pruning} if \(\prune \subseteq \mathcal{R}\). Pruning makes the following rule admissible:
\begin{mathpar} \inferrule{\mathcal{D} : \Dyck_0 \\ p : \Peak_{\mathcal{D}} \\ \lfloor \mathcal{D} \rfloor \vdash A \\ \Gamma \vdash \sigma : \lfloor \mathcal{D} \rfloor \\\\ (\lfloor \mathcal{D} \rfloor, \Supp(\src(A)), \Supp(\tgt(A))) \in \mathcal{O}\\ \lfloor p \rfloor \sub \sigma \text{ is an identity}}{\Gamma \vdash \Coh {\lfloor \mathcal{D} \rfloor} A \sigma = \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}}\textsc{prune} \end{mathpar}
The set \(\mathcal{R}\) \emph{has pruning} if the rule \textsc{prune} holds in the generated theory.
\end{definition}
\subsection{Properties of pruning}
We start with the aim of proving that each construction involved in pruning satisfies the expected typing judgements. To do this, the following lemma will be necessary, which describes the interaction of the Dyck word construction with pruning.
\begin{lemma} \label{lem:dyck-prune-prop}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word. Then the following equations hold:
\begin{align*} \ty_{\mathcal{D}} \sub{\pi_p} &\equiv \ty_{\mathcal{D} \sslash p}\\ \tm_{\mathcal{D}} \sub{\pi_p} &\equiv \tm_{\mathcal{D} \sslash p} \end{align*}
for any peak \(p : \Peak_{\mathcal{D}}\) of \(\mathcal{D}\).
\end{lemma}
\begin{proof}
The proof proceeds by induction on the peak \(p\), proving both equations simultaneously. Both equations hold by routine calculations given in \module{Catt.Dyck.Pruning.Properties} by the functions \func{Catt.Dyck.Pruning.Properties}{dyck-type-prune} and \func{Catt.Dyck.Pruning.Properties}{dyck-term-prune}.
\end{proof}
This allows the main typing properties of this section to be given.
\begin{proposition} \label{prop:prune-ty}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word and let \(p : \Peak_{\mathcal{D}}\) be a peak of this word. Then: \[ \lfloor \mathcal{D} \sslash p \rfloor \vdash \pi_p : \lfloor \mathcal{D} \rfloor \] Given a substitution \(\sigma\) with \(\Gamma \vdash \sigma : \lfloor \mathcal{D} \rfloor\), where \(\lfloor p \rfloor \sub \sigma\) is an identity, the equality and typing judgements: \[ \Gamma \vdash \sigma = \pi_p \bullet (\sigma \sslash p) \qquad \Gamma \vdash \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor \] hold.
\end{proposition}
\begin{proof}
We prove each judgement holds in turn by induction on the peak \(p\). For the judgement: \[ \lfloor \mathcal{D} \sslash p \rfloor \vdash \pi_p : \lfloor \mathcal{D} \rfloor \] the case when the peak is of the form \(p\DownPeak\) is trivial. The case where it is of the form \(\mathcal{D}\UDPeak\) follows easily from \cref{lem:dyck-typing,cor:id-typing}. For the case where the peak is of the form \(p\UpPeak\), it must be shown that: \[ \Delta \vdash \langle \wk(\wk(\pi_p)), y, f \rangle : \lfloor \mathcal{D} \rfloor, (y : \ty_{\mathcal{D}}), (f : \arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} y) \] where \(\Delta = \lfloor \mathcal{D} \sslash p \rfloor, (y : \ty_{\mathcal{D} \sslash p}), (f : \arr{\wk(\tm_{\mathcal{D}\sslash p})} {\wk(\ty_{\mathcal{D}\sslash p})} {y})\). This requires proofs of:
\begin{align*} \Delta &\vdash \wk(\wk(\pi_p)) : \lfloor \mathcal{D} \rfloor\\ \Delta &\vdash y : \ty_{\mathcal{D}} \sub {\pi_p}\\ \Delta &\vdash f : (\arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} y) \sub {\langle \wk(\pi_p), y \rangle} \end{align*}
The first part follows from the inductive hypothesis (and the typing of weakening). The other two judgements follow from some calculation and \cref{lem:dyck-prune-prop}.
For the second judgement: \[ \Gamma \vdash \sigma = \pi_p \bullet (\sigma \sslash p)\] The \(p \DownPeak\) case is again trivial. The \(p \UpPeak\) case follows easily from properties of weakening and the inductive hypothesis. For the \(\mathcal{D} \UDPeak\) case, we suppose the substitution is of the form \(\langle \sigma, s, \id(A,t) \rangle\) and are required to show that: \[ \Gamma \vdash \langle \id_{\lfloor \mathcal{D} \rfloor}, \tm_{\mathcal{D}}, \id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}})\rangle \bullet \sigma = \langle \sigma, s, \id(A,t) \rangle \] It is immediate that \(\id_{\lfloor \mathcal{D} \rfloor} \bullet \sigma \equiv \sigma\) and so it remains to show that \(\Gamma \vdash \tm_{\mathcal{D}} \sub \sigma = s\) and \(\Gamma \vdash \id(\ty_{\mathcal{D}},\tm_{\mathcal{D}}) \sub \sigma = \id(A,t)\).
By deconstructing the typing derivation of \(\langle \sigma, s, \id(A,t) \rangle\), we have: \[ \Gamma \vdash \id(A,t) : (\arr{\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y}) \sub {\langle \sigma ,s \rangle} \] By \cref{cor:id-typing} and the uniqueness of typing, we must have: \[ \Gamma \vdash \arr t A t = (\arr{\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y}) \sub {\langle \sigma ,s \rangle} \equiv \arr {\tm_{\mathcal{D}} \sub \sigma} {\ty_{\mathcal{D}} \sub \sigma} {s} \] and so \(A = \ty_{\mathcal{D}} \sub \sigma\) and \(s = t = \tm_{\mathcal{D}} \sub \sigma\). The equality \(\id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}}) = \id(A,t)\) follows as equality is respected by the identity construction, which can be proved by a simple induction.
Lastly, we consider the judgement: \[ \Gamma \vdash \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor \] The only difficult case is for the peak \(p \UpPeak\), where we can assume that the substitution is of the form \(\langle \sigma, s, t\rangle\), such that: \[ \langle \sigma, s, t\rangle \sslash p \UpPeak \equiv \langle \sigma \sslash p, s, t\rangle\] Typing for \(\sigma \sslash p\) follows from the inductive hypothesis, and the typing for \(s\) and \(t\) follows from applying conversion rules to the corresponding parts of the typing derivation for \(\langle \sigma, s, t \rangle\). After some computation, the following equalities are needed for these conversion rules:
\begin{align*} \Gamma &\vdash \tm_{\mathcal{D}} \sub \sigma = \tm_{\mathcal{D} \sslash p} \sub {\sigma \sslash p}\\ \Gamma &\vdash \ty_{\mathcal{D}} \sub \sigma = \ty_{\mathcal{D} \sslash p} \sub {\sigma \sslash p} \end{align*}
The first is given by:
\begin{align*} \tm_{\mathcal{D}} \sub \sigma &= \tm_{\mathcal{D}} \sub {\pi_p \bullet (\sigma \sslash p)}\\ &\equiv \tm_{\mathcal{D}} \sub {\pi_p} \sub {\sigma \sslash p}\\ &\equiv \tm_{\mathcal{D} \sslash p} \sub {\sigma \sslash p} \end{align*}
and the second follows similarly, completing the proof.
\end{proof}
We next show that pruning has the expected properties on the Dyck words \(\mathcal{D}^n\), which correspond to disc contexts.
\begin{proposition} \label{prop:prune-disc}
Let \(n > 0\), and let \(p\) be the unique peak of \(\mathcal{D}^n\). Then: \[ \mathcal{D}^n \sslash p \equiv \mathcal{D}^{n-1} \qquad \{\arr s A t,u\} \sslash p \equiv \{A,s\}\] for all \(A,s,t,u\) where \(\dim(A) = n - 1\).
\end{proposition}
\begin{proof}
Both properties are immediate.
\end{proof}
We now turn our attention to proving that the pruning equality set satisfies all the conditions from \cref{sec:ruleset}. We begin with the tameness conditions, omitting the weakening condition, as it follows from the substitution condition.
\begin{proposition} \label{prop:prune-tame}
For all \(\mathcal{D} : \Dyck_d\), peaks \(p : \Peak_{\mathcal{D}}\), and substitutions \(\sigma : \lfloor \mathcal{D} \rfloor \to \Delta\) and \(\tau : \Delta \to \Gamma\), the following equality holds: \[ (\sigma \sslash p) \bullet \tau \equiv (\sigma \bullet \tau) \sslash p \] Hence, the set \prune satisfies the \(\mathcal{R}\)-substitution condition for any equality set \(\mathcal{R}\), and so also satisfies the weakening condition. Furthermore, the following equalities hold: \[\Sigma(\mathcal{D}) \sslash \Sigma(p) = \Sigma(\mathcal{D} \sslash p) \qquad \pi_{\Sigma(p)} \equiv \Sigma(\pi_p) \qquad \Sigma(\sigma \sslash p) \equiv \Sigma(\sigma) \sslash \Sigma(p)\] Therefore, the set \prune also satisfies the suspension condition, making the equality set \prune tame.
\end{proposition}
\begin{proof}
Each of these syntactic equalities is easily proved by induction on the peak \(p\). Their proofs are given in the formalisation in \module{Catt.Dyck.Pruning.Properties} as \func{Catt.Dyck.Pruning.Properties}{//s-sub}, \func{Catt.Dyck.Pruning.Properties}{prune-susp-peak}, \funcn{Catt.Dyck.Pruning.Properties}{susp-π}{susp-\(\pi\)}, and \func{Catt.Dyck.Pruning.Properties}{susp-//s}.
\end{proof}
To show that the support property holds, we must prove that \(\Supp(\sigma) = \Supp(\sigma \sslash p)\). We aim to do this by observing that \(\Supp(\sigma) = \Supp(\pi_p \bullet (\sigma \sslash p))\) and that \(\Supp(\pi_p \bullet (\sigma \sslash p)) = \Supp(\sigma \sslash p)\). By employing the proof strategy for the support condition introduced in \cref{sec:further-conditions}, the first equation will follow from the equality \(\sigma = \pi_p \bullet (\sigma \sslash p)\), which we may assume holds in a theory satisfying the support condition. For the second equation we need the following lemma.
\begin{lemma} \label{lem:pi-bdry}
For all \(n : \mathbb{N}\), \(\epsilon \in \{-,+\}\), \(\mathcal{D} : \Dyck_d\), and \(p : \Peak_{\mathcal{D}}\): \[ \bdry n \epsilon {\lfloor \mathcal{D} \rfloor} \sub {\pi_p} = \bdry n \epsilon {\lfloor \mathcal{D} \sslash p \rfloor} \] and so \(\Supp(\pi_p) = \Var(\lfloor \mathcal{D} \sslash p \rfloor)\).
\end{lemma}
\begin{proof}
The main equation in this lemma is given by a long and technical induction on the peak \(p\). The details of this induction appear in the formalisation in the function \funcn{Catt.Dyck.Pruning.Support}{π-boundary-vs}{\(\pi\)-boundary-vs}, which appears in the module \module{Catt.Dyck.Pruning.Support}. The equation \(\Supp(\pi_p) = \Var(\lfloor \mathcal{D} \sslash p \rfloor)\) follows from \cref{prop:vs-sub,lem:bdry-full}, by setting \(n = \dim(\lfloor \mathcal{D} \rfloor)\).
\end{proof}
We are now ready to prove that the support condition holds.
\begin{proposition} \label{prop:prune-supp}
Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition. Then the set \prune satisfies the \(\mathcal{R}\)-support condition.
\end{proposition}
\begin{proof}
It suffices to prove that: \[ \Supp(\Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma) = \Supp(\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}) \] for \(\mathcal{D} : \Dyck_0\), \(p : \Peak_{\mathcal{D}}\), type \(A\), and substitution \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\), where \(\lfloor p \rfloor \sub \sigma\) is an identity and \(\Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma : B\) for some \(B\). By inspection of the typing derivation we obtain an instance of the judgement \(\Gamma \vdash_{\mathcal{R}} \sigma : \lfloor \mathcal{D} \rfloor\), and so:
\begin{align*} \Supp(\Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma) &= \Supp(\sigma)\\ &= \Supp(\pi_p \bullet (\sigma \sslash p))&(*)\\ &= \Supp(\pi_p) \sub {\sigma \sslash p}\\ &= \Var(\lfloor \mathcal{D} \sslash p \rfloor) \sub {\sigma \sslash p}&\text{by \cref{lem:pi-bdry}}\\ &= \Supp(\sigma \sslash p) \\ &= \Supp(\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}) \end{align*}
where equality \((*)\) is derived by applying \cref{prop:supp-prop} to the equality \[\Gamma \vdash_{\mathcal{R}} \sigma = \pi_p \bullet (\sigma \sslash p)\] from \cref{prop:prune-ty}.
\end{proof}
To prove that the preservation condition holds, it is necessary to show that pruning sends valid operations to valid operations, so that the coherence created by pruning, whose type is \(A \sub{\pi_p}\), is well-formed. This cannot be deduced from any of the conditions that have been imposed on the operation set \(\mathcal{O}\) so far. Therefore, we introduce the following additional condition.
\begin{definition}
An operation set \(\mathcal{O}\) \emph{supports pruning} if for all \(\mathcal{D} : \Dyck_0\), \(p : \Peak_{\mathcal{D}}\), and variable sets \(U,V \subseteq \Var(\lfloor \mathcal{D} \rfloor)\) we have: \[ (\lfloor \mathcal{D} \sslash p \rfloor, U \sub{\pi_p}, V \sub{\pi_p}) \in \mathcal{O} \] whenever \((\lfloor \mathcal{D} \rfloor, U , V) \in \mathcal{O}\).
\end{definition}
The globular operation set trivially supports pruning. From \cref{lem:pi-bdry,prop:std-op}, it can be proved that the regular operation set supports pruning. We can now prove that the preservation condition holds.
\begin{proposition} \label{prop:prune-preserve}
Let \(\mathcal{R}\) be a tame equality rule set and suppose the operation set \(\mathcal{O}\) supports pruning. Then the set \prune satisfies the \(\mathcal{R}\)-preservation condition.
\end{proposition}
\begin{proof}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word and \(p : \Peak_{\mathcal{D}}\) be a peak of \(\mathcal{D}\). Further suppose \(\arr s A t : \Type_{\lfloor \mathcal{D} \rfloor}\), and \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\) such that \(\lfloor p \rfloor \sub \sigma\) is an identity and: \[ \Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \rfloor} {\arr s A t} \sigma : B\] for some type \(B : \Type_\Gamma\). By inspection of this typing derivation we have: \[ \lfloor \mathcal{D} \rfloor \vdash_{\mathcal{R}} A \qquad \Gamma \vdash_{\mathcal{R}} \sigma : \lfloor \mathcal{D} \rfloor \qquad (\lfloor \mathcal{D} \rfloor, \Supp(s), \Supp(t)) \in \mathcal{O} \qquad \Gamma \vdash_{\mathcal{R}} B = (\arr s A t) \sub \sigma\] and so by \cref{prop:prune-ty}, we have: \[\lfloor \mathcal{D} \sslash p \rfloor \vdash_{\mathcal{R}} \pi_p : \lfloor \mathcal{D} \rfloor \qquad \Gamma \vdash_{\mathcal{R}} \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor\] Therefore, as \(\mathcal{O}\) supports pruning, the following judgement holds: \[\Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {(\arr s A t) \sub {\pi_p}} {\sigma \sslash p} : (\arr s A t) \sub {\pi_p} \sub {\sigma \sslash p}\] and so by applying the conversion rule, it suffices to show that: \[ \Gamma \vdash_{\mathcal{R}} B = (\arr s A t) \sub {\pi_p} \sub {\sigma \sslash p}\] This follows from the equality \(B = (\arr s A t) \sub \sigma\) and the equality \(\sigma = \pi_p \bullet (\sigma \sslash p)\) from \cref{prop:prune-ty}.
\end{proof}
We end this section with a property of pruning that will be required to prove confluence. Suppose we have a Dyck word \(\mathcal{D}\) and two distinct peaks \(p, q : \Peak_{\mathcal{D}}\). Then both peaks can be pruned from \(\mathcal{D}\), in either order. A word-level sketch of this transport of peaks is given below, followed by a mountain-diagram example on the Dyck word from \cref{ex:dyck-peaks}.
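As with pruning itself, the transport of a peak \(q\) along the pruning of \(p\) (the peak written \(q_p\) in the proposition below) can be sketched in the running Haskell illustration; the function and its name are ours, and contexts and substitutions are again ignored.
\begin{verbatim}
-- transportPeak d p q computes the image q_p of the peak q in d // p,
-- assuming p and q are distinct peaks of d; Nothing otherwise.
transportPeak :: Dyck -> Peak -> Peak -> Maybe Peak
transportPeak (Down (Up _)) UDHere (DownThere (UpThere q)) = Just q
transportPeak (Down (Up _)) (DownThere (UpThere _)) UDHere = Just UDHere
transportPeak (Up d)   (UpThere p)   (UpThere q) =
  UpThere <$> transportPeak d p q
transportPeak (Down d) (DownThere p) (DownThere q) =
  DownThere <$> transportPeak d p q
transportPeak _ _ _ = Nothing

-- Word-level confluence, mirroring the proposition below: for
-- distinct peaks p and q of d, both of the following agree:
--   prune d p >>= \d' -> transportPeak d p q >>= prune d'
--   prune d q >>= \d' -> transportPeak d q p >>= prune d'
\end{verbatim}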
\[ \begin{tikzcd}[column sep = 0.7em, row sep = scriptsize, cells={inner sep = 0,shape=circle,anchor=center}, arrows={no head}] &&&&&&&&&&& \bullet \\ &&&&&&&&&& \bullet && \bullet && |[color=Diag1]|\bullet \\ &&&&&&&&& \bullet &&&& \bullet && \bullet \\ && |[color=Diag2]|\bullet && \bullet &&&& |[color=Diag2, rotate=35]|\mathclap{\rightsquigarrow} &&&&&&&& |[color=Diag1, rotate=-35]|\mathclap{\rightsquigarrow} && \bullet \\ & \bullet && \bullet && \bullet && |[color=Diag1]|\bullet &&&&&&&&&& \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet &&&&&&&& \bullet &&&& \bullet \\ &&&&&&&& |[color=Diag1, rotate=-35]|\mathclap{\rightsquigarrow} &&& |[color=Diag2]|\bullet && \bullet &&& |[color=Diag2, rotate=35]|\mathclap{\rightsquigarrow} \\ &&&&&&&&&& \bullet && \bullet && \bullet \\ &&&&&&&&& \bullet &&&&&& \bullet \arrow[from=6-1, to=5-2] \arrow[color=Diag2, from=5-2, to=4-3] \arrow[color=Diag2, from=4-3, to=5-4] \arrow[from=5-4, to=4-5] \arrow[from=4-5, to=5-6] \arrow[from=5-6, to=6-7] \arrow[color=Diag1, from=6-7, to=5-8] \arrow[color=Diag1, from=5-8, to=6-9] \arrow[from=3-10, to=2-11] \arrow[from=2-11, to=1-12] \arrow[from=1-12, to=2-13] \arrow[from=2-13, to=3-14] \arrow[color=Diag1, from=3-14, to=2-15] \arrow[color=Diag1, from=2-15, to=3-16] \arrow[from=6-17, to=5-18] \arrow[from=5-18, to=4-19] \arrow[from=4-19, to=5-20] \arrow[from=5-20, to=6-21] \arrow[from=9-10, to=8-11] \arrow[color=Diag2, from=8-11, to=7-12] \arrow[color=Diag2, from=7-12, to=8-13] \arrow[from=8-13, to=7-14] \arrow[from=7-14, to=8-15] \arrow[from=8-15, to=9-16] \end{tikzcd} \] The following proposition proves that both peaks of the Dyck word can be pruned, and that the order in which this is done does not matter. \begin{proposition} \label{prop:prune-conf} Suppose \(\mathcal{D} : \Dyck_d\) is a Dyck word and let \(p\) and \(q\) be two distinct peaks of \(\mathcal{D}\). Then there is a peak \(q_{p}\) of \(\mathcal{D} \sslash p\) such that: \[ \lfloor q_{p} \rfloor \equiv \lfloor q \rfloor \sub {\pi_{p}}\] and a similar peak \(p_{q}\) of \(\mathcal{D} \sslash q\). Furthermore, the following equations hold syntactically: \begin{mathpar} (\mathcal{D} \sslash p) \sslash q_{p} = (\mathcal{D} \sslash q) \sslash p_{q} \and \pi_p \bullet \pi_{q_p} \equiv \pi_q \bullet \pi_{p_q} \and (\sigma \sslash p) \sslash q_{p} = (\sigma \sslash q) \sslash p_{q} \end{mathpar} where the last equation holds for any \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\). \end{proposition} \begin{proof} All proofs proceed by a simultaneous induction on both the peaks \(p\) and \(q\), and are given in \module{Catt.Dyck.Pruning.Properties} in the formalisation. The construction of the peak \(q_p\) is given by the function \func{Catt.Dyck.Pruning.Properties}{prune-peak}, the equality \(\lfloor q_p \rfloor \equiv \lfloor q \rfloor \sub {\pi_p}\) is given by \func{Catt.Dyck.Pruning.Properties}{prune-peak-prop}, and the remaining three equations are given by \func{Catt.Dyck.Pruning.Properties}{prune-conf}, \funcn{Catt.Dyck.Pruning.Properties}{π-conf}{\(\pi\)-conf}, and \func{Catt.Dyck.Pruning.Properties}{prune-sub-conf}. \end{proof} \section{Trees} \label{sec:trees} During the next sections we build up to defining the insertion operation. This operation performs larger modifications to pasting diagrams than the pruning operation, and we will again want to represent pasting diagrams differently to make the definition in \cref{sec:insertion} as natural as possible. 
It is well known that pasting diagrams correspond to planar rooted trees \cite{Weber2004,leinster2004higher,batanin1998monoidal}, which we will simply refer to as \emph{trees}, and which can be defined as follows.
\begin{definition}
A \emph{tree} \(T : \Tree\) is inductively defined to be a (possibly empty) list of trees.
\end{definition}
Throughout this section we will make use of standard operations and notations for lists. A list that contains the elements \(x_i\) for \(i\) from \(0\) to \(n\) will be written in square bracket notation as \([x_0,x_1,x_2,\dots,x_n]\). Further, we use the notation \(\emp\) for the empty list and \(\doubleplus\) for the concatenation of lists, which is associative and has the empty list as its unit. We will use the Agda-like notation of writing \(n :: ns\) for a list whose first element (the head) is \(n\) and whose remainder (the tail) is \(ns\). The length of a list will be given by the operation \(\len\). We will use the notation \(\Sigma(T) = [T]\), and call \(\Sigma(T)\) the suspension of \(T\), for reasons that will become apparent once the context generated by a tree has been defined in \cref{sec:tree-contexts}.
We note that it will be common to see expressions of the form \(S :: T\) where \(S\) and \(T\) are both trees. It may seem as if this were an error, and that a concatenation operation should have been given instead, but in this case we are exploiting the identification of trees and lists of trees to treat \(S\) as a tree (an element of the list) and \(T\) as a list of trees.
We now define some common operations on trees.
\begin{definition} \label{def:treetrunk}
The \emph{depth} of a tree \(\dep(T)\) is \(0\) if \(T\) is empty or \(1 + \max_k{\dep(T_k)}\) if \(T = [T_0,\dots,T_n]\). For a tree \(T\), its \emph{trunk height}, \(\th(T)\), is \(1 + \th(T_0)\) if \(T = [T_0]\) and \(0\) otherwise. A tree is \emph{linear} if its trunk height equals its depth. Subtrees of a tree can be indexed by a list of natural numbers \(P\), giving a subtree \(T^P\) by letting \(T^{\emp} = T\) and \(T^{k::P} = {(T_k)}^P\) if \(T = [T_0, \dots, T_n]\).
\end{definition}
As these trees represent pasting diagrams, a context can be associated to each one. To be able to make effective use of trees we will need to understand this mapping to contexts, and the associated constructions used in this mapping. One of these constructions is suspension, which we have already seen. The second is an operation known as the wedge sum, which will be introduced in \cref{sec:wedge-sums}. Both these operations are mappings from contexts to contexts which preserve ps-context derivations. We will see in \cref{sec:tree-contexts} that a further result holds: these two operations (along with the singleton context) are sufficient to generate all ps-contexts.
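The following minimal Haskell sketch, with invented names, transcribes \cref{def:treetrunk} and the suspension notation; it is an illustration only, and differs from the binary representation used in the formalisation, which is discussed in the remark below.
\begin{verbatim}
-- A tree is a (possibly empty) list of trees.
data Tree = Node [Tree]
  deriving (Eq, Show)

dep :: Tree -> Int                 -- depth
dep (Node []) = 0
dep (Node ts) = 1 + maximum (map dep ts)

trunk :: Tree -> Int               -- trunk height
trunk (Node [t]) = 1 + trunk t
trunk _          = 0

linear :: Tree -> Bool             -- linear: trunk height equals depth
linear t = trunk t == dep t

susp :: Tree -> Tree               -- suspension: Sigma(T) = [T]
susp t = Node [t]

subtree :: Tree -> [Int] -> Maybe Tree   -- the subtree T^P
subtree t []        = Just t
subtree (Node ts) (k : p)
  | k < length ts   = subtree (ts !! k) p
  | otherwise       = Nothing
\end{verbatim}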
\begin{remark}
In the formalisation, trees are defined in \module{Catt.Tree} and take a slightly different form from the trees defined above: they are in fact defined as binary trees. This exploits an isomorphism between binary trees and trees with arbitrary (finite) branching. The constructors for the trees in the formalisation are called \(\mathsf{Sing}\), which stands for ``singleton'' and takes no arguments, and \(\mathsf{Join}\), which takes two trees as arguments. The isomorphism is generated from the following rules:
\begin{mathpar} \inferrule{ }{\mathsf{Sing} \simeq \emp}\and \inferrule{S \simeq S' \\ T \simeq T'}{\mathsf{Join}(S,T) \simeq S' :: T'} \end{mathpar}
Presenting trees in this way in the formalisation allows any induction to be done as a single induction over the constructors of a tree, instead of simultaneously inducting on the depth of the tree and on lists. We retain the standard notation for trees in this text for simplicity. Under the above isomorphism, this has no effect on the formal development.
\end{remark}
\subsection{Wedge sums} \label{sec:wedge-sums}
The wedge sum, just like suspension, is an operation inspired by a similar operation on topological spaces. Given two spaces \(X\) and \(Y\) and points \(x\) of \(X\) and \(y\) of \(Y\), the space \(X \vee Y\) can be formed by taking the disjoint union of \(X\) and \(Y\) and identifying the points \(x\) and \(y\). This construction satisfies a universal property: it is the colimit of the following diagram:
\begin{equation} \label[diagram]{diag:wedge-colimit} \begin{tikzcd} X && Y \\ & {\{*\}} \arrow["x", from=2-2, to=1-1] \arrow["y"', from=2-2, to=1-3] \end{tikzcd} \end{equation}
where the arrows labelled \(x\) and \(y\) send the unique point \(*\) to \(x\) and \(y\) respectively. Such a universal construction gives rise to two inclusions: \[\inc_X : X \to X \vee Y \qquad \inc_Y : Y \to X \vee Y\]
A similar colimit can be formed in the syntactic category of \Cattr. Since the variables of a context are ordered, every (non-empty) context in \Catt is naturally bipointed. For a context \(\Gamma\), the first point is given by the first variable of the context (which must have type \(\star\)), which we name \(\fst(\Gamma)\), and the second point is given by the last \(0\)-dimensional variable in the context, which we name \(\snd(\Gamma)\). We therefore restrict the construction above to the case where the chosen point of the left context \(\Gamma\) is \(\snd(\Gamma)\) and the chosen point of the right context \(\Delta\) is \(\fst(\Delta)\). This simplifies the construction, and will be the only case we need for forming trees. We note that \(\fst(\Sigma(\Gamma)) \equiv N\) and \(\snd(\Sigma(\Gamma)) \equiv S\), as we will commonly take the wedge sums of suspended contexts.
\begin{definition}
Let \(\Gamma\) and \(\Delta\) be non-empty contexts. We then mutually define the \emph{wedge sum} \(\Gamma \vee \Delta\) and inclusions \(\inc_\Gamma : \arr \Gamma \star {\Gamma \vee \Delta}\) and \(\inc_\Delta : \arr \Delta \star {\Gamma \vee \Delta}\) by induction on the context \(\Delta\), noting that the base case is \(\Delta = (x : A)\) as \(\Delta\) is non-empty.
\begin{align*} \Gamma \vee (x : A) &= \Gamma \\ \Gamma \vee \Delta, (x : A) &= \Gamma \vee \Delta, (x : A \sub {\inc_\Delta})\\[10pt] \inc_\Gamma &= \wk^{n - 1}(\id_\Gamma) &\text{when \(\Delta\) has length \(n\)}\\[10pt] \inc_{(x : A)} &= \langle \snd(\Gamma) \rangle\\ \inc_{\Delta, (x : A)} &= \langle \wk(\inc_\Delta), x \rangle \end{align*} If we further have substitutions \(\sigma : \arr \Gamma A \Theta\) and \(\tau : \arr \Delta A \Theta\), then we can define the substitution \( \sigma \vee \tau : \arr {\Gamma \vee \Delta} A \Theta\) again by induction on \(\Delta\): \begin{align*} \sigma \vee \langle A, s \rangle &= \sigma\\ \sigma \vee \langle \tau, s \rangle &= \langle \sigma, s \rangle \end{align*} We note that no extra property is needed to define this universal map, though to show it is well-formed we will need that \(\snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau\). \end{definition} We firstly prove some basic properties required for \(\Gamma \vee \Delta\) to be the colimit of \cref{diag:wedge-colimit}. \begin{lemma} \label{lem:wedge-sum-prop} Let \(\Gamma\) and \(\Delta\) be non-empty contexts. Then: \[ \inc_{\Gamma} \vee \inc_{\Delta} \equiv \id_{\Gamma \vee \Delta} \] Further, the following equations hold: \[ \inc_{\Gamma} \bullet (\sigma \vee \tau) \equiv \sigma \qquad \inc_{\Delta} \bullet (\sigma \vee \tau) \equiv \tau \] for substitutions \(\sigma : \arr \Gamma A \Theta\) and \(\tau : \arr \Delta A \Theta\) where the second equality requires that \(\snd(\Gamma) \sub \sigma \equiv \fst(\Delta) \sub \tau\). Lastly: \[ (\sigma \vee \tau) \bullet \mu \equiv (\sigma \bullet \mu) \vee (\tau \bullet \mu) \] where \(\mu : \arr \Theta B {\Theta'}\) is another substitution. \end{lemma} \begin{proof} Proofs appear as \func{Catt.Wedge.Properties}{sub-from-wedge-prop}, \func{Catt.Wedge.Properties}{sub-from-wedge-inc-left}, \func{Catt.Wedge.Properties}{sub-from-wedge-inc-right}, and \func{Catt.Wedge.Properties}{sub-from-wedge-sub} in \module{Catt.Wedge.Properties}. \end{proof} To simplify definitions of substitutions between wedge sums of contexts, we will write substitutions diagrammatically by specifying the individual components. Consider the following diagram: % https://q.uiver.app/?q=WzAsNixbMCwyLCJcXFNpZ21hXFxHYW1tYSJdLFsxLDIsIlxcdmVlIl0sWzIsMiwiXFxTaWdtYSBcXERlbHRhIl0sWzAsMCwiXFxTaWdtYSBcXEdhbW1hJyJdLFsyLDAsIlxcU2lnbWFcXERlbHRhJyJdLFsxLDAsIlxcdmVlIl0sWzAsMywiXFxTaWdtYSBcXHNpZ21hIl0sWzIsNCwiXFxTaWdtYSBcXHRhdSJdXQ== % tex-fmt: skip \[\begin{tikzcd}[column sep=tiny, row sep=10pt] {\Gamma'} & \vee & {\Delta'} &\vee &{\Theta'} \\ \\ \Gamma & \vee & \Delta & \arrow["{\sigma}", from=3-1, to=1-1, pos=.4] \arrow["{\tau}", from=3-3, to=1-3, pos=.4] \end{tikzcd} \] which is generated from substitutions \(\sigma : \Gamma \to \Gamma'\) and \(\tau : \Delta \to \Delta'\). A substitution \(\Gamma \vee \Delta \to \Gamma' \vee \Delta' \vee \Theta'\) can be generated by composing each arrow in the diagram with suitable inclusions so that its target is \(\Gamma' \vee \Delta' \vee \Theta'\), and then using the universal property of the wedge to map out of the source context. In the diagram above the generated substitution is: \[ ((\sigma \bullet \inc_{\Gamma'}\bullet \inc_{\Gamma' \vee \Delta'}) \vee (\tau \bullet \inc_{\Delta'}\bullet \inc_{\Gamma' \vee \Delta'})) \] To ensure these definitions are unique, the following proposition is needed: \begin{proposition} The wedge sum \(\vee\) is associative and has the singleton context \((x : \star)\) as its left and right unit. 
Given a context \(\Gamma\), the inclusions satisfy the following unitality properties: \[ \inc_{\Gamma} : \Gamma \to \Gamma \vee (x : \star) \equiv \id_\Gamma \qquad \inc_{\Gamma} : \Gamma \to (x : \star) \vee \Gamma \equiv \id_\Gamma \] and given substitutions \(\sigma : \arr \Gamma A \Xi\), \(\tau : \arr \Delta A \Xi\), and \(\mu : \arr \Theta A \Xi\) we have: \[ (\sigma \vee \tau) \vee \mu \equiv \sigma \vee (\tau \vee \mu)\] There is a unique way of including each of the contexts \(\Gamma\), \(\Delta\), and \(\Theta\) into \(\Gamma \vee \Delta \vee \Theta\); that is, there is a unique substitution \(\Gamma \to \Gamma \vee \Delta \vee \Theta\) which is built from a composite of inclusions, and similarly for \(\Delta\) and \(\Theta\).
\end{proposition}
\begin{proof}
The proofs of these are given in \module{Catt.Wedge.Properties}, and are all given by inducting on the rightmost context. The proof for the right unitality of \(\vee\) is omitted from the formalisation as it is immediate from the definitions. The uniqueness of inclusion substitutions is given by:
\begin{itemize} \item \func{Catt.Wedge.Properties}{wedge-inc-left-assoc}, which says: \begin{align*} \inc_{\Gamma} \bullet \inc_{\Gamma \vee \Delta} : \Gamma \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Gamma} : \Gamma \to \Gamma \vee (\Delta \vee \Theta)\\ \intertext{\item \func{Catt.Wedge.Properties}{wedge-incs-assoc}, which says:} \inc_{\Delta} \bullet \inc_{\Gamma \vee \Delta} : \Delta \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Delta} \bullet \inc_{\Delta \vee \Theta} : \Delta \to \Gamma \vee (\Delta \vee \Theta)\\ \intertext{\item \func{Catt.Wedge.Properties}{wedge-inc-right-assoc}, which says:} \inc_{\Theta} : \Theta \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Theta} \bullet \inc_{\Delta \vee \Theta} : \Theta \to \Gamma \vee (\Delta \vee \Theta) \end{align*} \end{itemize}
We note that the definition of the wedge sum differs slightly in the formalisation, specifying a term \(t\) in \(\Gamma\) which takes the role of \(\snd(\Gamma)\), in order to give more computational control. By replacing the terms \(t\) in the formalisation by \(\snd(\Gamma)\) for the appropriate context \(\Gamma\), and noting that \(\snd(\Delta) \sub{\inc_{\Delta}} \equiv \snd(\Gamma \vee \Delta)\) (which can be proved by an easy induction), the results written here can be recovered.
\end{proof}
The previous proposition ensures that the diagrammatic notation for substitutions between wedge sums uniquely defines a substitution. We next show that all the constructions in this section have the expected typing properties.
\begin{lemma} \label{lem:wedge-typing}
The following inference rules are admissible in \Cattr:
\begin{mathpar} \inferrule{\Gamma \vdash \\ \Delta \vdash}{\Gamma \vee \Delta \vdash}\and \inferrule{ }{\Gamma \vee \Delta \vdash \inc_{\Gamma} : \Gamma}\and \inferrule{ }{\Gamma \vee \Delta \vdash \inc_{\Delta} : \Delta}\and \inferrule{\Theta \vdash \snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau}{\Theta \vdash \inc_{\Delta} \bullet (\sigma \vee \tau) = \tau}\and \inferrule{\Theta \vdash \sigma : \Gamma \\ \Theta \vdash \tau : \Delta \\ \Theta \vdash \snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau}{\Theta \vdash \sigma \vee \tau : \Gamma \vee \Delta}\and \inferrule{\Theta \vdash \sigma = \sigma'\\ \Theta \vdash \tau = \tau'}{\Theta \vdash \sigma \vee \tau = \sigma' \vee \tau'} \end{mathpar}
\end{lemma}
\begin{proof}
All proofs are given in \module{Catt.Wedge.Typing}.
\end{proof}
We finally show that the wedge sum preserves pasting diagrams, the property that wedge sums were initially introduced for.
\begin{proposition} \label{prop:wedge-ps}
The wedge sum of two ps-contexts is a ps-context: if \(\Gamma \vdash_{\mathsf{ps}}\) and \(\Delta \vdash_{\mathsf{ps}}\), then \(\Gamma \vee \Delta \vdash_{\mathsf{ps}}\).
\end{proposition}
\begin{proof}
It can first be proven that if the derivation \(\Gamma \vdash_{\mathsf{ps}}\) is generated by \(\Gamma \vdash_{\mathsf{ps}} x : \star\), then \(x \equiv \snd(\Gamma)\). This is done by showing, by induction, that for all derivations \(\Gamma \vdash_{\mathsf{ps}} x : A\) with \(\dim(A) > 0\), the \(0\)-target of the type \(A\) is \(\snd(\Gamma)\), and then case splitting on the original derivation. It follows that \(\Gamma \vdash_{\mathsf{ps}}\) implies \(\Gamma \vdash_{\mathsf{ps}} \snd(\Gamma) : \star\).
The proposition is then proven by induction using the following statement: if \(\Gamma \vdash_{\mathsf{ps}}\) and \(\Delta \vdash_{\mathsf{ps}} x : A\), then: \[ \Gamma \vee \Delta \vdash_{\mathsf{ps}} x \sub {\inc_{\Delta}} : A \sub {\inc_{\Delta}}\] The base case is given by the preceding paragraph, and the other cases follow from routine calculation. These proofs are given in \module{Catt.Wedge.Pasting}.
\end{proof}
We lastly give a version of the wedge sum construction for variable sets.
\begin{definition}
Let \(\Gamma\) and \(\Delta\) be two non-empty contexts, and let \(U \subseteq \Var(\Gamma)\) and \(V \subseteq \Var(\Delta)\) be variable sets. Then define: \[U \vee V = U \sub {\inc_\Gamma} \cup V \sub {\inc_\Delta}\] to be a variable set of \(\Gamma \vee \Delta\).
\end{definition}
\subsection{Tree contexts} \label{sec:tree-contexts}
We have now defined suspensions and wedge sums, and shown that both operations preserve ps-contexts. This allows us to define the context generated by a tree.
\begin{definition}
For a tree \(T\), the context \(\lfloor T \rfloor\) generated from it is defined recursively by: \[\lfloor \emp \rfloor = D^0 \qquad \lfloor [T_0,\dots,T_n] \rfloor = \bigvee\limits_{i = 0}^n \Sigma\lfloor T_i \rfloor\] It is immediate from this definition that \(\lfloor \Sigma(T) \rfloor \equiv \Sigma(\lfloor T \rfloor)\), \(\lfloor S \doubleplus T \rfloor \equiv \lfloor S \rfloor \vee \lfloor T \rfloor\), and that \(\dim(\lfloor T \rfloor) = \dep(T)\).
\end{definition}
We can immediately give some examples of trees and their associated contexts. The context \(D^0\) is defined to be the context associated to \(\emp\), and so, as \(D^{n+1} \equiv \Sigma(D^n)\), all the disc contexts can easily be recovered from trees as \(D^n \equiv \lfloor \Sigma^n(\emp) \rfloor\). Each tree \(\Sigma^n(\emp)\) is linear and has depth \(n\).
Trees can also be drawn graphically as follows: for a tree \([T_0,\dots,T_n]\), first recursively draw the trees \(T_i\) and lay these out in a horizontal line. Then a single point, which we call the root of the tree, is drawn underneath these subtrees, and a line is drawn between the root of the tree and the root of each subtree. An example is given in \cref{fig:tree-example}.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid](x01) {$\bullet$}; \node [above left=0.5 and 0.3 of x01, on grid] (x11) {$\bullet$}; \node [above left=0.5 and 0.25 of x11, on grid] (x21) {$\bullet$}; \node [above right=0.5 and 0.25 of x11, on grid] (x22) {$\bullet$}; \node [above right=0.5 and 0.3 of x01, on grid](x12) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture}
\qquad
\begin{tikzcd} \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-2] \arrow[from=1-2, to=1-3] \arrow[shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow[shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \end{tikzcd}
\caption{The tree \([[\emp,\emp],\emp]\) and its generated context.}
\label{fig:tree-example}
\end{figure}
The context associated to a tree is clearly a pasting diagram, as the context is built using only the singleton context, wedge sums, and suspension. In fact, the set of contexts generated by trees is exactly the smallest set containing the singleton context and closed under wedge sums and suspensions. Further, it is proven in the formalisation module \module{Catt.Dyck.FromTree} that all pasting diagrams are generated by some tree, though this will not be needed for any formal development of our type theories.
We next introduce \emph{paths}, which can be thought of as the variables in a tree.
\begin{definition}
Let \(T\) be a tree. \emph{Paths} \(p : \Path_T\) are non-empty lists of natural numbers of the form \(q \doubleplus [n]\) such that \(q\) indexes a subtree \(T^q\) of \(T\) and \(0 \leq n \leq \len(T^q)\). For a path \(p : \Path_T\), we obtain a variable of \(\lfloor T \rfloor\) by recursion on \(p\) as follows:
\begin{itemize}
\item Suppose \(p = [n]\). Let \(T = [T_0,\dots,T_k]\). It is clear that \(\lfloor T \rfloor\) has exactly \(k+2\) variables of dimension \(0\), corresponding to (the inclusion of) the first variable of each context \(\Sigma(\lfloor T_i \rfloor)\), as well as the variable corresponding to the inclusion of \(\snd(\Sigma(\lfloor T_k \rfloor))\). We then define \(\lfloor [n] \rfloor\) to be the \(n\)\textsuperscript{th} such variable, indexing from 0.
\item Let \(p = k :: q\) and \(T = [T_0,\dots,T_k,\dots]\), where \(q\) is a path of \(T_k\). Then by recursion we have a variable \(\lfloor q \rfloor\) of \(\lfloor T_k \rfloor\). This gives a variable \(\Sigma(\lfloor q \rfloor)\) of \(\Sigma(\lfloor T_k \rfloor)\), which can be included into \(\lfloor T \rfloor\) by the appropriate inclusion to get \(\lfloor p \rfloor\).
\end{itemize}
We lastly define the set of \emph{maximal paths} \(\MaxPath_T\) of \(T\) to be the paths \(p \doubleplus [0]\) such that \(T^p = \emp\). Such paths correspond to locally maximal variables of \(\lfloor T \rfloor\).
\end{definition}
We now turn our attention to substitutions from a tree context \(\sigma : \lfloor T \rfloor \to \Gamma\). A substitution can be viewed as a function from the variables of its source context to terms of the target context. Therefore, a substitution \(\sigma : \lfloor T \rfloor \to \Gamma\) acts on variables of \(\lfloor T \rfloor\). However, we have seen that the more natural notion of a variable in a tree context is a path.
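Maximal paths can be enumerated directly in the running Haskell sketch; the helper below is our own, and its output on the tree of \cref{fig:tree-example} matches the maximal paths of that tree.
\begin{verbatim}
-- Enumerate the maximal paths of a tree: paths p ++ [0] with T^p empty.
maxPaths :: Tree -> [[Int]]
maxPaths (Node []) = [[0]]
maxPaths (Node ts) =
  concat [ map (k :) (maxPaths t) | (k, t) <- zip [0 ..] ts ]

-- ghci> maxPaths (Node [Node [Node [], Node []], Node []])
-- [[0,0,0],[0,1,0],[1,0]]
-- the three maximal paths of the tree drawn in the figure above.
\end{verbatim}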
These observations motivate the following definition.
\begin{definition}
A \emph{term-labelling} \(L : T \to \Gamma\) from a tree \(T\) to a context \(\Gamma\) is a pair containing a function \(\Path_T \to \Term_\Gamma\) and a type of \(\Gamma\). To apply the function component of a labelling to a path \(p\), we write \(L(p)\), or \(L[x_0,x_1,\dots]\) for a path \([x_0,x_1,\dots]\). The type component of the labelling is given by \(\ty(L)\). If \(T = [T_0,\dots,T_n]\), then there are natural projections \(L_i : T_i \to \Gamma\) given by \(L_i(p) = L(i :: p)\) and \(\ty(L_i) = \arr {L[i]} {\ty(L)} {L[i+1]}\) for \(0 \leq i \leq n\).
\end{definition}
For labellings to play the role of substitutions, a substitution \(\lfloor L \rfloor : \arr {\lfloor T \rfloor} {\ty(L)} \Gamma\) will be defined for each term-labelling \(L : T \to \Gamma\). A natural way to define this substitution is by induction on the tree \(T\), which motivates the use of extended substitutions. Suppose we start with a labelling \(L : [T_0,\dots,T_n] \to \Gamma\). To proceed, we apply the inductive hypothesis to obtain the substitutions: \[ \lfloor L_i \rfloor : \arr {\lfloor T_i \rfloor} {\arr {L[i]} {\ty(L)} {L[i+1]}} {\Gamma} \] These substitutions are not regular (non-extended) substitutions, even when \(L\) has associated type \(\star\) and hence itself corresponds to a regular substitution.
\begin{definition}
Let \(L : T \to \Gamma\) be a term-labelling. We define the substitution: \[\lfloor L \rfloor : \arr {\lfloor T \rfloor} {\ty(L)} \Gamma\] by induction on the tree \(T\) as \(\langle \ty(L), L[0] \rangle\) if \(T = \emp\) and: \[ \unrestrict \lfloor L_0 \rfloor \vee \unrestrict \lfloor L_1 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor \] if \(T = [T_0, \dots, T_n]\). Although it looks like the \(0\)-dimensional terms in the labelling are not used to generate the substitution, they appear in the types of the labellings \(L_i\), and so appear in the unrestricted substitutions.
\end{definition}
There are many ways of giving a more syntactic presentation of labellings. Given a tree \(T = [T_0,\dots,T_n]\), a labelling \(L : T \to \Gamma\) can be written as: \[ t_0\{L_0\}t_1\{L_1\}t_2\cdots t_n\{L_n\}t_{n+1} : \ty(L) \] where each \(t_i\) is the term \(L[i]\) and the sublabellings \(L_i\) have been recursively put in this syntactic bracketing format (omitting the type). The syntactic presentation contains all the information of the original labelling, which can be recovered by letting \(L[i] = t_i\) for each \(i\) and \(L(i :: p) = L_i(p)\).
As an example, take the tree \(T = [[\emp,\emp], \emp]\) from \cref{fig:tree-example}, and let: \[\Gamma = (x : \star), (f : x \to x), (\alpha : f*f \to f)\] Then we can define the labelling \(L : T \to \Gamma\) by: \[ L = x\bigl\{f*f\{\alpha\}f\{\id(f)\}f\bigr\}x\{f\}x : \star \] which sends the (maximal) paths \([0,0,0]\) to \(\alpha\), \([0,1,0]\) to \(\id(f)\), and \([1,0]\) to \(f\), and has associated substitution: \[ \lfloor L \rfloor = \langle x,x,f*f,f,\alpha,f,\id(f),x,f \rangle\] The curly bracket notation for labellings is used instead of a typical round bracket notation to avoid clashes with notations that already use round brackets, such as \(\id(f)\).
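Under the simplifying assumption that terms may be modelled by plain strings, this bracket presentation and its flattening into a substitution can be rendered in the running Haskell sketch; the names \texttt{Lbl}, \texttt{applyLbl}, and \texttt{toSub} are ours.
\begin{verbatim}
-- The bracket presentation t0 {L0} t1 {L1} ... tn {Ln} t(n+1) of a
-- labelling: a head term and a list of (sublabelling, next term)
-- pairs; the type component is omitted.
data Lbl t = Lbl t [(Lbl t, t)]

-- Look up the term assigned to a path.
applyLbl :: Lbl t -> [Int] -> Maybe t
applyLbl (Lbl t0 rest) [i]
  | i == 0           = Just t0
  | i <= length rest = Just (snd (rest !! (i - 1)))
applyLbl (Lbl _ rest) (k : p)
  | k < length rest  = applyLbl (fst (rest !! k)) p
applyLbl _ _         = Nothing

-- Flatten a labelling into its generated substitution, listing terms
-- in context order.
toSub :: Lbl t -> [t]
toSub (Lbl t0 rest) = t0 : concat [ t : toSub l | (l, t) <- rest ]

-- The example labelling above, writing a for alpha:
-- ghci> toSub (Lbl "x" [ (Lbl "f*f" [ (Lbl "a" [], "f")
--                                   , (Lbl "id(f)" [], "f") ], "x")
--                      , (Lbl "f" [], "x") ])
-- ["x","x","f*f","f","a","f","id(f)","x","f"]
\end{verbatim}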
We finish this section by examining a boundary operation for trees. We have already seen that for every ps-context \(\Gamma\) and \(n \in \mathbb{N}\), there are the boundary variable sets: \[\bdry n - \Gamma \qquad \bdry n + \Gamma\] Since \(\lfloor T \rfloor\) is a ps-context for any tree \(T\), we immediately obtain such boundary variable sets for \(\lfloor T \rfloor\). However, by recalling the definitions of the wedge sum of variable sets given in \cref{sec:wedge-sums} and the suspension of a variable set given in \cref{sec:operation-properties}, a more natural definition can be given.
\begin{definition}
For any tree \(T : \Tree\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\), we define the boundary set: \[\bdry n \epsilon T\] by induction on \(n\) and \(T\). If \(n = 0\), then we define: \[\bdry 0 - T = \FV(\fst(\lfloor T \rfloor)) \qquad \bdry 0 + T = \FV(\snd(\lfloor T \rfloor))\] Now suppose \(n\) is not \(0\). If the tree \(T\) is the singleton tree, then \(\bdry n \epsilon T = \Var(\lfloor T \rfloor)\). Otherwise, suppose that \(T = [T_0,\dots,T_k]\). We then define: \[ \bdry n \epsilon T = \Sigma\bigl(\bdry {n-1} \epsilon {T_0}\bigr) \vee \cdots \vee \Sigma\bigl(\bdry {n-1} \epsilon {T_k}\bigr)\] with the boundary sets \(\bdry {n-1} \epsilon {T_i}\) obtained by the inductive hypothesis.
\end{definition}
In the formalisation module \module{Catt.Tree.Support}, we prove that the boundary sets \(\bdry n \epsilon T\), the tree boundary, and \(\bdry n \epsilon {\lfloor T \rfloor}\), the ps-context boundary, coincide. Therefore: \[ (\lfloor T \rfloor, \bdry n - T, \bdry n + T) \in \Std\] for each \(n \geq \dep(T) - 1\).
\section{Structured syntax} \label{sec:structured-terms}
We now introduce a new class of syntax named \emph{structured syntax}. Terms over tree contexts are commonly built using several of the standard constructions we have seen so far, such as paths, labellings, suspensions, and inclusions. By recording which of these constructions was used in the formation of a term, these terms can compute more usefully, which we will exploit to prove more involved lemmas about insertion in \cref{sec:insertion}. Structured syntax will be our variation on the base syntax of \Catt which records these constructions.
The key problem with the base syntax for \Catt is that term-labellings are difficult to compose. We have so far considered term-labellings of the form \(L : T \to \Gamma\), where \(\Gamma\) is an arbitrary context, but there is no reason a labelling couldn't be of the form \(M : S \to \lfloor T \rfloor\) for trees \(S\) and \(T\). We would then hope to be able to compose these labellings to get a labelling of the form: \[ M \bullet L : S \to \Gamma \] Such a labelling would need to send a path \(p : \Path_S\) to a term of \(\Gamma\). The only reasonable way forward is to apply \(M\) to \(p\) to get a term of \(\lfloor T \rfloor\), and then apply \(\lfloor L \rfloor\) to this term to get a term of \(\Gamma\). Unfortunately, for an arbitrary term \(t : \Term_{\lfloor T \rfloor}\) and labelling \(L : T \to \Gamma\), the term: \[ t \sub {\lfloor L \rfloor}\] does not have nice computational properties. We examine two examples:
\begin{itemize}
\item Suppose \(t\) was of the form \(\lfloor p \rfloor\) for some path \(p\). We then have: \[ \lfloor p \rfloor \sub {\lfloor L \rfloor} \equiv L(p)\] and would hope that this syntactic equality would fall out immediately, and that the left-hand side would reduce to the right-hand side in the formalisation. This is, however, not the case, and proving that such a syntactic equality holds is non-trivial.
\item Suppose \(t \equiv \Sigma(s)\) and \(L = a\{L_1\}b : A\). Similarly to the above case, we would hope that the syntactic equality: \[ \Sigma(s) \sub {\lfloor a\{L_1\}b : A \rfloor} \equiv s \sub {\lfloor L_1 \rfloor}\] holds ``on the nose''. This, however, is not the case.
\end{itemize}
Structured terms alleviate these problems by recording that such a term \(t\) was generated from a path or generated using suspension. This allows the application of a labelling to a structured term to use this information, for example allowing the two syntactic equalities above to hold by definition. If a labelling is the ``correct'' notion of substitution from a tree, then a structured term is the ``correct'' notion of term in a tree.
\begin{definition}
Let \(\U\) be a member of \(\Ctx \uplus \Tree\), either some context \(\Gamma\) or some tree \(T\). We then define the \emph{structured syntax} classes \(\STerm_\U\) of \emph{structured terms}, \(\SType_\U\) of \emph{structured types}, and \emph{(\(\STerm\)-)labellings} \(\arr S {} \U\) for some tree \(S\). The syntax classes for structured terms and types are generated by the following rules:
\begin{mathpar} \inferrule{p : \Path_T}{\SPath(p) : \STerm_T}\and \inferrule{s : \STerm_{T_i}\\ 0 \leq i \leq n}{\Inc_i(s) : \STerm_{[T_0,\dots,T_n]}} \and \inferrule{S : \Tree\\ A : \SType_S \\ L : S \to \U}{\SCoh S A L : \STerm_\U}\and \inferrule{t : \Term_\Gamma}{\SOther(t) : \STerm_\Gamma} \\ \inferrule{ }{\star : \SType_\U}\and \inferrule{s : \STerm_\U \\ A : \SType_\U \\ t: \STerm_\U}{\arr s A t : \SType_\U} \end{mathpar}
Labellings \(L : S \to \U\) are defined as pairs of a function \(\Path_S \to \STerm_\U\) and a structured type, similarly to term-labellings in \cref{sec:tree-contexts}. We note that the syntax for structured types is shared with the syntax for \Catt types, and will be careful to make it clear which syntax we are using when necessary.
\end{definition}
Each piece of structured syntax can be converted back into the base syntax of \Catt, using many of the constructions already introduced.
\begin{definition}
Suppose \(\U : \Ctx \uplus \Tree\). Define \(\lfloor \U \rfloor\) to be \(\Gamma\) if \(\U = \Gamma\) for some context \(\Gamma\), or \(\lfloor T \rfloor\) if \(\U = T\) for some tree \(T\). Now, for a structured term \(s : \STerm_\U\), a structured type \(A : \SType_\U\), or a labelling \(L : S \to \U\), we define: \[ \lfloor s \rfloor : \Term_{\lfloor \U \rfloor} \qquad \lfloor A \rfloor : \Type_{\lfloor \U \rfloor} \qquad \lfloor L \rfloor : \arr {\lfloor S \rfloor} {\lfloor \ty(L) \rfloor} {\lfloor \U \rfloor} \] by the equations:
\begin{align*} \lfloor \SPath(p) \rfloor &= \lfloor p \rfloor\\ \lfloor \Inc_i(s) \rfloor &= \Sigma(\lfloor s \rfloor) \sub {\inc_{\lfloor T_i \rfloor}}&\text{if }\Inc_i(s) : \STerm_{[T_0,\dots,T_n]}\\ \lfloor \SCoh S A L \rfloor &= \Coh {\lfloor S \rfloor} {\lfloor A \rfloor} {\id_{\lfloor S \rfloor}} \sub {\lfloor L \rfloor}\\ \lfloor \SOther(t) \rfloor &= t\\[10pt] \lfloor \star \rfloor &= \star\\ \lfloor \arr s A t \rfloor &= \arr {\lfloor s \rfloor} {\lfloor A \rfloor} {\lfloor t \rfloor} \end{align*}
and by defining \(\lfloor L \rfloor\) similarly to term-labellings, except \(\lfloor L \rfloor = \langle \lfloor \ty(L) \rfloor, \lfloor L[0] \rfloor \rangle\) for labellings \(L : {\emp} \to {\U}\) from the singleton tree. We refer to \(\lfloor a \rfloor\), \(\lfloor A \rfloor\), and \(\lfloor L \rfloor\) as the term, type, or substitution generated by \(a\), \(A\), or \(L\).
For any tree \(T\), there is an \emph{identity labelling} \(\id_T\) given by:
\[ \id_T(p) = \SPath(p) \qquad \ty(\id_T) = \star\]
The function \func{Catt.Tree.Structured.Properties}{id-label-to-sub} in the formalisation (see \module{Catt.Tree.Structured.Properties}) shows that:
\[\lfloor \id_T \rfloor = \id_{\lfloor T \rfloor}\]
The main motivation for introducing structured syntax was to be able to define a composition of labellings, which we do now by defining the application of a labelling to a structured term, structured type, or another labelling.
\begin{definition}
Let \(L : T \to \U\) be a labelling (with \(\U : \Ctx \uplus \Tree\)). We define the application of \(L\) to a structured term \(s : \STerm_{T}\), a structured type \(A : \SType_T\), and a labelling \(M : S \to T\) to give:
\[ s \sub L : \STerm_\U \qquad A \sub L : \SType_\U \qquad M \bullet L : S \to \U\]
These definitions are given by mutual recursion:
\begin{align*}
\SPath(p) \sub L &= L(p)\\
\Inc_i(s) \sub L &= s \sub {L_i}\\
\SCoh S A M \sub L &= \SCoh S A {M \bullet L}\\
\SOther(t) \sub L &= t \sub {\lfloor L \rfloor}\\[10pt]
\star \sub L &= \ty(L)\\
(\arr s A t) \sub L &= \arr {s \sub L} {A \sub L} {t \sub L}\\[10pt]
(M \bullet L)(p) &= M(p) \sub L\\
\ty(M \bullet L) &= \ty(M) \sub L
\end{align*}
It can easily be seen that these definitions satisfy the computational properties given at the start of the section.
\end{definition}
The main theorem of this section is that the application of a labelling to a structured term is compatible with the map from structured syntax to \Catt syntax.
\begin{theorem}
\label{thm:structured-main}
For any labelling \(L : T \to \U\) and structured term \(s : \STerm_T\), structured type \(A : \SType_T\), or labelling \(M : S \to T\), we have:
\[ \lfloor s \sub L \rfloor \equiv \lfloor s \rfloor \sub {\lfloor L \rfloor} \qquad \lfloor A \sub L \rfloor \equiv \lfloor A \rfloor \sub {\lfloor L \rfloor} \qquad \lfloor M \bullet L \rfloor \equiv \lfloor M \rfloor \bullet \lfloor L \rfloor\]
\end{theorem}
\begin{proof}
We proceed by proving all statements by mutual induction. Suppose \(s : \STerm_T\) is a structured term. We split on the form of \(s\):
\begin{itemize}
\item Suppose \(s\) is of the form \(\SCoh S A M\). Then \(s \sub L\) is \(\SCoh S A {M \bullet L}\) and so the required statement follows from the inductive hypothesis for labellings.
\item Suppose \(s\) is of the form \(\SOther(t)\). Then \(\lfloor s \sub L \rfloor \equiv \lfloor \SOther (t \sub {\lfloor L \rfloor}) \rfloor \equiv t \sub {\lfloor L \rfloor} \equiv \lfloor s \rfloor \sub {\lfloor L \rfloor}\).
\item Suppose \(T = [T_0,\dots, T_n]\) and \(s\) is of the form \(\Inc_i(t)\). Then:
\begin{align*}
\lfloor \Inc_i(t) \rfloor \sub {\lfloor L \rfloor} &\equiv \Sigma(\lfloor t \rfloor)\sub {\inc_{\lfloor T_i \rfloor}} \sub {\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor}\\
&\equiv \Sigma(\lfloor t \rfloor)\sub {\inc_{\lfloor T_i \rfloor} \bullet (\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor)}\\
&\equiv \Sigma(\lfloor t \rfloor) \sub {\unrestrict \lfloor L_i \rfloor} &\text{by \cref{lem:wedge-sum-prop}}\\
&\equiv \lfloor t \rfloor \sub {\lfloor L_i \rfloor}\\
&\equiv \lfloor t \sub {L_i} \rfloor &\text{by the inductive hypothesis}\\
&\equiv \lfloor \Inc_i(t) \sub L \rfloor
\end{align*}
\item Suppose \(s\) is of the form \(\SPath(p)\). If \(\lfloor p \rfloor\) is not a \(0\)-dimensional variable, then an argument similar to the preceding case can be made.
If instead \(\lfloor p \rfloor\) is of the form \([k]\) and \(T = [T_0,\dots,T_n]\), then first suppose that \(k < n + 1\), so that \(\lfloor [k] \rfloor \equiv \fst(\lfloor T_k \rfloor) \sub {\inc_{\lfloor T_k \rfloor}}\). Then:
\begin{align*}
\lfloor [k] \rfloor \sub {\lfloor L \rfloor} &\equiv \fst(\lfloor T_k \rfloor) \sub {\inc_{\lfloor T_k \rfloor}} \sub {\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor}\\
&\equiv \fst(\lfloor T_k \rfloor) \sub {\unrestrict \lfloor L_k \rfloor}\\
&= \lfloor L[k] \rfloor
\end{align*}
where the last equality follows from the labelling \(L_k\) having type component \(\ty(L_k) \equiv \arr {L[k]} {\ty(L)} {L[k+1]}\). The case where \(k = n+1\) is similar to the above, using \(\snd(\lfloor T_n \rfloor)\) instead of \(\fst(\lfloor T_k \rfloor)\) (as there is no tree \(T_k\) in this case).
\end{itemize}
The case for structured types follows by a simple induction using the case for terms. We now consider the case for a labelling \(M : S \to T\). Suppose \(S = [S_0,\dots,S_n]\). Then:
\begin{align*}
\lfloor M \rfloor \bullet \lfloor L \rfloor &\equiv \left( \bigvee_i \unrestrict \lfloor M_i \rfloor \right) \bullet \lfloor L \rfloor\\
&\equiv \bigvee_i \unrestrict \lfloor M_i \rfloor \bullet \lfloor L \rfloor&\text{by \cref{lem:wedge-sum-prop}}\\
&\equiv \bigvee_i \unrestrict \left(\lfloor M_i \rfloor \bullet \lfloor L \rfloor\right)\\
&\equiv \bigvee_i \unrestrict \lfloor M_i \bullet L \rfloor&\text{by the inductive hypothesis}\\
&\equiv \lfloor M \bullet L \rfloor
\end{align*}
with the last line following from \((M \bullet L)_i\) and \(M_i \bullet L\) being the same labelling. This concludes all cases.
\end{proof}
Structured syntax is only used as a computational aid for reasoning about the base syntax of \Catt, and therefore the desired notion of ``syntactic'' equality of structured syntax is syntactic equality of the underlying \Catt terms; that is, we say \(s \equiv t\) for structured terms \(s\) and \(t\) exactly when \(\lfloor s \rfloor \equiv \lfloor t \rfloor\). On labellings \(L, M : T \to \U\) we can instead use the equality:
\[ L \equiv M \iff \ty(L) \equiv \ty(M) \land \forall (p : \Path_T).\ L(p) \equiv M(p)\]
and by observing the proof of \cref{thm:structured-main}, we see that this equality implies equality of the generated substitutions. It is therefore possible to derive many properties for this equality of structured terms simply by reducing all constructions used to the corresponding \Catt constructions, and using the corresponding result for the syntax of \Catt.
\begin{proposition}
Composition of labellings is associative and has a left and right unit given by the identity labelling.
\end{proposition}
\begin{proof}
Follows immediately from \cref{thm:structured-main}, the identity labelling generating the identity substitution, and the corresponding results for \Catt.
\end{proof}
Using this technique, every syntactic result about \Catt can be transported to structured syntax. Further, it is easy to prove that the equality relation is preserved by each constructor: for example, if \(L \equiv M\) and \(A \equiv B\), then \(\SCoh S A L \equiv \SCoh S B M\). To extend this, we redefine some constructions we have seen for \Catt in the previous sections, this time for structured terms.
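Before doing so, we note that the application and composition operations above transcribe directly into the running Haskell sketch (reusing its \texttt{STerm}, \texttt{SType} and \texttt{Label} types; names remain illustrative). Each clause of \texttt{applyT} is exactly one of the defining equations.
\begin{verbatim}
-- Application of a labelling to a structured term or type, and
-- composition of labellings, by mutual recursion.
applyT :: STerm -> Label -> STerm
applyT (SPath p)    l = app l p
applyT (SInc i s)   l = applyT s (restrict i l)
applyT (SCoh t a m) l = SCoh t a (m `comp` l)

applyTy :: SType -> Label -> SType
applyTy SStar        l = lty l
applyTy (SArr s a t) l = SArr (applyT s l) (applyTy a l) (applyT t l)

comp :: Label -> Label -> Label
comp m l = Label (\p -> applyT (app m p) l) (applyTy (lty m) l)

-- The sub-labelling L_i of L : [T_0,...,T_n] -> U, whose type
-- component is the arrow  L[i] ->_{ty(L)} L[i+1].
restrict :: Int -> Label -> Label
restrict i l = Label (\p -> app l (i : p))
                     (SArr (app l [i]) (lty l) (app l [i + 1]))
\end{verbatim}
Note that \texttt{restrict} computes the type component of \(L_i\) as exactly the arrow type used in the proof of \cref{thm:structured-main}.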
\begin{definition}
We define the suspension of a structured term \(a : \STerm_\U\) and a structured type \(A : \SType_\U\), and the \emph{restricted suspension} of a labelling \(L : T \to \U\), giving a structured term \(\Sigma(a) : \STerm_{\Sigma(\U)}\), a structured type \(\Sigma(A) : \SType_{\Sigma(\U)}\), and a labelling \(\Sigma'(L) : T \to {\Sigma(\U)}\). These are all defined by mutual induction as follows:
\begin{align*}
\Sigma(a) &\equiv \Inc_0(a) &\text{if \(\U\) is a tree}\\
\Sigma(\SCoh S A M) &\equiv \SCoh S A {\Sigma'(M)}&\text{if \(\U\) is a context}\\
\Sigma(\SOther(t))&\equiv \SOther(\Sigma(t))\\[10pt]
\Sigma(\star) &= \arr N \star S&\text{if \(\U\) is a context}\\
\Sigma(\star) &= \arr {\SPath[0]} \star {\SPath[1]}&\text{otherwise}\\
\Sigma(\arr s A t) &= \arr {\Sigma(s)} {\Sigma(A)} {\Sigma(t)}\\[10pt]
\Sigma'(L)(p) &= \Sigma(L(p))\\
\ty(\Sigma'(L)) &= \Sigma(\ty(L))
\end{align*}
We further define an unrestriction operation that takes a labelling \(M : T \to \U\) with \(\ty(M) \equiv \arr s A t\) and produces a labelling
\[\unrestrict M \equiv s\{M\}t : A\]
with \(\unrestrict M : {\Sigma(T)} \to \U\). This can be used to define the full suspension of a labelling, as with \Catt substitutions, by defining \(\Sigma(L)\) to be \(\unrestrict \Sigma'(L)\).
\end{definition}
A simple case analysis demonstrates that these constructions commute with \(\lfloor \_ \rfloor\). They therefore inherit the properties of the suspension on \Catt terms, types, and substitutions. We lastly recover wedge sums for structured syntax.
\begin{definition}
We have seen that the wedge sum of trees \(S\) and \(T\) is given by \(S \doubleplus T\). Letting \(S = [S_0,\dots,S_m]\) and \(T = [T_0,\dots,T_n]\), we further define inclusion labellings:
\[ \inc_S : S \to {S \doubleplus T} \qquad \inc_T : T \to {S \doubleplus T}\]
by the equations:
\begin{align*}
\inc_S([k])&\equiv \SPath[k] & \inc_S(k :: p) &\equiv \SPath(k :: p) & \ty(\inc_S) &\equiv \star\\
\inc_T([k])&\equiv \SPath[m + 1 + k] & \inc_T(k :: p) &\equiv \SPath (m + 1 + k :: p) & \ty(\inc_T) &\equiv \star
\end{align*}
and finally, we suppose \(L : S \to \U\) and \(M : T \to \U\) are labellings of the form:
\[ L \equiv s_0\{L_0\}s_1\cdots s_m\{L_m\}t_0 : A \qquad M \equiv t_0\{M_0\}t_1\cdots t_n\{M_n\}t_{n+1} : A \]
and define their concatenation to be the labelling:
\[ L\doubleplus M \equiv s_0\{L_0\}s_1\cdots s_m\{L_m\}t_0\{M_0\}t_1\cdots t_n\{M_n\}t_{n+1} : A \]
where \(L \doubleplus M : {S \doubleplus T} \to \U\).
\end{definition}
Many properties of these constructions, among others, are given in the formalisation module \module{Catt.Tree.Structured.Construct.Properties}. In particular, the diagrammatic notation for substitutions between wedge sums can be repurposed to define labellings, which will be used to define certain labellings in \cref{sec:insertion}.
It will be useful to be able to interpret all \Catt syntax as structured syntax. For terms, such a mapping is trivially given by the \(\SOther\) constructor. For a type \(A\), a structured type \(\lceil A \rceil\) can be formed by a simple induction, applying the \(\SOther\) constructor to each term in the type. For substitutions, we give the following definition.
\begin{definition}
Let \(\sigma : \lfloor S \rfloor \to_A \Gamma\) be a substitution. We then define the labelling:
\[ \lceil \sigma \rceil : S \to \Gamma \]
by \(\lceil \sigma \rceil(p) = \SOther(\lfloor p \rfloor \sub \sigma)\) and \(\ty(\lceil \sigma \rceil) = \lceil A \rceil\).
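\end{definition}
For the tree case, the suspension and unrestriction operations also fit the running Haskell sketch (illustrative names as before; the context case would require the omitted \(\SOther\) constructor, so it is not shown).
\begin{verbatim}
-- Suspension over a tree context: suspension of a term is just
-- inclusion into the unique subtree of the suspended tree.
suspT :: STerm -> STerm
suspT = SInc 0

suspTy :: SType -> SType
suspTy SStar        = SArr (SPath [0]) SStar (SPath [1])
suspTy (SArr s a t) = SArr (suspT s) (suspTy a) (suspT t)

-- Restricted suspension of a labelling L : T -> U, giving
-- sigma'(L) : T -> Sigma(U).
suspL :: Label -> Label
suspL l = Label (suspT . app l) (suspTy (lty l))

-- Unrestriction: a labelling M with ty(M) = s ->_A t becomes the
-- labelling s{M}t : A from the suspended tree (assumes an arrow
-- type; unrestriction is undefined at the base type).
unrestrict :: Label -> Label
unrestrict m = Label f a
  where SArr s a t = lty m
        f [0]      = s
        f [1]      = t
        f (0 : p)  = app m p
        f _        = error "path outside the suspended tree"
\end{verbatim}
The full suspension of a labelling is then \texttt{unrestrict . suspL}, matching \(\Sigma(L) = \unrestrict \Sigma'(L)\).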
This construction is an inverse to generating a substitution from a labelling.
\begin{proposition}
Let \(\sigma : \lfloor S \rfloor \to_A \Gamma\) be a substitution. Then \(\lfloor \lceil \sigma \rceil \rfloor \equiv \sigma\). Further, for any labelling \(L : S \to \Gamma\), \(\lceil \lfloor L \rfloor \rceil \equiv L\).
\end{proposition}
\begin{proof}
We note that every variable of \(\lfloor S \rfloor\) is given by \(\lfloor p \rfloor\) for some path \(p\). We then have the equality:
\begin{equation*}
\lfloor p \rfloor \sub {\lfloor \lceil \sigma \rceil \rfloor} \equiv \lfloor p \sub {\lceil \sigma \rceil} \rfloor \equiv \lfloor \SOther(\lfloor p \rfloor \sub \sigma) \rfloor \equiv \lfloor p \rfloor \sub \sigma
\end{equation*}
and so \(\sigma\) and \(\lfloor \lceil \sigma \rceil \rfloor\) have the same action on each variable, and hence are equal.
Now let \(L : S \to \Gamma\) be a labelling. Then for any path \(p\):
\[ \lceil \lfloor L \rfloor \rceil(p) \equiv \SOther(\lfloor p \rfloor \sub {\lfloor L \rfloor}) \equiv \SOther(\lfloor L(p) \rfloor) \]
and so \(\lfloor \lceil \lfloor L \rfloor \rceil(p) \rfloor \equiv \lfloor L(p) \rfloor\). Therefore, \(L \equiv \lceil \lfloor L \rfloor \rceil\) by definition.
\end{proof}
\subsection{Typing and equality}
\label{sec:typing-struct-terms}
Similarly to the definition of syntactic equality for structured syntax, we also want the equality rules for structured terms and structured types to be inherited from the equality relations on their generated terms, and so define:
\[ \U \vdash s = t \iff \lfloor \U \rfloor \vdash \lfloor s \rfloor = \lfloor t \rfloor \qquad \U \vdash A = B \iff \lfloor \U \rfloor \vdash \lfloor A \rfloor = \lfloor B \rfloor\]
For labellings \(L, M : T \to \U\), (definitional) equality can be defined similarly to the syntactic equality relation:
\[ \U \vdash L = M \iff \U \vdash \ty(L) = \ty(M) \land \forall (p : \Path_T).\ \U \vdash L(p) = M(p)\]
Using \cref{lem:wedge-typing}, it can be proven by a simple induction that equality of labellings (along with equality of their associated types) induces equality of the generated substitutions.
We also want the typing rules for \(s : \STerm_\U\) and \(A : \SType_\U\) to be inherited from the typing rules for \(\lfloor s \rfloor\) and \(\lfloor A \rfloor\). We re-use the notation for each typing judgement. For labellings, we introduce the following more natural typing judgement:
\begin{definition}
For a labelling \(L : T \to \U\), where \(\U : \Ctx \uplus \Tree\), we define the judgement:
\[ \U \vdash L : T \]
to mean that the labelling \(L\) is well-formed. This judgement is generated by the following rule:
\begin{mathpar}
\inferrule{\U \vdash L[0] : \ty(L)\quad \cdots\quad \U\vdash L[n+1] : \ty(L)\\\U\vdash L_0 : T_0\quad \cdots\quad\U\vdash L_n : T_n}{\U \vdash L : [T_0,\dots,T_n]}
\end{mathpar}
\end{definition}
Paths \(p\) can be equipped with a canonical structured type, \(\ty(p)\), as follows:
\begin{itemize}
\item For paths \([k]\), \(\ty([k]) = \star\),
\item For paths \(k :: p\) where \(p\) is a path, the type \(\ty(k :: p)\) is obtained by taking the type \(\ty(p)\), applying \(\Inc_k\) to each term, and replacing the \(\star\) type at its base by the type \(\arr {\SPath[k]} {\star} {\SPath[k+1]}\).
\end{itemize}
This can be used to prove that the identity labelling is well-formed.
\begin{proposition}
Let \(S\) be a tree. Then \(S \vdash \id_S : S\).
\end{proposition}
\begin{proof}
Let \(x\) be a list that indexes a subtree \(S^x\) of \(S\), and define the labelling \(\mathsf{subtree}(x) : S^x \to S\) by \(\ty(\mathsf{subtree}(x)) = \ty(x \doubleplus [0])\) and \(\mathsf{subtree}(x)(p) = \SPath(x \doubleplus p)\). We then prove the more general result that \(S \vdash \mathsf{subtree}(x) : S^x\) for each \(x\), with the desired result following from the case \(x = \emp\).
If \(S^x = \emp\), then the typing judgement follows from \(S \vdash \mathsf{subtree}(x)[0] : \ty(\mathsf{subtree}(x))\). If \(S^x = [T_0, \dots, T_n]\), then we must show that \(S \vdash \mathsf{subtree}(x)[k] : \ty(\mathsf{subtree}(x))\), which follows from the observation that \(\ty(x \doubleplus [k]) \equiv \ty(x \doubleplus [0])\) for any \(k\), as the definition of the canonical type does not use the last element of the path. We are also required to show that \(S \vdash \mathsf{subtree}(x)_i : T_i\), but \(T_i \equiv S^{x \doubleplus [i]}\) and \(\mathsf{subtree}(x)_i \equiv \mathsf{subtree}(x \doubleplus [i])\), and so this follows from the inductive hypothesis.
\end{proof}
From this typing judgement for labellings, one can obtain a derivation of the typing judgement for the generated substitution.
\begin{proposition}
Let \(L : T \to \U\), and suppose \(\U \vdash L : T\) and \(\U \vdash \ty(L)\). Then:
\[ \lfloor \U \rfloor \vdash \lfloor L \rfloor : \lfloor T \rfloor\]
\end{proposition}
\begin{proof}
We induct on the tree \(T\), splitting into cases on whether it is the singleton tree. If it is, then by case analysis on the judgement for labelling typing we get:
\[ \U \vdash L[0] : \ty(L) \]
Then, writing \(A\) for \(\ty(L)\), we have \(\lfloor L \rfloor \equiv \langle \lfloor A \rfloor, \lfloor L[0] \rfloor \rangle\), and so the following derivation can be obtained:
\[
\begin{prooftree}
\infer0{\U \vdash A}
\infer1{\lfloor \U \rfloor \vdash \lfloor A \rfloor}
\infer1{\lfloor \U \rfloor \vdash \langle \lfloor A \rfloor \rangle : \emptyset}
\infer0{\U \vdash L[0] : A}
\infer1{\lfloor \U \rfloor \vdash \lfloor L[0] \rfloor : \lfloor A \rfloor}
\infer2{\lfloor \U \rfloor \vdash \langle \lfloor A \rfloor, \lfloor L[0] \rfloor \rangle : \lfloor \emp \rfloor}
\end{prooftree}
\]
Suppose instead that \(T = [T_0,\dots,T_n]\), such that:
\[ \lfloor L \rfloor \equiv \unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor\]
From \(\U \vdash L : T\), we obtain \(\U \vdash L_i : T_i\) for each \(i \in \{0,\dots,n\}\). We further obtain \(\U \vdash L[k] : \ty(L)\) for \(0 \leq k \leq n+1\), so that:
\[\ty(L_i) \equiv \arr {L[i]} {\ty(L)} {L[i+1]}\]
is well-formed, and hence by the inductive hypothesis we have \(\lfloor \U \rfloor \vdash \lfloor L_i \rfloor : \lfloor T_i \rfloor\). We have for each \(i\) that \(\lfloor \ty(L_i) \rfloor\) is not the type \(\star\), and so the unrestriction \(\unrestrict \lfloor L_i \rfloor\) is well-formed. Furthermore, by construction of the unrestriction we have:
\[ \fst(\lfloor T_i \rfloor) \sub {\lfloor L_i \rfloor} \equiv \lfloor L[i] \rfloor \qquad \snd(\lfloor T_i \rfloor) \sub {\lfloor L_i \rfloor} \equiv \lfloor L[i+ 1] \rfloor\]
and so by \cref{lem:wedge-typing}, the wedge sums are well-formed, completing the proof.
\end{proof}
It can be shown that the reverse implication also holds: if \(\lfloor \U \rfloor \vdash \lfloor L \rfloor : \lfloor T \rfloor\) then \(\U \vdash L : T\). This follows as a corollary from the following proposition.
\begin{proposition}
Let \(\sigma : \arr {\lfloor T \rfloor} A \Gamma\) be a substitution with \(\Gamma \vdash \sigma : \lfloor T \rfloor\).
Then for any \(L : S \to T\) we have:
\[T \vdash L : S \implies \Gamma \vdash L \bullet \lceil \sigma \rceil : S\]
and hence \(\Gamma \vdash \lceil \sigma \rceil : T\) follows from letting \(L\) be the identity labelling.
\end{proposition}
\begin{proof}
Let \(S = [S_0, \dots, S_n]\) (where we allow this list to be empty). By the definition of the typing for a labelling, it suffices to show for each \(0 \leq i \leq n\) and \(0 \leq k \leq n + 1\) that:
\[ \Gamma \vdash L[k] \sub {\lceil \sigma \rceil} : \ty(L) \sub {\lceil \sigma \rceil} \qquad \Gamma \vdash (L \bullet \lceil \sigma \rceil)_i : S_i\]
The second typing judgement follows directly from the inductive hypothesis, as \((L \bullet \lceil \sigma \rceil)_i \equiv L_i \bullet \lceil \sigma \rceil\). By definition of typing for structured terms, the first judgement requires us to prove that:
\[ \Gamma \vdash \lfloor L[k] \sub {\lceil \sigma \rceil} \rfloor : \lfloor \ty(L) \sub {\lceil \sigma \rceil} \rfloor\]
which is equivalent to:
\[ \Gamma \vdash \lfloor L[k] \rfloor \sub \sigma : \lfloor \ty(L) \rfloor \sub \sigma\]
and so follows from typing being preserved by substitution.
\end{proof}
By these results, many of the properties enjoyed by the typing judgements in \Cattr with a tame rule set \(\mathcal{R}\) also apply to the typing judgements for structured terms. The module \module{Catt.Tree.Structured.Typing.Properties} also introduces many functions for constructing the typing judgements for structured syntax. One such function is \func{Catt.Tree.Structured.Typing.Properties}{TySCoh}, which represents the admissibility of the following rule:
\begin{equation}
\label[rule]{rule:scoh}
\inferrule{S \vdash \arr s A t \\ \U \vdash L : S \\ \U \vdash \ty(L) \\ (\lfloor S \rfloor, \Supp(s), \Supp(t)) \in \mathcal{O}}{\U \vdash \SCoh S {\arr s A t} L : (\arr s A t) \sub L}
\end{equation}
In keeping with the theme of this section, one could define \(\Supp(s)\) as \(\Supp(\lfloor s \rfloor)\) for a structured term \(s : \STerm_\U\). However, we choose not to do this, instead giving a definition of support for structured syntax that leverages the extra information available in the syntax.
\begin{definition}
For a path \(p : \Path_T\), a structured term \(s : \STerm_\U\), a structured type \(A : \SType_\U\), and a labelling \(L : S \to \U\) with \(S = [S_0,\dots,S_n]\), we define their supports \(\Supp(p)\), \(\Supp(s)\), \(\Supp(A)\), and \(\Supp(L)\) by mutual recursion:
\begin{align*}
\Supp([n]) &= \{\lfloor [n] \rfloor\}\\
\Supp(k :: p) &= \Sigma(\Supp(p)) \sub {\inc_{T_k}} &\text{where }T = [T_0,\dots,T_n]\\[10pt]
\Supp(\SPath(p)) &= \Supp(p)\\
\Supp(\Inc_i(s)) &= \Sigma(\Supp(s)) \sub {\inc_{T_i}}&\text{where }\U = [T_0,\dots,T_n]\\
\Supp(\SCoh S A L) &= \Supp(L) \cup \Supp(\ty(L))\\
\Supp(\SOther(t)) &= \Supp(t)\\[10pt]
\Supp(\star) &= \emptyset\\
\Supp(\arr s A t) &= \Supp(s) \cup \Supp(A) \cup \Supp(t)\\
\Supp(L) &= \bigcup_{i=0}^{n+1} \Supp(L[i]) \cup \bigcup_{i=0}^n\Supp(L_i)
\end{align*}
\end{definition}
We note that each of these support definitions is naturally downwards closed, and there is no need to apply a downwards closure operator as was necessary for the support of \Catt syntax. By some routine calculations given in the formalisation module \module{Catt.Tree.Structured.Support}, these support definitions are equivalent to taking the support of the generated piece of syntax.
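In the running Haskell sketch, this support computation is a direct recursion over the syntax (names remain illustrative; variable sets are represented here as sets of paths, and \texttt{suppLabel} takes the source tree explicitly so that the vertex images and sub-labellings can be enumerated).
\begin{verbatim}
import qualified Data.Set as Set

type PathSet = Set.Set Path

-- Support of a path: the path itself together with the pair of
-- endpoints contributed at each suspension level, so the result
-- is downwards closed by construction.
suppPath :: Path -> PathSet
suppPath [n]     = Set.singleton [n]
suppPath (k : p) = Set.insert [k] . Set.insert [k + 1]
                 $ Set.map (k :) (suppPath p)
suppPath []      = Set.empty  -- unreachable: paths are non-empty

suppTerm :: STerm -> PathSet
suppTerm (SPath p)    = suppPath p
suppTerm (SInc i s)   = Set.insert [i] . Set.insert [i + 1]
                      $ Set.map (i :) (suppTerm s)
suppTerm (SCoh s _ l) = suppLabel s l `Set.union` suppType (lty l)

suppType :: SType -> PathSet
suppType SStar        = Set.empty
suppType (SArr s a t) = Set.unions [suppTerm s, suppType a, suppTerm t]

-- Support of a labelling from [T_0,...,T_n].
suppLabel :: Tree -> Label -> PathSet
suppLabel (Tree ts) l =
  Set.unions ([ suppTerm (app l [i]) | i <- [0 .. length ts] ]
           ++ [ suppLabel t (restrict i l) | (i, t) <- zip [0 ..] ts ])
\end{verbatim}
The two \texttt{Set.insert} clauses implement suspending a variable set and transporting it along the wedge inclusion, matching the equations above.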
The precise correspondence is given by the equations:
\begin{mathpar}
\Supp(p) = \Supp(\lfloor p \rfloor) \and \Supp(s) = \Supp(\lfloor s \rfloor) \and \Supp(A) = \Supp(\lfloor A \rfloor) \and \Supp(L) \cup \Supp(\ty(L)) = \Supp(\lfloor L \rfloor)
\end{mathpar}
which hold for any path \(p\), structured term \(s\), structured type \(A\), and labelling \(L\). By using this notion of support, we are able to avoid a lot of ``boilerplate'' proof. The above definition of support more closely resembles the format of structured terms, and without this definition, most proofs concerning the support of a structured term would begin by simplifying a variable set similar to \(\Supp(\lfloor s \rfloor)\) to one more similar to \(\Supp(s)\). Here, we instead give this equivalence proof once.
We end this section by giving alternative equality relations for labellings, which encapsulate the idea that a substitution is fully determined by where it sends locally maximal variables. These equalities are defined as follows for labellings \(L : T \to \U\) and \(M : T \to \U\):
\begin{align*}
L \equiv^{\max} M &\iff \forall (p : \MaxPath_T).\ L(p) \equiv M(p)\\
\U \vdash L =^{\max} M &\iff \forall (p : \MaxPath_T).\ \U \vdash L(p) = M(p)
\end{align*}
declaring two labellings to be related exactly when their actions on maximal paths agree. The following theorem gives conditions under which the standard equality relations can be recovered from these.
\begin{theorem}
\label{thm:label-max-equality}
Let \(L : S \to \U\) and \(M : S \to \U\) be labellings. Then the following rules are admissible:
\begin{mathpar}
\inferrule{\U \vdash L : S\\ \U \vdash M : S \\ L \equiv^{\max} M}{\U \vdash L = M}\and
\inferrule{\U \vdash L : S\\ \U \vdash M : S \\ L \equiv^{\max} M}{\U \vdash \ty(L) = \ty(M)}
\end{mathpar}
If the equality rule set \(\mathcal{R}\) satisfies the preservation and support conditions, then the rules above are still admissible with \(\U \vdash L =^{\max} M\) replacing the syntactic equalities.
\end{theorem}
\begin{proof}
We prove the results for the syntactic equality, with the results for the definitional equality following similarly, but using the preservation property instead of uniqueness of typing. We proceed by induction on the tree \(S\), proving the admissibility of both rules simultaneously.
First suppose that \(S = \emp\). Then the path \([0] : \Path_{\emp}\) is maximal, and so \(\U \vdash L = M\) follows by the reflexivity of equality. The second rule follows from the uniqueness of typing, as we get \(\U \vdash L[0] : \ty(L)\) and \(\U \vdash M[0] : \ty(M)\) from the premises.
Now suppose that \(S = [S_0,\dots,S_n]\). By the inductive hypothesis, the following judgements hold for each \(i \in \{0,\dots,n\}\):
\[ \U \vdash L_i = M_i \qquad \U \vdash \arr {L[i]} {\ty(L)} {L[i+1]} = \arr {M[i]} {\ty(M)} {M[i+1]}\]
From the equalities on types, we immediately get that \(\U \vdash \ty(L) = \ty(M)\), as is required for the admissibility of the second rule, and also get that \(\U \vdash L[i] = M[i]\) for each \(0 \leq i \leq n+1\), which along with the equality on (sub)labellings above is sufficient to prove that:
\[ \U \vdash L = M\]
which witnesses the admissibility of the first rule.
\end{proof}
\subsection{Standard coherences}
\label{sec:standard-coherences}
In \cref{sec:background}, we gave a preliminary definition of standard coherences, the canonical coherences over a given pasting diagram.
This definition relies on inclusion substitutions from the boundary of a pasting diagram onto its source and target variables, whose definition for ps-contexts can be unpleasant to work with. In contrast, the \(n\)-boundary of a tree and its associated source and target inclusions have a natural definition by induction on the tree, where the source and target inclusions are given by labellings. We give this definition below.
\begin{definition}
Given dimension \(n \in \mathbb{N}\) and \(T : \Tree\), we define the \emph{\(n\)-boundary} of the tree, \(\bound n T : \Tree\), by induction on \(n\) and \(T\):
\begin{equation*}
\bound 0 T = \emp \qquad \bound {n + 1} {[T_0, \dots, T_m]} = [\bound n {T_0}, \dots , \bound n {T_m}]
\end{equation*}
We further define path-to-path functions \(\incbdpath n \epsilon T : \bound n T \to T\) for \(\epsilon \in \{-,+\}\) by induction:
\begin{align*}
\incbdpath 0 - T ([0]) &= [0]\\
\incbdpath 0 + {[T_0, \dots, T_m]} ([0]) &= [m+1]\\
\incbdpath {n+1} \epsilon {[T_0, \dots, T_m]} ([k]) &= [k]\\
\incbdpath {n+1} \epsilon {[T_0,\dots, T_m]} (k :: p) &= k :: \incbdpath {n} \epsilon {T_k} (p)
\end{align*}
We can then define the \emph{source inclusion labelling} \(\incbd n - T : {\bound n T} \to T\) and \emph{target inclusion labelling} \(\incbd n + T : {\bound n T} \to T\) by:
\[\incbd n \epsilon T(p) = \SPath(\incbdpath n \epsilon T(p)) \qquad \ty(\incbd n \epsilon T) = \star\]
for each \(n\) and \(\epsilon \in \{-,+\}\).
\end{definition}
In the module \module{Catt.Tree.Boundary.Typing}, it is proven that:
\[ T \vdash \incbd n \epsilon T : \bound n T\]
for all trees \(T\), \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\).
In \cref{sec:background}, the source and target variable sets were defined to be the support of the source and target inclusions. This can now be justified by the following lemma.
\begin{lemma}
For a dimension \(n \in \mathbb{N}\), \(T : \Tree\), and \(\epsilon \in \{-,+\}\) we have:
\[ \Supp(\incbd n \epsilon T) = \bdry n \epsilon T \]
\end{lemma}
\begin{proof}
The proof is given by the function \func{Catt.Tree.Boundary.Support}{tree-inc-label-supp} in the formalisation module \module{Catt.Tree.Boundary.Support} and proceeds by induction on \(n\) and \(T\).
\end{proof}
This definition also allows simple inductive proofs that the boundary inclusions satisfy the globularity conditions, which we state in the following proposition. These proofs are given in the formalisation module \module{Catt.Tree.Boundary.Properties}.
\begin{proposition}
\label{prop:bdry-glob}
Let \(n \leq m\) and let \(T\) be a tree. Then:
\[ \bound n {\bound m T} \equiv \bound n T\]
Further, for \(\epsilon, \omega \in \{-,+\}\) we have:
\[ \incbd n \epsilon {\bound m T} \bullet \incbd m \omega T \equiv \incbd n \epsilon T \]
If instead \(n \geq \dep(T)\), then \(\bound n T \equiv T\) and \(\incbd n \epsilon T \equiv \id_T\).
\end{proposition}
Further, these constructions commute with suspension: the equalities \(\Sigma(\bound n T) \equiv \bound {n+1} {\Sigma(T)}\) and \(\Sigma(\incbd n \epsilon T) \equiv \incbd {n+1} \epsilon {\Sigma(T)}\) hold by definition.
We now recall the definitions of standard type, standard coherence, and standard term for a tree \(T\), which are given by mutual induction:
\begin{itemize}
\item The \emph{standard type}, \(\stdty T n\), is an \(n\)-dimensional type where each component of the type is given by the standard term over the appropriate boundary of the tree \(T\), and then included back into \(T\) by applying the inclusion labelling.
\item The \emph{standard coherence}, \(\stdcoh T n\), is the canonical dimension \(n\) coherence term over a tree \(T\). It is formed by a single coherence constructor over \(T\) with type given by the standard type, \(\stdty T n\).
\item The \emph{standard term}, \(\stdtm T n\), is a variation on the standard coherence which does not introduce unnecessary unary composites. If \(T\) is linear (and so represents a disc context), and \(n = \dep(T)\), then \(\stdtm T n\) is simply given by the unique maximal path in \(T\). Otherwise, it is given by the standard coherence \(\stdcoh T n\).
\end{itemize}
At the end of \cref{sec:background} it was stated that \(\Sigma(\stdtm T n) \equiv \stdtm {\Sigma(T)} {n + 1}\). Using this, the standard term can instead be defined by letting \(\stdtm \emp 0\) be \(\SPath([0])\), \(\stdtm {\Sigma(T)} {n+1}\) be \(\Sigma(\stdtm T n)\), and \(\stdtm T n\) be \(\stdcoh T n\) otherwise, which avoids the case split on the linearity of \(T\). We now define all three constructions formally using structured syntax.
\begin{definition}
We define the \(n\)-dimensional \emph{standard type} over a tree \(T\) as a structured type \(\stdty T n : \SType_T\), and the \(n\)-dimensional \emph{standard coherence} and \emph{standard term} over a tree \(T\) as structured terms \(\stdcoh T n, \stdtm T n : \STerm_T\) by mutual induction:
\begin{align*}
\stdty T 0 &= \star\\
\stdty T {n + 1} &= \arr {\stdtm {\bound n T} n \sub {\incbd {n} - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd {n} + T}}\\[10pt]
\stdcoh T n &= \SCoh T {\stdty T n} {\id_T}\\[10pt]
\stdtm T n &=
\begin{cases*}
\SPath([0])&if \(T = \emp\) and \(n = 0\)\\
\Inc_0(\stdtm {T_0} {n-1})&if \(n \neq 0\) and \(T = [T_0]\)\\
\stdcoh T n&otherwise
\end{cases*}
\end{align*}
When \(n = \dep(T)\), we call the standard coherence \(\stdcoh T n\) the \emph{standard composite} of \(T\).
\end{definition}
We can immediately show that these standard constructions commute with suspension.
\begin{lemma}
\label{lem:std-susp}
For a tree \(T\) and \(n \in \mathbb{N}\), \(\Sigma(\stdty T n) \equiv \stdty {\Sigma(T)} {n+1}\) and \(\Sigma(\stdcoh T n) \equiv \stdcoh {\Sigma(T)} {n+1}\).
\end{lemma}
\begin{proof}
We first consider the standard type. The case for \(n = 0\) follows immediately, so we let \(n > 0\). We then get for \(\epsilon \in \{-,+\}\):
\begin{align*}
\Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} \epsilon T}\right) &\equiv \Sigma(\stdtm {\bound {n-1} T} {n-1}) \sub {\Sigma(\incbd{n-1} \epsilon T)}&\text{by functoriality of suspension}\\
&\equiv \stdtm {\Sigma(\bound {n-1} T)} {n} \sub {\Sigma(\incbd{n-1} \epsilon T)}\\
&\equiv \stdtm {\bound n {\Sigma(T)}} n \sub {\incbd n \epsilon {\Sigma(T)}}
\end{align*}
By the inductive hypothesis \(\Sigma(\stdty T {n-1}) \equiv \stdty {\Sigma(T)} n\) and so
\begin{align*}
\Sigma(\stdty T n) &\equiv \arr {\Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} - T}\right)} {\Sigma(\stdty T {n-1})} {\Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} + T}\right)}\\
&\equiv \arr {\stdtm {\bound n {\Sigma(T)}} {n} \sub {\incbd n - {\Sigma(T)}}} {\stdty {\Sigma(T)} {n}} {\stdtm {\bound n {\Sigma(T)}} {n} \sub {\incbd n + {\Sigma(T)}}}\\
&\equiv \stdty {\Sigma(T)} {n + 1}
\end{align*}
as required. For the standard coherence we have:
\[ \Sigma(\stdcoh T n) \equiv \SCoh {\Sigma(T)} {\Sigma(\stdty T n)} {\Sigma(\id_T)} \equiv \SCoh {\Sigma(T)} {\stdty {\Sigma(T)} {n+1}} {\id_{\Sigma(T)}} \equiv \stdcoh {\Sigma(T)} {n+1}\]
following from the case for types.
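\end{proof}
Both the tree boundary and the standard constructions themselves transcribe into the running Haskell sketch (names illustrative as before; the partial pattern matches reflect the side conditions on paths and dimensions).
\begin{verbatim}
data Sign = Minus | Plus

-- The n-boundary of a tree.
treeBdry :: Int -> Tree -> Tree
treeBdry 0 _         = Tree []
treeBdry n (Tree ts) = Tree (map (treeBdry (n - 1)) ts)

-- Path-to-path action of the boundary inclusion.
inclPath :: Int -> Sign -> Tree -> Path -> Path
inclPath 0 Minus _         [0]     = [0]
inclPath 0 Plus  (Tree ts) [0]     = [length ts]
inclPath _ _     _         [k]     = [k]
inclPath n e     (Tree ts) (k : p) = k : inclPath (n - 1) e (ts !! k) p

inclLabel :: Int -> Sign -> Tree -> Label
inclLabel n e t = Label (SPath . inclPath n e t) SStar

idLabel :: Label
idLabel = Label SPath SStar

-- Standard type, coherence and term, by mutual recursion.
stdTy :: Tree -> Int -> SType
stdTy _ 0 = SStar
stdTy t n = SArr (half Minus) (stdTy t (n - 1)) (half Plus)
  where half e = applyT (stdTm (treeBdry (n - 1) t) (n - 1))
                        (inclLabel (n - 1) e t)

stdCoh :: Tree -> Int -> STerm
stdCoh t n = SCoh t (stdTy t n) idLabel

stdTm :: Tree -> Int -> STerm
stdTm (Tree [])  0          = SPath [0]
stdTm (Tree [t]) n | n /= 0 = SInc 0 (stdTm t (n - 1))
stdTm t          n          = stdCoh t n
\end{verbatim}
The three clauses of \texttt{stdTm} are exactly the case split in the definition above, with the suspension case written via \texttt{SInc 0}.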
To prove that the standard constructions are well-formed, we give a couple of lemmas. The first concerns the support of the standard term and standard coherence.
\begin{lemma}
\label{lem:std-supp}
For a tree \(T\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\), we have:
\[ \Supp\left(\stdtm {\bound n T} n \sub {\incbd n \epsilon T}\right) = \bdry n \epsilon T \qquad \Supp\left(\stdcoh {\bound n T} n \sub {\incbd n \epsilon T}\right) = \bdry n \epsilon T\]
\end{lemma}
\begin{proof}
The case for coherences follows from the definition and the equality
\[\Supp(\incbd n \epsilon T) = \bdry n \epsilon T\]
For the standard term, it suffices to consider cases where the standard term and standard coherence are not equal. If \(n = 0\), then \(\bound n T \equiv \emp\), and it suffices to prove that \(\Supp([m]) = \FV(\lfloor [m] \rfloor)\), but this is immediate because \(\Supp([m]) = \Supp(\lfloor [m] \rfloor)\) and \(\lfloor [m] \rfloor\) is a variable of type \(\star\), so its support is equal to its free variables.
We therefore consider the case where \(n > 0\) and \(\len(\bound n T) = 1\). The only case where this happens is if \(\len(T) = 1\) too, so assume \(T \equiv [T_0]\):
\begin{align*}
\Supp\left(\stdtm {\bound n T} n \sub {\incbd n \epsilon T} \right) &= \Supp\left(\stdtm {\Sigma(\bound {n-1} {T_0})} n \sub {\Sigma\left(\incbd {n-1} \epsilon {T_0}\right)} \right)\\
&= \Supp\left(\Sigma\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \right) \sub {\Sigma\left(\incbd {n-1} \epsilon {T_0}\right)} \right)\\
&= \Supp\left(\Sigma\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \sub {\incbd {n-1} \epsilon {T_0}} \right) \right)\\
&= \Sigma\left(\Supp\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \sub {\incbd {n-1} \epsilon {T_0}} \right) \right)\\
&= \Sigma\left( \bdry {n-1} \epsilon {T_0} \right)\\
&= \bdry n \epsilon T
\end{align*}
as required.
\end{proof}
The second lemma gives a globularity condition for the standard type.
\begin{lemma}
\label{lem:std-type-glob}
Let \(T\) be a tree. Then:
\[ \stdty T n \equiv \stdty {\bound m T} n \sub {\incbd m \epsilon T}\]
for \(n \leq m\) and \(\epsilon \in \{-,+\}\).
\end{lemma}
\begin{proof}
We induct on \(n\). If \(n = 0\) then both sides of the equation are the type \(\star\). We therefore consider the case for \(n + 1\), and so we must prove:
\begin{align*}
\stdty T {n+1} &\equiv \arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}\\
&\equiv \arr {\stdtm {\bound n {\bound m T}} n \sub {\incbd n - {\bound m T}} \sub {\incbd m \epsilon T}} {\stdty {\bound m T} n \sub {\incbd m \epsilon T}} {\stdtm {\bound n {\bound m T}} n \sub {\incbd n + {\bound m T}} \sub {\incbd m \epsilon T}}\\
&\equiv \stdty {\bound m T} {n+1} \sub {\incbd m \epsilon T}
\end{align*}
The equality \({\stdty T n} \equiv {\stdty {\bound m T} n \sub {\incbd m \epsilon T}}\) follows by the inductive hypothesis. Further, for \(\omega \in \{-,+\}\) we have by \cref{prop:bdry-glob}:
\begin{align*}
\stdtm {\bound n {\bound m T}} n \sub {\incbd n \omega {\bound m T}} \sub {\incbd m \epsilon T} &\equiv \stdtm {\bound n {\bound m T}} n \sub {\incbd n \omega {\bound m T} \bullet \incbd m \epsilon T} \\
&\equiv \stdtm {\bound n T} n \sub {\incbd n \omega T}
\end{align*}
which completes the proof.
\end{proof}
We can now state and prove the typing properties of standard constructions.
\begin{proposition}
\label{prop:standard-typing}
Suppose that \(\mathcal{O}\) contains the standard operations.
Then the following rules are admissible:
\begin{mathpar}
\inferrule{T : \Tree\\ n \in \mathbb{N}}{T \vdash \stdty T n}\and
\inferrule{T : \Tree \\ n \neq 0\\ n \geq \dep(T)}{T \vdash \stdcoh T n : \stdty T n}\and
\inferrule{T : \Tree \\ n \geq \dep(T)}{T \vdash \stdtm T n : \stdty T n}
\end{mathpar}
\end{proposition}
\begin{proof}
We prove that all three rules are admissible by mutual induction. First consider the cases for types. The case when \(n = 0\) is trivial, so we consider the case for \(n + 1\). We need to show that:
\[ T \vdash \arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}\]
The inductive hypothesis on types gives that \(T \vdash \stdty T n\), and so we must show that:
\[ T \vdash {\stdtm {\bound n T} n \sub {\incbd n \epsilon T}} : \stdty T n\]
for \(\epsilon \in \{-,+\}\). By the inductive hypothesis for terms, we have \(\bound n T \vdash \stdtm {\bound n T} n : \stdty {\bound n T} n\), as we have \(\dep(\bound n T) \leq n\). As \(T \vdash \incbd n \epsilon T : \bound n T\) we have that:
\[ T \vdash {\stdtm {\bound n T} n \sub {\incbd n \epsilon T}} : \stdty {\bound n T} n \sub {\incbd n \epsilon T} \]
and so by \cref{lem:std-type-glob}, this case is complete.
For the standard coherence, we apply \cref{rule:scoh}, using the inductive hypothesis for types. To show that \((T, \src(\stdty T n), \tgt(\stdty T n)) \in \mathcal{O}\), we apply \cref{lem:std-supp}.
For the standard term, as in previous proofs, it suffices to consider the cases where it is defined differently from the standard coherence. For \(n = 0\) we must have \(T = \emp\) by the condition on the depth of \(T\). Hence, \(\stdtm T n \equiv \SPath([0])\), which is well-formed as it has type \(\star \equiv \stdty T n\) as required. We now consider \(\stdtm {\Sigma(T)} {n+1} \equiv \Sigma (\stdtm T n)\). By the inductive hypothesis on dimension, \(T \vdash \stdtm T n : \stdty T n\), and so we immediately have that:
\[ \Sigma(T) \vdash \stdtm {\Sigma(T)} {n + 1} : \Sigma(\stdty T n)\]
and so the proof is complete by \cref{lem:std-susp}.
\end{proof}
The equality relations we have seen so far make heavy use of disc contexts and associated terms and types. We therefore pause to consider the form of these as structured syntax and to relate them to the standard constructions presented in this section.
All disc contexts are the result of applying iterated suspensions to the singleton context, and so it follows that disc contexts correspond exactly to linear trees. By an abuse of notation we write:
\[ D^n = \Sigma^n(\emp)\]
As we further have that \(\Sigma(U^n) \equiv U^{n+1}\) for the sphere type \(U^n\), it can be proved by a simple induction that:
\[U^n \equiv \lfloor \stdty {D^n} n \rfloor\]
As we have already noted, the maximal dimension term \(d_n : \Term_{D^n}\) is given by \(\lfloor \stdtm {D^n} n \rfloor\). It is also given by \(\lfloor p^n \rfloor\), where \(p^n = \Sigma^n([0])\) is the unique maximal path of \(D^n\), the list containing \(n+1\) zeros.
The only missing construction is an equivalent for the substitution from a disc context. From a structured term \(s : \STerm_\U\) of type \(A : \SType_\U\), there should be a labelling \(\{A,s\}\) from \(D^{\dim(A)}\) to \(\U\). This, however, proves more challenging to define, as trees and types have opposite inductive structure.
For a labelling, it is natural to specify the lower-dimensional terms first and fill in higher-dimensional terms by induction, whereas when deconstructing a type, we first receive the highest-dimensional terms, only receiving the lower-dimensional terms by further deconstructing the type. To define the labelling \(\{A,t\}\), we define the extension of a labelling from a linear tree, which allows us to add higher-dimensional terms to the labelling, and use this to define the labelling from a linear tree.
\begin{definition}
Let \(L : D^n \to \U\) be a labelling from a linear tree, and let \(s,t : \STerm_\U\) be structured terms. The \emph{extension} of \(L\) by \(s\) and \(t\), \(\ext(L,s,t)\), is defined inductively on \(n\) by:
\begin{equation*}
\ty(\ext(L,s,t)) = \ty(L) \qquad \ext(L,s,t) =
\begin{cases*}
L[0]\,\{t\}\,s &if \(n = 0\)\\
L[0]\,\{\ext(L_0,s,t)\}\,L[1]&otherwise
\end{cases*}
\end{equation*}
We then define the labelling \(\{A,t\}\) by induction on \(A\):
\[ \{\star,t\} = (p \mapsto t) \qquad \{\arr s A t, u\} = \ext(\{A,s\},t,u) \qquad \ty(\{A,t\}) = \star\]
\end{definition}
These constructions all satisfy the expected typing judgements. More precisely, the following inference rules are admissible:
\begin{mathpar}
\inferrule{\U \vdash L : D^n\\ \U \vdash s : \stdty {D^n} n \sub L\\ \U \vdash t : \arr {p^n \sub L} {\stdty {D^n} n \sub L} {s}}{\U \vdash \ext(L,s,t) : D^{n+1}}\\
\inferrule{\U \vdash A \\ \U \vdash t : A}{\U \vdash \{A,t\} : D^{\dim(A)}}
\end{mathpar}
The admissibility of the above rules is routine to verify.
Using these constructions, we can recover structured term definitions of the unary composite of a (structured) term \(t\) of type \(A\) of dimension \(n\) as \( \stdcoh {D^n} n \sub {\{A,t\}}\), and can define the identity of the same term \(t\) as \(\stdcoh {D^n} {n+1} \sub {\{A,t\}}\). Therefore, the rules for disc removal and endo-coherence removal can be rephrased in terms of structured syntax to get the following rules:
\begin{mathpar}
\inferrule{\U : \Ctx \uplus \Tree\\ \U \vdash A \\ \U \vdash t : A \\ \dim(A) = n > 0}{ \U \vdash \stdcoh {D^n} n \sub {\{A,t\}} = t}\textsc{dr'}\\
\inferrule{\U : \Ctx \uplus \Tree\\ T : \Tree \\ L : \arr T \star \U\\ n = \dim(A)\\\\ T \vdash A \\ T \vdash s : A \\ \Supp(s) = \Var(T) \\ \U \vdash L : T}{\U \vdash \SCoh T {\arr s A s} L = \stdcoh {D^n} {n+1} \sub { \{A, s\} \bullet L}}\textsc{ecr'}
\end{mathpar}
which are admissible if the equality rule set \(\mathcal{R}\) has disc removal or endo-coherence removal respectively.
We end this section with two further results that can be proven in the presence of disc removal and endo-coherence removal. The first states that disc removal is sufficient (and necessary) to unify standard coherences and standard terms.
\begin{theorem}
\label{thm:std-dr}
The tame equality rule set \(\mathcal{R}\) has disc removal if and only if the rule:
\begin{mathpar}
\inferrule{T : \Tree \\ n \in \mathbb{N}\\ n \geq \dep(T) > 0}{T \vdash \stdcoh T n = \stdtm T n}
\end{mathpar}
is admissible.
\end{theorem}
\begin{proof}
We note that \(\stdcoh T n\) and \(\stdtm T n\) only differ when \(T = D^n\). If \(\mathcal{R}\) has disc removal, then for each \(n \neq 0\) we have \(\stdcoh {D^n} n = \SPath(p^n) \equiv \stdtm {D^n} n\).
Conversely, if \(\stdcoh T n = \stdtm T n\) whenever \(n \geq \dep(T) > 0\), then \(\stdcoh {D^n} n = \stdtm {D^n} n\) for any \(n > 0\). Then as \(\mathcal{R}\) is tame, we can apply the labelling \(\{A,t\}\) to both sides of the equation to get the statement of disc removal.
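\end{proof}
Returning to the disc labellings, the constructions \(\ext(L,s,t)\) and \(\{A,t\}\) also transcribe into the running Haskell sketch (names illustrative as before; \texttt{cons1} is a hypothetical helper building the labelling \(u\,\{K\}\,v : A\) from a single sub-labelling).
\begin{verbatim}
-- Build the labelling  u {k} v : a  whose source tree has exactly
-- one subtree.
cons1 :: STerm -> Label -> STerm -> SType -> Label
cons1 u k v a = Label f a
  where f [0]     = u
        f [1]     = v
        f (0 : p) = app k p
        f _       = error "path outside the tree"

-- Extension of a labelling from the linear tree D^n by terms s, t.
ext :: Int -> Label -> STerm -> STerm -> Label
ext 0 l s t = cons1 (app l [0]) (Label (const t) a') s (lty l)
  where a' = SArr (app l [0]) (lty l) s  -- type of the inner label
ext n l s t = cons1 (app l [0]) (ext (n - 1) (restrict 0 l) s t)
                    (app l [1]) (lty l)

dimTy :: SType -> Int
dimTy SStar        = 0
dimTy (SArr _ a _) = 1 + dimTy a

-- The labelling {A, t} : D^(dim A) -> U determined by a term and
-- its type.
disc :: SType -> STerm -> Label
disc SStar        t = Label (const t) SStar
disc (SArr s a t) u = ext (dimTy a) (disc a s) t u
\end{verbatim}
Here the dimension argument of \texttt{ext} makes explicit the index \(n\) that is implicit in the definition above.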
Lastly, in the presence of endo-coherence removal, the standard coherences \(\stdcoh T n\) for which \(n > \dep(T)\) can be shown to be equal to identities.
\begin{theorem}
\label{thm:std-ecr}
Suppose the equality rule set \(\mathcal{R}\) has endo-coherence removal. Let \(T\) be a tree and suppose \(n \geq \dep(T)\). Then:
\[ T \vdash \stdcoh T {n+1} = \stdcoh {D^n} {n+1} \sub {\{\stdty T n, \stdtm T n\}} \]
\end{theorem}
\begin{proof}
The following chain of equalities holds:
\begin{align*}
\stdcoh T {n+1} &\equiv \SCoh T {\arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}} {\id_T}\\
&\equiv \SCoh T {\arr {\stdtm T n} {\stdty T n} {\stdtm T n}} {\id_T}&\text{by \cref{prop:bdry-glob}}\\
&= \stdcoh {D^n} {n+1} \sub {\{\stdty T n, \stdtm T n\}}&\text{by \textsc{ecr'}}
\end{align*}
where \textsc{ecr'} can be applied as \(\Supp(\stdtm T n) = \Var(T)\) by \cref{lem:std-supp}.
\end{proof}
Due to these two theorems, every standard term \(\stdtm T n\) with \(n \geq \dep(T)\) is equal to either the unique variable of the singleton context (when \(n = \dep(T) = 0\)), a standard composite (when \(n = \dep(T) > 0\)), or an identity (when \(n > \dep(T)\)), hence completely classifying the well-formed standard terms.
\section{Insertion}
\label{sec:insertion}
We now introduce \emph{insertion}, the construction that powers the strictly associative behaviour of \Cattsua. Insertion incorporates part of the structure of a locally maximal argument term into the head coherence, simplifying the overall syntax of the term.
Consider the composite \(f * (g * h)\). This term has two locally maximal arguments, \(f\) and \(g * h\), the second of which is a (standard) coherence. Insertion allows us to merge these two composites into one by ``inserting'' the pasting diagram of the inner coherence into the pasting diagram of the outer coherence. In the example above, we get that the term \(f * (g * h)\) is equal to the ternary composite \(f * g * h\), a term with a single coherence. As the term \((f * g) * h\) also reduces by insertion to the ternary composite, we see that both sides of the associator become equal under insertion. The action of insertion on these contexts is shown in \cref{fig:insertion}.
\begin{figure}
$$
\begin{aligned}
\begin{tikzpicture}
\node (x) at (0,0) {$x$};
\node (y) at (1.5,0) {$y$};
\node (z) at (3,0) {$z$};
\draw [->] (x) to node [above, font=\small] {$f$} (y);
\draw [->] (y) to node [above, font=\small] {$g*h$} (z);
\begin{scope}[xshift=1.25cm, yshift=1.75cm, red]
%\draw [fill=red!10, draw=none] (1,0.05) ellipse (1.2cm and .6cm);
\draw [rounded corners, fill=red!7, draw=none] (-.25,-.35) rectangle +(2.5,1);
\node (x2) at (0,0) {$x'$};
\node (y2) at (1,0) {$y'$};
\node (z2) at (2,0) {$z'$};
\draw [->] (x2) to node [above, font=\small] {$g$} (y2);
\draw [->] (y2) to node [above, font=\small] {$h$} (z2);
\end{scope}
\draw [->, very thick, red] (2.25,1.25) to +(0,-.5);
\end{tikzpicture}
\end{aligned}
\quad\leadsto\quad
\begin{aligned}
\begin{tikzpicture}
\node (x) at (0,0) {$x \vphantom'$};
\node [red] (y) at (1,0) {$x'$};
\node [red] (z) at (2,0) {$y'$};
\node [red] (w) at (3,0) {$z'$};
\begin{scope}[xshift=.5cm, yshift=1.5cm, red]
\draw [rounded corners, fill=white, draw=none] (-.25,-.35) rectangle +(2.5,1);
\end{scope}
\draw [->] (x) to node [above, font=\small] {$f$} (y);
\draw [->, red] (y) to node [above, font=\small] {$g$} (z);
\draw [->, red] (z) to node [above, font=\small] {$h$} (w);
\end{tikzpicture}
\end{aligned}
$$
\caption{Insertion acting on the composite \(f * (g * h)\).}
\label{fig:insertion}
\end{figure}
Insertion is an operation that is best understood with respect to trees instead of ps-contexts. Insertion merges the structure of two trees along a \emph{branch} of the first tree.
\begin{definition}
Let \(S\) be a tree. A \emph{branch} of \(S\) is a non-empty list of natural numbers \(P\) that indexes a linear subtree \(S^P\). From each branch \(P\), a maximal path \(\olsi P\) can be obtained by concatenating \(P\) with \(p^{\dep(S^P)}\), the unique maximal path of \(S^P\). For a branch \(P\), we further define the \emph{branch height}, \(\bh(P)\), to be one less than the length of \(P\) (noting that branches are non-empty lists), and the \emph{leaf height}, \(\lh(P)\), to be one less than the length of \(\olsi P\), which is equal to the dimension of \(\lfloor \olsi P \rfloor\).
\end{definition}
While each branch \(P\) uniquely determines a maximal path \(\olsi P\), the converse does not hold: there may be multiple branches of a tree which correspond to the same maximal path. Consider the tree \(T = [[[[\emp],\emp],\emp]]\). This has two distinct branches \(P = [0,0,0]\) and \(Q = [0,0,0,0]\) which both correspond to the maximal path \([0,0,0,0,0]\). We graphically depict these branches below by drawing them in blue.
\[ P =
\begin{tikzpicture}[yscale=0.7,every node/.append style={scale=0.6},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid, Diag1] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw[Diag1,very thick] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[Diag1,very thick] (x31.center) to (x41.center);
\end{scope}
\end{tikzpicture}
\qquad Q =
\begin{tikzpicture}[yscale=0.7,every node/.append style={scale=0.6},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[Diag1,very thick] (x31.center) to (x41.center);
\end{scope}
\end{tikzpicture}
\]
While \(P\) and \(Q\) represent the same path, they have different branch heights: the branch height of \(P\) is \(2\) while the branch height of \(Q\) is \(3\). This will cause insertions along these two branches to proceed differently (though we will see later in \cref{lem:insertion-irrel} that if both insertions are valid then the results are equivalent). The leaf height and branch height of the branch \(P\) are illustrated in \cref{fig:leafheight}, where we also depict the trunk height of \(T\), which was defined in \cref{sec:trees}.
\begin{figure}
\[
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid, Diag1] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$};
\node [left=0 of x31.center ,on grid] {$T^P$};
\node [right=0 of x41.center ,on grid] {$\olsi P$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw[Diag1,very thick] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[Diag1,very thick] (x31.center) to (x41.center);
\end{scope}
\node [on grid] at (-0.7,1)(th) {};
\draw [|->] (-0.7,0) to node [left] {$\th(T)$} (th.center);
\draw [dotted,very thick] (th) to (x11);
\node [on grid] at (-1.5,2) (bh) {};
\draw [|->] (-1.5,0) to node [left] {$\bh(P)$} (bh.center);
\draw [dotted,very thick] (bh) to (x21);
\node [on grid] at (-2.3,4) (lh) {};
\draw [|->] (-2.3,0) to node [left] {$\lh(P)$} (lh.center);
\draw [dotted,very thick] (lh) to (x41);
\end{tikzpicture}
\]
\caption{\label{fig:leafheight} Leaf height, branch height and trunk height.}
\end{figure}
Let us again consider the tree \(S = [[\emp,\emp],\emp]\) from \cref{fig:tree-example}.
This tree has three branches, corresponding to the maximal paths \([0,0,0]\), \([0,1,0]\), and \([1,0]\). We consider the action of inserting the three trees \(T\), \(T'\), and \(T''\), given below, into the branch \(P = [0,0]\), which corresponds to the first of these maximal paths.
\[T =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1)(x11) {$\bullet$};
\node [on grid] at (-0.5, 2) (x21) {$\bullet$};
\node [on grid] at (0.5,2)(x22) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{tikzpicture}
\qquad T' =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1)(x11) {$\bullet$};
\draw (x01.center) to (x11.center);
\path [draw=none] (-0.5,0) to (0.5,0);
\end{tikzpicture}
\qquad T'' =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x21.base)]
\node [on grid] at (0,1)(x11) {$\bullet$};
\node [on grid] at (-0.5, 2) (x21) {$\bullet$};
\node [on grid] at (0.5,2)(x22) {$\bullet$};
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{tikzpicture}
\]
We first consider the insertion of \(T\) into \(S\), which returns the inserted tree \(\insertion S P {T}\), where \(P\) is drawn in blue on the diagram.
\[ S =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag1] at (-0.9, 2)(x21) {$\bullet$};
\node [on grid] at (-0.1, 2)(x22) {$\bullet$};
\node [on grid] at (0.5,1)(x12) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw[Diag1, very thick] (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{scope}
\end{tikzpicture}
\qquad T =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5, 2) (x21) {$\bullet$};
\node [on grid] at (0.5,2)(x22) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{tikzpicture}
\qquad \insertion S P {T} =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid, Diag2] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag2] at (-0.9, 2)(x21) {$\bullet$};
\node [on grid, Diag2] at (-0.5, 2)(x22) {$\bullet$};
\node [on grid] at (-0.1, 2)(x23) {$\bullet$};
\node [on grid] at (0.5,1)(x12) {$\bullet$};
\begin{scope}[on background layer]
\draw [Diag2] (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw [Diag2] (x11.center) to (x21.center);
\draw [Diag2] (x11.center) to (x22.center);
\draw (x11.center) to (x23.center);
\end{scope}
\end{tikzpicture}
\]
In this case the structure of \(T\) is compatible with the point of insertion \(P\) and \(T\) can be inserted into \(S\), replacing the branch \(P\) with the appropriate part of \(T\), where this appropriate part is obtained by removing the trunk of \(T\).
We now consider the insertion of \(T'\) into \(S\). Despite \(T'\) having a lower depth than \(S\), it is still insertable, forming the following tree \(\insertion S P {T'}\).
\[ S =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag1] at (-0.9, 2)(x21) {$\bullet$};
\node [on grid] at (-0.1, 2)(x22) {$\bullet$};
\node [on grid] at (0.5,1)(x12) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw[Diag1, very thick] (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{scope}
\end{tikzpicture}
\qquad T' =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1)(x11) {$\bullet$};
\draw (x01.center) to (x11.center);
\path [draw=none] (-0.5,0) to (0.5,0);
\end{tikzpicture}
\qquad \insertion S P {T'} =
\begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid,Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid,Diag2] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (-0.5, 2)(x21) {$\bullet$};
\node [on grid] at (0.5,1)(x12) {$\bullet$};
\begin{scope}[on background layer]
\draw [Diag2] (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\end{scope}
\end{tikzpicture}
\]
Here, the branch \(P\) is replaced by a singleton tree, which is the remaining part of \(T'\) after removing its trunk. We note that this operation is the same as pruning the locally maximal variable \(\lfloor \olsi P \rfloor\) from \(\lfloor S \rfloor\). We will see in \cref{sec:univ-prop-insert} that all instances of pruning can be represented as instances of insertion.
When we consider the insertion of \(T''\) into \(S\), it is not clear how to proceed, as there is no ``corresponding part'' of \(T''\) to replace the branch \(P\) with. In the other two cases this is obtained by removing the trunk of the tree, but \(T''\) has no trunk to remove. In this case we say that the insertion is not possible to perform, as \(\bh(P) > \th(T'')\), violating a condition necessary for insertion.
More generally, we consider a (structured) coherence term \(\SCoh S A L : \STerm_\U\). To apply insertion to this term, we must first identify a branch \(P\) of \(S\) such that \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\); that is, there is a locally maximal argument of \(L\) which is a standard coherence. We then must construct the following data as part of the insertion operation:
\begin{itemize}
\item The \emph{inserted tree} \(\insertion S P T\), obtained by inserting \(T\) into \(S\) along the branch \(P\). We have already given some examples of this operation; a sketch of it is given after this list.
\item The \emph{interior labelling} \(\iota : T \to \insertion S P T\), the inclusion of \(T\) into a copy of \(T\) living in the inserted tree.
\item The \emph{exterior labelling} \(\kappa : S \to \insertion S P T\), which maps \(\olsi P\) to the standard coherence over the copy of \(T\), more specifically \(\stdcoh T {\lh(P)} \sub \iota\), and other maximal paths to their copies in the inserted tree.
\item The \emph{inserted labelling} \(\insertion L P M : \insertion S P T \to \U\), which collects the appropriate parts of \(L\) and \(M\).
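\end{itemize}
Before giving the formal definitions, we note that the inserted tree itself fits naturally into the running Haskell sketch (names illustrative as before; the error case signals a violation of the trunk height condition discussed above).
\begin{verbatim}
-- A branch is a non-empty list of indices picking a linear subtree.
type Branch = [Int]

-- Insert tree t into tree s along branch p.
insertTree :: Tree -> Branch -> Tree -> Tree
insertTree (Tree ss) [k] (Tree ts) =
  Tree (take k ss ++ ts ++ drop (k + 1) ss)
insertTree (Tree ss) (k : q) (Tree [t0]) =
  Tree (take k ss ++ [insertTree (ss !! k) q t0] ++ drop (k + 1) ss)
insertTree _ _ _ = error "branch height exceeds trunk height"
\end{verbatim}
For example, with \(S = [[\emp,\emp],\emp]\), \(P = [0,0]\), and \(T = [[\emp,\emp]]\) as above, \texttt{insertTree} produces \([[\emp,\emp,\emp],\emp]\), matching the diagram for \(\insertion S P T\).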
Using this notation, insertion yields the following equality:
\[\SCoh S A L = \SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M}\]
These constructions can be assembled into the following diagram, where \(n = \lh(P)\):
% https://q.uiver.app/?q=WzAsNSxbMCwwLCJEX24iXSxbMSwwLCJcXERlbHRhIl0sWzAsMSwiXFxUaGV0YSJdLFsxLDEsIlxcaW5zZXJ0aW9uIFxcRGVsdGEgeCBcXFRoZXRhIl0sWzIsMiwiXFxHYW1tYSJdLFsxLDMsIlxca2FwcGEiXSxbMiwzLCJcXGlvdGEiLDJdLFswLDEsIlxce0EseFxcfSJdLFswLDIsIlxce1xcbWF0aGNhbHtVfV9cXFRoZXRhXm4sIFxcbWF0aGNhbHtDfV9cXFRoZXRhXm5cXH0iLDJdLFsxLDQsIlxcc2lnbWEiLDAseyJjdXJ2ZSI6LTN9XSxbMiw0LCJcXHRhdSIsMix7ImN1cnZlIjoyfV0sWzMsNCwiXFxpbnNlcnRpb24gXFxzaWdtYSB4IFxcdGF1IiwxXSxbMywwLCIiLDEseyJzdHlsZSI6eyJuYW1lIjoiY29ybmVyIn19XV0= % tex-fmt: skip
\[\begin{tikzcd}
{D^n} & S \\
T & {\insertion S P T} \\
&& \U
\arrow["\kappa", from=1-2, to=2-2]
\arrow["\iota"', from=2-1, to=2-2]
\arrow["{\{\ty(\olsi P), \olsi P\}}", from=1-1, to=1-2]
\arrow["{\{\stdty T n, \stdcoh T n\}}"', from=1-1, to=2-1]
\arrow["L", curve={height=-18pt}, from=1-2, to=3-3]
\arrow["M"', curve={height=12pt}, from=2-1, to=3-3]
\arrow["{\insertion L P M}"{description}, from=2-2, to=3-3]
% \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd}
\]
It will be proven in \cref{sec:univ-prop-insert} that the square above is cocartesian, and so \(\insertion S P T\) is the pushout of \(S\) and \(T\). We now begin to define each of these constructions in turn. As we need a lot of data to perform an insertion, we will package it up to avoid repetition.
\begin{definition}
An \emph{insertion point} is a triple \((S,P,T)\) such that \(S\) and \(T\) are trees and \(P\) is a branch of \(S\) with \(\bh(P) \leq \th(T)\) and \(\lh(P) \geq \dep(T)\). An \emph{insertion redex} is a sextuple \((S,P,T,\U,L,M)\) such that \((S,P,T)\) is an insertion point, \(L : S \to \U\) and \(M : T \to \U\) are labellings with \(\ty(L) \equiv \ty(M) \equiv \star\), and \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\).
\end{definition}
We can now define the insertion operation on trees.
\begin{definition}[Inserted tree]
Let \((S,P,T)\) be an insertion point. Define the \emph{inserted tree} \(\insertion S P T\) by induction on the branch \(P\), noting that \(P\) is always non-empty.
\begin{itemize}
\item Suppose \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\). Then:
\[\insertion S P T = [S_0,\dots,S_{k-1}] \doubleplus T \doubleplus [S_{k+1},\dots,S_n]\]
\item Suppose \(P = k :: Q\) and again \(S = [S_0,\dots,S_k,\dots,S_n]\). We note that \(Q\) is a branch of \(S_k\), and by the condition on the trunk height of \(T\) we have \(T = \Sigma(T_0)\). Then:
\[\insertion S P T = [S_0,\dots,S_{k-1},(\insertion {S_k} {Q} {T_0}),S_{k+1},\dots,S_n ] \]
\end{itemize}
We draw attention to the condition of the trunk height of \(T\) being at least the branch height of \(P\), which is necessary for the induction to proceed. We recall that a tree is identified with a list of trees, and that in the first case of insertion \(T\) is treated as a list, while in the second case \(\insertion {S_k} {Q} {T_0}\) is treated as a single tree which forms one of the subtrees of \(\insertion S P T\).
\end{definition}
We now proceed to define the interior and exterior labellings, which will be done using the diagrammatic notation introduced in \cref{sec:wedge-sums}.
\begin{definition}[Interior labelling]
Given an insertion point \((S, P, T)\), we define the interior labelling \(\iota_{S,P,T} : T \to \insertion S P T\) by induction on~\(P\).
\begin{itemize}
\item When \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\) we define \(\iota\) by \(\ty(\iota) = \star\) and:
% https://q.uiver.app/?q=WzAsNixbMCwwLCJTXzBcXHZlZVxcZG90c1xcdmVlIFNfe2stMX0iXSxbNCwwLCJTX3trKzF9IFxcdmVlIFxcZG90cyBcXHZlZSBTX24iXSxbMiwwLCJUIl0sWzMsMCwiXFx2ZWUiXSxbMSwwLCJcXHZlZSJdLFsyLDIsIlQiXSxbNSwyLCJcXGlkIl1d
% tex-fmt: skip
\[\begin{tikzcd}[column sep=smaller,row sep=10pt]
{[S_0,\dots,S_{k-1}]} & \doubleplus & T & \doubleplus & {[S_{k+1},\dots,S_n]} \\
\\
&& T
\arrow["\id"{font = \normalsize}, from=3-3, to=1-3]
\end{tikzcd} \]
\item When \(P = k :: Q\), \(S = [S_0,\dots,S_k,\dots,S_n]\), and \(T = [T_0]\) (by the trunk height condition) we define \(\iota\) by \(\ty(\iota) = \star\) and:
% https://q.uiver.app/?q=WzAsNixbMCwwLCJcXGxmbG9vciBbU18xLFxcZG90cyxTX3trLTF9XSBcXHJmbG9vciJdLFs0LDAsIlxcbGZsb29yIFtTX3trKzF9LFxcZG90cyxTX25dIFxccmZsb29yIl0sWzIsMCwiXFxTaWdtYSBcXGxmbG9vciBcXGluc2VydGlvbiB7U19rfSB7UCd9IHtUXzF9IFxccmZsb29yIl0sWzMsMCwiXFx2ZWUiXSxbMSwwLCJcXHZlZSJdLFsyLDIsIlxcU2lnbWEgXFxsZmxvb3IgVF8xIFxccmZsb29yIl0sWzUsMiwiXFxTaWdtYSBcXGlvdGFfe1NfayxQJyxUXzF9Il1d
% tex-fmt: skip
\[\begin{tikzcd}[column sep=smaller, row sep=10pt]
{[S_0,\dots,S_{k-1}]} & \vee & {\Sigma \insertion {S_k} {Q} {T_0}} & \vee & {[S_{k+1},\dots,S_n]} \\
\\
&& {\Sigma T_0}
\arrow["{\Sigma \iota_{S_k,Q,T_0}}"{font = \normalsize}, from=3-3, to=1-3]
\end{tikzcd} \]
\end{itemize}
We may drop the subscripts on \(\iota\) when they are easily inferred.
\end{definition}
\begin{definition}[Exterior labelling]
Given an insertion point \((S, P, T)\), we define the exterior labelling \(\kappa_{S,P,T} : S \to \insertion S P T\) by induction on \(P\).
\begin{itemize}
\item When \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\) we define \(\kappa\) by \(\ty(\kappa) = \star\) and:
% https://q.uiver.app/?q=WzAsMTAsWzAsMCwiXFxsZmxvb3IgW1NfMSxcXGRvdHMsU197ay0xfV0gXFxyZmxvb3IiXSxbNCwwLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFsyLDAsIlxcbGZsb29yIFQgXFxyZmxvb3IiXSxbMywwLCJcXHZlZSJdLFsxLDAsIlxcdmVlIl0sWzIsMiwiXFxTaWdtYSBcXGxmbG9vciBTX2sgXFxyZmxvb3IiXSxbMCwyLCJcXGxmbG9vciBbU18xLFxcZG90cyxTX3trLTF9XFxyZmxvb3IiXSxbMSwyLCJcXHZlZSJdLFszLDIsIlxcdmVlIl0sWzQsMiwiXFxsZmxvb3IgW1Nfe2srMX0sXFxkb3RzLFNfbl0gXFxyZmxvb3IiXSxbNSwyLCJcXHtcXG1hdGhjYWx7VX1fVF5uLCBcXG1hdGhjYWx7Q31fVF5uXFx9Il0sWzYsMCwiXFxpZCJdLFs5LDEsIlxcaWQiXV0=
% tex-fmt: skip
\[\begin{tikzcd}[column sep=smaller,row sep = 10pt]
{[S_0,\dots,S_{k-1}]} & \doubleplus & {T} & \doubleplus & {[S_{k+1},\dots,S_n]} \\
\\
{[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]}
\arrow["{\{\stdty T m, \stdcoh T m\}}"{font = \normalsize, pos=.4}, from=3-3, to=1-3]
\arrow["\id"{font = \normalsize}, from=3-1, to=1-1]
\arrow["\id"{font = \normalsize}, from=3-5, to=1-5]
\end{tikzcd}\]
Here we note that, by the condition of \(P\) being a branch, \(S_k\) is linear, and so \(\Sigma \lfloor S_k \rfloor\) is the disc \(D^m\) where \(m = \dep(S_k) + 1\).
\item When \(P = k :: Q\), \(S = [S_0,\dots,S_k,\dots,S_n]\), and \(T = [T_0]\) (by the trunk height condition) we define \(\kappa\) by \(\ty(\kappa) = \star\) and:
% https://q.uiver.app/?q=WzAsMTAsWzAsMCwiXFxsZmxvb3IgW1NfMSxcXGRvdHMsU197ay0xfV0gXFxyZmxvb3IiXSxbNCwwLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFsyLDAsIlxcU2lnbWEgXFxsZmxvb3IgXFxpbnNlcnRpb24ge1Nfa30ge1AnfSB7VF8xfSBcXHJmbG9vciJdLFszLDAsIlxcdmVlIl0sWzEsMCwiXFx2ZWUiXSxbMiwyLCJcXFNpZ21hIFxcbGZsb29yIFNfayBcXHJmbG9vciJdLFswLDIsIlxcbGZsb29yIFtTXzEsXFxkb3RzLFNfe2stMX1cXHJmbG9vciJdLFsxLDIsIlxcdmVlIl0sWzMsMiwiXFx2ZWUiXSxbNCwyLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFs1LDIsIlxcU2lnbWEgXFxrYXBwYV97U19rLFAnLFRfMX0iXSxbNiwwLCJcXGlkIl0sWzksMSwiXFxpZCJdXQ==
% tex-fmt: skip
\[\begin{tikzcd}[column sep=smaller, row sep = 10pt]
{[S_0,\dots,S_{k-1}]} & \vee & {\Sigma \insertion {S_k} {Q} {T_0}} & \vee & {[S_{k+1},\dots,S_n]} \\
\\
{[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]}
\arrow["{\Sigma \kappa_{S_k,Q,T_0}}"{font=\normalsize}, from=3-3, to=1-3]
\arrow["\id"{font=\normalsize}, from=3-1, to=1-1]
\arrow["\id"{font=\normalsize}, from=3-5, to=1-5]
\end{tikzcd}\]
\end{itemize}
Again the subscripts on \(\kappa\) may be dropped where they can be inferred.
\end{definition}
Lastly, we define the inserted labelling, the labelling out of the inserted tree.
\begin{definition}[Inserted labelling]
Given an insertion point \((S, P, T)\) with \(L : S \to \U\) and \(M : T \to \U\), we define the \emph{inserted labelling} \(\insertion L P M : {\insertion S P T} \to \U\). Let
\[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\]
and then proceed by induction on \(P\).
\begin{itemize}
\item Let \(P = [k]\), and
\[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\]
Then define \(\insertion L {[k]} M\) to be:
\[s_0\{L_0\}s_1 \cdots \{L_{k-1}\}t_0\{M_0\}t_1\cdots \{M_m\}t_{m+1}\{L_{k+1}\}s_{k+2}\cdots \{L_n\}s_{n+1} : A\]
\item Suppose \(P = k :: Q\) so that
\[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\]
Define \(\insertion L P M\) as:
\[s_0\{L_0\}s_1\cdots \{L_{k-1}\}t_0\{\insertion {L_k} {Q} {M_0}\}t_1\{L_{k+1}\}s_{k+2} \cdots \{L_n\}s_{n+1} : A\]
\end{itemize}
\end{definition}
We now proceed to prove that each of these constructions used to generate insertion is well-formed. We begin with the following small lemma.
\begin{lemma}
\label{lem:inserted-label-lem}
Let \((S,P,T,\U,L,M)\) be an insertion redex. If we further suppose that \(\U \vdash L : S\) and \(\U \vdash M : T\), then:
\[ \U \vdash \arr {L[k]} {\ty(L)} {L[k+1]} = \arr {M[0]} {\ty(M)} {M[m+1]}\]
where \(k\) is the first element of \(P\) (as \(P\) is non-empty) and \(T = [T_0,\dots,T_m]\).
\end{lemma}
\begin{proof}
From the insertion redex, we have \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\). By assumption, \(P\) is of the form \(k :: p\), where \(p\) is a path and \(S = [S_0,\dots,S_n]\), and so
\[\SPath(\olsi P) \equiv \Inc_k(\SPath(p)) \]
Supposing then that \(S_k \vdash \SPath(p) : A\) (as every path is well-formed), we can obtain:
\[\U \vdash \SPath(\olsi P) \sub L : \Sigma(A) \sub {\inc_k} \sub L\]
By \cref{prop:standard-typing}, \(\U \vdash \stdcoh T {\lh(P)} \sub M : \stdty T {\lh(P)} \sub M\).
Therefore, by uniqueness of types (using the syntactic equality from the insertion redex), we have:
\[ \U \vdash \Sigma(A) \sub {\inc_k \bullet L} = \stdty T {\lh(P)} \sub M\]
By truncating both sides of this equality \(\lh(P) - 1\) times we get:
\[ \U \vdash \Sigma(\star) \sub {\inc_k \bullet L} = \stdty T 1 \sub M\]
which after expanding definitions on both sides gives the required equality.
\end{proof}
The typing properties of each of the constructions involved in insertion are given in the following proposition.
\begin{proposition}
\label{prop:ins-typing}
Let \((S,P,T)\) be an insertion point. Then:
\[\insertion S P T \vdash \iota_{S,P,T} : T \qquad \insertion S P T \vdash \kappa_{S,P,T} : S\]
If we further have \(\U \vdash L : S\) and \(\U \vdash M : T\) with \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\) then:
\[ \U \vdash \insertion L P M : \insertion S P T\]
\end{proposition}
\begin{proof}
The labellings \(\iota\) and \(\kappa\) are formed using constructions that have already been shown to be well-formed. We therefore focus on the typing judgement for the inserted labelling. As in the definition of the inserted labelling, we let
\[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\]
By inspection of the typing derivation \(\U \vdash L : S\) we have that \(\U \vdash s_i : A\) and \(\U \vdash L_i : S_i\) for each \(i\). We then proceed by induction on \(P\).
\begin{itemize}
\item Let \(P = [k]\) and
\[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\]
By \(\U \vdash M : T\), we have that \(\U \vdash t_i : B\) and \(\U \vdash M_i : T_i\) for each \(i\). Applying \cref{lem:inserted-label-lem}, we have \(\U \vdash A = B\), \(\U \vdash s_k = t_0\), and \(\U \vdash s_{k+1} = t_{m+1}\). Therefore, by applying the conversion rule, \(\U \vdash t_i : A\). To complete this case, we must show that for each \(i\):
\[ \U \vdash (\insertion L P M)_i : (\insertion S P T)_i\]
For most \(i\) this is trivial; however, there is a subtlety for \(i = k-1\), as \((\insertion L P M)_{k-1} \not\equiv L_{k-1}\):
\[\ty((\insertion L P M)_{k-1}) \equiv \arr {s_{k-1}} A {t_0} \not\equiv \arr {s_{k-1}} A {s_k} \equiv \ty(L_{k-1})\]
However, the equality \(\U \vdash s_k = t_0\) means that these two types are definitionally equal, and so the required typing derivation follows from \(\U \vdash L_{k-1} : S_{k-1}\). A similar argument is needed to prove that \(\U \vdash L_{k+1} : S_{k+1}\), completing this case.
\item Suppose \(P = k :: Q\) so that
\[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\]
with \(\U \vdash M_0 : T_0\) and \(\U \vdash t_i : B\) for \(i \in \{0,1\}\). Then:
\begin{align*}
L_k(\olsi{Q}) &\equiv L(\olsi P)\\
&\equiv \stdcoh T {\lh(P)} \sub M\\
&\equiv \Sigma \left(\stdcoh {T_0} {\lh(Q)}\right) \sub M\\
&\equiv \stdcoh {T_0} {\lh(Q)} \sub {M_0}
\end{align*}
and so by inductive hypothesis, we have \(\U \vdash\insertion {L_k} {Q} {M_0} : \insertion {S_k} {Q} {T_0}\). Then by a similar argument to above it can be shown that \(\insertion L P M\) is well-formed.
\end{itemize}
Hence, \(\U \vdash \insertion L P M : \insertion S P T\) for all branches \(P\).
\end{proof}
Before formally giving the equality rule set for insertion, we illustrate the constructions above on a small example.
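Consider the case where \(S = [S_0,S_1]\) and \(T = [T_0,T_1]\), with \(P = [1]\), assuming that \(S_1\) is linear (so that \(P\) is a branch) and that the insertion point conditions hold. The first case of the inserted tree definition gives:
\[ \insertion {[S_0,S_1]} {[1]} {[T_0,T_1]} = [S_0] \doubleplus [T_0,T_1] \doubleplus [\,] = [S_0,T_0,T_1]\]
Given labellings \(L = s_0\{L_0\}s_1\{L_1\}s_2 : A\) and \(M = t_0\{M_0\}t_1\{M_1\}t_2 : B\), the inserted labelling is:
\[ \insertion L {[1]} M = s_0\{L_0\}t_0\{M_0\}t_1\{M_1\}t_2 : A\]
where the terms \(s_1\) and \(s_2\) have been replaced by \(t_0\) and \(t_2\), to which they are definitionally equal by \cref{lem:inserted-label-lem} under the typing assumptions \(\U \vdash L : S\) and \(\U \vdash M : T\). When each \(S_i\) and \(T_j\) is a singleton tree, this recovers the motivating example of flattening a binary composite, whose second argument is itself a binary composite, into a ternary composite. We now end this section by formally giving the equality rule set for insertion.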
\begin{definition}
\label{def:insertion-rule}
The \emph{insertion rule set}, \insert, is the set consisting of the triples:
\[ (\Gamma, \lfloor \SCoh S A L \rfloor, \lfloor \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \rfloor)\]
for each insertion redex \((S,P,T,\Gamma,L,M)\) and each structured type \(A\). A set of rules \(\mathcal{R}\) \emph{contains insertion} if \(\insert \subseteq \mathcal{R}\). The insertion rule set makes the following rule admissible:
\begin{equation*}
\inferrule{(S,P,T,\Gamma,L,M)\text{ is an insertion redex}\\ S \vdash A \\ \Gamma \vdash L : S}{\Gamma \vdash \SCoh S A L = \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}}
\end{equation*}
The set \(\mathcal{R}\) \emph{has insertion} if the rule \textsc{insert} holds in the generated theory.
\end{definition}
\subsection{Universal property of insertion}
\label{sec:univ-prop-insert}
As stated in the previous section, the constructions involved in insertion arise as a pushout square. In this section, we prove this result, which we state below. Throughout this section we assume that we are working in a tame theory for which the support and preservation conditions hold. Further, we only give the maximal arguments of substitutions from a disc, as we only work with well-formed syntax up to definitional equality and so the type can always be inferred.
\begin{theorem}
\label{thm:univ-prop}
Let \((S,P,T)\) be an insertion point. Then the following commutative square of \(\mathsf{Catt}_{\mathcal{R}}\) is cocartesian:
\[\begin{tikzcd}[column sep = large, row sep = large]
{D^{\lh(P)}} & \lfloor S \rfloor \\
\lfloor T \rfloor & {\lfloor \insertion S P T \rfloor}
\arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2]
\arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2]
\arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2]
\arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \]
The context \(\lfloor \insertion S P T \rfloor\) is the pushout of \(\lfloor S \rfloor\) and \(\lfloor T \rfloor\) along the maps that send the maximal variable of \(D^{\lh(P)}\) to the locally maximal variable corresponding to the branch \(P\) and to the standard coherence over \(T\) of dimension equal to the leaf height of \(P\).
\end{theorem}
This theorem allows an intuitive understanding of the insertion operation: the inserted tree \(\insertion S P T\) is the result of taking the disjoint union of \(S\) and \(T\) and gluing the locally maximal variable of \(S\) corresponding to the branch \(P\) to the composite of \(T\). The original motivation for insertion was to take a term where one of the locally maximal arguments was a standard composition and flatten the structure, which aligns with the intuition given by the universal property.
\begin{remark}
As contexts have an interpretation as freely generated \(\infty\)-categories, and the category of \(\infty\)-categories is cocomplete, there is an \(\infty\)-category pushout of this square. It may, however, be surprising that this pushout is itself freely generated, and moreover that it is freely generated by a pasting diagram.
\end{remark}
We work towards \cref{thm:univ-prop} by introducing a couple of lemmas. These lemmas will mostly be proven by deferring to the formalisation, using the machinery of structured terms introduced in \cref{sec:structured-terms} to simplify the computations involved.
We first show that the square is commutative, while also justifying the description of the exterior labelling given at the start of the section.
\begin{lemma}
\label{lem:iota-kappa-comm}
Let \((S,P,T)\) be an insertion point. Then \(\kappa(\olsi P) \equiv \stdcoh T {\lh(P)} \sub \iota\).
\end{lemma}
\begin{proof}
See \func{Catt.Tree.Insertion.Properties}{κ-branch-path} in \module{Catt.Tree.Insertion.Properties}.
\end{proof}
We next state two factorisation properties for the interior and exterior labellings.
\begin{lemma}
\label{lem:ins-comm-max}
For an insertion redex \((S,P,T,\U,L,M)\), the following hold:
\[ \iota_{S,P,T} \bullet (\insertion L P M) \equiv M \qquad \kappa_{S,P,T} \bullet (\insertion L P M) \equiv^{\max} L \]
Hence, the maps \(L\) and \(M\) factor through the labellings \(\kappa\) and \(\iota\) respectively.
\end{lemma}
\begin{proof}
See \funcn{Catt.Tree.Insertion.Properties}{4201}{ι-comm} and \funcn{Catt.Tree.Insertion.Properties}{4738}{κ-comm} in \module{Catt.Tree.Insertion.Properties}.
\end{proof}
We can now proceed with the proof of \cref{thm:univ-prop}.
\begin{proof}[Proof of \cref{thm:univ-prop}]
Let \((S,P,T)\) be an insertion point. We must first show that the candidate pushout square is in fact commutative, for which it is sufficient to show:
\[ \{\ty(\olsi P), \olsi P\} \bullet \kappa \equiv^{\max} \{\stdty T {\lh(P)}, \stdcoh T {\lh(P)}\} \bullet \iota \]
which follows from \cref{lem:iota-kappa-comm}. To prove that this square is cocartesian, we take two substitutions \(\sigma : \lfloor S \rfloor \to \Gamma\) and \(\tau : \lfloor T \rfloor \to \Gamma\) such that the following diagram is commutative:
\[ \begin{tikzcd}[column sep = large, row sep = large]
{D^{\lh(P)}} & \lfloor S \rfloor \\
\lfloor T \rfloor & {\lfloor \insertion S P T \rfloor}\\
&& \Gamma
\arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2]
\arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2]
\arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2]
\arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1]
\arrow["\sigma", curve={height=-18pt}, from=1-2, to=3-3]
\arrow["\tau"', curve={height=12pt}, from=2-1, to=3-3]
\end{tikzcd} \]
We therefore have that \(\lceil \sigma \rceil\) is a labelling \(S \to \Gamma\) and \(\lceil \tau \rceil\) is a labelling \(T \to \Gamma\) with
\[\Gamma \vdash \lceil \sigma \rceil(\olsi P) = \stdcoh T {\lh(P)} \sub{\lceil \tau \rceil}\]
To apply \cref{lem:ins-comm-max}, we need this to be a syntactic equality. We therefore define \(M =\lceil \tau \rceil\) and \(L\) to be given by:
\[ L(p) = \begin{cases*} \stdcoh T {\lh(P)} \sub M &if \(p = \olsi P\)\\ \lceil \sigma \rceil(p)&otherwise \end{cases*} \]
By the equality above, \(L\) is well-formed and \(\lfloor L \rfloor = \sigma\).
We then get a well-formed map \(\lfloor \insertion L P M \rfloor\) from \(\lfloor \insertion S P T \rfloor\) to \(\Gamma\) such that the following diagram is commutative by \cref{lem:ins-comm-max}:
\[ \begin{tikzcd}[column sep = large, row sep = large]
{D^{\lh(P)}} & \lfloor S \rfloor \\
\lfloor T \rfloor & {\lfloor \insertion S P T \rfloor}\\
&& \Gamma
\arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2]
\arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2]
\arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2]
\arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1]
\arrow["\lfloor L \rfloor", curve={height=-18pt}, from=1-2, to=3-3]
\arrow["\lfloor M \rfloor"', curve={height=12pt}, from=2-1, to=3-3]
\arrow["\lfloor \insertion L P M \rfloor"{description}, from=2-2, to=3-3]
\end{tikzcd} \]
The uniqueness of this morphism follows from the observation that every path of \(\insertion S P T\) is either of the form \(\iota(p)\) for some \(p : \Path_T\) or \(\kappa(q)\) for some \(q: \Path_S\).
\end{proof}
From this result we will be able to show that having insertion in a theory implies the existence of pruning. The plan will be to show that pruning satisfies a similar universal property.
\begin{proposition}
Let \(\mathcal{D} : \Dyck_0\) be a Dyck word, and let \(p\) be a peak of \(\mathcal{D}\). Then the following square is a pushout square:
\[\begin{tikzcd}[column sep = large, row sep = large]
{D^{n+1}} & \lfloor \mathcal{D} \rfloor \\
D^{n} & {\lfloor \mathcal{D} \sslash p \rfloor}
\arrow["\pi_p", from=1-2, to=2-2]
\arrow["{\{\src(\lfloor p \rfloor)\}}"', from=2-1, to=2-2]
\arrow["{\{\lfloor p \rfloor\}}", from=1-1, to=1-2]
\arrow["{\{\id(d_n)\}}"', from=1-1, to=2-1]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \]
where \(n + 1 = \dim(\lfloor p \rfloor)\), and each substitution from a disc is given only by its maximal element.
\end{proposition}
\begin{proof}
As discussed in \cref{sec:prune-construction}, the substitution \(\pi_p\) sends \(\lfloor p \rfloor\) to the identity on the source of \(\lfloor p \rfloor\), which makes the square commute, as it suffices to consider the action of each substitution on \(d_{n+1}\), the maximal variable of \(D^{n+1}\). We now assume that we have substitutions \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\) and \(\{t\} : D^n \to \Gamma\) such that the following diagram commutes:
\[\begin{tikzcd}[column sep = large, row sep = large]
{D^{n+1}} & \lfloor \mathcal{D} \rfloor \\
D^{n} & {\lfloor \mathcal{D} \sslash p \rfloor}\\
&& \Gamma
\arrow["\pi_p", from=1-2, to=2-2]
\arrow["{\{\src(\lfloor p \rfloor)\}}"', from=2-1, to=2-2]
\arrow["{\{\lfloor p \rfloor\}}", from=1-1, to=1-2]
\arrow["{\{\id(d_n)\}}"', from=1-1, to=2-1]
\arrow["\sigma", curve={height=-18pt}, from=1-2, to=3-3]
\arrow["{\{t\}}", curve={height=18pt}, from=2-1, to=3-3]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \]
We immediately have that \(\lfloor p \rfloor \sub \sigma = \id(t)\). We can therefore let \(\sigma'\) be the same substitution as \(\sigma\), but with \(\lfloor p \rfloor \sub \sigma\) replaced by \(\id(t)\), and can then form the substitution:
\[ \sigma \sslash p \equiv \sigma' \sslash p : \lfloor \mathcal{D}\sslash p \rfloor \to \Gamma\]
By \cref{prop:prune-ty}, we immediately have \(\sigma = \sigma' = \pi_p \bullet \sigma \sslash p\).
The other equality follows from a diagram chase, noting that \(d_n^-\) in \(D^{n+1}\) is sent to the variable \(d_n\) in \(D^n\) by the map \(\{\id(d_n)\}\). It remains to show that the chosen universal map \(\sigma \sslash p\) is unique, but this is trivial as every variable of \(\lfloor \mathcal{D} \sslash p \rfloor\) is also a variable of \(\lfloor \mathcal{D} \rfloor\), and so the universal map is fully determined by the substitution \(\sigma\).
\end{proof}
\begin{corollary}
\label{cor:insertion-pruning}
Let \(\mathcal{R}\) have insertion. Then \(\mathcal{R}\) has pruning.
\end{corollary}
\begin{proof}
Assume \(\mathcal{R}\) has insertion. Then take a term \(\Coh {\lfloor \mathcal{D} \rfloor} A \sigma : \Term_\Gamma\) with a peak \(p : \Peak_{\mathcal{D}}\) such that:
\[ \lfloor p \rfloor \sub \sigma \equiv \id(B,t)\]
for some term \(t\) and type \(B\) of \(\Gamma\). We then need to show that:
\[ \Gamma \vdash \Coh {\lfloor \mathcal{D} \rfloor} A \sigma = \Coh {\lfloor \mathcal{D} \sslash p\rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\]
From \(\lfloor \mathcal{D} \rfloor\) we can obtain a tree \(S\) with \(\lfloor S \rfloor \equiv \lfloor \mathcal{D} \rfloor\). Further, \(\lfloor p \rfloor\) is a locally maximal variable of \(\lfloor \mathcal{D} \rfloor\), and so there exists a branch \(P\) such that \(\lfloor \olsi P \rfloor\) is this locally maximal variable, and \(\bh(P) = \lh(P) - 1\). Then the diagram:
\[
% https://q.uiver.app/#q=WzAsMyxbMSwwLCJEXntuKzF9Il0sWzAsMSwiRF5uIl0sWzIsMSwiXFxsZmxvb3IgUyBcXHJmbG9vciJdLFswLDEsIlxceyBcXGlkKHQpIFxcfSIsMl0sWzAsMiwiXFx7IFxcbGZsb29yIHAgXFxyZmxvb3IgXFx9Il1d
% tex-fmt: skip
\begin{tikzcd}
& {D^{n+1}} \\
{D^n} && {\lfloor S \rfloor}
\arrow["{\{ \id(d_n) \}}"', from=1-2, to=2-1]
\arrow["{\{ \lfloor p \rfloor \}}", from=1-2, to=2-3]
\end{tikzcd} \]
has two pushouts, the one given by insertion, and the one given by pruning. Therefore, we obtain an isomorphism \(\lfloor \insertion S P {D^n} \rfloor \cong \lfloor \mathcal{D} \sslash p \rfloor\). By \cref{prop:ps-context-iso}, this isomorphism must be the identity (as both pushouts exist in \textsf{Catt}), and so we can deduce that \(\pi_p = \kappa_{S,P,D^n}\) and \(\sigma \sslash p = \lfloor \insertion {\lceil \sigma \rceil} P {\{\lceil t \rceil\}} \rfloor\). Therefore, the above equality is given by an insertion along \(P\).
\end{proof}
\subsection{The insertion rule}
\label{sec:insertion-rule}
We now prove that the insertion rule set given in \cref{def:insertion-rule} satisfies the various conditions presented in \cref{sec:ruleset}. We begin with the following lemma.
\begin{lemma}
\label{lem:insertion-map}
Let \((S,P,T)\) be an insertion point and let \(L : S \to \U\) and \(M : T \to \U\) be labellings. Let \(f : \STerm_\U \to \STerm_{\U'}\) be any function from structured terms of \(\U\) to structured terms of \(\U'\). Then for any path \(p\) of \(\insertion S P T\) we have:
\[ f((\insertion L P M)(p)) \equiv (\insertion {(f \circ L)} P {(f \circ M)})(p)\]
where \(f \circ L\) is the result of composing \(f\) with the function component of \(L\).
\end{lemma}
\begin{proof}
The proof of this follows by a simple induction on \(P\) and is given in the formalisation module \module{Catt.Tree.Insertion.Properties} by function \func{Catt.Tree.Insertion.Properties}{label-from-insertion-map}.
\end{proof}
\begin{proposition}
\label{prop:insert-tame}
The insertion rule set, \insert, satisfies the suspension condition.
It further satisfies the \(\mathcal{R}\)-substitution condition for any rule set \(\mathcal{R}\), and so also satisfies the weakening condition.
\end{proposition}
\begin{proof}
Let \((S,P,T,\Gamma,L,M)\) be an insertion redex and let \(A\) be a structured type of \(S\), such that:
\[ s \equiv \SCoh S A L \qquad t \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
To prove the suspension condition, we observe that \(0 :: P\) is a branch of \(\Sigma(S)\) such that \(\insertion {\Sigma(S)} {0::P} {\Sigma(T)} \equiv \Sigma(\insertion S P T)\) and \(\kappa_{\Sigma(S),0::P,\Sigma(T)} \equiv \Sigma(\kappa_{S,P,T})\) by definition. By applying \cref{lem:insertion-map} with \(f = \Sigma\), we get:
\[ \insertion {\Sigma'(L)} {P} {\Sigma'(M)} \equiv \Sigma'(\insertion L P M)\]
and so by unwrapping definitions we obtain \(\insertion {\Sigma(L)} {0 :: P} {\Sigma(M)} \equiv \Sigma(\insertion L P M)\). Therefore, we have:
\begin{align*}
\Sigma(s) &\equiv \SCoh {\Sigma(S)} {\Sigma (A)} {\Sigma(L)} \\
\Sigma(t) &\equiv \SCoh {\insertion {\Sigma(S)} {0::P} {\Sigma(T)}} {\Sigma(A) \sub {\kappa_{\Sigma(S),0::P,\Sigma(T)}}} {\insertion {\Sigma(L)} {0::P} {\Sigma(M)}}
\end{align*}
and so as
\[\Sigma(L)(0 :: \olsi P) \equiv \Sigma'(L)(\olsi P) \equiv \Sigma(\stdcoh T {\lh(P)} \sub M) \equiv \stdcoh {\Sigma(T)} {\lh(0::P)} \sub {\Sigma(M)}\]
we get \((\Sigma(\Gamma), \Sigma(\lfloor s \rfloor), \Sigma (\lfloor t \rfloor) ) \in \insert\) as required.

For the substitution condition we let \(\sigma : \arr \Gamma \star \Delta\) be any substitution. Then:
\[\lfloor s \rfloor \sub \sigma \equiv \lfloor \SCoh S A {L \bullet \lceil \sigma \rceil} \rfloor \qquad \lfloor t \rfloor \sub \sigma \equiv \lfloor \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {(\insertion L P M) \bullet \lceil \sigma \rceil} \rfloor\]
Again using \cref{lem:insertion-map}, this time with \(f = u \mapsto u\sub{\lceil \sigma \rceil}\), we have:
\[(\insertion L P M) \bullet \lceil \sigma \rceil \equiv \insertion {L \bullet \lceil \sigma \rceil} P {M \bullet \lceil \sigma \rceil}\]
Further, we have the equality:
\[(L \bullet \lceil \sigma \rceil)(\olsi P) \equiv L(\olsi P) \sub {\lceil \sigma \rceil} \equiv \stdcoh T {\lh(P)} \sub {M \bullet \lceil \sigma \rceil}\]
and so \((\Delta, \lfloor s \rfloor \sub \sigma, \lfloor t \rfloor \sub \sigma) \in \insert\). Hence \insert satisfies the \(\mathcal{R}\)-substitution condition for any \(\mathcal{R}\), as we made no assumption of \(\sigma\) being well-formed.
\end{proof}
We next prove the support condition for the insertion rule set. We start with the following support lemma for the exterior labelling.
\begin{lemma}
\label{lem:kappa-full}
Let \((S,P,T)\) be an insertion point. Then:
\[ \Supp(\kappa_{S,P,T}) = \Var(\insertion S P T)\]
That is, the exterior labelling is full.
\end{lemma}
\begin{proof}
The proof proceeds by induction on \(P\); the only non-trivial case is \(P = [k]\), where we rely on \(\Supp(\{\stdty T {\lh(P)}, \stdcoh T {\lh(P)}\})\) being \(\Var(T)\). A full proof is given in the formalisation module \module{Catt.Tree.Insertion.Support}.
\end{proof}
Similar to the other rule sets introduced so far, to prove the support condition for the insertion rule set, we will take an arbitrary rule set \(\mathcal{R}\) that is tame and satisfies the support condition, and prove instead that the insertion set satisfies the \(\mathcal{R}\)-support condition.
This result can then be used as part of the strategy for proving the support condition outlined in \cref{lem:proof-strat-supp}.
\begin{proposition}
\label{prop:insert-supp}
Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition. Then \insert satisfies the \(\mathcal{R}\)-support condition.
\end{proposition}
\begin{proof}
As in the previous proposition, let \((S,P,T,\Gamma,L,M)\) be an insertion redex and \(A\) a structured type of \(S\), such that:
\[ s \equiv \SCoh S A L \qquad t \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
We now assume that \(\Gamma \vdash_{\mathcal{R}} \lfloor s \rfloor : B\) for some \(B\) and must prove that \(\Supp(s) = \Supp(t)\). By inspecting the typing judgement, we can obtain proofs of the following typing judgements:
\[ \Gamma \vdash L : S \qquad S \vdash A \qquad \Gamma \vdash M : T\]
where the typing of \(M\) is obtained by transporting the typing of \(L(\olsi P)\) along the syntactic equality \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\). By \cref{lem:ins-comm-max}, we have:
\[ \kappa_{S,P,T} \bullet \insertion L P M \equiv^{\max} L \]
By \cref{prop:ins-typing}, both sides of this equation are well-formed and so by \cref{thm:label-max-equality}, we obtain the equality:
\[ \Gamma \vdash_{\mathcal{R}} \kappa_{S,P,T} \bullet \insertion L P M = L\]
As \(\mathcal{R}\) satisfies the support property, we get:
\begin{align*}
\Supp(s) &= \Supp(L)\\
&= \Supp(\kappa_{S,P,T} \bullet \insertion L P M)\\
&= \Supp(\kappa_{S,P,T}) \sub {\insertion L P M}\\
&= \Var(\insertion S P T) \sub {\insertion L P M}&\text{by \cref{lem:kappa-full}}\\
&= \Supp(\insertion L P M) \\
&= \Supp(t)
\end{align*}
and so \(\Supp(\lfloor s \rfloor) = \Supp(\lfloor t \rfloor)\) as required.
\end{proof}
Similarly to the situation in pruning, we are not able to show that the type \(A \sub {\kappa}\) forms a valid operation without knowing more about the set of operations \(\mathcal{O}\). We therefore introduce the following additional condition on the set of operations.
\begin{definition}
An operation set \(\mathcal{O}\) \emph{supports insertion} if for all insertion points \((S,P,T)\) and variable sets \(U,V \subseteq \Var(S)\) we have:
\[ (\lfloor \insertion S P T \rfloor, \lfloor U \sub {\kappa_{S,P,T}} \rfloor, \lfloor V \sub {\kappa_{S,P,T}} \rfloor) \in \mathcal{O} \]
whenever \((\lfloor S \rfloor, U, V) \in \mathcal{O}\).
\end{definition}
Using this property, we can give the preservation condition for the insertion rule set.
\begin{proposition}
\label{prop:insert-preserve}
Let \(\mathcal{R}\) be a tame equality rule set and suppose the operation set \(\mathcal{O}\) supports insertion. Then the set \insert satisfies the \(\mathcal{R}\)-preservation condition.
\end{proposition}
\begin{proof}
Let \((S,P,T,\Gamma,L,M)\) be an insertion redex and let \(\arr a A b\) be a structured type such that:
\[ s \equiv \SCoh S {\arr a A b} L \qquad t \equiv \SCoh {\insertion S P T} {(\arr a A b) \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
We now suppose that \(\Gamma \vdash \lfloor s \rfloor : B\) and aim to prove that \(\Gamma \vdash \lfloor t \rfloor : B\).
By inspecting the typing derivation we get:
\begin{mathpar}
S \vdash \arr a A b \and \Gamma \vdash L : S \and \Gamma \vdash M : T \and (\lfloor S \rfloor, \Supp(a), \Supp(b)) \in \mathcal{O} \and \Gamma \vdash (\arr a A b) \sub L = B
\end{mathpar}
and so by \cref{prop:ins-typing} we have:
\[ \insertion S P T \vdash \kappa_{S,P,T} : S \qquad \Gamma \vdash \insertion L P M : \insertion S P T \]
As the operation set supports insertion, and \(\Supp(a \sub {\kappa}) = \Supp(a) \sub \kappa\) and \(\Supp(b \sub {\kappa}) = \Supp(b) \sub \kappa\), we get:
\[ (\lfloor \insertion S P T \rfloor, \Supp(a \sub \kappa), \Supp(b \sub \kappa)) \in \mathcal{O}\]
and so we obtain:
\[ \Gamma \vdash \SCoh {\insertion S P T} {(\arr a A b) \sub \kappa} {\insertion L P M} : {(\arr a A b) \sub \kappa \sub {\insertion L P M}}\]
By \cref{lem:ins-comm-max,thm:label-max-equality}, \(\Gamma \vdash \kappa \bullet \insertion L P M = L\), and so:
\begin{align*}
(\arr a A b) \sub \kappa \sub {\insertion L P M} &\equiv (\arr a A b) \sub {\kappa \bullet (\insertion L P M)}\\
&= (\arr a A b) \sub {L}\\
&= B
\end{align*}
and so by applying the conversion rule we obtain \(\Gamma \vdash \lfloor t \rfloor : B\) as required.
\end{proof}
\subsection{Further properties}
\label{sec:further-properties}
It has now been proved that insertion can form part of a reasonable type theory. We now proceed to prove further properties of the insertion construction that will be critical for proving the confluence of \Cattsua in \cref{sec:cattsua}. The majority of these properties will therefore concern the interaction of insertion with other constructions and with itself. We will justify each property with up to three of the following methods:
\begin{itemize}
\item For each property, we will give a graphical depiction of the constructions involved, similar to the diagram for \cref{prop:prune-conf}, which should help build intuition for the constructions at play.
\item Where applicable, each combination of constructions will be described using the universal property from \cref{sec:univ-prop-insert}. This can be used to classify these constructions up to definitional equality.
\item As these properties are used in a confluence proof, we will need a more syntactic form than can be offered by the universal property approach. To do this we fall back to the formalisation, using the computational power of structured terms to brute force each property.
\end{itemize}
The first two properties we consider concern the interaction of insertion with disc contexts, and will be crucial for proving confluence cases involving insertion and disc removal. Disc contexts often admit insertions, and the disc acts as a left and right unit for the insertion operation.
\paragraph{Insertion into a disc}
We begin by considering insertions into a disc. A disc context has a branch of height \(0\), and so if the locally maximal variable is sent to a standard coherence, then insertion can always be performed. Inserting into a disc effectively performs disc removal, replacing the entire disc with the entirety of the inner context. We illustrate this by the following diagram, where we take the branch \([0,0]\) of \(D^4\) (which we note is not the minimal branch).
\[ \insertion {\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid, Diag1] at (0,2)(x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag1,very thick] (x11.center) to (x21.center);
\draw[Diag1,very thick] (x21.center) to (x31.center);
\draw[Diag1,very thick] (x31.center) to (x41.center);
\end{scope}
\end{tikzpicture}\quad}
{[0,0]}
{\quad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid] at (-0.9,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}\qquad}
=
\qquad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid] at (-0.9,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture} \]
This property of insertion also has a simple proof by the universal property. Suppose we have a disc \(D^n\) with a branch \(P\), into which we insert a tree \(T\). Then the inserted tree is given by the following pushout.
\[ \begin{tikzcd}
{D^n} & {D^n} \\
T & {\insertion {D^n} P T}
\arrow["{\{\stdcoh T n\}}"', from=1-1, to=2-1]
\arrow["\id", from=1-1, to=1-2]
\arrow["\iota"', from=2-1, to=2-2]
\arrow["\kappa", from=1-2, to=2-2]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \]
By standard properties of pushouts, we have that \(\insertion {D^n} P T\) is isomorphic to \(T\). As this pushout holds in \Catt, we have a \Catt isomorphism between pasting contexts, and so by \cref{prop:ps-context-iso} we have \(T = \insertion {D^n} P T\) and \(\iota = \id\). The following lemma gives syntactic versions of these properties.
\begin{lemma}
\label{lem:disc-insertion-1}
Let \(T\) be a tree, \(n \geq \dim (T)\), and \(P\) a branch of \(D^n\) with \(\bh(P) \leq \th(T)\). Then \(\insertion {D^n} P T = T\) and \(\iota_{D^n,P,T} \equiv \id\). Suppose further that \((D^n,P,T,\Gamma,L,M)\) is an insertion redex. Then \(\insertion L P M \equiv M\).
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{disc-insertion}, \func{Catt.Tree.Insertion.Properties}{disc-ι}, and \func{Catt.Tree.Insertion.Properties}{disc-label-from} in formalisation module \module{Catt.Tree.Insertion.Properties}.
\end{proof}
\paragraph{Insertion of a disc}
We now consider the opposite situation, where a disc context is inserted into an arbitrary tree.
For a tree \(T\) with a branch \(P\), we can always insert the disc context \(D^{\lh(P)}\), as the trunk height condition will be satisfied by the linearity of the disc context. Inserting such a disc context makes no change to the tree \(T\), as the operation effectively replaces a branch of \(T\) (which is linear by construction) by a disc. The diagram below depicts this construction.
\[ \insertion {\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x22.center) to (x31.center);
\draw (x22.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}\quad}
{[0,1,0,0]}
{\quad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2)(x21){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (0,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}\qquad}
=
\qquad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid, Diag2] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x22){$\bullet$};
\node [on grid, Diag2] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw[Diag2] (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw[Diag2] (x11.center) to (x22.center);
\draw[Diag2] (x22.center) to (x31.center);
\draw (x22.center) to (x32.center);
\draw[Diag2] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture} \]
Similarly to insertion into a disc, the insertion of a disc can be characterised by a universal property. Take any tree \(T\) with a branch \(P\). Then the tree \(\insertion T P {D^{\lh(P)}}\) is given by the following pushout, where \(n = \lh(P)\):
\[ \begin{tikzcd}
{D^n} & T \\
{D^n} & {\insertion T P {D^n}}
\arrow["{\{\olsi P\}}", from=1-1, to=1-2]
\arrow["{\{\stdcoh {D^n} n\}}"', from=1-1, to=2-1]
\arrow["\iota"', from=2-1, to=2-2]
\arrow["\kappa", from=1-2, to=2-2]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \]
The situation here is less clear than before, as the map \(D^n \to D^n\) is not the identity. However, in the presence of disc removal this map becomes equal to the identity, and in this case a similar argument can be made to determine that \(\kappa\) should be the identity and \(\insertion T P {D^{\lh(P)}}\) should be equal to the tree \(T\).
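To see concretely why the tree is unchanged, consider the base case \(P = [k]\) for \(T = [T_0,\dots,T_n]\), writing \(D^m\) also for the linear tree of depth \(m\). As \(P\) is a branch, \(T_k\) is linear, so \(T_k = D^{\dep(T_k)}\), and \(D^{\lh(P)} = \Sigma(D^{\dep(T_k)}) = [T_k]\). The first case of the inserted tree definition then gives:
\[\insertion T {[k]} {D^{\lh(P)}} = [T_0,\dots,T_{k-1}] \doubleplus [T_k] \doubleplus [T_{k+1},\dots,T_n] = T\]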
The results are given in the lemma below:
\begin{lemma}
\label{lem:disc-insertion-2}
Let \((T,P,D^{\lh(P)},\Gamma,L,M)\) be an insertion redex. Then:
\[\insertion T P {D^{\lh(P)}} \equiv T \qquad \insertion L P M \equiv^{\mathsf{max}} L\]
We further have:
\[T \vdash_{\mathcal{R}} \kappa_{T,P,D^{\lh(P)}} =^{\mathsf{max}} \id_{T}\]
if \(\mathcal{R}\) is a (tame) equality rule set which has disc removal.
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-disc} and \func{Catt.Tree.Insertion.Properties}{disc-label-from-2} in the formalisation module \module{Catt.Tree.Insertion.Properties} and \funcn{Catt.Typing.Insertion.Equality}{10459}{κ-disc} in \module{Catt.Typing.Insertion.Equality}.
\end{proof}
\paragraph{Insertion of an endo-coherence}
We now turn our attention to the interaction between insertion and endo-coherence removal. Unlike in \Cattsu, the locally maximal argument in an insertion redex need not be in normal form. In particular, since the only condition on the locally maximal argument is that it is a standard coherence, it may be an endo-coherence. In such a situation there are two distinct ways of applying equalities:
\begin{itemize}
\item The endo-coherence could be directly inserted into the head term.
\item The endo-coherence could be transformed into an identity on a standard coherence (see \cref{thm:std-ecr}) after which the head term could undergo two insertions, the first of which ``prunes'' the identity, and the second of which inserts the locally maximal argument of the pruned identity.
\end{itemize}
As the insertion of an identity acts in a similar way to pruning (see \cref{cor:insertion-pruning}), we re-use the notation.
\begin{definition}
Let \(S\) be a tree, and \(P\) be a branch of \(S\). Then define:
\[ S \sslash P = \insertion S P {D^{\lh(P) - 1}} \qquad \pi_P = \kappa_{S,P,D^{\lh(P) - 1}}\]
where we note that \((S,P,D^{\lh(P)-1})\) is always an insertion point.
\end{definition}
To realise the second equality path, pruning an identity and then inserting the maximal argument of that identity, we must obtain a branch of the pruned tree \(S \sslash P\). This can be done when \(\lh(P) - \bh(P) \geq 2\) by taking the same list as \(P\), as depicted in \cref{fig:pruned-branch}. We name such a branch the \emph{pruned branch}.
\begin{definition}
Let \(S\) be a tree, and \(P\) be a branch of \(S\) with \(\lh(P) - \bh(P) \geq 2\). We then define the \emph{pruned branch} \(P'\) of \(S \sslash P\) to be given by the same list as \(P\).
\end{definition}
If \(\lh(P) - \bh(P) = 1\) (noting that \(\lh(P) - \bh(P)\) cannot be zero) then pruning the branch \(P\) removes the branch entirely, and so the condition \(\lh(P) - \bh(P) \geq 2\) is necessary to form the pruned branch. It is clear that \(\bh(P') = \bh(P)\) and \(\lh(P') = \lh(P) - 1\).
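For example, taking \(S = [S_0, D^2]\) and \(P = [1]\), where \(D^2\) again denotes the linear tree of depth \(2\), we have \(\lh(P) = 3\) and \(\bh(P) = 0\), so \(\lh(P) - \bh(P) \geq 2\) and pruning gives:
\[ S \sslash P = \insertion S P {D^{2}} = [S_0] \doubleplus [D^1] \doubleplus [\,] = [S_0, D^1]\]
The branch has been shortened by one dimension, and the pruned branch \(P' = [1]\) picks out the copy of \(D^1\) in \(S \sslash P\).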
\begin{figure}[ht]
\centering
\begin{subfigure}{0.45\linewidth}
\centering
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\node [on grid] at (-0.8,2)(bh) {};
\node [left=0 of bh ,on grid] {$\bh(P)$};
\node [on grid] at (-0.8,4)(lh) {};
\node [left=0 of lh ,on grid] {$\lh(P)$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\draw [<->] (bh.center) to (lh.center);
\draw [dotted, very thick] (bh) to (x21);
\draw [dotted, very thick] (lh) to (x41);
\end{tikzpicture}
\caption{Tree \(S\) and branch \(P = [1,0,0]\).}
\end{subfigure}
\begin{subfigure}{0.45\linewidth}
\centering
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\caption{Tree \(S \sslash P\) and branch \(P' = [1,0,0]\).}
\end{subfigure}
\caption{The pruned branch.}
\label{fig:pruned-branch}
\end{figure}
We also note that the path \(\olsi{P'}\) is the maximal argument of the labelling \(\iota_{S,P,D^{\lh(P) - 1}}\), the inclusion of \(D^{\lh(P)- 1}\) into \(S \sslash P\). Insertion along the pruned branch is then characterised by the following pushout, where \(n = \lh(P)\).
\[\begin{tikzcd}
{D^n} & S \\
{D^{n-1}} & {S \sslash P} \\
T & {\insertion {(S \sslash P)} {P'} T} \\
&&& \U\\
&&& \U
\arrow["{\{\olsi P\}}", from=1-1, to=1-2]
\arrow["{\pi_P}", from=1-2, to=2-2]
\arrow["{\{\olsi {P'}\}}"', from=2-1, to=2-2]
\arrow["{\{\stdcoh {D^{n-1}} n\}}"', from=1-1, to=2-1]
\arrow["{\{ \stdcoh T {n-1} \}}"', from=2-1, to=3-1]
\arrow["\kappa", from=2-2, to=3-2]
\arrow["\iota"', from=3-1, to=3-2]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1]
\arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-2, to=2-1]
\arrow["L", curve={height=-36pt}, from=1-2, to=4-4]
\arrow["M"', curve={height=30pt}, from=3-1, to=5-4]
\arrow["{\insertion L P {(\{\stdcoh T {n-1}\} \bullet M)}}"{sloped}, from=2-2, to=4-4, dashed]
\arrow["{\insertion {(\insertion L P {(\{\stdcoh T {n-1}\} \bullet M)})} {P'} {M}}"'{sloped, pos=0.45}, from=3-2, to=5-4, dashed]
\arrow[equal, nfold, from=4-4, to=5-4]
\end{tikzcd} \]
The top pushout is from the construction of \(S \sslash P\), noting that \(\iota_{S,P,D^{\lh(P) - 1}} = \{\olsi {P'}\}\).
The bottom pushout is from the construction of the insertion along the pruned branch. By the pasting lemma for pushouts, the whole outer rectangle is also a pushout along the maps \(\{\olsi P\}\) and \(\{\stdcoh {D^{n-1}} n\} \bullet \{\stdcoh T {n-1}\}\). In the presence of endo-coherence removal we have:
\[ \{\stdcoh {D^{n-1}} n\} \bullet \{\stdcoh T {n-1}\} = \{\stdcoh T n\}\]
by \cref{thm:std-ecr}, and so the outer pushout rectangle is the pushout generated by directly inserting the endo-coherence. There are two ways to form the unique map \(\insertion {(S \sslash P)} {P'} {T} \to \U\): one from the outer pushout rectangle, which gives the map \(\insertion L P M\), and one by first using the top pushout square with the maps \(L\) and \(\{\stdcoh T {n-1}\} \bullet M\) to get a map \(S \sslash P \to \U\), and then using this map with the bottom pushout square and \(M\) to obtain the morphisms depicted in the commutative diagram. These results appear in the next lemma.
\begin{lemma}
\label{lem:pruned-bp}
Suppose \(S\) has a branch \(P\) with \(\lh(P) - \bh(P) \geq 2\). Then \(\iota_{S,P,D^{\lh(P) - 1}} \equiv \{ \olsi {P'} \} \). Further suppose that \((S,P,T)\) is an insertion point. Then if the (tame) rule set \(\mathcal{R}\) has disc removal and endo-coherence removal we get:
\[\insertion {(S \sslash P)} {P'} T = \insertion S P T \qquad \insertion S P T \vdash_{\mathcal{R}} \pi_P \bullet \kappa_{S \sslash P,P',T} =^{\max} \kappa_{S,P,T} \]
If we further have that \((S,P,T,\U,L,M)\) is an insertion redex then:
\[\insertion {(\insertion {L} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} {P'} {M} \equiv^{\max} \insertion L P M\]
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-tree-pruned-branch}, \func{Catt.Tree.Insertion.Properties}{pruned-branch-prop}, and \func{Catt.Tree.Insertion.Properties}{label-from-pruned-branch} in formalisation module \module{Catt.Tree.Insertion.Properties}, and \funcn{Catt.Typing.Insertion.Equality}{3281}{pruned-branch-κ} in \module{Catt.Typing.Insertion.Equality}.
\end{proof}
\paragraph{Branch irrelevance}
As has already been noted, a tree \(S\) may admit multiple branches \(P\) and \(Q\) which represent the same locally maximal variable, that is, \(\olsi P \equiv \olsi Q\). If there is an insertion that can be applied along either branch \(P\) or \(Q\), then it does not matter which branch we choose. This can be immediately seen from the universal property: the pushout square for an insertion point \((S,P,T)\) only mentions the path \(\olsi P\) and never uses the actual branch \(P\).
\begin{lemma}
\label{lem:insertion-irrel}
Suppose \((S,P,T)\) and \((S,Q,T)\) are insertion points with \(\olsi P \equiv \olsi Q\). Then \(\insertion S P T \equiv \insertion S Q T\) and \(\kappa_{S,P,T} \equiv^{\mathsf{max}} \kappa_{S,Q,T}\). If we further have \(L : S \to \Gamma\) and \(M : T \to \Gamma\), then \(\insertion L P M \equiv^{\mathsf{max}} \insertion L Q M\).
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-irrel}, \func{Catt.Tree.Insertion.Properties}{κ-irrel}, and \func{Catt.Tree.Insertion.Properties}{irrel-label-from} in formalisation module \module{Catt.Tree.Insertion.Properties}.
\end{proof}
It is natural to ask why we define branches at all, and do not instead identify points where insertion can be performed by a maximal path, implicitly taking the branch of minimal branch height.
While this could be done, it would make other confluence cases more difficult, as the branch associated to a maximal path could significantly change if a different branch is pruned from the tree.
\paragraph{Parallel insertion}
We now begin to consider the interaction between insertion and itself. In contrast to the previous case, we now consider two branches \(P\) and \(Q\) such that \(\olsi P\) and \(\olsi Q\) are not the same maximal path, in which case we say the branches \(P\) and \(Q\) are \emph{parallel}. Assume we have a tree \(S\) such that \((S, P, T)\) and \((S,Q, U)\) are insertion points. We then aim to perform both insertions, and prove that the order in which they occur is irrelevant. To do this we must form a branch of the inserted tree \(\insertion S P T\), which is intuitively given by the branch \(Q\), but such a branch must be adapted to the new inserted tree.
\begin{definition}
Let \((S, P, T)\) be an insertion point and let \(Q\) be a branch of \(S\) such that \(\olsi P \neq \olsi Q\). Then we define the branch \(\insertion Q P T\) of \(\insertion S P T\) by induction on \(P\) and \(Q\).
\begin{itemize}
\item Suppose \(P = [k]\) and \(Q = j :: x\). Then if \(j < k\) we let \(\insertion Q P T = Q\). Otherwise, we let:
\[\insertion Q P T = (j + \len(T) - 1) :: x\]
\item Suppose \(P = k :: P_2\) and \(Q = j :: x\). If \(j \neq k\) then let \(\insertion Q P T = Q\). Otherwise, both \(P_2\) and \(x\) are branches of \(S_k\) and so we let
\[\insertion Q P T = k :: \insertion x {P_2} T\]
\end{itemize}
It is clear that \(\insertion Q P T\) satisfies the condition for being a branch.
\end{definition}
The maximal path associated to the branch \(\insertion Q P T\) is obtained by applying the labelling \(\kappa\) to the maximal path associated to \(Q\). That is:
\[ \olsi {\insertion Q P T} \equiv \olsi Q \sub {\kappa_{S,P,T}}\]
A graphical example of such a situation is given in \cref{fig:ins-parallel}, where we note how the right branch changes after the left-hand insertion is performed. We also note that the final trees at the bottom of the diagram are coloured slightly differently, which corresponds to the inserted labellings from these trees being different. To remedy this, we introduce a variant of the inserted labelling, which takes arguments from the head labelling instead of the argument labelling wherever possible.
\begin{figure}[ht] \centering \newsavebox\redbase \sbox\redbase{\(\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag1] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0,2) (x22){$\bullet$}; \node [on grid] at (0.5,2) (x23){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw (x11.center) to (x23.center); \end{tikzpicture} \quad\mathop{{}_{[1,0]}\mathord{\gg}}\quad \insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.5,2) (x21){$\bullet$}; \node [on grid, Diag2] at (0.5,2) (x22){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1, very thick] (x12.center) to (x21.center); \draw[Diag2, very thick] (x12.center) to (x22.center); \end{scope} \end{tikzpicture}\quad }{[1,1]}{\quad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture}}\)} \newsavebox\redleft \sbox{\redleft}{\(\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid, Diag1] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag1] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.6,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.2,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0.2,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.6,2) (x24){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2, very thick] (x12.center) to (x24.center); \end{scope} \end{tikzpicture}\quad }{[1,3]}{\quad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture}}\)} \newsavebox\redright \sbox{\redright}{\(\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag1] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0,2) (x22){$\bullet$}; \node [on grid] at (0.5,2) (x23){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw (x11.center) to (x23.center); \end{tikzpicture} 
\quad\mathop{{}_{[1,0]}\mathord{\gg}}\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag2] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.5,2) (x21){$\bullet$}; \node [on grid, Diag2] at (0,2) (x22){$\bullet$}; \node [on grid, Diag2] at (0.5,2) (x23){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1, very thick] (x12.center) to (x21.center); \draw[Diag2] (x12.center) to (x22.center); \draw[Diag2] (x12.center) to (x23.center); \end{scope} \end{tikzpicture}\)} \newsavebox\redleftbot \sbox{\redleftbot}{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag2] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.8,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.4,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.4,2) (x24){$\bullet$}; \node [on grid, Diag2] at (0.8,2) (x25){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2] (x12.center) to (x24.center); \draw[Diag2] (x12.center) to (x25.center); \end{scope} \end{tikzpicture}} \newsavebox\redrightbot \sbox{\redrightbot}{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag1] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag1] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.8,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.4,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.4,2) (x24){$\bullet$}; \node [on grid, Diag2] at (0.8,2) (x25){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2] (x12.center) to (x24.center); \draw[Diag2] (x12.center) to (x25.center); \end{scope} \end{tikzpicture}} \begin{tikzpicture} \node(redbase) at (0,0) {\fcolorbox{gray}{white}{\usebox\redbase}}; \node(redleft) at (-4,-5){\fcolorbox{gray}{white}{\usebox\redleft}}; \node(redright) at (4,-5){\fcolorbox{gray}{white}{\usebox\redright}}; \node(redleftbot) at (-4,-9){\fcolorbox{gray}{white}{\usebox\redleftbot}}; \node(redrightbot) at (4,-9){\fcolorbox{gray}{white}{\usebox\redrightbot}}; \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redbase) to (redleft); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redbase) to (redright); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment 
length=4, amplitude=1.2,post=lineto, post length=2pt }] (redleft) to (redleftbot);
\draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redright) to (redrightbot);
\end{tikzpicture}
\caption{Parallel insertions.}
\label{fig:ins-parallel}
\end{figure}

\begin{definition}
We define an alternative to the inserted labelling as follows: given an insertion point \((S, P, T)\) with \(L : S \to \U\) and \(M : T \to \U\), we define the \emph{alternative inserted labelling} \(\insertionprime L P M : {\insertion S P T} \to \U\). Let
\[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\]
and then proceed by induction on \(P\).
\begin{itemize}
\item Let \(P = [k]\), and
\[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\]
Then define \(\insertionprime L {[k]} M\) to be:
\[s_0\{L_0\}s_1 \cdots \{L_{k-1}\}\mathbf{s_k}\{M_0\}t_1\cdots \{M_m\}\mathbf{s_{k+1}}\{L_{k+1}\}s_{k+2}\cdots \{L_n\}s_{n+1} : A\]
\item Suppose \(P = k :: Q\) so that
\[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\]
Define \(\insertionprime L P M\) as:
\[s_0\{L_0\}s_1\cdots \{L_{k-1}\}\mathbf{s_k}\{\insertionprime {L_k} {Q} {M_0}\}\mathbf{s_{k+1}}\{L_{k+1}\}s_{k+2} \cdots \{L_n\}s_{n+1} : A\]
\end{itemize}
The terms that differ from the regular inserted labelling are written in bold. In the edge case where \(M = \emp\), we arbitrarily use \(s_k\) instead of \(s_{k+1}\) in the definition of \(\insertionprime L {[k]} M\).
\end{definition}

It is immediate that the alternative inserted labelling differs from the inserted labelling only up to definitional equality.

\begin{proposition}
\label{prop:insertion-prime-eq}
Let \((S,P,T,\U,L,M)\) be an insertion redex. Then:
\[\insertionprime L P M = \insertion L P M\]
\end{proposition}
\begin{proof}
See function \func{Catt.Tree.Insertion.Typing}{label-from-insertion-eq} in the module \module{Catt.Tree.Insertion.Typing}.
\end{proof}

We now examine the universal property of parallel insertion. This is given by the following diagram, where we insert along \(P\) first, followed by \(Q\), letting \(n = \lh(P)\) and \(m = \lh(Q)\).
\[ \begin{tikzcd}[row sep = large] & {D^n} & T \\ {D^m} & S & {\insertion S P T} \\ U && {\insertion {(\insertion S P T)} {\insertion Q P T} U} \arrow["{\{\olsi P\}}", from=1-2, to=2-2] \arrow["{\{\olsi Q\}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\{\stdcoh T n\}}", from=1-2, to=1-3] \arrow["{\iota_{S,P,T}}", from=1-3, to=2-3] \arrow["{\kappa_{S,P,T}}"', from=2-2, to=2-3] \arrow["{\kappa_{\insertion S P T,\insertion Q P T, U}}", from=2-3, to=3-3] \arrow["{\iota_{\insertion S P T, \insertion Q P T, U}}"', from=3-1, to=3-3] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-3, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-3, to=2-1] \end{tikzcd} \]
Here, the top pushout square is given by the insertion along \(P\), and the bottom square is given by the insertion along \(\insertion Q P T\), noting that:
\[ \{ \olsi Q \} \bullet \kappa_{S,P,T} \equiv \{ \olsi {\insertion Q P T}\}\]
The construction is therefore given by the colimit of the top-left border of the diagram. By a symmetric argument, it can be seen that performing the insertions in the opposite order also leads to a colimit of the same diagram.
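Before stating the lemma that formalises these ideas, it may help to spell out the smallest interesting instance of the alternative inserted labelling, since the lemma is partly phrased in terms of it. The example is ours, using the conventions of the definition above. Take \(S = [S_0, S_1]\) with \(L = s_0\{L_0\}s_1\{L_1\}s_2 : A\), and let \(P = [0]\) and \(T = [T_0]\) with \(M = t_0\{M_0\}t_1 : B\). Writing the endpoints of \(M\) in the positions marked in bold for the regular inserted labelling, the two labellings are:
\[ \insertion L {[0]} M = t_0\{M_0\}t_1\{L_1\}s_2 : A \qquad \insertionprime L {[0]} M = s_0\{M_0\}s_1\{L_1\}s_2 : A \]
They differ precisely at the terms \(s_0\) and \(s_1\), which agree with \(t_0\) and \(t_1\) up to definitional equality whenever \((S,P,T,\U,L,M)\) is an insertion redex, in line with \cref{prop:insertion-prime-eq}. With this example in hand, we now state the lemma.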
\begin{lemma}
\label{lem:insertion-different}
Let \((S,P,T)\) and \((S,Q,U)\) be insertion points such that \(\olsi P \not\equiv \olsi Q\). Then we have:
\begin{align*} \insertion {(\insertion S P T)} {\insertion Q P T} U &\equiv \insertion {(\insertion S Q U)} {\insertion P Q U} T\\ \kappa_{S,P,T} \circ \kappa_{\insertion S P T, \insertion Q P T, U} &\equiv^{\max} \kappa_{S,Q,U} \circ \kappa_{\insertion S Q U, \insertion P Q U, T} \intertext{Further:} \insertionprime {(\insertion L P M)} {\insertion Q P T} N &\equiv^{\max} \insertionprime {(\insertion L Q N)} {\insertion P Q U} M \end{align*}
for any insertion redexes \((S,P,T,\U,L,M)\) and \((S,Q,U,\U,L,N)\).
\end{lemma}
\begin{proof}
See functions \func{Catt.Tree.Insertion.Properties}{insertion-parallel}, \funcn{Catt.Tree.Insertion.Properties}{33917}{κ-parallel}, and \funcn{Catt.Tree.Insertion.Properties}{40709}{label-from-parallel} in the formalisation module \module{Catt.Tree.Insertion.Properties}.
\end{proof}

\paragraph{Boundaries of inserted trees}
We now work towards the most complex property of insertion, the action of insertion on an insertable argument. To do this, we must first understand the action of insertion on standard coherences, which itself requires an understanding of how insertion interacts with the boundary inclusion maps of trees. There are two fundamental cases for the boundary of an inserted tree:
\begin{itemize}
\item The boundary has sufficiently low dimension that it is unaffected by the insertion. In this case applying the boundary to the inserted tree is the same as applying the boundary to the original tree.
\item The boundary has sufficiently high dimension that the boundary of the original tree still contains the insertion branch. In this case applying the boundary to the inserted tree is the same as inserting into the boundary of the original tree along this branch.
\end{itemize}
We begin with the first case. Suppose we have an insertion point \((S, P, T)\) and a dimension \(n \in \mathbb{N}\). The main criterion for the boundary having no interaction with the insertion is that:
\[ n \leq \th(T) \]
When this condition holds, taking the \(n\)-boundary of \(T\) returns a linear tree, and we have already seen that inserting linear trees has no effect on the head tree. We illustrate this case in the diagram below, where the tree \(T\) has trunk height \(3\) and we set \(n = 2\). The dashed line represents taking the boundary operation, and it is easy to see that the \(2\)-boundaries of \(S\) and of the inserted tree \(\insertion S P T\) are the same.
\[\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (-0.5,2) (x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid, Diag1] at (0.5,3)(x32) {$\bullet$};
\node [on grid, Diag1] at (0.5,4)(x41) {$\bullet$};
\draw[dashed, thick] (-1,2.5) to (1,2.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\draw (x12.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw[very thick,Diag1] (x22.center) to (x32.center);
\draw[very thick,Diag1] (x32.center) to (x41.center);
\end{scope}
\end{tikzpicture}\quad}
{[1,0,0]}
{\quad\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (-0.5,4)(x41) {$\bullet$};
\node [on grid] at (0.5,4)(x42) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x31.center) to (x41.center);
\draw (x31.center) to (x42.center);
\end{tikzpicture}}
\qquad = \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag2] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (-0.5,2) (x21){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid, Diag2] at (0.5,3)(x32) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid, Diag2] at (1,4)(x42) {$\bullet$};
\draw[dashed, thick] (-1,2.5) to (1,2.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag2] (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\draw[Diag2] (x12.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw[Diag2] (x22.center) to (x32.center);
\draw[Diag2] (x32.center) to (x41.center);
\draw[Diag2] (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\]
As well as knowing about the interaction of the boundary with the inserted tree, we also need to investigate the interaction of the inclusion maps with the exterior labelling. In this first case, we would hope to prove that:
\[ \incbd n - S \bullet \kappa_{S,P,T} \equiv \incbd n - {\insertion S P T}\]
Since \(\bound n {\insertion S P T} \equiv \bound n S\), there are two ways to encode the source inclusion of \(\bound n S\) into \(\insertion S P T\). The right-hand side of the above equation directly includes \(\bound n {\insertion S P T}\) into \(\insertion S P T\), while the left-hand side first includes \(\bound n S\) into \(S\) and then maps \(S\) onto \(\insertion S P T\) via the exterior labelling. There is a catch to proving this equality: the exterior labelling sends \(\olsi P\) to a standard coherence, and so if \(\incbd n - S\) has \(\olsi P\) in its image, the equality cannot hold syntactically. We therefore further require that \(n < \lh(P)\), which ensures this cannot happen. We now state these results in the following lemma.
\begin{lemma}
\label{lem:insertion-bd-1}
Let \(n \in \mathbb{N}\) and suppose \((S,P,T)\) is an insertion point such that \(n \leq \th(T)\). Then:
\[ \bound n S \equiv \bound n {\insertion S P T}\]
If we further have \(n < \lh(P)\) then:
\[ \incbd n \epsilon S \circ \kappa_{S,P,T} \equiv^{\mathsf{max}} \incbd n \epsilon {\insertion S P T}\]
for \(\epsilon \in \{-,+\}\).
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-bd-1} and \func{Catt.Tree.Insertion.Properties}{bd-κ-comm-1} in the formalisation module \module{Catt.Tree.Insertion.Properties}.
\end{proof}

We now move to the second case. We again suppose we have an insertion point \((S,P,T)\) and dimension \(n \in \mathbb{N}\). To perform an insertion into the boundary \(\bound n S\), the dimension \(n\) must be high enough not to remove the branch \(P\) from \(S\). More specifically, we must have the inequality:
\[ n > \bh(P)\]
which ensures that the list \(P\) is still a branch of \(\bound n S\).

\begin{definition}
Let \(S\) be a tree with a branch \(P\), and let \(n > \bh(P)\). Then there is a branch \(\bound n P\) of \(\bound n S\) given by the same list as \(P\) with \(\bh(\bound n P) = \bh(P)\).
\end{definition}

As \(\th(\bound n T) \geq \bh(P)\) when \(\th(T) \geq \bh(P)\) and \(n > \bh(P)\), we are able to insert the tree \(\bound n T\) into \(\bound n S\) along the branch \(\bound n P\). This is depicted in the following diagram, where \(\bh(P) = 2\) and \(n = 3\). In this diagram, the insertion \(\insertion S P T\) is drawn, and a dashed line is drawn across each tree where it would be truncated by the boundary operation. Crucially, the branch is still well-formed under this line, and performing the insertion on the truncated trees yields the truncation of the inserted tree.
\[\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\draw [dashed, thick] (-0.5,3.5) to (1.5,3.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}\quad}
{[1,0,0]}
{\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid] at (0.5,3) (x32) {$\bullet$};
\node [on grid] at (-0.5,4)(x41) {$\bullet$};
\draw [dashed, black, thick] (-1,3.5) to (1,3.5);
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}}
\qquad = \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag2] at (0.5,1)(x12){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag2] at (0,3)(x31) {$\bullet$};
\node [on grid, Diag2] at (0.5,3) (x32) {$\bullet$};
\node [on grid] at (1,3) (x33) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\draw [dashed,thick] (-0.5,3.5) to (1.5,3.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag2] (x01.center) to (x12.center);
\draw[Diag2] (x12.center) to (x21.center);
\draw[Diag2] (x21.center) to (x31.center);
\draw[Diag2] (x21.center) to (x32.center);
\draw (x21.center) to (x33.center);
\draw[Diag2] (x31.center) to (x41.center);
\draw (x33.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\]
As with the previous case, we explore the interaction of the boundary inclusion labellings and the exterior labelling. We aim to give conditions under which:
\[ \incbd n - S \bullet \kappa_{S,P,T} \equiv \kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n - {\insertion S P T}\]
We examine the action of each side of the equation on the path \(\olsi {\bound n P}\). On the right-hand side, this path is sent by \(\kappa\) to a standard coherence, and so on the left-hand side, \((\incbd n - S)(\olsi {\bound n P})\) must also be sent to a standard coherence by \(\kappa\). If \((\incbd n - S)(\olsi {\bound n P})\) is a maximal path, which will always be the case when \(n \geq \lh(P)\), then it will be sent to a standard coherence. Alternatively, if \(n \leq \lh(P)\) then \(\lh(\bound n P) = n\), and if additionally \(n > \th(T)\) then the standard term returned by \(\kappa_{S,P,T}\) will be a standard coherence. These conditions lead to the following lemma.
\begin{lemma}
\label{lem:insertion-bd-2}
Let \(n \in \mathbb{N}\) and suppose \((S,P,T)\) is an insertion point with \(n > \bh(P)\). Then:
\[ \insertion {\bound n S} {\bound n P} {\bound n T} \equiv \bound n {\insertion S P T} \]
Suppose further that one of the following holds:
\begin{enumerate}
\item \(n > \th(T)\) and \(n \leq \lh(P)\)
\item \(n \geq \lh(P)\)
\end{enumerate}
Then:
\[ \incbd n \epsilon S \bullet \kappa_{S,P,T} \equiv^{\mathsf{max}} \kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n \epsilon {\insertion S P T} \]
for \(\epsilon \in \{-,+\}\).
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-bd-2} and \funcn{Catt.Tree.Insertion.Properties}{50068}{bd-κ-comm-2} in the formalisation module \module{Catt.Tree.Insertion.Properties}.
\end{proof}

Both of the further conditions in \cref{lem:insertion-bd-2} imply that \(n > \bh(P)\). We have therefore seen three conditions that can be put on \(n\), \(P\), and \(T\):
\begin{itemize}
\item \(n \leq \th(T)\) and \(n < \lh(P)\),
\item \(n > \th(T)\) and \(n \leq \lh(P)\),
\item \(n \geq \lh(P)\).
\end{itemize}
One of these conditions must always hold for any \(n\) and insertion point \((S,P,T)\): if \(n \geq \lh(P)\) then the third condition holds, and otherwise \(n < \lh(P)\), so that the first or second condition holds according to whether \(n \leq \th(T)\) or \(n > \th(T)\). Hence one of \cref{lem:insertion-bd-1,lem:insertion-bd-2} can always be applied.

\begin{remark}
The further conditions in each of \cref{lem:insertion-bd-1,lem:insertion-bd-2} could be dropped in favour of weakening the syntactic equalities to definitional equalities in a theory with disc removal, as this would remove the distinction between standard terms and standard coherences. It was, however, more convenient to take this approach in the formalisation, and although the extra side conditions may seem arbitrary, the key result is that one of the above lemmas always holds.
\end{remark}

\paragraph{Insertion into standard constructions}
Equipped with \cref{lem:insertion-bd-1,lem:insertion-bd-2}, we can now prove that the standard constructions are preserved by applying an exterior labelling, up to a definitional equality containing insertion and disc removal. We begin with the following lemma, whose intuition is clear from the universal property of insertion.

\begin{lemma}
\label{lem:kappa-iota-insert}
Suppose \((S,P,T)\) is an insertion point. Then \(\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}} \equiv \id_{\insertion S P T}\).
\end{lemma}
\begin{proof}
See \func{Catt.Tree.Insertion.Properties}{κ-ι-prop} in \module{Catt.Tree.Insertion.Properties}.
\end{proof}

We can then proceed to the main theorem of this section.

\begin{theorem}
\label{thm:std-insert-props}
Let \(\mathcal{R}\) be a tame equality rule set that has disc removal and insertion.
Then for any insertion point \((S,P,T)\) and \(n \in \mathbb{N}\), we have:
\[\insertion S P T \vdash_{\mathcal{R}} \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}}\]
\[\insertion S P T \vdash_{\mathcal{R}} \stdty S n \sub {\kappa_{S,P,T}} = \stdty {\insertion S P T} n\]
for \(\epsilon \in \{-,+\}\) and if \(n \geq \dep(S)\) then:
\[\insertion S P T \vdash_{\mathcal{R}} \stdcoh S n \sub {\kappa_{S,P,T}} = \stdcoh {\insertion S P T} n \qquad \insertion S P T \vdash_{\mathcal{R}}\stdtm S n \sub {\kappa_{S,P,T}} = \stdtm {\insertion S P T} n \]
\end{theorem}
\begin{proof}
We prove all three properties by mutual induction. We begin with the equality:
\[ \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}}\]
The conditions for one of \cref{lem:insertion-bd-1,lem:insertion-bd-2} must hold, and so we treat each case separately. If the conditions for \cref{lem:insertion-bd-1} hold then the required equality is immediately implied by \(\bound n {\insertion S P T} \equiv \bound n S\) and \(\incbd n \epsilon S \bullet \kappa_{S,P,T} \equiv \incbd n \epsilon {\insertion S P T}\). If instead the conditions for \cref{lem:insertion-bd-2} hold then:
\begin{align*} \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} &\equiv \stdtm {\bound n S} n \sub {\kappa_{\bound n S, \bound n P, \bound n T} \bullet \incbd n \epsilon {\insertion S P T}}\\ &\equiv \stdtm {\bound n S} n \sub {\kappa_{\bound n S, \bound n P, \bound n T}} \sub {\incbd n \epsilon {\insertion S P T}}\\ &= \stdtm {\insertion {\bound n S} {\bound n P} {\bound n T}} n \sub{\incbd n \epsilon {\insertion S P T}}\\ &\equiv \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}} \end{align*}
where the definitional equality is due to the inductive hypothesis on terms.

We continue to the case for types. If \(n = 0\), then both sides of the equality are \(\star\). Otherwise, consider the case for \(n + 1\), where we have:
\begin{alignat*}{3} \stdty S {n+1} \sub {\kappa_{S,P,T}} \equiv{} &\stdtm {\bound n S} n \sub {\incbd n - S} \sub {\kappa_{S,P,T}} &\qquad& \stdty {\insertion S P T} {n+1} \equiv{}&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n - {\insertion S P T}} \\ &\to_{\stdty S n \sub {\kappa_{S,P,T}}} &&&&\to_{\stdty {\insertion S P T} n}\\ &\stdtm {\bound n S} n \sub {\incbd n + S} \sub {\kappa_{S,P,T}}&&&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n + {\insertion S P T}} \end{alignat*}
By the inductive hypothesis on \(n\), we have \(\stdty S n \sub {\kappa_{S,P,T}} = \stdty {\insertion S P T} n\), and the other necessary equalities follow from the first case we considered.

We now consider the case for standard coherences, where we must prove that:
\[ \SCoh S {\stdty S n} {\kappa_{S,P,T}} = \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id} \]
By \cref{lem:iota-kappa-comm}, \(\olsi P \sub \kappa_{S,P,T}\) is the standard coherence \(\stdcoh T {\lh(P)} \sub {\iota_{S,P,T}}\), and so the left-hand side of the above equation admits an insertion.
Therefore:
\begin{align*} \SCoh S {\stdty S n} {\kappa_{S,P,T}} &= \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}}}&\text{by insertion}\\ &\equiv \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\id}&\text{by \cref{lem:kappa-iota-insert}}\\ &= \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id}&\text{by inductive hypothesis}\\ &\equiv \stdcoh {\insertion S P T} n \end{align*}
The equality for standard terms follows from the equality for standard coherences, using \cref{thm:std-dr}.
\end{proof}

\begin{corollary}
\label{cor:standard-coh-insert}
If \(\mathcal{R}\) has disc removal and insertion, then an insertion into a standard coherence is equal to the standard coherence over the inserted tree.
\end{corollary}
\begin{proof}
Let \(s \equiv \stdcoh S n \sub L\) be a standard coherence, and suppose \((S,P,T,\U,L,M)\) is an insertion redex with \(\U \vdash s : A\) for some \(A\). Then:
\begin{align*} \stdcoh S n \sub L &= \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion L P M}\\ &= \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\insertion L P M}\\ &= \stdcoh {\insertion S P T} n \sub {\insertion L P M} \end{align*}
and so \(s\) is equal to a standard coherence over the tree \(\insertion S P T\).
\end{proof}

\paragraph{Chained insertion}
We explore the situation where a term \(s\) has a locally maximal argument \(t\) which can be inserted, and this term \(t\) admits an insertion itself. For the argument \(t\) to be insertable, it must be a standard coherence, and by \cref{cor:standard-coh-insert}, if \(t = t'\) by insertion, then \(t'\) will be equal to a standard coherence over some tree \(T\). For the term \(t'\) to be insertable, \(T\) must have sufficient trunk height. Conditions for this are given in the following lemma.

\begin{lemma}
\label{lem:insert-lin-height}
Let \((S,P,T)\) be an insertion point. Further, assume \(S\) is not linear. Then \(\th(\insertion S P T) \geq \th(S)\).
\end{lemma}
\begin{proof}
See \func{Catt.Tree.Insertion.Properties}{insertion-trunk-height} in \module{Catt.Tree.Insertion.Properties}.
\end{proof}

If a tree \(S\) is not linear, then any branch of \(S\) has branch height greater than the trunk height of \(S\), and hence any insertion into \(S\) only modifies the tree above its trunk height, and so can only increase the trunk height. Therefore, if \((S,P,T)\) and \((T,Q,U)\) are insertion points, and \(T\) is not linear, then \((S, P, \insertion T Q U)\) is also an insertion point.

Conversely, it is possible to insert the argument directly into the head term first, and to perform the inner insertion afterwards. For this to be possible, a branch of the inserted tree must be given. This can again be done under a non-linearity condition.

\begin{definition}
Let \((S, P, T)\) be an insertion point where \(T\) is not linear. Then from a branch \(Q\) of \(T\) we can obtain a branch \(\insertion S P Q\) of \(\insertion S P T\). We first observe that \(\bh(Q) \geq \th(T) \geq \bh(P)\). We define this branch by induction on \(P\) and \(Q\):
\begin{itemize}
\item Suppose \(P = [k]\) and \(Q = q :: x\). Then define:
\[\insertion S P Q = (k - 1 + q) :: x\]
\item Suppose \(P = k :: P_2\) with \(S = [S_0,\dots,S_n]\) and \(T = \Sigma(T_0)\). In this case we must have \(Q = 0 :: Q_2\) where \(Q_2\) is a branch of \(T_0\).
Then define:
\[ \insertion S P Q = k :: \insertion {S_k} {P_2} {Q_2}\]
\end{itemize}
It is clear that \(\insertion S P Q\) has the same branch height and leaf height as \(Q\).
\end{definition}

A simple inductive proof shows that:
\[\olsi {\insertion S P Q} \equiv \olsi Q \sub{\iota_{S,P,T}}\]
Now given insertion points \((S,P,T)\) and \((T, Q, U)\) with \(T\) non-linear, we have that the triple \((\insertion S P T, \insertion S P Q, U)\) is another insertion point. There are therefore two ways of performing both insertions, which are depicted in \cref{fig:chained-insertion}.

\begin{figure}[ht]
\centering
\newsavebox\chaintop
\sbox\chaintop{\(\insertion{\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid, Diag1] at (0,2) (x21){$\bullet$};
\node [on grid] at (1,2) (x22){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw[very thick,Diag1] (x12.center) to (x21.center);
\draw (x12.center) to (x22.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x22.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}\quad}
{[1,0]}
{\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag1]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid] at (0.5,3) (x32) {$\bullet$};
\node [on grid, Diag2] at (-0.5,4)(x41) {$\bullet$};
\node [on grid] at (0.5,4)(x42) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\begin{scope}[on background layer]
\draw[very thick, Diag2] (x31.center) to (x41.center);
\end{scope}
\draw (x32.center) to (x42.center);
\end{tikzpicture}}\quad}
{[0,0,0,0]}
{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (-0.5,4)(x41) {$\bullet$};
\node [on grid] at (0.5,4)(x42) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x31.center) to (x41.center);
\draw (x31.center) to (x42.center);
\end{tikzpicture}}\)}
\newsavebox\chainleft
\sbox\chainleft{\(\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag1] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag1] at (0.5,1)(x12){$\bullet$};
\node [on grid, Diag1] at (0,2) (x21){$\bullet$};
\node [on grid] at (1,2) (x22){$\bullet$};
\node [on grid, Diag1] at (-0.33,3)(x31) {$\bullet$};
\node [on grid, Diag1] at (0.33,3) (x32) {$\bullet$};
\node [on grid] at (1,3) (x33) {$\bullet$};
\node [on grid, Diag2] at (-0.33,4)(x41) {$\bullet$};
\node [on grid, Diag1] at (0.33,4)(x42) {$\bullet$};
\node [on grid] at
(1,4)(x43) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw[Diag1] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[Diag1] (x21.center) to (x31.center); \draw[Diag1] (x21.center) to (x32.center); \draw (x22.center) to (x33.center); \draw[very thick,Diag2] (x31.center) to (x41.center); \draw[Diag1] (x32.center) to (x42.center); \draw (x33.center) to (x43.center); \end{scope} \end{tikzpicture}\quad} {[1,0,0,0]} {\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (0,3)(x31) {$\bullet$}; \node [on grid] at (-0.5,4)(x41) {$\bullet$}; \node [on grid] at (0.5,4)(x42) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x31.center) to (x41.center); \draw (x31.center) to (x42.center); \end{tikzpicture}}\) } \newsavebox\chainright \sbox\chainright{\(\insertion{\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (0.5,1)(x12){$\bullet$}; \node [on grid, Diag1] at (0,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,3)(x31) {$\bullet$}; \node [on grid] at (1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (0,4)(x41) {$\bullet$}; \node [on grid] at (1,4)(x42) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw[very thick,Diag1] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[very thick,Diag1] (x21.center) to (x31.center); \draw (x22.center) to (x32.center); \draw[very thick,Diag1] (x31.center) to (x41.center); \draw (x32.center) to (x42.center); \end{scope} \end{tikzpicture}\quad} {[1,0]} {\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (-0.5,3)(x31) {$\bullet$}; \node [on grid, Diag1] at (0.5,3) (x32) {$\bullet$}; \node [on grid] at (-0.83,4)(x41) {$\bullet$}; \node [on grid] at (-0.16,4)(x42) {$\bullet$}; \node [on grid, Diag1] at (0.5,4)(x43) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x31.center) to (x41.center); \draw (x31.center) to (x42.center); \begin{scope}[on background layer] \draw[Diag1] (x21.center) to (x32.center); \draw[Diag1] (x32.center) to (x43.center); \end{scope} \end{tikzpicture}}\) } \newsavebox\chainbot \sbox\chainbot{\(\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid, Diag2] at (0.125,0) (x01) {$\bullet$}; \node [on grid] at (-0.3125,1) (x11) {$\bullet$}; \node [on grid, Diag2] at (0.5625,1)(x12){$\bullet$}; \node [on grid, Diag2] at (0.125,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag2] at (-0.25,3)(x31) {$\bullet$}; \node [on grid, Diag1] at (0.5,3) (x32) {$\bullet$}; \node [on grid] at (1,3) (x33) {$\bullet$}; \node [on grid, Diag2] at (-0.5,4)(x41) {$\bullet$}; \node [on grid, Diag2] at (0,4)(x42) {$\bullet$}; \node [on grid, Diag1] at (0.5,4)(x43) 
{$\bullet$}; \node [on grid] at (1,4)(x44) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw[Diag2] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[Diag2] (x21.center) to (x31.center); \draw[Diag1] (x21.center) to (x32.center); \draw (x22.center) to (x33.center); \draw[Diag2] (x31.center) to (x41.center); \draw[Diag2] (x31.center) to (x42.center); \draw[Diag1] (x32.center) to (x43.center); \draw (x33.center) to (x44.center); \end{scope} \end{tikzpicture}\) } \begin{tikzpicture} \node(chaintop) at (0,0) {\fcolorbox{gray}{white}{\usebox\chaintop}}; \node(chainleft) at (-4.5,-6){\fcolorbox{gray}{white}{\usebox\chainleft}}; \node(chainright) at (4.5,-6){\fcolorbox{gray}{white}{\usebox\chainright}}; \node(chainbot) at (0,-12){\fcolorbox{gray}{white}{\usebox\chainbot}}; \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chaintop) to (chainleft); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chaintop) to (chainright); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chainleft) to (chainbot); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chainright) to (chainbot); \end{tikzpicture} \caption{Chained insertion.} \label{fig:chained-insertion} \end{figure} We now explore the universal property of the insertion along the branch \(\insertion S P Q\). We assume that \(n = \lh(P)\) and \(m = \lh(Q)\) and form the following diagram: \[ \begin{tikzcd} & {D^n} & S \\ {D^m} & T & {\insertion S P T} \\ U && {\insertion {(\insertion S P T)} {\insertion S P Q} U} \arrow["{\{\olsi P \}}", from=1-2, to=1-3] \arrow["{\{ \stdcoh T n \}}"', from=1-2, to=2-2] \arrow["{\{ \olsi Q \}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\iota_{S,P,T}}"', from=2-2, to=2-3] \arrow["{\kappa_{S,P,T}}", from=1-3, to=2-3] \arrow["{\kappa_{\insertion S P T, \insertion S P Q, U}}", from=2-3, to=3-3] \arrow["{\iota_{\insertion S P T, \insertion S P Q, U}}"', from=3-1, to=3-3] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-3, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-3, to=2-1] \end{tikzcd} \] The top pushout square is given by the insertion of \(T\) into \(S\) along \(P\). The morphism \(\{\olsi Q\} \bullet \iota_{S,P,T}\) through the middle of the diagram is then equal to \(\{\olsi {\insertion S P Q} \}\), allowing the bottom pushout rectangle to be formed by the insertion of \(U\) into \(\insertion S P T\) along \(\insertion S P Q\). 
We can also consider the universal property of the tree generated by first inserting \(U\) into \(T\), and then inserting the inserted tree into \(S\), which is given by the diagram below:
\[ \begin{tikzcd} & {D^n} && S \\ {D^m} & T \\ U & {\insertion T Q U} & & {\insertion S P {(\insertion T Q U)}} \arrow["{\{\olsi P \}}", from=1-2, to=1-4] \arrow["{\{ \stdcoh T n \}}"', from=1-2, to=2-2] \arrow["{\{ \olsi Q \}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\kappa_{T,Q,U}}", from=2-2, to=3-2] \arrow["{\iota_{T,Q,U}}"', from=3-1, to=3-2] \arrow["{\kappa_{S,P,\insertion T Q U}}", from=1-4, to=3-4] \arrow["{\iota_{S,P,\insertion T Q U}}"', from=3-2, to=3-4] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-4, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-2, to=2-1] \end{tikzcd} \]
The left-hand pushout square is given by the insertion of \(U\) into \(T\) along \(Q\). The morphism \(\{\stdcoh T n\} \bullet \kappa_{T,Q,U}\) which runs vertically through the centre of the diagram is then equal to \(\{\stdcoh {\insertion T Q U} n\}\) by \cref{cor:standard-coh-insert}, allowing for the right-hand pushout square to be formed as the insertion of \(\insertion T Q U\) into \(S\) along \(P\).

By standard properties of colimits, both of these constructions then arise as colimits of the same diagram, their shared top-left boundary. The results of this discussion are collected in the following lemma.

\begin{lemma}
\label{lem:inserted-insertion}
Let \((S,P,T)\) and \((T,Q,U)\) be insertion points. Further assume \(T\) is not linear. Then:
\begin{alignat*}{3} &\insertion S P {(\insertion T Q U)} & &= & &\insertion {(\insertion S P T)} {\insertion S P Q} U\\ &\kappa_{S,P,\insertion T Q U} &&=^{\mathsf{max}}{} &&\kappa_{S,P,T} \circ \kappa_{\insertion S P T, \insertion S P Q, U}\\ &\insertion L P {(\insertion M Q N)} &&\equiv^{\mathsf{max}}{} &&\insertion {(\insertion L P M)} {\insertion S P Q} N \end{alignat*}
for any \(L : S \to \U\), \(M : T \to \U\), and \(N : U \to \U\).
\end{lemma}
\begin{proof}
See the functions \func{Catt.Tree.Insertion.Properties}{insertion-tree-inserted-branch} and \func{Catt.Tree.Insertion.Properties}{label-from-inserted-branch} in the formalisation module \module{Catt.Tree.Insertion.Properties}, and \funcn{Catt.Typing.Insertion.Equality}{22361}{κ-inserted-branch} in the module \module{Catt.Typing.Insertion.Equality}.
\end{proof}

\chapter{Semistrict variants of \Catt}
\label{cha:cattstrict}

This chapter introduces the type theories \Cattsu, a type theory for strictly unital \(\infty\)-categories, and \Cattsua, a type theory for strictly unital and associative \(\infty\)-categories. We define both theories and explore the metatheory and properties of each in detail. The results in this chapter will depend heavily on the theory developed in the previous chapters. Both type theories will be defined as instances of \Cattr, which was introduced in \cref{sec:catt-with-equality}, and much of the initial metatheory can be immediately derived by demonstrating that the equality rule sets that generate \Cattsu and \Cattsua satisfy the various conditions given in \cref{sec:ruleset}. The theory \Cattsu is primarily generated by pruning, which was introduced in \cref{sec:pruning}, and the theory \Cattsua depends on the insertion operation, which was introduced in \cref{sec:insertion}.
\cref{sec:cattsu} will introduce and define \Cattsu, and \cref{sec:cattsua} will do the same for \Cattsua. The main contribution of these sections is to give normalisation algorithms for their respective theories, giving a notion of computation to each theory. A normalisation algorithm is a function \(\N : \Term_\Gamma \to \Term_\Gamma\) with the following properties:
\begin{itemize}
\item For any term \(t : \Term_\Gamma\), \(\Gamma \vdash \N(t) = t\).
\item For any \(s, t : \Term_\Gamma\) with \(\Gamma \vdash s = t\), \(\N(s) \equiv \N(t)\).
\end{itemize}
The term \(\N(t)\) is called the \emph{normal form} of \(t\). Such an algorithm allows equality of two terms \(s\) and \(t\) to be decided by taking the normal form of each term and checking if they are syntactically equal. Normalisation can be extended to types and substitutions in a natural way.

In \cref{sec:cattsu,sec:cattsua}, the normalisation algorithm is defined by giving a reduction system on the syntax of the type theory, which we show to be terminating, meaning that there is no infinite reduction sequence, and confluent, meaning that any two reduction paths converge to a common reduct. The normal form of a term can then be obtained by reducing it until no further reductions are possible. In \cref{sec:reduction}, these notions are recalled, and we demonstrate that the resulting normalisation algorithm satisfies the two properties stated above. This section also introduces a method for obtaining a reduction system from an arbitrary equality rule set \(\mathcal{R}\).

Such a normalisation procedure allows a type checking algorithm to be implemented, creating an interpreter for the language. This allows us to write larger terms and automatically verify whether they are well-formed. In \cref{sec:towards-nbe}, we introduce our implementation of \Catt, \Cattsu, and \Cattsua, written in Rust. This implementation supports features such as implicit arguments to terms, implicit suspension, and native support for trees and tree labellings. We will explain how the tool can be used, and use it to give larger examples of \Cattsua terms, including proofs of Eckmann-Hilton (see \cref{fig:eh}) and its higher-dimensional coherence condition, the syllepsis. The implementation uses an approach closer to normalisation by evaluation for typechecking terms in the theory. \cref{sec:towards-nbe} explores this algorithm and presents some perspectives on applying normalisation by evaluation to semistrict versions of \Catt.

\cref{sec:models} provides a discussion of the models of the semistrict type theories \Cattsu and \Cattsua, demonstrating how they can be viewed as semistrict \(\infty\)-categories. The section proves a partial conservativity result, which allows a proof that semistrictness is a property of a weak \(\infty\)-category, and not additional structure. A discussion is provided on some of the challenges that must be overcome to extend this partial conservativity result. The thesis ends with \cref{sec:future-work}, which provides a review of avenues for future work in this area, including a discussion of further variants of \Catt which could be defined.

\section{Reduction}
\label{sec:reduction}

Reduction is a method for defining computation for a type theory. For each term, a number of reductions may apply, representing the various computations that could be performed on the term.
Computation can then be run on a term by repeatedly searching for positions in the term that admit a reduction, known as \emph{redexes}, and applying this reduction, until no more redexes exist in the term. When a term admits no reductions, it is called a \emph{normal form}.

\begin{definition}
A \emph{reduction system} is given by a relation \(s \red t\) on terms. The relation \(\red^{*}\) is defined to be the reflexive transitive closure of \(\red\), and so \(s \red^* t\) exactly when there is some chain
\[s \equiv u_0 \red \cdots \red u_k \equiv t\]
for \(k \in \mathbb{N}\) (which could be \(0\) with \(s \equiv t\)) and terms \(u_i\) for \(i \leq k\). Further define \(\leftrightsquigarrow\) to be the reflexive symmetric transitive closure of \(\red\). When a term \(s\) admits no reductions, that is there is no \(t\) such that \(s \red t\), we say it is in \emph{normal form}.
\end{definition}

If we have an equality rule set \(\mathcal{R}\) (see \cref{sec:ruleset}) that generates \Cattr, a reduction system can be obtained from \(\mathcal{R}\) by modifying the rules for equality to remove the reflexivity, symmetry, and transitivity constructors, and to ensure that reductions do not happen ``in parallel''.

\begin{definition}
Let \(\mathcal{R}\) be an equality rule set. Define the reduction system \(\redr\) on well-formed terms, well-formed substitutions, and well-formed types to be generated by the rules in \cref{fig:reduction}. When it is clear which equality rule set is being used, we may simply write \(s \red t\) instead of \(s \redr t\).
\end{definition}

\begin{figure}[ht]
\centering
\begin{mathpar} \inferrule{(\Gamma, s, t) \in \mathcal{R}}{s \redr t}\textsc{rule} \and \inferrule{A \redr B}{\Coh \Delta A \sigma \redr \Coh \Delta B \sigma}\textsc{cell} \and \inferrule{\sigma \redr \tau}{\Coh \Delta A \sigma \redr \Coh \Delta A \tau}\textsc{arg} \\ \inferrule{s \redr s'}{\arr s A t \redr \arr {s'} A t}\and \inferrule{t \redr t'}{\arr s A t \redr \arr s A {t'}}\and \inferrule{A \redr A'}{\arr s A t \redr \arr s {A'} t}\\ \inferrule{\sigma \redr \tau}{\langle \sigma, s \rangle \redr \langle \tau, s \rangle}\and \inferrule{s \redr t}{\langle \sigma, s \rangle \redr \langle \sigma, t \rangle} \end{mathpar}
\caption[Reduction rules]{Rules for \(\rightsquigarrow_{\mathcal{R}}\).}
\label{fig:reduction}
\end{figure}

The rules for reduction are set up so that each reduction \(s \redr t\) corresponds to the application of exactly one rule from \(\mathcal{R}\) at a single point in the term. Given a coherence \(\Coh \Delta A \sigma\), we call reductions generated by the \textsc{cell} rule \emph{cell reductions} and reductions generated by the \textsc{arg} rule \emph{argument reductions}. Reductions generated by \textsc{rule} will be named by the rule in \(\mathcal{R}\) that was used. For example, a reduction generated by \textsc{rule} applied with an instance of pruning will be called a pruning reduction.

We highlight that our reduction system \(\redr\) is only defined between well-formed pieces of syntax. As this reduction will be used with rule sets \(\mathcal{R}\) which satisfy the preservation condition, there will be no additional burden of checking that typing is preserved while applying reductions. Therefore, we can prove that the reflexive symmetric transitive closure of reduction, \(\redrts\), is the same relation as equality on well-formed terms, given the similarity between the rules for reduction and the rules for equality.
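Before stating this correspondence precisely, we illustrate the mechanics of single-step reduction with a small sketch in Rust. This is a toy model written for this discussion only: the types \texttt{Term} and \texttt{Ty}, the function names, and the modelling of a substitution as a bare list of argument terms are all ours and do not reflect the data structures of the actual implementation; the parameter \texttt{rule} stands in for the set \(\mathcal{R}\).
\begin{verbatim}
// Toy syntax: a term is a variable or a coherence with a type and a
// substitution, here modelled as a plain list of argument terms.
#[derive(Clone, PartialEq)]
enum Term {
    Var(usize),
    Coh { ty: Box<Ty>, args: Vec<Term> },
}

#[derive(Clone, PartialEq)]
enum Ty {
    Star,
    Arr(Box<Term>, Box<Ty>, Box<Term>), // s ->_A t
}

// One step of reduction: try a top-level rewrite (RULE); otherwise
// look for a single redex in the type (CELL) or in one argument (ARG).
fn step(t: &Term, rule: &impl Fn(&Term) -> Option<Term>) -> Option<Term> {
    if let Some(u) = rule(t) {
        return Some(u); // rule: rewrite at the head position
    }
    if let Term::Coh { ty, args } = t {
        if let Some(ty2) = step_ty(ty, rule) {
            // cell: reduce inside the type of the coherence
            return Some(Term::Coh { ty: Box::new(ty2), args: args.clone() });
        }
        for (i, a) in args.iter().enumerate() {
            if let Some(a2) = step(a, rule) {
                // arg: reduce exactly one argument, leaving the rest alone
                let mut args2 = args.clone();
                args2[i] = a2;
                return Some(Term::Coh { ty: ty.clone(), args: args2 });
            }
        }
    }
    None // no redex found: t is in normal form
}

fn step_ty(ty: &Ty, rule: &impl Fn(&Term) -> Option<Term>) -> Option<Ty> {
    match ty {
        Ty::Star => None,
        Ty::Arr(s, a, t) => {
            if let Some(s2) = step(s, rule) {
                Some(Ty::Arr(Box::new(s2), a.clone(), t.clone()))
            } else if let Some(a2) = step_ty(a, rule) {
                Some(Ty::Arr(s.clone(), Box::new(a2), t.clone()))
            } else {
                step(t, rule).map(|t2| Ty::Arr(s.clone(), a.clone(), Box::new(t2)))
            }
        }
    }
}
\end{verbatim}
Note that each call rewrites at most one position in the term, matching the requirement above that reductions do not happen ``in parallel''.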
\begin{proposition}
\label{prop:red-is-eq}
Let \(\mathcal{R}\) be a rule set satisfying the preservation, support, and substitution conditions (such that the generated equality preserves typing). Letting \(\redrts\) be the reflexive symmetric transitive closure of \(\redr\), we get:
\begin{align*} \Gamma \vdash s = t &\iff s \redrts t \\ \intertext{for \(s,t : \Term_\Gamma\) such that \(\Gamma \vdash s : A\) and \(\Gamma \vdash t : A\) for some \(A : \Type_\Gamma\)} \Gamma \vdash A = B &\iff A \redrts B \\ \intertext{for \(A,B : \Type_\Gamma\) such that \(\Gamma \vdash A\) and \(\Gamma \vdash B\)} \Gamma \vdash \sigma = \tau &\iff \sigma \redrts \tau \end{align*}
for \(\sigma, \tau : \arr \Delta \star \Gamma\) such that \(\Gamma \vdash \sigma : \Delta\) and \(\Gamma \vdash \tau : \Delta\).
\end{proposition}
\begin{proof}
Each direction can be proved separately by a mutual induction on the derivation in the premise. For the right-to-left direction, it suffices to show that the single step reduction (\(\redr\)) is contained in the equality, as equality is an equivalence relation by construction.
\end{proof}

Just as the preservation condition on a rule set \(\mathcal{R}\) allows us to deduce that reduction preserves typing, the substitution condition can be used to prove that reduction is preserved by application of substitution.

\begin{proposition}
\label{prop:red-sub}
Suppose \(\mathcal{R}\) satisfies the substitution condition and let \(\sigma : \Delta \to \Gamma\) be a well-formed substitution. Then:
\begin{align*} s \redr t &\implies s \sub \sigma \redr t \sub \sigma \\ A \redr B &\implies A \sub \sigma \redr B \sub \sigma \\ \tau \redr \mu &\implies \tau \bullet \sigma \redr \mu \bullet \sigma \end{align*}
for well-formed terms \(s,t\), well-formed types \(A,B\), and well-formed substitutions \(\tau\) and \(\mu\). Furthermore, if \(\sigma \redr \tau\), then:
\[ s \sub \sigma \redr^* s \sub \tau \qquad A \sub \sigma \redr^* A \sub \tau \qquad \mu \bullet \sigma \redr^* \mu \bullet \tau\]
for term \(s\), type \(A\), and substitution \(\mu\).
\end{proposition}
\begin{proof}
The first part holds by a simple induction on the reduction in the premise; the second by a mutual induction on the term \(s\), type \(A\), and substitution \(\mu\).
\end{proof}

\subsection{Termination}
\label{sec:termination}

In order to obtain a normal form of each term of the theory, we perform reductions on a term until no more can be applied. This can only be done if we know that this process will eventually result in a normal form, a property known as \emph{strong termination}.

\begin{definition}
A reduction system \(\red\) is \emph{strongly terminating} if there is no infinite sequence of reductions:
\[ s_0 \red s_1 \red s_2 \red \cdots \]
For such a reduction system, repeatedly applying reductions to a term will eventually reach a normal form.
\end{definition}

Demonstrating the termination of the reduction systems defined in \cref{sec:cattsu,sec:cattsua} will be non-trivial, as each reduction adds new constructions to the term, which could themselves admit reductions. Suppose we have the following reduction due to endo-coherence removal (see \cref{sec:ecr}):
\[ \Coh \Delta {\arr s A s} \sigma \red \id(A \sub \sigma,s \sub \sigma) \]
The identity was not present in the left-hand side of the reduction, and the term \(s \sub \sigma\) is newly created by the reduction and could itself admit any number of reductions.
To prove termination, we will exploit the fact that although each reduction creates new subterms, these subterms are all of a lower dimension than the dimension of the term that is being reduced. In the example above, the dimension of \(\Coh \Delta {\arr s A s} \sigma\) is greater than the dimension of the term \(s\), and so the reduction has still made progress towards a normal form by decreasing the complexity of the term in dimension \(\dim(A)\), even though it may introduce arbitrary complexity below \(\dim(A)\).

To this end, we define a notion of complexity for each class of syntax, which assigns an ordinal number to each piece of syntax, called its \emph{syntactic complexity}. As the ordinal numbers are well-founded, we aim to prove that our reduction is terminating by proving that each single-step reduction reduces the complexity of the term. To define syntactic complexity, we will need to use ordinal numbers up to \(\omega^\omega\). We will also need a construction known as the natural sum of ordinals, \(\alpha \+ \beta\), which is associative, commutative, and strictly monotone in both of its arguments~\cite{lipparini16}.

\begin{definition}
For all terms \(t\), types \(A\), and substitutions \(\sigma\), the \emph{syntactic complexity} \(\sc(t)\), \(\sc(A)\), and \(\sc(\sigma)\) are mutually defined as follows:
\begin{itemize}
\item For types:
\[ \sc(\star) = 0 \qquad \sc(\arr s A t) = \sc(s) \+ \sc(A) \+ \sc(t)\]
\item For substitutions we have:
\[\sc(\langle t_0, \dots, t_n \rangle) = \bighash_{i=0}^n \sc(t_i)\]
\item For terms, we have \(\sc(x) = 0\) for variables \(x\) and for coherences we have:
\begin{equation*} \sc(\Coh \Delta A \sigma) = \begin{cases*} \omega^{\dim(A)} \+ \sc(\sigma)&if \(\Coh \Delta A \sigma\) is an identity\\ 2\omega^{\dim(A)} \+ \sc(\sigma)&otherwise \end{cases*} \end{equation*}
\end{itemize}
\end{definition}

The syntactic complexity is given as an ordinal to leverage known results, though it should be noted that ordinals below \(\omega^\omega\) can be represented by lists of natural numbers ordered reverse lexicographically. Under this interpretation, the syntactic complexity effectively computes the number of coherences at each dimension. Therefore, removing a coherence of dimension \(n\) reduces the complexity, even if arbitrary complexity is added at lower dimensions. Syntactic complexity also treats identities separately, as these play a special role in blocking reduction for the theories presented in this chapter. The syntactic complexity does not account for the type in a coherence, as this is difficult to encode. Rather than showing that all reductions reduce syntactic complexity, we show that all reductions which are not cell reductions (reductions that have the rule marked \textsc{cell} in their derivation) reduce syntactic complexity, deduce that a hypothetical infinite reduction sequence must consist only of cell reductions after a finite number of steps, and then appeal to an induction on dimension.

\begin{lemma}
\label{lem:termination-lem}
Let \(\mathcal{R}\) be an equality rule set with \( \sc(s) > \sc(t) \) for all \((\Gamma,s,t) \in \mathcal{R}\). Then \(\redr\) is strongly terminating.
\end{lemma}
\begin{proof}
By a simple induction on reductions, we immediately have that if \(s \redr t\) then \(\sc(s) \geq \sc(t)\), with the inequality strict when the reduction is not a cell reduction. We then proceed by induction on the dimension.
Suppose there is an infinite reduction sequence, starting with a \(k\)-dimensional term:
\[ s_0 \red s_1 \red s_2 \red \cdots\]
Then by assumption, only finitely many of these reductions do not use the cell rule, as otherwise we would obtain an infinite chain of ordinals
\[ \sc(s_0) \geq \sc(s_1) \geq \sc(s_2) \geq \cdots\]
where infinitely many of these inequalities are strict, contradicting the well-foundedness of the ordinals. Therefore, there is an \(n\) such that:
\[ s_n \red s_{n+1} \red \cdots\]
are all cell reductions. Each of these reductions reduces one of finitely many subterms of \(s_n\), and each of these subterms has dimension less than \(k\), so by inductive hypothesis, none of these subterms can be reduced infinitely often, contradicting the existence of an infinite reduction sequence.
\end{proof}

We can immediately prove that disc removal reduces syntactic complexity.

\begin{proposition}
\label{prop:disc-rem-sc}
Let \(s \red t\) be an instance of disc removal. Then \(\sc(s) > \sc(t)\).
\end{proposition}
\begin{proof}
We must have \(s \equiv \Coh {D^n} {\wk(U^n)} {\{A,t\}}\) for some \(n\) and \(A\). Then:
\begin{align*} \sc(s) &= \sc(\Coh {D^n} {\wk(U^n)} {\{A,t\}})\\ &= 2\omega^n \+ \sc(\{A,t\})\\ &> \sc(\{A,t\})\\ &\geq \sc(t) \end{align*}
where the last inequality holds by a simple induction on the dimension of \(A\).
\end{proof}

We note that, as stated so far, the reduction:
\[ \id(A,s) \red \id(A,s)\]
is a valid instance of endo-coherence removal for type \(A\) and term \(s\), which will break termination. We therefore let \(\ecr'\) be the equality rule set obtained by removing all triples \((\Gamma,s,t)\) from \(\ecr\) where \(s\) is already an identity. We justify replacing \ecr by \ecr' with the following lemma.

\begin{lemma}
\label{lem:always-ecr}
The following reduction holds, even when the left-hand side is an identity:
\[\Coh \Delta {\arr s A s} \sigma \red_{\ecr'}^* \id(A\sub \sigma,s\sub\sigma)\]
\end{lemma}
\begin{proof}
If \(\Coh \Delta {\arr s A s} \sigma\) is not an identity then it can be reduced by endo-coherence removal. Otherwise, we have \(\Delta = D^n\) for some \(n\), \(s \equiv d_n\), \(A \equiv \wk(U^n)\), and \(\sigma \equiv \{B,t\}\) for some \(B\) and \(t\) and so:
\[\id(A\sub \sigma,s \sub \sigma) \equiv \id(\wk(U^n)\sub{\{B,t\}}, d_n \sub {\{B,t\}}) \equiv \id(B,t) \]
It follows that the reduction sequence is trivial.
\end{proof}

It can then be proven that the reductions in this set reduce syntactic complexity.

\begin{proposition}
\label{prop:ecr-sc}
Let \(s \red t\) be an instance of endo-coherence removal. If \(s\) is not an identity then \(\sc(s) > \sc(t)\).
\end{proposition}
\begin{proof}
As \(s \red t\) is an instance of endo-coherence removal, we must have \(s \equiv \Coh \Delta {\arr u A u} \sigma\) and \(t \equiv \id(A \sub \sigma, u \sub \sigma)\). Further, \(s\) is not an identity and so:
\begin{align*} \sc(s) &= \sc(\Coh \Delta {\arr u A u} \sigma)\\ &= 2\omega^{\dim(A) + 1} \+ \sc(\sigma)\\ &\geq 2\omega^{\dim(A) + 1}\\ &> \omega^{\dim(A) + 1} \+ \sc(A \sub \sigma) \+ \sc(u \sub \sigma)\\ &= \sc(\id(A \sub \sigma, u \sub \sigma))\\ &= \sc(t) \end{align*}
where the strict inequality holds as \(\sc(A \sub \sigma) \+ \sc(u \sub \sigma) < \omega^{\dim(A) + 1}\), due to both \(A \sub \sigma\) and \(u \sub \sigma\) having dimension \(\dim(A)\), meaning that their syntactic complexities are strictly bounded by \(\omega^{\dim(A) + 1}\).
\end{proof}
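The remark above that ordinals below \(\omega^\omega\) can be represented by lists of natural numbers compared reverse lexicographically suggests a concrete representation of syntactic complexities. The following Rust sketch is ours, written for illustration only and independent of both the formalisation and the implementation: index \(i\) of the vector stores the coefficient of \(\omega^i\) (written \texttt{w} in the comments), the weighted count of coherences of dimension \(i\).
\begin{verbatim}
use std::cmp::Ordering;

// Complexity(vec![a0, a1, a2]) represents a2.w^2 (+) a1.w (+) a0,
// i.e. index i holds the coefficient of w^i: a weighted count of the
// coherences of dimension i (identities count once, others twice).
#[derive(Clone)]
struct Complexity(Vec<u64>);

impl Complexity {
    // The natural sum (+) is pointwise addition of coefficients.
    fn natural_sum(&self, other: &Complexity) -> Complexity {
        let n = self.0.len().max(other.0.len());
        Complexity(
            (0..n)
                .map(|i| self.0.get(i).unwrap_or(&0) + other.0.get(i).unwrap_or(&0))
                .collect(),
        )
    }
}

// Reverse-lexicographic comparison: the most significant coefficient
// is the one at the highest dimension, so compare from the top down.
impl Ord for Complexity {
    fn cmp(&self, other: &Self) -> Ordering {
        let n = self.0.len().max(other.0.len());
        for i in (0..n).rev() {
            let (a, b) = (self.0.get(i).unwrap_or(&0), other.0.get(i).unwrap_or(&0));
            if a != b {
                return a.cmp(b);
            }
        }
        Ordering::Equal
    }
}

// Equality must agree with the ordering (trailing zeros are ignored).
impl PartialEq for Complexity {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for Complexity {}
impl PartialOrd for Complexity {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
\end{verbatim}
For example, a reduction removing a single coherence of dimension \(2\) while creating arbitrarily many of dimension \(1\) might move from \([0,0,2]\) to \([500,7,1]\); the latter is strictly smaller, as the comparison inspects the top dimension first. This is precisely the behaviour used in the termination arguments above.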
\subsection{Confluence}
\label{sec:confluence}

Another crucial property of reduction systems is \emph{confluence}. A term \(s\) may have any number of redexes and could reduce to distinct terms \(t\) and \(u\). Confluence states that both the terms \(t\) and \(u\) must reduce to some common term, allowing us to apply reductions to a term in any order.

\begin{definition}
Let \(\red\) be a reduction system. It is \emph{(globally) confluent} if for all terms \(s\), \(t\), and \(u\) with \(s \red^* t\) and \(s \red^* u\), there is a term \(v\) such that \(t \red^* v\) and \(u \red^* v\). This can be assembled into the following diagram:
\[ \begin{tikzcd} & s \\ t && u \\ & v \arrow["{*}", squiggly, from=1-2, to=2-3] \arrow["{*}"', squiggly, from=1-2, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-3, to=3-2] \end{tikzcd} \]
and hence is sometimes called the diamond property for \(\red^*\).
\end{definition}

From global confluence, it is clear that if \(s \redrts t\), where \(\redrts\) is the reflexive symmetric transitive closure of \(\redr\), then there is \(u\) with \(s \redr^* u\) and \(t \redr^* u\). It is sometimes simpler to show that the following weaker confluence property holds:

\begin{definition}
Let \(\red\) be a reduction system. It is \emph{locally confluent} if given \(s \red t\) and \(s \red u\) there exists a term \(v\) such that:
\[ \begin{tikzcd} & s \\ t && u \\ & v \arrow["", squiggly, from=1-2, to=2-3] \arrow[""', squiggly, from=1-2, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-3, to=3-2] \end{tikzcd} \]
that is, \(t \red^* v\) and \(u \red^* v\).
\end{definition}

Global confluence trivially implies local confluence. If we further know that the reduction system \(\red\) is strongly terminating, then local confluence is sufficient to show global confluence.

\begin{lemma}[Newman's lemma \cite{newman1942theories}]
\label{lem:newman}
Let \(\red\) be strongly terminating and locally confluent. Then \(\red\) is globally confluent.
\end{lemma}

Local confluence for the reduction systems of the type theories \Cattsu and \Cattsua will be proved using \emph{critical pair analysis}. A critical pair is a pair of distinct reductions which apply to the same term. When analysing the critical pairs of our semistrict type theories, we will encounter terms that are structurally similar, but differ on lower-dimensional subterms up to equality. We define this precisely.

\begin{definition}
Let \(\mathcal{R}\) be an equality rule set. For \(n \in \mathbb{N}\), define the \emph{bounded equality set} \(\mathcal{R}_n\) as:
\[ \mathcal{R}_n = \left\{ (\Gamma, s, t) \in \mathcal{R} \mid \dim(s) = \dim(t) < n \right\}\]
Let the \emph{bounded equality relation} \(s =_n t\) be the equality generated by the set \(\mathcal{R}_n\).
\end{definition}

This is used to prove the following lemma, which implies that for a critical pair \(t \leftsquigarrow s \rightsquigarrow u\) it is not necessary to find a common reduct of \(t\) and \(u\), but simply to find reducts \(t'\) and \(u'\) of \(t\) and \(u\) such that \(t' =_{\dim(s)} u'\).

\begin{lemma}
\label{lem:conf-strat}
Let \(\mathcal{R}\) be a tame equality rule set which satisfies the preservation and support conditions, and further assume that \(\redr\) is strongly terminating.
Suppose the following diagram can be formed: % https://q.uiver.app/#q=WzAsNixbMiwwLCJzIl0sWzAsMSwidCJdLFs0LDEsInUiXSxbMSwyLCJ0JyJdLFszLDIsInUnIl0sWzIsMiwiPV97XFxkaW0ocyl9Il0sWzAsMiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoic3F1aWdnbHkifX19XSxbMCwxLCIiLDIseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dLFsxLDMsIioiLDIseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dLFsyLDQsIioiLDAseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} & {\mathclap{=_{\dim(s)}}} & {u'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \] for all critical pairs \(t \leftsquigarrow_{\mathcal{R}} s \redr u\) such that \(s \redr t\) is derived using \textsc{rule}. Then \(\redr\) is confluent. \end{lemma} \begin{proof} By \cref{lem:newman}, it suffices to show local confluence. We proceed by strong induction on \(n\) and \(s\), proving that all critical pairs \(t \leftsquigarrow_{\mathcal{R}_n} s \red_{\mathcal{R}_n} u\) have a common reduct, assuming that all critical pairs \(t \leftsquigarrow_{\mathcal{R}_m} s' \red_{\mathcal{R}_m} u\) have a common reduct, where \(s'\) is a proper subterm of \(s\), or \(m < n\). We justify this induction principle by noting that for all subterms \(s'\) of \(s\) we have \(\dim(s') \leq \dim(s)\). We now consider a critical pair \(t \leftsquigarrow_{\mathcal{R}_n} s \red_{\mathcal{R}_n} u\). We first suppose that \(s \red_{\mathcal{R}_n} t\) is derived from \textsc{rule}. Then, by definition of the set \(\mathcal{R}_n\), we must have that \(n > \dim(s)\). By the assumption of the lemma, there exist \(t'\) and \(u'\) with \(t' =_{\dim(s)} u'\) and \(t \redr^* t'\) and \(u \redr^* u'\). As \(n > \dim(s)\), we further have that \(t \red_{\mathcal{R}_n}^* t'\) and \(u \red_{\mathcal{R}_n}^* u'\). By \cref{prop:red-is-eq}, \(t' \leftrightsquigarrow_{\mathcal{R}_{\dim(s)}} u'\), and so, as \(\red_{\mathcal{R}_{\dim(s)}}\) is confluent by the inductive hypothesis on dimension, we have \(v\) such that \(t' \red_{\mathcal{R}_{\dim(s)}}^* v \leftsquigarrow_{\mathcal{R}_{\dim(s)}}^* u'\). The following diagram can therefore be formed, where all the reductions are \(\mathcal{R}_n\) reductions (noting that \(\mathcal{R}_{\dim(s)} \subseteq \mathcal{R}_n\)): \[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} && {u'} \\ && v \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \arrow["{*}"', squiggly, from=3-2, to=4-3] \arrow["{*}", squiggly, from=3-4, to=4-3] \end{tikzcd} \] If \(s \red_{\mathcal{R}_n} u\) was derived from \textsc{rule}, then a common reduct can be found similarly to the first case by symmetry. We therefore consider the cases where neither \(s \red t\) nor \(s \red u\) are derived using \textsc{rule}. Both reductions must be either cell or argument reductions, and so each reduces some subterm of \(s\). If they reduce distinct subterms of \(s\), then a common reduct \(v\) can be formed by applying both reductions to \(s\). Otherwise, both reductions act on the same subterm of \(s\), and a common reduct can be found by applying the inductive hypothesis for subterms. \end{proof} Once termination and confluence have been proven, a normalisation function can be defined, which repeatedly applies reductions until no more can be applied.
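To make this concrete, the following is a minimal Rust sketch of such a normalisation function, in the spirit of (but not taken from) the implementation discussed in \cref{sec:towards-nbe}; the type \texttt{Term} and the single-step reducer \texttt{reduce\_step} are hypothetical stand-ins for illustration only.
\begin{verbatim}
// A minimal sketch, assuming a hypothetical `Term` type with a
// single-step reducer; this is not the interface of the actual tool.
#[derive(Clone, PartialEq, Eq, Debug)]
enum Term {
    Var(usize),
    // A coherence, heavily simplified: a type and a list of arguments.
    Coh(Box<Term>, Vec<Term>),
}

// Try to apply a single reduction step, returning `None` exactly on
// normal forms. A real reducer would attempt disc removal,
// endo-coherence removal, and pruning here.
fn reduce_step(_t: &Term) -> Option<Term> {
    None // stand-in: treat every term as a normal form
}

// Compute the normal form N(t) by exhaustively applying reductions.
// Strong termination guarantees this loop halts; confluence
// guarantees the result is independent of the reduction order.
fn normalize(mut t: Term) -> Term {
    while let Some(next) = reduce_step(&t) {
        t = next;
    }
    t
}

// Decide equality by syntactically comparing normal forms.
fn equal(s: &Term, t: &Term) -> bool {
    normalize(s.clone()) == normalize(t.clone())
}

fn main() {
    let x = Term::Var(0);
    assert!(equal(&x, &x));
}
\end{verbatim}
The decidability result in the corollary below corresponds exactly to the facts that \texttt{normalize} terminates and that the final syntactic comparison is computable.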
\begin{lemma} Suppose that \(\red\) is strongly terminating and confluent. Then every term \(s\) reduces to a unique normal form \(\N(s)\). Furthermore, if \(s \redrts t\), then \(\N(s) \equiv \N(t)\). \end{lemma} \begin{proof} By termination, repeatedly reducing a term will eventually reach a normal form. Suppose a term \(s\) has two normal forms \(t\) and \(u\) such that there are reduction sequences \(s \red^* t\) and \(s \red^* u\). Then by confluence there must be a term \(v\) with \(t \red^* v\) and \(u \red^* v\). However, \(t\) and \(u\) are normal forms and so admit no reductions, so \(t \equiv v \equiv u\) as required. Suppose \(s \redrts t\). Then there are terms \(s_i\) such that: \[ s \equiv s_0 \rightsquigarrow^* s_1 \leftsquigarrow^* s_2 \rightsquigarrow^* \cdots \leftsquigarrow^* s_k \equiv t\] Now we must have \(\N(s_i) \equiv \N(s_{i+1})\) for each \(i\): if \(s_i \rightsquigarrow^* s_{i+1}\) then both \(\N(s_i)\) and \(\N(s_{i+1})\) are normal forms of \(s_i\), and if \(s_i \leftsquigarrow^* s_{i+1}\) then both are normal forms of \(s_{i+1}\). Therefore, \(\N(s)\) and \(\N(t)\) are syntactically equal as required. \end{proof} \begin{corollary} Let \(\mathcal{R}\) be tame and satisfy the preservation and support properties. Further, suppose that \(\redr\) is strongly terminating and confluent, and it is decidable whether a term admits a reduction. Then the equality \(s = t\) is decidable. \end{corollary} \begin{proof} By \cref{prop:red-is-eq}, \(s = t\) if and only if \(s \redrts t\). By the above lemma, \(s \redrts t\) if and only if \(\N(s) \equiv \N(t)\). As syntactic equality is clearly decidable, and normal forms can be computed, equality is also decidable. \end{proof} We note that for an arbitrary rule set \(\mathcal{R}\), it may not be decidable whether a specific term \(s\) admits a reduction, but for the rule sets introduced in \cref{sec:cattsu,sec:cattsua}, it will be easy to mechanically check whether any reduction applies to a term \(s\). \section{\texorpdfstring{\Cattsu}{Cattsu}} \label{sec:cattsu} We are ready to define \Cattsu, the type theory for strictly unital \(\infty\)-categories. \Cattsu is a variant of \Cattr for which the equality is built from three classes of equalities: \begin{itemize} \item Pruning: The pruning operation was introduced in \cref{sec:pruning}. Pruning is the key operation in \Cattsu and drives the strict unitality of the theory. The operation ``prunes'' identities that appear as locally maximal arguments to other terms, simplifying the overall structure of a term by removing unnecessary units. \item Endo-coherence removal: This operation was introduced in \cref{sec:ecr}, and converts ``fake identities'', terms which are morally identities yet have the wrong syntactic form, into true identities. These converted identities can then be further removed from terms by pruning. \item Disc removal: Disc removal was the running example from \cref{sec:catt-with-equality}, and removes unary composites from the theory. Commonly after pruning, a composite is reduced to a unary composite, for which disc removal is necessary to complete the simplification of the term. \end{itemize} In this section we will prove that \Cattsu is a type theory satisfying many standard meta-theoretic properties by combining results from previous chapters. We also give a reduction system for \Cattsu and show that this is strongly terminating and confluent.
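The termination proofs in this section and in \cref{sec:norm-cattsua} all work by comparing ordinal-valued syntactic complexities built from natural sums of terms of the form \(2\omega^n\). As a concrete illustration of this machinery, the following is a minimal Rust sketch of ordinals in Cantor normal form with natural-number coefficients, together with the natural sum \(\+\) and the strict comparison used in those proofs; the representation and all names here are illustrative assumptions, not part of the actual implementation.
\begin{verbatim}
use std::collections::BTreeMap;

// Ordinals of the shape c_1*w^{n_1} + ... + c_k*w^{n_k} (Cantor
// normal form with natural coefficients), which suffice for the
// syntactic complexities used in the termination arguments.
// Keys are exponents n, values are coefficients c.
#[derive(Clone, PartialEq, Eq, Debug, Default)]
struct Complexity(BTreeMap<u32, u64>);

impl Complexity {
    // The ordinal c * w^n.
    fn omega(n: u32, c: u64) -> Self {
        let mut m = BTreeMap::new();
        if c > 0 {
            m.insert(n, c);
        }
        Complexity(m)
    }

    // The natural (Hessenberg) sum: coefficients add pointwise.
    fn nat_sum(&self, other: &Complexity) -> Complexity {
        let mut m = self.0.clone();
        for (&n, &c) in &other.0 {
            *m.entry(n).or_insert(0) += c;
        }
        Complexity(m)
    }

    // Strict comparison: compare coefficients from the largest
    // exponent downwards.
    fn lt(&self, other: &Complexity) -> bool {
        let mut exps: Vec<u32> =
            self.0.keys().chain(other.0.keys()).copied().collect();
        exps.sort_unstable();
        exps.dedup();
        for &n in exps.iter().rev() {
            let a = self.0.get(&n).copied().unwrap_or(0);
            let b = other.0.get(&n).copied().unwrap_or(0);
            if a != b {
                return a < b;
            }
        }
        false
    }
}

fn main() {
    // For endo-coherence removal, sc(s) >= 2w^(d+1) must strictly
    // exceed sc(t) = w^(d+1) (+) lower-order parts. With d + 1 = 2
    // and lower-order part 3w:
    let sc_s = Complexity::omega(2, 2);
    let sc_t = Complexity::omega(2, 1).nat_sum(&Complexity::omega(1, 3));
    assert!(sc_t.lt(&sc_s));
}
\end{verbatim}
No infinite strictly decreasing chain exists in this ordering, which is exactly the well-foundedness used in the termination proofs above.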
\begin{example} Suppose we have terms \(f : \arr x \star y\), \(g : \arr y \star z\), \(h : \arr x \star z\), and \(\alpha : f * g \to h\) in some context \(\Gamma\). We can then consider the term: \[ \Coh {(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)} {f * g \to f * g} {\langle x,y,f,z,g \rangle} * \alpha\] which consists of an endo-coherence composed with the variable \(\alpha\). This then reduces as follows: \begin{align*} &\phantom{{}\red{}}\Coh {(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)} {f * g \to f * g} {\langle x,y,f,z,g \rangle} * \alpha\\ &\red \id(\arr x \star z, f * g) * \alpha&\text{by endo-coherence removal}\\ &\red \Coh {D^2} {\wk(U^2)} {\langle x,z,f*g,h,\alpha \rangle}&\text{by pruning}\\ &\red \alpha &\text{by disc removal} \end{align*} and so uses all three reductions to fully simplify to a variable. \end{example} We define \Cattsu by the following equality rule set. \begin{definition} Define the equality rule set \su for \Cattsu by: \[ \su = \dr \cup \prune \cup \ecr\] \Cattsu is then the variant of \Cattr where \(\mathcal{R} = \su\). \end{definition} When it is not specified, we will assume that the operation set \(\mathcal{O}\) is given by the regular operation set \(\Reg\). \begin{theorem} The rule set \su is tame and satisfies the support and preservation conditions. \end{theorem} \begin{proof} By \cref{prop:dr-weak,prop:dr-susp,prop:dr-sub}, disc removal satisfies the weakening, suspension, and \(\su\)-substitution conditions. Endo-coherence removal and pruning satisfy the same conditions by \cref{prop:ecr-props,prop:prune-tame}. As these conditions are closed under unions, the set \su must also satisfy the weakening, suspension, and substitution conditions, and hence is tame. We now use the proof strategy introduced in \cref{sec:further-conditions} to prove that \su satisfies the support condition. Firstly, by \cref{lem:supp-sat-conds} we know that \(\su_{\mathsf{s}}\) is also tame. Disc removal then satisfies the \(\su_{\mathsf{s}}\)-support condition by \cref{prop:dr-supp}. The same condition is satisfied by endo-coherence removal (\cref{item:ecr-supp}) and pruning (\cref{prop:prune-supp}) and so \(\su\) satisfies the \(\su_{\mathsf{s}}\)-support condition. By \cref{lem:proof-strat-supp}, \su satisfies the support condition. Lastly, \su satisfies the \su-preservation condition as it is satisfied by disc removal (\cref{prop:dr-preserve}), endo-coherence removal (\cref{item:ecr-preserve}), and pruning (\cref{prop:prune-preserve}), and the condition is closed under unions of rule sets. \end{proof} From this theorem it can be deduced that weakening, suspension, and applications of substitution are well-formed. Furthermore, equality in \Cattsu preserves the support of a term and preserves typing judgements. Such results are found in \cref{sec:ruleset}. Before giving normalisation results for \Cattsu, we recall the Eckmann-Hilton argument (\cref{fig:eh}) and give the definition of the term witnessing this equivalence.
First let \(\Delta\) be the ps-context given by: \begin{alignat*}{2} \Delta = D^2 \wedge D^2 ={} &(x : *),\\ &(y : *),&\ &(f : x \to y),\\ &&&(g : x \to y),(a : f \to g),\\ &(z : *),&&(h : y \to z),\\ &&&(j : y \to z),(b : h \to j) \end{alignat*} which is given by the diagram: \[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "j", curve={height=-18pt}, from=1-2, to=1-3] \arrow["a"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["b"', shorten <=5pt, shorten >=5pt, Rightarrow, from=2, to=3] \end{tikzcd} \] The following term can be formed, which is similar to an interchange move, and changes the order in which two whiskered terms are composed: \[ \mathsf{swap} = \Coh {\Delta} {(a *_0 h) *_1 (g *_0 b) \to (f *_0 b) *_1 (a *_0 j)} {\id_\Delta}\] Then given a context \(\Gamma\) with terms \(x : *\) and \(\alpha,\beta : \id(x) \to \id(x)\), the following term, the Eckmann-Hilton term, can be formed: \[ \mathsf{EH}_{\alpha,\beta} = \mathsf{swap} \sub{\langle x,x,\id(x),\id(x),\alpha, x,\id(x),\id(x),\beta \rangle}\] In \Cattsu, this term can be typed as follows: \[ \Gamma \vdash \mathsf{EH}_{\alpha,\beta} : \alpha *_1 \beta \to \beta *_1 \alpha\] and so witnesses the Eckmann-Hilton argument. We note that there is a clear inverse of the Eckmann-Hilton term, which immediately gives rise to two morphisms \(\alpha *_1 \beta \to \beta *_1 \alpha\): the original term \(\mathsf{EH}_{\alpha,\beta}\) and the term \(\mathsf{EH}_{\beta,\alpha}^{-1}\). These two terms manoeuvre \(\alpha\) and \(\beta\) round each other in opposite directions, and are not in general equivalent. However, we can instead apply Eckmann-Hilton to terms \(\phi\) and \(\psi\) of type \(\id^2(x) \to \id^2(x)\), which is done by suspending the Eckmann-Hilton term. By an abuse of notation we define this term to be (only giving the locally maximal arguments of the substitution): \[\mathsf{EH}_{\phi,\psi} = \Sigma(\mathsf{swap}) \sub{\langle \phi, \psi \rangle}\] In this case, the extra dimension gives enough freedom to give an equivalence between the resulting two terms \(\phi *_2 \psi \to \psi *_2 \phi\), which is called the \emph{syllepsis} and has the type: \[ \mathsf{Syl}_{\phi,\psi} : \mathsf{EH}_{\phi,\psi} \to \mathsf{EH}^{-1}_{\psi,\phi}\] To define this term, we take an approach similar to the one used for Eckmann-Hilton: we give a single coherence containing a more complex type and a substitution containing multiple identity terms, and let the \Cattsu reduction simplify the type to the required one. We delay defining this term until \cref{sec:towards-nbe}, where the implementation presented in this section can be used to check that the resulting term is well-formed. \subsection{Normalisation for \texorpdfstring{\Cattsu}{Cattsu}} \label{sec:reduction-cattsu} Following \cref{sec:reduction} we aim to give a normalisation algorithm for \Cattsu by exhibiting a strongly terminating and confluent reduction system. The reduction system \(\red_{\su}\) cannot be used directly because the reduction generated from \ecr is not terminating, as it allows identities to reduce to identities.
Even after replacing the equality rule set \ecr by \ecr', the equality set obtained by removing these trivial identity-to-identity reductions from \ecr, the generated reduction is still non-terminating. Consider the term \(\id(\arr t A t,\id(A,t))\) for some term \(t\) of type \(A\). Then the following reduction sequence can be formed: \[ \id(\arr t A t,\id(A,t)) \red \Coh {D^n} {\id(\wk(U^n), d_n) \to \id(\wk(U^n), d_n)} {\{A,t\}} \red \id(\arr t A t, \id(A,t)) \] where \(n = \dim(A)\), the first reduction is by pruning, and the second reduction is by endo-coherence removal. We therefore choose to also restrict the pruning equality rule set to not apply when the head term is an identity, obtaining the set \prune'. We can now define the reduction system for \Cattsu. \begin{definition} Define the reduction \(\red_{\su'}\) to be the reduction generated by the equality rule set \(\su'\) where \[ \su' = \dr \cup \prune' \cup \ecr'\] where \ecr' is the endo-coherence removal set without identity-to-identity equalities and \prune' is the pruning set restricted to the triples where the left-hand side term is not an identity. \end{definition} The reduction \(\red_{\su'}\) applies equality rules from \Cattsu when the redex is not an identity, effectively forcing identities to be normal forms of the theory. As applying a substitution to or suspending a non-identity term cannot result in an identity, it is clear that \su' is tame. Strong termination for \(\red_{\su'}\) can now be proven using \cref{lem:termination-lem}, by showing that all rules reduce the syntactic complexity of terms. \begin{proposition} Let \(s \red t\) be an instance of pruning. If \(s\) is not an identity then \(\sc(s) > \sc(t)\). \end{proposition} \begin{proof} The reduction \(s \red t\) is an instance of pruning, and so there must be a Dyck word \(\mathcal{D} : \Dyck_0\) and a peak \(p : \Peak_{\mathcal{D}}\) such that \[s \equiv \Coh {\lfloor \mathcal{D} \rfloor} {A} {\sigma} \qquad t \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\] where \(s\) is not an identity and \(\lfloor p \rfloor \sub \sigma\) is. We then have \(\sc(s) = 2\omega^{\dim(A)} \+ \sc(\sigma)\) and \(\sc(t) \leq 2\omega^{\dim(A)} \+ \sc(\sigma \sslash p)\), as \(\dim(A \sub {\pi_p}) = \dim(A)\). Now \(\sigma \sslash p\) is simply \(\sigma\) with two terms removed, one of which is known to be a coherence, and so \(\sc(s) > \sc(t)\). \end{proof} \begin{corollary} The reduction \(\red_{\su'}\) is strongly terminating. \end{corollary} \begin{proof} By \cref{lem:termination-lem}, it suffices to show that each rule of \(\su'\) reduces syntactic complexity, which follows from the preceding proposition and \cref{prop:ecr-sc,prop:disc-rem-sc}. \end{proof} By \cref{prop:red-is-eq}, we know that the reflexive symmetric transitive closure of \(\red_{\su'}\) is the equality relation generated by \su'. We therefore prove that this agrees with the equality relation from \Cattsu. \begin{proposition} \label{prop:suprime-equiv} The type theories generated from \su and \su' are equivalent. Terms are equal or well-formed in one theory exactly when they are equal or well-formed in the other, and similar properties hold for types and substitutions. \end{proposition} \begin{proof} We use \cref{lem:subset-lem} for both directions. Since \(\su' \subseteq \su\), we are only required to show that if \((\Gamma, s, t) \in \su\) with \(\Gamma \vdash_{\su'} s : A\) for some \(A : \Type_\Gamma\) then \[ \Gamma \vdash_{\su'} s = t\] If \((\Gamma,s,t) \in \su'\), then the equality follows from the \textsc{rule} constructor.
Otherwise, \(s\) must be an identity and the rule is an instance of endo-coherence removal or pruning. Suppose \(s\) reduces to \(t\) by endo-coherence removal. Then \(s \equiv \id(A,u)\) and \[t \equiv \id(\wk(U^n) \sub {\{A,u\}}, d_n \sub {\{A,u\}}) \equiv \id(A,u) \equiv s\] and so the equality holds by reflexivity. Now assume \(s\) reduces by pruning to \(t\). Letting \(s \equiv \id(A,u)\) and \(n = \dim(A)\), we get: \begin{align*} t &\equiv \Coh {\lfloor \mathcal{D}^n \sslash p^n \rfloor} {\arr {d_n} {\wk(U^n)} {d_n} \sub {\pi_{p^n}}} {\{A,u\} \sslash p^n} \\ &= \id(\wk(U^n) \sub {\pi_{p^n}} \sub {\{A,u\} \sslash p^n}, d_n \sub {\pi_{p^n}} \sub {\{A,u\} \sslash p^n})&\text{by endo-coherence removal}\\ &\equiv \id(\wk(U^n),d_n) \sub {\pi_{p^n} \bullet \{A,u\} \sslash p^n}\\ &= \id(\wk(U^n),d_n) \sub {\{A,u\}}&\text{by \cref{prop:prune-ty}}\\ &\equiv \id(\wk(U^n) \sub {\{A,u\}}, d_n \sub {\{A,u\}})\\ &\equiv \id(A,u) \end{align*} and so the equality holds as required. \end{proof} We therefore have that two terms \(s\) and \(t\) are equal in \Cattsu if and only if \(s \leftrightsquigarrow_{\su'} t\). To demonstrate normalisation, it therefore remains to show that the reduction system is confluent, for which we employ the strategy introduced in \cref{lem:conf-strat}. \begin{theorem} \label{thm:su-conf} The reduction \(\red_{\su'}\) is confluent. \end{theorem} \begin{proof} By \cref{lem:conf-strat} it is sufficient to show that for all \(t \leftsquigarrow s \rightsquigarrow u\) with \(s \rightsquigarrow t\) being a reduction derived from \textsc{rule}, the following diagram can be formed: \[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} & {\mathclap{=_{\dim(s)}}} & {u'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \] We therefore begin by case splitting on the reduction \(s \red t\), ignoring cases where both reductions are identical and ignoring cases which follow by symmetry of other cases. \textbf{Disc removal:} Suppose \(s \red t\) is a disc removal reduction. Then \(s \equiv \Coh {D^n} {\wk(U^n)} {\{A,t\}}\). We now split on the reduction \(s \red u\). We immediately know that \(s \red u\) cannot be an endo-coherence removal reduction, as \(s\) is not an endo-coherence. It also cannot be a cell reduction as \(\wk(U^n)\) only contains variables and so is in normal form. Let \(s \red u\) be an argument reduction. It must therefore be generated by a reduction on \(\{A,t\}\). If it is a reduction generated by \(A \red A'\) then \(u \red t\) by disc removal and so we are done. Otherwise, it is generated by \(t \red t'\), and so \(t\) and \(u\) both reduce to \(t'\), the latter by disc removal. The only remaining case is where \(s \red u\) is an instance of pruning, which forces \(t \equiv \id(B,a)\) for some \(B\) and \(a\). As \(s\) is well-formed, we must have \(n > 0\) and so \(A \equiv \arr b {A'} c\).
Therefore: \begin{align*} u &\equiv \Coh {\lfloor \mathcal{D}^{n} \sslash p \rfloor} {\wk(U^n) \sub {\pi_{p}}} {\{A,\id(B,a)\} \sslash p}\\ &\equiv \Coh {D^{n-1}} {\wk(U^n) \sub {\{\arr {d_{n-1}} {\wk(U^{n-1})} {d_{n-1}}, \id(\wk(U^{n-1}), d_{n-1})\} }} {\{A',b\}}&\text{by \cref{prop:prune-disc}}\\ &\equiv \Coh {D^{n-1}} {\arr {d_{n-1}} {\wk(U^{n-1})} {d_{n-1}}} {\{A',b\}}\\ &\equiv \id(A',b) \end{align*} Now as \(s\) is well-formed we have \(\Gamma \vdash \{A,\id(B,a)\} : D^n\) and so by \cref{lem:disc-typing}, we have \(\Gamma \vdash \id(B,a) : A\) and hence by \cref{cor:id-typing} and uniqueness of typing: \[ \arr a {B} a = A \equiv \arr b {A'} c\] and so \(a = b\) and \(B = A'\) and hence \(u \equiv \id(A', b) = \id(B,a) \equiv t\). Since \(\dim(a) = \dim(B) < \dim(s)\), we get \(t =_{\dim(s)} u\) as required. \textbf{Endo-coherence removal:} Suppose \(s \red t\) is an endo-coherence removal reduction. Then: \[ s \equiv \Coh {\Delta} {\arr a A a} {\sigma} \red \id(A \sub \sigma, a \sub \sigma) \equiv t\] with \(s\) not being an identity. We now split on the reduction \(s \red u\). First consider when it is an argument reduction generated by \(\sigma \red \tau\). Then by \cref{prop:red-sub}, we have \(t \equiv \id(A \sub \sigma, a \sub \sigma) \red^* \id(A \sub \tau, a \sub \tau)\). By endo-coherence removal, \(u \red \id(A \sub \tau, a \sub \tau)\), completing this case. Now suppose the reduction \(s \red u\) is an instance of cell reduction. If it is generated from a reduction \(A \red B\) then by \cref{prop:red-sub}, \(t \red \id(B \sub \sigma, a \sub \sigma)\) and by endo-coherence removal: \[u \equiv \Coh \Delta {\arr a B a} \sigma \red \id(B \sub \sigma, a \sub \sigma)\] We now consider when the reduction is generated by \(\arr a A a \red \arr b A a\), with the case where it is generated by \(\arr a A a \red \arr a A b\) following symmetrically. We consider the reduction sequence from \(u\): \begin{align*} u &\equiv \Coh \Delta {\arr b A a} {\sigma} \\ &\red \Coh \Delta {\arr b A b} \sigma &\text{by cell reduction}\\ &\red \id(A\sub \sigma,b\sub \sigma) &\text{by endo-coherence removal} \end{align*} Again by \cref{prop:red-sub}, \(t \equiv \id(A\sub\sigma,a\sub\sigma) \red \id(A\sub\sigma,b\sub\sigma)\), completing the case. Lastly, we consider when \(s \red u\) is a pruning reduction. We suppose \(\Delta = \lfloor \mathcal{D} \rfloor\) and that the pruning is generated from a peak \(p : \Peak_{\mathcal{D}}\). Then: \[ u \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {(\arr a A a) \sub {\pi_p}} {\sigma \sslash p}\] Then: \begin{align*} u &\red \id(A \sub {\pi_p} \sub {\sigma\sslash p},a \sub {\pi_p} \sub {\sigma\sslash p})&\text{by \cref{lem:always-ecr}}\\ &\equiv \id(A,a) \sub {\pi_p \bullet \sigma \sslash p} \\ &=_{\dim(s)} \id(A,a) \sub \sigma \end{align*} where the last (bounded) equality is by \cref{prop:prune-ty} and by noting that \(\dim(A) = \dim(a) < \dim(s)\). \textbf{Pruning:} Let \(s \red t\) be a reduction by pruning with \[ s \equiv \Coh {\lfloor \mathcal{D} \rfloor} A \sigma\] for some \(\mathcal{D} : \Dyck_0\) with peak \(p : \Peak_{\mathcal{D}}\) such that \(\lfloor p \rfloor \sub \sigma\) is an identity. Then: \[ t \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\] We now split on the reduction \(s \red u\). First suppose it is given by an argument reduction \(\sigma \red \tau\). Identities do not admit head reductions, meaning \(\lfloor p \rfloor \sub \tau\) is still an identity.
Therefore, pruning can be applied to \(u\) to get: \[ u \red \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\tau \sslash p}\] Now \(\sigma \sslash p\) is simply \(\sigma\) with two terms removed, and so \(\sigma \sslash p \red^* \tau \sslash p\), meaning \(t\) reduces to the same term as \(u\). If \(s \red u\) is a cell reduction \(A \red B\), then pruning can be applied to \(u\) immediately to get the term: \[\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {B \sub {\pi_p}} {\sigma \sslash p}\] but \(t\) also reduces to this term by \cref{prop:red-sub}. Let \(s \red u\) be a second pruning reduction, at a different peak \(q : \Peak_{\mathcal{D}}\). By \cref{prop:prune-conf}, there is a common reduct: \[ \Coh {\lfloor (\mathcal{D} \sslash p) \sslash q_p \rfloor} {A \sub {\pi_p} \sub{\pi_{q_p}}} {(\sigma \sslash p) \sslash q_p} \] to which both \(t\) and \(u\) reduce by pruning, provided that \(\lfloor q_p \rfloor\) and \(\lfloor p_q \rfloor\) are identities. However: \[\lfloor q_p \rfloor \equiv \lfloor q \rfloor \sub {\pi_p}\] and \(\lfloor q \rfloor\) must be an identity for \(s \red u\) to be a valid instance of pruning. Therefore, as identities are preserved by application of substitution, \(\lfloor q_p \rfloor\) is an identity. Similarly, \(\lfloor p_q \rfloor\) is an identity, and so both \(t\) and \(u\) reduce to the term above. Any remaining cases follow by symmetry, completing the proof. \end{proof} \subsection{Disc trivialisation} \label{sec:properties-cattsu} We take a brief moment to explore the theory \Cattsu in its entirety. For this section we will further assume that we take the set of operations \(\mathcal{O}\) to be the regular operations. We begin by proving a property of terms over disc contexts, which we call \emph{disc trivialisation}. This is the following structure theorem: in a disc context \(D^n\), every term is either a variable, or an iterated identity on a variable, up to definitional equality. If we restrict to those terms \(t : \Term_{D^n}\) that are full, that is those with \(\Supp(t) = \Var(D^n)\), then there is exactly one term (up to definitional equality) at each dimension \(k \geq n\). Hence, the type theory \Cattsu trivialises disc contexts. This property relates the type theory \Cattsu to the definition of strictly unital \(\infty\)-categories of \citeauthor{Batanin2013}~\cite{Batanin2013}, whose \emph{reduced operads} enforce that there is a unique term of each dimension over a linear tree. We now state and prove disc trivialisation, recalling the definition of an iterated canonical identity from \cref{def:canonical-id}. \begin{theorem}[Disc trivialisation] \label{thm:disc-triv} Suppose \(D^n \vdash t : A\) in \Cattsu. Then \(t\) is equal to an iterated canonical identity on a variable, that is \(t = \id^k(x)\) for some variable \(x \in \Var(D^n)\) and \(k \in \mathbb{N}\). \end{theorem} \begin{proof} Without loss of generality, we may assume that \(t\) is in \Cattsu normal form, and proceed to prove that \(t\) is an iterated canonical identity. We proceed by induction on subterms of the term \(t\). If \(t\) is a variable then we are done. Otherwise, we assume \(t\) is a coherence term \(\Coh \Delta U \sigma\). We now show that \(\Delta\) must be a disc context by contradiction. We therefore assume that \(\Delta\) is not a disc, and hence \(t\) is not an identity. By induction on subterms, we must have that each term in \(\sigma\) is an iterated canonical identity on a variable.
No locally maximal argument can be an identity, as otherwise pruning could be performed and \(t\) would not be in normal form, and so every locally maximal argument is a variable. Suppose there is some variable \(x\) such that \(x \sub\sigma\) is an identity, and let \(x\) be a variable of maximal dimension with this property. As \(x\) cannot be locally maximal, it must be either the source or the target of some variable \(y\); by maximality of \(\dim(x)\), this variable \(y\) must be sent to a variable of \(D^n\), which cannot have an identity as its source or target. Therefore, the substitution \(\sigma\) is variable to variable. We now let \(\Gamma\) be the smallest ps-context prefix of \(\Delta\) such that \(\Gamma\) is not a disc. We must have: \[ \Gamma \equiv D^k, (y : A), (f : \arr x A y)\] where \(D^k \vdash_{\mathsf{ps}} x : A\). Furthermore, the last rule used in this derivation must be \textsc{psd}, as if it were \textsc{pse} or \textsc{pss} then \(k = \dim(A)\) and \(\Gamma \equiv D^{k+1}\), breaking the assumption that \(\Gamma\) is not a disc. Therefore, \(D^k \vdash_{\mathsf{ps}} g : \arr w A x\) for some variables \(g\) and \(w\). However, now \(g \sub \sigma\), \(x \sub \sigma\), and \(f \sub \sigma\) are variables of \(D^n\) such that \(\tgt(g \sub \sigma) \equiv x \sub \sigma \equiv \src(f \sub \sigma)\). No such variables exist in \(D^n\) and so we reach a contradiction. We therefore deduce that \(\Delta\) is a disc \(D^n\) for some \(n\). Now \(t \equiv \Coh {D^n} {\arr u A v} \sigma\) and so by induction on subterms, \(u\) and \(v\) are equal to iterated canonical identities. We now split on whether \(t\) is a composition or an equivalence. If it is a composition then \(\Supp(u) = \bdry {n-1} - {D^n}\) and \(\Supp(v) = \bdry {n-1} + {D^n}\) and therefore neither \(u\) nor \(v\) is an identity (as then \(A\) would have the same support as \(u\) or \(v\) respectively) and so \(u = d_{n-1}^-\) and \(v = d_{n-1}^+\), but this makes \(t\) a disc removal redex, and so \(t\) is not in normal form. We therefore assume that \(t\) is an equivalence and \(u\) and \(v\) are full. Then \(u\) and \(v\) must be iterated identities on \(d_n\), and must have the same dimension and so are syntactically equal. To avoid \(t\) being an endo-coherence removal redex, it must be an identity \(\id(B,s)\). Now, \(s \equiv \id^k(x)\) for some variable \(x\) (as \(s\) is a subterm of \(t\)), and so if \(k = 0\) then \(\ty(s) \equiv d_{n-1}^- \to d_{n-1}^+\) and if \(k > 0\) then \(\ty(s) \equiv \id^{k-1}(x) \to \id^{k-1}(x)\). In either case, \(\ty(s)\) is in normal form, and so since \(B\) is also a normal form and \(\Gamma \vdash s : B\) (by the well-typing of \(t\) and \cref{cor:id-typing}), we have \(B \equiv \ty(s)\) and so \(t \equiv \id(B,s) \equiv \id^{k+1}(x)\) as required. \end{proof} Disc trivialisation allows us to prove the following results concerning terms and substitutions in pasting diagrams. \begin{theorem} Let \(\mathcal{D}\) be a Dyck word. Let \(t\) be a well-formed \Cattsu term of \(\lfloor \mathcal{D} \rfloor\). Then \(\Supp(t)\) is a ps-context. \end{theorem} \begin{proof} Suppose, for contradiction, that we have a Dyck word \(\mathcal{D}\) and a term \(t\) where \(\Supp(t)\) is not a ps-context. Assume further that \(\mathcal{D}\) is of minimal length admitting such a term. Immediately, \(\mathcal{D} \not\equiv \circleddash\), as all terms have non-empty support. We now examine the locally maximal variables of \(\mathcal{D}\).
There must exist some locally maximal variable \(f : x \to y\) such that \(f \not\in \Supp(t)\), as otherwise \(\Supp(t) = \Var(\lfloor \mathcal{D} \rfloor)\), which is a ps-context, contradicting our assumption. Now suppose that \(y \not\in \Supp(t)\). Then we let \(p\) be the peak corresponding to \(f\) and consider the term: \[t \sub {\pi_p} : \Term_{\lfloor \mathcal{D}\sslash p \rfloor}\] Then \(\Supp(t \sub {\pi_p}) = \Supp(t)\), which contradicts the minimality of \(\mathcal{D}\). By a similar argument, \(x\) must also be in \(\Supp(t)\). It is also the case that if such a variable \(f : x \to y\) with \(f \not\in \Supp(t)\) and \(\{x,y\} \subseteq \Supp(t)\) exists, then \(\Supp(t)\) cannot be a ps-context, by an argument involving the linear order on ps-contexts introduced by \citeauthor{finster2017type}~\cite{finster2017type}. Now suppose \(\mathcal{D}\) has a peak \(p\) that does not correspond to \(f\). Then \(f\sub{\pi_p} : x \sub{\pi_p} \to y \sub{\pi_p}\) is a locally maximal variable of \(\lfloor \mathcal{D} \sslash p \rfloor\) with \(f\sub{\pi_p} \not\in \Supp(t \sub {\pi_p})\) and \(\{x \sub {\pi_p}, y \sub {\pi_p}\} \subseteq \Supp(t \sub {\pi_p})\). Hence, \(\Supp(t \sub {\pi_p})\) is not a ps-context, again breaking the minimality of \(\mathcal{D}\). Therefore, \(\mathcal{D}\) has exactly one peak, and so \(\lfloor \mathcal{D} \rfloor \equiv D^n\) for some \(n\). Now by \cref{thm:disc-triv}, \(t\) is \Cattsu equal to a variable \(z\) or an iterated identity on a variable \(z\). Since \Cattsu preserves support, we must have \(\Supp(t) = \Supp(z)\), but \(\Supp(z)\) is a disc and so is a ps-context. Hence, no such term \(t\) exists. \end{proof} Since any \Catt term is also a \Cattsu term, we get the following corollary. \begin{corollary} \label{cor:supp-ps} If \(\Gamma \vdash t : A\) in \Catt, and \(\Gamma\) is a ps-context, then \(\Supp(t)\) is a ps-context. \end{corollary} \section{\texorpdfstring{\Cattsua}{Cattsua}} \label{sec:cattsua} We now move on to defining \Cattsua, the type theory for strictly unital and associative \(\infty\)-categories. \Cattsua extends \Cattsu by replacing the pruning equality with the more general insertion equality, which was introduced in \cref{sec:insertion}. Under certain conditions, insertion can merge more complex terms into a single coherence. As an example, the term \((f * g) * h\), which is a composite having a composite as one of its arguments, is reduced by insertion to the ternary composite \(f*g*h\), reducing the depth of the term. As we did for \Cattsu, we will prove in this section that \Cattsua satisfies standard meta-theoretic properties, and provide a reduction system for it which is strongly terminating and confluent. \begin{example} We consider the associator term, and its reductions in \Cattsua. The associator witnesses the associativity law in a weak \(\infty\)-category.
Letting \(\Delta\) be the following ps-context: \begin{alignat*}{2} \Delta = \lfloor [\emp,\emp,\emp] \rfloor ={}& (w : *),\\ &(x : *),&&(f : w \to x),\\ &(y : *),&&(g : x \to y),\\ &(z : *),&&(h : y \to z) \end{alignat*} we can define the associator as: \[ \alpha = \Coh \Delta {(f * g) * h \to f * (g * h)} {\id_\Delta}\] This then admits the following reduction sequence in \Cattsua: \begin{align*} \alpha &\rightsquigarrow \Coh \Delta {f*g*h \to f * (g * h)} {\id_\Delta}&\text{by insertion}\\ &\rightsquigarrow \Coh \Delta {f * g * h \to f * g * h} {\id_\Delta}&\text{by insertion}\\ &\rightsquigarrow \id(f*g*h) &\text{by endo-coherence removal} \end{align*} \end{example} We formally define \Cattsua as the version of \Cattr generated by the rule set \sua, which we define below: \begin{definition} We define the equality rule set \sua for \Cattsua by: \[ \sua = \dr \cup \ecr \cup \insert \] \Cattsua is then the variant of \Cattr where \(\mathcal{R} = \sua\). \end{definition} As before, when we do not specify an operation set, it should be assumed that the regular operation set is used. When we use the groupoidal operation set, we refer to the resulting theory as \emph{groupoidal \Cattsua}. \begin{theorem} \label{thm:sua-conds} The rule set \sua is tame and satisfies the support condition. If \(\mathcal{O}\) supports insertion, then \sua also satisfies the preservation condition. \end{theorem} \begin{proof} By \cref{prop:dr-weak,prop:dr-susp,prop:dr-sub,prop:ecr-props,prop:insert-tame}, each of the disc removal, endo-coherence removal, and insertion sets satisfies the weakening, suspension, and \(\sua\)-substitution conditions. It follows that \(\sua\) satisfies the weakening, suspension, and substitution conditions. Hence, \sua is tame. To prove that the support condition holds for \sua, we use the strategy introduced in \cref{sec:further-conditions} and instead show that \sua satisfies the \(\sua_{\mathsf{S}}\)-support condition. By \cref{lem:supp-sat-conds}, the equality rule set \(\sua_{\mathsf{S}}\), the restriction of \sua to support-preserving equalities, is also tame. As it trivially satisfies the support condition, we have by \cref{prop:dr-supp,item:ecr-supp,prop:insert-supp} that disc removal, endo-coherence removal, and insertion satisfy the \(\sua_{\mathsf{S}}\)-support condition. Therefore, \sua satisfies the \(\sua_{\mathsf{S}}\)-support condition and so by \cref{lem:proof-strat-supp} \sua satisfies the support condition. The \sua-preservation condition is satisfied by disc removal (by \cref{prop:dr-preserve}) and endo-coherence removal (by \cref{item:ecr-preserve}). If \(\mathcal{O}\) supports insertion, then insertion also satisfies the \sua-preservation condition by \cref{prop:insert-preserve}. Therefore, \sua satisfies the preservation condition, completing the proof. \end{proof} While the groupoidal operation set trivially supports insertion, we have not yet proven that the regular operation set, \Reg, supports insertion. This is done now using \cref{thm:sua-conds}. \begin{proposition} The regular operation set, \Reg, supports insertion. \end{proposition} \begin{proof} Using that the regular operation set is equal to the standard operation set, we instead prove that the standard operation set supports insertion.
For this it will be sufficient to prove, for an insertion point \((S, P, T)\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\), that: \[ \bdry n \epsilon S \sub {\kappa_{S,P,T}} = \bdry n \epsilon {\insertion S P T}\] Then: \begin{align*} \bdry n \epsilon S \sub {\kappa_{S,P,T}} &= \Supp(\stdtm {\bound n S} n \sub {\incbd n \epsilon S}) \sub {\kappa_{S,P,T}}&\text{by \cref{lem:std-supp}}\\ &= \Supp(\stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}})\\ &= \Supp(\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}})&\text{by (*)}\\ &= \bdry n \epsilon {\insertion S P T} &\text{by \cref{lem:std-supp}} \end{align*} where the equality \((*)\) holds as \sua satisfies the support condition by \cref{thm:sua-conds} and: \[ \insertion S P T \vdash_\sua \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}} \] by \cref{thm:std-insert-props}. \end{proof} \subsection{Reduction for \texorpdfstring{\Cattsua}{Cattsua}} \label{sec:norm-cattsua} Using the results of \cref{sec:reduction}, we give a normalisation algorithm for \Cattsua by defining a reduction system which generates the equality relation and proving that this reduction system is strongly terminating and confluent. As with \Cattsu, we cannot use the reduction \(\red_\sua\) directly, as we have seen already that the reduction \(\red_\ecr\) alone is non-terminating. Similarly to pruning, allowing insertions into identity terms also creates non-terminating loops of reductions when combined with endo-coherence removal, as was explained in \cref{sec:reduction-cattsu}. We therefore restrict our reduction so that no head-reductions can be applied to identity terms. Although these restrictions are sufficient to ensure termination, we choose to further restrict the set of insertion reductions, in order to streamline the proof of confluence. Firstly, we only allow insertions of a locally maximal argument when that argument is either an identity or a standard composition. The motivation for this restriction is that identities and standard compositions are the only standard coherences that are in normal form. Moreover, not allowing the insertion of endo-coherences avoids a difficult insertion/argument endo-coherence removal confluence case. We also disallow insertions into a unary composite and insertions of a unary composite, as we have already seen in \cref{sec:further-properties} that discs act as a left and right unit for insertion, and so these two insertion reductions are subsumed by disc removal. Further, disallowing the insertion of discs removes another case where an insertable standard coherence is not in normal form. We now define the resulting reduction system. \begin{definition} Define the reduction \(\red_{\sua'}\) to be the reduction generated by the equality rule set \(\sua'\) where: \[\sua' = \dr \cup \ecr' \cup \insert'\] where \(\ecr'\) is the endo-coherence removal set without the identity-to-identity reductions, and \(\insert'\) is the insertion rule set restricted to insertion redexes \((S,P,T,\Gamma,L,M)\) and types \(A\) such that \(\SCoh S A L\) is not an identity or a unary composite, and \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\) is an identity or a standard composite which is not a unary composite.
\end{definition} It can be determined by a simple observation that \(\sua'\) is tame, as suspension and the application of substitution cannot transform a term into an identity or unary composite if it was not one before. We further justify the restrictions made to insertion by showing that many insertion reductions can still be performed, starting with the following technical lemma. \begin{lemma} \label{lem:insertion-change-max} If \(P\) is a branch of \(S\), and \(L, L' : S \to \Gamma\) are labellings differing only on \(\olsi P\), then the following holds for insertion redex \((S,P,T,\Gamma,L,M)\): \[\insertion L P M \equiv \insertion {L'} P M\] \end{lemma} \begin{proof} By inspection of the definition, \(\insertion L P M\) does not use the term \(L(\olsi P)\). \end{proof} We now show that many insertion reductions can still be simulated up to bounded equality. \begin{lemma} \label{lem:insertable} Let \((S,P,T,\Gamma, L, M)\) be an insertion redex. Further suppose that \(a \equiv \SCoh S A L\) is not an identity or disc. Then there exists a term \(s\) with: \[a \red_{\sua'}^* s =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\] even when \(L(\olsi P)\) is a unary composite or is not a standard composite or identity. \end{lemma} \begin{proof} We proceed by induction on \(\lh(P) - \dep(T)\). If \(\lh(P) - \dep(T) = 0\) then \(\stdcoh T {\lh(P)}\) is a composite. The only case for which insertion cannot be performed is when \(\stdcoh T {\lh(P)}\) is a unary composite, such that \(T = D^{\lh(P)}\). Now by \cref{lem:disc-insertion-2}, \(\insertion S P T \equiv S\), \(\insertion L P M \equiv^{\max} L\) and \(\kappa_{S,P,T} = \id_S\) and so \[a =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\] We now assume that \(\lh(P) > \dep(T)\). We may also assume without loss of generality that \(\stdcoh T {\lh(P)}\) is not an identity, as otherwise it would be immediately insertable. This allows us to perform endo-coherence removal to get: \[\stdcoh T {\lh(P)} \sub M \red \id(\stdty T {\lh(P) - 1}, \stdtm T {\lh(P) - 1}) \sub M\] Now suppose \(b \equiv \SCoh S A {L'}\) where \(L'\) is the result of applying the above reduction to the term of \(L\) corresponding to \(\olsi P\). Since \(L'(\olsi P)\) is now an identity it can be inserted to get \(b \red c\) where: \begin{align*} c &\equiv \SCoh {S \sslash P} {A \sub {\pi_P}} {\insertion {L'} P {(\{\stdtm T {\lh(P) - 1}\} \bullet M)}}\\ &\equiv \SCoh {S \sslash P} {A \sub {\pi_P}} {\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)}} \end{align*} where \(\stdtm T {\lh(P) - 1} \equiv \stdcoh T {\lh(P) - 1}\) as if \(\stdtm T {\lh(P) - 1}\) were a variable then \(\stdcoh T {\lh(P)}\) would be an identity. We now wish to show that \(2 + \bh(P) \leq \lh(P)\) so that \(P'\), the branch of \(S \sslash P\) obtained from \(P\) by \cref{lem:pruned-bp}, exists. Since we always have \(1 + \bh(P) \leq \lh(P)\), we consider the case where \(1 + \bh(P) = \lh(P)\). We know that \(\bh(P) \leq \dep(T) \leq \lh(P)\) and so one of these inequalities must be an equality. If \(\dep(T) = \lh(P)\) then \(\stdcoh T {\lh(P)}\) is a standard composite. If \(\dep(T) = \bh(P)\) then \(\th(T) = \dep(T)\) and so \(T\) is linear. However, this makes \(\stdcoh T {\lh(P)}\) an identity. Either case is a contradiction and so \(2 + \bh(P) \leq \lh(P)\) and so \(P'\) is a branch of \(S \sslash P\).
By \cref{lem:pruned-bp,lem:iota-kappa-comm}, we now have: \begin{align*} &\phantom{{}\equiv{}}\olsi {P'} \sub {\insertion{L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)}} \\ &\equiv d_{\lh(P) - 1} \sub {\iota_{S,P,D^{\lh(P) - 1}} \bullet (\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} \\ &\equiv d_{\lh(P) - 1} \sub {\{\stdcoh T {\lh(P) - 1}\} \bullet M} \\ &\equiv \stdcoh T {\lh(P) - 1}\sub M \end{align*} As \(\lh(P') - \dep(T) = \lh(P) - \dep(T) - 1\) we can use the induction hypothesis to get that \(c \leadsto^* d\) and: \begin{align*} d =_{\dim(a)}{} &\SCoh {\insertion {(S \sslash P)} {P'} T} {A \sub {\pi_P \bullet \kappa_{S\sslash P,P',T}}} {\\&\insertion {(\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} {P'} {M}} \end{align*} By \cref{lem:pruned-bp,lem:insertion-change-max}, \begin{equation*} d =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \end{equation*} which completes the proof as \(a \leadsto^* d\). \end{proof} We further show that insertions into discs can be simulated by disc removal. \begin{lemma} \label{lem:disc-insertion-red} Let \((D^n,P,T,\Gamma,L,M)\) be an insertion redex and let \(a \equiv \stdcoh {D^n} n \sub L\). Then: \[ a \red_{\sua'} s =_{n} \SCoh {\insertion {D^n} P T} {\stdty {D^n} n \sub \kappa} {\insertion L P M}\] \end{lemma} \begin{proof} We have the equality: \begin{align*} \SCoh {\insertion {D^n} P T} {\stdty {D^n} n \sub \kappa} {\insertion L P M} &\equiv \SCoh T {\stdty {D^n} n \sub {\kappa_{D^n,P,T}}} M&\text{by \cref{lem:disc-insertion-1}}\\ &=_n \SCoh T {\stdty T n} M&\text{by \cref{thm:std-insert-props}}\\ &\equiv \stdcoh T n \sub M\\ &\equiv L(\olsi P) \end{align*} Therefore, the reduction \(a \red s \equiv L(\olsi P)\) is given by disc removal. \end{proof} Using these lemmas, we now show that the type theories \Cattsua and \(\Catt_{\sua'}\) are equivalent. \begin{proposition} The type theories generated by \sua and \sua' are equivalent. Terms, types, and substitutions are equal or well-formed in one theory exactly when they are equal or well-formed in the other. \end{proposition} \begin{proof} Both directions proceed by \cref{lem:subset-lem}. Since \(\sua' \subseteq \sua\), it suffices to show that if \((\Gamma,s,t) \in \sua\) with \(\Gamma \vdash_{\sua'} s : A\) for some type \(A\) then: \[ \Gamma \vdash_{\sua'} s = t\] If \((\Gamma,s,t) \in \sua'\), then there is nothing to do. If it is in \(\ecr\), then the argument is the same as in the proof of \cref{prop:suprime-equiv}. We therefore assume \((\Gamma,s,t) \in \insert\), and so there must be some insertion redex \((S,P,T,\Gamma,L,M)\) such that \(s \equiv \lfloor \SCoh S B L \rfloor\) and \[ t \equiv \lfloor \SCoh {\insertion S P T} {B \sub {\kappa_{S,P,T}}} {\insertion L P M} \rfloor \] By an induction on dimension, we assume that the theories generated by \sua and \(\sua'\) are already equivalent for terms of dimension less than \(\dim(s)\). We begin a case analysis of such reductions that are not in \(\insert'\). If \(s\) is an identity, then \(B \equiv b \to b\) for some term \(b\) and so \(t\) is an endo-coherence. If \(t\) is already an identity, then \(s \equiv t\). Otherwise: \begin{align*} \Gamma \vdash_{\sua'} t &= \id(b \sub {\kappa_{S,P,T}}) \sub {\insertion L P M}\\ &\equiv \id(b) \sub {\kappa_{S,P,T} \bullet (\insertion L P M)}\\ &= \id(b) \sub L\\ &\equiv s \end{align*} where the first equality is by endo-coherence removal, and the second equality is by \cref{lem:ins-comm-max}, appealing to the induction on dimension.
If \(s\) is a unary composite we apply \cref{lem:disc-insertion-red} and use the inductive hypothesis on dimension. Otherwise, we are done by \cref{lem:insertable} and the inductive hypothesis on dimension. \end{proof} Having shown that the reflexive symmetric transitive closure of the reduction \(\red_{\sua'}\) agrees with the equality of \Cattsua, we move on to showing that this reduction is strongly terminating. To do this we appeal to \cref{lem:termination-lem}, and show that all reductions reduce the syntactic complexity of the terms involved. \begin{lemma} \label{lem:insert-sc-prop} The following inequality holds for any insertion redex \((S,P,T,\Gamma,L,M)\): \[\sc(\insertion L P M) < \sc(L)\] \end{lemma} \begin{proof} We extend the notion of syntactic complexity to labellings in the obvious way. We begin by noting that: \begin{align*} \sc(L) &= \left(\bighash_{p \neq \olsi P} \sc(L(p))\right) \+ \sc(L(\olsi P))\\ &= \left(\bighash_{p \neq \olsi P} \sc(L(p))\right) \+ \sc(\stdcoh T {\lh(P)} \sub M)\\ &> \left(\bighash_{p\neq \olsi P} \sc(L(p))\right) \+ \sc(M) \end{align*} Further, we show, for all labellings \(L\) and \(M\) satisfying the appropriate conditions, that: \[\sc(\insertion L P M) \leq \bighash_{p\neq \olsi P} \sc(L(p)) \+ \sc(M) \] which we do by induction on \(P\). If \(P = [k]\) then it is clear that \(\insertion L P M\) contains all the terms of \(M\) and some terms of \(L\), and crucially not \(L(\olsi P)\). If instead \(P = k :: P_2\) then by induction hypothesis we get that: \[\sc(\insertion {L_k} {P_2} {M_0}) \leq \bighash_{p\neq \olsi {P_2}} \sc(L_k(p)) \+ \sc(M_1)\] It is then clear again that \(\insertion L P M\) contains terms from \(M\) and terms of \(L\) which are not \(L(\olsi P)\), and so the inequality holds. \end{proof} We can now show that insertion reductions reduce syntactic complexity. \begin{proposition} \label{prop:insert-sc} Let \(s \red t\) be an instance of insertion. If \(s\) is not an identity then \(\sc(s) > \sc(t)\). \end{proposition} \begin{proof} Let \((S,P,T,\Gamma,L,M)\) be an insertion redex so that: \[\SCoh S A L \red \SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M}\] by insertion. By assumption \(\SCoh S A L\) is not an identity. Then: \begin{align*} \sc(t) &= \sc(\SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M})\\ &\leq 2\omega^{\dim(A)} \+ \sc(\insertion L P M)\\ &< 2\omega^{\dim(A)} \+ \sc(L) &\text{by \cref{lem:insert-sc-prop}}\\ &\leq \sc(\SCoh S A L)\\ &= \sc(s) \end{align*} and so \(\sc(s) > \sc(t)\), completing the proof. \end{proof} \begin{corollary} The reduction system \(\red_{\sua'}\) is strongly terminating. \end{corollary} \begin{proof} By \cref{lem:termination-lem}, it suffices to show that each rule of \(\sua'\) reduces syntactic complexity, which follows from \cref{prop:disc-rem-sc,prop:ecr-sc,prop:insert-sc}. \end{proof} \subsection{Confluence of \texorpdfstring{\Cattsua}{Cattsua}} \label{sec:confluence-cattsua} In this section, we prove the following theorem: \begin{theorem} \label{thm:sua-conf} The reduction \(\red_{\sua'}\) is confluent. \end{theorem} The confluence proof for \Cattsua is significantly more complex than the corresponding proof for \Cattsu. The primary difficulty with \Cattsua is that a term can have an insertion redex where the term to be inserted admits a head reduction. In particular, consider the case where \(a \equiv \SCoh S A L \red b\) is an instance of insertion along some branch \(P\), and \(a \red c\) is an insertion on the argument \(L(\olsi P)\).
The difficulty of this critical pair is that \(L(\olsi P)\) need not be in head normal form, and furthermore, the reduction \(a \leadsto c\) can make the original insertion invalid. This does not occur in the predecessor theory \Cattsu, where only identities can be pruned, and all reducts of identities are again identities. We will prove this theorem using \cref{lem:conf-strat}. It is therefore sufficient to show that whenever \(b \leftsquigarrow a \red c\), with \(a \red b\) being a reduction derived from \textsc{rule}, the following diagram can be formed: \[ \begin{tikzcd}[column sep=tiny] && a \\ b &&&& c \\ & {b'} & {\mathclap{=_{\dim(a)}}} & {c'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \] We split by cases on the reduction \(a \red b\), ignoring cases where both reductions are identical and ignoring cases which follow by symmetry of other cases. Any cases which do not mention insertion will follow from an identical argument to the one given in \cref{thm:su-conf}, and so we omit these here. We can therefore assume without loss of generality that \(a \leadsto b\) is an insertion along redex \((S,P,T,\Gamma,L,M)\) such that \(a\) is not an identity or unary composite and \(\stdcoh T {\lh(P)}\) is an identity or a standard composite which is not unary. We now split on the reduction \(a \red c\). \paragraph{Insertion on the inserted argument \(\bm{L(\olsi P)}\)} Suppose \(\stdcoh T {\lh(P)} \sub M\) admits an insertion along redex \((T, Q, U, \Gamma, M, N)\). Then: \[\stdcoh T {\lh(P)} \sub M \red \SCoh {\insertion T Q U} {\stdty T {\lh(P)}\sub{\kappa_{T,Q,U}}} {\insertion M Q N}\] We then have \(c \equiv \SCoh S A {L'}\) where \(L'\) is \(L\) with the reduction above applied. We can conclude that \(\stdcoh T {\lh(P)}\) must be a composite (i.e.\ not an identity) as otherwise the second insertion would not be possible. Similarly, \(T\) cannot be linear as otherwise \(\stdcoh T {\lh(P)}\) would be a unary composite. We now need the following lemmas, the second of which is a directed version of \cref{thm:std-insert-props} with more conditions. \begin{lemma} \label{lem:comp-to-tm} For all \(n\) and \(S\), \(\stdcoh S n \red^* \stdtm S n\). \end{lemma} \begin{proof} The only case in which \(\stdcoh S n \neq \stdtm S n\) is when \(S = D^n\), in which case a single disc removal gives the required reduction. \end{proof} \begin{lemma} \label{lem:standard-type-exterior-reduct} Let \((S,P,T)\) be an insertion point. Then if \(S\) is not linear or \(n \leq \dep(S)\), we have \(\stdty S n \sub{\kappa_{S,P,T}} \red^* \stdty {\insertion S P T} n\), and if \(\dep(S) \leq n\) and either \(S\) is not linear or \(\dep(S) = n\), then \(\stdtm S n \sub{\kappa_{S,P,T}} \red^* \stdtm {\insertion S P T} n\). \end{lemma} \begin{proof} We proceed by induction on \(n\), starting with the statement for types. If \(n = 0\) then both standard types are \(\star\), so we are done.
Otherwise, we have: \begin{alignat*}{3} \stdty S {n+1} \sub {\kappa_{S,P,T}} \equiv{} &\stdtm {\bound n S} n \sub {\incbd n - S} \sub {\kappa_{S,P,T}} &\qquad& \stdty {\insertion S P T} {n+1} \equiv{}&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n - {\insertion S P T}} \\ &\to_{\stdty S n \sub {\kappa_{S,P,T}}} &&&&\to_{\stdty {\insertion S P T} n}\\ &\stdtm {\bound n S} n \sub {\incbd n + S} \sub {\kappa_{S,P,T}}&&&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n + {\insertion S P T}} \end{alignat*} By inductive hypothesis: \(\stdty S n \sub{\kappa_{S,P,T}} \red^* \stdty {\insertion S P T} n\), and so we need to show that: \[\stdtm {\bound n S} n\sub{\incbd n \epsilon S \bullet \kappa_{S,P,T}} \red^* \stdtm {\bound n {\insertion S P T}} n \sub{\incbd n \epsilon {\insertion S P T}}\] We now note that either the conditions for \cref{lem:insertion-bd-1} or \cref{lem:insertion-bd-2} must hold. If the conditions for \cref{lem:insertion-bd-1} hold then (as everything is well-formed in \Catt) we get that the required reduction is trivial. Therefore, we focus on the second case. Here we get from \cref{lem:insertion-bd-2} that: \[\stdtm {\bound n S} n\sub{\incbd n \epsilon S \bullet \kappa_{S,P,T}} \equiv \stdtm {\bound n S} n\sub{\kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n \epsilon {\insertion S P T}}\] Then we can apply the inductive hypothesis for terms as if \(n \leq \dep(S)\) then \(\dep(\bound n S) = n\) and otherwise \(\bound n S = S\) is not linear, and so we get the required reduction. Now we move on to the case for terms. If \(\stdtm S n\) is a variable, then we must have that \(S\) is linear and so \(S = D^n\). We must also have in this case that \(\stdtm S n = \olsi P\). Then by \cref{lem:iota-kappa-comm}, \(\stdtm S n \sub {\kappa_{S,P,T}} \equiv \stdcoh T n \sub {\iota_{S,P,T}}\) and then by \cref{lem:disc-insertion-1,lem:comp-to-tm} this reduces to \(\stdtm {\insertion S P T} n\) as required. If \(\stdtm S n\) is not a variable, then \(\stdtm S n \equiv \stdcoh S n\), and \(\stdcoh S n\) cannot be an identity (as either \(S\) is non-linear or \(n = \dep(S)\)). By \cref{lem:iota-kappa-comm} and other assumptions we get that \(\stdcoh S n \sub {\kappa_{S,P,T}}\) admits an insertion along branching point \(P\) and so: \begin{alignat*}{2} \stdtm S n\sub{\kappa_{S,P,T}} &\equiv{} &&\stdcoh S n\sub {\kappa_{S,P,T}}\\ &\red{} &&\SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}}}\\ &\equiv{} &&\SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\id}\\ &\red^*{} &&\SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id}\\ &\equiv{} &&\stdcoh {\insertion S P T} n\\ &\red^*{} &&\stdtm {\insertion S P T} n \end{alignat*} where the second equivalence comes from \cref{lem:kappa-iota-insert}, the second reduction from the inductive hypothesis (which is well-founded as the proof for types only uses the proof for terms at strictly lower values of \(n\)), and the last reduction from \cref{lem:comp-to-tm}. \end{proof} By this lemma (as \(T\) is not linear), we have \[\stdty T {\lh(P)}\sub{\kappa_{T,Q,U}} \red^* \stdty {\insertion T Q U} {\lh(P)}\] and so \(\stdcoh T {\lh(P)} \sub M \red^* \stdcoh {\insertion T Q U} {\lh(P)} \sub {\insertion M Q N}\). Let \(c'\) be the term obtained by applying this further reduction to the appropriate argument.
Now by \cref{lem:insert-lin-height}, we have that \(\th(\insertion T Q U) \geq \th(T)\) and so by \cref{lem:insertable}, there is \(c' \leadsto^* c''\) with: \begin{equation*} c'' =_{\dim(a)} \SCoh {\insertion S P {(\insertion T Q U)}} {A \sub {\kappa_{S,P,\insertion T Q U}}} {\insertion L P {(\insertion M Q N)}} \end{equation*} We now examine how \(b\) reduces. As \(T\) is not linear, there is a branch \(\insertion S P Q\) of \(\insertion S P T\) and we get the following by \cref{lem:ins-comm-max}: \begin{equation*} \olsi {\insertion S P Q} \sub {\insertion L P M} \equiv \olsi Q \sub {\iota_{S,P,T} \bullet (\insertion L P M)} \equiv \olsi Q \sub M \equiv \stdcoh U {\lh(Q)}\sub N \end{equation*} Since \(\th(U) \geq \bh(Q) = \bh(\insertion S P Q)\) we can reduce \(b\) to \(b'\) by insertion as follows: \begin{equation*} b' \equiv{} \SCoh {\insertion {(\insertion S P T)} {\insertion S P Q} U} {A \sub {\kappa_{S,P,T} \bullet \kappa_{\insertion S P T, \insertion S P Q, U}}} {\insertion {(\insertion L P M)} {\insertion S P Q} N} \end{equation*} and then by \cref{lem:inserted-insertion} we get \(b' =_{\dim(a)} c''\) as required. \paragraph{Argument reduction on the inserted argument \(\bm{L(\olsi P)}\)} Suppose \(M \leadsto M'\), and \(L'\) is \(L\) but with the argument for \(\olsi P\) replaced by \(\stdcoh T {\lh(P)} \sub {M'}\), such that \(L \red L'\) and \(a \red c \equiv \SCoh S A {L'}\). Then \(c\) admits an insertion and reduces as follows: \[c \leadsto c' \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion {L'} P {M'}}\] Since each term in \(\insertion {L} P {M}\) is a term of \(L\) or a term of \(M\), we can simply apply the same reductions from \(L \red L'\) and \(M \red M'\) to get \(\insertion L P M \red^* \insertion {L'} P {M'}\). Therefore, \(b \red^* c'\). \paragraph{Other reduction on the inserted argument \(\bm{L(\olsi P)}\)} The argument \(L(\olsi P)\) is either a standard composite which is not unary or an identity. Therefore, the type contained in the coherence is in normal form and hence a cell reduction cannot be applied. Further, disc removal cannot be applied, as \(L(\olsi P)\) is not a unary composite, and endo-coherence removal cannot be applied as if \(L(\olsi P)\) is an endo-coherence then it is an identity. Hence, there are no other reductions that can be applied to the inserted argument and so this case is vacuous. \paragraph{Reduction of non-inserted argument} Suppose \(L \leadsto L'\) along an argument which is not \(\olsi P\) and \(c \equiv \SCoh S A {L'}\). Then as \(L'(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\), an insertion can still be performed on \(c\) to get: \[ c \leadsto c' \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion {L'} P M}\] Since the terms of \(\insertion L P M\) are a subset of the terms of \(L\) and \(M\), we get \(\insertion L P M \red^* \insertion {L'} P M\) and so \(b \red^* c'\). \paragraph{Disc removal} By assumption, insertion cannot be applied to unary composites, and so this case is vacuous. \paragraph{Endo-coherence removal} Suppose \(A \equiv \arr s B s\) and \(a \red c\) by endo-coherence removal.
In this case \(c \equiv \id(A,s) \sub L\) and \[ b \equiv \Coh {\insertion S P T} {(\arr s B s) \sub {\kappa_{S,P,T}}} {\insertion L P M}\] which reduces by endo-coherence removal to: \[b' \equiv \id(A, s) \sub {\kappa_{S,P,T} \bullet (\insertion L P M)}\] By \cref{lem:ins-comm-max}, we have that \(\kappa_{S,P,T} \bullet (\insertion L P M) =_{\dim(S)} L\) and so \(b' =_{\dim(S)} c\) and since \(\dim(S) \leq \dim(a)\), we get \(b' =_{\dim(a)} c\) as required. \paragraph{Cell reduction} If \(A \red B\) and \(c \equiv \SCoh S B L\) from cell reduction, then if \(c\) is not an identity or a disc it admits an insertion to reduce to: \[c' \equiv \SCoh {\insertion S P T} {B \sub {\kappa_{S,P,T}}} {\insertion L P M}\] As reduction is compatible with substitution, \(b\) also reduces to~\(c'\). If instead \(c\) is an identity then \begin{align*} b &\equiv \SCoh {\insertion {D^n} P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\\ &\red \SCoh {\insertion {D^n} P T} {\stdty {D^n} {n+1}\sub {\kappa_{S,P,T}}} {(\insertion L P M)}\\ &\red^* \id(d_n) \sub {\kappa_{S,P,T} \bullet \insertion L P M}\\ &=_{n+1} \id(d_n) \sub L\\ &\equiv c \end{align*} where the second reduction is due to \cref{lem:always-ecr} and the equality is due to \cref{lem:ins-comm-max}. If \(c\) is a disc then \cref{lem:disc-insertion-red} can be applied to get that \(c\) reduces to a term \(c''\) with \(c'' =_{n+1} c'\) and \(b \red c'\), completing this case. \paragraph{Insertion} Suppose \(a \leadsto c\) is also an insertion, along a branch \(Q\) of \(S\). We now split on whether \(\olsi P = \olsi Q\). First suppose \(\olsi P = \olsi Q\); then by \cref{lem:insertion-irrel}, we have \(b =_{\dim(a)} c\). Suppose now that \(\olsi P \neq \olsi Q\), and that \(L(\olsi Q) \equiv \stdcoh U {\lh(Q)} \sub N\), such that: \[c \equiv \SCoh {\insertion S Q U} {A \sub {\kappa_{S,Q,U}}} {\insertion L Q N}\] We now consider the case where \(b\) is an identity. As \(P\) and \(Q\) are distinct branches of \(S\), we must have that \(S\) itself is not linear. Therefore, the insertion along \(P\) must be an insertion of an identity. Further, for \(b\) to have the correct type for an identity, we must have that \(A \sub {\pi_P} \equiv \SPath(\olsi Q) \to \SPath(\olsi Q)\). The only path sent to \(\olsi Q\) by \(\pi_P\) is \(\olsi Q\) itself, and so \(A \equiv \SPath(\olsi Q) \to \SPath(\olsi Q)\). Now, by \cref{lem:iota-kappa-comm}: \begin{align*} c &\equiv \SCoh {\insertion S Q U} {\stdcoh U {\lh(Q)} \sub {\iota} \to \stdcoh U {\lh(Q)} \sub {\iota}} {\insertion L Q N}\\ &\red \id(\stdcoh U {\lh(Q)} \sub \iota) \sub {\insertion L Q N}&\text{ by endo-coherence removal}\\ &\equiv \id(\stdcoh U {\lh(Q)}) \sub N & \text{by \cref{lem:ins-comm-max}} \end{align*} Then, \(\insertion L P M\) sends \(\olsi Q\) to \(L(\olsi Q) \equiv \stdcoh U {\lh(Q)} \sub N\), and so \(b \equiv \id(\stdcoh U {\lh(Q)}) \sub N\). The case where \(c\) is an identity is symmetric, so we now consider when neither \(b\) nor \(c\) is an identity.
We now observe that \(b\) and \(c\) further reduce as follows: \begin{align*} b &\red^* b' =_{\dim(a)} \SCoh {\insertion {(\insertion S P T)} {\insertion Q P T} U} {A\sub{\kappa_{S,P,T} \bullet \kappa_{\insertion S P T, \insertion Q P T, U}}} {\insertion {(\insertion L P M)} {\insertion Q P T} N}\\ c &\red^* c' =_{\dim(a)} \SCoh {\insertion {(\insertion S Q U)} {\insertion P Q U} T} {A\sub{\kappa_{S,Q,U} \bullet \kappa_{\insertion S Q U, \insertion P Q U, T}}} {\insertion {(\insertion L Q N)} {\insertion P Q U} M} \end{align*} We show that the first reduction is valid, with the validity of the second holding by symmetry. If \(b\) is a unary composite then we apply \cref{lem:disc-insertion-red} to obtain a suitable \(b'\). Otherwise, we obtain the reduction via insertion, noting that: \begin{align*} \olsi{\insertion Q P T} \sub {\insertion L P M} &\equiv \olsi{Q} \sub{\kappa} \sub{\insertion L P M}\\ &\equiv L(Q)\\ &\equiv \stdcoh U {\lh(Q)}\sub N\\ &\equiv \stdcoh U {\lh(\insertion Q P T)}\sub N \end{align*} as required for the insertion, with the third equality coming from \cref{lem:ins-comm-max}. Lastly, the trunk height condition is satisfied as \(\bh(Q) = \bh(\insertion Q P T)\). Therefore, both reductions are valid. We now need the following lemma to complete the proof: \begin{lemma} Let \((S, P, T, \Gamma, L, M)\) be an insertion redex. Then: \[ \insertion L P M =_{\bh(P)+1} \insertionprime L P M\] \end{lemma} \begin{proof} By \cref{prop:insertion-prime-eq}, the two labellings are equal. By inspection of the definition, the maximum dimension of terms that differ is \(\bh(P)\). \end{proof} By the above and \cref{lem:insertion-different}, \(b' =_{\dim(a)} c'\). This completes all cases of \cref{thm:sua-conf}. \section{Towards normalisation by evaluation} \label{sec:towards-nbe} In this section, the Rust implementation of \Catt, \Cattsu, and \Cattsua, which can be found at \cite{alex_rice_2024_10964705}, is introduced. This implementation takes the form of an interpreter, allowing terms of \Catt to be written in a convenient syntax which can be mechanically checked. The implementation aids the user in writing \Catt terms by automatically constructing standard composites, allowing terms to be bound to top-level names, implicitly suspending terms, automatically filling arguments which are not locally maximal, and providing informative error messages to the user when typechecking fails. We highlight three points of our implementation: \begin{itemize} \item The typechecker uses \emph{bidirectional typing}~\cite{10.1145/3450952} to mix ``inference'' and ``checking'' rules. Although types for \Catt can always be inferred, we find ourselves in the unusual situation where in some cases the context a term lives in can be inferred, and in some cases it must be provided. We expand on this type system in \cref{sec:typechecking}. \item Tree contexts (see \cref{sec:trees}) are given an explicit representation in the tool. The syntax in the theory is then split into syntax over a tree context and syntax over an arbitrary context. Syntax over a tree context can then use paths instead of de Bruijn levels to reference positions in the context, and substitutions from tree contexts can be given by labellings. We explore this syntax in \cref{sec:nbe-syntax}. \item During typechecking, the equality between types must be checked, which is done by syntactically comparing the normal form of each type.
In this implementation, an approach inspired by \emph{normalisation by evaluation} is taken, as opposed to the reduction-based approaches used in the previous sections. \end{itemize} Normalisation by evaluation (NbE; see \cite{abel2013normalization} for an introduction) can be viewed as a method of evaluating terms with ``unknowns''. Equivalently, NbE defines a semantic model of the theory, and interprets each constructor of the type theory in these semantics. When equipped with a method for transforming elements of this model back to terms of the type theory (referred to as \emph{quoting}), the normal form of a term can be calculated directly by recursion on its structure. Compared to the reduction-based approach taken in the previous sections, which simplifies the term via a series of locally applied reduction rules, NbE takes a more global approach, deconstructing the original term and using it to synthesise a normal form. The form of NbE implemented in the tool is largely inspired by the paper \citetitle{gratzer2019implementing}~\cite{gratzer2019implementing}, although we note that the form of the theory \Catt is vastly different to the modal type theory they present; \Catt does not have lambda abstraction or application in the usual sense, which makes adapting NbE techniques from the literature difficult. Nevertheless, the overall form of the evaluation is similar. \begin{figure}[ht] \centering % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXHRleHR7UmF3IHN5bnRheH0iXSxbMCwyLCJcXHRleHR7Q2hlY2tlZCBzeW50YXh9Il0sWzAsNCwiXFx0ZXh0e05vcm1hbC1mb3JtIHN5bnRheH0iXSxbMCwxLCJcXG1hdGhzZntjaGVja30iLDAseyJjdXJ2ZSI6LTR9XSxbMCwxLCJcXG1hdGhzZntpbmZlcn0iLDIseyJjdXJ2ZSI6NH1dLFsxLDAsIlxcbWF0aHNme3RvXFxfcmF3fSIsMV0sWzEsMiwiXFxtYXRoc2Z7ZXZhbH0iLDAseyJjdXJ2ZSI6LTR9XSxbMiwxLCJcXG1hdGhzZntxdW90ZX0iLDAseyJjdXJ2ZSI6LTR9XV0= % tex-fmt: skip \begin{tikzcd} {\text{Raw syntax}} \\ \\ {\text{Core syntax}} \\ \\ {\text{Normal form syntax}} \arrow["{\mathsf{check}}", curve={height=-24pt}, from=1-1, to=3-1] \arrow["{\mathsf{infer}}"', curve={height=24pt}, from=1-1, to=3-1] \arrow["{\mathsf{to\_raw}}"{description}, from=3-1, to=1-1] \arrow["{\eval}", curve={height=-24pt}, from=3-1, to=5-1] \arrow["{\quote}", curve={height=-24pt}, from=5-1, to=3-1] \end{tikzcd} \caption{Implementation overview.} \label{fig:overview} \end{figure} A high-level overview of the implementation is given in \cref{fig:overview}. We pause to explain the purpose of each component: \begin{itemize} \item The \emph{raw syntax} is the syntax that the user of the tool interacts with. We maintain no invariants over the well-formedness of the raw syntax, and it allows the user to omit arbitrary arguments. The primary purpose of the raw syntax is to be the target of parsing, and conversely to facilitate the pretty-printing of terms. We also specify a command language around this raw syntax which is used to interact with the tool. \item The \emph{core syntax} is the result of the typechecking procedure. Syntax of this form is known to be well-formed, and all implicit arguments have been filled in at this point. The terms of this syntax resemble the structured terms of \cref{sec:structured-terms}, with various common operations of \Catt being defined as constructors. In contrast to previous representations of \Catt in this thesis, the application of substitution is treated as a term former, instead of an operation. \item The \emph{normal form syntax} represents the normal forms of each of the type theories \Cattsua, \Cattsu, and \Catt itself.
This syntax is also always assumed to be well-formed, and is the closest to the original syntax of \Catt. \item The \textsf{eval} and \textsf{quote} functions convert syntax between core syntax and normal form syntax. For each constructor in the core syntax, evaluation computes the result of the corresponding operation, quotienting by the rules of \Cattsu or \Cattsua when applicable. We note that despite \Catt itself having no computation, evaluation must still process operations such as suspension and substitution application. Quotation converts normal form syntax back to core syntax, and in our case is a trivial inclusion. \item The \textsf{infer} and \textsf{check} functions perform typechecking while converting raw syntax into core syntax. Both functions are mutually dependent, and may also need to convert types to normal form syntax to check equality. The \textsf{to\_raw} functions ``forget'' that a piece of core syntax is well-formed, returning a piece of raw syntax, and can optionally remove all non-locally maximal arguments from terms. \end{itemize} In the following subsections, we expand on these points, fully defining each class of syntax, and describing the typechecking and evaluation procedures. \subsection{Syntax} \label{sec:nbe-syntax} Before defining each of the syntactic classes in the tool, we introduce some common notation that will be used in the definitions below: \begin{itemize} \item The letter \(v\) will be used to represent \emph{names} in the syntax: strings that represent a valid identifier. \item A \(\mathsf{Maybe}(x)\) is either of the form \(\mathsf{Some}(x)\) or \(\mathsf{None}\). \item The notation \(\mathsf{Tree}(x)\) represents a tree structure which is given by a list of \(x\)'s which we call the \emph{elements} and a list of trees, which we call the \emph{branches}, whose length is one less than the list of elements. These resemble labellings from \cref{sec:tree-contexts}, but will allow trees to be labelled with arbitrary objects. \end{itemize} We begin our study of the syntax with the raw syntax, which is defined by the following grammar: \begin{alignat*}{4} &(\text{Terms})&\quad&s,t &\ \ &{}::={}&\ \ &{v} \mid \mathsf{coh}[T:A] \mid \_ \mid \mathsf{id} \mid \mathsf{comp} \mid \mathsf{inc}_n^m(s) \mid s \sub \sigma \mid \Sigma(s)\\ &(\text{Types})&&A&&{}::={}&& \star \mid \arr s {\mathsf{Maybe}(A)} t \mid \_ \mid A \sub \sigma \mid \Sigma(A)\\ &(\text{Arguments})&&\sigma&&{}::={}&& (\mathsf{Tree}(\mathsf{Maybe}(s)), \mathsf{Maybe}(A)) \mid (\mathsf{Maybe}(A), s_0,\dots,s_n)\\ &(\text{Contexts})&&\Gamma&&{}::={}&& T \mid (v_0 : A_0), \dots, (v_n : A_n)\\ &(\text{Tree Contexts})&&T&&{}::={}&& \mathsf{Tree}(\mathsf{Maybe}(v)) \end{alignat*} The primary purpose of the raw syntax is to accurately represent the written plaintext syntax. Each constructor is written in plaintext exactly as it appears above, apart from a few cases: \begin{itemize} \item The application of substitution \(s \sub \sigma\) and \(A \sub \sigma\) is simply written \(s\ \sigma\) and \(A\ \sigma\) respectively. \item The constructor \(\mathsf{inc}_n^m\) is not parsed and is used as an internal operation for defining the external substitution (see \cref{sec:insertion}). It is displayed as \(\mathsf{inc}\verb||\). \item The suspension can be given by the characters \(\Sigma\) or \(S\), to avoid the user being forced to type Unicode characters.
\item The type \(\arr s {\mathsf{None}} t\) is written simply as \(s \to t\), and the type \(\arr s {\mathsf{Some}(A)} t\) is written as \(A \mid s \to t\), where the symbol \(\to\) can be replaced by \verb|->| in either case. \item For the construction \(\mathsf{Maybe}\), \(\mathsf{Some}(s)\) is printed the same as \(s\), and \(\mathsf{None}\) is printed as the empty string. \item We provide two ways to write trees: \begin{itemize} \item The curly bracket notation from \cref{sec:trees} can be used. The string: \[ s_0\{T_0\}s_1\cdots\{T_n\}s_{n+1}\] is parsed as a tree with elements given by (the parse of) \(s_0\) to \(s_{n+1}\) and branches given by the parse of \(T_0\) to \(T_n\). \item We provide a notation for specifying the locally maximal arguments of a tree. We parse the string: \[ [a_1,a_2,\dots,a_n]\] as a tree that has \(\mathsf{None}\) for each of its elements, with branches given by each of the \(a_i\), where if \(a_i\) does not recursively parse as a tree, it is parsed as an element and wrapped in a singleton tree. \end{itemize} To compare these two notations, the two trees below are equal: \[ \{f\}\{\{a\}\{b\}\} = [f,[a,b]]\] When using the full (curly bracket) notation to specify a labelling, it must be wrapped in angle brackets to avoid parse ambiguity. \end{itemize} We highlight the use of the extended substitution introduced in \cref{sec:extend-subst} in the raw syntax. This allows the tool to perform ``implicit suspension'', the automatic suspension of a term, by reducing it to a problem of type inference. These extended substitutions are converted to regular substitutions by the evaluation function introduced in \cref{sec:evaluation}, which applies the appropriate number of suspensions to the head term. An example of this is given in \cref{sec:examples}. We also provide a command language on top of the raw syntax for \Catt, which allows the user to perform various operations on terms, such as binding them to a top-level name, or normalising them. These commands are given by the following syntax: \begin{alignat*}{3} &\mathsf{def}\ v = s &&{}\mathrel{\big\vert} \mathsf{def}\ v\ \Gamma = s &&{}\mathrel{\big\vert} \mathsf{def}\ v\ \Gamma : A = s \\ \mathrel{\big\vert}{}&\mathsf{normalise}\ s\ \mathsf{in}\ \Gamma &&{}\mathrel{\big\vert} \mathsf{assert}\ s = t\ \mathsf{in}\ \Gamma &&{}\mathrel{\big\vert} \mathsf{size}\ s\ \mathsf{in}\ \Gamma\\ \mathrel{\big\vert}{}&\mathsf{import}\ \mathtt{filename} \end{alignat*} The first three commands define the name \(v\) to be given by the term \(s\), where the context \(\Gamma\) and type \(A\) can optionally be given, determining whether the term \(s\) will be inferred or checked. The next three commands take a context \(\Gamma\) and respectively calculate the normal form of \(s\) in \(\Gamma\), assert that \(s\) and \(t\) are equal in \(\Gamma\), or count the number of coherence constructors in \(s\). The last command parses the file \texttt{filename} and runs the commands it contains. In the implementation, each piece of syntax is paired with a piece of span information, which specifies where in the source file it originated. This is done by making the raw syntax generic over a type \(S\) of spans. When obtaining the raw syntax from parsing, this \(S\) is given by a range \(n \lh(T)\), we return the insertion redex \((S,P,T,\_,{\color{Diag2}L}, {\color{Diag2}M})\).
\end{itemize} If an insertion redex \((S,P,T,\_,{\color{Diag2}L}, {\color{Diag2}M})\) is found, then \(S\) is replaced by \(\insertion S P T\), \({\color{Diag2}L}\) is replaced by \(\insertion {\color{Diag2}L} P {\color{Diag2}M}\), and \(\color{Diag1}A\) is replaced by \(\color{Diag1}A \sub {\kappa_{S,P,T}}\). This step is then repeated until no insertion redexes are found. \begin{remark} At this critical step, the evaluation proceeds in a fashion closer to reduction than NbE, with insertions repeatedly applied by searching for redexes and applying reductions to the head term. This seems unavoidable; even if one could define a parallel insertion which inserted all insertable arguments at once, it is not clear how to deal with locally maximal arguments that are iterated identities. Despite this, we still claim that the overall structure of the evaluation follows an NbE style, especially regarding the treatment of suspension and application of substitutions and labellings. \end{remark} We next obtain the type \({\color{Diag2}B} = \eval_{\color{Diag2}\id_S}({\color{Diag1}A})\), and split into cases: \begin{itemize} \item If endo-coherence removal is enabled, and \(\color{Diag2}B\) is of the form \(\color{Diag2}(s,s) :: B'\), then we let \({\color{Diag1}\arr t C t} = \quote({\color{Diag2}B})\), interpret \(\color{Diag2}L\) as an environment by letting \({\color{Diag2}\ty(L)} = {\color{Diag2}\star}\) and let: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{id}_{\dim(B')}} \sub {\{\eval_{\color{Diag2}L}({\color{Diag1}C}), \eval_{\color{Diag2}L}({\color{Diag1}t})\}}\] where the labelling \(\{\_,\_\}\) from a disc can be trivially constructed by deconstructing the type. \item If endo-coherence removal is disabled, \(S\) is a disc \(D^n\), and \(\color{Diag2}B\) is of the form \(\color{Diag2}(\mathsf{var}_{p^n}, \mathsf{var}_{p^n}) :: B'\), where we recall that the path \(p^n\) is the unique locally maximal variable of \(D^n\), then we let: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{id}_n \sub {L}}\] \item If disc removal is enabled, \(S = D^n\), and \(\color{Diag2}B\) is equal to the standard type of dimension \(n\), then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}L(p^n)}\] \item If none of the above cases hold, and \(\color{Diag2}B\) is equal to the standard type of dimension \(\dim(S)\), then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{comp}_S \sub L}\] \item If none of the above cases hold, then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{coh}[S : B] \sub L}\] \end{itemize} The \(\color{Diag1}\mathsf{comp}_T\) case is treated in much the same way, removing any step involving \(\color{Diag1}A\) and instead setting \({\color{Diag2}B} = \eval_{\color{Diag2}\id_T}({\color{Diag1}\stdty T n})\), where \(n\) is given by the dimension of \(T\) before any insertion was performed. This completes all cases for the evaluation function. In contrast, the \(\quote\) function is defined completely trivially by recursion, converting head terms and normal form terms to core terms, normal form labellings to core labellings, and converting normal form types to an iterated arrow type in the obvious way. We note that this is unusual for NbE, where the \(\quote\) function is often mutually defined with evaluation, and performs a significant portion of the work of converting terms to normal form.
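To illustrate the shape of the \(\quote\) function on types, the following Rust sketch converts a normal-form type, stored as a list of source/target pairs with the outermost pair at the head, back into an iterated arrow type. We stress that the names \textsf{CoreTerm}, \textsf{CoreType}, and \textsf{quote\_ty} are hypothetical stand-ins, and that the actual implementation stores considerably more structure: \begin{lstlisting}
// Illustrative stand-ins for core terms and core types; the pair
// list below plays the role of the normal form syntax for types.
#[derive(Clone)]
struct CoreTerm(String);

#[derive(Clone)]
enum CoreType {
    Star,
    // Arr(s, a, t) represents the arrow type from s to t over a
    Arr(Box<CoreTerm>, Box<CoreType>, Box<CoreTerm>),
}

// Quote a normal-form type back to core syntax by folding the list
// of (source, target) pairs into an iterated arrow type.
fn quote_ty(pairs: &[(CoreTerm, CoreTerm)]) -> CoreType {
    match pairs.split_first() {
        None => CoreType::Star,
        Some(((s, t), rest)) => CoreType::Arr(
            Box::new(s.clone()),
            Box::new(quote_ty(rest)),
            Box::new(t.clone()),
        ),
    }
}
\end{lstlisting} For instance, quoting the list \([(a,b),(f,g)]\) produces the type \(\arr a {(\arr f \star g)} b\), matching the convention that the head of a normal-form type is its outermost pair.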
\subsection{Typechecking} \label{sec:typechecking} Now that the three classes of syntax and the evaluation function have been introduced, the bidirectional typechecking algorithm in the tool can be described. Bidirectional typing allows us to mix typing rules which ``check'' a term, and typing rules which ``infer'' the type for a term. In the implementation, this will determine which pieces of data are inputs to a procedure, and which are outputs. By \cref{lem:ty-unique}, all \Catt terms \(s\) have a unique type, which is given by the canonical type \(\ty(s)\). However, for certain terms, such as the coherence term \(\mathsf{coh}[T : A]\), we will be able to further infer the context that a term lives in, which in this case is the tree context \(T\). In this case the pair of the inferred context and type is known as a \emph{principal typing}~\cite{10.1145/237721.237728}, which is not to be confused with a \emph{principal type} of a term in a fixed context. Since we are in the unusual situation where all types are inferable, but the context in a judgement may or may not be, we refer to judgements where the context is an input as \emph{checking} judgements and judgements where the context is an output as \emph{inferring} judgements. \begin{remark} We justify this choice of terminology by noting the similarity of the judgements \(\Gamma \vdash s : A\) and \(\cdot \vdash \Pi_{\Gamma}\,s : \Gamma \to A\) in a type theory with (dependent) function types, where inferring the type of the second judgement would infer the context of the first. Of course, \Catt does not have function types, yet the intuition can still apply. \end{remark} The typing system will be defined with respect to a \emph{Signature} \(\Psi\), which contains a mapping from names to triples \(({\color{Diag1}\U}, {\color{Diag1}s}, {\color{Diag1}A})\) where \(\color{Diag1}s\) is a term of type \(\color{Diag1}A\) in (tree) context \(\color{Diag1}\U\). In the implementation, the signature also stores all relevant settings for the tool: which reductions are active, the operation set \(\mathcal{O}\) (which can only be configured to the groupoidal or regular operation sets), and whether implicit variables should be kept in the \(\mathsf{to\_raw}\) functions. We write: \[ \Psi(v) = ({\color{Diag1}\U}, {\color{Diag1}s}, {\color{Diag1}A})\] if the signature \(\Psi\) maps \(v\) to the triple above. We further define the notation \({\color{Diag1}\U}(i) = (v : {\color{Diag1}A})\) to mean that the \(i^{\text{th}}\) index of \(\color{Diag1}\U\) (with \(\color{Diag1}\U\) being a tree or a context) contains a variable name \(v\), which is given type \(\color{Diag1}A\) by \(\color{Diag1}\U\). Lastly, we define two conversion functions: \(\mathsf{from\_sub}\) and \(\mathsf{flatten}\). The first is a (partial) function which takes a tree \(T\) and a substitution \(\sigma\) and creates a labelling \(\mathsf{from\_sub}_T(\sigma)\) by letting the locally maximal arguments be given by the terms of \(\sigma\), if \(\sigma\) contains the correct number of terms. The function \(\mathsf{flatten}\) acts on the \(\mathsf{Maybe}\) construction applied to a term or type. It takes \(\mathsf{Some}(s)\) and \(\mathsf{Some}(A)\) to \(s\) and \(A\) respectively, and \(\mathsf{None}\) to \(\_\), the hole constructor for terms and types.
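As a rough picture of this structure, a signature along the lines described above might be represented in Rust as follows. This is a minimal sketch: all names are hypothetical, and the representations of contexts, terms, and types are elided. \begin{lstlisting}
use std::collections::HashMap;

// Elided stand-ins for (tree) contexts, core terms and core types.
#[derive(Clone)]
struct Ctx;
#[derive(Clone)]
struct Term;
#[derive(Clone)]
struct Type;

// Hypothetical settings mirroring the description above: which
// reductions are active, and whether implicit arguments survive
// the to_raw functions.
struct Settings {
    disc_removal: bool,
    endo_coherence_removal: bool,
    insertion: bool,
    keep_implicits: bool,
}

// The signature maps top-level names to a (context, term, type)
// triple, and carries the tool's settings.
struct Signature {
    bindings: HashMap<String, (Ctx, Term, Type)>,
    settings: Settings,
}

impl Signature {
    // Look up the triple bound to a name, if any.
    fn lookup(&self, v: &str) -> Option<&(Ctx, Term, Type)> {
        self.bindings.get(v)
    }
}
\end{lstlisting}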
Our bidirectional typing system will be based on the following judgements, letting \(\color{Diag1}\U\) refer to either a context or tree context: \begin{alignat*}{2} &s \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}t} : {\color{Diag1}A}&\qquad&\text{Convert \(s\) to \(\color{Diag1}t\) inferring its type \(\color{Diag1}A\) in inferred (tree) context \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A} &&\text{Given \(\color{Diag1}\U\), convert \(s\) to \(\color{Diag1}t\) checking it has some type \(\color{Diag1}A\) in \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash s = {\color{Diag2}t} \rightsquigarrow () &&\text{In \(\color{Diag1}\U\), check \(s\) has normal form \(\color{Diag2}t\)}\\ &{\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C} &&\text{In \(\color{Diag1}\U\), convert \(A\) to \(\color{Diag1}B\), inferring its normal form \(\color{Diag2}C\)}\\ &{\color{Diag1}\U} \vdash A = {\color{Diag2}C} \rightsquigarrow () &&\text{In \(\color{Diag1}\U\), check \(A\) has normal form \(\color{Diag2}C\)}\\ &\Gamma \vdash{} \rightsquigarrow {\color{Diag1}\U}&&\text{Check \(\Gamma\), producing (tree) context \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash \sigma : {\color{Diag1}\Gamma} \rightsquigarrow {\color{Diag1}\tau} &&\text{Check \(\sigma\) is a substitution from \(\color{Diag1}\Gamma\) to \(\color{Diag1}\U\), producing \(\color{Diag1}\tau\)}\\ &{\color{Diag1}\U} \vdash L : T \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}A}&&\text{Check labelling \(L\) in \(\color{Diag1}\U\), producing \(\color{Diag1}M\) with type \(\color{Diag2}A\)} \end{alignat*} For each judgement, the syntax to the left of \(\rightsquigarrow\) gives the inputs to the judgement, and the syntax to the right gives the outputs. \newcommand{\ruleone}{\alpha} \newcommand{\ruletwo}{\beta} \newcommand{\rulethree}{\gamma} \newcommand{\rulefour}{\delta} \newcommand{\rulefive}{\varepsilon} The typing rules for all judgements of this system are given in \cref{fig:bidirectional}. In this figure, \(D^n\) always refers to the linear tree of depth \(n\), rather than the disc context, \(\emptyset\) refers to the empty context, and \(\emp\) refers to the singleton tree. In the final rules, \(i\) should be treated as if it is universally quantified. We pause to highlight some of these rules: \begin{itemize} \item In the rule for coherences, marked \(\ruleone\), the support conditions are checked. This is done using the normal form syntax for the type, due to the simplicity of this syntax. The variable sets of a term can easily be collected by recursion, and in the implementation are stored in a hash set, using Rust's \textsf{HashSet} type. \item The rule for composites, marked \(\ruletwo\), is crucially a checking rule as there is no way to infer the tree \(T\) for the term \(\color{Diag1}\mathsf{comp}_T\). \item For the rule for the application of labellings, marked \(\rulethree\), the premise for the typing of the term is given by a checking judgement instead of an inferring judgement, as the tree \(T\) can be inferred from the labelling. This is in contrast to the corresponding rule for application of substitutions, where the context must be inferred from the inner term before the substitution can be checked. Combined with the point above, this allows a labelling applied to a \(\mathsf{comp}\) term to be checked.
\item The rule marked \(\rulefour\) allows a substitution to be applied to a term over a tree context, by converting the substitution to a labelling. This is mainly a convenience feature, as given a term \(s\) where it can be inferred that the context of \(s\) is a tree \(T\), it can be easier to give the locally maximal arguments for \(s\) as a list rather than describing the labelling. \item Lastly, we explain each component of the rule for the typing of a substitution, marked \(\rulefive\). We note that the first type in any \Catt context, which in the rule is given by the type \(\color{Diag1}A_0\), is always \(\star\). Therefore, the type of the first term in a substitution \(\sigma\) should be equal to \(\star \sub \sigma \equiv \ty(\sigma)\). In the rule, the type of the first term is given by \(\color{Diag1}B_0\), explaining its presence as the type of the substitution that gets evaluated to \(\color{Diag2}\rho\). We further note that \(\color{Diag2}\ty(\rho)\) is simply the evaluation of \(\color{Diag1}B_0\), which is why \(X\) is checked against it. Due to the choice to use de Bruijn levels instead of indices, weakening a term is the identity, and so \(s \sub \sigma \equiv s \sub {\langle \sigma,t \rangle}\) for any \(t\). Therefore, by inspecting the typing rules for substitutions in \Catt, it can be proven that to type \(\Gamma \vdash \sigma : \Delta\), it is sufficient to show that \(\Gamma \vdash x \sub \sigma : A \sub \sigma\) for all \((x : A) \in \Delta\). Observing the rule \(\rulefive\), this translates to proving that \(A_i \sub {(B_0,t_0,\dots,t_n)} = B_i\), recalling that \(B_0\) is the core syntax version of the type of the substitution. These equations can be shown by proving that the evaluation of each side is the same; the evaluation of the left-hand side is given by \(\eval_{\color{Diag2}\rho}({\color{Diag1}A_i})\) for each \(i\), and so for efficiency we factor out the calculation of \(\color{Diag2}\rho\).
\end{itemize} \begin{figure}[p] \centering \begin{mathpar} \inferrule{\Psi(v) = ({\color{Diag1}\U}, {\color{Diag1}t}, {\color{Diag1}A})}{v \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}\mathsf{top\_lvl}(v,t)} : {\color{Diag1}A}}\and \inferrule{{\color{Diag1}T} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C} \\ (T, \src({\color{Diag2}C}), \tgt({\color{Diag2}C})) \in \mathcal{O}}{\mathsf{coh}[T : A] \rightsquigarrow {\color{Diag1}T} \vdash {\color{Diag1}\mathsf{coh}[T : B]} : {\color{Diag1}B}}\ \ruleone\and \inferrule{ }{\mathsf{id} \rightsquigarrow {\color{Diag1}D^1} \vdash {\color{Diag1}\mathsf{id}_0} : {\color{Diag1}\arr {\mathsf{var}_{[0]}} {\star} {\mathsf{var}_{[0]}}}}\and \inferrule{s \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}t} : {\color{Diag1}A}}{\Sigma(s) \rightsquigarrow {\color{Diag1}\Sigma(\U)} \vdash {\color{Diag1}\Sigma(t)}: {\color{Diag1}\Sigma(A)}}\\ \inferrule{s \rightsquigarrow {\color{Diag1}T} \vdash {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}T} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}\and \inferrule{{\color{Diag1}\U}(i) = (v : {\color{Diag1}A})}{{\color{Diag1}\U} \vdash v \rightsquigarrow {\color{Diag1}\mathsf{var}_i} : {\color{Diag1}A}}\and \inferrule{ }{{\color{Diag1}D^n} \vdash \mathsf{id} \rightsquigarrow {\color{Diag1}\mathsf{id}_n} : {\color{Diag1}\stdty {D^n} {n + 1}}}\and \inferrule{ }{{\color{Diag1}T} \vdash \mathsf{comp} \rightsquigarrow {\color{Diag1}\mathsf{comp}_T} : {\color{Diag1}\stdty T n}}\ \ruletwo\and \inferrule{s \rightsquigarrow {\color{Diag1}\Gamma} \vdash {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash \sigma : {\color{Diag1}\Gamma} \rightsquigarrow {\color{Diag1}\tau}\\ {\color{Diag1}\U} \vdash \ty(\sigma) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub \sigma \rightsquigarrow {\color{Diag1}t \sub \tau} : {\color{Diag1}A \sub \tau}}\and \inferrule{s \rightsquigarrow {\color{Diag1}T} \vdash {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash \mathsf{from\_sub}_T(\sigma) : {\color{Diag1}T} \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}B}\\ {\color{Diag1}\U} \vdash \ty(\sigma) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub \sigma \rightsquigarrow {\color{Diag1}t \sub M} : {\color{Diag1}A \sub M}}\ \rulefour\and \inferrule{{\color{Diag1}T} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash L : T \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}B}\\ {\color{Diag1}\U} \vdash \ty(L) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub L \rightsquigarrow {\color{Diag1}t \sub M} : {\color{Diag1}A \sub M}}\ \rulethree\\ \inferrule{ }{{\color{Diag1}\U} \vdash \_ = {\color{Diag2}t} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}\U} \vdash s = {\eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}t})} \rightsquigarrow ()}\\ \inferrule{ }{{\color{Diag1}\U} \vdash \star \rightsquigarrow {\color{Diag1}\star} = {\color{Diag2}\emp}}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}s'} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash t \rightsquigarrow {\color{Diag1}t'} : {\color{Diag1}B} \\ \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}A} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}B}}% {{\color{Diag1}\U} \vdash \arr s {} t \rightsquigarrow {\color{Diag1}\arr {s'} {A} {t'}} = {\color{Diag2}(\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}s'},
\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}t'}) :: \eval_{\id_{\color{Diag1}\U}}{\color{Diag1}A}}}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}s'} : {\color{Diag1}B} \\ {\color{Diag1}\U} \vdash t \rightsquigarrow {\color{Diag1}t'} : {\color{Diag1}C} \\ {\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}A'} = {\color{Diag2}A''}\\ {\color{Diag2}A''} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}B} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}C}}% {{\color{Diag1}\U} \vdash \arr s {A} t \rightsquigarrow {\color{Diag1}\arr {s'} {A'} {t'}} = {\color{Diag2}(\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}s'}, \eval_{\id_{\color{Diag1}\U}}{\color{Diag1}t'}) :: A''}}\\ \inferrule{{\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C}}{{\color{Diag1}\U} \vdash A = {\color{Diag2}C} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s = {\color{Diag2}s'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash t = {\color{Diag2}t'} \rightsquigarrow () }{{\color{Diag1}\U} \vdash \arr s {} t = {\color{Diag2}\arr {s'} {A} {t'}} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s = {\color{Diag2}s'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash t = {\color{Diag2}t'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash A = {\color{Diag2}A'} \rightsquigarrow ()\\ }{{\color{Diag1}\U} \vdash \arr s {A} t = {\color{Diag2}\arr {s'} {A'} {t'}} \rightsquigarrow ()}\and \inferrule{ }{{\color{Diag1}\U} \vdash \_ = {\color{Diag2}C} \rightsquigarrow ()}\\ \inferrule{ }{T \vdash {} \rightsquigarrow {\color{Diag1}T}}\and \inferrule{ }{\emptyset \vdash {} \rightsquigarrow {\color{Diag1}\emptyset}}\and \inferrule{\Gamma \vdash {} \rightsquigarrow {\color{Diag1}\Delta} \\ {\color{Diag1}\Delta} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C}}{\Gamma, (v : A) \vdash {} \rightsquigarrow {\color{Diag1}\Delta, (v : B)}}\\ \inferrule{{\color{Diag1}\U} \vdash s_i \rightsquigarrow {\color{Diag1}t_i} : {\color{Diag1}B_i}\\ {\color{Diag2}\rho} := \eval_{\color{Diag2}\id_{\color{Diag1}\U}}% ({\color{Diag1}(B_0,t_0,\dots,t_n)})\\\\ \eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}B_i}) = \eval_{\color{Diag2}\rho}({\color{Diag1}A_i})\\ {\color{Diag1}\U} \vdash \mathsf{flatten}(X) = {\color{Diag2}\ty(\rho)} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash (X,s_0,\dots,s_n) : {\color{Diag1}(v_0:A_0),\dots,(v_n:A_n)} \rightsquigarrow {\color{Diag1}(B_0,t_0,\dots,t_n)} }\ \rulefive\\ \inferrule{{\color{Diag1}\U} \vdash \mathsf{flatten}(x) \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}\U} \vdash x : \emp \rightsquigarrow {\color{Diag1}t} : \eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}A})}\and \inferrule{{\color{Diag1}\U} \vdash L_i \rightsquigarrow {\color{Diag1}M_i} : {\color{Diag2}(s_i,s_{i+1})::A} \\ {\color{Diag1}\U} \vdash \mathsf{flatten}(x_i) = {\color{Diag2}s_i} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash x_0\{L_0\}\cdots\{L_n\}x_{n+1} \rightsquigarrow {\color{Diag1}s_0\{M_0\}\cdots\{M_n\}s_{n+1}} : {\color{Diag2}A}} \end{mathpar} \caption{Bidirectional typing rules.} \label{fig:bidirectional} \end{figure} The typing rules in \cref{fig:bidirectional} can easily be translated into an algorithm for mechanically checking each of these typing judgements. In some rules, equalities of normal forms are left implicit, such as in the final rule concerning the typing of a non-singleton labelling, and these must be made explicit in the final algorithm.
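To indicate how such an algorithm might be organised, the following Rust sketch shows the mutual recursion between an inferring procedure, which produces the (tree) context as an output, and a checking procedure, which takes it as an input. The syntax is cut down to two constructors, and every name is a hypothetical simplification: \begin{lstlisting}
#[derive(Clone, Debug, PartialEq)]
struct Ctx(String); // stand-in for a (tree) context

#[derive(Clone, Debug)]
struct Term(String); // stand-in for a core term

#[derive(Clone, Debug)]
struct Type(String); // stand-in for a core type

#[derive(Debug)]
struct TypeError(String);

enum RawTerm {
    Var(String),
    Comp, // the tree for comp cannot be inferred: checking only
}

// Inferring judgement: the (tree) context is an output.
fn infer(raw: &RawTerm) -> Result<(Ctx, Term, Type), TypeError> {
    match raw {
        // a real implementation would consult the signature here
        RawTerm::Var(v) => Err(TypeError(format!("unknown name {v}"))),
        RawTerm::Comp => Err(TypeError("cannot infer the tree of comp".into())),
    }
}

// Checking judgement: the (tree) context is an input, and the `?`
// operator propagates failures of the inferring judgement.
fn check(ctx: &Ctx, raw: &RawTerm) -> Result<(Term, Type), TypeError> {
    match raw {
        // comp is checked directly against the supplied tree context
        RawTerm::Comp => Ok((Term("comp".into()), Type("std".into()))),
        _ => {
            let (inferred, t, a) = infer(raw)?;
            if &inferred == ctx {
                Ok((t, a))
            } else {
                Err(TypeError("context mismatch".into()))
            }
        }
    }
}
\end{lstlisting} In this skeleton, \textsf{comp} is handled directly by the checking procedure, mirroring the rule marked \(\ruletwo\), while all other terms are checked by first inferring their context.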
Many of the choices for the form of these rules were made to improve the quality of error messages. Each of these rules can fail for a variety of reasons, at which point an error is created by converting the relevant syntax back to raw syntax using the \(\mathsf{to\_raw}\) functions so that it can be displayed to the user. The use of Rust's \textsf{Result} type, which allows each of these functions to return either the well-formed core syntax or an appropriate error message, is essential, and benefits greatly from the question mark syntax in Rust, which allows errors to easily be propagated through the code. We end this section by describing the function of each of the commands introduced in \cref{sec:nbe-syntax}. Each of these commands is run with a mutable reference to a signature \(\Psi\). The commands use this signature for typechecking, and may modify the signature. The three \(\mathsf{def}\) commands are used to add a new binding to the signature \(\Psi\). For the first command, which omits the context, the term \(s\) must be inferred, producing a core syntax context, term, and type, which is inserted into the signature with key \(v\) and printed to the user. The second command is given a raw context and so first checks this raw context to produce a core (possibly tree) context \(\color{Diag1}\U\), before checking the term \(s\) in this context. Checking the term then produces a core syntax term and type, which are inserted into the signature along with the context \(\color{Diag1}\U\). The last \(\mathsf{def}\) command proceeds as before, checking the context to get a context \(\color{Diag1}\U\) and then checking the term in \(\color{Diag1}\U\), producing a core term \(\color{Diag1}t\) and type \(\color{Diag1}B\). The supplied type \(A\) is then checked against \(\eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}B})\). If this check succeeds, the key-value pair \((v, ({\color{Diag1}\U},{\color{Diag1}t},{\color{Diag1}B}))\) is added to the signature \(\Psi\), identically to the previous case. The \(\mathsf{normalise}\) command is used to print the normal form of a term \(s\). As with the final two \(\mathsf{def}\) cases, we begin by checking the context, and checking the term \(s\) in the resulting core context to get term \(\color{Diag1}t\) of type \(\color{Diag1}A\). Both \(\color{Diag1}t\) and \(\color{Diag1}A\) are then evaluated to normal form, quoted, and converted back to raw syntax, before being pretty-printed to the user. The \(\mathsf{size}\) command calculates a primitive estimate of the complexity of a term (which we note is not the same as the syntactic complexity given in \cref{sec:termination}) by counting the number of constructors in the normal form. To run this command, the term \(s\) is checked as before, and converted to a normal form term \(\color{Diag2}t\). \(\mathsf{size}({\color{Diag2}t})\) is then calculated by induction using the rules given in \cref{fig:size}, and this size is printed to the user. The \(\mathsf{assert}\) command checks both input terms \(s\) and \(t\), and evaluates the resulting core syntax terms to normal form to check that they are equal. None of the \(\mathsf{normalise}\), \(\mathsf{size}\), or \(\mathsf{assert}\) commands modify the signature \(\Psi\).
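A minimal sketch of this command dispatch, threading the signature through as a mutable reference, might take the following form; the representation is deliberately simplified, with bindings stored as strings, and all names are hypothetical: \begin{lstlisting}
use std::collections::HashMap;

// Simplified stand-in: bindings map names to checked terms.
struct Signature(HashMap<String, String>);

#[derive(Debug)]
struct CommandError(String);

enum Command {
    Def(String, String),
    Normalise(String),
    Import(Vec<Command>), // stand-in for an already-parsed file
}

// Commands run against a mutable reference to the signature: def
// inserts a new binding, normalise leaves the signature unchanged,
// and import replays the parsed commands with the same signature.
fn run(sig: &mut Signature, cmd: Command) -> Result<(), CommandError> {
    match cmd {
        Command::Def(v, s) => {
            // typechecking of s is elided; ? would propagate its errors
            sig.0.insert(v, s);
            Ok(())
        }
        Command::Normalise(_s) => {
            // evaluate, quote, convert to raw syntax and pretty-print;
            // the signature is left untouched
            Ok(())
        }
        Command::Import(cmds) => {
            for c in cmds {
                run(sig, c)?;
            }
            Ok(())
        }
    }
}
\end{lstlisting}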
\begin{figure}[ht] \centering \begin{mathpar} \mathsf{size}({\color{Diag2}\mathsf{coh}[T : A]}) = 1 + \mathsf{size}({\color{Diag2}A})\and \mathsf{size}({\color{Diag2}\mathsf{id}_n}) = \mathsf{size}({\color{Diag2}\mathsf{comp}_T}) = 1 \and \mathsf{size}({\color{Diag2}\mathsf{var}_p}) = 0 \and \mathsf{size}({\color{Diag2}H \sub L}) = \mathsf{size}({\color{Diag2}H}) + \mathsf{size}({\color{Diag2}L})\and \mathsf{size}({\color{Diag2}L}) = \sum_{p : \Path_T} \mathsf{size}({\color{Diag2}L(p)})\and \mathsf{size}({\color{Diag2}[(s_0,t_0), \dots, (s_n,t_n)]}) = \sum_{i= 0}^n \left(\mathsf{size}({\color{Diag2}s_i}) + \mathsf{size}({\color{Diag2}t_i})\right) \end{mathpar} \caption{Size of normal form syntax.} \label{fig:size} \end{figure} Finally, the \(\mathsf{import}\) command reads the contents of the supplied file, parses it as a list of commands, and runs each of these commands with the same signature. The tool has a command-line interface, which allows files to be loaded at startup, as well as providing a REPL (read-eval-print loop) which parses one command at a time. \subsection{Examples} \label{sec:examples} We now demonstrate the use of the tool with some examples. All the examples below can be found in the \texttt{/examples} directory of the implementation code base~\cite{alex_rice_2024_10964705}. We begin by defining some standard operations of monoidal categories and bicategories, which can be found in the file \texttt{/examples/monoidal.catt}. We start by defining \(1\)-composition as a coherence: \begin{lstlisting}[language=Catt] def comp1coh [f,g] = coh [ x{}{}z : x -> z ] (f,g) \end{lstlisting} This example demonstrates the two ways of giving a tree context: in the \(\mathsf{def}\) command we give the context using the square bracket notation, which only labels the maximal elements, and in the coherence it is given by the full labelling, as we require access to the variables \(x\) and \(z\) (we note that all other variables of the context have been omitted). This example further demonstrates that a substitution can be applied to a term over a tree context, where we have only specified the locally maximal arguments. This composite can of course also be given using the \(\mathsf{comp}\) construction. \begin{lstlisting}[language=Catt] def comp1 [f,g] = comp assert comp1coh(f,g) = comp1(f,g) in [f,g] \end{lstlisting} The tree for \(\mathsf{comp}\) is inferred from the labelling \texttt{[f,g]}. The assert statement ensures that these two ways of giving the \(1\)-composition are equal in the theory. The assert passes even with no reductions enabled, demonstrating the value of evaluation in the fully weak case. The horizontal and vertical composites of \(2\)-cells can be given similarly: \begin{lstlisting}[language=Catt] def horiz [[a],[b]] = comp def vert [[a,b]] = comp \end{lstlisting} As the vertical composite is the suspension of \(1\)-composition, it can also be given using implicit suspension: \begin{lstlisting}[language=Catt] def vertsusp [[a,b]] = comp1[a,b] assert vert(a,b) = vertsusp(a,b) in [[a,b]] \end{lstlisting} In this case, the labelling applied to \texttt{comp1} is a tree of depth \(1\) where the locally maximal arguments are given by \(2\)-dimensional terms. Type inference then deduces that the type component of this labelling should be \(1\)-dimensional, and hence evaluation causes the head term \texttt{comp1} to be suspended, making it equal to the composite \texttt{vert}, as demonstrated by the assertion.
The unitors and associator are then given by the following coherences, using the \(\mathsf{id}\) builtin for the unitors: \begin{lstlisting}[language=Catt] def unitor_l = coh [ x{f}y : comp1(id(x),f) -> f ] def unitor_r = coh [ x{f}y : comp1(f, id(y)) -> f ] def assoc = coh [ {f}{g}{h} : comp1(comp1(f,g),h) -> comp1(f,comp1(g,h)) ] \end{lstlisting} which allows definitions to be given for terms which witness the triangle and pentagon equations of monoidal categories: \begin{lstlisting}[language=Catt] def triangle = coh [ x{f}y{g}z : vert(assoc(f,id(y),g), horiz(id(f),unitor_l(g))) -> horiz(unitor_r(f),id(g)) ] def pentagon = coh [ v{f}w{g}x{h}y{i}z : vert(assoc(comp1(f,g),h,i),assoc(f,g,comp1(h,i))) -> comp [ horiz(assoc(f,g,h),id(i)), assoc(f,comp1(g,h),i), horiz(id(f),assoc(g,h,i)) ] ] \end{lstlisting} We note the direct use of the \(\mathsf{comp}\) constructor to easily provide a ternary composite without needing to give a new top-level definition. Using the \(\mathsf{normalise}\) command, it can be shown that the triangle reduces to the identity with \Cattsu normalisation enabled, and the pentagon reduces to the identity with \Cattsua normalisation enabled. In the files \texttt{/examples/eh.catt} and \texttt{/examples/eh-cyll.catt}, we give two \Catt proofs of the Eckmann-Hilton argument (see \cref{prop:eh}). In \Cattsu, these both normalise to the following vastly smaller term: \begin{lstlisting}[language=Catt] def swap = coh [ x{f{a}g}y{h{b}k}z : comp[comp [[a],h], comp[g,[b]]] -> comp[comp [f,[b]], comp[[a],k]] ] \end{lstlisting} The \(\mathsf{size}\) command demonstrates that the \Catt Eckmann-Hilton proof in \texttt{/examples/eh.catt} has size 1807 whereas its \Cattsu normalisation has a size of only 19. Due to the simplicity of Eckmann-Hilton in \Cattsu, we are able to give \Cattsu and \Cattsua proofs of the syllepsis (see \cref{sec:cattsu}) in \texttt{/examples/syllepsis-su.catt} and \texttt{/examples/syllepsis.catt} respectively. It can be verified that in \Cattsua, the \Cattsu proof of syllepsis, which has size 2745, reduces to the \Cattsua proof, which has size 1785. \subsection{Further work} \label{sec:further-work} We end the discussion of this implementation with some options for improving the tool. Each of these suggestions could make the tool easier to use and interact with, which in turn extends what can be achieved with it. Currently, the tool completely relies on the bidirectional typing rules to perform all of its type inference. While this is effective in some scenarios, for example labellings and implicit suspension, it is lacking in others, such as implicit arguments in substitutions. One could try to implement such features by adding metavariables and a unification procedure to the typechecker. Contrary to the situation for the fully weak \Catt, unification for \Cattsu and \Cattsua is non-trivial. Suppose we wished to unify the following two terms: \[ f *_0 g = h *_0 i\] where \(f\), \(g\), \(h\), and \(i\) may contain metavariables. In \Catt, this problem could be reduced to the unification problems \(f = h\) and \(g = i\). In \Cattsu, however, this cannot be done, as a potential solution is \(f = h *_0 i\) and \(g = \id\). It is likely that any unification that can be implemented for \Cattsu (and \Cattsua) is quite limited, but an investigation into the limits of unification in these settings could be valuable. Even without a powerful unification algorithm, there are still instances where an argument could be inferred by the tool.
One such example is the Eckmann-Hilton term presented in the previous section. This term is defined in the context: \[ (x : \star)\ (\alpha : \id(x) \to \id(x))\ (\beta : \id(x) \to \id(x)) \] Here, the \(x\) should be inferable as it is the \(0\)-source of \(\alpha\). The tool currently has no way to deduce this. Separately, improvements could be made to the treatment of unfolding of top-level definitions in the tool. Whenever a term is evaluated by the tool, any top-level definition is unfolded to its normal form. This is not always desirable, as it means that error messages frequently contain fully expanded terms, increasing the length of terms and reducing their readability, in addition to losing the information associated with the name given to the definition. Conversely, the full unfolding of evaluation often means that we avoid evaluating terms before displaying them to the user, even when a (partial) evaluation would simplify the term. A notable example is that when giving a new definition, its type is not simplified before being displayed, often resulting in terms such as \verb|p0{x{f}y}|. A better approach would likely add top-level definitions to the normal form syntax as a head term, allowing their unfolding to be optional. One potential approach for efficient unfolding is given by \citeauthor{andrastalk}~\cite{andrastalk}. Finally, the accessibility of the tool could be improved with proper editor integration, for example by implementing the language server protocol (see \url{https://microsoft.github.io/language-server-protocol/}), which would allow errors to be displayed directly in the editor, among other features such as code refactoring. \section{Models} \label{sec:models} Despite claiming that the type theories \Cattsu and \Cattsua model semistrict \(\infty\)-categories, we are yet to discuss their models. In this section we recall the definition of a model for these theories, and discuss some properties of these models. The definitions of \emph{globular category} and \emph{globular sum} were given in \cref{sec:background}. Any variant of \Cattr can be equipped with the structure of a globular category by choosing the disc objects to be the disc contexts and letting the source and target maps be given by the inclusions \(\lfloor \incbd {n} \epsilon {D^{n+1}} \rfloor\) for \(\epsilon \in \{-,+\}\). We then define the category of models of \Cattsu and \Cattsua. \begin{definition} Recall that for any tame variant of \Cattr, the category \(\mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}}\) is defined to be the restriction of the syntactic category \(\mathsf{Catt}_{\mathcal{R}}\) to the ps-contexts. We define the category of models to be the full subcategory of the presheaf category on \(\mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}}\) consisting of functors: \[F : \left( \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \right)^{\text{op}} \to \mathbf{Set}\] such that \(F^{\text{op}}\) preserves globular sums. \end{definition} Each element of the category of models has the structure of a weak \(\infty\)-category. For a model \(F : \left( \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \right)^{\text{op}} \to \mathbf{Set}\), the set of \(n\)-cells is given by \(F(D^n)\), with source and target maps given by the functions: \[ F(\lfloor \incbd {n-1} - {D^n} \rfloor), F(\lfloor \incbd {n-1} + {D^n} \rfloor) : F(D^n) \to F(D^{n-1})\] for which the globularity equations follow from the globularity of the inclusion maps. For each term over a ps-context in \Cattr, an operation on each of the models can be derived.
We consider the action of the \(1\)-composition term, given by \(\stdcoh {[\emp,\emp]} 1\). For the model \(F\), this induces an operation: \[ F(\{\lfloor \stdcoh {[\emp, \emp]} 1 \rfloor\}) : F(D^1\vee D^1) \to F(D^1)\] Due to the preservation of globular sums, we have \(F(D^1 \vee D^1) = F(D^1) \times_{F(D^0)} F(D^1)\), which is exactly the set of composable \(1\)-cells, which the function above sends to their composition. Similarly, the identity \(\id(d_0)\) induces a map \(F(D^0) \to F(D^1)\), giving the identity on each \(0\)-cell. These operations can be combined to get a compound operation of the following form: \[ \begin{tikzcd}[column sep = large] {F(D^1)} = {F(D^1) \times_{F(D^0)} F(D^0)} & {F(D^1) \times_{F(D^0)} F(D^1)} & {F(D^1)} \arrow["{\id \times F(\id(d_0))}", from=1-1, to=1-2] \arrow["{F(\{\stdcoh {[\emp,\emp]} 1\})}", from=1-2, to=1-3] \end{tikzcd} \] By the functoriality of \(F\) (and preservation of globular sums), this composite should be equal to: \[ F(\{\stdcoh {[\emp,\emp]} 1\} \bullet \langle d_1 , \id_{d_0^+} \rangle) : F(D^1) \to F(D^1)\] Therefore, if \(F\) is further a \Cattsu model, then this operation must equal \(F(\id) = \id\), enforcing the semistrict properties of \Cattsu onto the model. Throughout the thesis, contexts in \Cattr have been viewed as semistrict \(\infty\)-categories themselves. This viewpoint can be made precise by the Yoneda embedding, as for each context \(\Gamma\) of \Cattr, we obtain the presheaf: \[Y(\Gamma) : \mathsf{Catt}_{\mathcal{R}}^{\mathsf{op}} \to \mathbf{Set}\] which sends \(\Delta\) to \(\mathrm{Hom}(\Delta, \Gamma)\), the substitutions from \(\Delta\) to \(\Gamma\). This map preserves all limits, so in particular its opposite preserves the globular sums, meaning it can be restricted to a model of \Cattr. Furthermore, the \(n\)-cells are given by substitutions \(D^n \to \Gamma\), which are precisely the \(n\)-dimensional terms of \(\Gamma\) up to definitional equality. Since every \Catt term is also a \Cattr term, there is an evident functor: \[ K_{\mathcal{R}} : \mathsf{Catt} \to \mathsf{Catt}_{\mathcal{R}}\] which sends each context and substitution to its equivalence class in \Cattr. This functor can be restricted to the functor: \[ K_{\mathcal{R}}^{\mathsf{ps}} : \mathsf{Catt}^{\mathsf{ps}} \to \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \] which is the identity on objects. We now prove that this functor preserves globular sums. By \cite[Lemma 64]{benjamin2021globular}, the functor \(\mathbf{FinGlob} \to \mathsf{Catt}\) from the category of finite globular sets preserves globular sums, and so it suffices to show that the functor \(\mathbf{FinGlob} \to \mathsf{Catt}_{\mathcal{R}}\) preserves globular sums. By \cite[Lemmas 25 and 29]{benjamin2021globular}, it suffices to show that this functor preserves the initial object and preserves pushouts along the inclusion maps \(S^n \to D^n\). The empty context is clearly the initial object, and this is preserved by the above functor. For the second property it suffices to show that: \[ \begin{tikzcd} {S^n} & \Gamma \\ {D^n} & {\Gamma, (x: A)} \arrow["\{\wk(U^n)\}"', from=1-1, to=2-1] \arrow["{\{A\}}", from=1-1, to=1-2] \arrow["{\{A,x\}}"', from=2-1, to=2-2] \arrow[from=1-2, to=2-2] \end{tikzcd}\] is a pushout for each \(\Gamma \vdash A\) in \Cattr.
Suppose there is a context \(\Delta\) with substitutions \(\sigma : \Gamma \to \Delta\) and \(\{B,t\} : D^n \to \Delta\) such that: \[\{B\} \equiv \{\wk(U^n)\} \bullet \{B, t\} = \{A\} \bullet \sigma \equiv \{A \sub \sigma\}\] Then the universal map is given by \(\langle \sigma, t \rangle\), with this map being well-formed as \(\Delta \vdash t : B\) and \(B = A \sub \sigma\). The uniqueness of this universal map is clear. Hence, the square above is cocartesian. From this we get the following proposition. \begin{proposition} The functors \(K_{\mathcal{R}}\) and \(K_{\mathcal{R}}^{\mathsf{ps}}\) preserve globular sums. \end{proposition} \begin{proof} As the maps \(\mathbf{FinGlob} \to \mathsf{Catt}\) and \(\mathbf{FinGlob} \to \mathsf{Catt}_{\mathcal{R}}\) preserve globular sums, the globular sums in both \(\mathsf{Catt}\) and \(\mathsf{Catt}_{\mathcal{R}}\) are given exactly by the ps-contexts. The two functors \(K_{\mathcal{R}}\) and \(K_{\mathcal{R}}^{\mathsf{ps}}\) are the identity on ps-contexts, and hence preserve globular sums. \end{proof} Due to this proposition, any model of \Cattr can also be seen as a model of \Catt, by precomposing with the functor \(K_{\mathcal{R}}^{\mathsf{ps}}\). This is to be expected, as intuitively every semistrict \(\infty\)-category should also be a weak \(\infty\)-category, where certain operations are given by identities. \subsection{Rehydration for pasting diagrams} \label{sec:rehydration} We have shown a way in which every model of \Cattr can be viewed as a model of \Catt. In this section we prove that this mapping from \Cattr models to \Catt models is injective. This implies that being semistrict is a \emph{property} of the model: a particular \Catt model can only arise from a unique \Cattr model, if such a \Cattr model exists. We prove this result by demonstrating a partial conservativity result for \Cattr, which we call \emph{rehydration for pasting contexts}. Rehydration refers to the process of taking a term in the semistrict theory and inserting the necessary coherence morphisms into the term such that it can be typed in \Catt. We discuss the difficulties involved with rehydrating an arbitrary term in \cref{sec:towards-gener-rehydr}, but for now we are only concerned with the simpler case of rehydrating a term \(t : \Term_\Gamma\) where \(\Gamma\) is a ps-context. We work towards the following theorem: \begin{theorem} \label{thm:rehydration} Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition and has pruning, disc removal, and endo-coherence removal. Then for any ps-context \(\Delta\) and term \(t : \Term_\Delta\), there is a \Catt term \(s : \Term_\Delta\) such that \(\Delta \vdash s = t\) in \Cattr. \end{theorem} We begin with an example for \Cattsu. Take the pasting context given by the following diagram: \[ \Delta = \begin{tikzcd} w & x & y & z \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \arrow["h", from=1-3, to=1-4] \end{tikzcd} \] The associator \(\alpha\) is a \Cattsu normal form term over \(\Delta\), and we can further define the term: \[ \eta : \id((f*g)*h) \to \alpha_{f,g,h} * \alpha_{f,g,h}^{-1}\] as a single coherence over \(\Delta\). This term is also a \Cattsu normal form.
Finally, the term: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCJcXGlkIiwyXSxbMSwyLCJcXGlkIiwyXSxbMCwxLCJcXGFscGhhICogXFxhbHBoYV57LTF9IiwwLHsiY3VydmUiOi00fV0sWzEsMiwiXFxhbHBoYSAqIFxcYWxwaGFeey0xfSIsMCx7ImN1cnZlIjotNH1dLFswLDIsIlxcYWxwaGEgKiBcXGFscGhhXnstMX0iLDIseyJjdXJ2ZSI6NX1dLFszLDUsIlxcZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs0LDYsIlxcZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDEsIlxcZXRhXnstMX0iLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "\id"', from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "\id"', from=1-3, to=1-5] \arrow[""{name=2, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}", curve={height=-24pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}"', curve={height=30pt}, from=1-1, to=1-5] \arrow["\eta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\eta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=3] \arrow["{\eta^{-1}}"', shorten <=3pt, Rightarrow, from=4, to=1-3] \end{tikzcd} \] is a \Cattsu normal form term over a pasting context, which is not well-formed in \Catt. Such a term can be rehydrated by inserting the equivalence \(\id \cong \id * \id\) into the centre of the term. Performing a similar construction with the interchanger instead of the associator creates a \Cattsua normal form term over a pasting context which is not a \Catt term. We now proceed with the proof of \cref{thm:rehydration}. We introduce three operations, which are mutually defined on terms of \Cattr over pasting contexts. \begin{itemize} \item The \emph{rehydration} \(R(t)\) of a term \(t\) recursively rehydrates all subterms of \(t\), and then pads the resulting term. For any \Cattr term \(t\), the rehydration is a \Catt term over the same context. For any term \(t\), we call \(R(N(t))\) its \emph{rehydrated normal form}, where \(N\) is the function taking any term to its normal form. We similarly define the rehydration \(R(A)\) of a type \(A\) over a pasting context and \(R(\sigma)\) of a substitution \(\sigma\) whose domain and codomain are pasting contexts. \item The \emph{padding} \(P(t)\) of a \Catt term \(t\) composes the term with coherences to ensure that its boundaries are in rehydrated normal form. \item The \emph{normaliser} \(\phi(t)\) is a coherence term from \(t\) to its rehydrated normal form \(R(N(t))\), for any \Catt term \(t\). \end{itemize} We give formal definitions for each of these, mutually with proofs of the following statements, where we assume \(\Delta\) and \(\Gamma\) are pasting contexts: \begin{enumerate} \item Suppose \(\Delta \vdash_{\mathcal{R}} t : A\). Then \(\Delta \vdash R(t) : R(N(A))\). Similarly, if \(\Delta \vdash_{\mathcal{R}} A\) or \(\Delta \vdash_{\mathcal{R}} \sigma : \Gamma\), then \(\Delta \vdash R(A)\) and \(\Delta \vdash R(\sigma) : \Gamma\). \item For a \Cattr well-formed term \(t\), type \(A\), and substitution \(\sigma\), we have \(\Delta \vdash t = R(t)\), \(\Delta \vdash A = R(A)\), and \(\Delta \vdash \sigma = R(\sigma)\) in \Cattr. \item Suppose \(\Delta \vdash t : A\) for a \Catt term \(t\). Then \(P_k(t)\) is well-formed for \(k \leq \dim(t)\), and \(\Delta \vdash P(t) : R(N(A))\).
\item Suppose \(t\) is a well-formed \Catt term. Then for each \(k \leq \dim(t)\), \(P_k(t) = t\).
\item If \(\Delta \vdash t : R(N(A))\) in \Catt, then \(\Delta \vdash \phi(t) : \arr t A {R(N(t))}\).
\item Let \(t\) be a well-formed \Catt term over a pasting context. Then \(\phi(t) = \id(t)\).
\end{enumerate}
Each of these definitions and proofs is given by an induction on dimension and subterms, ensuring that they are well-founded. We begin with the definition of the rehydrated term, type, and substitution.
\begin{definition}
Let \(\Delta\) and \(\Gamma\) be pasting contexts. For a term \(t\) or type \(A\) over \(\Delta\), or a substitution \(\sigma : \Gamma \to \Delta\), we define the rehydrations:
\[ R(t) : \Term_\Delta \qquad R(A) : \Type_\Delta \qquad R(\sigma) : \Gamma \to \Delta\]
by mutual recursion. For a variable \(x\), we let \(R(x) = x\), and for a coherence term we define:
\[ R(\Coh \Gamma A \sigma) = P(\Coh {\Gamma} {R(A)} {R(\sigma)})\]
For types and substitutions, we recursively apply the rehydration to all subterms.
\end{definition}
To define the padding, we need the composites over certain trees \(T_k^n\) for \(k < n\), which are defined by:
\[ T_0^n = [\emp, D^{n-1}, \emp] \qquad T_{k+1}^{n+1} = \Sigma(T_k^n)\]
As an example, \(T_1^3\) produces the following context:
% https://q.uiver.app/#q=WzAsMixbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzAsMSwiIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiIiwyLHsiY3VydmUiOjN9XSxbMCwxLCIiLDEseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDEseyJjdXJ2ZSI6NX1dLFs1LDMsIiIsMSx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMiw0LCIiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzMsMiwiIiwyLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFszLDIsIiIsMix7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDksIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0=
% tex-fmt: skip
\[ \begin{tikzcd} \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-40pt}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=40pt}, from=1-1, to=1-3] \arrow[shorten <=2pt, shorten >=2pt, Rightarrow, from=3, to=1] \arrow[shorten <=2pt, shorten >=2pt, Rightarrow, from=0, to=2] \arrow[""{name=4, anchor=center, inner sep=0}, shift left=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=5, anchor=center, inner sep=0}, shift right=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[shorten <=4pt, shorten >=4pt, Rightarrow, scaling nfold=3, from=4, to=5] \end{tikzcd} \]
The composite over this context allows us to ``fix'' the \(1\)-dimensional boundary of a \(3\)-dimensional term.
\begin{definition}
Let \(t\) be an \(n\)-dimensional term of a pasting diagram \(\Delta\). Define its padding \(P(t)\) to be equal to \(P_n(t)\) where:
\[ P_0(t) = t \qquad P_{k+1}(t) = \stdcoh {T_{k}^{n}} n \sub {\langle \phi(\src_{k}(P_{k}(t)))^{-1}, P_{k}(t), \phi(\tgt_k(P_{k}(t))) \rangle}\]
where \(\src_k\) and \(\tgt_k\) give the \(k\)-dimensional source and target of a term.
\end{definition}
Consider the term \(\alpha : \arr f {\arr x \star y} g\).
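As a direct unfolding of the definition above, since \(\src_0(\alpha) \equiv x\) and \(\tgt_0(\alpha) \equiv y\), the first padding step for \(\alpha\) is:
\[ P_1(\alpha) \equiv \stdcoh {T_0^2} 2 \sub {\langle \phi(x)^{-1}, \alpha, \phi(y) \rangle}\]
which composes \(\alpha\) with the normalisers of its \(0\)-boundary.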
Diagrammatically, we obtain the following sequence of paddings:
\begin{center}
\begin{tabular}{P{3cm} P{9cm}}
\(P_0(\alpha)\)&{\begin{tikzcd}[ampersand replacement=\&] x \& y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-1, to=1-2] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=1] \end{tikzcd}}\\
\(P_1(\alpha)\)&{\begin{tikzcd}[ampersand replacement=\&] {R(N(x))} \& x \& y \& {R(N(y))} \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-2, to=1-3] \arrow["{\phi(x)}"', from=1-2, to=1-1] \arrow["{\phi(y)}", from=1-3, to=1-4] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=1] \end{tikzcd}}\\
\(P_2(\alpha)\)&{\begin{tikzcd}[ampersand replacement=\&] {R(N(x))} \& x \& y \& {R(N(y))} \arrow[""{name=0, anchor=center, inner sep=0}, "f"{description}, curve={height=12pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"{description}, curve={height=-12pt}, from=1-2, to=1-3] \arrow["{\phi(x)}"', from=1-2, to=1-1] \arrow["{\phi(y)}", from=1-3, to=1-4] \arrow[""{name=2, anchor=center, inner sep=0}, "R(N(g))", curve={height=-60pt}, from=1-1, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "R(N(f))"', curve={height=60pt}, from=1-1, to=1-4] \arrow["\alpha"', shorten <=4pt, shorten >=3pt, Rightarrow, from=0, to=1] \arrow["{\phi(\phi(x)^{-1} * g * \phi(y))}"{description}, shorten <=5pt, shorten >=3pt, Rightarrow, from=1, to=2] \arrow["{\phi(\phi(x)^{-1}*f*\phi(y))}"{description}, shorten <=5pt, shorten >=3pt, Rightarrow, from=0, to=3] \end{tikzcd}}
\end{tabular}
\end{center}
We lastly define the normaliser coherences. As these are each built from a coherence constructor with the rule for equivalences, they can all be inverted.
\begin{definition}
Let \(t\) be a term of a pasting diagram \(\Delta\). By \cref{cor:supp-ps}, \(\Supp(t)\) is a pasting diagram, and we let \(i_t\) be the inclusion \(\Supp(t) \to \Delta\). Then we define the normaliser \(\phi(t)\):
\[ \phi(t) = \Coh {\Supp(t)} {t \to R(N(t))} {i_t}\]
By assumption, \(R(N(t)) = N(t) = t\) and so \(\Supp(R(N(t))) = \Supp(t)\), making the above term well-formed.
\end{definition}
We now prove the required properties, starting with statement 1. The statements for types and substitutions follow by a simple induction using the case for terms, as if \(A = B\) then \(R(N(A)) = R(N(B))\) (as \(N(A) = N(B)\)). The case for a variable is also trivial, so assume that:
\[\Delta \vdash_{\mathcal{R}} \Coh \Gamma B \sigma : A\]
Then it follows from induction on subterms that \(\Gamma \vdash R(B)\) and \(\Delta \vdash R(\sigma) : \Gamma\), and so:
\[ \Delta \vdash \Coh \Gamma {R(B)} {R(\sigma)} : R(B) \sub {R(\sigma)}\]
Then by induction on statement (3), we get:
\[ \Delta \vdash P(\Coh \Gamma {R(B)} {R(\sigma)}) : R(N(R(B) \sub {R(\sigma)})) \]
By induction on statement (2), we have \(R(B) \sub {R(\sigma)} = B \sub \sigma\). By inspection of the original typing derivation, we have \(B \sub \sigma = A\), and so \(R(N(R(B) \sub {R(\sigma)})) \equiv R(N(A))\), as required. Now consider statement 2. The cases for types and substitutions follow by an easy induction from the result for terms.
Since the case for variables is trivial, we restrict to the cases for the coherence terms, where we must prove that:
\[ \Gamma \vdash_{\mathcal{R}} \Coh \Delta A \sigma = P(\Coh \Delta {R(A)} {R(\sigma)})\]
By (1), \(\Coh \Delta {R(A)} {R(\sigma)}\) is a well-formed \Catt term, and so by (4) and induction on subterms we have:
\[ P(\Coh \Delta {R(A)} {R(\sigma)}) = \Coh \Delta {R(A)} {R(\sigma)} = \Coh \Delta A \sigma\]
For statement 3, we let \(\Delta \vdash t : A\) and prove for each \(k\) that \(P_k(t)\) is well-formed and that \(\src_m(P_k(t)) \equiv R(N(\src_m(t)))\) and \(\tgt_m(P_k(t)) \equiv R(N(\tgt_m(t)))\) for \(m \leq k\). We proceed by induction on \(k\). The case for \(k = 0\) is trivial, so we must prove that \(P_{k+1}(t)\) is well-formed, which is the term:
\[\stdcoh {T_{k}^{n}} n \sub {\langle \phi(\src_{k}(P_{k}(t)))^{-1}, P_{k}(t), \phi(\tgt_k(P_{k}(t))) \rangle} \]
By (5), noting that the inductive hypothesis on \(k\) implies that the types of \(\src_k(P_k(t))\) and \(\tgt_k(P_k(t))\) are in rehydrated normal form, we have that the normalisers are well-typed. Therefore, \(P_{k+1}(t)\) is well-formed by the previous fact and the inductive hypothesis on \(k\). By simple calculation it follows that:
\[\src_m(P_k(t)) \equiv \src_m(P_m(t)) \equiv \src(\phi(\src_m(t))^{-1}) \equiv R(N(\src_m(t)))\]
with a similar equation holding for the target. It then follows that \(\Delta \vdash P(t) : R(N(A))\). Statement 4 holds by a simple induction on \(k\), using statement (6) to reduce each normaliser to an identity, and then using pruning and disc removal to get the equality:
\[ \stdcoh {T_{k}^{n}} n \sub {\langle \id(\src_{k}(P_{k}(t))), P_{k}(t), \id(\tgt_k(P_{k}(t))) \rangle} = P_k(t) \]
which along with the inductive hypothesis on \(k\) is sufficient. For statement 5, we assume \(\Delta \vdash t : R(N(A))\). Then, by (1) and the preservation rule, we have \(\Delta \vdash R(N(t)) : R(N(R(N(A)))) \equiv R(N(A))\), where the equality follows from (2) and the idempotency of the normal form functor. The typing for the normaliser then trivially follows, as \(t\) and \(R(N(t))\) are full in \(\Supp(t)\). For statement 6, we apply statement (1) to get that \(t = N(t) = R(N(t))\). Therefore:
\begin{align*}
\phi(t) &\equiv \Coh {\Supp(t)} {t \to R(N(t))} {i_t}\\
&= \Coh {\Supp(t)} {t \to t} {i_t}\\
&= \id(t) \sub {i_t}&\text{by endo-coherence removal}\\
&\equiv \id(t)
\end{align*}
This completes all parts of the definitions and proofs. Then for any well-formed \Cattr term \(t\), \(R(N(t))\) is a well-formed \Catt term with \(R(N(t)) = t\) in \Cattr, completing the proof of \cref{thm:rehydration}. Moreover, if \(t = t'\) then \(R(N(t)) \equiv R(N(t'))\), and so the rehydration of \Cattr terms over pasting contexts can be chosen to respect \Cattr equality. From this we get the following corollary.
\begin{corollary}
Semistrictness is a property. Let \(\mathcal{R}\) be a tame equality rule set satisfying the support and preservation conditions in addition to having pruning, disc removal, and endo-coherence removal. If \(F\) and \(G\) are \Cattr models such that:
\[F \circ K_{\mathcal{R}}^{\mathsf{ps}} = G \circ K_{\mathcal{R}}^{\mathsf{ps}}\]
then \(F = G\).
\end{corollary}
\begin{proof}
Since \(K_{\mathcal{R}}^{\mathsf{ps}}\) is the identity on objects, it follows that \(F\) and \(G\) must be equal on objects. Now let \(\Gamma\) and \(\Delta\) be pasting diagrams, and let \(\Gamma \vdash_{\mathcal{R}} \sigma : \Delta\).
Then by \cref{thm:rehydration} we have \(\Gamma \vdash R(\sigma) : \Delta\), and so:
\[ F(K_{\mathcal{R}}^{\mathsf{ps}}(R(\sigma))) = G(K_{\mathcal{R}}^{\mathsf{ps}}(R(\sigma))) \]
but \(K_{\mathcal{R}}^{\mathsf{ps}}\) is simply an inclusion, so \(F(R(\sigma)) = G(R(\sigma))\) and since \(R(\sigma) = \sigma\) in \Cattr, we have \(F(\sigma) = G(\sigma)\). The substitution \(\sigma\) was arbitrary, so \(F = G\) as required.
\end{proof}
The above result holds in particular for the equality rule sets \su and \sua, meaning that a model of \Catt can be a model of \Cattsu or \Cattsua in at most one way.
\subsection{Towards generalised rehydration}
\label{sec:towards-gener-rehydr}
The rehydration result of the previous section can be viewed as a partial conservativity result, stating that in a pasting context, \Cattsu and \Cattsua have the same expressive power as \Catt. The original motivation of semistrictness was to strictify parts of the theory without losing the expressiveness of the fully weak setting. We would therefore hope that the rehydration results of \cref{sec:rehydration} extend to arbitrary contexts. Such a result would be a powerful tool for constructing terms in a weak setting; a term could be obtained by constructing it in the semistrict setting and then applying rehydration to the result to get a term in the fully weak setting. Such a technique would allow a \Catt proof of Eckmann-Hilton to be constructed mechanically from the vastly simpler \Cattsu Eckmann-Hilton proof, or even give a proof of the syllepsis in \Catt, for which no proof has been given as of this writing. By observing the proof of \cref{thm:rehydration}, we see that the main part that would need replacing for a general rehydration result is the construction of the normalisers, as we can no longer rely on the source and target terms of our normaliser living over a pasting diagram that allows the construction of a single coherence. A natural way to proceed is to attempt to build a normaliser \(\phi(t) : t \to R(N(t))\) by recursion on the reduction sequence \(t \red^* N(t)\). Consider a context with \(x : *\) and a scalar \(\alpha : \id(x) \to \id(x)\), and the following reduction by pruning:
\[ \alpha *_0 \id(x) \red (\alpha)\]
where \((\alpha)\) is the unary composite on \(\alpha\). We immediately encounter two problems:
\begin{itemize}
\item For each individual reduction, the source and target of the reduction may not have the same type. In the example above, the source has type \(\id(x) * \id(x) \to \id(x) * \id(x)\), but the target has type \(\id(x) \to \id(x)\). A normaliser between these two terms can therefore not be directly constructed.
\item If the source term is padded such that it has the same type as the target term, we can run into a separate problem. Consider the reduction given above again.
The following normaliser can be formed:
\[ \Coh {D^2} {\rho_{d_1^-}^{-1} *_1 (d_2 *_0 \id(d_0^+)) *_1 \rho_{d_1^+} \to (d_2)} {\langle \{\alpha\} \rangle}\]
which has source given by the padded term:
\[ \begin{tikzcd} x & x & x \arrow[""{name=0, anchor=center, inner sep=0}, "{\id(x)}", curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "{\id(x)}"', curve={height=12pt}, from=1-1, to=1-2] \arrow["{\id(x)}"', from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "{\id(x)}", controls=+(90:1.5) and +(90:1.5), from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "{\id(x)}"', controls=+(270:1.5) and +(270:1.5), from=1-1, to=1-3] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["{\rho_{\id(x)}}", shorten >=3pt, Rightarrow, from=1-2, to=3] \arrow["{\rho_{\id(x)}}"', shorten >=3pt, Rightarrow, from=1-2, to=2] \end{tikzcd} \]
However, this term is padded by the right unitor on each side, which is not the canonical normaliser from \(\id(x) * \id(x)\) to \(\id(x)\), namely the unbiased unitor.
\end{itemize}
The reduction above was chosen not only to demonstrate both of these problems, but also because it is the problematic reduction encountered if one tries to rehydrate the Eckmann-Hilton term from \Cattsu. To give a proof of Eckmann-Hilton, one reaches a critical point where a left unitor and right unitor on the identity must be cancelled out, highlighting the second of the two problems. To solve the second problem, one could attempt to prove that for any two reduction paths from \(t\) to \(N(t)\), there is a higher cell between the normalisers generated from each reduction path, critically relying on the confluence proof for the theory to modularise the problem into finding fillers for each confluence diamond. Such an approach seems infeasible for the following reasons. To find fillers for a confluence diamond, we presumably must already know the form of all rehydrations in the dimension below, which themselves could depend on filling confluence diamonds of the dimension below. This seems to necessitate rehydrating on a dimension-by-dimension basis, making the full rehydration problem infeasible. It is also likely that at some point it would be necessary to show that two different fillers of a confluence diamond have a higher cell between them, leading to some form of \(\infty\)-groupoid-flavoured confluence problem. Such a problem also seems infeasible with the tools currently available to us. An alternative approach could be to show that the ``space'' of all rehydrations is contractible. This can be made precise in the following way. Let \(t\) be a \Cattr term. Then consider the globular set whose \(0\)-cells are \Catt terms \(s\) which are equal to \(t\) in \Cattr, whose \(1\)-cells are \Catt terms \(f : s \to s'\) which are equal to \(\id(t)\) in \Cattr, and in general whose \(n\)-cells are \Catt terms that are equal to \(\id^n(t)\). The contractibility of such a globular set is exactly the property needed for rehydration, as it gives the existence of a \(0\)-cell \(s\) which provides the rehydration, and witnesses the essential uniqueness of this rehydration. Such a contractibility proof can be given when the term \(t\) is a term of a pasting diagram, as any higher cells can be given by a simple coherence. This allows us to fix the padding in the example above, observing that the right unitor is equivalent to the unbiased unitor.
It is, however, unclear how such a contractibility proof could be extended to arbitrary contexts. We now turn our attention to the first problem presented above. One method for tackling this problem is to give normalisers as \emph{cylindrical equivalences} instead of regular equivalences. A cylindrical equivalence can be viewed as the canonical notion of equivalence between two objects of different types. We introduce the first few dimensions of cylinder terms. A \(0\)-cylinder is simply a \(1\)-dimensional term. A \(1\)-cylinder from a \(0\)-cylinder \(f : w \to x\) to a \(0\)-cylinder \(g : y \to z\) can be defined by the square:
\[ \begin{tikzcd} x & z \\ w & y \arrow["f", from=2-1, to=1-1] \arrow["g"', from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["a'", from=1-1, to=1-2] \arrow[Rightarrow, from=2-2, to=1-1] \end{tikzcd} \]
where the central arrow has type \(a * g \to f * a'\). If such a cylinder were invertible, which is the case when \(a\), \(a'\), and the two-dimensional cell are invertible, then it would be a cylindrical equivalence and would witness the equivalence of \(f\) and \(g\). Suppose we have two \(1\)-cylinders \(\alpha : f \to g\) and \(\beta : g \to h\), as below:
\[ \begin{tikzcd} x & z & v \\ w & y & u \arrow["f", from=2-1, to=1-1] \arrow["g"{description}, from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["{a'}", from=1-1, to=1-2] \arrow[Rightarrow,from=2-2, to=1-1] \arrow["h"', from=2-3, to=1-3] \arrow["b"', from=2-2, to=2-3] \arrow["{b'}", from=1-2, to=1-3] \arrow[Rightarrow,from=2-3, to=1-2] \end{tikzcd} \]
Then a composite cylinder \(f \to h\) could be formed by letting the front ``face'' be given by \(a * b\), the back ``face'' be given by \(a' * b'\), and the filler by a combination of associators and whiskerings of the two fillers in the diagram. A \(2\)-cylinder could be given by the following diagram:
% https://q.uiver.app/#q=WzAsNCxbMCwxLCJcXGJ1bGxldCJdLFsyLDEsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFswLDEsIiIsMCx7ImN1cnZlIjotNX1dLFswLDEsIiIsMix7ImN1cnZlIjo1fV0sWzIsMywiIiwyLHsiY3VydmUiOi01fV0sWzIsMywiIiwwLHsiY3VydmUiOjV9XSxbMCwyXSxbMSwzXSxbNCw2LCIiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNywiIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ==
% tex-fmt: skip
\[\begin{tikzcd} && \bullet && \bullet \\ \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-40pt}, from=2-1, to=2-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=40pt}, from=2-1, to=2-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-40pt}, from=1-3, to=1-5] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=40pt}, from=1-3, to=1-5] \arrow[from=2-1, to=1-3] \arrow[from=2-3, to=1-5] \arrow[shorten <=13pt, shorten >=13pt, Rightarrow, from=0, to=2] \arrow[shorten <=13pt, shorten >=13pt, Rightarrow, from=1, to=3] \arrow[shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[shorten <=8pt, shorten >=8pt, Rightarrow, from=3, to=2] \end{tikzcd}\]
where the top and bottom faces of this diagram are \(1\)-cylinders, and the whole diagram should be filled by a \(3\)-dimensional term with appropriate source and target. The shape of this diagram gives this construction its name. When using cylinders to represent the normalisers in a rehydration process, the inductive step for coherences would require a cylinder to be generated from a cylindrical version of the substitution attached to the coherence.
We have seen that this can be done when the coherence is given by \(1\)-composition, but achieving full rehydration would involve giving cylindrical versions of every operation in \Catt. No such proof has been given for any variety of globular weak \(\infty\)-categories. We offer an alternative solution which avoids defining cylinder composition, which we call \emph{rehydration by dimension}. From an equality rule set \(\mathcal{R}\), we can form the rule sets \(\mathcal{R}_n\), where \(\mathcal{R}_n\) consists of the rules \((\Gamma,s,t) \in \mathcal{R}\) such that \(\dim(s) = \dim(t) \leq n\). Rehydration by dimension attempts to rehydrate an \(n\)-dimensional term \(t\) by constructing terms \(t_n,\dots,t_0\) such that \(t_i\) is a term which is well-formed in \(\Catt_{\mathcal{R}_i}\), creating a rehydration sequence:
\[ \Catt_{\mathcal{R}_n} \to \Catt_{\mathcal{R}_{n-1}} \to \cdots \to \Catt_{\mathcal{R}_1} \to \Catt_{\mathcal{R}_0}\]
The term \(t_n\) is given immediately by \(t\), and \(t_0\) is then a term of \(\Catt_{\mathcal{R}_0} = \Catt\), giving the rehydration of \(t\). The key insight of this method is that when generating the normaliser for a particular \(k\)-dimensional generating rule \(s \red t\), we know by the preservation property that the types of \(s\) and \(t\) are equal, and so are further equal in \(\Catt_{\mathcal{R}_{k-1}}\). By factoring through these partial rehydrations, the normaliser of a dimension \(k\) generating rule only has to be valid in \(\Catt_{\mathcal{R}_{k-1}}\), meaning that the normalisers can again be given by regular equivalences. Unfortunately, this method does not avoid the need to define new classes of operations in \Catt, as we could be required to prove that arbitrary \Catt operations are natural in their lower-dimensional arguments. Consider terms \(f : x \to y\) and \(g : y \to z\) and suppose the \(\Catt_{\mathcal{R}_1}\) normal form of \(y\) is \(y'\) with normaliser \(\phi(y)\). Then, during a rehydration proof to \(\Catt_{\mathcal{R}_0}\), it may be required to give a normaliser from \(f * g\) to \((f * \phi(y)) * (\phi(y)^{-1} * g)\), effectively requiring us to prove that \(1\)-composition is natural in its central \(0\)-cell. Similarly to the case with cylinders, such a normaliser can easily be given for \(1\)-composition, but we possess no way of creating such naturality arguments for arbitrary coherences. The proofs of Eckmann-Hilton given in \cref{sec:examples} give an example of the result of each of these methods, with the proof in \texttt{/examples/eh.catt} proceeding by ``rehydration by dimension'', and the proof in \texttt{/examples/eh-cyll.catt} using cylinders. In both proofs, the only example of the second problem we encounter is proving that the left and right unitors on the identity are equivalent to the unbiased unitor. For the cylinder proof, the composition of \(1\)-cylinders is used and is given by the term \texttt{cyl\_comp}, which is then implicitly suspended by the tool. The rehydration by dimension proof needs a naturality move like the one described above, which is given by the term \texttt{compat\_move}.
\section{Future ideas}
\label{sec:future-work}
In this final section, we collect together some ideas for the continuation of this work, including ideas for different semistrict theories based on \Cattr, and modifications to the existing theories.
Some ideas for future avenues of research have already been discussed, such as the potential improvements to the implementation described in \cref{sec:further-work} and the discussion of full rehydration given in \cref{sec:towards-gener-rehydr}; we will not repeat these here.
\paragraph{Further results for \Cattsua}
The metatheory of \Cattsua is more complicated than the corresponding metatheory of \Cattsu, though at first glance the relative increase in power does not match this complexity. The jump from \Catt to \Cattsu vastly simplified the proof of Eckmann-Hilton, allowed the syllepsis to be proven, and led to results such as disc trivialisation. In contrast, \Cattsua provides no further simplification to Eckmann-Hilton and only slightly simplifies the syllepsis, removing some associators from the proof. One potential utility of \Cattsua could be simplifying the composites of cylinders, as briefly introduced in \cref{sec:towards-gener-rehydr}. Consider the following diagram from that section, which contains two composable \(1\)-cylinders.
\[ \begin{tikzcd} x & z & v \\ w & y & u \arrow["f", from=2-1, to=1-1] \arrow["g"{description}, from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["{a'}", from=1-1, to=1-2] \arrow["X", Rightarrow,from=2-2, to=1-1] \arrow["h"', from=2-3, to=1-3] \arrow["b"', from=2-2, to=2-3] \arrow["{b'}", from=1-2, to=1-3] \arrow["Y", Rightarrow,from=2-3, to=1-2] \end{tikzcd} \]
In \Catt, the \(1\)-composite of these cylinders is a term \((a*b)*h \to f*(a'*b')\) given by:
\[ \alpha_{a,b,h} *_1 (a *_0 Y) *_1 \alpha_{a,g,b'}^{-1} *_1 (X *_0 b') *_1 \alpha_{f,a',b'}\]
where each \(\alpha\) term is an associator. This would of course simplify in \Cattsua to \((a *_0 Y) *_1 (X *_0 b')\). Such a simplification could make it easier to define higher cylinder coherences, such as an associator for \(1\)-cylinders, which would be trivial in \Cattsua but far more involved in \Catt. Further work on \Cattsua could involve the search for an analogue of disc trivialisation. We would expect there to be a more general class of contexts that are trivialised by \Cattsua but not by \Cattsu. The cylinder contexts presented above could form a starting point for such a study. A separate avenue for further study is to explore the links between \Cattsua and more graphical presentations of semistrict \(\infty\)-categories. String diagrams are a common graphical method for working with monoidal categories and bicategories~\cite{selinger2011survey}, and their higher-dimensional counterparts, such as those implemented in the tool \textsf{homotopy.io}, can be viewed as strictly associative and unital finitely presented \(\infty\)-categories, much like contexts of \Cattsua. Translation results in either direction between these two settings, while highly non-trivial due to the contrast in the way each system approaches composition, would be valuable.
\paragraph{Generalised insertion}
The conditions given for insertion in \cref{sec:insertion} were not the most general conditions possible.
In that section, we stated that to perform insertion we required an insertion redex \((S,P,T,\U,L,M)\), and one of the conditions of this insertion redex was that:
\[ L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\]
It turns out that it is sufficient to give the weaker condition that the locally maximal argument is a coherence where the type contained in the coherence is sufficiently suspended:
\[ L(\olsi P) \equiv \Coh T {\Sigma^{\bh(P)}(A)} M\]
As \(\stdcoh {\Sigma(T)} {n+1} \equiv \Sigma (\stdcoh T n)\), and the original condition required that \(\th(T) \geq \bh(P)\), this alternative condition is a strict generalisation of the previous condition. Under the new condition, the exterior labelling must be modified. Firstly, it must take the type \(A\) as an argument. The case for \(P = [k]\) is then modified such that \(\kappa_{S,[k],T,A}\) (noting the extra type subscript) is given by:
\[ \begin{tikzcd}[column sep=smaller,row sep = 20pt] {[S_0,\dots,S_{k-1}]} & \doubleplus & {T} & \doubleplus & {[S_{k+1},\dots,S_n]} \\ \\ {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]} \arrow["{\{A, \Coh T A {\id_T}\}}"{description, font = \normalsize}, from=3-3, to=1-3] \arrow["\id"{font = \normalsize}, from=3-1, to=1-1] \arrow["\id"{font = \normalsize}, from=3-5, to=1-5] \end{tikzcd} \]
when \(S = [S_0,\dots,S_n]\). The inductive step of the exterior labelling then relies on the type \(A\) being sufficiently suspended to proceed, just as the original version depends on the trunk height of \(T\) being sufficient (we note that the trunk height condition is still needed in this generalisation). For the necessary typing judgements to be satisfied, we must have \(\src_0(A) \equiv \fst(\lfloor T \rfloor)\) and \(\tgt_0(A) \equiv \snd(\lfloor T \rfloor)\), but no other extra condition is necessary. In some ways, this definition of insertion is more natural than the definition given earlier. We no longer rely on the syntactic condition of the locally maximal argument being a standard coherence, relying only on the far weaker suspendability property. In the proof of confluence for \Cattsua, a large focus was on cases where a reduction modified a standard coherence into a term which was no longer a standard coherence. Cases like these do not arise with generalised insertion, as reductions do not break the suspendability property. More generally, a confluence proof for generalised insertion does not require any proof about the interaction of insertion with boundary inclusion maps and standard coherences (given in \cref{sec:further-properties} for the original definition). Unfortunately, this generalised form of insertion cannot be directly used in \Cattsua without breaking confluence.
Let \(\Gamma\) be the context given by the following diagram:
% https://q.uiver.app/#q=WzAsNCxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFswLDEsIiIsMCx7ImN1cnZlIjotNX1dLFswLDEsIiIsMix7ImN1cnZlIjo1fV0sWzEsMiwiZyIsMCx7ImN1cnZlIjotMn1dLFsxLDIsImYiLDIseyJjdXJ2ZSI6Mn1dLFsyLDMsImkiLDAseyJjdXJ2ZSI6LTJ9XSxbMiwzLCJoIiwyLHsiY3VydmUiOjJ9XSxbNSw0LCIiLDIseyJvZmZzZXQiOi01LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNCwiIiwwLHsib2Zmc2V0Ijo1LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsNiwiXFxhbHBoYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOSw4LCJcXGJldGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzEwLDExLCJcXHBoaSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0=
% tex-fmt: skip
\[ \begin{tikzcd} \bullet && \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "i", curve={height=-12pt}, from=1-4, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, "h"', curve={height=12pt}, from=1-4, to=1-5] \arrow[""{name=6, anchor=center, inner sep=0}, "\gamma", shift left=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[""{name=7, anchor=center, inner sep=0}, "\delta"', shift right=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \arrow["\beta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=4] \arrow["\phi"', shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=6, to=7] \end{tikzcd} \]
and consider the terms:
\begin{align*}
I &= (\alpha *_0 h) *_1 (g *_0 \beta)\\
E &= \Coh {\Supp(I)} {I \to I} {\id}\\
X &= \phi *_0 E
\end{align*}
We now have the following critical pair: \(X\) can reduce by inserting the locally maximal argument \(E\), as the branch has branching height \(0\), making the suspendability condition vacuous, but \(E\) also reduces by endo-coherence removal.
By performing the generalised insertion we obtain the coherence: \[ \Coh \Gamma {\gamma *_0 I \to \delta *_0 I} \id\] Let \(W(x,y,z)\) refer to the standard composite over the diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsxLDAsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTN9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6M31dLFsxLDIsIiIsMix7ImN1cnZlIjotNX1dLFsxLDIsIiIsMix7ImN1cnZlIjo1fV0sWzEsMl0sWzQsMywiIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDcsIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNyw1LCIiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=30pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["x"',shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["y"',shorten <=4pt, shorten >=4pt, Rightarrow, from=3, to=4] \arrow["z"',shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=2] \end{tikzcd} \] Then the coherence term above admits further cell reductions which convert the composites \(\gamma *_0 I\) and \(\delta *_0 I\) to \(W(\gamma, (\alpha *_0 h), (g *_0 \beta))\) and \(W(\delta, (\alpha *_0 h), (g *_0 \beta))\). The resulting term reduces no further. If the endo-coherence removal is performed, then \(E\) reduces to \(\id(I)\), which can be pruned from the original composite. After further reductions, we obtain a coherence over the context \(\Delta\) given by the following diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDAseyJjdXJ2ZSI6NX1dLFsxLDIsIiIsMCx7ImN1cnZlIjotMn1dLFsxLDIsIiIsMix7ImN1cnZlIjoyfV0sWzQsMywiIiwwLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs0LDMsIiIsMix7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDUsIkIiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsOCwiQSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-3, to=1-4] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=0pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=30pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "x", shift left=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[""{name=5, anchor=center, inner sep=0}, "y"', shift right=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow["B"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=6] \arrow["C"', shorten <=3pt, shorten >=3pt, Rightarrow, from=6, to=2] \arrow["A"', shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=4, to=5] \end{tikzcd} \] In particular, the result of these reductions is the following coherence: \[ \Coh \Delta {W(x,B,C) \to W(y,B,C)} {\langle \phi, (\alpha *_0 h), (g *_0 \beta) \rangle}\] which admits no further reductions, hence breaking confluence. 
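For ease of comparison, the two reduction paths just described can be summarised as ending at the distinct irreducible terms:
\[ \Coh \Gamma {W(\gamma, (\alpha *_0 h), (g *_0 \beta)) \to W(\delta, (\alpha *_0 h), (g *_0 \beta))} \id\]
and
\[ \Coh \Delta {W(x,B,C) \to W(y,B,C)} {\langle \phi, (\alpha *_0 h), (g *_0 \beta) \rangle}\]
neither of which reduces further, as noted above.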
It is even unclear which of these reduction paths is the more canonical for such a system: the first moves the complexity of \(I\) into the type of the coherence, whereas the second keeps the complexity of \(I\) in the arguments of the coherence. Conjecturally, one could consider generalisations to endo-coherence removal which could factor out the common structure of \(W(\gamma, (\alpha *_0 h), (g *_0 \beta))\) and \(W(\delta, (\alpha *_0 h), (g *_0 \beta))\), reducing the result of the first reduction path to the result of the second reduction path, though we have not explored any such definition.
\paragraph{A further strictification to \Cattsua}
\citeauthor{douglas2016internal} give an explicit representation of a Gray category~\cite[Definition~2.8]{douglas2016internal}, which can be used as a direct point of comparison to \Cattsua, as Gray categories are semistrict \(3\)-categories with strict unitors and associators. The weak structure in their presentation of Gray categories is given by an invertible \(3\)-cell they call \emph{switch}, which has the same form as the \Catt term which we called \(\mathsf{swap}\) in \cref{sec:cattsu}. In their paper, all of the equalities between \(2\)-cells are generated by a set of axioms [S2-4] to [S2-15]. Each of these equalities is contained in the definitional equality of \Cattsua, with the exception of [S2-9] and [S2-10], which witness a compatibility between whiskering and vertical composition. We consider the axiom [S2-9], as [S2-10] can be treated symmetrically. Let \(\Delta\) be the context given by the diagram:
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsxLDAsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbMCwxLCJmIl0sWzEsMiwiIiwwLHsiY3VydmUiOi00fV0sWzEsMiwiIiwwLHsiY3VydmUiOjR9XSxbMSwyXSxbNSw2LCJcXGFscGhhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDQsIlxcYmV0YSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0=
% tex-fmt: skip
\[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow["f", from=1-1, to=1-2] \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-24pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=24pt}, from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=2] \arrow["\beta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=0] \end{tikzcd} \]
and consider the following terms of \(\Delta\):
\[ (f *_0 \alpha) *_1 (f *_0 \beta) \qquad f *_0 (\alpha *_1 \beta)\]
While the second term reduces to the standard composite over \(\Delta\), the first does not reduce, as no insertion can be performed due to the condition on trunk height; hence these two terms are not equal in \Cattsua, unlike in Gray categories. Although it could be argued that these axioms reside in the interchange family of laws for \(\infty\)-categories, one could attempt to define a stricter version of \Cattsua which incorporates these equalities, with the aim of proving that \(3\)-truncated models of this stricter type theory are equivalent to Gray categories.
\paragraph{Strict interchange}
In contrast to the reductions in this thesis which strictify units, one could instead consider reductions that strictify all composition, making the associativity and interchange laws strict and leaving only units weak.
Such a form of semistrictness is often called \emph{Simpson semistrictness}, due to a conjecture of \citeauthor{simpson1998homotopy}~\cite{simpson1998homotopy} that leaving units weak is sufficient to retain the full expressiveness of weak \(\infty\)-categories. To achieve this, one could try an approach similar to insertion, merging arguments of a term into the head coherence when all the involved terms are standard coherences. To be able to strictify terms such as the \(\mathsf{swap}\) term given in \cref{sec:cattsu}, the trunk height condition of insertion must be dropped. This immediately leads to composites over contexts which are not pasting diagrams. Consider the context generated by the diagram:
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMCwxLCJnIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiZiIsMix7ImN1cnZlIjozfV0sWzEsMiwiaSIsMCx7ImN1cnZlIjotM31dLFsxLDIsImgiLDIseyJjdXJ2ZSI6M31dLFs0LDMsIlxcYWxwaGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzYsNSwiXFxiZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta"', shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \]
and then consider the following composite in this context:
\[ \alpha *_0 ((\beta *_0 \id(\id(z))) *_1 \rho_i)\]
where \(\rho_i\) is the right unitor on \(i\).
Allowing a more general form of merging would lead to this term becoming a composite of the following form:
% https://q.uiver.app/#q=WzAsNCxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMywwLCJ6Il0sWzAsMSwiZyIsMCx7ImN1cnZlIjotM31dLFswLDEsImYiLDIseyJjdXJ2ZSI6M31dLFsxLDIsImkiXSxbMSwyLCJoIiwyLHsiY3VydmUiOjN9XSxbMiwzLCJcXGlkKHopIiwyXSxbMSwzLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMiwzLCJcXGlkKHopIiwyLHsiY3VydmUiOjN9XSxbNSw0LCJcXGFscGhhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDYsIlxcYmV0YSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMiw5LCJcXHJob19pIiwyLHsic2hvcnRlbiI6eyJ0YXJnZXQiOjIwfX1dLFsxMCw4LCJcXGlkKFxcaWQoeikpIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ==
% tex-fmt: skip
\[ \begin{tikzcd} x & y & z & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=25pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, "{\id}", from=1-3, to=1-4] \arrow[""{name=5, anchor=center, inner sep=0}, "i", curve={height=-40pt}, from=1-2, to=1-4] \arrow[""{name=6, anchor=center, inner sep=0}, "{\id}"', curve={height=25pt}, from=1-3, to=1-4] \arrow["\alpha"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta"', shorten <=2pt, shorten >=2pt, Rightarrow, from=3, to=2] \arrow["{\rho_i}"'{pos=0.4}, shorten >=3pt, Rightarrow, from=1-3, to=5] \arrow["{\id^2}"', shorten <=2pt, shorten >=2pt, Rightarrow, from=6, to=4] \end{tikzcd} \]
Although this diagram is not a pasting diagram, as it is not a globular set, we would still expect it to fulfil a contractibility property similar to the one pasting diagrams satisfy. One may therefore be led to believe that strict interchange could be achieved in a type theory similar to \Catt by allowing a more general class of pasting diagrams. This, however, does not work. We consider the following counterexample due to \citeauthor{forest2022unifying}~\cite{forest2022unifying}: let \(\Gamma\) be the context generated by the following diagram.
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6NX1dLFswLDFdLFsxLDIsIiIsMSx7ImN1cnZlIjotNX1dLFsxLDIsIiIsMSx7ImN1cnZlIjo1fV0sWzEsMl0sWzQsNSwiXFxhbHBoYSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNCw1LCJcXGFscGhhJyIsMix7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs1LDMsIlxcYmV0YSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNSwzLCJcXGJldGEnIiwyLHsib2Zmc2V0Ijo0LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzgsNiwiXFxkZWx0YSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOCw2LCJcXGRlbHRhJyIsMix7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDgsIlxcZ2FtbWEiLDIseyJvZmZzZXQiOi00LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsOCwiXFxnYW1tYSciLDIseyJvZmZzZXQiOjQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "h", curve={height=-40pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=40pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "k", curve={height=-40pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "i"', curve={height=40pt}, from=1-3, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, "j"{description}, from=1-3, to=1-5] \arrow["\alpha\vphantom{\alpha'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["{\alpha'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\beta\vphantom{\beta'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["{\beta'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\delta\vphantom{\delta'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \arrow["{\delta'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \arrow["\gamma\vphantom{\gamma'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["{\gamma'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \end{tikzcd} \] and let \(\Delta = \Gamma, (X : \alpha *_0 \delta \to \alpha' *_0 \delta'), (Y : \beta *_0 \gamma \to \beta' *_0 \gamma')\). We then have the following distinct composites: \[ \left( \begin{matrix} f *_0 \gamma\\ *_1\\ X\\ *_1\\ \beta *_0 k \end{matrix} \right) *_2 \left( \begin{matrix} \alpha' *_0 i\\ *_1\\ Y\\ *_1\\ h *_0 \delta' \end{matrix} \right) \not\cong \left( \begin{matrix} \alpha *_0 i\\ *_1\\ Y\\ *_1\\ h *_0 \delta \end{matrix} \right) *_2 \left( \begin{matrix} f *_0 \gamma'\\ *_1\\ X\\ *_1\\ \beta' *_0 k \end{matrix} \right) \] which are intuitively the composite of \(X\) and \(Y\) in either order, where \(X\) and \(Y\) have been whiskered with the appropriate terms. We note that the matrix notation above is only used to aid comprehension, and does not represent the application of any matrix operations. The approach described above of merging together composites would lead to both of the above composites of \(X\) and \(Y\) being reduced to the same composite over \(\Delta\), contradicting the viability of such an approach. 
An alternative, non-rewriting-based approach could be defined by the following equality rule:
\begin{equation*}
\left\{ (\Gamma, s \sub \sigma, t \sub \sigma) \mathrel{\bigg\vert}{} \begin{matrix*}[l] \text{\(s\) and \(t\) are pure composite terms,}\\ s = t \text{ in a strict \(\infty\)-category} \end{matrix*} \right\}
\end{equation*}
where a \emph{pure composite} is a term constructed only using standard composites. Such an approach avoids the counterexample above, as the two composites of \(X\) and \(Y\) are not equal in a strict \(\infty\)-category, and so would not be equated in the type theory generated by this equality rule set. We note that due to an algorithm of \citeauthor{makkai2005word}~\cite{makkai2005word}, which is also described and implemented by \citeauthor{forest2021computational}~\cite{forest2021computational}, it can be decided whether terms \(s\) and \(t\) are equal in a strict \(\infty\)-category. Therefore, to decide equality in the above system, we need a method of finding the correct decomposition of a term into a substitution applied to a pure composite term. We conjecture that there exists a factorisation system on \(\mathsf{Catt}\) with the left class of morphisms given by purely compositional substitutions (substitutions whose contained terms are all pure composites), which could be used for this purpose. We leave all details of such a construction for future work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% References: %%
\printbibliography
\end{document}
% Local Variables:
% jinx-local-words: "CollegeShields ps"
% TeX-engine: xetex
% End:
tex-fmt-0.5.2/tests/source/ignore.tex000066400000000000000000000007111473573253500175660ustar00rootroot00000000000000\documentclass{article} \begin{document} Lines which end with the ignore keyword are not indented or wrapped even if they are long % tex-fmt: skip % tex-fmt: off It is also possible to ignore blocks of lines together and not indent them even like this % tex-fmt: on Not ignored % tex-fmt: on Not ignored % tex-fmt: off Ignored % tex-fmt: off Ignored % tex-fmt: on Not ignored % tex-fmt: off Ignored \end{document} tex-fmt-0.5.2/tests/source/lists.tex000066400000000000000000000006521473573253500174450ustar00rootroot00000000000000\documentclass{article} \begin{document} \begin{itemize} \item Lists with items on one line \item Lists with items on multiple lines % comments before a list item \item Another item \item Another item % comments inside a list item Or even just % trailing comments \item Every \item should start \item a new line \end{itemize} Commands such as itemsep should not be affected.
\setlength{\itemsep}{0pt} \end{document} tex-fmt-0.5.2/tests/source/masters_dissertation.tex000066400000000000000000002722271473573253500225660ustar00rootroot00000000000000\documentclass[12pt,draft]{ociamthesis} %TC:ignore % PDF Version %\pdfminorversion=7 % general typesetting \usepackage[utf8]{inputenc} \usepackage[british]{babel} \usepackage{microtype} \usepackage[table]{xcolor} % lengths \usepackage{etoolbox} \usepackage{setspace} % mathematics typesetting \usepackage{amsmath} \usepackage{amssymb} \usepackage{amsthm} \usepackage{mathtools} \usepackage{dsfont} % headers \usepackage{fancyhdr} \fancyhead{} \renewcommand{\headrulewidth}{0pt} % Lof, LoT \usepackage{tocloft} \setlength{\cftfigindent}{0pt} \setlength{\cfttabindent}{0pt} % algorithms \usepackage[boxruled, linesnumbered, commentsnumbered, algochapter, ]{algorithm2e} % graphics \usepackage{graphicx} \usepackage{float} \usepackage{subcaption} % draft options %\usepackage{ifdraft} %\ifoptiondraft{ %\usepackage{draftwatermark} %\SetWatermarkText{DRAFT} %\SetWatermarkScale{6} %\SetWatermarkColor[rgb]{1,0.9,0.9} %\usepackage{showframe} %\usepackage{layout} %}{} % hyperlinks \usepackage[plainpages=false,draft=false ,hidelinks ]{hyperref} \usepackage{cite} % glossary \usepackage[nopostdot,nonumberlist]{glossaries} %TC:endignore % suppress pdf warnings %\pdfsuppresswarningpagegroup=1 \title{Motif\hspace*{0.05cm}-Based Spectral Clustering\\[1ex] of Weighted Directed Networks} \author{William George Underwood} \college{Department of Statistics} \renewcommand{\submittedtext}{} \degree{Part C Dissertation in Mathematics \& Statistics} \degreedate{Trinity 2019} %TC:ignore \theoremstyle{plain} \newtheorem{theorem}{Theorem}[chapter] \newtheorem{proposition}{Proposition}[chapter] \theoremstyle{definition} \newtheorem{definition}{Definition}[chapter] \newtheorem{example}{Example}[chapter] \newtheorem{prf}{Proof}[chapter] \theoremstyle{remark} \newtheorem*{remark}{Remark} \newtheorem*{notation}{Notation} % algorithms \DontPrintSemicolon % input output definitions \makeatletter \renewcommand{\SetKwInOut}[2]{% \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% \newcommand\InOutSizeDefined{}\setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~}% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \else% else keep the larger dimension \ifdim\wd\algocf@inoutbox>\inoutsize% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~}% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \fi% \fi% the dimension of the box is now defined. 
\algocf@newcommand{#1}[1]{% \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}% {\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% {\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~##1\par}% \algocf@linesnumbered% reset the numbering of the lines }}% \makeatother % keywords \SetKwInOut{Input}{Input} \SetKwInOut{Output}{Output} \SetKw{In}{in} \SetKwProg{Function}{function}{:}{} % algorithm comment styles \newcommand\commfont[1]{\rmfamily{#1}} \SetCommentSty{commfont} \SetKwComment{Comm}{$\rhd\ $}{} % line spacing \AtBeginEnvironment{algorithm}{\setstretch{1.15}} % glossaries \setlength{\glsdescwidth}{0.92\hsize} \newglossarystyle{mystyle}{% \setglossarystyle{long}% \renewenvironment{theglossary}% \begin{longtable}{@{}p{2cm}p{\glsdescwidth}}% \end{longtable}% } \makeglossaries % macros \newcommand\bb[1]{\mathbb{#1}} \newcommand\ca[1]{\mathcal{#1}} \newcommand\Aut{\mathrm{Aut}} % for inputting tables \makeatletter\let\expandableinput\@@input\makeatother %TC:endignore % Glossary \newglossaryentry{MAM}{name=MAM, description={Motif adjacency matrix}} \newglossaryentry{DSBM}{name=DSBM, description={Directed stochastic block model}} \newglossaryentry{BSBM}{name=BSBM, description={Bipartite stochastic block model}} \newglossaryentry{Ncut}{name=Ncut, description={Normalised cut}} \newglossaryentry{ARI}{name=ARI, description={Adjusted Rand Index}} \glsaddall \begin{document} %TC:ignore % give sufficient line spacing for comment markup \baselineskip=18pt plus1pt % set how many section levels get numbers and appear in the contents \setcounter{secnumdepth}{3} \setcounter{tocdepth}{2} % do not hyphenate short words \lefthyphenmin4 \righthyphenmin4 \pagenumbering{Alph} %TC:endignore \maketitle \clearpage{} \begin{abstract} Clustering is an essential technique for network analysis, with applications in a diverse range of fields. Although spectral clustering is a popular and effective method, it fails to consider higher-order structure and can perform poorly on directed networks. We aim to address these shortcomings by exploring motif-based spectral clustering methods. We present new matrix formulae for motif adjacency matrices, and a motif-based approach for clustering bipartite networks. Comprehensive experimental results from both synthetic and real data demonstrate the effectiveness of our techniques on a variety of networks. We conclude that motif-based spectral clustering is a valuable tool for analysis of directed and bipartite weighted networks, which is also scalable and easy to implement. \end{abstract} \clearpage{} %TC:ignore \pagenumbering{arabic} \begin{romanpages} \tableofcontents \newpage \listoffigures \newpage \listoftables \begingroup \let\cleardoublepage\relax \let\clearpage\relax %\printglossary[title=Abbreviations, style=mystyle] \endgroup \end{romanpages} % fancy headers \pagestyle{fancy} \renewcommand{\chaptermark}[1]{\markboth{#1}{}} \fancyhead[RO]{\itshape{\nouppercase{Chapter \thechapter : \leftmark}}} %TC:endignore \clearpage{} \chapter{Introduction} % Importance of network analysis in the modern world Networks are ubiquitous in modern society; from the internet and online blogs to protein interactions and human migration, we are surrounded by inherently connected structures~\cite{kolaczyk2014statistical}. 
The mathematical and statistical analysis of networks is therefore a very important area of modern research, with applications in a diverse range of fields including biology~\cite{albert2005scale}, chemistry~\cite{jacob2018statistics}, physics~\cite{newman2008physics} and sociology~\cite{adamic2005political}.
% Clustering is a core technique
A common problem in network analysis is that of \emph{clustering}~\cite{schaeffer2007graph}. Network clustering refers to the division of a network into several parts so that objects in the same part are similar, while those in different parts are dissimilar.
%Spectral methods are good
Spectral methods for network clustering have a long and successful history~\cite{cheeger1969lower,donath1972algorithms,guattery1995performance}, and have become increasingly popular in recent years. These techniques exhibit many attractive properties including generality, ease of implementation and scalability~\cite{von2007tutorial}.
% Shortcomings of spectral methods
However, traditional spectral methods have shortcomings, particularly their inability to consider higher-order network structures~\cite{benson2016higher} and their insensitivity to edge direction~\cite{DirectedClustImbCuts}. These weaknesses can lead to unsatisfactory results, especially when working with directed networks. Motif-based spectral methods have proven more effective for clustering directed networks on the basis of higher-order structures~\cite{tsourakakis2017scalable}, with the introduction of the \emph{motif adjacency matrix} (MAM).
% Problems we want to solve
In this dissertation we will explore motif-based spectral clustering methods with a focus on addressing these shortcomings for weighted directed networks. Our main contributions include a collection of new matrix-based formulae for MAMs on weighted directed networks, and a motif-based approach for clustering bipartite networks. We also provide comprehensive experimental results both from synthetic data (stochastic block models) and from real-world network data.
\section*{Dissertation layout}
In Chapter~\ref{chap:graphs} we describe our graph-theoretic framework which provides a natural model for real-world weighted directed networks. We define motifs and instances, and then state and prove new matrix-based formulae for MAMs.
%
In Chapter~\ref{chap:spectral} we provide a summary of random-walk spectral clustering and discuss techniques for cluster extraction and evaluation. We state the algorithms for both traditional and motif-based spectral clustering.
%
In Chapter~\ref{chap:motif} we introduce directed stochastic block models (DSBMs), a family of generative models for directed networks, and evaluate the performance of motif-based clustering both on synthetic data and on real data (US Political Blogs network, US Migration network).
%
In Chapter~\ref{chap:bipartite} we propose a motif-based approach for clustering bipartite graphs and introduce bipartite stochastic block models (BSBMs), a family of generative models for bipartite networks. We again provide experimental results both on synthetic data and on real data (American Revolution network, Unicode Languages network).
%
Finally, in Chapter~\ref{chap:conclusions} we present our conclusions, along with a discussion of limitations and potential extensions of our work.
\clearpage{}
\clearpage{}
\chapter{Graphs and Motifs}
\label{chap:graphs}
We describe our graph-theoretic framework for network analysis and give matrix-based formulae for motif adjacency matrices (MAMs).
In Section~\ref{sec:graphs_graph_definitions} we outline basic concepts relating to graphs and motifs. In Section~\ref{sec:graphs_adj_and_ind_matrices} we define the adjacency and indicator matrices of a graph. In Section~\ref{sec:graphs_motif_adj_matrices} we introduce MAMs and present the main results of this chapter, Proposition~\ref{prop:motif_adj_matrix_formula} and Proposition~\ref{prop:motif_adj_matrix_computation}. \section{Graph definitions} \label{sec:graphs_graph_definitions} Graph notation is notoriously inconsistent in the literature \cite{intro_to_graph_theory}, so we begin by giving all of the relevant notation and definitions. \begin{definition}[Graphs] A \emph{graph} is a triple $\ca{G} = (\ca{V,E},W)$ where $\ca{V}$ is the \emph{vertex set}, $\ca{E} \subseteq \left\{ (i,j) : i,j \in \ca{V}, i \neq j \right\}$ is the \emph{edge set} and $W\colon \ca{E} \to (0,\infty)$ is the \emph{weight map}. \end{definition} \begin{remark} We consider weighted directed graphs without self-loops or multiple edges. We can extend to undirected graphs by replacing undirected edges with bidirectional edges. Where it is not relevant, we may sometimes omit the weight map $W$. \end{remark} \begin{definition}[Underlying edges] Let $\ca{G} = (\ca{V,E})$ be a graph. Its \emph{underlying edges} are $\bar{\ca{E}} \vcentcolon = \big\{ \{i,j\} : (i,j) \in \ca{E} \big\}$. \end{definition} \begin{definition}[Subgraphs] A graph $\ca{G'} = (\ca{V',E'})$ is a \emph{subgraph} of a graph $\ca{G} = (\ca{V,E})$ (write $\ca{G'} \leq \ca{G}$) if $\ca{V'} \subseteq \ca{V}$ and $\ca{E'} \subseteq \ca{E}$. It is an \emph{induced subgraph} (write $\ca{G'} < \ca{G}$) if further $\ca{E'} = \ca{E} \cap ( \ca{V'} \times \ca{V'} )$. \end{definition} \begin{definition}[Connected components] Let $\ca{G} = (\ca{V,E})$ be a graph. The \emph{connected components} of $\ca{G}$ are the partition $\ca{C}$ generated by the transitive closure of the relation $\sim$ on $\ca{V}$ defined by $i \sim j \iff \{i,j\} \in \bar{\ca{E}}$. We say $\ca{G}$ is (weakly) \emph{connected} if $|\ca{C}| = 1$. \end{definition} \begin{definition}[Graph isomorphisms] A graph $\ca{G'} = (\ca{V',E'})$ is \emph{isomorphic} to a graph $\ca{G} = (\ca{V,E})$ (write $\ca{G'} \cong \ca{G}$) if there is a bijection $\phi\colon \ca{V'} \rightarrow \ca{V}$ with $(u,v) \in \ca{E'} \iff \big(\phi(u), \phi(v) \big) \in \ca{E}$. An isomorphism from a graph to itself is called an \emph{automorphism}. \end{definition} \begin{definition}[Motifs and anchor sets] A \emph{motif} is a pair $(\ca{M,A})$ where $\ca{M} = (\ca{V_M,E_M})$ is a connected graph with $\ca{V_M} = \{ 1, \ldots, m \}$ for some small $m \geq 2$, and $\ca{A} \subseteq \ca{V_M}$ with $|\ca{A}| \geq 2$ is an \emph{anchor set}. If $\ca{A} \neq \ca{V_M}$ we say the motif is \emph{anchored}, and if $\ca{A=V_M}$ we say it is \emph{simple}. \end{definition} \begin{remark} Anchor sets~\cite{benson2016higher} specify which r\^oles vertices play in the motif, and are crucial for defining the collider and expander motifs given in Section~\ref{sec:coll_expa}. When an anchor set is not given, it is assumed that the motif is simple. Figure~\ref{fig:motif_definitions_directed} shows all simple motifs (up to isomorphism) on at most three vertices. \end{remark} \begin{definition}[Instances] Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. We say that $\ca{H}$ is a \emph{functional instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} \leq \ca{G}$. 
We say that $\ca{H}$ is a \emph{structural instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} < \ca{G}$. \end{definition} \begin{definition}[Anchored pairs] Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. Suppose $\ca{H}$ is an instance of $\ca{M}$ in $\ca{G}$. Define the \emph{anchored pairs of the instance} $\ca{H}$ as $$ \ca{A(H)} \vcentcolon = \big\{ \{\phi(i),\phi(j)\} : i,j \in \ca{A}, \ i \neq j, \ \phi \textrm{ is an isomorphism from } \ca{M} \textrm{ to } \ca{H} \big\}\,.$$ \end{definition} \begin{remark} Example~\ref{ex:instances} demonstrates functional and structural instances. Note that $\{i,j\} \in \ca{A(H)}$ if and only if $\ca{H}$ appears in $\ca{G}$ as an instance of $\ca{M}$ with $i \neq j$ co-appearing in the image of $\ca{A}$ under isomorphism. The motivation for this is that clustering methods should avoid separating vertices which appear as an anchored pair. \end{remark} % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/motif_definitions_directed/motif_definitions_directed.pdf} \caption{All simple motifs on at most three vertices} \label{fig:motif_definitions_directed} \end{figure} \section{Adjacency and indicator matrices} \label{sec:graphs_adj_and_ind_matrices} Adjacency matrices provide a useful data structure for representing graphs and have many uses in calculating graph properties \cite{bapat2010graphs}. We define several variants of the adjacency matrix, which appear in Proposition~\ref{prop:motif_adj_matrix_formula} and Table~\ref{tab:motif_adj_mat_table}. \begin{definition}[Adjacency matrices] Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, n \}$. The \emph{adjacency matrix, single-edge adjacency matrix} and \emph{double-edge adjacency matrix} of $\ca{G}$ are respectively the $n \times n$ matrices \begin{align*} G_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ (G_\mathrm{s})_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \notin \ca{E} \}\,, \\ (G_\mathrm{d})_{i j} &\vcentcolon= \big( W((i,j)) + W((j,i)) \big) \ \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,. \end{align*} \end{definition} \begin{definition}[Indicator matrices] Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, n \}$. The \emph{indicator matrix, single-edge indicator matrix, double-edge indicator matrix, missing-edge indicator matrix} and \emph{vertex-distinct indicator matrix} of $\ca{G}$ are respectively the $n \times n$ matrices \begin{align*} J_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ (J_\mathrm{s})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \notin \ca{E} \}\,, \\ (J_\mathrm{d})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,, \\ (J_0)_{i j} &\vcentcolon= \bb{I} \{ (i,j) \notin \ca{E} \textrm{ and } (j,i) \notin \ca{E} \textrm{ and } i \neq j \}\,, \\ (J_\mathrm{n})_{i j} &\vcentcolon= \bb{I} \{ i \neq j \}\,. \end{align*} \end{definition} \section{Motif adjacency matrices} \label{sec:graphs_motif_adj_matrices} The central object in motif-based spectral clustering is the \emph{motif adjacency matrix} (MAM) \cite{benson2016higher}, which serves as a similarity matrix for spectral clustering (Chapter~\ref{chap:spectral}). We provide here our main results: Proposition~\ref{prop:motif_adj_matrix_formula} gives a computationally useful formula for MAMs, and Proposition~\ref{prop:motif_adj_matrix_computation} gives a complexity analysis of this formula. 
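Since the adjacency and indicator matrices of Section~\ref{sec:graphs_adj_and_ind_matrices} are the building blocks of all the formulae that follow, we note that they are straightforward to assemble in code. The following Python sketch is a minimal illustration (assuming NumPy and a dense weighted adjacency matrix; for large graphs, sparse matrices are preferable), constructing each variant directly from the definitions.

\begin{verbatim}
import numpy as np

def matrix_variants(G):
    # G is a dense n x n weighted adjacency matrix with G[i, j] > 0
    # if and only if (i, j) is an edge; the diagonal is zero.
    n = G.shape[0]
    J = (G > 0).astype(float)      # indicator matrix
    Js = J * (1 - J.T)             # single-edge indicator matrix
    Jd = J * J.T                   # double-edge indicator matrix
    Jn = 1.0 - np.eye(n)           # vertex-distinct indicator matrix
    J0 = (1 - J) * (1 - J.T) * Jn  # missing-edge indicator matrix
    Gs = G * Js                    # single-edge adjacency matrix
    Gd = (G + G.T) * Jd            # double-edge adjacency matrix
    return J, Js, Jd, J0, Jn, Gs, Gd
\end{verbatim}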
\pagebreak \subsection{Definitions} \begin{definition}[Motif adjacency matrices] \label{def:motif_adj_matrices} % Let $\ca{G} = (\ca{V,E},W)$ be a graph with $n$ vertices and let $\ca{(M,A)}$ be a motif. The \emph{functional} and \emph{structural motif adjacency matrices} (MAMs) of $\ca{(M,A)}$ in $\ca{G}$ are respectively the $n \times n$ matrices % \begin{align*} M^\mathrm{func}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e)\,, \\ M^\mathrm{struc}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} < \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e)\,. \end{align*} \end{definition} \begin{remark} Example~\ref{ex:motif_adj_matrices} gives a simple illustration of calculating an MAM. When $W \equiv 1$ and $\ca{M}$ is simple, the (functional or structural) MAM entry $M_{i j} \ (i \neq j)$ simply counts the (functional or structural) instances of $\ca{M}$ in $\ca{G}$ containing $i$ and $j$. When $\ca{M}$ is not simple, $M_{i j}$ counts only those instances with anchor sets containing both $i$ and $j$. MAMs are always symmetric, since the only dependency on $(i,j)$ is via the unordered set $\{i,j\}$. \end{remark} \subsection{Computation} \label{sec:graphs_computation} In order to state Propositions \ref{prop:motif_adj_matrix_formula} and~\ref{prop:motif_adj_matrix_computation}, we need one more definition. \begin{definition}[Anchored automorphism classes] Let $(\ca{M,A})$ be a motif. Let $S_\ca{M}$ be the set of permutations on $ \ca{V_M} = \{ 1, \ldots, m \}$ and define the \emph{anchor-preserving permutations} $S_\ca{M,A} = \{ \sigma \in S_\ca{M} : \{1,m\} \subseteq \sigma(\ca{A}) \}$. Let $\sim$ be the equivalence relation defined on $S_\ca{M,A}$ by: $\sigma \sim \tau \iff \tau^{-1} \sigma$ is an automorphism of $\ca{M}$. Finally the \emph{anchored automorphism classes} are the quotient set $S_\ca{M,A}^\sim \vcentcolon= S_\ca{M,A} \ \big/ \sim$\,. \end{definition} \begin{proposition}[MAM formula] \label{prop:motif_adj_matrix_formula} Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set ${\ca{V}=\{1,\ldots,n\}}$ and let $(\ca{M,A})$ be a motif on $m$ vertices. 
Then for any $i,j \in \ca{V}$ and with $k_1 = i$, $k_m = j$, the functional and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are given by % % \begin{align*} M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma}\,, &(1) \\ M^\mathrm{struc}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ J^\mathrm{struc}_{\mathbf{k},\sigma} \ G^\mathrm{struc}_{\mathbf{k},\sigma}\,, &(2) \end{align*} % where % \begin{align*} \ca{E}_\ca{M}^0 &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \notin \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ \ca{E}_\ca{M}^\mathrm{s} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \in \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ \ca{E}_\ca{M}^\mathrm{d} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \in \ca{E_M}, (v,u) \in \ca{E_M} \}\,, \end{align*} % are respectively the missing edges, single edges and double edges of $\ca{E_M}$, and % %TC:ignore \begin{alignat*}{3} % J^\mathrm{func}_{\mathbf{k},\sigma} & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{func}_{\mathbf{k},\sigma} & \vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % J^\mathrm{struc}_{\mathbf{k},\sigma} & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_0)_{k_{\sigma u},k_{\sigma v}} && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{struc}_{\mathbf{k},\sigma} &\vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,. % \end{alignat*} %TC:endignore \end{proposition} % \begin{proof} See Proof~\ref{proof:motif_adj_matrix_formula}. \end{proof} \begin{proposition}[Complexity of MAM formula] \label{prop:motif_adj_matrix_computation} Suppose that ${m \leq 3}$, and the adjacency matrix $G$ of $\ca{G}$ is known. Then computing adjacency and indicator matrices and calculating an MAM using Equations $(1)$ and $(2)$ in Proposition~\ref{prop:motif_adj_matrix_formula} involves at most 18 matrix multiplications, 22 entry-wise multiplications and 21 additions of (typically sparse) $n \times n$ matrices. \end{proposition} \begin{proof} See Proof~\ref{proof:motif_adj_matrix_computation}. \end{proof} Hence for motifs on at most three vertices and with sparse adjacency matrices, Proposition~\ref{prop:motif_adj_matrix_formula} gives a fast and parallelisable matrix-based procedure for computing MAMs. In practice, additional symmetries of the motif often allow computation with even fewer matrix operations, demonstrated in Example~\ref{ex:motif_adj_calc}. A list of such MAM formulae for all simple motifs on at most three vertices (up to isomorphism), as well as for the \emph{collider} and \emph{expander} motifs (Section~\ref{sec:coll_expa}), is given in Table~\ref{tab:motif_adj_mat_table}. 
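For checking implementations of these formulae on small graphs, Definition~\ref{def:motif_adj_matrices} can also be evaluated directly by enumeration. The following Python sketch is a na\"ive reference implementation of the functional MAM; it is our own illustration rather than the method used in our experiments, it assumes the graph is supplied as a dictionary mapping directed edges to weights (with the motif on vertices $0, \ldots, m-1$), and it runs in $O(n^m)$ time, so it is suitable only for very small graphs.

\begin{verbatim}
from itertools import combinations, permutations

def functional_mam(n, W, motif_edges, m, anchors):
    # W: dictionary mapping directed edges (i, j) to positive weights.
    # motif_edges: edge set of a connected motif on vertices 0, ..., m - 1.
    # anchors: the anchor set, a subset of the motif's vertices.
    instances = {}  # edge set of each instance -> its anchored pairs
    for verts in permutations(range(n), m):
        phi = dict(enumerate(verts))  # candidate isomorphism
        image = frozenset((phi[u], phi[v]) for u, v in motif_edges)
        if all(e in W for e in image):  # a functional instance of the motif
            pairs = instances.setdefault(image, set())
            for a, b in combinations(anchors, 2):
                pairs.add(frozenset({phi[a], phi[b]}))
    M = [[0.0] * n for _ in range(n)]
    for image, pairs in instances.items():
        weight = sum(W[e] for e in image) / len(motif_edges)
        for pair in pairs:
            i, j = tuple(pair)
            M[i][j] += weight
            M[j][i] += weight
    return M
\end{verbatim}

Grouping the candidate maps by their image edge set ensures that each instance is counted exactly once, mirroring the r\^ole played by the anchored automorphism classes $S_\ca{M,A}^\sim$ in Proposition~\ref{prop:motif_adj_matrix_formula}.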
The formulae of Table~\ref{tab:motif_adj_mat_table} are generalisations of those stated in Table S6 of the supplementary materials for \cite{benson2016higher}, which gives an incomplete list covering only \emph{structural} MAMs of \emph{unweighted} graphs. Note that the functional MAM formula for the two-vertex motif $\ca{M}_\mathrm{s}$ yields the symmetrised adjacency matrix $M = G + G^\top$, which is used for traditional spectral clustering (Section~\ref{sec:spectral_overview}). The question of whether to use functional or structural MAMs for motif-based spectral clustering will be addressed in Section~\ref{sec:spectral_motifrwspectclust}.

\clearpage{}
\clearpage{}
\chapter{Spectral Clustering}
\label{chap:spectral}

We provide a summary of traditional random-walk spectral clustering and show how it applies to motif-based clustering. This chapter mostly follows the relevant sections of the tutorial by von~Luxburg~\cite{von2007tutorial}, which provides further explanations and proofs.
In Section~\ref{sec:spectral_overview} we give an overview of the spectral clustering procedure. In Section~\ref{sec:spectral_laplacians} we define the random-walk Laplacian and state some of its useful properties (Proposition~\ref{prop:laplacian}). In Section~\ref{sec:spectral_graph_cut} we introduce normalised cut (Ncut) as an objective function for graph partitioning. In Section~\ref{sec:spectral_cluster_extraction} we explore methods of extracting clusters from $\bb{R}^l$-valued embeddings, and in Section~\ref{sec:spectral_algs} we present the algorithms for both traditional and motif-based random-walk spectral clustering.

\section{Overview of spectral clustering}
\label{sec:spectral_overview}

Suppose $x_1, \ldots, x_n$ are data points with some associated symmetric similarity matrix $M$ with ${M_{i j} = \mathrm{similarity}(x_i,x_j)}$. The intuitive aim of clustering is to find a partition $\ca{P}_1, \ldots, \ca{P}_k$ of $\{ x_1, \ldots, x_n \}$ which places similar points in the same group and dissimilar points in different groups. Where other methods such as $k$-means++ \cite{arthur2007k} and GMM clustering \cite{duda1973pattern} demand some further structure on the $x_i$ (such as taking values in $\bb{R}^l$), spectral clustering has no such requirements.
In the context of \emph{undirected} graph clustering, the data points are the vertices of the graph, and a similarity matrix is provided by the graph's adjacency matrix $G$. To cluster directed graphs, the adjacency matrix must first be symmetrised, traditionally by the transformation $M = G + G^\top$ \cite{Meila2007ClusteringBW}. This symmetrisation ignores information about edge direction and higher-order structures, and can lead to poor performance, as will be seen in Section~\ref{sec:motif_asymm_dsbms}.
Spectral clustering consists of two steps. Firstly, eigendecomposition of a Laplacian matrix embeds the vertices into $\bb{R}^{l}$. The $k$ clusters are then extracted from this space.

\section{Graph Laplacians}
\label{sec:spectral_laplacians}

The Laplacians of an undirected graph are a family of matrices which play a central r\^ole in spectral clustering. While many different graph Laplacians are available, we focus in this dissertation on just the \emph{random-walk Laplacian}, for reasons concerning objective functions, consistency and computation \cite{von2007tutorial, luxburg2004convergence}.

\begin{definition}
Let $\ca{G}$ be an undirected graph with (symmetric) adjacency matrix $G$.
The \emph{random-walk Laplacian matrix} of $\ca{G}$ is
$$ L_\mathrm{rw} \vcentcolon= I - D^{-1} G $$
where $I$ is the identity matrix and $D$ is the diagonal matrix of weighted degrees, with $D_{ii} \vcentcolon= \sum_j G_{i j}$.
\end{definition}

\begin{remark}
$D^{-1} G$ is the transition matrix of a random walk on the vertex set $\ca{V}$ where the probability of the transition $v_i \to v_j$ is proportional to $G_{i j}$.
\end{remark}

\begin{proposition}[Properties of the random-walk Laplacian]
\label{prop:laplacian}
$L_\mathrm{rw}$ is positive semi-definite with eigenvalues $0 = \lambda_1 \leq \cdots \leq \lambda_n$. The multiplicity $k$ of the eigenvalue $0$ is equal to the number of connected components $\ca{P}_1, \ldots, \ca{P}_k$ of $\ca{G}$. The eigenspace of the eigenvalue $0$ is spanned by the indicator vectors on these components: $ \bb{I}_{\ca{P}_1}, \ldots, \bb{I}_{\ca{P}_k} $.
\end{proposition}

\begin{proof}
See \cite{von2007tutorial}.
\end{proof}

\section{Graph cuts}
\label{sec:spectral_graph_cut}

Graph cuts provide objective functions which we seek to minimise while clustering the vertices of a graph. We look at the normalised cut and its relationship with the random-walk Laplacian.

\begin{definition}
Let $\ca{G}$ be a graph. Let $ \ca{P}_1, \ldots, \ca{P}_k $ be a partition of $\ca{V}$. Then the \emph{normalised cut} \cite{shi2000normalized} of $\ca{G}$ with respect to $ \ca{P}_1, \ldots, \ca{P}_k $ is
%
$$ \mathrm{Ncut}_\ca{G}(\ca{P}_1, \ldots, \ca{P}_k) \vcentcolon= \frac{1}{2} \sum_{i=1}^k \frac{ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) }{ \mathrm{vol}(\ca{P}_i) } $$
%
where $ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) \vcentcolon= \sum_{u \in \ca{P}_i, \, v \in \ca{V} \setminus \ca{P}_i} G_{u v}$ and $\mathrm{vol}(\ca{P}_i) \vcentcolon= \sum_{u \in \ca{P}_i} D_{u u}$.
\end{definition}

\begin{remark}
More desirable partitions have a lower Ncut value; the numerators penalise partitions which cut a large number of heavily weighted edges, and the denominators penalise partitions which have highly imbalanced cluster sizes.
\end{remark}

It can be shown \cite{von2007tutorial} that minimising Ncut over partitions $ \ca{P}_1, \ldots, \ca{P}_k $ is equivalent to finding the cluster indicator matrix $H \in \bb{R}^{n \times k}$ minimising
$$ \mathrm{Tr} \big( H^\top (D-G) H \big) $$
subject to
$$ H_{i j} = \mathrm{vol}(\ca{P}_j)^{-\frac{1}{2}} \ \bb{I} \{ v_i \in \ca{P}_j \}\,, \qquad (\dagger) $$
$$ H^\top D H = I\,. $$
Solving this problem is in general \textsf{NP}-hard \cite{wagner1993between}. However, by dropping the constraint~$(\dagger)$ and applying the Rayleigh Principle \cite{lutkepohl1996handbook}, we find that the solution to this relaxed problem is that $H$ contains the first $k$ eigenvectors of $L_\mathrm{rw}$ as columns \cite{von2007tutorial}. In practice, to find $k$ clusters it is often sufficient to use only the first $l \leq k$ eigenvectors of $L_\mathrm{rw}$.

\section{Cluster extraction}
\label{sec:spectral_cluster_extraction}

Once Laplacian eigendecomposition has been used to embed the data into $\bb{R}^l$, the clusters may be extracted using a variety of methods. We propose $k$-means++ and eigenvector sweep as two appropriate techniques.

\subsection{\texorpdfstring{$k$}{k}-means++}

$k$-means++ \cite{arthur2007k} is a popular clustering algorithm for data in $\bb{R}^l$. It aims to minimise the within-cluster sum of squares, based on the standard Euclidean metric on $\bb{R}^l$.
This makes it a reasonable candidate for clustering spectral data, since the Euclidean metric corresponds to notions of `diffusion distance' in the original graph \cite{nadler2006diffusion}.

\subsection{Eigenvector sweep}
\label{sec:spectral_sweep}

Eigenvector sweep (Algorithm~\ref{alg:eigenvector_sweep})~\cite{shi2000normalized} offers a more principled technique for cluster extraction when $k=2$ clusters are required, and a single eigenvector (usually the second eigenvector of $L_\mathrm{rw}$) is available. It works by sorting the eigenvector and selecting a splitting point to minimise the Ncut score of the partition generated.

\pagebreak

\begin{algorithm}[H]
\caption{Eigenvector sweep}
\label{alg:eigenvector_sweep}
\SetKwFunction{Main}{EigenvectorSweep}
\newcommand{\MainArgs}{$\ca{G}, x$}
\BlankLine
\Input{Graph $\ca{G}$, eigenvector $x$}
\Output{Partition $\ca{P}_1, \ca{P}_2$}
\BlankLine
\Function{\Main{\MainArgs}}{
$\hat{x} \leftarrow \mathtt{sort}(x)$ \;
$\mathrm{Score_{best}} \leftarrow \infty$ \;
\For{$i$ \In $1, \ldots, n-1$}{
$\ca{P} \leftarrow \{ \hat{x}_1, \ldots, \hat{x}_i \}$ \;
$\mathrm{Score} \leftarrow \mathrm{Ncut}_\ca{G} (\ca{P}, \ca{V} \setminus \ca{P})$ \;
\If{$\mathrm{Score} < \mathrm{Score_{best}}$}{
$\ca{P}_\mathrm{best} \leftarrow \ca{P}$ \;
$\mathrm{Score_{best}} \leftarrow \mathrm{Score}$ \;
}
}
$\ca{P}_1 \leftarrow \ca{P}_\mathrm{best}$ \;
$\ca{P}_2 \leftarrow \ca{V} \setminus \ca{P}_\mathrm{best}$ \;
\Return $\ca{P}_1, \ca{P}_2$
}
\end{algorithm}

\vspace*{0.5cm}

Figure~\ref{fig:eigenvector_sweep_network} shows a small network with vertices labelled by position in the sorted second eigenvector $\hat{x}$ of $L_\mathrm{rw}$. Figure~\ref{fig:eigenvector_sweep_profile} shows the `sweep profile' of Ncut scores, which is minimised at the splitting point $i=5$. Hence eigenvector sweep chooses the final partition $\ca{P}_1 = \{1, \ldots, 5\}, \ \ca{P}_2 = \{6, \ldots, 10\}$, as indicated by the vertex colours and dashed line in Figure~\ref{fig:eigenvector_sweep_network}.
%
%
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../tikz/eigenvector_sweep_network/eigenvector_sweep_network.pdf}
\caption{A small network}
\label{fig:eigenvector_sweep_network}
\end{subfigure}
%
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/eigenvector_sweep/eigenvector_sweep_scores.pdf}
\caption{Sweep profile of the network}
\label{fig:eigenvector_sweep_profile}
\end{subfigure}
\caption{Eigenvector sweep selects a partition by minimising Ncut}
\label{fig:eigenvector_sweep}
\end{figure}
%

\subsection{Cluster evaluation}

When a graph has been clustered, we assign a score to the partition. If the ground-truth clustering is available, we can compare it to our clustering using the \emph{adjusted Rand index} (ARI) \cite{hubert1985comparing}. The ARI between two clusterings has expected value $0$ under random cluster assignment, and maximum value $1$, denoting perfect agreement between the clusterings; a larger ARI indicates more similar clusterings. If the ground-truth clustering is not available, we can use the objective function Ncut: clusterings with lower Ncut values partition the graph more favourably.

\section{Spectral clustering algorithms}
\label{sec:spectral_algs}

We present the full random-walk spectral clustering algorithm and show how it can be applied to motif-based random-walk spectral clustering.
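Both algorithms below are straightforward to realise with standard numerical libraries. As a minimal sketch (assuming NumPy, SciPy and scikit-learn; the function name and interface are our own, chosen to mirror Algorithm~\ref{alg:rwspectclust}), one can compute the leading eigenvectors of the transition matrix $D^{-1}G$, whose largest eigenvalues correspond to the smallest eigenvalues of $L_\mathrm{rw} = I - D^{-1}G$, and then extract the clusters with $k$-means++.

\begin{verbatim}
import numpy as np
from scipy.sparse.linalg import eigs
from sklearn.cluster import KMeans

def rw_spect_clust(G, k, l, seed=0):
    # G: symmetric adjacency matrix of a connected graph.
    d = G.sum(axis=1)
    P = G / d[:, None]  # random-walk transition matrix D^{-1} G
    # Largest eigenvalues of P correspond to smallest of L_rw = I - P.
    vals, vecs = eigs(P, k=l, which="LR")
    H = vecs[:, np.argsort(-vals.real)].real
    H = H[:, 1:]  # drop the first (constant) eigenvector
    km = KMeans(n_clusters=k, n_init=10, random_state=seed)
    return km.fit_predict(H)
\end{verbatim}

Since $G$ is symmetric, $D^{-1}G$ is similar to the symmetric matrix $D^{-1/2}GD^{-1/2}$, so its spectrum is real and taking real parts above discards only numerical noise.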
\subsection{Random-walk spectral clustering}

Algorithm~\ref{alg:rwspectclust} gives random-walk spectral clustering \cite{von2007tutorial}, which takes as input the symmetric adjacency matrix of a connected graph. We use $k$-means++ rather than eigenvector sweep as the cluster extraction method, due to its superior flexibility and computational speed. We drop the first column of $H$ (the first eigenvector of $L_\mathrm{rw}$) since although it should be constant and uninformative (Proposition~\ref{prop:laplacian}), numerical imprecision may give unwanted artefacts.
It is worth noting that although the relaxation used in Section~\ref{sec:spectral_graph_cut} is reasonable and often leads to good approximate solutions of the Ncut problem, there are cases where it performs poorly~\cite{guattery1998quality}. The Cheeger inequality~\cite{chung2005laplacians} gives a bound on the error introduced by this relaxation.

\vspace*{0.5cm}

\begin{algorithm}[H]
\caption{Random-walk spectral clustering}
\label{alg:rwspectclust}
\SetKwFunction{Main}{RWSpectClust}
\newcommand{\MainArgs}{$G,k,l$}
\BlankLine
\Input{Symmetric adjacency matrix $G$, number of clusters $k$, dimension $l$}
\Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$}
\BlankLine
\Function{\Main{\MainArgs}}{
Construct the weighted degree matrix $D_{ii} \leftarrow \sum_j G_{i j}$ \\
Construct the random-walk Laplacian matrix $L_\mathrm{rw} \leftarrow I-D^{-1}G$ \\
Let $H$ have the first $l$ eigenvectors of $L_\mathrm{rw}$ as columns \\
Drop the first column of $H$ \\
Run $k$-means++ on the rows of $H$ with $k$ clusters to produce $\ca{P}_1, \ldots, \ca{P}_k$ \\
\Return $\ca{P}_1, \ldots, \ca{P}_k$
}
\end{algorithm}

\subsection{Motif-based random-walk spectral clustering}
\label{sec:spectral_motifrwspectclust}

Algorithm~\ref{alg:motifrwspectclust} gives motif-based random-walk spectral clustering. Note that although $\ca{G}$ may be a connected graph, there is no guarantee that the MAM is also connected. Hence $M$ is restricted to its largest connected component $C$ before spectral clustering is applied. While this may initially seem to be a flaw with motif-based spectral clustering (since not all vertices are assigned to a cluster), in fact it can be very useful; restriction of $M$ can remove vertices which are in some sense not `well connected' to the rest of the graph, which means that only a `core' set of vertices are clustered. This can result in motif-based methods making fewer misclassifications than traditional spectral clustering, as seen in Section~\ref{sec:motif_polblogs}.
There is ambiguity in whether to use functional or structural MAMs. While the authors in~\cite{benson2016higher} opt for structural MAMs, we propose to use functional MAMs, for a few reasons. Firstly, note that $ 0 \leq M^\mathrm{struc}_{i j} \leq M^\mathrm{func}_{i j}$ for all $i,j \in \ca{V}$. This implies that the largest connected component of $M^\mathrm{func}$ is always at least as large as that of $M^\mathrm{struc}$, meaning that often more vertices can be assigned to a cluster. Secondly, we argue that functional instances are of more interest than structural instances, since they specify only `existence' rather than `non-existence' of edges. For consistency we will therefore use functional MAMs throughout our experiments.
The most computationally expensive part of Algorithm~\ref{alg:motifrwspectclust} is the calculation of the MAM using a formula from Table~\ref{tab:motif_adj_mat_table}.
We found this to be feasible for graphs with up to around $n \approx 10 \, 000$ vertices. General notes on hardware and software are given in Section~\ref{sec:notes_hardware}, and timings for MAM computation across a range of graph sizes and sparsities are available in Section~\ref{sec:notes_timing}. \vspace*{0.5cm} \begin{algorithm}[H] \caption{Motif-based random-walk spectral clustering} \label{alg:motifrwspectclust} \SetKwFunction{Main}{MotifRWSpectClust} \newcommand{\MainArgs}{$\ca{G},\mathcal{M},k,l$} \BlankLine \Input{Graph $\ca{G}$, motif $\ca{M}$, number of clusters $k$, dimension $l$} \Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$} \BlankLine \Function{\Main{\MainArgs}}{ Construct the motif adjacency matrix $M$ of the graph $\ca{G}$ with motif $\ca{M}$ \\ Let $\tilde{M}$ be $M$ restricted to its largest connected component, $C$ \\ $\ca{P}_1, \ldots, \ca{P}_k \leftarrow$ \texttt{RWSpectClust($\tilde{M},k,l$)} \\ \Return $\ca{P}_1, \ldots, \ca{P}_k$ } \end{algorithm} \clearpage{} \clearpage{} \chapter{Motif-Based Clustering} \label{chap:motif} We analyse the performance of motif-based random-walk spectral clustering on both synthetic and real data. In Section~\ref{sec:motif_dsbms} we propose a family of stochastic block models and perform experiments with a variety of motifs and parameters. In Section~\ref{sec:motif_polblogs} we analyse the US Political Blogs network and in Section~\ref{sec:motif_migration} we present results from the US Migration network. \section{Directed stochastic block models} \label{sec:motif_dsbms} We begin by describing \emph{directed stochastic block models} (DSBMs), a broad class of generative models for directed graphs. A DSBM is characterised by a block count $k$, a list of block sizes $(n_i)_{i=1}^k$ and a sparsity matrix $F \in [0,1]^{k \times k}$. We define the cumulative block sizes $N_i = \sum_{j=1}^i n_j$ with $N_0=0$, and the total graph size $N=N_k$. These are used to construct the expected adjacency matrix $A \in [0,1]^{N \times N}$ given by $A_{i j} = F_{rs} \ \bb{I}\{i \neq j\}$ where $N_{r-1} < i \leq N_r$ and $N_{s-1} < j \leq N_s$. Finally a graph $\ca{G}$ is generated with adjacency matrix entries $G_{i j} \sim \textrm{Ber}(A_{i j})$ sampled independently. We say that a DSBM is \emph{symmetric} if $F$ is a symmetric matrix. This DSBM definition is similar to that given by \cite{DirectedClustImbCuts}, although we impose independence between all entries of the adjacency matrix, allowing for bidirectional edges. \subsection{Symmetric two-block DSBMs} We define the \emph{symmetric two-block DSBM} as the DSBM with $k=2$, $n_1=n_2=n$ and $F = \begin{psmallmatrix} p & q \\ q & p \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:sym_two_block_dsbm} illustrates the block structure and sparsity matrix of this model. Thicker lines indicate existence of edges with higher probability. \begin{figure}[H] \centering %\includegraphics[scale=0.8,draft=false]{% %../tikz/sym_two_block_dsbm/sym_two_block_dsbm.pdf} \caption{Symmetric two-block DSBM block structure and sparsity matrix} \label{fig:sym_two_block_dsbm} \end{figure} We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this model. Figure~\ref{fig:motifsym} shows violin plots over 20 trials of ARI against motif, for different sets of parameters $n,p,q$. Also shown is $|C|$, the average size of the largest connected component of each MAM. 
It can be seen that several motifs (such as $\ca{M}_5$ and $\ca{M}_9$) achieve a similar ARI to the traditional spectral clustering technique given by the symmetrised adjacency matrix $M=G+G^\top$ generated by the motif $\ca{M}_\mathrm{s}$ (Table~\ref{tab:motif_adj_mat_table}). However, the strongly connected motifs (particularly $\ca{M}_4$) generate MAMs with small connected components, especially when $\ca{G}$ is sparse, and hence only cluster a subset of the vertices of $\ca{G}$.

\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifsym/motifsym_1.pdf}
\caption{$n=50$, $p=0.3$, $q=0.2$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifsym/motifsym_2.pdf}
\caption{$n=100$, $p=0.15$, $q=0.1$}
\end{subfigure}
\caption{ARI violin plots for the symmetric two-block DSBM}
\label{fig:motifsym}
\end{figure}

\subsection{Asymmetric two-block DSBMs}
\label{sec:motif_asymm_dsbms}

We define the \emph{asymmetric two-block DSBM} as the DSBM with $k=2$, $n_1=n_2=n$ and $F = \begin{psmallmatrix} p & q_1 \\ q_2 & p \end{psmallmatrix}$ where $q_1 > q_2$ and $p = \frac{1}{2}(q_1+q_2)$. Figure~\ref{fig:asym_two_block_dsbm} shows this model.

\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{%
%../tikz/asym_two_block_dsbm/asym_two_block_dsbm.pdf}
\caption{Asymmetric two-block DSBM block structure and sparsity matrix}
\label{fig:asym_two_block_dsbm}
\end{figure}

We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this model. Figure~\ref{fig:motifasym} shows violin plots over 20 trials of ARI against motif, for different sets of parameters $n,p,q_1,q_2$, along with $|C|$. It is apparent that motif-based clustering with $\ca{M}_1$ is the best method, consistently achieving the highest ARI and keeping $|C|$ at its maximum value of $2n$. It is unsurprising that $\ca{M}_1$ (feed-back loop) performs well on this model; large $p$ makes feed-back loops within clusters likely, and small $q_2$ makes feed-back loops spanning the clusters unlikely. Motif $\ca{M}_2$ also performs reasonably well since it contains $\ca{M}_1$ as a submotif. Furthermore, the constraint $p = \frac{1}{2}(q_1+q_2)$ ensures that the na\"ive symmetrisation $M=G+G^\top$ produces indistinguishable clusters, and hence the traditional method performs extremely poorly.

\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifasym/motifasym_1.pdf}
\caption{$n=100$, $p=0.2$, $q_1=0.35$, $q_2=0.05$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifasym/motifasym_2.pdf}
\caption{$n=200$, $p=0.15$, $q_1=0.25$, $q_2=0.05$}
\end{subfigure}
\caption{ARI violin plots for the asymmetric two-block DSBM}
\label{fig:motifasym}
\end{figure}

\section{US Political Blogs network}
\label{sec:motif_polblogs}

Our first real data set is the US Political Blogs network \cite{adamic2005political}, consisting of data collected two months before the 2004 US election. Vertices represent blogs, and are labelled by their political leaning (`liberal' or `conservative'). Weighted directed edges represent the number of citations from one blog to another. After preprocessing (Section~\ref{sec:notes_preprocessing}) there are $586$ liberal blogs and $636$ conservative blogs (total $1222$), with $19 \, 024$ edges.
The network is plotted in Figure~\ref{fig:polblogs_network}. We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this network. Figure~\ref{fig:polblogs_ariplot} plots ARI against component size $|C|$. There is an apparent trade-off between ARI and connected component size. Motif $\ca{M}_9$ clusters many vertices with $|C|=1197$ and an ARI of 0.82, while the more strongly connected $\ca{M}_4$ only clusters $378$ vertices, with an improved ARI of 0.92. Finally, the poor performance of traditional spectral clustering is due to a small number of very weakly connected vertices being partitioned off, indicated by the dashed line and circled vertices in Figure~\ref{fig:polblogs_network}. \vspace*{0.5cm} \begin{figure}[H] \begin{subfigure}{.49\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/polblogs/polblogs_network.pdf} \caption{The US Political Blogs network} \label{fig:polblogs_network} \end{subfigure} \begin{subfigure}{.49\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/polblogs/polblogs_ari_conn.pdf} \caption{ARI against $|C|$ across motifs} \label{fig:polblogs_ariplot} \end{subfigure} \caption{Plots relating to the US Political Blogs network} \end{figure} Figure~\ref{fig:polblogs_embedding} shows the embedding given by eigenvectors 2 and 3 of the random-walk Laplacian of the MAM generated by motif $\ca{M}_{12}$. An instance of this motif in the network indicates the presence of a pair of mutually citing blogs with an incoming citation from a third (see Figure~\ref{fig:motif_definitions_directed}). Colourings are provided for Figure~\ref{fig:polblogs_embedding_truth} by the truth labels and for Figure~\ref{fig:polblogs_embedding_kmeans} by the $k$-means++ clustering of eigenvector 2. The clusterings are very similar, giving an ARI of $0.82$. \vspace*{0.5cm} \begin{figure}[H] \begin{subfigure}{.49\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/polblogs/polblogs_M12_truth.pdf} \caption{Colouring by truth label} \label{fig:polblogs_embedding_truth} \end{subfigure} \begin{subfigure}{.49\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/polblogs/polblogs_M12_clusts.pdf} \caption{Colouring by $k$-means++ cluster} \label{fig:polblogs_embedding_kmeans} \end{subfigure} \caption{Eigendecomposition embedding of the US Political Blogs network} \label{fig:polblogs_embedding} \end{figure} \pagebreak \section{US Migration network} \label{sec:motif_migration} The next data set is the US Migration network \cite{census2000}, consisting of data collected during the US Census in 2000. Vertices represent the 3075 counties in 49 contiguous states (excluding Alaska and Hawaii, and including the District of Columbia). The $721\,432$ weighted directed edges represent the number of people migrating from county to county, capped at $10 \, 000$ (the 99.9th percentile) to control large entries, as in \cite{DirectedClustImbCuts}. We test the performance of Algorithm~\ref{alg:motifrwspectclust} with three selected motifs: $\ca{M}_\mathrm{s}$, $\ca{M}_6$ and $\ca{M}_9$ (see Figure~\ref{fig:motif_definitions_directed}). $\ca{M}_\mathrm{s}$ gives the traditional spectral clustering method with na\"ive symmetrisation. $\ca{M}_6$ represents a pair of counties exchanging migrants, with both also receiving migrants from a third. 
$\ca{M}_9$ is a path of length two, allowing counties to be deemed similar if there is migration between them via another. Firstly, we plot sweep profiles of the graph using the second eigenvector of the random-walk Laplacian of the MAM associated with each motif, in Figure~\ref{fig:migration_sweep}. Note that all three display clear minima, indicating that these motifs produce well-defined clusters. The two-part clusterings produced by eigenvector sweep are somewhat similar across the three motifs, with pairwise ARIs equal to $\textrm{ARI}(\ca{M}_\mathrm{s}, \ca{M}_6) = 0.67$, $\textrm{ARI}(\ca{M}_\mathrm{s}, \ca{M}_9) = 0.92$ and $\textrm{ARI}(\ca{M}_6, \ca{M}_9) = 0.73$. \begin{figure}[H] \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_Ms.pdf} \caption{$\ca{M}_\mathrm{s}$} \end{subfigure} \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_M6.pdf} \caption{$\ca{M}_6$} \end{subfigure} \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_M9.pdf} \caption{$\ca{M}_9$} \end{subfigure} \caption{Sweep profiles of the US Migration network} \label{fig:migration_sweep} \end{figure} Next, Figure~\ref{fig:us_migration} plots maps of the US, with counties coloured initially by the first six non-trivial eigenvectors $x_2, \ldots, x_7$ of the random-walk Laplacian of the associated MAM, and then by the clustering $C$ obtained by Algorithm~\ref{alg:motifrwspectclust} with $k=l=7$. For the eigenvector colourings, note how the coloured regions often line up with state boundaries, indicating that many migrants stay within the same state. It is also apparent that the motifs $\ca{M}_6$ and $\ca{M}_9$ produce `noisier' embeddings than traditional spectral clustering, due to their reliance on three-vertex motifs. Eigenvector~2 approximately differentiates counties by longitude, although $\ca{M}_9$ achieves a clearer division between east and west, while $\ca{M}_\mathrm{s}$ and $\ca{M}_6$ colour California (CA, see Figure~\ref{fig:notes_us_map}) more similarly to the East Coast. Eigenvector 3 tends to differentiate by latitude, though $\ca{M}_\mathrm{s}$ and $\ca{M}_6$ particularly isolate the states of North Dakota (ND), South Dakota (SD), Minnesota (MN), Wisconsin (WI) and Michigan (MI). Further structure is visible across all three motifs for eigenvectors 4--7. The clusterings $C$ partition the counties into $k=7$ regions, and there are some interesting differences between the motifs. Since there is no ground-truth clustering, we record the Ncut score associated with each clustering. It is apparent that motifs $\ca{M}_6$ and $\ca{M}_9$ give a similar partition, although with some differences: $\ca{M}_6$ clusters the East Coast together with western Florida (FL) and the counties containing Los Angeles (CA), San Diego (CA), Las Vegas (NV), Phoenix (AZ), Tucson (AZ), Denver (CO), Chicago (IL) and Nashville (TN). $\ca{M}_6$ favours a larger `central' region, which includes significant parts of Colorado (CO), Oklahoma (OK), Arkansas (AR) and Illinois (IL). $\ca{M}_\mathrm{s}$ gives a somewhat different partition, with one of the clusters allocated to Michigan (MI) and Wisconsin (WI) rather than Mississippi (MS), Alabama (AL), Georgia (GA) and Tennessee (TN). 
As with the eigenvectors, the clustering is smoother for $\ca{M}_\mathrm{s}$ than for $\ca{M}_6$ and $\ca{M}_9$.

\pagebreak
\vspace*{-1cm}

\begin{figure}[H]
\begin{table}[H]
\centering
\setlength{\tabcolsep}{0em}
\begin{tabular}{ |c|c|c|c| }
%\expandableinput ../../results/us_migration/us_migration_table.txt
\end{tabular}
\end{table}
\vspace*{-0.5cm}
\caption{Motif-based colourings of the US Migration network}
\label{fig:us_migration}
\end{figure}

\clearpage{}
\clearpage{}
\addtocontents{toc}{\protect\newpage}
\chapter{Bipartite Clustering}
\label{chap:bipartite}

We propose a technique for spectral clustering of bipartite graphs and test its performance on both real and synthetic data.
In Section~\ref{sec:bipartite_graphs} we define bipartite graphs and present our clustering technique. In Section~\ref{sec:bipartite_sbms} we propose a bipartite stochastic block model (BSBM) and perform experiments with varying parameters. In Section~\ref{sec:bipartite_american_revolution} we demonstrate our method using the American Revolution network. In Section~\ref{sec:bipartite_languages} we analyse the Unicode Languages network.

\section{Bipartite graphs}
\label{sec:bipartite_graphs}

\begin{definition}
A \emph{bipartite graph} is a graph $\ca{G}=(\ca{V,E})$ where $\ca{V}$ can be partitioned into $\ca{V} = \ca{S} \sqcup \ca{D}$ such that $\ca{E} \subseteq \ca{S} \times \ca{D}$. That is, every edge starts in $\ca{S}$ and ends in $\ca{D}$. We refer to $\ca{S}$ as the \emph{source vertices} and to $\ca{D}$ as the \emph{destination vertices}.
\end{definition}

\subsection{Collider and expander motifs}
\label{sec:coll_expa}

Our method for clustering bipartite graphs revolves around two \emph{anchored} motifs: the \emph{collider} and the \emph{expander} (Figure~\ref{fig:expa_coll}). For each motif the anchor set is $\ca{A}=\{ 1,3 \}$.

\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{../tikz/expa_coll/expa_coll.pdf}
\caption{The collider and expander motifs}
\label{fig:expa_coll}
\end{figure}

These motifs are useful for bipartite clustering because of Proposition~\ref{prop:coll_expa_formulae}, which states that their restricted MAMs are the adjacency matrices of the projections~\cite{kolaczyk2014statistical} of the graph $\ca{G}$. In particular, they can be used as similarity matrices for the source and destination vertices respectively. The similarity of two distinct source (resp.\ destination) vertices is the sum, over their mutual neighbours, of the average weight of their edges to (resp.\ from) that neighbour.

\begin{proposition}[Colliders and expanders in bipartite graphs]
\label{prop:coll_expa_formulae}
Let $\ca{G} = (\ca{V,E},W)$ be a directed bipartite graph. Let $M_\mathrm{coll}$ and $M_\mathrm{expa}$ be the structural or functional MAMs of $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$ respectively in $\ca{G}$. Then
%
\begin{align*}
(M_\mathrm{coll})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,, &(1)\\
(M_\mathrm{expa})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm}\frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. &(2)
\end{align*}
%
\end{proposition}
%
\begin{proof}
See Proof~\ref{proof:coll_expa_formulae}.
\end{proof}

\subsection{Bipartite spectral clustering algorithm}

Algorithm~\ref{alg:bipartite_clustering} gives our procedure for clustering a bipartite graph.
The algorithm uses the collider and expander motifs to create similarity matrices for the source and destination vertices respectively (as in Section~\ref{sec:coll_expa}), and then applies random-walk spectral clustering (Algorithm~\ref{alg:rwspectclust}) to produce the partitions.

\vspace*{0.5cm}

\begin{algorithm}[H]
\SetKwFunction{Main}{BipartiteRWSpectClust}
\newcommand{\MainArgs}{$\ca{G},k_\ca{S},k_\ca{D},l_\ca{S},l_\ca{D}$}
\BlankLine
\Input{Bipartite graph $\ca{G}$, source clusters $k_\ca{S}$, destination clusters $k_\ca{D}$, source dimension $l_\ca{S}$, destination dimension $l_\ca{D}$}
\Output{Source partition $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$, destination partition $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$}
\BlankLine
\Function{\Main{\MainArgs}}{
Construct the collider motif adjacency matrix $M_\mathrm{coll}$ of the graph $\ca{G}$ \\
Construct the expander motif adjacency matrix $M_\mathrm{expa}$ of the graph $\ca{G}$ \\
$M_\mathrm{coll} \leftarrow M_\mathrm{coll}[\ca{S,S}]$ \Comm*{restrict rows and columns of $M_\mathrm{coll}$ to $\ca{S}$ \hspace*{0.07cm}}
$M_\mathrm{expa} \leftarrow M_\mathrm{expa}[\ca{D,D}]$ \Comm*{restrict rows and columns of $M_\mathrm{expa}$ to $\ca{D}$}
$\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}} \leftarrow$ \texttt{RWSpectClust($M_\mathrm{coll},k_\ca{S},l_\ca{S}$)} \\
$\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}} \leftarrow$ \texttt{RWSpectClust($M_\mathrm{expa},k_\ca{D},l_\ca{D}$)} \\
\Return $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$ and $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$
}
\caption{Bipartite random-walk spectral clustering}
\label{alg:bipartite_clustering}
\end{algorithm}

\section{Bipartite stochastic block models}
\label{sec:bipartite_sbms}

We define the \emph{bipartite stochastic block model} (BSBM) \cite{florescu2016spectral} as the DSBM with $k=4$, $n_1 = \dots = n_4=n$ and $F = \begin{psmallmatrix} 0 & 0 & p & q \\ 0 & 0 & q & p \\ 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:bipartite_bsbm} illustrates the block structure and sparsity matrix of this model. This model partitions the source vertices as $\ca{S} = \ca{S}_1 \sqcup \ca{S}_2$ and the destination vertices as $\ca{D}=\ca{D}_1 \sqcup \ca{D}_2$. Edges exist with high probability from $\ca{S}_1$ to $\ca{D}_1$ and from $\ca{S}_2$ to $\ca{D}_2$.

\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{%
%../tikz/bipartite_dsbm/bipartite_dsbm.pdf}
\caption{BSBM block structure and sparsity matrix}
\label{fig:bipartite_bsbm}
\end{figure}

We test the performance of Algorithm~\ref{alg:bipartite_clustering} with parameters $k_\ca{S} = k_\ca{D} = l_\ca{S} = l_\ca{D} = 2$ on this model. For comparison we implement the co-clustering method from \cite{dhillon2001co}, which is based on random-walk spectral clustering of the symmetrised adjacency matrix $G+G^\top$. Figure~\ref{fig:bipartite} shows violin plots over 20 trials of ARI against method, for different sets of parameters $n,p,q$. Note that if a bipartite graph is connected, then so are $M_\mathrm{coll}$ and $M_\mathrm{expa}$, so we need not consider the largest connected component size $|C|$. The performance of the two methods is very similar for both source and destination vertices.
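For reference, sampling from these block models is straightforward. The following Python sketch (a minimal illustration, assuming NumPy; the function name is our own) draws a DSBM adjacency matrix from its expected adjacency matrix, and specialises it to the BSBM above with parameters $n$, $p$ and $q$.

\begin{verbatim}
import numpy as np

def sample_dsbm(block_sizes, F, rng=None):
    # Sample G[i, j] ~ Bernoulli(F[r, s]) independently, where r and s
    # are the blocks of vertices i and j, with no self-loops.
    rng = np.random.default_rng(rng)
    labels = np.repeat(np.arange(len(block_sizes)), block_sizes)
    A = F[np.ix_(labels, labels)]  # expected adjacency matrix
    np.fill_diagonal(A, 0.0)       # no self-loops
    return (rng.random(A.shape) < A).astype(float)

# BSBM: four blocks of size n; edges only from the source blocks
# S1, S2 to the destination blocks D1, D2.
n, p, q = 100, 0.2, 0.1
F = np.array([[0, 0, p, q],
              [0, 0, q, p],
              [0, 0, 0, 0],
              [0, 0, 0, 0]], dtype=float)
G = sample_dsbm([n] * 4, F)
\end{verbatim}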
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/bipartite/bipartite1.pdf}
\caption{$n=100$, $p=0.2$, $q=0.1$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/bipartite/bipartite2.pdf}
\caption{$n=200$, $p=0.1$, $q=0.06$}
\end{subfigure}
\caption{ARI violin plots for the BSBM}
\label{fig:bipartite}
\end{figure}

\section{American Revolution network}
\label{sec:bipartite_american_revolution}

As an example application of our bipartite clustering method to real data, we consider the American Revolution network \cite{konect:brunson_revolution}. This consists of data collected from the period before the American Revolution. Source vertices are people, and destination vertices are organisations. Edges represent a person's membership of an organisation. There are 136 people, 5 organisations and 160 edges.
Algorithm~\ref{alg:bipartite_clustering} is run on the American Revolution network, with parameters $k_\ca{S} = l_\ca{S} = 5$ and $k_\ca{D} = l_\ca{D} = 2$. Figure~\ref{fig:bipartite_revolution_source} plots the network with people coloured by source cluster, and Figure~\ref{fig:bipartite_revolution_dest} plots the network with organisations coloured by destination cluster. The algorithm succeeds in clustering people based on their common memberships, and in clustering organisations based on their common members.

\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/american_revolution/american_revolution_source.pdf}
\caption{Grouping people into 5 clusters}
\label{fig:bipartite_revolution_source}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/american_revolution/american_revolution_dest.pdf}
\caption{Grouping organisations into 2 clusters}
\label{fig:bipartite_revolution_dest}
\end{subfigure}
\caption{Bipartite clustering of the American Revolution network}
\label{fig:bipartite_revolution}
\end{figure}

\section{Unicode Languages network}
\label{sec:bipartite_languages}

The final data set is the Unicode Languages network \cite{konect:unicodelang}, consisting of data collected in 2014 on languages spoken around the world. Source vertices are territories, and destination vertices are languages. Weighted directed edges from territory to language indicate the number of inhabitants of that territory who speak the specified language (territory population data taken from \cite{geonames}). After preprocessing (Section~\ref{sec:notes_preprocessing}) there are $155$ territories, $270$ languages and $705$ edges.
We test Algorithm~\ref{alg:bipartite_clustering} with parameters $k_\ca{S} = l_\ca{S} = k_\ca{D} = l_\ca{D} = 6$ on this network. For the source vertices, Figure~\ref{fig:bipartite_languages_map} plots maps of the world with territories coloured by the clustering obtained. The top 20 territories (by population) in each cluster are given in Table~\ref{tab:bipartite_languages_source_clusters}.
Cluster~1 is by far the largest cluster, and includes a wide variety of territories, many, but not all, of which speak some English. Cluster~2 contains the Persian-speaking territories of Iran and Afghanistan, the Arabic territories of Saudi Arabia and Syria, and the African French-speaking DR Congo, C\^ote d'Ivoire, Burkina Faso, Niger and others. It also includes Haiti, another French-speaking territory.
Cluster~3 mostly captures Spanish-speaking territories in the Americas and also contains Equatorial Guinea, another Spanish-speaking territory in Africa. Cluster~4 includes the Slavic territories of Russia and some of its neighbours. The absence of Kazakhstan may be due to the $981 \, 760$ Kazakhs who speak German which is not a Slavic or Turkic language. Cluster~5 covers China, Hong Kong, Mongolia and some of South-East Asia. The inclusion of Panama might be due to the $6821$ Panamanians who speak Chinese. Cluster~6 is the smallest cluster and contains only Japan and the Koreas, which are connected by the $636 \, 440$ Japanese who speak Korean. There are a few territories and languages which are not contained in the large connected component of the network due to their linguistic isolation. These territories are Laos, Norway and Timor-Leste, and the languages are Lao, Norwegian Bokm{\aa}l and Norwegian Nynorsk. \begin{figure}[H] \centering %\includegraphics[scale=0.6, draft=false]{% %../../results/languages/languages_source_map_clusts.pdf} \caption{Clustering the territories from the Unicode Languages network} \label{fig:bipartite_languages_map} \end{figure} \begin{table}[H] \centering \scriptsize \begin{tabular}{ |c|c|c|c|c|c| } \hline \rule{0pt}{1.2em} \cellcolor[HTML]{8DD3C7} Cluster 1 & \cellcolor[HTML]{FFFFB3} Cluster 2 & \cellcolor[HTML]{BEBADA} Cluster 3 & \cellcolor[HTML]{FB8072} Cluster 4 & \cellcolor[HTML]{80B1D3} Cluster 5 & \cellcolor[HTML]{FDB462} Cluster 6 \\[0.1cm] \hline \rule{0pt}{1.2em} India & Iran & Mexico & Russia & China & Japan \\ United States & DR Congo & Colombia & Ukraine & Indonesia & S.\ Korea \\ Brazil & Afghanistan & Argentina & Uzbekistan & Vietnam & N.\ Korea \\ Pakistan & Saudi Arabia & Peru & Belarus & Malaysia & \\ Bangladesh & Syria & Venezuela & Tajikistan & Taiwan & \\ Nigeria & C\^ote d'Ivoire & Ecuador & Kyrgyzstan & Cambodia & \\ Philippines & Burkina Faso & Guatemala & Turkmenistan & Hong Kong & \\ Ethiopia & Niger & Cuba & Georgia & Singapore & \\ Germany & Mali & Bolivia & Moldova & Panama & \\ Egypt & Senegal & Paraguay & Latvia & Mongolia & \\ Turkey & Tunisia & El Salvador & Estonia & & \\ Thailand & Chad & Nicaragua & & & \\ France & Guinea & Costa Rica & & & \\ United Kingdom & Somalia & Uruguay & & & \\ Italy & Burundi & Eq.\ Guinea & & & \\ Myanmar & Haiti & & & & \\ South Africa & Benin & & & & \\ Spain & Azerbaijan & & & & \\ Tanzania & Togo & & & & \\ Kenya & Libya & & & & \\ $\cdots$ & $\cdots$ & & & & \\ $|\textrm{Cluster\ } 1 |$ = 87 & $|\textrm{Cluster\ } 2 |$ = 29 & $|\textrm{Cluster\ } 3 |$ = 15 & $|\textrm{Cluster\ } 4 |$ = 11 & $|\textrm{Cluster\ } 5 |$ = 10 & $|\textrm{Cluster\ } 6 |$ = 3 \\[0.1cm] \hline \end{tabular} \caption{Clustering the territories from the Unicode Languages network} \label{tab:bipartite_languages_source_clusters} \end{table} For the destination vertices, we present the six clusters obtained by Algorithm~\ref{alg:bipartite_clustering}. Table~\ref{tab:bipartite_languages_dest_clusters} contains the top 20 languages (by number of speakers) in each cluster. Cluster~1 is the largest cluster and contains the European languages of Spanish, Portuguese and French, as well as dialects of Arabic. Cluster~2 is also large and includes English as well as several South Asian languages such as Hindi, Bengali, Urdu and Punjabi. Cluster~3 consists of many indigenous African languages such as Swahili, Kinyarwanda and Somali. Cluster~4 captures languages from South-East Asia, mostly spoken in Indonesia and Malaysia. 
Cluster~5 identifies several varieties of Chinese and a few other Central and East Asian languages such as Kazakh and Uighur. Interestingly Korean is also placed in this group and not with Japanese, even though the Koreas are clustered together with Japan in Table~\ref{tab:bipartite_languages_source_clusters}. Cluster~6 captures more South-East Asian languages, this time from Thailand, Myanmar and Cambodia. Pattani Malay is in this cluster because despite its name it is spoken more in Thailand than in Malaysia. \vspace*{0.5cm} \begin{table}[H] \centering \scriptsize \begin{tabular}{ |c|c|c|c|c|c| } \hline \rule{0pt}{1.2em} Cluster 1 & Cluster 2 & Cluster 3 & Cluster 4 & Cluster 5 & Cluster 6 \\[0.1cm] \hline \rule{0pt}{1.2em} Spanish & English & Swahili & Indonesian & Chinese & Thai \\ Arabic & Hindi & Kinyarwanda & Javanese & Wu Chinese & N.E.\ Thai \\ Portuguese & Bengali & Somali & Malay & Korean & Khmer \\ French & Urdu & Luba-Lulua & Sundanese & Xiang Chinese & N.\ Thai \\ Russian & Punjabi & Kikuyu & Madurese & Hakka Chinese & S.\ Thai \\ Japanese & Telugu & Congo Swahili & Minangkabau & Minnan Chinese & Shan \\ German & Marathi & Luyia & Betawi & Gan Chinese & Pattani Malay \\ Turkish & Vietnamese & Ganda & Balinese & Kazakh & \\ Persian & Tamil & Luo & Buginese & Uighur & \\ Italian & Lahnda & Sukuma & Banjar & Sichuan Yi & \\ Egyptian Arabic & Filipino & Kalenjin & Achinese & Mongolian & \\ Polish & Gujarati & Lingala & Sasak & Zhuang & \\ Nigerian Pidgin & Kannada & Nyankole & Makasar & Tibetan & \\ Ukrainian & Pushto & Gusii & Lampung Api & & \\ Dutch & Malayalam & Kiga & Rejang & & \\ Algerian Arabic & Oriya & Soga & & & \\ Moroccan Arabic & Burmese & Luba-Katanga & & & \\ Hausa & Bhojpuri & Meru & & & \\ Azerbaijani & Amharic & Teso & & & \\ Uzbek & Oromo & Nyamwezi & & & \\ $\cdots$ & $\cdots$ & $\cdots$ & & & \\ $|\textrm{Cluster\ } 1 |$ = 120 & $|\textrm{Cluster\ } 2 |$ = 90 & $|\textrm{Cluster\ } 3 |$ = 25 & $|\textrm{Cluster\ } 4 |$ = 15 & $|\textrm{Cluster\ } 5 |$ = 13 & $|\textrm{Cluster\ } 6 |$ = 7 \\[0.1cm] \hline \end{tabular} \caption{Clustering the languages from the Unicode Languages network} \label{tab:bipartite_languages_dest_clusters} \end{table} \clearpage{} \clearpage{} \chapter{Conclusion} \label{chap:conclusions} With this dissertation we have introduced a graph-theoretic framework for analysis of weighted directed networks, and presented new matrix-based formulae for MAMs (Chapter~\ref{chap:graphs}). We have summarised the method of random-walk spectral clustering and shown how it can be used with motif-based techniques (Chapter~\ref{chap:spectral}). We have presented results from the application of a motif-based method both to synthetic data (DSBMs) and to real data (US Political Blogs network, US Migration network). We have demonstrated that this technique outperforms traditional spectral clustering methods on several occasions (Chapter~\ref{chap:motif}). We have introduced a motif-based spectral method for clustering bipartite graphs and presented results both from synthetic data (BSBMs) and from real data (American Revolution network, Unicode Languages network). In particular we have shown that motif-based spectral clustering is a valuable tool for clustering weighted directed networks, which is scalable and easy to implement. Superior performance has been demonstrated especially with asymmetric DSBMs in Section~\ref{sec:motif_asymm_dsbms}, and with the US Political Blogs network in Section~\ref{sec:motif_polblogs}. 
\section*{Limitations} There are limitations to our work. While our matrix-based formulae for MAMs are simple to implement and moderately scalable, they are computationally unwieldy for large networks (see Section~\ref{sec:notes_computation} for details). As mentioned in~\cite{benson2016higher}, fast triangle enumeration algorithms~\cite{demeyer2013ISMA,wernicke2006efficient,wernicke2006fanmod} offer increased performance, at the expense of methodological simplicity. Another shortcoming of the matrix-based formulae is that unlike motif detection algorithms such as~\cite{wernicke2006fanmod}, they do not extend to motifs on four or more vertices. \section*{Future work} There is plenty of scope for methodological investigation related to our work. Simple extensions could involve an analysis of the differences between clustering methods based on functional and structural MAMs respectively. One could also experiment with the effects of replacing the random-walk Laplacian with the unnormalised Laplacian or symmetric normalised Laplacian \cite{von2007tutorial}. Similarly, one might try replacing Ncut with RatioCut \cite{hagen1992new}. We note that although our methods apply to weighted graphs, we have only discussed unweighted DSBMs. Therefore, it would be interesting to investigate weighted DSBMs (perhaps following the exponential family method detailed in \cite{aicher2013adapting}) and to use them for evaluation of motif-based spectral clustering procedures. Further experimental work is also desirable. We would like to conduct experiments on more real data, and suggest that collaboration networks such as~\cite{snap:astro} and bipartite preference networks such as~\cite{icon:movie} could be interesting. Comparison with other clustering methods could also be insightful; the Hermitian matrices method in~\cite{DirectedClustImbCuts}, the PageRank method in~\cite{yin2017local} and \textsc{Tectonic} from~\cite{tsourakakis2017scalable} may give suitable benchmarks for performance. \clearpage{} %TC:ignore % enable appendix numbering format and include appendices \appendix \fancyhead[RO]{\itshape{\nouppercase{Appendix \thechapter : \leftmark}}} %TC:endignore \clearpage{} \chapter{Proofs and Examples}\label{chap:appendix_proofs} \section{Proofs} \begin{prf}[Proposition~\ref{prop:motif_adj_matrix_formula}, MAM formula] \label{proof:motif_adj_matrix_formula} % Consider $(1)$. We sum over functional instances $\ca{M} \cong \ca{H} \leq \ca{G}$ such that $\{i,j\} \in \ca{A(H)}$. This is equivalent to summing over $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ and $\sigma \in S_\ca{M,A}^\sim$, such that $k_u$ are all distinct and % $$ (u,v) \in \ca{E_M} \implies (k_{\sigma u}, k_{\sigma v}) \in \ca{E}\,. \qquad (\dagger) $$ % This is because the vertex set $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ indicates which vertices are present in the instance $\ca{H}$, and $\sigma$ describes the mapping from $\ca{V_M}$ onto those vertices: $u \mapsto k_{\sigma u}$. We take $\sigma \in S_\ca{M,A}^\sim$ to ensure that $\{i,j\} \in \ca{A(H)}$ (since $i=k_1, \ j=k_m$), and that instances are counted exactly once. The condition $(\dagger)$ checks that $\ca{H}$ is a functional instance of $\ca{M}$ in $\ca{G}$.
Hence % \begin{align*} M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e) \\ % &= \frac{1}{|\ca{E_M}|} \sum_{\{ k_2, \ldots, k_{m-1} \}} \sum_{\sigma \in S_\ca{M,A}^\sim} \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} \sum_{e \in \ca{E_H}} W(e)\,. \end{align*} % For the first term, by conditioning on the types of edge in $\ca{E_M}$: \begin{align*} % \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{ k_{\sigma u} \neq k_{\sigma v} \} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{ (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in \ca{E}\} \\ % &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= J^\mathrm{func}_{\mathbf{k},\sigma}\,. % \end{align*} % Assuming $\big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\}$, the second term is % \begin{align*} % \sum_{e \in \ca{E_H}} W(e) &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + W((k_{\sigma v},k_{\sigma u})) \big) \\ % &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= G^\mathrm{func}_{\mathbf{k},\sigma} \end{align*} % as required. For $(2)$, we simply change $(\dagger)$ to $(\ddagger)$ to check that an instance is a \emph{structural} instance: % $$ (u,v) \in \ca{E_M} \iff (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \qquad (\ddagger) $$ % Now for the first term: % \begin{align*} % \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\} &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \notin \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in \ca{E}\} \\ % &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{0})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= J^\mathrm{struc}_{\mathbf{k},\sigma}\,. % \end{align*} % Assuming $\big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\}$, the second term is % \begin{align*} % \sum_{e \in \ca{E_H}} W(e) &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + W((k_{\sigma v},k_{\sigma u})) \big) \\ % &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= G^\mathrm{struc}_{\mathbf{k},\sigma}\,. \end{align*} \hfill $\square$ \end{prf} \pagebreak \begin{prf}[Proposition~\ref{prop:motif_adj_matrix_computation}, Complexity of MAM formula] \label{proof:motif_adj_matrix_computation} Suppose ${m \leq 3}$ and consider $M^\mathrm{func}$. 
The adjacency and indicator matrices of $\ca{G}$ are % \begin{equation*} \begin{aligned}[c] &(1) \quad J = \bb{I} \{ G>0 \}\,, \\ &(2) \quad J_0 = \bb{I} \{ G + G^\top = 0 \} \circ J_\mathrm{n}\,, \\ &(3) \quad J_\mathrm{s} = J - J_\mathrm{d}\,, \\ &(4) \quad G_\mathrm{d} = (G + G^\top) \circ J_\mathrm{d} \,, \end{aligned} \hspace*{2cm} \begin{aligned}[c] &(5) \quad J_\mathrm{n} = \bb{I} \{I_{n \times n} = 0 \}\,, \\ &(6) \quad J_\mathrm{d} = J \circ J^\top\,, \\ &(7) \quad G_\mathrm{s} = G \circ J_\mathrm{s}\,, \\ & \end{aligned} \end{equation*} % and are computed using four additions and four element-wise multiplications. $J^\mathrm{func}_{\mathbf{k},\sigma}$ is a product of at most three factors, and $G^\mathrm{func}_{\mathbf{k},\sigma}$ contains at most three summands, so % $$ \sum_{k_2 \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma} $$ % is expressible as a sum of at most three matrices, each of which is constructed with at most one matrix multiplication (where $\{k_{\sigma r},k_{\sigma s}\} \neq \{i,j\}$) and one entry-wise multiplication (where $\{k_{\sigma r},k_{\sigma s}\} = \{i,j\}$). This is repeated for each $\sigma \in S_\ca{M,A}^\sim$ (at most six times) and the results are summed. Calculations are identical for $M^\mathrm{struc}$. \hfill $\square$ \end{prf} \begin{prf}[Proposition~\ref{prop:coll_expa_formulae}, Colliders and expanders in bipartite graphs] \label{proof:coll_expa_formulae} % Consider (1) and the collider motif $\ca{M}_\mathrm{coll}$. Since $\ca{G}$ is bipartite, $M_\mathrm{coll}^\mathrm{func} = M_\mathrm{coll}^\mathrm{struc} = \vcentcolon M_\mathrm{coll}$, and by Table~\ref{tab:motif_adj_mat_table}, $M_\mathrm{coll} = \frac{1}{2} J_\mathrm{n} \circ (J G^\top + G J^\top)$. Hence % \begin{align*} (M_\mathrm{coll})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J G^\top + G J^\top)_{i j} \\ &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \Big(J_{i k} G_{j k} + G_{i k} J_{j k} \Big) \\ &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \,\bb{I} \, \Big\{ (i,k),(j,k) \in \ca{E} \Big\} \Big[W((i,k)) + W((j,k))\Big] \\ &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,. \end{align*} % Similarly for the expander motif, $M_\mathrm{expa} = \frac{1}{2} J_\mathrm{n} \circ (J^\top G + G^\top J)$ so % \begin{align*} (M_\mathrm{expa})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J^\top G + G^\top J)_{i j} \\ &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. \end{align*} % \hfill $\square$ \end{prf} \section{Examples} \begin{example}[Functional and structural instances] \label{ex:instances} Let $\ca{G}=(\ca{V,E})$ be the graph with $\ca{V} = \{ 1,2,3,4 \}$ and $\ca{E} = \{ (1,2),(1,3),(1,4),(2,3),(3,4),(4,3) \}$. Let $(\ca{M,A})$ be the anchored motif with $\ca{V_M} = \{1,2,3\}$, $\ca{E_M} = \{(1,2),(1,3),(2,3)\}$ and $\ca{A} = \{1,3\}$ as defined in Figure \ref{fig:instance_example_1}. % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/instance_example_1/instance_example_1.pdf} \caption{The specified graph $\ca{G}$ and anchored motif $\ca{M}$} \label{fig:instance_example_1} \end{figure} % There are three functional instances of $\ca{M}$ in $\ca{G}$, shown in Figure~\ref{fig:instance_example_2}. 
However, there is just one structural instance of $\ca{M}$ in $\ca{G}$, given by $\ca{H}_1$. This is because the double edge $3 \leftrightarrow 4$ in $\ca{G}$ prevents the subgraphs on $\{1,3,4\}$ from being induced subgraphs. % \begin{align*} \ca{H}_1 &: \quad \ca{V}_1 = \{ 1,2,3 \} ; \quad \ca{E}_1 = \{ (1,2) , (2,3) , (1,3) \} ; \quad \ca{A(H}_1) = \big\{\{1,3\}\big\}\,, \\ \ca{H}_2 &: \quad \ca{V}_2 = \{ 1,3,4 \} ; \quad \ca{E}_2 = \{ (1,3) , (1,4) , (3,4) \} ; \quad \ca{A(H}_2) = \big\{\{1,4\}\big\}\,, \\ \ca{H}_3 &: \quad \ca{V}_3 = \{ 1,3,4 \} ; \quad \ca{E}_3 = \{ (1,3) , (1,4) , (4,3) \} ; \quad \ca{A(H}_3) = \big\{\{1,3\}\big\}\,. \end{align*} % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/instance_example_2/instance_example_2.pdf} \caption{Functional instances $\ca{H}_1,\ca{H}_2$ and $\ca{H}_3$} \label{fig:instance_example_2} \end{figure} \end{example} \begin{example}[Motif adjacency matrices] \label{ex:motif_adj_matrices} Let $\ca{G}$ and $\ca{(M,A)}$ be as in Example~\ref{ex:instances}, and suppose $\ca{G}$ has weight map $W((i,j)) \vcentcolon = i + j$. Then using Definition~\ref{def:motif_adj_matrices} directly, the functional and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are respectively \vspace*{0.2cm} $$ % M^\mathrm{func} = \begin{pmatrix} 0 & 0 & 28 & 16 \\ 0 & 0 & 0 & 0 \\ 28 & 0 & 0 & 0 \\ 16 & 0 & 0 & 0 \end{pmatrix} \,, \qquad M^\mathrm{struc} = \begin{pmatrix} 0 & 0 & 12 & 0 \\ 0 & 0 & 0 & 0 \\ 12 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 \end{pmatrix}\,. $$ \end{example} \pagebreak \begin{example}[Calculating an explicit formula for an MAM] \label{ex:motif_adj_calc} Consider the functional MAM of the simple motif $\ca{M}_6$ (Figure~\ref{fig:M6}). % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{../tikz/M6/M6.pdf} \caption{The motif $\ca{M}_6$} \label{fig:M6} \end{figure} % We use Equation (1) in Proposition~\ref{prop:motif_adj_matrix_formula}. Firstly, $m = |\ca{V_M}| = 3$ and $|\ca{E_M}| = 4$. The automorphism group of $\ca{M}_6$ has order 2, corresponding to swapping vertices 1 and 3. Hence $|S_\ca{M,A}^\sim| = |S_m| / 2 = 6/2 = 3$, and suitable representatives from $S_\ca{M,A}^\sim$ are $$ S_\ca{M,A}^\sim = \left\{ % \sigma_1 = \begin{pmatrix} 1 & 2 & 3 \\ 1 & 2 & 3 \end{pmatrix}, % \sigma_2 = \begin{pmatrix} 1 & 2 & 3 \\ 2 & 1 & 3 \end{pmatrix}, % \sigma_3 = \begin{pmatrix} 1 & 2 & 3 \\ 1 & 3 & 2 \end{pmatrix} \right\}\,. \vspace*{0.2cm}$$ % So by Proposition~\ref{prop:motif_adj_matrix_formula}, with $i=k_1$ and $j=k_3$, and writing $k$ for $k_2$: $$ M^\mathrm{func}_{i j} = \frac{1}{4} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{k \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma} $$ % where, since there are no missing edges in $\ca{M}_6$: % \begin{align*} % J^\mathrm{func}_{\mathbf{k},\sigma} &= \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{func}_{\mathbf{k},\sigma} &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,.
% \end{align*} % Writing out the sum over $\sigma$: % \begingroup \allowdisplaybreaks \begin{align*} M^\mathrm{func}_{i j} &= \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_1} \ G^\mathrm{func}_{\mathbf{k},\sigma_1} + \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_2} \ G^\mathrm{func}_{\mathbf{k},\sigma_2} + \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_3} \ G^\mathrm{func}_{\mathbf{k},\sigma_3} \\ % &= \frac{1}{4} \sum_{k=1}^n J_{j i} J_{j k} (J_\mathrm{d})_{i k} \big(G_{j i} + G_{j k} + (G_\mathrm{d})_{i k}\big) \\ & \qquad + \frac{1}{4} \sum_{k=1}^n J_{i j} J_{i k} (J_\mathrm{d})_{j k} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{j k}\big) \\ & \qquad + \frac{1}{4} \sum_{k=1}^n J_{k i} J_{k j} (J_\mathrm{d})_{i j} \big(G_{k i} + G_{k j} + (G_\mathrm{d})_{i j}\big) \\ % &= \frac{1}{4} J^\top_{i j} \sum_{k=1}^n (J_\mathrm{d})_{i k} J^\top_{k j} \big(G^\top_{i j} + (G_\mathrm{d})_{i k} + G^\top_{k j}\big) \\ & \qquad + \frac{1}{4} J_{i j} \sum_{k=1}^n J_{i k} (J_\mathrm{d})_{k j} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{k j}\big) \\ & \qquad + \frac{1}{4} (J_\mathrm{d})_{i j} \sum_{k=1}^n J^\top_{i k} J_{k j} \big((G_\mathrm{d})_{i j} + G^\top_{i k} + G_{k j}\big) \,, \end{align*} \endgroup % and writing this as a sum of entry-wise and matrix products: % \begin{align*} M^\mathrm{func} &= \frac{1}{4} \Big[ J^\top \circ (J_\mathrm{d} G^\top) + J^\top \circ (G_\mathrm{d} J^\top) + G^\top \circ (J_\mathrm{d} J^\top) \Big] \\ & \qquad + \frac{1}{4} \Big[ J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) \Big] \\ & \qquad + \frac{1}{4} \Big[ J_\mathrm{d} \circ (J^\top G) + J_\mathrm{d} \circ (G^\top J) + G_\mathrm{d} \circ (J^\top J) \Big] \end{align*} % where $A \circ B$ is an entry-wise product and $AB$ is a matrix product. Finally, setting $$C = J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)\,, $$ and $$ C' = G_\mathrm{d} \circ (J^\top J)\,, $$ we have $$ M^\mathrm{func} = \frac{1}{4} \big(C + C^\top + C' \big) $$ as in Table~\ref{tab:motif_adj_mat_table}, achieved with just five matrix multiplications, nine entry-wise multiplications and nine matrix additions (including the four entry-wise multiplications and four additions needed to construct the adjacency and indicator matrices). \end{example} \clearpage{} \clearpage{} \chapter{Motif Adjacency Matrix Formulae} \label{chap:appendix_matrices} We give explicit matrix-based formulae for functional motif adjacency matrices $M^\mathrm{func}$ for all simple motifs $\ca{M}$ on at most three vertices, along with the anchored motifs $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$. For structural motif adjacency matrices, simply replace $J_\mathrm{n}$, $J$ and $G$ with $J_0$, $J_\mathrm{s}$ and $G_\mathrm{s}$ respectively. Entry-wise products are denoted by $\circ$.
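As a quick illustration of how to read the table below, consider the collider motif $\ca{M}_\mathrm{coll}$. Its row gives $C = J_\mathrm{n} \circ (J G^\top)$ and $M^\mathrm{func} = \frac{1}{2} \big( C + C^\top \big)$; since $J_\mathrm{n}$ is symmetric and $(J G^\top)^\top = G J^\top$, this simplifies to
%
$$ M^\mathrm{func} = \frac{1}{2} \, J_\mathrm{n} \circ \big( J G^\top + G J^\top \big)\,, $$
%
which is precisely the expression for $M_\mathrm{coll}$ used in the proof of Proposition~\ref{prop:coll_expa_formulae}.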
\vspace*{0.2cm} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.8} \tiny \begin{tabular}{ |c|c|c|c| } \hline Motif & $C$ & $C'$ & $M^\mathrm{func}$ \\ \hline $\ca{M}_\mathrm{s}$ & & & $G + G^\top$ \\ \hline $\ca{M}_\mathrm{d}$ & & & $\frac{1}{2} G_\mathrm{d}$ \\ \hline $\ca{M}_1$ & $J^\top \circ (J G) + J^\top \circ (G J) + G^\top \circ (J J)$ & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_2$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J^\top \circ (J_\mathrm{d} G) + J^\top \circ (G_\mathrm{d} J) + G^\top \circ (J_\mathrm{d} J) \\ & + J^\top \circ (J G_\mathrm{d}) + J^\top \circ (G J_\mathrm{d}) + G^\top \circ (J J_\mathrm{d}) \\ & + J_\mathrm{d} \circ (J G) + J_\mathrm{d} \circ (G J) + G_\mathrm{d} \circ (J J) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{4} \big(C + C^\top\big)$ \\ \hline $\ca{M}_3$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J \circ (J_\mathrm{d} G_\mathrm{d}) + J \circ (G_\mathrm{d} J_\mathrm{d}) + G \circ (J_\mathrm{d} J_\mathrm{d}) \\ & + J_\mathrm{d} \circ (J_\mathrm{d} G) + J_\mathrm{d} \circ (G_\mathrm{d} J) + G_\mathrm{d} \circ (J_\mathrm{d} J) \\ & + J_\mathrm{d} \circ (J G_\mathrm{d}) + J_\mathrm{d} \circ (G J_\mathrm{d}) + G_\mathrm{d} \circ (J J_\mathrm{d}) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{5} \big(C + C^\top\big)$ \\ \hline $\ca{M}_4$ & $ J_\mathrm{d} \circ (J_\mathrm{d} G_\mathrm{d}) + J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{d}) + G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{d}) $ & & $ \frac{1}{6} C$ \\ \hline $\ca{M}_5$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J \circ (J G) + J \circ (G J) + G \circ (J J) \\ & + J \circ (J G^\top) + J \circ (G J^\top) + G \circ (J J^\top) \\ & + J \circ (J^\top G) + J \circ (G^\top J) + G \circ (J^\top J) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_6$ & $J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)$ & $G_\mathrm{d} \circ (J^\top J)$ & $\frac{1}{4} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_7$ & $J \circ (J_\mathrm{d} G) + J \circ (G_\mathrm{d} J) + G \circ (J_\mathrm{d} J)$ & $J_\mathrm{d} \circ (J G^\top) + J_\mathrm{d} \circ (G J^\top) + G_\mathrm{d} \circ (J J^\top)$ & $ \frac{1}{4} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_8$ & $J \circ (G J_\mathrm{n}) + G \circ (J J_\mathrm{n})$ & $J_\mathrm{n} \circ (J^\top G) + J_\mathrm{n} \circ (G^\top J)$ & $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_9$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J \circ (J_\mathrm{n} G^\top) + G \circ (J_\mathrm{n} J^\top) + J_\mathrm{n} \circ (J G) \\ & + J_\mathrm{n} \circ (G J) + J \circ (G^\top J_\mathrm{n}) + G \circ (J^\top J_\mathrm{n}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{2} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{10}$ & $J \circ (J_\mathrm{n} G) + G \circ (J_\mathrm{n} J)$ & $J_\mathrm{n} \circ (J G^\top) + J_\mathrm{n} \circ (G J^\top)$ & $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_{11}$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J_\mathrm{d} \circ (G J_\mathrm{n}) + G_\mathrm{d} \circ (J J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G) \\ & + J_\mathrm{n} \circ (G_\mathrm{d} J) + J \circ (G_\mathrm{d} J_\mathrm{n}) + G \circ (J_\mathrm{d} J_\mathrm{n}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{12}$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J_\mathrm{d} \circ (J_\mathrm{n} G) + G_\mathrm{d} \circ (J_\mathrm{n} J) 
+ J_\mathrm{n} \circ (J G_\mathrm{d}) \\ & + J_\mathrm{n} \circ (G J_\mathrm{d}) + J \circ (J_\mathrm{n} G_\mathrm{d}) + G \circ (J_\mathrm{n} J_\mathrm{d}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $ \frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{13}$ & $J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{n}) + G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G_\mathrm{d})$ & & $\frac{1}{4} \big(C + C^\top \big)$ \\ \hline $\ca{M}_\mathrm{coll}$ & $J_\mathrm{n} \circ (J G^\top)$ & & $\frac{1}{2} \big( C + C^\top \big)$ \\ \hline $\ca{M}_\mathrm{expa}$ & $J_\mathrm{n} \circ (J^\top G)$ & & $\frac{1}{2} \big( C + C^\top \big)$ \\ \hline \end{tabular} \caption{Functional motif adjacency matrix formulae} \label{tab:motif_adj_mat_table} \end{table} \clearpage{} \clearpage{} \chapter{Further Notes} \section{Computation} \label{sec:notes_computation} \subsection{Hardware and software} \label{sec:notes_hardware} The hardware used for computation was an \emph{Intel Core i7-4790} CPU at 3.60\,GHz, with 32\,GB of RAM. The software used was R 3.5.1 \cite{r_rsoftware}, along with several R packages: % % \begin{itemize} \item \textbf{igraph} \cite{r_igraph} for plotting networks \item \textbf{LICORS} \cite{r_LICORS} for an implementation of $k$-means++ \item \textbf{mclust} \cite{r_mclust} for an implementation of ARI \item \textbf{rnaturalearth} \cite{r_rnaturalearth} for world territory boundary data \item \textbf{RSpectra} \cite{r_RSpectra} for eigendecomposition of sparse matrices \item \textbf{USAboundaries} \cite{r_USAboundaries} for US county and state boundary data \end{itemize} \subsection{Timings for MAM computations} \label{sec:notes_timing} We record timings (in seconds) for the MAM formulae given in Table~\ref{tab:motif_adj_mat_table}. We test on DSBMs (Section~\ref{sec:motif_dsbms}) with $k=1$, and vary the graph size $n$ and sparsity parameter $p$. 
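To give a sense of scale before stating the tables, consider the motif $\ca{M}_9$ with $p = 0.1$, the slowest combination at the two larger graph sizes: its timings grow from $0.028$ seconds at $n = 100$, to $2.3$ seconds at $n = 1000$, to $1100$ seconds at $n = 10 \, 000$. Each tenfold increase in $n$ therefore multiplies the cost by
%
$$ \frac{2.3}{0.028} \approx 82 \qquad \textrm{and} \qquad \frac{1100}{2.3} \approx 478\,, $$
%
respectively. This is consistent with the matrix multiplications of Proposition~\ref{prop:motif_adj_matrix_computation} dominating the cost as the graphs become larger and denser; a cubic-time dense multiplication model would predict a factor of at most $10^3$.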
\vspace*{0.3cm} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 0.013 & 0.012 & 0.017 & 0.029 & 0.034 & 0.015 & 0.028 & 0.022 & 0.022 & 0.019 & 0.030 & 0.019 & 0.042 & 0.021 & 0.016 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 0.013 & 0.011 & 0.016 & 0.035 & 0.027 & 0.017 & 0.028 &0.024 & 0.023 & 0.026 & 0.027 & 0.018 & 0.021 & 0.022 & 0.016 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 0.013 & 0.012 & 0.024 & 0.028 & 0.028 & 0.016 & 0.028 & 0.022 & 0.032 & 0.021 & 0.026 & 0.020 & 0.023 & 0.023 & 0.017 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.014 & 0.019 & 0.019 & 0.031 & 0.029 & 0.019 & 0.033 & 0.025 & 0.032 & 0.023 & 0.028 & 0.023 & 0.026 & 0.025 & 0.019 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=100$}% \label{tab:timing_n_100}% \end{table} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{ |c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 0.13 & 0.14 & 0.14 & 0.32 & 0.14 & 0.13 & 0.14 & 0.31 & 0.13 & 0.21 & 0.22 & 0.21 & 0.20 & 0.34 & 0.16 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 0.30 & 0.13 & 0.15 & 0.16 & 0.16 & 0.14 & 0.16 & 0.32 & 0.14 & 0.48 & 0.37 & 0.29 & 0.31 & 0.29 & 0.17 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 0.11 & 0.14 & 0.17 & 0.19 & 0.14 & 0.13 & 0.21 & 0.18 & 0.18 & 0.64 & 0.73 & 0.89 & 0.46 & 0.56 & 0.18 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.23 & 0.22 & 0.60 & 1.1 & 0.57 & 0.24 & 1.4 & 0.86 & 0.69 & 1.5 & 2.3 & 1.6 & 1.6 & 1.6 & 0.67 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=1000$} \label{tab:timing_n_1000} \end{table} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{ |c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c| } \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9} 
$\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 11 & 12 & 12 & 12 & 12 & 12 & 12 & 12 & 12 & 41 & 55 & 37 & 38 & 34 & 15 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 13 & 12 & 13 & 13 & 12 & 12 & 13 & 12 & 12 & 61 & 89 & 54 & 56 & 48 & 15 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 13 & 13 & 36 & 36 & 14 & 13 & 82 & 36 & 36 & 150 & 230 & 130 & 130 & 99 & 36 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 33 & 31 & 170 & 260 & 160 & 53 & 410 & 210 & 210 & 700 & 1100 & 520 & 760 & 580 & 150 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=10 \, 000$} \label{tab:timing_n_10000} \end{table} \section{Data preprocessing} \label{sec:notes_preprocessing} All real networks were preprocessed by restriction to their largest connected component. The Unicode Languages network in Section~\ref{sec:bipartite_languages} was also preprocessed to remove territories with under one million inhabitants and languages with under one million speakers. Vertex and edge counts of all networks are stated \emph{after} this preprocessing. \section{US map} \label{sec:notes_us_map} % \vspace*{-0.8cm} \begin{figure}[H] \centering %\includegraphics[scale=0.6,draft=false]{% %../../results/us_migration/us_migration_map_state_names.pdf} \vspace*{-0.5cm} \caption{US map with state boundaries and state abbreviations} \label{fig:notes_us_map} \end{figure} \section{Word count} The word count of this dissertation is 6230 \unskip, obtained using \TeX \hspace*{-0.15cm} count by running % \begin{center} \texttt{texcount -relaxed -inc -0 -sum=1,1,1,0,0,0,0\,}. \end{center} % %The final dissertation should be no longer than 7,500 words, this usually %equates to 25--30 pages. The word count may exclude any table of contents, %all mathematical equations and symbols, diagrams, tables, bibliography and %the texts of computer programs. However any preface, footnotes, and %appendices must be included \clearpage{} %TC:ignore % add the bibliography to the contents page \pagestyle{empty} \cleardoublepage \phantomsection \addcontentsline{toc}{chapter}{References} % change bibliography name to References \renewcommand{\bibname}{References} \pagestyle{fancy} \fancyhead[RO]{\itshape{\nouppercase{References}}} \bibliography{refs} \bibliographystyle{abbrv} %TC:endignore \end{document} tex-fmt-0.5.2/tests/source/ociamthesis.cls000066400000000000000000000165271473573253500206100ustar00rootroot00000000000000% ociamthesis v2.2 % By Keith A. 
Gillow % Version 1.0 released 26/11/1997 %-------------------------- identification --------------------- \NeedsTeXFormat{LaTeX2e} \ProvidesClass{ociamthesis}[2010/11/22 v2.2 OCIAM thesis class] %-------------------------- initial code ----------------------- \def\logoversion{squarelogo} \DeclareOption{beltcrest}{\def\logoversion{beltcrest}} \DeclareOption{shieldcrest}{\def\logoversion{shieldcrest}} \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions\relax \LoadClass[a4paper]{report} % As an alternative to the above, one could use the next line for two-sided output %\LoadClass[a4paper,twoside,openright]{report} \RequirePackage{graphicx} % needed for latest frontpage logo \RequirePackage{ifthen} % needed for option parsing for logo \raggedbottom %define the default submitted text \newcommand{\submittedtext}{{A thesis submitted for the degree of}} % % DECLARATIONS % % These macros are used to declare arguments needed for the % construction of the title page and other preamble. % The year and term the thesis is submitted \def\degreedate#1{\gdef\@degreedate{#1}} % The full (unabbreviated) name of the degree \def\degree#1{\gdef\@degree{#1}} % The name of your Oxford college (e.g. Christ Church, Pembroke) \def\college#1{\gdef\@college{#1}} % % Setup chosen crest/logo % \ifthenelse{\equal{\logoversion}{shieldcrest}}% { % Traditional Oxford shield crest %Using latex metafont (Mathematical Institute system) \font\crestfont=oxcrest40 scaled\magstep3 \def\logo{{\crestfont \char1}} %For comlab system replace 1st line above with %\font\crestfont=crest scaled\magstep3 }{} \ifthenelse{\equal{\logoversion}{beltcrest}}% { % Newer Oxford Belt crest %Using latex metafont (Mathematical Institute system) \font\beltcrestfont=oxbeltcrest \def\logo{{\beltcrestfont \char0}} %For comlab system replace 1st line above with %\font\beltcrestfont=newcrest }{} \ifthenelse{\equal{\logoversion}{squarelogo}}% { % Latest Logo, Square version (the default!) % you need an oxlogo.eps or oxlogo.pdf file as appropriate \def\logo{{ %\includegraphics[width=32mm,draft=false]{../graphics/branding/oxlogo} }} }{} % % Define text area of page and margin offsets % \setlength{\topmargin}{0.0in} %0.0in \setlength{\oddsidemargin}{0.167in} % 0.33in \setlength{\evensidemargin}{-0.08in} %-0.08in \setlength{\textheight}{9.2in} %9.0in \setlength{\textwidth}{6.0in} %6.0in \setlength{\headheight}{15pt} % not set \setlength{\voffset}{-0.2in} % not set % % Environments % % This macro defines an environment for front matter that is always % single column even in a double-column document. \newenvironment{alwayssingle}{% \@restonecolfalse \if@twocolumn\@restonecoltrue\onecolumn \else\if@openright\cleardoublepage\else\clearpage\fi \fi}% {\if@restonecol\twocolumn \else\newpage\thispagestyle{empty}\fi} %define title page layout \renewcommand{\maketitle}{% \begin{alwayssingle} \renewcommand{\footnotesize}{\small} \renewcommand{\footnoterule}{\relax} \thispagestyle{empty} \null\vfill \begin{center} { \Huge {\bfseries {\@title}} \par} {\large \vspace*{40mm} {\logo \par} \vspace*{25mm}} {{\Large \@author} \par} {\large \vspace*{1.5ex} % 1ex {{\@college} \par} \vspace*{1ex} {University of Oxford \par} \vspace*{25mm} {{\submittedtext} \par} \vspace*{1ex} {\it {\@degree} \par} \vspace*{2ex} {\@degreedate}} \end{center} \null\vfill \end{alwayssingle}} % DEDICATION % % The dedication environment makes sure the dedication gets its % own page and is set out in verse format.
\newenvironment{dedication} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\LARGE } \end{center} \vspace{0.5cm} \begin{verse}\begin{center}} {\end{center}\end{verse}\end{alwayssingle}} % ACKNOWLEDGEMENTS % % The acknowledgements environment puts a large, bold, centered % "Acknowledgements" label at the top of the page. The acknowledgements % themselves appear in a quote environment, i.e. tabbed in at both sides, and % on their own page. \newenvironment{acknowledgements} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Acknowledgements} \end{center} \vspace{0.5cm} \begin{quote}} {\end{quote}\end{alwayssingle}} % The acknowledgementslong environment puts a large, bold, centered % "Acknowledgements" label at the top of the page. The acknowledgement itself % does not appear in a quote environment so you can get more in. \newenvironment{acknowledgementslong} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Acknowledgements} \end{center} \vspace{0.5cm}} {\end{alwayssingle}} % STATEMENT OF ORIGINALITY (AS SUGGESTED BY GSW) % % The originality environment puts a large, bold, centered % "Statement of originality" label at the top of the page. The statement % of originality itself appears in a quote environment, i.e. tabbed in at % both sides, and on its own page. \newenvironment{originality} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Statement of Originality} \end{center} \vspace{0.5cm} \begin{quote}} {\end{quote}\end{alwayssingle}} % The originalitylong environment puts a large, bold, centered % "Statement of originality" label at the top of the page. The statement % of originality itself does not appear in a quote environment so you can % get more in. \newenvironment{originalitylong} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Statement of Originality} \end{center} \vspace{0.5cm}} {\end{alwayssingle}} %ABSTRACT % %The abstract environment puts a large, bold, centered "Abstract" label at %the top of the page. The abstract itself appears in a quote environment, %i.e. tabbed in at both sides, and on its own page. \renewenvironment{abstract} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Abstract} \end{center} \vspace{0.5cm} \begin{quote}} {\end{quote}\end{alwayssingle}} %The abstractlong environment puts a large, bold, centered "Abstract" label at %the top of the page. The abstract itself does not appear in a quote %environment so you can get more in. \newenvironment{abstractlong} {\begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Abstract} \end{center} \vspace{0.5cm}} {\end{alwayssingle}} %The abstractseparate environment is for producing a standalone page with the %abstract, including the title and author etc., as required to be handed in %separately \newenvironment{abstractseparate} {\begin{alwayssingle} \thispagestyle{empty} \vspace*{-1in} \begin{center} { \Large {\bfseries {\@title}} \par} {{\large \vspace*{1ex} \@author} \par} {\large \vspace*{1ex} {{\@college} \par} {University of Oxford \par} \vspace*{1ex} {{\it \submittedtext} \par} {\it {\@degree} \par} \vspace*{2ex} {\@degreedate}} \end{center}} {\end{alwayssingle}} %ROMANPAGES % % The romanpages environment sets the page numbering to lowercase roman numerals % for the contents and figures lists.
It also resets % page-numbering for the remainder of the dissertation (arabic, starting at 1). \newenvironment{romanpages} {\cleardoublepage\setcounter{page}{1}\renewcommand{\thepage}{\roman{page}}} {\cleardoublepage\renewcommand{\thepage}{\arabic{page}}\setcounter{page}{1}} tex-fmt-0.5.2/tests/source/phd_dissertation.tex000066400000000000000000026224011473573253500216560ustar00rootroot00000000000000% !TeX program = lualatex %! TeX root = phd_dissertation.tex %\pdfvariable suppressoptionalinfo 512\relax \documentclass[11pt,lof]{puthesis} % packages \usepackage{amsmath} \usepackage{amssymb} \usepackage[amsmath,thmmarks,noconfig]{ntheorem} \usepackage{mathtools} \usepackage{multirow} \usepackage{pgfplots} \usepackage{graphicx} \usepackage{enumitem} \usepackage{subcaption} \usepackage{titlesec} \usepackage{stackengine} \usepackage{scalerel} \usepackage{microtype} \usepackage[boxruled,linesnumbered,commentsnumbered,procnumbered]{algorithm2e} \usepackage[longnamesfirst]{natbib} \usepackage[hypertexnames=false,hidelinks]{hyperref} \usepackage[norefs,nocites]{refcheck} \usepackage[defaultlines=3,all]{nowidow} \usepackage{float} % settings \pgfplotsset{compat=1.9} \setcitestyle{round} \captionsetup[subfigure]{justification=centering} \def\arraystretch{1.3} \renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textit{#1}} % tables numbered as figures \def\table{\def\figurename{Table}\figure} \let\endtable\endfigure \renewcommand\listfigurename{List of Figures and Tables} % arxiv \newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{\texttt{arXiv:#1}}} % github \newcommand{\github}[1]{\href{https://github.com/#1}{\texttt{github.com/#1}}} % blackboard \renewcommand{\P}{\ensuremath{\mathbb{P}}} \newcommand{\N}{\ensuremath{\mathbb{N}}} \newcommand{\R}{\ensuremath{\mathbb{R}}} \newcommand{\E}{\ensuremath{\mathbb{E}}} \newcommand{\Q}{\ensuremath{\mathbb{Q}}} \newcommand{\I}{\ensuremath{\mathbb{I}}} \newcommand{\Z}{\ensuremath{\mathbb{Z}}} % roman \newcommand{\rF}{\ensuremath{\mathrm{F}}} \newcommand{\rH}{\ensuremath{\mathrm{H}}} \newcommand{\rL}{\ensuremath{\mathrm{L}}} \newcommand{\rk}{\ensuremath{\mathrm{k}}} \newcommand{\rd}{\ensuremath{\mathrm{d}}} \newcommand{\comp}{\ensuremath{\mathrm{c}}} \newcommand{\TV}{\mathrm{TV}} % bold \newcommand{\bW}{\ensuremath{\mathbf{W}}} \newcommand{\bY}{\ensuremath{\mathbf{Y}}} \newcommand{\bX}{\ensuremath{\mathbf{X}}} \newcommand{\bT}{\ensuremath{\mathbf{T}}} \newcommand{\bA}{\ensuremath{\mathbf{A}}} \newcommand{\bV}{\ensuremath{\mathbf{V}}} % calligraphic \newcommand{\cH}{\ensuremath{\mathcal{H}}} \newcommand{\cF}{\ensuremath{\mathcal{F}}} \newcommand{\cN}{\ensuremath{\mathcal{N}}} \newcommand{\cX}{\ensuremath{\mathcal{X}}} \newcommand{\cG}{\ensuremath{\mathcal{G}}} \newcommand{\cW}{\ensuremath{\mathcal{W}}} \newcommand{\cB}{\ensuremath{\mathcal{B}}} \newcommand{\cS}{\ensuremath{\mathcal{S}}} \newcommand{\cT}{\ensuremath{\mathcal{T}}} \newcommand{\cV}{\ensuremath{\mathcal{V}}} \newcommand{\cE}{\ensuremath{\mathcal{E}}} \newcommand{\cU}{\ensuremath{\mathcal{U}}} \newcommand{\cR}{\ensuremath{\mathcal{R}}} \newcommand{\cA}{\ensuremath{\mathcal{A}}} \newcommand{\cC}{\ensuremath{\mathcal{C}}} \newcommand{\cM}{\ensuremath{\mathcal{M}}} \newcommand{\cD}{\ensuremath{\mathcal{D}}} \newcommand{\cP}{\ensuremath{\mathcal{P}}} \newcommand{\cI}{\ensuremath{\mathcal{I}}} \newcommand{\cY}{\ensuremath{\mathcal{Y}}} % sans serif \newcommand{\T}{\ensuremath{\mathsf{T}}} % symbols \newcommand{\vvvert}{{\vert\kern-0.25ex\vert\kern-0.25ex\vert}} 
\newcommand{\bigvvvert}{{\big\vert\kern-0.35ex\big\vert\kern-0.35ex\big\vert}} \newcommand{\Bigvvvert}{{\Big\vert\kern-0.3ex\Big\vert\kern-0.3ex\Big\vert}} \newcommand{\bigsetminus}{\mathbin{\big\backslash}} \newcommand{\Bigsetminus}{\mathbin{\Big\backslash}} \newcommand{\dprime}{\ensuremath{\prime\prime}} \newcommand{\tprime}{\ensuremath{\prime\prime\prime}} \newcommand{\objective}{\ensuremath{\mathrm{obj}}} \newcommand{\Dl}{\ensuremath{D_{\textup{lo}}}} \newcommand{\Du}{\ensuremath{D_{\textup{up}}}} % floor of beta \newcommand{\flbeta}{{\ThisStyle{% \ensurestackMath{\stackengine{-0.5\LMpt}{\SavedStyle \beta}% {\SavedStyle {\rule{3.7\LMpt}{0.3\LMpt}}} {U}{c}{F}{F}{S}}\vphantom{\beta}}}} % operators \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Cov}{Cov} \DeclareMathOperator{\AIMSE}{AIMSE} \DeclareMathOperator{\LOOCV}{LOOCV} \DeclareMathOperator{\symconv}{symconv} \DeclareMathOperator{\GCV}{GCV} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator*{\logistic}{logistic} \DeclareMathOperator{\Bias}{Bias} \DeclareMathOperator{\Env}{Env} \DeclareMathOperator*{\esssup}{ess\,sup} \DeclareMathOperator{\Ber}{Ber} \DeclareMathOperator{\KL}{KL} \DeclareMathOperator{\Gam}{Gam} \DeclareMathOperator{\Yule}{Yule} \DeclareMathOperator{\rank}{rank} \DeclareMathOperator{\Exp}{Exp} \DeclareMathOperator{\Bin}{Bin} \DeclareMathOperator{\Tr}{Tr} \DeclareMathOperator{\Leb}{Leb} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator*{\minimize}{minimize:} \DeclareMathOperator*{\subjectto}{subject\ to:} \DeclareMathOperator{\ROT}{ROT} \newcommand{\diff}[1]{\,\mathrm{d}#1} % theorem environments \renewtheoremstyle{break}{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % ##2}\hbox{\strut}}}]% }{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % ##2\ \normalfont (##3)}\hbox{\strut}}}]% } \theoremstyle{break} \theorempreskip{7mm} \newtheorem{theorem}{Theorem}[section] \newtheorem{lemma}{Lemma}[section] \newtheorem{assumption}{Assumption}[section] \newtheorem{corollary}{Corollary}[section] \newtheorem{proposition}{Proposition}[section] \newtheorem{definition}{Definition}[section] \newtheorem{remark}{Remark}[section] % proof environments \let\proof\relax \newtheoremstyle{proof}{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % }\hbox{\strut}}}]% }{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % \normalfont (##3)}\hbox{\strut}}}]% } \theoremstyle{proof} \theorembodyfont{\upshape} \theorempreskip{7mm} \theoremsymbol{\ensuremath{\square}} \newtheorem{proof}{Proof} \AtBeginEnvironment{proof}{\setcounter{proofparagraphcounter}{0}}% % proof paragraphs \titleformat{\paragraph}[hang]{\bfseries\upshape}{}{0pt}{}[] \titlespacing*{\paragraph}{0pt}{6pt}{0pt} \newcounter{proofparagraphcounter} \newcommand{\proofparagraph}[1]{ \refstepcounter{proofparagraphcounter}% \paragraph{Part \theproofparagraphcounter : #1}}% % inline roman lists \newlist{inlineroman}{enumerate*}{1} \setlist[inlineroman]{afterlabel=~,label=(\roman*)} % algorithms \DontPrintSemicolon% \makeatletter% \renewcommand{\SetKwInOut}[2]{% \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% \newcommand\InOutSizeDefined{}% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{% \parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~% }% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \else% \ifdim\wd\algocf@inoutbox>\inoutsize% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{% \parbox[t]{\inoutsize}% 
{\KwSty{#2}\algocf@typo:\hfill}~% }% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \fi% \fi% \algocf@newcommand{#1}[1]{% \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}{% \let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% {\inoutsize}{\KwSty{#2}% \algocf@typo:\hfill}~##1\par% }% \algocf@linesnumbered% }% }% \makeatother% \SetKwInOut{Input}{Input}% \SetKwInOut{Output}{Output}% \setlength{\algomargin}{2em}% \author{William George Underwood} \adviser{Matias Damian Cattaneo} \title{Estimation and Inference in \\ Modern Nonparametric Statistics} \abstract{ % 350 words max Nonparametric methods are central to modern statistics, enabling data analysis with minimal assumptions in a wide range of scenarios. While contemporary procedures such as random forests and kernel methods are popular due to their performance and flexibility, their statistical properties are often less well understood. The availability of sound inferential techniques is vital in the sciences, allowing researchers to quantify uncertainty in their models. We develop methodology for robust and practical statistical estimation and inference in some modern nonparametric settings involving complex estimators and nontraditional data. We begin in the regression setting by studying the Mondrian random forest, a variant in which the partitions are drawn from a Mondrian process. We present a comprehensive analysis of the statistical properties of Mondrian random forests, including a central limit theorem for the estimated regression function and a characterization of the bias. We show how to conduct feasible and valid nonparametric inference by constructing confidence intervals, and further provide a debiasing procedure that enables minimax-optimal estimation rates for smooth function classes in arbitrary dimension. Next, we turn our attention to nonparametric kernel density estimation with dependent dyadic network data. We present results for minimax-optimal estimation, including a novel lower bound for the dyadic uniform convergence rate, and develop methodology for uniform inference via confidence bands and counterfactual analysis. Our methods are based on strong approximations and are designed to be adaptive to potential dyadic degeneracy. We give empirical results with simulated and real-world economic trade data. Finally, we develop some new probabilistic results with applications to nonparametric statistics. Coupling has become a popular approach for distributional analysis in recent years, and Yurinskii's method stands out for its wide applicability and explicit formulation. We present a generalization of Yurinskii's coupling, treating approximate martingale data under weaker conditions than previously imposed. We allow for Gaussian mixture coupling distributions, and a third-order method permits faster rates in certain situations. We showcase our results with applications to factor models and martingale empirical processes, as well as nonparametric partitioning-based and local polynomial regression procedures. } \acknowledgments{ I am extremely fortunate to have been surrounded by many truly wonderful people over the course of my career, and without their support this dissertation would not have been possible. While it is impossible for me to identify every one of them individually, I would like to mention a few names in particular to recognize those who have been especially important to me during the last few years. 
Firstly, I would like to express my utmost gratitude to my Ph.D.\ adviser, Matias Cattaneo. Working with Matias has been genuinely inspirational for me, and I could not have asked for a more rewarding start to my journey as a researcher. From the very beginning, he has guided me expertly through my studies, providing hands-on assistance when required while also allowing me the independence necessary to develop as an academic. I hope that, during the four years we have worked together, I have acquired just a fraction of his formidable mathematical intuition, keen attention to detail, boundless creativity, and inimitable pedagogical skill. Alongside his role as my adviser, Matias has been above all a friend, who has been in equal measure inspiring, insightful, dedicated, understanding, and kind. Secondly, I would like to thank all of the faculty members at Princeton and beyond who have acted as my collaborators and mentors, without whom none of my work could have been realized. In particular, I express my gratitude to my tireless Ph.D.\ committee members and letter writers Jianqing Fan and Jason Klusowski, my coauthors Yingjie Feng and Ricardo Masini, my dissertation reader Boris Hanin, my teachers Amir Ali Ahmadi, Ramon van Handel, Mikl{\'o}s R{\'a}cz, and Mykhaylo Shkolnikov, my colleagues Sanjeev Kulkarni and Roc{\'i}o Titiunik, and my former supervisor Mihai Cucuringu. I am also thankful for the staff members at Princeton who have been perpetually helpful, and I would like to identify Kim Lupinacci in particular; her assistance in all things administrative has been invaluable. I am grateful to my fellow graduate students in the ORFE department for their technical expertise and generosity with their time, and for making Sherrerd Hall such a vibrant and exciting space, especially Jose Avilez, Pier Beneventano, Ben Budway, Rajita Chandak, Abraar Chaudhry, Stefan Clarke, Giulia Crippa, G{\"o}k{\c{c}}e Dayan{\i}kl{\i}, Nicolas Garcia, Felix Hoefer, Erica Lai, Jackie Lok, Maya Mutic, Dan Rigobon, Till Saenger, Rajiv Sambharya, Boris Shigida, Igor Silin, Giang Truong, and Rae Yu. Our regular social events made a contribution to my well-being which is difficult to overstate. My thanks extend also to the students I taught, as well as to my group of senior thesis undergraduates, for their commitment, patience, and responsiveness. More broadly, I would like to thank all of my friends, near and far, for their unfailing support and reliability, and for helping to create so many of my treasured memories. In particular, Ole Agersnap, James Ashford, Christian Baehr, Chris Bambic, Kevin Beeson, James Broadhead, Alex Cox, Reece Edmends, Robin Franklin, Greg Henderson, Bonnie Ko, Grace Matthews, Dan Mead, Ben Musachio, Jacob Neis, Monika Papayova, Will Pedrick, Oliver Philcox, Nandita Rao, Alex Rice, Edward Rowe, David Snyder, Titi Sodimu, Nikitas Tampakis, and Anita Zhang. Thank you to the Princeton Chapel Choir for being such a wonderful community of musicians and a source of close friends, and to our directors, Nicole Aldrich and Penna Rose, and organist Eric Plutz. Lastly, yet most importantly, I want to thank my family for their unwavering support throughout my studies. My visits back home have been a source of joy throughout my long and often challenging Ph.D., and I cherish every moment I have spent with my parents, sister, grandparents, and extended family. 
} \begin{document} \chapter{Introduction} % nonparametric estimation is common Nonparametric estimation procedures are at the heart of many contemporary theoretical and methodological topics within the fields of statistics, data science, and machine learning. Where classical parametric techniques impose specific distributional and structural assumptions when modeling statistical problems, nonparametric methods instead take a more flexible approach, typically positing only high-level restrictions such as moment conditions, independence criteria, and smoothness assumptions. Examples of such procedures abound in modern data science and machine learning, encompassing histograms, kernel estimators, smoothing splines, decision trees, nearest neighbor methods, random forests, neural networks, and many more. % nonparametric estimation is good The benefits of the nonparametric framework are clear: statistical procedures can be formulated in cases where the stringent assumptions of parametric models are untestable, demonstrably violated, or simply unreasonable. As a consequence, the resulting methods often inherit desirable robustness properties against various forms of misspecification or misuse. The class of problems that can be formulated is correspondingly larger: arbitrary distributions and relationships can be characterized and estimated in a principled manner. % nonparametric estimation is hard Nonetheless, these attractive properties do come at a price. In particular, as its name suggests, the nonparametric approach forgoes the ability to reduce a complex statistical problem to that of estimating a fixed, finite number of parameters. Rather, nonparametric procedures typically involve making inferences about a growing number of parameters simultaneously, as witnessed in high-dimensional regimes, or even directly handling infinite-dimensional objects such as entire regression or density functions. As a consequence, nonparametric estimators are usually less efficient than their correctly specified parametric counterparts, when they are available; rates of convergence tend to be slower, and confidence sets more conservative. Another challenge is that theoretical mathematical analyses of nonparametric estimators are often significantly more demanding than those required for low-dimensional parametric settings, necessitating tools from contemporary developments in high-dimensional concentration phenomena, coupling and strong approximation theory, empirical processes, mathematical optimization, and stochastic calculus. % nonparametric inference In addition to providing accurate point estimates of unknown (possibly high-dimensional or infinite-dimensional) quantities of interest, modern nonparametric procedures are also expected to come equipped with methodologies for conducting statistical inference. The availability of such inferential techniques is paramount, with contemporary nonparametric methods forming a ubiquitous component of modern data science tool kits. Valid uncertainty quantification is essential for hypothesis testing, error bar construction, assessing statistical significance, and performing power analyses. Inference is a central concept in classical statistics, and despite the rapid recent development of theory for modern nonparametric estimators, their applicability to statistical inference is in certain cases rather less well studied; theoretically sound and practically implementable inference procedures are sometimes absent in the literature. 
% complex data In any statistical modeling problem, the selection and application of an estimator must naturally be tailored to the available data. Today, much of the data produced and analyzed does not necessarily fit neatly into the classical framework of independent and identically distributed samples, and instead might consist of time series, stochastic processes, networks, or high-dimensional or functional data, to name just a few. Therefore, it is important to understand how nonparametric methods might be adapted to correctly handle these data types, maintaining fast estimation rates and valid techniques for statistical inference. The technical challenges associated with such an endeavor are non-trivial; many standard techniques are ineffective in the presence of dependent or infinite-dimensional data, for example. As such, the development of new mathematical results in probability theory plays an important role in the comprehensive treatment of nonparametric statistics with complex data. \section*{Overview of the dissertation} % what we do This dissertation presents a selection of topics relating to nonparametric estimation and inference, and the associated technical mathematical tools. % mondrian Chapter~\ref{ch:mondrian}, titled ``Inference with Mondrian Random Forests,'' is based on the work of \citet{cattaneo2023inference}. % what are random forests Random forests are popular ensembling-based methods for classification and regression, which are well known for their good performance, flexibility, robustness, and efficiency. The majority of random forest models share the following common framework for producing estimates of a classification or regression function using covariates and a response variable. Firstly, the covariate space is partitioned in some algorithmic manner, possibly using a source of external randomness. Secondly, a local estimator of the classification or regression function is fitted to the responses in each cell separately, yielding a tree estimator. Finally, this process is repeated with many different partitions, and the resulting tree estimators are averaged to produce a random forest. % why are there variants Many different variants of random forests have been proposed in recent years, typically with the aim of improving their statistical or computational properties, or simplifying their construction in order to permit a more detailed theoretical analysis. % mondrian random forests One interesting such example is that of the Mondrian random forest, in which the underlying partitions (or trees) are constructed independently of the data. Naturally, this restriction rules out many classical random forest models, which exhibit a complex and data-dependent partitioning scheme. Instead, trees are sampled from a canonical stochastic process known as the Mondrian process, which endows the resulting tree and forest estimators with various agreeable features. % what we do We study the estimation and inference properties of Mondrian random forests in the nonparametric regression setting. In particular, we establish a novel central limit theorem for the estimates made by a Mondrian random forest which, when combined with a characterization of the bias and a consistent variance estimator, allows one to perform asymptotically valid statistical inference, such as constructing confidence intervals, on the unknown regression function. 
We also provide a debiasing procedure for Mondrian random forests, which allows them to achieve minimax-optimal estimation rates with H{\"o}lder smooth regression functions, for any smoothness parameter and in arbitrary dimension. % kernel Chapter~\ref{ch:kernel}, titled ``Dyadic Kernel Density Estimators,'' is based on the work of \citet{cattaneo2024uniform}. Network data plays an important role in statistics, econometrics, and many other data science disciplines, providing a natural framework for modeling relationships between units, be they people, financial institutions, proteins, or economic entities. Of prominent interest is the task of performing statistical estimation and inference with data sampled from the edges of such networks, known as dyadic data. The archetypal lack of independence between edges in a network renders many classical statistical tools unsuited for direct application. As such, researchers must appeal to techniques tailored to dyadic data in order to accurately capture the complex structure present in the network. % broad scope We focus on nonparametric estimation and inference with dyadic data, and in particular we seek methods that are robust in the sense that our results should hold uniformly across the support of the data. Such uniformity guarantees allow for statistical inference in a broader range of settings, including specification testing and distributional counterfactual analysis. We specifically consider the problem of uniformly estimating a dyadic density function, focusing on kernel estimators taking the form of dyadic empirical processes. % main contributions Our main contributions include the minimax-optimal uniform convergence rate of the dyadic kernel density estimator, along with strong approximation results for the associated standardized and Studentized $t$-processes. A consistent variance estimator enables the construction of feasible uniform confidence bands for the unknown density function. We showcase the broad applicability of our results by developing novel counterfactual density estimation and inference methodology for dyadic data, which can be used for causal inference and program evaluation. % why it is difficult A crucial feature of dyadic distributions is that they may be ``degenerate'' at certain points in the support of the data, a property that makes our analysis somewhat delicate. Nonetheless, our methods for uniform inference remain robust to the potential presence of such points. % applications For implementation purposes, we discuss inference procedures based on positive semi-definite covariance estimators, mean squared error optimal bandwidth selectors, and robust bias correction. We illustrate the empirical performance of our methods in simulations and with real-world trade data, for which we make comparisons between observed and counterfactual trade distributions in different years. Our technical results on strong approximations and maximal inequalities are of potential independent interest. % yurinskii Finally, Chapter~\ref{ch:yurinskii}, titled ``Yurinskii's Coupling for Martingales,'' is based on the work of \citet{cattaneo2022yurinskii}. Yurinskii's coupling is a popular theoretical tool for non-asymptotic distributional analysis in mathematical statistics and applied probability. Coupling theory, also known as strong approximation, provides an alternative framework to the more classical weak convergence approach to statistical analysis. 
Rather than merely approximating the distribution of a random variable, strong approximation techniques construct a sequence of random variables which are close almost surely or in probability, often with finite-sample guarantees. % what is it used for Coupling allows distributional analysis in settings where weak convergence fails, including many applications to nonparametric or high-dimensional statistics; it is a key technical component in the main strong approximation results of our Chapter~\ref{ch:kernel}. The Yurinskii method specifically offers a Gaussian coupling with an explicit error bound under easily verified conditions; originally stated in $\ell^2$-norm for sums of independent random vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some strong conditions. % what we do We present as our main result a Yurinskii coupling for approximate martingales in $\ell^p$-norm, under substantially weaker conditions than previously imposed. Our formulation allows the coupling variable to follow a general Gaussian mixture distribution, and we provide a novel third-order coupling method which gives tighter approximations in certain situations. We specialize our main result to mixingales, martingales, and independent data, and derive uniform Gaussian mixture strong approximations for martingale empirical processes. Applications to nonparametric partitioning-based and local polynomial regression procedures are provided. % appendices Supplementary materials for Chapters~\ref{ch:mondrian}, \ref{ch:kernel}, and \ref{ch:yurinskii} are provided in Appendices~\ref{app:mondrian}, \ref{app:kernel}, and \ref{app:yurinskii} respectively. These contain detailed proofs of the main results, additional technical contributions, and further discussion. \chapter[Inference with Mondrian Random Forests]% {Inference with \\ Mondrian Random Forests} \label{ch:mondrian} % abstract Random forests are popular methods for classification and regression, and many different variants have been proposed in recent years. One interesting example is the Mondrian random forest, in which the underlying trees are constructed according to a Mondrian process. In this chapter we give a central limit theorem for the estimates made by a Mondrian random forest in the regression setting. When combined with a bias characterization and a consistent variance estimator, this allows one to perform asymptotically valid statistical inference, such as constructing confidence intervals, on the unknown regression function. We also provide a debiasing procedure for Mondrian random forests which allows them to achieve minimax-optimal estimation rates with $\beta$-H{\"o}lder regression functions, for all $\beta$ and in arbitrary dimension, assuming appropriate parameter tuning. \section{Introduction} Random forests, first introduced by \citet{breiman2001random}, are a workhorse in modern machine learning for classification and regression tasks. Their desirable traits include computational efficiency (via parallelization and greedy heuristics) in big data settings, simplicity of configuration and amenability to tuning parameter selection, ability to adapt to latent structure in high-dimensional data sets, and flexibility in handling mixed data types. Random forests have achieved great empirical successes in many fields of study, including healthcare, finance, online commerce, text analysis, bioinformatics, image classification, and ecology. 
Since Breiman introduced random forests over twenty years ago, the study of their statistical properties has remained an active area of research: see \citet{scornet2015consistency}, \citet{chi2022asymptotic}, \citet{klusowski2024large}, and references therein, for a sample of recent developments. Many fundamental questions about Breiman's random forests remain unanswered, owing in part to the subtle ingredients present in the estimation procedure which make standard analytical tools ineffective. These technical difficulties stem from the way the constituent trees greedily partition the covariate space, utilizing both the covariate and response data. This creates complicated dependencies on the data which are often exceedingly hard to untangle without overly stringent assumptions, thereby hampering theoretical progress. To address the aforementioned technical challenges while retaining the phenomenology of Breiman's random forests, a variety of stylized versions of random forest procedures have been proposed and studied in the literature. These include centered random forests \citep{biau2012analysis,arnould2023interpolation} and median random forests \citep{duroux2018impact,arnould2023interpolation}. Each tree in a centered random forest is constructed by first choosing a covariate uniformly at random and then splitting the cell at the midpoint along the direction of the chosen covariate. Median random forests operate in a similar way, but involve the covariate data by splitting at the empirical median along the direction of the randomly chosen covariate. Known as purely random forests, these procedures simplify Breiman's original---albeit more data-adaptive---version by growing trees that partition the covariate space in a way that is statistically independent of the response data. Yet another variant of random forests, the Mondrian random forest \citep{lakshminarayanan2014mondrian}, has received significant attention in the statistics and machine learning communities in recent years \citep{ma2020isolation, mourtada2020minimax, scillitoe2021uncertainty, mourtada2021amf, vicuna2021reducing, gao2022towards, oreilly2022stochastic}. Like other purely random forest variants, Mondrian random forests offer a simplified modification of Breiman's original proposal in which the partition is generated independently of the data and according to a canonical stochastic process known as the Mondrian process \citep{roy2008mondrian}. The Mondrian process takes a single parameter $\lambda > 0$ known as the ``lifetime'' and enjoys various mathematical properties. These probabilistic features allow Mondrian random forests to be fitted in an online manner as well as being subject to a rigorous statistical analysis, while also retaining some of the appealing features of other more traditional random forest methods. This chapter studies the statistical properties of Mondrian random forests. We focus on this purely random forest variant not only because of its importance in the development of random forest theory in general, but also because the Mondrian process is, to date, the only known recursive tree mechanism involving randomization, pure or data-dependent, for which the resulting random forest is minimax-optimal for point estimation over a class of smooth regression functions in arbitrary dimension \citep{mourtada2020minimax}.
In fact, when the covariate dimension exceeds one, the aforementioned centered and median random forests are both minimax-\emph{suboptimal}, due to their large biases, over the class of Lipschitz smooth regression functions \citep{klusowski2021sharp}. It is therefore natural to focus our study of inference for random forests on versions that at the very least exhibit competitive bias and variance, as this will have important implications for the trade-off between precision and confidence. Despite their recent popularity, relatively little is known about the formal statistical properties of Mondrian random forests. Focusing on nonparametric regression, \citet{mourtada2020minimax} recently showed that Mondrian forests containing just a single tree (called a Mondrian tree) can be minimax-optimal in integrated mean squared error whenever the regression function is $\beta$-H{\"o}lder continuous for some $\beta \in (0, 1]$. The authors also showed that, when appropriately tuned, large Mondrian random forests can be similarly minimax-optimal for $\beta \in (0, 2]$, while the constituent trees cannot. See also \citet{oreilly2022stochastic} for analogous results for more general Mondrian tree and forest constructions. These results formally demonstrate the value of ensembling with random forests from a point estimation perspective. No results are currently available in the literature for statistical inference using Mondrian random forests. This chapter contributes to the literature on the foundational statistical properties of Mondrian random forest regression estimation with two main results. Firstly, we give a central limit theorem for the classical Mondrian random forest point estimator, and propose valid large-sample inference procedures employing a consistent standard error estimator. We establish this result by deploying a martingale central limit theorem \citep[Theorem~3.2]{hall1980martingale}, which is needed to handle delicate probabilistic features of the Mondrian random forest estimator. In particular, we deal with the existence of Mondrian cells which are ``too small'' and lead to a reduced effective (local) sample size for some trees in the forest. Such pathological cells are in fact typical in Mondrian random forests and complicate the probability limits of certain sample averages; indeed, small Mondrian random forests (or indeed single Mondrian trees) remain random even in the limit due to the lack of ensembling. The presence of small cells renders inapplicable prior distributional approximation results for partitioning-based estimators in the literature \citep{huang2003local,cattaneo2020large}, since the commonly required quasi-uniformity assumption on the underlying partitioning scheme is violated by cells generated using the Mondrian process. We circumvent this technical challenge by establishing new theoretical results for Mondrian partitions and their associated Mondrian trees and forests, which may be of independent interest. The second main contribution of the chapter is to propose a debiasing approach for the Mondrian random forest point estimator. We accomplish this by first precisely characterizing the probability limit of the large sample conditional bias, and then applying a debiasing procedure based on the generalized jackknife \citep{schucany1977improvement}. We thus exhibit a Mondrian random forest variant which is minimax-optimal in pointwise mean squared error when the regression function is $\beta$-H{\"o}lder for any $\beta > 0$.
Our method works by generating an ensemble of Mondrian random forests carefully chosen to have smaller misspecification bias when extra smoothness is available, resulting in minimax optimality even for $\beta > 2$. This result complements \citet{mourtada2020minimax} by demonstrating the existence of a class of Mondrian random forests that can efficiently exploit the additional smoothness of the unknown regression function for minimax-optimal point estimation. Our proposed debiasing procedure is also useful when conducting statistical inference because it provides a principled method for ensuring that the bias is negligible relative to the standard deviation of the estimator. More specifically, we use our debiasing approach to construct valid inference procedures based on robust bias correction \citep{calonico2018effect,calonico2022coverage}. This chapter is structured as follows. In Section~\ref{sec:mondrian_setup} we introduce the Mondrian process and give our assumptions on the data generating process, using a H{\"o}lder smoothness condition on the regression function to control the bias of various estimators. We define the Mondrian random forest estimator and present our assumptions on its lifetime parameter and the number of trees. We also fix the notation used throughout the remainder of the chapter. Section~\ref{sec:mondrian_inference} presents our first set of main results, beginning with a central limit theorem for the centered Mondrian random forest estimator (Theorem~\ref{thm:mondrian_clt}), in which we characterize the limiting variance. Theorem~\ref{thm:mondrian_bias} complements this result by precisely calculating the limiting bias of the estimator, with the aim of subsequently applying a debiasing procedure. To enable valid feasible statistical inference, we provide a consistent variance estimator in Theorem~\ref{thm:mondrian_variance_estimation} and briefly discuss implications for lifetime parameter selection. In Section~\ref{sec:mondrian_overview_proofs} we provide a brief overview of the proofs of these first main results. We focus on the technical innovations and general strategic approach, giving some insight into the challenges involved, and refer the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs. In Section~\ref{sec:mondrian_debiased} we define debiased Mondrian random forests, a collection of estimators based on linear combinations of Mondrian random forests with varying lifetime parameters. These parameters are carefully chosen to annihilate leading terms in our bias characterization, yielding an estimator with provably superior bias properties (Theorem~\ref{thm:mondrian_bias_debiased}). In Theorem~\ref{thm:mondrian_clt_debiased} we verify that a central limit theorem continues to hold for the debiased Mondrian random forest. We again state the limiting variance, discuss the implications for the lifetime parameter, and provide a consistent variance estimator (Theorem~\ref{thm:mondrian_variance_estimation_debiased}) for constructing confidence intervals (Theorem~\ref{thm:mondrian_confidence_debiased}). As a final corollary of the improved bias properties, we demonstrate in Theorem~\ref{thm:mondrian_minimax} that the debiased Mondrian random forest estimator is minimax-optimal in pointwise mean squared error for all $\beta > 0$, provided that $\beta$ is known a priori.
Section~\ref{sec:mondrian_parameter_selection} discusses tuning parameter selection, beginning with a data-driven approach to selecting the crucial lifetime parameter using polynomial estimation, alongside other practical suggestions including generalized cross-validation. We also give advice on choosing the number of trees, and other parameters associated with the debiasing procedure. In Section~\ref{sec:mondrian_weather} we present an illustrative example application of our proposed methodology for estimation and inference in the setting of weather forecasting in Australia. We demonstrate the use of our debiased Mondrian random forest estimator and our generalized cross-validation procedure for lifetime parameter selection, as well as the construction of point estimates and confidence intervals. Concluding remarks are given in Section~\ref{sec:mondrian_conclusion}, while Appendix~\ref{app:mondrian} contains all the mathematical proofs of our theoretical contributions, along with some other technical probabilistic results on the Mondrian process which may be of interest. \subsection{Notation} We write $\|\cdot\|_2$ for the usual Euclidean $\ell^2$-norm on $\R^d$. The natural numbers are $\N = \{0, 1, 2, \ldots \}$. We use $a \wedge b$ for the minimum and $a \vee b$ for the maximum of two real numbers. For a set $A$, we use $A^{\comp}$ for the complement whenever the background space is clear from context. We use $C$ to denote a positive constant whose value may change from line to line. For non-negative sequences $a_n$ and $b_n$, write $a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. For random non-negative sequences $A_n$ and $B_n$, similarly write $A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability, and $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. Convergence of random variables $X_n$ in distribution to a law $\P$ is denoted by $X_n \rightsquigarrow \P$. \section{Setup} \label{sec:mondrian_setup} When using a Mondrian random forest, there are two sources of randomness. The first is of course the data, and here we consider the nonparametric regression setting with $d$-dimensional covariates. The second source is a collection of independent trees drawn from a Mondrian process, which we define in the subsequent section, using a specified lifetime parameter. \subsection{The Mondrian process} \label{sec:mondrian_process} The Mondrian process was introduced by \citet{roy2008mondrian} and offers a canonical method for generating random rectangular partitions, which can be used as the trees for a random forest \citep{lakshminarayanan2014mondrian,lakshminarayanan2016mondrian}. For the reader's convenience, we give a brief description of this process here; see \citet[Section~3]{mourtada2020minimax} for a more complete definition. For a fixed dimension $d$ and lifetime parameter $\lambda > 0$, the Mondrian process is a stochastic process taking values in the set of finite rectangular partitions of $[0,1]^d$. For a rectangle $D = \prod_{j=1}^d [a_j, b_j] \subseteq [0,1]^d$, we denote the side aligned with dimension $j$ by $D_j = [a_j, b_j]$, write $D_j^- = a_j$ and $D_j^+ = b_j$ for its left and right endpoints respectively, and use $|D_j| = D_j^+ - D_j^-$ for its length. 
The volume of $D$ is $|D| = \prod_{j=1}^{d} |D_j|$ and its linear dimension (or half-perimeter) is $|D|_1 = \sum_{j=1}^{d} |D_j|$. To sample a partition $T$ from the Mondrian process $\cM \big( [0,1]^d, \lambda \big)$, we start at time $t=0$ with the trivial partition of $[0,1]^d$ which has no splits. We then repeatedly apply the following procedure to each cell $D$ in the partition. Let $t_D$ be the time at which the cell was formed, and sample $E_D \sim \Exp \left( |D|_1 \right)$. If $t_D + E_D \leq \lambda$, then we split $D$. This is done by first selecting a split dimension $J$ with $\P(J=j) = |D_j| / |D|_1$, and then sampling a split location $S_J \sim \Unif\big[D_J^-, D_J^+\big]$. The cell $D$ splits into the two new cells $\{x \in D : x_J \leq S_J\}$ and $\{x \in D : x_J > S_J\}$, each with formation time $t_D + E_D$. The final outcome is the partition $T$ consisting of the cells $D$ which were not split because $t_D + E_D > \lambda$. The cell in $T$ containing a point $x \in [0,1]^d$ is written $T(x)$. Figure~\ref{fig:mondrian_process} shows typical realizations of $T \sim \cM\big( [0,1]^d, \lambda \big)$ for $d=2$ and with different lifetime parameters $\lambda$.
%
\begin{figure}[t]
\centering
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/plot_mondrian_process_1.pdf}
\caption{$\lambda = 3$}
\end{subfigure}
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/plot_mondrian_process_2.pdf}
\caption{$\lambda = 10$}
\end{subfigure}
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/plot_mondrian_process_3.pdf}
\caption{$\lambda = 30$}
\end{subfigure}
%
\caption[The Mondrian process]{The Mondrian process $T \sim \cM \big( [0,1]^d, \lambda \big)$ with $d=2$ and different lifetime parameters $\lambda$.}
\label{fig:mondrian_process}
\end{figure}
\subsection{Data generation}
Throughout this chapter, we assume that the data satisfies Assumption~\ref{ass:mondrian_data}. We begin with a definition of H{\"o}lder continuity which will be used for controlling the bias of various estimators.
\begin{definition}[H{\"o}lder continuity]%
Take $\beta > 0$ and define $\flbeta$ to be the largest integer which is strictly less than $\beta$. We say a function $g: [0,1]^d \to \R$ is $\beta$-H{\"o}lder continuous and write $g \in \cH^\beta$ if $g$ is $\flbeta$ times differentiable and $\max_{|\nu| = \flbeta} \left| \partial^\nu g(x) - \partial^{\nu} g(x') \right| \leq C \|x-x'\|_2^{\beta - \flbeta}$ for some constant $C > 0$ and all $x, x' \in [0,1]^d$. Here, $\nu \in \N^d$ is a multi-index with $|\nu| = \sum_{j=1}^d \nu_j$ and $\partial^{\nu} g(x) = \partial^{|\nu|} g(x) \big/ \prod_{j=1}^d \partial x_j^{\nu_j}$. We say $g$ is Lipschitz if $g \in \cH^1$.
\end{definition}
\begin{assumption}[Data generation]%
\label{ass:mondrian_data}
Fix $d \geq 1$ and let $(X_i, Y_i)$ for $1 \leq i \leq n$ be i.i.d.\ samples from a distribution on $\R^d \times \R$, writing $\bX = (X_1, \ldots, X_n)$ and $\bY = (Y_1, \ldots, Y_n)$. Suppose $X_i$ has a Lebesgue density function $f(x)$ on $[0,1]^d$ which is bounded away from zero and satisfies $f \in \cH^\beta$ for some $\beta \geq 1$. Suppose $\E[Y_i^2 \mid X_i]$ is bounded, let $\mu(X_i) = \E[Y_i \mid X_i]$, and assume $\mu \in \cH^\beta$. Write $\varepsilon_i = Y_i - \mu(X_i)$ and assume $\sigma^2(X_i) = \E[\varepsilon_i^2 \mid X_i]$ is Lipschitz and bounded away from zero.
\end{assumption}
Some comments are in order surrounding Assumption~\ref{ass:mondrian_data}.
The requirement that the covariate density $f(x)$ be strictly positive on all of $[0,1]^d$ may seem strong, particularly when $d$ is moderately large. However, since our theory is presented pointwise in $x$, it is sufficient for this to hold only on some neighborhood of $x$. To see this, note that continuity implies the density is positive on some hypercube containing $x$. Upon rescaling the covariates, we can map this hypercube onto $[0,1]^d$. The same argument of course holds for the H{\"o}lder smoothness assumptions and the upper and lower bounds on the conditional variance function. \subsection{Mondrian random forests} \label{sec:mondrian_forests} We define the basic Mondrian random forest estimator \eqref{eq:mondrian_estimator} as in \citet{lakshminarayanan2014mondrian} and \citet{mourtada2020minimax}, and will later extend it to a debiased version in Section~\ref{sec:mondrian_debiased}. For a lifetime parameter $\lambda > 0$ and forest size $B \geq 1$, let $\bT = (T_1, \ldots, T_B)$ be a Mondrian forest where $T_b \sim \cM\big([0,1]^d, \lambda\big)$ are i.i.d.\ Mondrian trees which are independent of the data. For $x \in [0,1]^d$, write $N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ for the number of samples in $T_b(x)$, with $\I$ denoting an indicator function. Then the Mondrian random forest estimator of $\mu(x)$ is % \begin{equation} \label{eq:mondrian_estimator} \hat\mu(x) = \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. \end{equation} % If there are no samples $X_i$ in $T_b(x)$ then $N_b(x) = 0$, so we define $0/0 = 0$ (see Section~\ref{sec:mondrian_app_proofs} for details). To ensure the bias and variance of the Mondrian random forest estimator converge to zero (see Section~\ref{sec:mondrian_inference}), and to avoid boundary issues, we impose some basic conditions on $x$, $\lambda$, and $B$ in Assumption~\ref{ass:mondrian_estimator}. \begin{assumption}[Mondrian random forest estimator]% \label{ass:mondrian_estimator} % Suppose $x \in (0,1)^d$ is an interior point of the support of $X_i$, $\frac{\lambda^d}{n} \to 0$, $\log \lambda \asymp \log n$, and $B \asymp n^{\xi}$ for some $\xi \in (0, 1)$, which may depend on the dimension $d$ and smoothness $\beta$. % \end{assumption} Assumption~\ref{ass:mondrian_estimator} implies that the size of the forest $B$ grows with $n$. For the purpose of mitigating the computational burden, we suggest the sub-linear polynomial growth $B \asymp n^{\xi}$, satisfying the conditions imposed in our main results. Large forests usually do not present computational challenges in practice as the ensemble estimator is easily parallelizable over the trees. We emphasize places where this ``large forest'' condition is important to our theory as they arise throughout the chapter. \section{Inference with Mondrian random forests}% \label{sec:mondrian_inference} We begin with a bias--variance decomposition for the Mondrian random forest estimator: % \begin{align} \nonumber \hat\mu(x) - \mu(x) &= \Big( \hat\mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big]\Big) + \Big( \E \big[ \hat \mu(x) \mid \bX, \bT \big] - \mu(x)\Big) \\ &= \nonumber \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n \varepsilon_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)} \\ \label{eq:mondrian_bias_variance} &\quad+ \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n \big(\mu(X_i) - \mu(x)\big) \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. \end{align} % Our approach to inference is summarized as follows. 
Firstly, we provide a central limit theorem (weak convergence to a Gaussian) for the first ``variance'' term in \eqref{eq:mondrian_bias_variance}. Secondly, we precisely compute the probability limit of the second ``bias'' term. By ensuring that the standard deviation dominates the bias, a corresponding central limit theorem holds for the Mondrian random forest. With an appropriate estimator for the limiting variance, we establish procedures for valid and feasible statistical inference on the unknown regression function $\mu(x)$. We begin with the aforementioned central limit theorem, which forms the core of our methodology for performing statistical inference. Before stating our main result, we highlight some of the challenges involved. At first glance, the summands in the first term in \eqref{eq:mondrian_bias_variance} seem to be independent over $1 \leq i \leq n$, conditional on the forest $\bT$, depending only on $X_i$ and $\varepsilon_i$. However, the $N_b(x)$ appearing in the denominator depends on all $X_i$ simultaneously, violating this apparent independence and rendering classical central limit theorems inapplicable. A natural preliminary attempt to resolve this issue is to observe that
%
\begin{equation*}
N_b(x) = \sum_{i=1}^{n} \I\big\{X_i \in T_b(x)\big\} \approx n \, \P \big( X_i \in T_b(x) \mid T_b \big) \approx n f(x) |T_b(x)|
\end{equation*}
%
with high probability. One could attempt to use this by approximating the estimator with an average of i.i.d.\ random variables, or by employing a central limit theorem conditional on $\bX$ and $\bT$. However, such an approach fails because $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$; the possible existence of small cells causes the law of the inverse cell volume to have heavy tails. For similar reasons, attempts to directly establish a central limit theorem based on $2 + \delta$ moments, such as the Lyapunov central limit theorem, are ineffective. We circumvent these problems by directly analyzing $\frac{\I\{N_b(x) \geq 1\}}{N_b(x)}$. We establish concentration properties for this non-linear function of $X_i$ via the Efron--Stein inequality \citep[Section 3.1]{boucheron2013concentration} along with a sequence of somewhat delicate preliminary lemmas regarding inverse moments of truncated (conditional) binomial random variables. In particular, we show that $\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)} \right] \lesssim \frac{\lambda^d}{n}$ and $\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)^2} \right] \lesssim \frac{\lambda^{2d} \log n}{n^2}$. Asymptotic normality is then established using a central limit theorem for martingale difference sequences \citep[Theorem~3.2]{hall1980martingale} with respect to an appropriate filtration. Section~\ref{sec:mondrian_overview_proofs} gives an overview of our proof strategy, in which we further discuss the underlying challenges, while Section~\ref{sec:mondrian_app_proofs} gives all the technical details.
\subsection{Central limit theorem}
\label{sec:mondrian_clt}
Theorem~\ref{thm:mondrian_clt} gives our first main result.
\begin{theorem}[Central limit theorem for the centered Mondrian random forest estimator]%
\label{thm:mondrian_clt}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, and further assume that $\E[Y_i^4 \mid X_i ]$ is bounded almost surely and $\frac{\lambda^d \log n}{n} \to 0$.
Then
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \Big( \hat \mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big] \Big) &\rightsquigarrow \cN\big(0, \Sigma(x)\big) & &\text{where} &\Sigma(x) &= \frac{\sigma^2(x)}{f(x)} \left( \frac{4 - 4 \log 2}{3} \right)^d.
\end{align*}
\end{theorem}
The condition $B \to \infty$ is crucial, ensuring sufficient ``mixing'' of different Mondrian cells to escape the heavy-tailed phenomenon detailed in the preceding discussion. For concreteness, the large forest condition allows us to deal with expressions such as $\E \left[ \frac{1}{|T_b(x)| |T_{b'}(x)|} \right] = \E \left[ \frac{1}{|T_b(x)|} \right] \E \left[ \frac{1}{|T_{b'}(x)|} \right] \approx \lambda^{2d} < \infty$ where $b \neq b'$, by independence of the trees, rather than the ``no ensembling'' single tree analog $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$. We take this opportunity to contrast Mondrian random forests with more classical kernel-based smoothing methods. The lifetime $\lambda$ plays a similar role to the inverse bandwidth in determining the effective sample size $n / \lambda^d$, and thus the associated rate of convergence. However, due to the Mondrian process construction, some cells are typically ``too small'' (equivalent to an insufficiently large bandwidth) to give an appropriate effective sample size. Similarly, classical methods based on non-random partitioning such as spline estimators \citep{huang2003local,cattaneo2020large} typically impose a quasi-uniformity assumption to ensure all the cells are of comparable size, a property which does not hold for the Mondrian process (not even with probability approaching one).
\subsection*{Bias characterization}
We turn to the second term in \eqref{eq:mondrian_bias_variance}, which captures the bias of the Mondrian random forest estimator conditional on the covariates $\bX$ and the forest $\bT$. As such, it is a random quantity which, as we will demonstrate, converges in probability. We precisely characterize the limiting non-random bias, including high-degree polynomial terms in $1/\lambda$ which for now may seem ignorable. Indeed, the magnitude of the bias is determined by its leading term, typically of order $1/\lambda^2$ whenever $\beta \geq 2$, and this suffices for ensuring a negligible contribution from the bias with an appropriate choice of lifetime parameter. However, the advantage of specifying higher-order bias terms is made apparent in Section~\ref{sec:mondrian_debiased} when we construct a debiased Mondrian random forest estimator. There, we target and annihilate the higher-order terms in order to furnish superior estimation and inference properties. Theorem~\ref{thm:mondrian_bias} gives our main result on the bias of the Mondrian random forest estimator.
\begin{theorem}[Bias of the Mondrian random forest estimator]%
\label{thm:mondrian_bias}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Then for each $1 \leq r \leq \lfloor \flbeta / 2 \rfloor$ there exists $B_r(x) \in \R$, which is a function only of the derivatives of $f$ and $\mu$ at $x$ up to order $2r$, with
%
\begin{equation*}
\E \left[ \hat \mu(x) \mid \bX, \bT \right] = \mu(x) + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{\lambda^{2r}} + O_\P \left( \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right).
\end{equation*}
%
Whenever $\beta > 2$ the leading bias is the quadratic term
%
\begin{equation*}
\frac{B_1(x)}{\lambda^2} = \frac{1}{2 \lambda^2} \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} + \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}.
\end{equation*}
%
If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$, and using multi-index notation we have
%
\begin{equation*}
\frac{B_r(x)}{\lambda^{2r}} = \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) \prod_{j=1}^d \frac{1}{\nu_j + 1}.
\end{equation*}
%
\end{theorem}
In Theorem~\ref{thm:mondrian_bias} we give explicit examples of the limiting bias, both when $\beta > 2$ and when the $X_i$ are uniformly distributed. The general form of $B_r(x)$ is provided in Section~\ref{sec:mondrian_app_proofs} but is somewhat unwieldy except in specific situations. Nonetheless, the most important properties are that $B_r(x)$ are non-random and do not depend on the lifetime $\lambda$, crucial facts for our debiasing procedure given in Section~\ref{sec:mondrian_debiased}. If the forest size $B$ does not diverge to infinity then we suffer the first-order bias term $\frac{1}{\lambda \sqrt B}$. This phenomenon was explained by \citet{mourtada2020minimax}, who noted that, as a consequence, single Mondrian trees can achieve minimax optimality only when $\beta \in (0, 1]$. Large forests remove this first-order bias and are optimal for all $\beta \in (0, 2]$. Using Theorem~\ref{thm:mondrian_clt} and Theorem~\ref{thm:mondrian_bias} together, along with an appropriate choice of lifetime parameter $\lambda$, gives a central limit theorem for the Mondrian random forest estimator which can be used, for example, to build confidence intervals for the unknown regression function $\mu(x)$ whenever the bias shrinks faster than the standard deviation. In general this will require $\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \ll \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by imposing the restrictions $\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gg n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$ on the lifetime $\lambda$ and forest size $B$. If instead we aim for optimal point estimation, then balancing the bias and standard deviation requires $\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \asymp \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by $\lambda \asymp n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gtrsim n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$. Such a choice of $\lambda$ gives the convergence rate $n^{\frac{-(2 \wedge \beta)}{d + 2(2 \wedge \beta)}}$, which is the minimax-optimal rate of convergence \citep{stone1982optimal} for $\beta$-H{\"o}lder functions with $\beta \in (0,2]$, as shown by \citet[Theorem~2]{mourtada2020minimax}. In Section~\ref{sec:mondrian_debiased} we will show how the Mondrian random forest estimator can be debiased, giving both weaker lifetime conditions for inference and also improved rates of convergence, under additional smoothness assumptions.
\subsection*{Variance estimation}
The limiting variance $\Sigma(x)$ from the resulting central limit theorem depends on the unknown quantities $\sigma^2(x)$ and $f(x)$. To conduct feasible inference, we must therefore first estimate $\Sigma(x)$.
To this end, define
%
\begin{align}
\label{eq:mondrian_sigma2_hat}
\hat\sigma^2(x) &= \frac{1}{B} \sum_{b=1}^{B} \sum_{i=1}^n \frac{\big(Y_i - \hat \mu(x)\big)^2 \, \I\{X_i \in T_b(x)\}} {N_b(x)}, \\
\nonumber
\hat\Sigma(x) &= \hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n \left( \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_b(x)\}}{N_b(x)} \right)^2.
\end{align}
%
In Theorem~\ref{thm:mondrian_variance_estimation} we show that this estimator is consistent, and establish its rate of convergence.
%
\begin{theorem}[Variance estimation]%
\label{thm:mondrian_variance_estimation}
Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then
%
\begin{align*}
\hat\Sigma(x) = \Sigma(x) + O_\P \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right).
\end{align*}
\end{theorem}
\subsection{Confidence intervals}
Theorem~\ref{thm:mondrian_confidence} shows how to construct valid confidence intervals for the regression function $\mu(x)$ under the lifetime and forest size assumptions previously discussed. For details on feasible and practical selection of the lifetime parameter $\lambda$, see Section~\ref{sec:mondrian_parameter_selection}.
%
\begin{theorem}[Feasible confidence intervals using a Mondrian random forest]%
\label{thm:mondrian_confidence}
%
Suppose that Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded almost surely, and $\frac{\lambda^d \log n}{n} \to 0$. Assume that $\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gg n^{\frac{2 (2 \wedge \beta) - 2}{d + 2 (2 \wedge \beta)}}$. For a confidence level $1 - \alpha$ with $\alpha \in (0, 1)$, let $q_{1 - \alpha / 2}$ be the normal quantile satisfying $\P \left( \cN(0, 1) \leq q_{1 - \alpha / 2} \right) = 1 - \alpha / 2$. Then
%
\begin{align*}
\P \left( \mu(x) \in \left[ \hat \mu(x) - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} q_{1 - \alpha / 2}, \ \hat \mu(x) + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} q_{1 - \alpha / 2} \right] \right) \to 1 - \alpha.
\end{align*}
\end{theorem}
When coupled with an appropriate lifetime selection method, Theorem~\ref{thm:mondrian_confidence} gives a fully feasible procedure for uncertainty quantification in Mondrian random forests. Our procedure requires no adjustment of the original Mondrian random forest estimator beyond ensuring that the bias is negligible, and in particular does not rely on sample splitting. The construction of confidence intervals is just one corollary of the weak convergence result given in Theorem~\ref{thm:mondrian_clt}, and follows immediately from Slutsky's theorem \citep[Chapter~7]{pollard2002user} with a consistent variance estimator. Other applications include hypothesis testing on the value of $\mu(x)$ at a design point $x$ by inversion of the confidence interval, as well as parametric specification testing by comparison with a $\sqrt{n}$-consistent parametric regression estimator. The construction of simultaneous confidence intervals for finitely many points $x_1, \ldots, x_D$ can be accomplished either using standard multiple testing corrections or by first establishing a multivariate central limit theorem using the Cram{\'e}r--Wold device \citep[Chapter~8]{pollard2002user} and formulating a consistent multivariate variance estimator.
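To make the preceding procedure concrete, the following minimal Python sketch assembles the point estimate \eqref{eq:mondrian_estimator}, the variance estimator \eqref{eq:mondrian_sigma2_hat}, and the confidence interval of Theorem~\ref{thm:mondrian_confidence}. It is an illustration rather than part of the formal development: instead of sampling full Mondrian partitions, it draws only the cell $T_b(x)$ containing $x$, using the exact cell-shape distribution recalled in the next section, and all function names and parameter defaults are ours.
\begin{verbatim}
import numpy as np
from statistics import NormalDist

def mondrian_cell_at(x, lam, rng):
    # Cell T(x) containing x: side j is
    # [max(x_j - E_j1/lam, 0), min(x_j + E_j2/lam, 1)]
    # with E_j1, E_j2 independent Exp(1) random variables.
    e1 = rng.exponential(size=x.shape)
    e2 = rng.exponential(size=x.shape)
    return np.maximum(x - e1 / lam, 0.0), np.minimum(x + e2 / lam, 1.0)

def mondrian_forest_ci(X, Y, x, lam, B, alpha=0.05, seed=0):
    # Point estimate, variance estimate and confidence interval at x.
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w = np.zeros((B, n))  # weights I{X_i in T_b(x)} / N_b(x)
    for b in range(B):
        lo, hi = mondrian_cell_at(x, lam, rng)
        in_cell = np.all((X >= lo) & (X <= hi), axis=1)
        N = in_cell.sum()
        if N > 0:  # convention 0/0 = 0 for empty cells
            w[b, in_cell] = 1.0 / N
    mu_hat = (w @ Y).mean()  # average of per-tree local means
    sigma2_hat = (w @ (Y - mu_hat) ** 2).mean()
    Sigma_hat = sigma2_hat * (n / lam ** d) * np.sum(w.mean(axis=0) ** 2)
    q = NormalDist().inv_cdf(1 - alpha / 2)
    half = q * np.sqrt(Sigma_hat * lam ** d / n)
    return mu_hat, (mu_hat - half, mu_hat + half)
\end{verbatim}
The $0/0 = 0$ convention for empty cells and the $\sqrt{\lambda^d / n}$ scaling of the interval half-width match the displays above.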
\section{Overview of proof strategy}%
\label{sec:mondrian_overview_proofs}
This section provides some insight into the general approach we use to establish the main results in the preceding sections. We focus on the technical innovations forming the core of our arguments, and refer the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs, including those for the debiased estimator discussed in the upcoming Section~\ref{sec:mondrian_debiased}.
\subsection*{Preliminary results}
The starting point for our proofs is a characterization of the exact distribution of the shape of a Mondrian cell $T(x)$. This property is a direct consequence of the fact that the restriction of a Mondrian process to a subcell remains Mondrian \citep[Fact~2]{mourtada2020minimax}. We have
%
\begin{align*}
|T(x)_j| &= \left( \frac{E_{j1}}{\lambda} \wedge x_j \right) + \left( \frac{E_{j2}}{\lambda} \wedge (1-x_j) \right)
\end{align*}
%
for all $1 \leq j \leq d$, recalling that $T(x)_j$ is the side of the cell $T(x)$ aligned with axis $j$, and where $E_{j1}$ and $E_{j2}$ are mutually independent $\Exp(1)$ random variables. Our assumptions that $x \in (0,1)^d$ and $\lambda \to \infty$ make the boundary terms $x_j$ and $1-x_j$ eventually ignorable so
%
\begin{align*}
|T(x)_j| &= \frac{E_{j1} + E_{j2}}{\lambda}
\end{align*}
%
with high probability. Controlling the size of the largest cell in the forest containing $x$ is now straightforward with a union bound, exploiting the sharp tail decay of the exponential distribution, and thus
%
\begin{align*}
\max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \lesssim_\P \frac{\log B}{\lambda}.
\end{align*}
%
This shows that up to logarithmic terms, none of the cells in the forest at $x$ are significantly larger than average, ensuring that the Mondrian random forest estimator is localized around $x$ on the scale of $1/\lambda$, an important property for the upcoming bias characterization. Having provided upper bounds for the sizes of Mondrian cells, we must also establish some lower bounds in order to quantify the ``small cell'' phenomenon mentioned previously. The first step towards this is to bound the first two moments of the truncated inverse Mondrian cell volume; we show that
%
\begin{align*}
\E\left[ 1 \wedge \frac{1}{n |T(x)|} \right] &\asymp \frac{\lambda^d}{n} &&\text{and} &\frac{\lambda^{2d}}{n^2} &\lesssim \E\left[ 1 \wedge \frac{1}{n^2 |T(x)|^2} \right] \lesssim \frac{\lambda^{2d} \log n}{n^2}.
\end{align*}
%
These bounds are computed directly using the exact distribution of $|T(x)|$. Note that $\E\left[ \frac{1}{|T(x)|^2} \right] = \infty$ because $\frac{1}{E_{j1} + E_{j2}}$ has finite moments only of order strictly less than two, so the truncation is crucial here. Since we nearly have two moments, this truncation is at the expense of only a logarithmic term. Nonetheless, third and higher truncated moments will not enjoy such tight bounds, demonstrating both the fragility of this result and the inadequacy of tools such as the Lyapunov central limit theorem which require $2 + \delta$ moments.
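These moment bounds are easy to probe numerically. The following small Monte Carlo sketch, with arbitrary illustrative values of $n$, $d$, $\lambda$, and the number of replications, samples the limiting cell-shape distribution above and compares the truncated inverse volume moments against the stated rates; it is a sanity check rather than part of the argument.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, lam, reps = 10_000, 2, 20.0, 100_000

# Away from the boundary, |T(x)_j| = (E_j1 + E_j2) / lam.
sides = rng.exponential(size=(reps, d, 2)).sum(axis=2) / lam
vol = sides.prod(axis=1)  # Monte Carlo draws of |T(x)|

m1 = np.mean(np.minimum(1.0, 1.0 / (n * vol)))
m2 = np.mean(np.minimum(1.0, 1.0 / (n * vol) ** 2))
print(m1 / (lam ** d / n))                         # order-one ratio
print(m2 / (lam ** (2 * d) * np.log(n) / n ** 2))  # bounded ratio
\end{verbatim}
Dropping the truncation in the second moment produces estimates that diverge as the number of replications grows, reflecting $\E[|T(x)|^{-2}] = \infty$.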
To conclude this investigation into the small cell phenomenon, we apply the previous bounds to ensure that the empirical effective sample sizes $N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ are approximately of the order $n / \lambda^d$ in an appropriate sense; we demonstrate that
%
\begin{align*}
\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \right] &\lesssim \frac{\lambda^d}{n} &&\text{and} &\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)^2} \right] &\lesssim \frac{\lambda^{2d} \log n}{n^2},
\end{align*}
%
as well as similar bounds for mixed terms such as
%
$\E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \frac{\I\{N_{b'}(x) \geq 1\}}{N_{b'}(x)} \right] \lesssim \frac{\lambda^{2d}}{n^2}$
%
when $b \neq b'$, which arise from covariance terms across multiple trees. The proof of this result is involved and technical, and proceeds by induction. The idea is to construct a class of subcells by taking all possible intersections of the cells in $T_b$ and $T_{b'}$ (we show two trees here for clarity; there may be more) and noting that each $N_b(x)$ is the sum of the number of points in each such refined cell intersected with $T_b(x)$. We then swap out each refined cell one at a time and replace the number of data points it contains with its volume multiplied by $n f(x)$, showing that the expectation on the left-hand side does not increase too much, using a moment bound for inverse binomial random variables based on Bernstein's inequality. By induction and independence of the trees, eventually the problem is reduced to computing moments of truncated inverse Mondrian cell volumes, as above.
\subsection*{Central limit theorem}
To prove our main central limit theorem result (Theorem~\ref{thm:mondrian_clt}), we use the martingale central limit theorem given by \citet[Theorem~3.2]{hall1980martingale}. For each $1 \leq i \leq n$ define $\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and $(\varepsilon_j : 1 \leq j \leq i)$, noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases as $n$ increases. Define the $\cH_{n i}$-measurable and square integrable variables
%
\begin{align*}
S_i(x) &= \sqrt{\frac{n}{\lambda^d}} \frac{1}{B} \sum_{b=1}^B \frac{\I \{X_i \in T_b(x)\} \varepsilon_i} {N_{b}(x)},
\end{align*}
%
which satisfy the martingale difference property $\E [ S_i(x) \mid \cH_{n (i-1)} ] = 0$. Further,
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \big( \hat\mu(x) - \E\left[ \hat\mu(x) \mid \bX, \bT \right] \big) = \sum_{i=1}^n S_i(x).
\end{align*}
%
To establish weak convergence to $\cN\big(0, \Sigma(x)\big)$, it suffices to check that $\max_i |S_i(x)| \to 0$ in probability, $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and $\sum_i S_i(x)^2 \to \Sigma(x)$ in probability. Checking the first two of these is straightforward given the denominator moment bounds derived above. For the third condition, we demonstrate that $\sum_i S_i(x)^2$ concentrates by checking that its variance is vanishing. To do this, first observe that $S_i(x)^2$ is the square of a sum over the $B$ trees. Expanding this square, we see that the diagonal terms (where $b = b'$) provide a negligible contribution due to the large forest assumption. For the other terms, we apply the law of total variance and the moment bounds detailed earlier. Here, it is crucial that $b \neq b'$ in order to exploit the independence of the trees and avoid having to control any higher moments.
The law of total variance requires that we bound
%
\begin{align*}
\Var \left[ \E \left[ \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} {N_{b}(x) N_{b'}(x)} \Bigm| \bX, \bY \right] \right],
\end{align*}
%
which is the variance of a non-linear function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, and so we apply the Efron--Stein inequality. The important insight here is that replacing a sample $(X_i, \varepsilon_i)$ with an independent copy $(\tilde X_i, \tilde \varepsilon_i)$ can change the value of $N_b(x)$ by at most one. Further, this can happen only on the event $\{ X_i \in T_{b}(x) \} \cup \{ \tilde X_i \in T_{b}(x) \}$, which occurs with probability on the order $1/\lambda^d$ (the expected cell volume). The final part of the central limit theorem proof is to calculate the limiting variance $\Sigma(x)$. The penultimate step showed that we must have
%
\begin{align*}
\Sigma(x) &= \lim_{n \to \infty} \sum_{i=1}^n \E \left[S_i(x)^2 \right] = \lim_{n \to \infty} \frac{n^2}{\lambda^d} \, \E \left[ \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} {N_{b}(x) N_{b'}(x)} \right],
\end{align*}
%
assuming the limit exists, so it remains to check this and calculate the limit. It is a straightforward but tedious exercise to verify that each term can be replaced with its conditional expectation given $T_b$ and $T_{b'}$, using some further properties of the binomial and exponential distributions. This yields
%
\begin{align*}
\Sigma(x) &= \frac{\sigma^2(x)}{f(x)} \lim_{\lambda \to \infty} \frac{1}{\lambda^d} \E \left[ \frac{|T_{b}(x) \cap T_{b'}(x)|} {|T_{b}(x)| \, |T_{b'}(x)|} \right] = \frac{\sigma^2(x)}{f(x)} \E \left[ \frac{(E_{1} \wedge E'_{1}) + (E_{2} \wedge E'_{2})} {(E_{1} + E_{2}) (E'_{1} + E'_{2})} \right]^d
\end{align*}
%
where $E_1$, $E_2$, $E'_1$, and $E'_2$ are independent $\Exp(1)$, by the cell shape distribution and independence of the trees. This final expectation is calculated by integration, using various incomplete gamma function identities.
\subsection*{Bias characterization}
Our second substantial technical result is the bias characterization given as Theorem~\ref{thm:mondrian_bias}, in which we precisely characterize the probability limit of the conditional bias
%
\begin{align*}
\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x) &= \frac{1}{B} \sum_{b=1}^B \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I\{X_i \in T_b(x)\}}{N_b(x)}.
\end{align*}
%
The first step is to pass to the ``infinite forest'' limit by taking an expectation conditional on $\bX$, or equivalently marginalizing over $\bT$, applying the conditional Markov inequality to see
%
\begin{align*}
\big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \mid \bX \right] \big| &\lesssim_\P \frac{1}{\lambda \sqrt B}.
\end{align*}
%
While this may seem a crude approximation, it is already known that fixed-size Mondrian forests have suboptimal bias properties when compared to forests with a diverging number of trees. In fact, the error $\frac{1}{\lambda \sqrt B}$ exactly accounts for the first-order bias of individual Mondrian trees noted by \citet{mourtada2020minimax}. Next we show that $\E \left[ \hat \mu(x) \mid \bX \right]$ converges in probability to its expectation, again using the Efron--Stein inequality for this non-linear function of the i.i.d.\ variables $X_i$.
The Lipschitz property of $\mu$ and the upper bound on the maximum cell size give $|\mu(X_i) - \mu(x)| \lesssim \max_{1 \leq j \leq d} |T_b(x)_j| \lesssim_\P \frac{\log B}{\lambda}$ whenever $X_i \in T_b(x)$, so we combine this with moment bounds for the denominator $N_b(x)$ to see
%
\begin{align*}
\left| \E \left[ \hat \mu(x) \mid \bX \right] - \E \left[ \hat \mu(x) \right] \right| \lesssim_\P \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}}.
\end{align*}
The next step is to approximate the resulting non-random bias $\E \left[ \hat \mu(x) \right] - \mu(x)$ as a polynomial in $1/\lambda$. To this end, we firstly apply a concentration-type result for the binomial distribution to deduce that
%
\begin{align*}
\E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \Bigm| \bT \right] \approx \frac{1}{n \int_{T_b(x)} f(s) \diff s}
\end{align*}
%
in an appropriate sense, and hence, by conditioning on $\bT$ and $\bX$ without $X_i$, we write
%
\begin{align}
\label{eq:mondrian_bias_ratio}
\E \left[ \hat \mu(x) \right] - \mu(x) &\approx \E \left[ \frac{\int_{T_b(x)} (\mu(s) - \mu(x)) f(s) \diff s} {\int_{T_b(x)} f(s) \diff s} \right].
\end{align}
%
Next we apply the multivariate version of Taylor's theorem to the integrands in both the numerator and the denominator in \eqref{eq:mondrian_bias_ratio}, and then apply the Maclaurin series of $\frac{1}{1+x}$ and the multinomial theorem to recover a single polynomial in $1/\lambda$. The error term is on the order of $1/\lambda^\beta$ and depends on the smoothness of $\mu$ and $f$, and the polynomial coefficients are given by various expectations involving exponential random variables. The final step is to verify using symmetry of Mondrian cells that all the odd monomial coefficients are zero, and to calculate some explicit examples of the form of the limiting bias.
\section{Debiased Mondrian random forests}%
\label{sec:mondrian_debiased}
In this section we give our next main contribution, proposing a variant of the Mondrian random forest estimator which corrects for higher-order bias with an approach based on generalized jackknifing \citep{schucany1977improvement}. This estimator retains the basic form of a Mondrian random forest estimator in the sense that it is a linear combination of Mondrian tree estimators, but in this section we allow for non-identical linear coefficients, some of which may be negative, and for differing lifetime parameters across the trees. Since the basic Mondrian random forest estimator is a special case of this more general debiased version, we will discuss only the latter throughout the rest of the chapter. We use the explicit form of the bias given in Theorem~\ref{thm:mondrian_bias} to construct a debiased version of the Mondrian forest estimator. Let $J \geq 0$ be the bias correction order. With $J=0$ we retain the original Mondrian forest estimator, with $J=1$ we remove second-order bias, and with $J = \lfloor\flbeta / 2 \rfloor$ we remove bias terms up to and including order $2 \lfloor\flbeta / 2 \rfloor$, giving the maximum possible bias reduction achievable in the H{\"o}lder class $\cH^\beta$. After this maximal correction, only bias terms of order $1/\lambda^\beta$ remain. For $0 \leq r \leq J$ let $\hat \mu_r(x)$ be a Mondrian forest estimator based on the trees $T_{r b} \sim \cM\big([0,1]^d, \lambda_r \big)$ for $1 \leq b \leq B$, where $\lambda_r = a_r \lambda$ for some $a_r > 0$ and $\lambda > 0$. Write $\bT$ to denote the collection of all the trees, and suppose they are mutually independent.
We find values of $a_r$ along with coefficients $\omega_r$ in order to annihilate the leading $J$ bias terms of the debiased Mondrian random forest estimator % \begin{align} \label{eq:mondrian_debiased} \hat \mu_\rd(x) &= \sum_{r=0}^J \omega_r \hat \mu_r(x) = \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_{r b}(x) \big\}} {N_{r b}(x)}. \end{align} % This ensemble estimator retains the ``forest'' structure of the original estimators, but with varying lifetime parameters $\lambda_r$ and coefficients $\omega_r$. Thus by Theorem~\ref{thm:mondrian_bias} we want to solve % \begin{align*} \sum_{r=0}^{J} \omega_r \left( \mu(x) + \sum_{s=1}^{J} \frac{B_{s}(x)}{a_r^{2s} \lambda^{2s}} \right) &= \mu(x) \end{align*} % for all $\lambda$, or equivalently the system of linear equations $\sum_{r=0}^{J} \omega_r = 1$ and $\sum_{r=0}^{J} \omega_r a_r^{-2s} = 0$ for each $1 \leq s \leq J$. We solve these as follows. Define the $(J+1) \times (J+1)$ Vandermonde matrix $A_{r s} = a_{r-1}^{2-2s}$, and let $\omega = (\omega_0, \ldots, \omega_J)^\T \in \R^{J+1}$ and $e_0 = (1, 0, \ldots, 0)^\T \in \R^{J+1}$. Then a solution for the debiasing coefficients is given by $\omega = A^{-1} e_0$ whenever $A$ is non-singular. In practice we can take $a_r$ to be a fixed geometric or arithmetic sequence to ensure this is the case, appealing to the Vandermonde determinant formula: $\det A = \prod_{0 \leq r < s \leq J} (a_r^{-2} - a_s^{-2}) \neq 0$ whenever $a_r$ are distinct. For example, we could set $a_r = (1 + \gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. Because we assume $\beta$, and therefore the choice of $J$, do not depend on $n$, there is no need to quantify the invertibility of $A$ by, for example, bounding its eigenvalues away from zero as a function of $J$. \subsection{Central limit theorem} In Theorem~\ref{thm:mondrian_clt_debiased}, we verify that a central limit theorem holds for the debiased random forest estimator $\hat\mu_\rd(x)$ and give its limiting variance. The strategy and challenges associated with proving Theorem~\ref{thm:mondrian_clt_debiased} are identical to those discussed earlier surrounding Theorem~\ref{thm:mondrian_clt}. In fact in Section~\ref{sec:mondrian_app_proofs} we provide a direct proof only for Theorem~\ref{thm:mondrian_clt_debiased} and deduce Theorem~\ref{thm:mondrian_clt} as a special case. \begin{theorem}[Central limit theorem for the debiased Mondrian random forest estimator]% \label{thm:mondrian_clt_debiased} % Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded, and $\frac{\lambda^d \log n}{n} \to 0$. Then % \begin{align*} \sqrt{\frac{n}{\lambda^d}} \Big( \hat \mu_\rd(x) - \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] \Big) &\rightsquigarrow \cN\big(0, \Sigma_\rd(x)\big) \end{align*} % where, with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, the limiting variance is % \begin{align*} \Sigma_\rd(x) &= \frac{\sigma^2(x)}{f(x)} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d. \end{align*} % \end{theorem} It is easy to verify that in the case of no debiasing we have $J=0$ and $a_0 = \omega_0 = 1$, yielding $\Sigma_\rd(x) = \Sigma(x)$, and recovering Theorem~\ref{thm:mondrian_clt}. 
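As a concrete check of this construction, the coefficients can be computed numerically by solving the linear system above. The following minimal sketch does this for the geometric choice $a_r = (1 + \gamma)^r$; the value $\gamma = 0.5$ is an arbitrary illustration.
\begin{verbatim}
import numpy as np

def debias_coefficients(J, gamma=0.5):
    # Solve sum_r omega_r = 1 and sum_r omega_r * a_r^{-2s} = 0
    # for s = 1, ..., J, with the geometric choice a_r = (1 + gamma)^r.
    a = (1.0 + gamma) ** np.arange(J + 1)
    # Row s, column r holds a_r^{-2s}; the row s = 0 is all ones.
    V = a[None, :] ** (-2.0 * np.arange(J + 1)[:, None])
    e0 = np.zeros(J + 1)
    e0[0] = 1.0
    return a, np.linalg.solve(V, e0)

a, omega = debias_coefficients(J=2)
print(omega, omega.sum())  # some coefficients negative, summing to one
\end{verbatim}
Since the $a_r$ are distinct, this matrix is an invertible Vandermonde matrix in the variables $a_r^{-2}$, so the solve always succeeds.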
\subsection*{Bias characterization} In Theorem~\ref{thm:mondrian_bias_debiased} we verify that this debiasing procedure does indeed annihilate the desired bias terms, and its proof is a consequence of Theorem~\ref{thm:mondrian_bias} and the construction of the debiased Mondrian random forest estimator $\hat\mu_\rd(x)$. \begin{theorem}[Bias of the debiased Mondrian random forest estimator]% \label{thm:mondrian_bias_debiased} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}. In the notation of Theorem~\ref{thm:mondrian_bias} with $\bar\omega = \sum_{r=0}^J \omega_r a_r^{-2J - 2}$, % \begin{align*} \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] &= \mu(x) + \I\{2J+2 < \beta \} \frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} \\ &\quad+ O_\P \left( \frac{1}{\lambda^{2J + 4}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} % \end{theorem} Theorem~\ref{thm:mondrian_bias_debiased} has the following consequence: the leading bias term is characterized in terms of $B_{J+1}(x)$ whenever $J < \beta/2 - 1$, or equivalently $J < \lfloor \flbeta/2 \rfloor$, that is, the debiasing order $J$ does not exhaust the H{\"o}lder smoothness $\beta$. If this condition does not hold, then the estimator is fully debiased, and the resulting leading bias term is bounded above by $1/\lambda^\beta$ up to constants, but its form is left unspecified. \subsection*{Variance estimation} As before, we propose a variance estimator in order to conduct feasible inference and show that it is consistent. With $\hat\sigma^2(x)$ as in \eqref{eq:mondrian_sigma2_hat} in Section~\ref{sec:mondrian_inference}, define the estimator % \begin{align} \label{eq:mondrian_debiased_variance_estimator} \hat\Sigma_\rd(x) &= \hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n \left( \sum_{r=0}^J \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_{r b}(x)\}} {N_{r b}(x)} \right)^2. \end{align} % \begin{theorem}[Variance estimation]% \label{thm:mondrian_variance_estimation_debiased} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then % \begin{align*} \hat\Sigma_\rd(x) = \Sigma_\rd(x) + O_\P \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right). \end{align*} % \end{theorem} \subsection{Confidence intervals} In analogy to Section~\ref{sec:mondrian_inference}, we now demonstrate the construction of feasible valid confidence intervals using the debiased Mondrian random forest estimator in Theorem~\ref{thm:mondrian_confidence_debiased}. Once again we must ensure that the bias (now significantly reduced due to our debiasing procedure) is negligible when compared to the standard deviation (which is of the same order as before). We assume for simplicity that the estimator has been fully debiased by setting $J \geq \lfloor \flbeta / 2\rfloor$ to yield a leading bias of order $1/\lambda^\beta$, but intermediate ``partially debiased'' versions can easily be provided, with leading bias terms of order $1/\lambda^{\beta \wedge (2J+2)}$ in general. We thus require $\frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \ll \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by imposing the restrictions $\lambda \gg n^{\frac{1}{d + 2 \beta}}$ and $B \gg n^{\frac{2\beta - 2}{d + 2\beta}}$ on the lifetime parameter $\lambda$ and forest size $B$. 
\begin{theorem}[Feasible confidence intervals using a debiased Mondrian random forest]% \label{thm:mondrian_confidence_debiased} % Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded, and $\frac{\lambda^d \log n}{n} \to 0$. Fix $J \geq \lfloor \flbeta / 2 \rfloor$ and assume that $\lambda \gg n^{\frac{1}{d + 2 \beta}}$ and $B \gg n^{\frac{2 \beta - 2}{d + 2 \beta}}$. For a confidence level $\alpha \in (0, 1)$, let $q_{1 - \alpha / 2}$ be as in Theorem~\ref{thm:mondrian_confidence}. Then % \begin{align*} \P \left( \mu(x) \in \left[ \hat \mu_\rd(x) - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} q_{1 - \alpha / 2}, \ \hat \mu_\rd(x) + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} q_{1 - \alpha / 2} \right] \right) \to 1 - \alpha. \end{align*} \end{theorem} One important benefit of our debiasing technique is made clear in Theorem~\ref{thm:mondrian_confidence_debiased}: the restrictions imposed on the lifetime parameter $\lambda$ are substantially relaxed, especially in smooth classes with large $\beta$. Beyond this high-level benefit of weaker conditions, the relaxation is also useful for the practical selection of appropriate lifetimes for estimation and inference respectively; see Section~\ref{sec:mondrian_parameter_selection} for more details. Nonetheless, such improvements do not come without concession. The limiting variance $\Sigma_\rd(x)$ of the debiased estimator is larger than that of the non-debiased version (the extent of this increase depends on the choice of the debiasing parameters $a_r$), leading to wider confidence intervals and larger estimation error in small samples despite the theoretical asymptotic improvements. \subsection{Minimax optimality} Our final result, Theorem~\ref{thm:mondrian_minimax}, shows that, when using an appropriate sequence of lifetime parameters $\lambda$, the debiased Mondrian random forest estimator achieves, up to constants, the minimax-optimal rate of convergence for estimating a regression function $\mu \in \cH^\beta$ in $d$ dimensions \citep{stone1982optimal}. This result holds for all $d \geq 1$ and all $\beta > 0$, complementing a previous result established only for $\beta \in (0, 2]$ by \citet{mourtada2020minimax}. % \begin{theorem}[Minimax optimality of the debiased Mondrian random forest estimator]% \label{thm:mondrian_minimax} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and let $J \geq \lfloor \flbeta / 2 \rfloor$, $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$, and $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$. Then % \begin{align*} \E \left[ \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \right]^{1/2} &\lesssim \sqrt{\frac{\lambda^d}{n}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \lesssim n^{-\frac{\beta}{d + 2 \beta}}. \end{align*} % \end{theorem} The sequence of lifetime parameters $\lambda$ required in Theorem~\ref{thm:mondrian_minimax} is chosen to balance the bias and standard deviation bounds implied by Theorem~\ref{thm:mondrian_bias_debiased} and Theorem~\ref{thm:mondrian_clt_debiased} respectively, in order to minimize the pointwise mean squared error. While selecting an optimal debiasing order $J$ needs only knowledge of an upper bound on the smoothness $\beta$, choosing an optimal sequence of $\lambda$ values does assume that $\beta$ is known a priori.
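As a concrete sketch of the resulting interval computation, assuming the point estimate $\hat\mu_\rd(x)$ and variance estimate $\hat\Sigma_\rd(x)$ have already been formed, and computing $q_{1-\alpha/2}$ as a standard normal quantile in line with the Gaussian limit of Theorem~\ref{thm:mondrian_clt_debiased}:
\begin{verbatim}
using Distributions  # provides the normal quantile function

# Two-sided confidence interval for mu(x) from the debiased forest.
# mu_hat and Sigma_hat are the point and variance estimates at x.
function confidence_interval(mu_hat, Sigma_hat, lambda, d, n; alpha = 0.05)
    q = quantile(Normal(), 1 - alpha / 2)
    half_width = sqrt(lambda^d / n) * sqrt(Sigma_hat) * q
    return (mu_hat - half_width, mu_hat + half_width)
end

# Illustrative inputs only: lambda = 5, d = 2 and n = 125_927 match the
# scale of the weather application later in this chapter.
confidence_interval(0.52, 1.3, 5.0, 2, 125_927)
\end{verbatim}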
The problem of adapting to $\beta$ from data is challenging and beyond the scope of this chapter; we provide some practical advice for tuning parameter selection in Section~\ref{sec:mondrian_parameter_selection}. Theorem~\ref{thm:mondrian_minimax} complements the minimaxity results proven by \citet{mourtada2020minimax} for Mondrian trees (with $\beta \leq 1$) and for Mondrian random forests (with $\beta \leq 2$), with one modification: our version is stated in pointwise rather than integrated mean squared error. This is because our debiasing procedure is designed to handle interior smoothing bias and so does not provide any correction for boundary bias. We leave the development of such boundary corrections to future work, but constructions similar to higher-order boundary-correcting kernels should be possible. If the region of integration is a compact set in the interior of $[0,1]^d$, then we do obtain an optimal integrated mean squared error bound: if $\delta \in (0, 1/2)$ is fixed then under the same conditions as Theorem~\ref{thm:mondrian_minimax}, % \begin{align*} \E \left[ \int_{[\delta, 1-\delta]^d} \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \diff x \right]^{1/2} &\lesssim \sqrt{\frac{\lambda^d}{n}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \lesssim n^{-\frac{\beta}{d + 2 \beta}}. \end{align*} The debiased Mondrian random forest estimator defined in \eqref{eq:mondrian_debiased} is a linear combination of Mondrian random forests, and as such contains both a sum over $0 \leq r \leq J$, representing the debiasing procedure, and a sum over $1 \leq b \leq B$, representing the forest averaging. We have thus far been interpreting this estimator as a debiased version of the standard Mondrian random forest given in \eqref{eq:mondrian_estimator}, but it is equally valid to swap the order of these sums. This gives rise to an alternative point of view: we replace each Mondrian random tree with a ``debiased'' version, and then take a forest of such modified trees. This perspective is more in line with existing techniques for constructing randomized ensembles, where the outermost operation represents a $B$-fold average of randomized base learners, not necessarily locally constant decision trees, each of which has a small bias component \citep{caruana2004ensemble, zhou2019deep, friedberg2020local}. \section{Tuning parameter selection}% \label{sec:mondrian_parameter_selection} We discuss various procedures for selecting the parameters involved in fitting a debiased Mondrian random forest; namely the base lifetime parameter $\lambda$, the number of trees in each forest $B$, the bias correction order $J$, and the debiasing scale parameters $a_r$ for $0 \leq r \leq J$. \subsection{Selecting the base lifetime parameter \texorpdfstring{$\lambda$}{lambda}}% \label{sec:mondrian_lifetime_selection} The most important parameter is the base Mondrian lifetime parameter $\lambda$, which plays the role of a complexity parameter and thus governs the overall bias--variance trade-off of the estimator. Correct tuning of $\lambda$ is especially important in two main respects: % firstly, in order to use the central limit theorem established in Theorem~\ref{thm:mondrian_clt_debiased}, we must have that the bias converges to zero, requiring $\lambda \gg n^{\frac{1}{d + 2\beta}}$. % Secondly, the minimax optimality result of Theorem~\ref{thm:mondrian_minimax} is valid only in the regime $\lambda \asymp n^{\frac{1}{d + 2\beta}}$, and thus requires careful determination in the more realistic finite-sample setting. 
For clarity, in this section we use the notation $\hat\mu_\rd(x; \lambda, J)$ for the debiased Mondrian random forest with lifetime $\lambda$ and debiasing order $J$ as in \eqref{eq:mondrian_debiased}. Similarly, write $\hat\Sigma_\rd(x; \lambda, J)$ for the associated variance estimator given in \eqref{eq:mondrian_debiased_variance_estimator}. For minimax-optimal point estimation when $\beta$ is known, choose any sequence $\lambda \asymp n^{\frac{1}{d + 2\beta}}$ and use $\hat\mu_\rd(x; \lambda, J)$ with $J = \lfloor \flbeta / 2 \rfloor$, following the theory given in Theorem~\ref{thm:mondrian_minimax}. For an explicit example of how to choose the lifetime, one can instead use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J-1\big)$ so that the leading bias is explicitly characterized by Theorem~\ref{thm:mondrian_bias_debiased}, and with $\hat\lambda_{\AIMSE}(J-1)$ as defined below. This is no longer minimax-optimal as $J-1 < J$ does not satisfy the conditions of Theorem~\ref{thm:mondrian_minimax}. For performing inference, a more careful procedure is required; we suggest the following method assuming $\beta > 2$. Set $J = \lfloor \flbeta / 2 \rfloor$ as before, and use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ and $\hat\Sigma_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ to construct a confidence interval. The reasoning for this is that we select a lifetime tailored for a more biased estimator than we actually use. This results in an inflated lifetime estimate, guaranteeing the resulting bias is negligible when it is plugged into the fully debiased estimator. This approach to tuning parameter selection and debiasing for valid nonparametric inference corresponds to an application of robust bias correction \citep{calonico2018effect,calonico2022coverage}, where the point estimator is bias-corrected and the robust standard error estimator incorporates the additional sampling variability introduced by the bias correction. This leads to a more refined distributional approximation but does not necessarily exhaust the underlying smoothness of the regression function. An alternative inference approach based on Lepskii's method \citep{lepskii1992asymptotically,birge2001alternative} could be developed with the latter goal in mind. It remains to propose a concrete method for computing $\hat\lambda_{\AIMSE}(J)$ in the finite-sample setting; we suggest two such procedures based on plug-in selection with polynomial estimation and cross-validation respectively, building on classical ideas from the nonparametric smoothing literature \citep{fan2020statistical}. \subsubsection*{Lifetime selection with polynomial estimation} Firstly, suppose $X_i \sim \Unif\big([0,1]^d\big)$ and that the leading bias of $\hat\mu_\rd(x)$ is well approximated by an additively separable function so that, writing $\partial^{2 J + 2}_j \mu(x)$ for $\partial^{2 J + 2} \mu(x) / \partial x_j^{2 J + 2}$, % \begin{align*} \frac{\bar \omega B_{J+1}(x)}{\lambda^{2 J + 2}} &\approx \frac{1}{\lambda^{2 J + 2}} \frac{\bar \omega }{J + 2} \sum_{j=1}^d \partial^{2 J + 2}_j \mu(x). \end{align*} % Now suppose the model is homoscedastic so $\sigma^2(x) = \sigma^2$ and the limiting variance of $\hat\mu_\rd$ is % \begin{align*} \frac{\lambda^d}{n} \Sigma_\rd(x) &= \frac{\lambda^d \sigma^2}{n} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*} % The asymptotic integrated mean squared error (AIMSE) is % \begin{align*} \AIMSE(\lambda, J) &= \frac{1}{\lambda^{4 J + 4}} \frac{\bar \omega^2}{(J + 2)^2} \int_{[0,1]^d} \left( \sum_{j=1}^d \partial^{2 J + 2}_j \mu(x) \right)^2 \diff x \\ &\quad+ \frac{\lambda^d \sigma^2}{n} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d. \end{align*} % Minimizing over $\lambda > 0$ yields the AIMSE-optimal lifetime parameter % \begin{align*} \lambda_{\AIMSE}(J) &= \left( \frac{ \frac{(4 J + 4) \bar \omega^2}{(J + 2)^2} n \int_{[0,1]^d} \left( \sum_{j=1}^d \partial^{2 J + 2}_j \mu(x) \right)^2 \diff x }{ d \sigma^2 \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d } \right)^{\frac{1}{4 J + 4 + d}}. \end{align*} % An estimator of $\lambda_{\AIMSE}(J)$ is therefore given by % \begin{align*} \hat\lambda_{\AIMSE}(J) &= \left( \frac{ \frac{(4 J + 4) \bar \omega^2}{(J + 2)^2} \sum_{i=1}^n \left( \sum_{j=1}^d \partial^{2 J + 2}_j \hat\mu(X_i) \right)^2 }{ d \hat\sigma^2 \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d } \right)^{\frac{1}{4 J + 4 + d}} \end{align*} % for some preliminary estimators $\partial^{2 J + 2}_j \hat\mu(x)$ and $\hat\sigma^2$. These can be obtained by fitting a global polynomial regression of order $2 J + 4$ without interaction terms to the data. To do this, define the $n \times ((2 J + 4)d + 1)$ design matrix $P$ with rows % \begin{align*} P_i = \big( 1, X_{i1}, X_{i1}^2, \ldots, X_{i1}^{2 J + 4}, X_{i2}, X_{i2}^2, \ldots, X_{i2}^{2 J + 4}, \ldots, X_{id}, X_{id}^2, \ldots, X_{id}^{2 J + 4} \big), \end{align*} % and let % $P_x = \big( 1, x_{1}, x_{1}^2, \ldots, x_{1}^{2 J + 4}, x_{2}, x_{2}^2, \ldots, x_{2}^{2 J + 4}, \ldots, x_{d}, x_{d}^2, \ldots, x_{d}^{2 J + 4} \big)$. % Then we define the derivative estimator as % \begin{align*} \partial^{2 J + 2}_j \hat\mu(x) &= \partial^{2 J + 2}_j P_x \big( P^\T P \big)^{-1} P^\T \bY \\ &= (2J + 2)! \left( 0_{1 + (j-1)(2 J + 4) + (2J + 1)}, 1, (2J + 3) x_j, \tfrac{(2J + 3)(2J + 4)}{2} x_j^2, 0_{(d-j)(2 J + 4)} \right) \big( P^\T P \big)^{-1} P^\T \bY, \end{align*} % and the variance estimator $\hat\sigma^2$ is based on the residual sum of squares of this model: % \begin{align*} \hat\sigma^2 &= \frac{1}{n - (2J + 4)d - 1} \big( \bY^\T \bY - \bY^\T P \big( P^\T P \big)^{-1} P^\T \bY \big). \end{align*} \subsubsection*{Lifetime selection with cross-validation} As an alternative to the analytic plug-in methods described above, one can use a cross-validation approach. While leave-one-out cross-validation (LOOCV) can be applied directly \citep{fan2020statistical}, the linear smoother structure of the (debiased) Mondrian random forest estimator allows a computationally simpler formulation. Writing $\hat\mu_\rd^{-i}(x)$ for a debiased Mondrian random forest estimator fitted without the $i$th data sample, it is easy to show that % \begin{align*} \LOOCV(\lambda, J) &= \frac{1}{n} \sum_{i=1}^{n} \left( Y_i - \hat\mu_\rd^{-i}(X_i) \right)^2 \\ &= \frac{1}{n} \sum_{i=1}^{n} \left( \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^{B} \frac{1}{1 - 1/N_{r b}(X_i)} \left( Y_i - \sum_{j=1}^{n} \frac{ Y_j \I \left\{ X_j \in T_{r b}(X_i) \right\}} {N_{r b}(X_i)} \right) \right)^{2}, \end{align*} % avoiding refitting the model leaving each sample out in turn.
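The plug-in rule above is straightforward to implement; the following Julia sketch (illustrative only: it reuses debias_coefficients and variance_factor from the earlier sketch, and all other names are ours) fits the global polynomial regression, forms the derivative and variance estimates, and returns $\hat\lambda_{\AIMSE}(J)$.
\begin{verbatim}
# Design matrix P with rows (1, X_i1, ..., X_i1^(2J+4), ..., X_id^(2J+4)).
function design_matrix(X, J)
    n, d = size(X)
    deg = 2J + 4
    cols = [X[:, j] .^ k for j in 1:d for k in 1:deg]
    return hcat(ones(n), cols...)
end

function lambda_aimse(X, Y, J; gamma = 0.05)
    n, d = size(X)
    deg, m = 2J + 4, 2J + 2
    P = design_matrix(X, J)
    theta = P \ Y                      # least-squares coefficients
    sigma2 = sum(abs2, Y - P * theta) / (n - deg * d - 1)
    # m-th partial derivative in direction j: monomials x_j^k with k >= m.
    deriv(x, j) = sum(theta[1 + (j - 1) * deg + k] * prod((k - m + 1):k) *
                      x[j]^(k - m) for k in m:deg)
    a, omega = debias_coefficients(J, gamma)
    wbar = sum(omega[r + 1] * a[r + 1]^(-m) for r in 0:J)
    num = (4J + 4) * wbar^2 / (J + 2)^2 *
          sum(sum(deriv(X[i, :], j) for j in 1:d)^2 for i in 1:n)
    den = d * sigma2 * variance_factor(a, omega, d)
    return (num / den)^(1 / (4J + 4 + d))
end
\end{verbatim}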
Supposing $X_i \sim \Unif\big([0,1]^d\big)$ and replacing $1/N_{r b}(X_i)$ with their average expectation $ \frac{1}{J+1} \sum_{r=0}^{J} \E \left[ 1/N_{r b}(X_i) \right] \approx \bar a^d \lambda^d / n$ where $\bar a^d = \frac{1}{J+1} \sum_{r=0}^{J} a_r^d$ gives the generalized cross-validation (GCV) formula % \begin{align} \label{eq:mondrian_gcv} \GCV(\lambda, J) &= \frac{1}{n} \sum_{i=1}^{n} \left( \frac{Y_i - \hat\mu_\rd(X_i)} {1 - \bar a^d \lambda^d / n} \right)^2. \end{align} % The lifetime can then be selected by computing either $\hat\lambda_{\LOOCV} \in \argmin_\lambda \LOOCV(\lambda, J)$ or $\hat\lambda_{\GCV} \in \argmin_\lambda \GCV(\lambda, J)$. See Section~\ref{sec:mondrian_weather} for a practical illustration. \subsection{Choosing the other parameters} \subsubsection*{The number \texorpdfstring{$B$}{B} of trees in each forest}% If no debiasing is applied, we suggest $B = \sqrt{n}$ to satisfy Theorem~\ref{thm:mondrian_confidence}. If debiasing is used then we recommend $B = n^{\frac{2J-1}{2J}}$, consistent with Theorem~\ref{thm:mondrian_confidence_debiased} and Theorem~\ref{thm:mondrian_minimax}. \subsubsection*{The debiasing order \texorpdfstring{$J$}{J}}% When debiasing a Mondrian random forest, one must decide how many orders of bias to remove. This requires some oracle knowledge of the H{\"o}lder smoothness of $\mu$ and $f$, which is difficult to estimate statistically. As such, we recommend removing only the first one or two bias terms, taking $J \in \{0,1,2\}$ to avoid overly inflating the variance of the estimator. \subsubsection*{The debiasing coefficients \texorpdfstring{$a_r$}{ar}}% As in Section~\ref{sec:mondrian_debiased}, we take $a_r$ to be a fixed geometric or arithmetic sequence. For example, one could set $a_r = (1+\gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. We suggest taking $a_r = 1.05^r$. \section{Illustrative example: weather forecasting}% \label{sec:mondrian_weather} To demonstrate our methodology for estimation and inference with Mondrian random forests, we consider a simple application to a weather forecasting problem. We emphasize that the main aim of this section is to provide intuition and understanding for how a Mondrian random forest may be used in practice, and we refrain from an in-depth analysis of the specific results obtained. Indeed, our assumption of i.i.d.\ data is certainly violated with weather data, due to the time-series structure of sequential observations. Nonetheless, we use data from the \citet{bureau2017daily}, containing daily weather information from 2007--2017, at 49 different locations across Australia, with $n = 125\,927$ samples. \begin{figure}[b!] \centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_data.png}% \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_data_filled_partition.png}% \end{subfigure} \caption[Australian weather forecasting data]{ Australian weather forecasting data. Left: colors indicate the response variable of dry (orange) or wet (blue) on the following day. Right: the data is overlaid with a Mondrian random tree, fitted with a lifetime of $\lambda = 5$ selected by generalized cross-validation. 
Cell colors represent the response proportions.} \label{fig:mondrian_weather_data} \end{figure} We consider the classification problem of predicting whether or not it will rain on the following day using two covariates: the percentage relative humidity, and the pressure in mbar, both at 3pm on the current day. For the purpose of framing this as a nonparametric regression problem, we consider estimating the probability of rain as the regression function by setting $Y_i = 1$ if there is rain on the following day and $Y_i = 0$ otherwise. Outliers with pressure less than 985\,mbar or more than 1040\,mbar are removed to justify the assertion in Assumption~\ref{ass:mondrian_data} that the density of the covariates should be bounded away from zero, and the features are linearly scaled to provide normalized samples $(X_i, Y_i) \in [0, 1]^2 \times \{0, 1\}$. We fit a Mondrian random forest to the data as defined in Section~\ref{sec:mondrian_forests}, selecting the lifetime parameter with the generalized cross-validation (GCV) method detailed in Section~\ref{sec:mondrian_lifetime_selection}. Figure~\ref{fig:mondrian_weather_data} plots the data, using colors to indicate the response values, and illustrates how a single Mondrian tree is fitted by sampling from an independent Mondrian process and then computing local averages (equivalent to response proportions in this special setting with binary outcomes) within each cell. The general pattern of rain being predicted by high humidity and low pressure is apparent, with the preliminary tree estimator taking the form of a step function on axis-aligned rectangles. This illustrates the first-order bias of Mondrian random trees discussed in Section~\ref{sec:mondrian_clt}, with the piecewise constant estimator providing a poor approximation for the smooth true regression function. \begin{figure}[b!] \centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_forest_2.png}% \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_forest_design.png}% \end{subfigure} \caption[Fitting Mondrian random forests to the Australian weather data]{ Fitting Mondrian random forests to the Australian weather data. Left: with $B=2$ trees, individual cells are clearly visible and the step function persists. Right: with $B=40$ trees, the estimate is much smoother as the individual tree estimates average out. Three design points are identified for further analysis.} \label{fig:mondrian_weather_forest} \end{figure} Figure~\ref{fig:mondrian_weather_forest} adds more trees to the estimator, demonstrating the effect of increasing the forest size first to $B=2$ and then to $B=40$. As more trees are included in the Mondrian random forest, the regression estimate $\hat \mu(x)$ becomes smoother and therefore also enjoys improved bias properties as shown in Theorem~\ref{thm:mondrian_bias}, assuming a correct model specification. We also choose three specific design points in the (humidity, pressure) covariate space, namely (20\%, 1020\,mbar), (70\%, 1000\,mbar), and (80\%, 990\,mbar), at which to conduct inference by constructing confidence intervals. See Table~\ref{tab:mondrian_weather_ci} for the results. \begin{figure}[b!] 
\centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_gcv.png}% \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_debiased_forest_design.png}% \end{subfigure} \caption[Cross-validation and debiasing for the Australian weather data]{ Left: mean squared error and generalized cross-validation scores for Mondrian random forests with the Australian weather data. Right: a debiased Mondrian random forest with $B=20$, giving $40$ trees in total. Three design points are identified for further analysis.} \label{fig:mondrian_weather_gcv} \end{figure} In Figure~\ref{fig:mondrian_weather_gcv} we show the mean squared error and GCV scores computed using \eqref{eq:mondrian_gcv} with $B=400$ trees for several candidate lifetime parameters $\lambda$. As expected, the mean squared error decreases monotonically as $\lambda$ increases and the model overfits, but the GCV score is minimized at a value which appropriately balances the bias and variance; we take $\lambda = 5$. We then fit a debiased Mondrian forest with bias correction order $J = 1$ as described in Section~\ref{sec:mondrian_debiased}, using $B=20$ trees at each debiasing level $r \in \{0, 1\}$ for a total of $40$ trees. We continue to use the same lifetime parameter $\lambda = 5$ selected through GCV without debiasing, following the approach recommended in Section~\ref{sec:mondrian_lifetime_selection} to ensure valid inference through negligible bias. The resulting debiased Mondrian random forest estimate is noticeably less smooth than the version without bias correction. This is expected due to both the inflated variance resulting from the debiasing procedure, and the undersmoothing enacted by selecting a lifetime parameter using GCV on the original estimator without debiasing. \begin{table}[t] \centering \begin{tabular}{|c|c|c|c|c|c|c|} \hline \multirow{2}{*}{Point} & \multirow{2}{*}{Humidity} & \multirow{2}{*}{Pressure} & \multicolumn{2}{|c|}{No debiasing, $J=0$} & \multicolumn{2}{|c|}{Debiasing, $J=1$} \\ \cline{4-7} & & & $\hat\mu(x)$ & 95\% CI & $\hat\mu(x)$ & 95\% CI \\ \hline $1$ & $20\%$ & $1020\,\textrm{mbar}$ & $\phantom{0}4.2\%$ & $3.9\%$ -- $4.5\%$ & $\phantom{0}2.0\%$ & $1.6\%$ -- $2.4\%$ \\ $2$ & $70\%$ & $1000\,\textrm{mbar}$ & $52.6\%$ & $51.7\%$ -- $53.6\%$ & $59.8\%$ & $57.8\%$ -- $61.9\%$ \\ $3$ & $80\%$ & $\phantom{1}990\,\textrm{mbar}$ & $78.1\%$ & $75.0\%$ -- $81.2\%$ & $93.2\%$ & $86.7\%$ -- $99.6\%$ \\ \hline \end{tabular} \caption[Results for the Australian weather data]{ Results for the Australian weather data at three specified design points.} \label{tab:mondrian_weather_ci} \end{table} Table~\ref{tab:mondrian_weather_ci} presents numerical results for estimation and inference at the three specified design points. We first give the outcomes without debiasing, using a Mondrian random forest with $B = 400$ trees and $\lambda = 5$ selected by GCV. We then show the results with a first-order ($J=1$) debiased Mondrian random forest using $B = 200$ (again a total of $400$ trees) and the same value of $\lambda = 5$. The predicted chance of rain $\hat\mu(x)$ is found to vary substantially across different covariate values, and the resulting confidence intervals (CI) are generally narrow due to the large sample size and moderate lifetime parameter. 
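The GCV selection used for these results is simple to reproduce; a minimal Julia sketch of the criterion in \eqref{eq:mondrian_gcv}, where fit_forest is a hypothetical routine returning in-sample fitted values of the (debiased) forest:
\begin{verbatim}
using Statistics

# GCV criterion: in-sample residuals inflated by 1 - abar^d lambda^d / n,
# where abar^d is the average of a_r^d over the debiasing levels.
function gcv(Y, fitted, lambda, a, d, n)
    abar_d = mean(a .^ d)
    return mean(((Y .- fitted) ./ (1 - abar_d * lambda^d / n)) .^ 2)
end

# Grid search over candidate lifetimes (fit_forest is a placeholder):
# lambdas = 1.0:0.5:10.0
# scores  = [gcv(Y, fit_forest(X, Y, l), l, a, d, n) for l in lambdas]
# lambda_gcv = lambdas[argmin(scores)]
\end{verbatim}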
The forest with debiasing exhibits more extreme predictions away from $50\%$ and wider confidence intervals in general, in line with the illustration in Figure~\ref{fig:mondrian_weather_gcv}. Interestingly, the confidence intervals for the non-debiased and debiased estimators do not intersect, indicating that the original estimator is severely biased, and providing further justification for our modified debiased random forest estimator. \section{Conclusion}% \label{sec:mondrian_conclusion} We gave a central limit theorem for the Mondrian random forest estimator and showed how to perform statistical inference on an unknown nonparametric regression function. We introduced debiased versions of the Mondrian random forest, and demonstrated their advantages for statistical inference and minimax-optimal estimation. We discussed tuning parameter selection, enabling a fully feasible and practical methodology. An application to weather forecasting was presented as an illustrative example. Implementations of this chapter's methodology and empirical results are provided by a Julia package at \github{wgunderwood/MondrianForests.jl}. This work is based on \citet{cattaneo2023inference}, and has been presented by Underwood at the University of Illinois Statistics Seminar (2024), the University of Michigan Statistics Seminar (2024), and the University of Pittsburgh Statistics Seminar (2024). \chapter{Dyadic Kernel Density Estimators} \label{ch:kernel} % abstract Dyadic data is often encountered when quantities of interest are associated with the edges of a network. As such, it plays an important role in statistics, econometrics, and many other data science disciplines. We consider the problem of uniformly estimating a dyadic Lebesgue density function, focusing on nonparametric kernel-based estimators taking the form of dyadic empirical processes. The main contributions of this chapter include the minimax-optimal uniform convergence rate of the dyadic kernel density estimator, along with strong approximation results for the associated standardized and Studentized $t$-processes. A consistent variance estimator enables the construction of valid and feasible uniform confidence bands for the unknown density function. We showcase the broad applicability of our results by developing novel counterfactual density estimation and inference methodology for dyadic data, which can be used for causal inference and program evaluation. A crucial feature of dyadic distributions is that they may be ``degenerate'' at certain points in the support of the data, a property making our analysis somewhat delicate. Nonetheless our methods for uniform inference remain robust to the potential presence of such points. For implementation purposes, we discuss inference procedures based on positive semi-definite covariance estimators, mean squared error optimal bandwidth selectors, and robust bias correction techniques. We illustrate the empirical finite-sample performance of our methods both in simulations and with real-world trade data, for which we make comparisons between observed and counterfactual trade distributions in different years. Our technical results concerning strong approximations and maximal inequalities are of potential independent interest. \section{Introduction} \label{sec:kernel_introduction} Dyadic data, also known as graphon data, plays an important role in the statistical, social, behavioral, and biomedical sciences. 
In network settings, this type of dependent data captures interactions between the units of study, and its analysis is of interest in statistics \citep{kolaczyk2009statistical}, economics \citep{graham2020network}, psychology \citep{kenny2020dyadic}, public health \citep{luke2007network}, and many other data science disciplines. For $n \geq 2$, a dyadic data set contains $\frac{1}{2}n(n-1)$ observed real-valued random variables % \begin{align*} \bW_n = (W_{i j} : 1 \leq i < j \leq n). \end{align*} % For $C > 0$ and a set $\cX \subseteq \R$, define the H\"{o}lder class with smoothness parameter $\beta > 0$ to be $\cH^\beta_C(\cX) = \big\{ g \in \cC^{\flbeta}(\cX) \! : \! \max_{1 \leq r \leq \flbeta} \big| g^{(r)}(x) \big| \leq C, \big| g^{(\flbeta)}(x) - g^{(\flbeta)}(x') \big| \leq C |x-x'|^{\beta - \flbeta}, \forall x, x' \in \cX \big\}$, where $\flbeta$ denotes the largest integer which is strictly less than $\beta$. Note that $\cH^1_C(\cX)$ is the class of $C$-Lipschitz functions on $\cX$. For $a \in \R$ and $b \geq 0$, we write $[a \pm b]$ for the interval $[a-b, a+b]$. For non-negative sequences $a_n$ and $b_n$, write $a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. For random non-negative sequences $A_n$ and $B_n$, write $A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability. Write $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. For $a,b \in \R$, define $a\wedge b=\min\{a,b\}$ and $a \vee b = \max\{a,b\}$. \section{Setup}\label{sec:kernel_setup} We impose the following two assumptions throughout this chapter, which concern firstly the dyadic data generating process, and secondly the choice of kernel and bandwidth sequence. % \begin{assumption}[Data generation] \label{ass:kernel_data} % % A and V variables Let $\bA_n = (A_i: 1 \leq i \leq n)$ be i.i.d.\ random variables supported on $\cA \subseteq \R$ and let $\bV_n = (V_{i j}: 1 \leq i < j \leq n)$ be i.i.d.\ random variables with a Lebesgue density $f_V$ on $\R$, with $\bA_n$ independent of $\bV_n$. % % W variables Let $W_{i j} = W(A_i, A_j, V_{i j})$ and $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$, where $W$ is an unknown real-valued function which is symmetric in its first two arguments. % Let $\cW \subseteq \R$ be a compact interval with positive Lebesgue measure $\Leb(\cW)$. The conditional distribution of $W_{i j}$ given $A_i$ and $A_j$ admits a Lebesgue density $f_{W \mid AA}(w \mid A_i, A_j)$. For $C_\rH > 0$ and $\beta \geq 1$, take $f_W \in \cH^\beta_{C_\rH}(\cW)$ where $f_{W}(w) = \E\left[f_{W \mid AA}(w \mid A_i,A_j)\right]$ and $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$ for all $a,a' \in \cA$. Suppose $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV <\infty$ where $f_{W \mid A}(w \mid a) = \E\left[f_{W \mid AA}(w \mid A_i,a)\right]$. % \end{assumption} In Assumption~\ref{ass:kernel_data} we require the density $f_W$ to be in a $\beta$-smooth H\"older class of functions on the compact interval $\cW$. H\"older classes are well established in the minimax estimation literature \citep{stone1982optimal,gine2021mathematical}, with the smoothness parameter $\beta$ appearing in the minimax-optimal rate of convergence. If the H\"older condition is satisfied only piecewise, then our results remain valid provided that the boundaries between the pieces are known and treated as boundary points.
If $W(a_1, a_2, v)$ is strictly monotonic and continuously differentiable in its third argument, we can give the conditional density of $W_{i j}$ explicitly using the usual change-of-variables formula: with $w=W(a_1,a_2,v)$, we have $f_{W \mid AA}(w \mid a_1,a_2) = f_V(v) \big|\partial W(a_1,a_2,v)/\partial v\big|^{-1}$. \begin{assumption}[Kernels and bandwidth] \label{ass:kernel_bandwidth}% % Let $h = h(n) > 0$ be a sequence of bandwidths satisfying $h \log n \to 0$ and $\frac{\log n}{n^2h} \to 0$. For each $w \in \cW$, let $k_h(\cdot, w)$ be a real-valued function supported on $[w \pm h] \cap \cW$. For an integer $p \geq 1$, let $k_h$ belong to a family of boundary bias-corrected kernels of order $p$, i.e., % \begin{align*} \int_{\cW} (s-w)^r k_h(s,w) \diff{s} \quad \begin{cases} \begin{alignedat}{2} &= 1 &\qquad &\text{for all } w \in \cW \text{ if }\, r = 0, \\ &= 0 & &\text{for all } w \in \cW \text{ if }\, 1 \leq r \leq p-1, \\ &\neq 0 & &\text{for some } w \in \cW \text{ if }\, r = p. \end{alignedat} \end{cases} \end{align*} % Also, for $C_\rL > 0$, suppose $k_h(s, \cdot) \in \cH^1_{C_\rL h^{-2}}(\cW)$ for all $s \in \cW$. % \end{assumption} This assumption allows for all standard compactly supported and possibly boundary-corrected kernel functions \citep{wand1994kernel,simonoff1996smoothing}, constructed for example by taking polynomials on a compact interval and solving a linear system for the coefficients. Assumption~\ref{ass:kernel_bandwidth} implies (see Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} in Appendix~\ref{app:kernel}) that if $h \leq 1$ then $k_h$ is uniformly bounded by $C_\rk h^{-1}$ where $C_\rk \vcentcolon = 2 C_\rL + 1 + 1/\Leb(\cW)$. \subsection{Bias characterization} \label{sec:kernel_bias} We begin by characterizing and bounding the bias $B_n(w) = \E \big[ \hat f_W(w) \big] - f_W(w)$. Theorem~\ref{thm:kernel_bias} is a standard result for the non-random smoothing bias in kernel density estimation with higher-order kernels and boundary bias correction, and does not rely on the dyadic structure. \begin{theorem}[Bias bound] \label{thm:kernel_bias} Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For $w \in \cW$ and $1 \leq p \leq \flbeta$, define the leading bias term as % \begin{align*} b_p(w) &= \frac{f_W^{(p)}(w)}{p!} \int_{\cW} k_h(s,w) \left( \frac{s-w}{h} \right)^p \diff{s}. \end{align*} % Then we have the following bias bounds. % \begin{enumerate}[label=(\roman*)] \item If $p \leq \flbeta - 1$, then $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | \leq \frac{2 C_\rk C_\rH}{(p+1)!} h^{p+1}$. \item If $p = \flbeta$, then $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. \item If $p \geq \flbeta+1$, then $\sup_{w \in \cW} | B_n(w) | \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$. \end{enumerate} % Noting that $\sup_{w \in \cW} |b_p(w)| \leq 2 C_\rk C_\rH / p!$, we deduce that for $h \leq 1$, % \begin{align*} \sup_{w \in \cW} | B_n(w) | \leq \frac{4 C_\rk C_\rH}{(p \wedge \flbeta)!} h^{p \wedge \beta} \lesssim h^{p \wedge \beta}. \end{align*} \end{theorem} \subsection{Hoeffding-type decomposition and degeneracy} \label{sec:kernel_degeneracy} Our next step is to consider the stochastic part $\hat f_W(w) - \E \big[ \hat f_W(w) \big]$ of the classical bias--variance decomposition. This term is akin to a U-statistic and thus admits a Hoeffding-type decomposition, presented in Lemma~\ref{lem:kernel_hoeffding}, which is a key element in our analysis.
\begin{lemma}[Hoeffding-type decomposition for $\hat f_W$] \label{lem:kernel_hoeffding} Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold. Define the linear, quadratic, and error terms % \begin{align*} L_n(w) &= \frac{2}{n} \sum_{i=1}^n l_i(w), &Q_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} q_{i j}(w), \\ E_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w) \end{align*} % respectively, where % \begin{align*} l_i(w) &= \E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w)\right], \\ q_{i j}(w) &= \E\left[k_h(W_{i j},w) \mid A_i, A_j\right] - \E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w) \mid A_j\right] + \E\left[k_h(W_{i j},w)\right], \\ e_{i j}(w) &= k_h(W_{i j},w) - \E\left[k_h(W_{i j},w) \mid A_i, A_j\right]. \end{align*} % Then, recalling the bias term $B_n$ from Section~\ref{sec:kernel_bias}, we have the Hoeffding-type decomposition % \begin{align} \label{eq:kernel_hoeffding} \hat f_W(w) - f_W(w) = L_n(w) + Q_n(w) + E_n(w) + B_n(w). \end{align} % The processes $L_n$, $Q_n$, and $E_n$ are mean-zero with $\E\big[L_n(w)\big] = \E\big[Q_n(w)\big] = \E\big[E_n(w)\big] = 0$ for all $w \in \cW$. They are also orthogonal, satisfying $\E\big[ L_n(w) Q_n(w') \big] = \E\big[ L_n(w) E_n(w') \big] = \E\big[ Q_n(w) E_n(w') \big] = 0$ for all $w, w' \in \cW$. % \end{lemma} The process $L_n$ is the H{\'a}jek projection of a U-process, which can exhibit degeneracy if $\Var[L_n(w)] = 0$ at some or all points $w \in \cW$. To characterize the different possible degeneracy types in Lemma~\ref{lem:kernel_trichotomy}, we first introduce the following lower and upper degeneracy constants: % \begin{align*} \Dl^2 := \inf_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right] \qquad \text{ and } \qquad \Du^2 := \sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right]. \end{align*} % \begin{lemma}[Trichotomy of degeneracy]% \label{lem:kernel_trichotomy}% % Grant Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}. Then the type of degeneracy exhibited by $\hat f_W(w)$ is precisely one of the following three possibilities. % \begin{enumerate}[label=(\roman*)] \item Total degeneracy: $\Du = \Dl = 0$. Then $L_n(w) = 0$ for all $w \in \cW$ almost surely. \item No degeneracy: $\Dl > 0$. Then $\inf_{w \in \cW} \Var[L_n(w)] \geq \frac{2 \Dl^2}{n}$ for all large enough $n$. \item Partial degeneracy: $\Du > \Dl = 0$. There exists $w \in \cW$ with $\Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$; such a point is labeled \emph{degenerate} and satisfies $\Var[L_n(w)] \leq 64 C_\rk C_\rH C_\rd \frac{h}{n}$. There is also a point $w' \in \cW$ with $\Var\left[f_{W \mid A}(w' \mid A_i)\right] > 0$; such a point is labeled \emph{non-degenerate} and satisfies $\Var[L_n(w')] \geq \frac{2}{n} \Var\left[f_{W \mid A}(w' \mid A_i)\right]$ for all large enough $n$. \end{enumerate} \end{lemma} The following lemma describes the uniform stochastic order of the different terms in the Hoeffding-type decomposition, explicitly accounting for potential degeneracy. \begin{lemma}[Uniform concentration] \label{lem:kernel_uniform_concentration} Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Then % \begin{align*} \E\left[ \sup_{w \in \cW} |L_n(w)| \right] &\lesssim \frac{\Du}{\sqrt n}, &\E\left[ \sup_{w \in \cW} |Q_n(w)| \right] &\lesssim \frac{1}{n}, &\E\left[ \sup_{w \in \cW} |E_n(w)| \right] &\lesssim \sqrt{\frac{\log n}{n^2h}}.
\end{align*} \end{lemma} Lemma~\ref{lem:kernel_uniform_concentration} captures the potential total degeneracy of $L_n$ by illustrating how if $\Du=0$ then $L_n=0$ everywhere on $\cW$ almost surely. The following lemma captures the potential partial degeneracy of $L_n$, where $\Du > \Dl = 0$. For $w,w' \in \cW$, define the covariance function % \begin{align*} \Sigma_n(w,w') = \E\Big[ \Big( \hat f_W(w) - \E\big[\hat f_W(w)\big] \Big) \Big( \hat f_W(w') - \E\big[\hat f_W(w')\big] \Big) \Big]. \end{align*} % \begin{lemma}[Variance bounds] \label{lem:kernel_variance_bounds} Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold. Then for sufficiently large $n$, % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} \inf_{w \in \cW} f_W(w) &\lesssim \inf_{w \in \cW} \Sigma_n(w,w) \leq \sup_{w \in \cW} \Sigma_n(w,w) \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}. \end{align*} % \end{lemma} As a simple example of the different types of degeneracy, consider the family of dyadic distributions $\P_{\pi}$ indexed by $\pi = (\pi_1, \pi_2, \pi_3)$ with $\sum_{i=1}^3 \pi_i = 1$ and $\pi_i \geq 0$, generated by $W_{i j} = A_i A_j + V_{i j}$, where $A_i$ equals $-1$ with probability $\pi_1$, equals $0$ with probability $\pi_2$ and equals $+1$ with probability $\pi_3$, and $V_{i j}$ is standard Gaussian. This model induces a latent ``community structure'' where community membership is determined by the value of $A_i$ for each node $i$, and the interaction outcome $W_{i j}$ is a function only of the communities which $i$ and $j$ belong to and some idiosyncratic noise. Unlike the stochastic block model \citep{kolaczyk2009statistical}, our setup assumes that community membership has no impact on edge existence, as we work with fully connected networks; see Section~\ref{sec:kernel_trade_data} for a discussion of how to handle missing edges in practice. Also note that the parameter of interest in this chapter is the Lebesgue density of a continuous random variable $W_{i j}$ rather than the probability of network edge existence, which is the focus of the graphon estimation literature \citep{gao2021minimax}. In line with Assumption~\ref{ass:kernel_data}, $\bA_n$ and $\bV_n$ are i.i.d.\ sequences independent of each other. Then $f_{W \mid AA}(w \mid A_i, A_j) = \phi(w - A_i A_j)$,\, $f_{W \mid A}(w \mid A_i) = \pi_1 \phi(w + A_i) + \pi_2 \phi(w) + \pi_3 \phi(w - A_i)$, and $f_W(w) = (\pi_1^2 + \pi_3^2) \phi(w-1) + \pi_2 (2 - \pi_2) \phi(w) + 2 \pi_1 \pi_3 \phi(w+1),$ where $\phi$ denotes the probability density function of the standard normal distribution. Note that $f_W(w)$ is strictly positive for all $w \in \R$. Consider the parameter choices: % \begin{enumerate}[label=(\roman*)] \item $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$:\quad $\P_\pi$ is degenerate at all $w \in \R$, \item $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$:\quad $\P_\pi$ is degenerate only at $w=0$, \item $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$:\quad $\P_\pi$ is non-degenerate for all $w \in \R$. \end{enumerate} % Figure~\ref{fig:kernel_distribution} demonstrates these phenomena, plotting the density $f_W$ and the standard deviation of the conditional density $f_{W|A}$ over $\cW = [-2,2]$ for each choice of the parameter $\pi$. The trichotomy of total/partial/no degeneracy is useful for understanding the distributional properties of the dyadic kernel density estimator $\hat{f}_W(w)$. 
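These degeneracy regimes are easy to verify numerically; the following Julia sketch (our own illustration) evaluates $\Var\left[f_{W \mid A}(w \mid A_i)\right]$ over a grid for each parameter choice.
\begin{verbatim}
# f_{W|A}(w | a) = pi_1 phi(w + a) + pi_2 phi(w) + pi_3 phi(w - a),
# where A_i takes values -1, 0, +1 with probabilities pi_1, pi_2, pi_3.
phi(x) = exp(-x^2 / 2) / sqrt(2 * pi)
f_WA(w, a, p) = p[1] * phi(w + a) + p[2] * phi(w) + p[3] * phi(w - a)

function var_fWA(w, p)
    vals = [f_WA(w, a, p) for a in (-1, 0, 1)]
    m = sum(p .* vals)                 # E[f_{W|A}(w | A_i)]
    return sum(p .* (vals .- m) .^ 2)  # Var[f_{W|A}(w | A_i)]
end

grid = -2:0.01:2
maximum(var_fWA.(grid, Ref((1/2, 0, 1/2))))    # zero: total degeneracy
var_fWA(0.0, (1/4, 0, 3/4))                    # zero only at w = 0
minimum(var_fWA.(grid, Ref((1/5, 1/5, 3/5))))  # positive: no degeneracy
\end{verbatim}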
Crucially, our need for uniformity in $w$ complicates the simpler degeneracy/no degeneracy dichotomy observed previously in the literature \citep{graham2024kernel}. From a pointwise-in-$w$ perspective, partial degeneracy causes no issues, while it is a fundamental problem when conducting inference uniformly over $w \in \cW$. We develop methods that are valid regardless of the presence of partial or total degeneracy. \begin{figure}[t] \centering % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_total.pdf} \caption{Total degeneracy, \\ $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_partial.pdf} \caption{Partial degeneracy, \\ $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_none.pdf} \caption{No degeneracy, \\ $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} \end{subfigure} % \caption[The family of distributions $\P_\pi$]{ Density $f_W$ and standard deviation of $f_{W|A}$ for the family of distributions $\P_\pi$.} % \label{fig:kernel_distribution} \end{figure} \section{Point estimation results} \label{sec:kernel_point_estimation} Using the bias bound from Theorem~\ref{thm:kernel_bias} and the concentration results from Lemma~\ref{lem:kernel_uniform_concentration}, the next theorem establishes an upper bound on the uniform convergence rate of $\hat f_W$. % \begin{theorem}[Uniform convergence rate]% \label{thm:kernel_uniform_consistency}% Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Then % \begin{align*} \E\left[ \sup_{w \in \cW} \big|\hat{f}_W(w) - f_W(w)\big| \right] \lesssim h^{p\wedge\beta} + \frac{\Du}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}. \end{align*} \end{theorem} % The implicit constant in Theorem~\ref{thm:kernel_uniform_consistency} depends only on $\cW$, $\beta$, $C_\rH$, and the choice of kernel. We interpret this result in light of the degeneracy trichotomy from Lemma~\ref{lem:kernel_trichotomy}. These results generalize \citet*[Theorem~1]{chiang2020empirical} by allowing for compactly supported data and more general kernels $k_h(\cdot,w)$, enabling boundary-adaptive estimation. % \begin{enumerate}[label=(\roman*)] \item Partial or no degeneracy: $\Du > 0$. Any bandwidths satisfying $n^{-1} \log n \lesssim h \lesssim n^{-\frac{1}{2(p\wedge\beta)}}$ yield $\E\big[\sup_{w \in \cW}\big|\hat f_W(w) - f_W(w)\big| \big] \lesssim \frac{1}{\sqrt n}$, the ``parametric'' bandwidth-independent rate noted by \citet{graham2024kernel}. \item Total degeneracy: $\Du = 0$. Minimizing the bound in Theorem~\ref{thm:kernel_uniform_consistency} with $h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2(p\wedge\beta)+1}}$ yields $\E\big[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \big] \lesssim \big(\frac{\log n}{n^2} \big)^{\frac{p\wedge\beta}{2(p\wedge\beta)+1}}$. \end{enumerate} \subsection{Minimax optimality} We establish the minimax rate under the supremum norm for density estimation with dyadic data. This implies minimax optimality of the kernel density estimator $\hat f_W$, regardless of the degeneracy type of the dyadic distribution. \begin{theorem}[Uniform minimax optimality] \label{thm:kernel_minimax} Fix $\beta \geq 1$ and $C_\rH > 0$, and take $\cW$ a compact interval with positive Lebesgue measure. 
Define $\cP = \cP(\cW, \beta, C_\rH)$ as the class of dyadic distributions satisfying Assumption~\ref{ass:kernel_data}. Define $\cP_\rd$ as the subclass of $\cP$ containing only those distributions which are totally degenerate on $\cW$ in the sense that $\sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$. Then % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] &\asymp \frac{1}{\sqrt n}, \\ \inf_{\tilde f_W} \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] &\asymp \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}}, \end{align*} % where $\tilde f_W$ is any estimator depending only on the data $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$ distributed according to the dyadic law $\P$. The constants in $\asymp$ depend only on $\cW$, $\beta$, and $C_\rH$. \end{theorem} Theorem~\ref{thm:kernel_minimax} shows that the uniform convergence rate of $n^{-1/2}$ obtained in Theorem~\ref{thm:kernel_uniform_consistency} (coming from the $L_n$ term) is minimax-optimal in general. When attention is restricted to totally degenerate dyadic distributions, $\hat f_W$ also achieves the minimax rate of uniform convergence (assuming a kernel of sufficiently high order $p \geq \beta$), which is on the order of $\left(\frac{\log n}{n^2}\right)^{\frac{\beta}{2\beta+1}}$ and is determined by the bias $B_n$ and the leading variance term $E_n$ in \eqref{eq:kernel_hoeffding}. Combining Theorems \ref{thm:kernel_uniform_consistency}~and~\ref{thm:kernel_minimax}, we conclude that $\hat{f}_W(w)$ achieves the minimax-optimal rate for uniformly estimating $f_W(w)$ if $h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ and a kernel of sufficiently high order ($p \geq \beta$) is used, whether or not there are any degenerate points in the underlying data generating process. This result appears to be new to the literature on nonparametric estimation with dyadic data. See \citet{gao2021minimax} for a contemporaneous review. \section{Distributional results} \label{sec:kernel_inference} We investigate the distributional properties of the standardized $t$-statistic process % \begin{align*} T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\Sigma_n(w,w)}}, \end{align*} % which is not necessarily asymptotically tight. Therefore, to approximate the distribution of the entire $t$-statistic process, as well as specific functionals thereof, we rely on a novel strong approximation approach outlined in this section. Our results can be used to perform valid uniform inference irrespective of the degeneracy type. This section is largely concerned with distributional properties and thus frequently requires copies of stochastic processes. For succinctness of notation, we will not differentiate between a process and its copy, but details are available in Section~\ref{sec:kernel_app_technical}. \subsection{Strong approximation} By the Hoeffding-type decomposition \eqref{eq:kernel_hoeffding} and Lemma~\ref{lem:kernel_uniform_concentration}, it suffices to consider the distributional properties of the stochastic process $L_n + E_n$. Our approach combines the Koml{\'o}s--Major--Tusn{\'a}dy (KMT) approximation \citep{komlos1975approximation} to obtain a strong approximation of $L_n$ with a Yurinskii approximation \citep{yurinskii1978error} to obtain a \emph{conditional} (on $\bA_n$) strong approximation of $E_n$.
The latter is necessary because $E_n$ is akin to a local empirical process of i.n.i.d.\ random variables, conditional on $\bA_n$, and therefore the KMT approximation is not applicable. These approximations are then combined to give a final (unconditional) strong approximation for $L_n+E_n$, and thus for the $t$-statistic process $T_n$. The following lemma is an application of our generic KMT approximation result for empirical processes, given in Section~\ref{sec:kernel_app_technical}, which builds on earlier work by \citet{gine2004kernel} and \citet{gine2010confidence} and may be of independent interest. \begin{lemma}[Strong approximation of $L_n$] \label{lem:kernel_strong_approx_Ln} % Suppose that Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold. For each $n$ there exists a mean-zero Gaussian process $Z^L_n$ indexed on $\cW$ satisfying $\E\big[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w) \big| \big] \lesssim \frac{\Du \log n}{\sqrt{n}}$, where $\E[Z_n^L(w)Z_n^L(w')] = n\E[L_n(w)L_n(w')]$ for all $w, w' \in \cW$. The process $Z_n^L$ is a function only of $\bA_n$ and some random noise independent of $(\bA_n, \bV_n)$. \end{lemma} % donsker case The strong approximation result in Lemma~\ref{lem:kernel_strong_approx_Ln} would be sufficient to develop valid and even optimal uniform inference procedures whenever both $\Dl > 0$ (no degeneracy in $L_n$) and $n h \gg \log n$ ($L_n$ is leading). In this special case, the recent Donsker-type results of \citet{davezies2021exchangeable} can be applied to analyze the limiting distribution of the stochastic process $\hat{f}_W$. Alternatively, again only when $L_n$ is non-degenerate and leading, standard empirical process methods could also be used. However, even in the special case when $\hat{f}_W(w)$ is asymptotically Donsker, our result in Lemma~\ref{lem:kernel_strong_approx_Ln} improves upon the literature by providing a rate-optimal strong approximation for $\hat{f}_W$ as opposed to only a weak convergence result. See Theorem \ref{thm:kernel_infeasible_ucb} and the subsequent discussion below. % however often non-donsker More importantly, as illustrated above, it is common in the literature to find dyadic distributions which exhibit partial or total degeneracy, making the process $\hat{f}_W$ non-Donsker. Thus approximating only $L_n$ is in general insufficient for valid uniform inference, and it is necessary to capture the distributional properties of $E_n$ as well. % we do better The following lemma is an application of our strong approximation result for empirical processes based on the Yurinskii approximation, which builds on a refinement by \citet{belloni2019conditional}. \begin{lemma}[Conditional strong approximation of $E_n$] \label{lem:kernel_conditional_strong_approx_En} % Suppose Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold and take any $R_n \to \infty$. For each $n$ there exists $\tilde Z^E_n$ a mean-zero Gaussian process conditional on $\bA_n$ satisfying $\sup_{w \in \cW} \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^E(w) \big| \lesssim_\P \frac{(\log n)^{3/8} R_n}{n^{1/4}h^{3/8}}$, where $\E[\tilde Z_n^E(w)\tilde Z_n^E(w')\bigm\vert \bA_n] =n^2h\E[E_n(w)E_n(w')\bigm\vert \bA_n]$ for all $w, w' \in \cW$. % \end{lemma} The process $\tilde Z_n^E$ is a Gaussian process conditional on $\bA_n$ but is not in general a Gaussian process unconditionally. The following lemma constructs an unconditional Gaussian process $Z_n^E$ that approximates $\tilde Z_n^E$. 
\begin{lemma}[Unconditional strong approximation of $E_n$] \label{lem:kernel_unconditional_strong_approx_En} Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For each $n$ there exists a mean-zero Gaussian process $Z^E_n$ satisfying $\E\big[ \sup_{w \in \cW} \big|\tilde Z_n^E(w) - Z_n^E(w)\big| \big] \lesssim \frac{(\log n)^{2/3}}{n^{1/6}}$, where $Z_n^E$ is independent of $\bA_n$ and $\E[Z_n^E(w)Z_n^E(w')]=\E[\tilde Z_n^E(w)\tilde Z_n^E(w')] = n^2h \, \E[E_n(w)E_n(w')]$ for all $w, w' \in \cW$. % \end{lemma} Combining Lemmas \ref{lem:kernel_conditional_strong_approx_En} and~\ref{lem:kernel_unconditional_strong_approx_En}, we obtain an unconditional strong approximation for $E_n$. The resulting rate of approximation may not be optimal, due to the Yurinskii coupling, but to the best of our knowledge it is the first in the literature for the process $E_n$, and hence for $\hat{f}_W$ and its associated $t$-process in the context of dyadic data. The approximation rate is sufficiently fast to allow for optimal bandwidth choices; see Section \ref{sec:kernel_implementation} for more details. Strong approximation results for local empirical processes (e.g.\ \citealp{gine2010confidence}) are not applicable here because the summands in the non-negligible $E_n$ are not (conditionally) i.i.d. Likewise, neither standard empirical process and U-process theory \citep{van1996weak,gine2021mathematical} nor the recent results in \citet{davezies2021exchangeable} are applicable to the non-Donsker process $E_n$. The previous lemmas showed that $L_n$ is $\sqrt{n}$-consistent while $E_n$ is $\sqrt{n^2h}$-consistent (pointwise in $w$), showcasing the importance of careful standardization (cf.\ Studentization in Section~\ref{sec:kernel_implementation}) for the purpose of rate adaptivity to the unknown degeneracy type. In other words, a challenge in conducting uniform inference is that the finite-dimensional distributions of the stochastic process $L_n+E_n$, and hence those of $\hat{f}_W$ and its associated $t$-process $T_n$, may converge at different rates at different points $w\in\cW$. The following theorem provides an (infeasible) inference procedure which is fully adaptive to such potential unknown degeneracy. \begin{theorem}[Strong approximation of $T_n$] \label{thm:kernel_strong_approx_Tn} Suppose that Assumptions~\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold and $f_W(w) > 0$ on $\cW$, and take any $R_n \to \infty$. Then for each $n$ there exists a centered Gaussian process $Z_n^{T}$ such that % \begin{align*} &\sup_{w \in \cW} \left| T_n(w) - Z_n^{T}(w) \right| \lesssim_\P \! \frac{ n^{-1} \! \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3} + h^{p\wedge\beta}} {\Dl/\sqrt{n} + 1/\sqrt{n^2h}}, \end{align*} % where $\E[Z_n^T(w)Z_n^T(w')] = \E[T_n(w)T_n(w')]$ for all $w,w' \in \cW$. % \end{theorem} The first term in the numerator corresponds to the strong approximation for $L_n$ in Lemma~\ref{lem:kernel_strong_approx_Ln} and the error introduced by $Q_n$. The second and third terms correspond to the conditional and unconditional strong approximation errors for $E_n$ in Lemmas \ref{lem:kernel_conditional_strong_approx_En} and \ref{lem:kernel_unconditional_strong_approx_En}. The fourth term is from the smoothing bias result in Theorem~\ref{thm:kernel_bias}. The denominator is the lower bound on the standard deviation $\Sigma_n(w,w)^{1/2}$ formulated in Lemma~\ref{lem:kernel_variance_bounds}. 
In the absence of degenerate points ($\Dl > 0$) and if $n h^{7/2}\gtrsim 1$, Theorem~\ref{thm:kernel_strong_approx_Tn} offers a strong approximation of the $t$-process at the rate $(\log n)/\sqrt{n}+\sqrt{n}h^{p\wedge\beta}$, which matches the celebrated KMT approximation rate for i.i.d.\ data plus the smoothing bias. Therefore, our novel $t$-process strong approximation can achieve the optimal KMT rate for non-degenerate dyadic distributions provided that $p\wedge\beta \geq 3.5$. This is achievable if a fourth-order (boundary-adaptive) kernel is used and $f_W$ is sufficiently smooth. In the presence of partial or total degeneracy ($\Dl =0$), Theorem~\ref{thm:kernel_strong_approx_Tn} provides a strong approximation for the $t$-process at the rate $\sqrt{h}\log n + n^{-1/4}h^{-3/8}(\log n)^{3/8} R_n + n^{-1/6}(\log n)^{2/3} + n h^{1/2+p\wedge\beta}$. If, for example, $n h^{p\wedge\beta}\lesssim 1$, then our result can achieve a strong approximation rate of $n^{-1/7}$ up to $\log n $ terms. Theorem~\ref{thm:kernel_strong_approx_Tn} appears to be the first in the dyadic literature which is also robust to the presence of degenerate points in the underlying dyadic distribution. \subsection{Application: confidence bands} Theorem~\ref{thm:kernel_infeasible_ucb} constructs standardized confidence bands for $f_W$ which are infeasible as they depend on the unknown population variance $\Sigma_n$. In Section~\ref{sec:kernel_implementation} we will make this inference procedure feasible by proposing a valid estimator of the covariance function $\Sigma_n$ for Studentization, as well as developing bandwidth selection and robust bias correction methods. Before presenting our result on valid infeasible uniform confidence bands, we first impose in Assumption~\ref{ass:kernel_rates} some extra restrictions on the bandwidth sequence, which depend on the degeneracy type of the dyadic distribution, to ensure the coverage rate converges. \begin{assumption}[Rate restriction for uniform confidence bands] \label{ass:kernel_rates} Assume that one of the following holds: % \begin{enumerate}[label=(\roman*)] \item \label{it:kernel_rate_non} No degeneracy ($\Dl > 0$): $n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$, \item \label{it:kernel_rate_degen} Partial or total degeneracy ($\Dl = 0$): $n^{-2/3} (\log n)^{7/3} \ll h \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. \end{enumerate} \end{assumption} We now construct the infeasible uniform confidence bands. For $\alpha \in (0,1)$, let $q_{1-\alpha}$ be the quantile satisfying $ \P\left(\sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) = 1 - \alpha$. The following result employs the anti-concentration idea due to \citet{chernozhukov2014anti} to deduce valid standardized confidence bands, where we approximate the quantile of the unknown finite sample distribution of $\sup_{w\in\cW} |T_n(w)|$ by the quantile $q_{1-\alpha}$ of $\sup_{w\in\cW}|Z_n^T(w)|$. This approach offers a better rate of convergence than relying on extreme value theory for the distributional approximation, hence improving the finite sample performance of the proposed confidence bands. \begin{theorem}[Infeasible uniform confidence bands] \label{thm:kernel_infeasible_ucb} Suppose that Assumptions~\ref{ass:kernel_data},~\ref{ass:kernel_bandwidth}, and~\ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. 
Then
%
\begin{align*}
\P\left( f_W(w) \in \left[ \hat f_W(w) \pm q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \, \right] \, \textup{for all } w \in \cW \right) \to 1 - \alpha.
\end{align*}
%
\end{theorem}

By Theorem~\ref{thm:kernel_uniform_consistency}, the asymptotically optimal choice of bandwidth for uniform convergence is $h \asymp ((\log n)/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. As discussed in the next section, the approximate IMSE-optimal bandwidth is $h \asymp (1/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. Both bandwidth choices satisfy Assumption~\ref{ass:kernel_rates} only in the case of no degeneracy. The degenerate cases in Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_degen}, which require $p \wedge \beta > 1$, exhibit behavior more similar to that of standard nonparametric kernel-based estimation and so the aforementioned optimal bandwidth choices will lead to a non-negligible smoothing bias in the distributional approximation for $T_n$. Different approaches are available in the literature to address this issue, including undersmoothing or ignoring the bias \citep{hall2001bootstrapping}, bias correction \citep{hall1992effect}, robust bias correction \citep{calonico2018effect, calonico2022coverage}, and Lepskii's method \citep{lepskii1992asymptotically,birge2001alternative}, among others. In the next section we develop a feasible uniform inference procedure, based on robust bias correction methods, which amounts to first selecting an optimal bandwidth for the point estimator $\hat{f}_W$ using a $p$th-order kernel, and then correcting the bias of the point estimator while also adjusting the standardization (Studentization) when forming the $t$-statistic $T_n$. Importantly, regardless of the specific implementation details, Theorem~\ref{thm:kernel_infeasible_ucb} shows that any bandwidth sequence $h$ satisfying both \ref{it:kernel_rate_non} and \ref{it:kernel_rate_degen} in Assumption~\ref{ass:kernel_rates} leads to valid uniform inference which is robust and adaptive to the (unknown) degeneracy type.

\section{Implementation}
\label{sec:kernel_implementation}

We address outstanding implementation details to make our main uniform inference results feasible. In Section~\ref{sec:kernel_covariance_estimation} we propose a covariance estimator along with a modified version which is guaranteed to be positive semi-definite. This allows for the construction of fully feasible confidence bands in Section~\ref{sec:kernel_feasible_confidence_bands}. In Section~\ref{sec:kernel_bandwidth_selection} we discuss bandwidth selection and formalize our procedure for robust bias correction inference.

\subsection{Covariance function estimation}
\label{sec:kernel_covariance_estimation}

Define the following plug-in covariance function estimator of $\Sigma_n$. For $w, w' \in \cW$, let $S_i(w) = \frac{1}{n-1} \big( \sum_{j = 1}^{i-1} k_h(W_{j i}, w) + \sum_{j = i+1}^n k_h(W_{i j}, w) \big)$ estimate $\E[k_h(W_{i j},w) \mid A_i]$ and take
%
\begin{align*}
\hat \Sigma_n(w,w')
&= \frac{4}{n^2} \sum_{i=1}^n S_i(w) S_i(w')
- \frac{4}{n^2(n-1)^2} \sum_{i<j} k_h(W_{i j}, w) \, k_h(W_{i j}, w')
- \frac{4n-6}{n(n-1)} \hat f_W(w) \hat f_W(w').
\end{align*}
%
While $\hat\Sigma_n$ is a uniformly consistent estimator of $\Sigma_n$, it need not be positive semi-definite in finite samples, and hence need not be a valid covariance function for Studentization and resampling. To resolve this, consider the optimization problem
%
\begin{align}
\label{eq:kernel_sdp}
\min_{\Sigma} \ \sup_{w, w' \in \cW} \big| \Sigma(w,w') - \hat \Sigma_n(w,w') \big|
\quad \text{subject to } \Sigma \text{ positive semi-definite},
\end{align}
%
which seeks a valid covariance function as close as possible to $\hat\Sigma_n$ in the uniform norm.

\begin{lemma}[Covariance function estimation]
\label{lem:kernel_sdp}
Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold and $f_W(w) > 0$ on $\cW$.
Then
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}.
\end{align*} % The optimization problem \eqref{eq:kernel_sdp} is a semi-definite program \citep[SDP,][]{laurent2005semidefinite} and has an approximately optimal solution $\hat\Sigma_n^+$ satisfying % \begin{align*} \sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}. \end{align*} % \end{lemma} In practice we take $w, w' \in \cW_d$ where $\cW_d$ is a finite subset of $\cW$, typically taken to be an equally-spaced grid. This yields finite-dimensional covariance matrices, for which \eqref{eq:kernel_sdp} can be solved in polynomial time in $|\cW_d|$ using a general-purpose SDP solver \citep[e.g.\ by interior point methods,][]{laurent2005semidefinite}. The number of points in $\cW_d$ should be taken as large as is computationally practical in order to generate confidence bands rather than merely simultaneous confidence intervals. It is worth noting that the complexity of solving \eqref{eq:kernel_sdp} does not depend on the number of vertices $n$, and so does not influence the ability of our methodology to handle large and possibly sparse networks. The bias-corrected variance estimator in \citet[Section~3.2]{matsushita2021jackknife} takes a similar form to our estimator $\hat\Sigma_n$ but in the parametric setting, and is therefore also not guaranteed to be positive semi-definite in finite samples. Our approach addresses this issue, ensuring a positive semi-definite estimator $\hat\Sigma_n^+$ is always available. \subsection{Feasible confidence bands} \label{sec:kernel_feasible_confidence_bands} Given a choice of the kernel order $p$ and a bandwidth $h$, we construct a valid confidence band that is implementable in practice. Define the Studentized $t$-statistic process % \begin{align*} \hat T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\hat \Sigma_n^+(w,w)}}. \end{align*} % Let $\hat Z_n^T(w)$ be a process which, conditional on the data $\bW_n$, is mean-zero and Gaussian, whose conditional covariance structure is $\E\big[ \hat Z_n^T(w) \hat Z_n^T(w') \bigm\vert \bW_n \big] = \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$. For $\alpha \in (0,1)$, let $\hat q_{1-\alpha}$ be the conditional quantile satisfying $\P\big(\sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq \hat q_{1-\alpha} \bigm\vert \bW_n \big) = 1 - \alpha$, which is shown to be well defined in Section~\ref{sec:kernel_app_proofs}. \begin{theorem}[Feasible uniform confidence bands] \label{thm:kernel_ucb} Suppose that Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then % \begin{align*} \P\left( f_W(w) \in \left[ \hat f_W(w) \pm \hat q_{1-\alpha} \sqrt{\hat\Sigma_n^+(w,w)} \,\right] \,\textup{for all } w \in \cW \right) \to 1 - \alpha. \end{align*} % \end{theorem} Recently, \citet{chiang2022inference} derived high-dimensional central limit theorems over rectangles for exchangeable arrays and applied them to construct simultaneous confidence intervals for a sequence of design points. Their inference procedure relies on the multiplier bootstrap, and their conditions for valid inference depend on the number of design points considered. In contrast, Theorem~\ref{thm:kernel_ucb} constructs a feasible uniform confidence band over the entire domain of inference $\cW$ based on our strong approximation results for the whole $t$-statistic process and the covariance estimator $\hat\Sigma_n^+$. 
The required rate condition specified in Assumption~\ref{ass:kernel_rates} does not depend on the number of design points. Furthermore, our proposed inference methods are robust to potential unknown degenerate points in the underlying dyadic data generating process. In practice, suprema over $\cW$ can be replaced by maxima over sufficiently many design points in $\cW$. The conditional quantile $\hat q_{1-\alpha}$ can be estimated by Monte Carlo simulation, resampling from the Gaussian process defined by the law of $\hat Z_n^T \mid \bW_n$. The bandwidth restrictions in Theorem~\ref{thm:kernel_ucb} are the same as those for the infeasible version given in Theorem~\ref{thm:kernel_infeasible_ucb}, namely those imposed in Assumption \ref{ass:kernel_rates}. This follows from the rates of convergence obtained in Lemma~\ref{lem:kernel_sdp}, coupled with some careful technical work given in Section~\ref{sec:kernel_app_proofs} to handle the potential presence of degenerate points in $\Sigma_n$. \subsection{Bandwidth selection and robust bias-corrected inference} \label{sec:kernel_bandwidth_selection} We give practical suggestions for selecting the bandwidth parameter $h$. Let $\nu(w)$ be a non-negative real-valued function on $\cW$ and suppose we use a kernel of order $p < \beta$ of the form $k_h(s,w) = K\big((s-w) / h\big)/h$. The $\nu$-weighted asymptotic IMSE (AIMSE) is minimized by % \begin{align*} h^*_{\AIMSE} &= \left( \frac{p!(p-1)! \Big(\int_\cW f_W(w) \nu(w) \diff{w}\Big) \Big(\int_\R K(w)^2 \diff{w}\Big)} {2 \Big( \int_{\cW} f_W^{(p)}(w)^2 \nu(w) \diff{w} \Big) \Big( \int_\R w^p K(w) \diff{w} \Big)^2 } \right)^{\frac{1}{2p+1}} \left( \frac{n(n-1)}{2} \right)^{-\frac{1}{2p+1}}. \end{align*} % This is akin to the AIMSE-optimal bandwidth choice for traditional monadic kernel density estimation with a sample size of $\frac{1}{2}n(n-1)$. The choice $h^*_{\AIMSE}$ is slightly undersmoothed (up to a polynomial $\log n$ factor) relative to the uniform minimax-optimal bandwidth choice discussed in Section~\ref{sec:kernel_point_estimation}, but it is easier to implement in practice. To implement the AIMSE-optimal bandwidth choice, we propose a simple rule-of-thumb (ROT) approach based on Silverman's rule. Suppose $p\wedge\beta=2$ and let $\hat\sigma^2$ and $\hat I$ be the sample variance and sample interquartile range respectively of the data $\bW_n$. Then $\hat{h}_{\ROT} = C(K) \big( \hat\sigma \wedge \frac{\hat I}{1.349} \big) \big(\frac{n(n-1)}{2} \big)^{-1/5}$, where we have $C(K)=2.576$ for the triangular kernel $K(w) = (1 - |w|) \vee 0$, and $C(K)=2.435$ for the Epanechnikov kernel $K(w) = \frac{3}{4}(1 - w^2) \vee 0$. The AIMSE-optimal bandwidth selector $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ and any of its feasible estimators only satisfy Assumption~\ref{ass:kernel_rates} in the case of no degeneracy ($\Dl>0$). Under partial or total degeneracy, such bandwidths are not valid due to the usual leading smoothing (or misspecification) bias of the distributional approximation. To circumvent this problem and construct feasible uniform confidence bands for $f_W$, we employ the following robust bias correction approach. \begin{algorithm}[b!] \caption{Feasible uniform confidence bands} \label{alg:kernel_method} \setstretch{1.5} Choose a kernel $k_h$ of order $p \geq 2$ satisfying Assumption~\ref{ass:kernel_bandwidth}. \\ Select a bandwidth $h \approx h^*_{\AIMSE}$ for $k_h$ as in Section~\ref{sec:kernel_bandwidth_selection}, perhaps using $h = \hat{h}_{\ROT}$. 
\\ Choose another kernel $k_h'$ of order $p'>p$ satisfying Assumption~\ref{ass:kernel_bandwidth}. For $d \geq 1$, choose a set of $d$ distinct evaluation points $\cW_d$. \\ For each $w \in \cW_d$, construct the density estimate $\hat f_W(w)$ using $k'_{h}$ as in Section~\ref{sec:kernel_introduction}. \\ For $w, w' \in \cW_d$, estimate the covariance $\hat \Sigma_n(w,w')$ using $k'_{h}$ as in Section~\ref{sec:kernel_covariance_estimation}. \\ Construct positive semi-definite covariance estimate $\hat \Sigma_n^+$ as in Section~\ref{sec:kernel_covariance_estimation}. \\ For $B \geq 1$, let $(\hat Z_{n,r}^T: 1\leq r\leq B)$ be i.i.d.\ from $\hat{Z}_n^T$ as in Section~\ref{sec:kernel_feasible_confidence_bands}. \\ For $\alpha \in (0,1)$, set $\hat q_{1-\alpha} = \inf_{q \in \R} \{ q : \# \{r: \max_{w\in\cW_d}|\hat Z_{n,r}^T(w)| \leq q \} \geq B(1-\alpha) \}$. \\ Construct $ \big[\hat f_W(w) \pm \hat q_{1-\alpha} \hat\Sigma_n^+(w,w)^{1/2} \big]$ for each $w \in \cW_d$. % \end{algorithm} Firstly, estimate the bandwidth $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ using a kernel of order $p$, which leads to an AIMSE-optimal point estimator $\hat{f}_W$ in an $L^2(\nu)$ sense. Then use this bandwidth and a kernel of order $p' > p$ to construct the statistic $\hat T_n$ and the confidence band as detailed in Section~\ref{sec:kernel_feasible_confidence_bands}. Importantly, both $\hat{f}_W$ and $\hat{\Sigma}^+_n$ are recomputed with the new higher-order kernel. The change in centering is equivalent to a bias correction of the original AIMSE-optimal point estimator, while the change in scale captures the additional variability introduced by the bias correction itself. As shown formally in \citet{calonico2018effect, calonico2022coverage} for the case of kernel-based density estimation with i.i.d.\ data, this approach leads to higher-order refinements in the distributional approximation whenever additional smoothness is available ($p'\leq\beta$). In the present dyadic setting, this procedure is valid so long as $n^{-2/3} (\log n)^{7/3} \ll n^{-\frac{2}{2p+1}} \ll (n^2 \log n)^{-\frac{1}{2p' + 1}}$, which is equivalent to $2 \leq p < p'$. For concreteness, we recommend taking $p = 2$ and $p' = 4$, and using the rule-of-thumb bandwidth choice $\hat{h}_{\ROT}$ defined above. In particular, this approach automatically delivers a KMT-optimal strong approximation whenever there are no degeneracies in the underlying dyadic data generating process. Our feasible robust bias correction method based on AIMSE-optimal dyadic kernel density estimation for constructing uniform confidence bands for $f_W$ is summarized in Algorithm~\ref{alg:kernel_method}. \section{Simulations} \label{sec:kernel_simulations} We investigate the empirical finite-sample performance of the kernel density estimator with dyadic data using simulations. The family of dyadic distributions defined in Section~\ref{sec:kernel_degeneracy}, with its three parameterizations, is used to generate data sets with different degeneracy types. We use two different boundary bias-corrected Epanechnikov kernels of orders $p=2$ and $p=4$ respectively, on the inference domain $\cW = [-2,2]$. We select an optimal bandwidth for $p=2$ as recommended in Section~\ref{sec:kernel_bandwidth_selection}, using the rule-of-thumb with $C(K) = 2.435$. The semi-definite program in Section~\ref{sec:kernel_covariance_estimation} is solved with the MOSEK interior point optimizer \citep{mosek}, ensuring positive semi-definite covariance estimates. 
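For illustration, the finite-dimensional version of the positive semi-definite projection in \eqref{eq:kernel_sdp} can be set up in a few lines of Julia through the generic JuMP modeling interface. The sketch below is illustrative only: the function name \texttt{psd\_project} is hypothetical, the objective is stated in the entrywise sup-norm over the grid $\cW_d$, and the open-source SCS solver is called for concreteness, although any SDP-capable solver (including MOSEK, as used in our simulations) can be substituted; the actual implementation is provided in the replication package.
%
\begin{verbatim}
# Illustrative sketch only: project an estimated covariance matrix onto
# the positive semi-definite cone, minimizing the entrywise sup-norm
# discrepancy, formulated as a semi-definite program.
using JuMP, LinearAlgebra
import SCS

function psd_project(Sigma_hat::AbstractMatrix)
    d = size(Sigma_hat, 1)
    model = Model(SCS.Optimizer)        # solver choice is an assumption
    set_silent(model)
    @variable(model, X[1:d, 1:d], PSD)  # candidate covariance matrix
    @variable(model, t >= 0)            # sup-norm error bound
    @constraint(model, [i = 1:d, j = 1:d], X[i, j] - Sigma_hat[i, j] <= t)
    @constraint(model, [i = 1:d, j = 1:d], Sigma_hat[i, j] - X[i, j] <= t)
    @objective(model, Min, t)
    optimize!(model)
    return Symmetric(value.(X))
end
\end{verbatim}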
Gaussian vectors are resampled $B = 10\,000$ times. \begin{figure}[b!] \centering % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/outcome_plot_total.pdf} \caption{Total degeneracy, \\ $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/outcome_plot_partial.pdf} \caption{Partial degeneracy, \\ $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/outcome_plot_none.pdf} \caption{No degeneracy, \\ $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} \end{subfigure} % \caption[Typical outcomes for different values of the parameter $\pi$] {Typical outcomes for three different values of the parameter $\pi$.} % \label{fig:kernel_results} % \end{figure} In Figure~\ref{fig:kernel_results} we plot a typical outcome for each of the three degeneracy types (total, partial, none), using the Epanechnikov kernel of order $p=2$, with sample size $n=100$ (so $N=4950$ pairs of nodes) and with $d=100$ equally-spaced evaluation points. Each plot contains the true density function $f_W$, the dyadic kernel density estimate $\hat f_W$ and two different approximate $95\%$ confidence bands for $f_W$. The first is the uniform confidence band (UCB) constructed using one of our main results, Theorem~\ref{thm:kernel_ucb}. The second is a sequence of pointwise confidence intervals (PCI) constructed by finding a confidence interval for each evaluation point separately. We show only $10$ pointwise confidence intervals for clarity. In general, the PCIs are too narrow as they fail to provide simultaneous (uniform) coverage over the evaluation points. Note that under partial degeneracy the confidence band narrows near the degenerate point $w = 0$. \begin{table}[b!] \centering \begin{tabular}{|c|c|c|c|c|cc|cc|} \hline \multirow{2}{*}{$ \pi $} & \multirow{2}{*}{Degeneracy type} & \multirow{2}{*}{$ \hat h_{\ROT} $} & \multirow{2}{*}{$ p $} & \multirow{2}{*}{RIMSE} & \multicolumn{2}{|c|}{UCB} & \multicolumn{2}{|c|}{PCI} \\ \cline{6-9} & & & & & CR & AW & CR & AW \\ \hline \multirow{2}{*}{$ \left(\frac{1}{2}, 0, \frac{1}{2}\right) $} & \multirow{2}{*}{Total} & \multirow{2}{*}{0.161} & 2 & 0.00048 & 87.1\% & 0.0028 & 6.5\% & 0.0017 \\ & & & 4 & 0.00068 & 95.2\% & 0.0042 & 9.7\% & 0.0025 \\ \hline \multirow{2}{*}{$ \left(\frac{1}{4}, 0, \frac{3}{4}\right) $} & \multirow{2}{*}{Partial} & \multirow{2}{*}{0.158} & 2 & 0.00228 & 94.5\% & 0.0112 & 75.6\% & 0.0083 \\ & & & 4 & 0.00234 & 94.7\% & 0.0124 & 65.3\% & 0.0087 \\ \hline \multirow{2}{*}{$ \left(\frac{1}{5}, \frac{1}{5}, \frac{3}{5}\right) $} & \multirow{2}{*}{None} & \multirow{2}{*}{0.145} & 2 & 0.00201 & 94.2\% & 0.0106 & 73.4\% & 0.0077 \\ & & & 4 & 0.00202 & 95.6\% & 0.0117 & 64.3\% & 0.0080 \\ \hline \end{tabular} \caption[Numerical results for three values of the parameter $\pi$]{ Numerical results for three values of the parameter $\pi$.} \label{tab:kernel_results} \end{table} Next, Table~\ref{tab:kernel_results} presents numerical results. For each degeneracy type (total, partial, none) and each kernel order ($p=2$, $p=4$), we run $2000$ repeats with sample size $n=3000$ (giving $N=4\,498\,500$ pairs of nodes) and with $d=50$ equally-spaced evaluation points. We record the average rule-of-thumb bandwidth $\hat{h}_{\ROT}$ and the average root integrated mean squared error (RIMSE). 
For both the uniform confidence bands (UCB) and the pointwise confidence intervals (PCI), we report the coverage rate (CR) and the average width (AW). % The lower-order kernel ($p=2$) ignores the bias, leading to good RIMSE performance and acceptable UCB coverage under partial or no degeneracy, but gives invalid inference under total degeneracy. In contrast, the higher-order kernel ($p=4$) provides robust bias correction and hence improves the coverage of the UCB in every regime, particularly under total degeneracy, at the cost of increasing both the RIMSE and the average widths of the confidence bands. % As expected, the pointwise (in $w\in\cW$) confidence intervals (PCIs) severely undercover in every regime. Thus our simulation results show that the proposed feasible inference methods based on robust bias correction and proper Studentization deliver valid uniform inference which is robust to unknown degenerate points in the underlying dyadic distribution. \section{Counterfactual dyadic density estimation} \label{sec:kernel_counterfactual} To further showcase the applicability of our main results, we develop a kernel density estimator for dyadic counterfactual distributions. The aim of such counterfactual analysis is to estimate the distribution of an outcome variable had some covariates followed a distribution different from the actual one, and it is important in causal inference and program evaluation settings \citep{dinardo1996distribution,chernozhukov2013inference}. For each $r \in \{0,1\}$, let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be random variables as defined in Assumption~\ref{ass:kernel_data} and $\bX_n^r = (X_1^r, \ldots, X_n^r)$ be some covariates. We assume that $(A_i^r, X_i^r)$ are independent over $1 \leq i \leq n$ and that $\bX_n^r$ is independent of $\bV_n^r$, that $W_{i j}^r \mid X_i^r, X_j^r$ has a conditional Lebesgue density $f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$, that $X_i^r$ has distribution function $F_X^r$ on a common support $\cX$, and that $(\bA_n^0, \bV_n^0, \bX_n^0)$ is independent of $(\bA_n^1, \bV_n^1, \bX_n^1)$. We interpret $r$ as an index for two populations, labeled $0$ and $1$. The counterfactual density of population $1$ had it followed the same covariate distribution as population $0$ is % \begin{align*} f_W^{1 \triangleright 0}(w) &= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right] \\ &= \int_{\cX} \int_{\cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) \diff F_X^{1}(x_1) \diff F_X^{1}(x_2), \end{align*} % where $\psi(x) = \diff F_X^0(x) / \diff F_X^1(x)$ for $x \in \cX$ is a Radon--Nikodym derivative. If $X^0_i$ and $X^1_i$ have Lebesgue densities, it is natural to consider a parametric model of the form $\diff F_X^{r}(x)=f_X^r(x;\theta)\diff x$ for some finite-dimensional parameter $\theta$. Alternatively, if the covariates $\bX_n^r$ are discrete and have a positive probability mass function $p_X^r(x)$ on a finite support $\cX$, the object of interest becomes $f_W^{1 \triangleright 0}(w) = \sum_{x_1 \in \cX} \sum_{x_2 \in \cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) p_X^{1}(x_1) p_X^{1}(x_2)$, where $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$.
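To see why reweighting by $\psi$ yields the counterfactual density, note that $\psi(x) p_X^1(x) = p_X^0(x)$ in the discrete case, so
%
\begin{align*}
\sum_{x_1 \in \cX} \sum_{x_2 \in \cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) p_X^{1}(x_1) p_X^{1}(x_2)
= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right],
\end{align*}
%
recovering exactly the definition of $f_W^{1 \triangleright 0}(w)$: the population-$1$ conditional density is averaged against the population-$0$ covariate distribution.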
We consider discrete covariates for simplicity, and hence the counterfactual dyadic kernel density estimator is
%
\begin{align*}
\hat f_W^{\,1 \triangleright 0}(w)
&= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n \hat \psi(X_i^1) \hat \psi(X_j^1) k_h(W_{i j}^1, w),
\end{align*}
%
where $\hat\psi(x) = \hat p_X^{\,0}(x) / \hat p_X^{\,1}(x)$ and $\hat p_X^{\,r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$, with $\I$ the indicator function. Section~\ref{sec:kernel_app_main} provides technical details: we show how an asymptotic linear representation for $\hat\psi(x)$ leads to a Hoeffding-type decomposition of $\hat f_W^{\,1 \triangleright 0}(w)$, which is then used to establish that $\hat f_W^{\,1 \triangleright 0}$ is uniformly consistent for $f_W^{1 \triangleright 0}$ and also admits a Gaussian strong approximation, with the same rates of convergence as for the standard density estimator. Furthermore, define the covariance function of $\hat f_W^{\,1 \triangleright 0}(w)$ as $\Sigma_n^{1 \triangleright 0}(w,w') = \Cov\big[ \hat f_W^{\,1 \triangleright 0}(w), \hat f_W^{\,1 \triangleright 0}(w') \big]$, which can be estimated as follows. First let $\hat\kappa(X_i^0, X_i^1, x) = \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} - \frac{\hat p_X^0(x)}{\hat p_X^1(x)} \frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)}$ be a plug-in estimate of the influence function for $\hat\psi(x)$ and define the leave-one-out conditional expectation estimators $S_i^{1 \triangleright 0}(w) = \frac{1}{n-1} \big( \sum_{j=1}^{i-1} k_h(W_{j i}^1,w) \hat\psi(X_j^1) + \sum_{j=i+1}^n k_h(W_{i j}^1,w) \hat\psi(X_j^1) \big)$ and $\tilde S_i^{1 \triangleright 0}(w) = \frac{1}{n-1} \sum_{j=1}^n \I\{j \neq i\} \hat\kappa(X_i^0, X_i^1, X_j^1) S_j^{1 \triangleright 0}(w)$. Define the covariance estimator
%
\begin{align*}
\hat\Sigma_n^{1 \triangleright 0}(w,w')
&= \frac{4}{n^2} \sum_{i=1}^n
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) + \tilde S_i^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') + \tilde S_i^{1 \triangleright 0}(w') \big) \\
&\quad- \frac{4}{n^3(n-1)} \sum_{i<j}
\Big(
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) + \tilde S_i^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_j^1) S_j^{1 \triangleright 0}(w') + \tilde S_j^{1 \triangleright 0}(w') \big) \\
&\qquad\qquad+
\big( \hat\psi(X_j^1) S_j^{1 \triangleright 0}(w) + \tilde S_j^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') + \tilde S_i^{1 \triangleright 0}(w') \big)
\Big).
\end{align*}
%
Arguing as in Section~\ref{sec:kernel_implementation}, a positive semi-definite modification of $\hat\Sigma_n^{1 \triangleright 0}$ can then be used to Studentize $\hat f_W^{\,1 \triangleright 0}$ and to construct feasible uniform confidence bands for $f_W^{1 \triangleright 0}$; see Section~\ref{sec:kernel_app_main} for details.

\section{Other statistical applications}
\label{sec:kernel_other_applications}

Our strong approximation results apply to dyadic estimation problems beyond density estimation. As a second example, consider nonparametric regression with dyadic data: suppose that each node $i$ carries covariates $X_i \in \R^m$, each pair $(i,j)$ carries an outcome $Y_{i j}$, and the estimand is the regression function $\mu(x_1, x_2) = \E[Y_{i j} \mid X_i = x_1, X_j = x_2]$. For an integer $\gamma \geq 0$, let $r(u_1, u_2)$ denote the column vector collecting all monomials in $(u_1, u_2) \in \R^m \times \R^m$ of degree at most $\gamma$. For a bandwidth $h > 0$ and a kernel function $k_h$ on $\R^m \times \R^m$, the local polynomial regression estimator of $\mu(x_1, x_2)$ is $\hat\mu(x_1, x_2) = e_1^\T \hat\beta(x_1, x_2)$ where $e_1$ is the first standard unit vector in $\R^q$ for $q=\binom{2m+\gamma}{\gamma}$ and
%
\begin{align}
\nonumber
\hat{\beta}(x_1, x_2)
&= \argmin_{\beta \in \R^q} \sum_{i=1}^{n-1} \sum_{j=i+1}^n
\left( Y_{i j} - r(X_i-x_1, X_j-x_2)^\T \beta \right)^2 k_h(X_i-x_1, X_j-x_2) \\
\label{eq:kernel_locpol}
&= \left( \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} r_{i j}^\T \right)^{-1}
\left( \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} Y_{i j} \right),
\end{align}
%
with $k_{i j} = k_h(X_i-x_1, X_j-x_2)$ and $r_{i j} = r(X_i-x_1, X_j-x_2)$. \citet{graham2021minimax} established pointwise distribution theory for the special case of the dyadic Nadaraya--Watson kernel regression estimator ($\gamma=0$), but no uniform analogues have yet been given. It can be shown that the ``denominator'' matrix in \eqref{eq:kernel_locpol} converges uniformly to its expectation, while the U-process-like ``numerator'' vector can be handled the same way as we analyzed $\hat f_W(w)$ in this chapter, through a Hoeffding-type decomposition and strong approximation methods, along with standard bias calculations.
Such distributional approximation results can be used to construct valid uniform confidence bands for the regression function $\mu(x_1, x_2)$, as well as to conduct hypothesis testing for parametric specifications or shape constraints. As a third example, we consider applying our results to semiparametric semi-linear regression problems. The dyadic semi-linear regression model is $\E[Y_{i j} \mid W_{i j}, X_i, X_j] = \theta^\T W_{i j} + g(X_i, X_j)$ where $\theta$ is the finite-dimensional parameter of interest and $g(X_i, X_j)$ is an unknown function of the covariates $(X_i, X_j)$. Local polynomial (or other) methods can be used to estimate $\theta$ and $g$, where the estimator of the nonparametric component $g$ takes a similar form to \eqref{eq:kernel_locpol}, that is, a ratio of two kernel-based estimators as in \eqref{eq:kernel_estimator}. Consequently, the strong approximation techniques presented in this chapter can be appropriately modified to develop valid uniform inference procedures for $g$ and $\E[Y_{i j} \mid W_{i j}=w, X_i=x_1, X_j=x_2]$, as well as functionals thereof. \section{Conclusion} \label{sec:kernel_conclusion} We studied the uniform estimation and inference properties of the dyadic kernel density estimator $\hat{f}_W$ given in \eqref{eq:kernel_estimator}, which forms a class of U-process-like estimators indexed by the $n$-varying kernel function $k_h$ on $\cW$. We established uniform minimax-optimal point estimation results and uniform distributional approximations for this estimator based on novel strong approximation strategies. We then applied these results to derive valid and feasible uniform confidence bands for the dyadic density estimand $f_W$, and also developed a substantive application of our theory to counterfactual dyadic density analysis. We gave some other statistical applications of our methodology as well as potential avenues for future research. From a technical perspective, Appendix~\ref{app:kernel} contains several generic results concerning strong approximation methods and maximal inequalities for empirical processes that may be of independent interest. Implementations of this chapter's methodology, along with replication files for the empirical results, are provided by a Julia package available at \github{wgunderwood/DyadicKDE.jl}. This work is based on \citet{cattaneo2024uniform}, and has been presented by Cattaneo at the Columbia University Biostatistics Colloquium Seminar (2022) and the Georgia Institute of Technology Statistics Seminar (2022), by Feng at the Renmin University Econometrics Seminar (2022), the Xiamen University Symposium on Modern Statistics (2022), the Peking University Econometrics Seminar (2023), and the Asian Meeting of the Econometric Society in East and Southeast Asia, Singapore (2023), and by Underwood at the University of Illinois Statistics Seminar (2024), the University of Michigan Statistics Seminar (2024), and the University of Pittsburgh Statistics Seminar (2024). \chapter[Yurinskii's Coupling for Martingales]% {Yurinskii's Coupling \\ for Martingales} \label{ch:yurinskii} % abstract Yurinskii's coupling is a popular theoretical tool for non-asymptotic distributional analysis in mathematical statistics and applied probability, offering a Gaussian strong approximation with an explicit error bound under easily verified conditions. 
Originally stated in $\ell^2$-norm for sums of independent random vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some strong conditions. We present as our main result a Yurinskii coupling for approximate martingales in $\ell^p$-norm, under substantially weaker conditions than those previously imposed. Our formulation further allows for the coupling variable to follow a more general Gaussian mixture distribution, and we provide a novel third-order coupling method which gives tighter approximations in certain settings. We specialize our main result to mixingales, martingales, and independent data, and derive uniform Gaussian mixture strong approximations for martingale empirical processes. Substantive applications of our theory to nonparametric partitioning-based and local polynomial regression procedures are provided. \section{Introduction} Yurinskii's coupling \citep{yurinskii1978error} has proven to be an important theoretical tool for developing non-asymptotic distributional approximations in mathematical statistics and applied probability. For a sum $S$ of $n$ independent zero-mean $d$-dimensional random vectors, this coupling technique constructs (on a suitably enlarged probability space) a zero-mean $d$-dimensional Gaussian vector $T$ with the same covariance matrix as $S$ and which is close to $S$ in probability, bounding the discrepancy $\|S-T\|$ as a function of $n$, $d$, the choice of the norm, and some features of the underlying distribution. See, for example, \citet[Chapter 10]{pollard2002user} for a textbook introduction. When compared to other coupling approaches, such as the celebrated Hungarian construction \citep{komlos1975approximation} or Zaitsev's coupling \citep{zaitsev1987estimates,zaitsev1987gaussian}, Yurinskii's approach stands out for its simplicity, robustness, and wider applicability, while also offering tighter couplings in some applications (see below for more discussion and examples). These features have led many scholars to use Yurinskii's coupling to study the distributional features of high-dimensional statistical procedures in a variety of settings, often with the end goal of developing uncertainty quantification or hypothesis testing methods. For example, in recent years, Yurinskii's coupling has been used to construct Gaussian approximations for the suprema of empirical processes \citep{chernozhukov2014gaussian}; to establish distribution theory for non-Donsker stochastic $t$-processes generated in nonparametric series regression \citep{belloni2015some}; to prove distributional approximations for high-dimensional $\ell^p$-norms \citep{biau2015high}; to develop distribution theory for vector-valued martingales \citep{belloni2018high,li2020uniform}; to derive a law of the iterated logarithm for stochastic gradient descent optimization methods \citep{anastasiou2019normal}; to establish uniform distributional results for nonparametric high-dimensional quantile processes \citep{belloni2019conditional}; to develop distribution theory for non-Donsker stochastic $t$-processes generated in partitioning-based series regression \citep{cattaneo2020large}; to deduce Bernstein--von Mises theorems in high-dimensional settings \citep{ray2021bernstein}; and to develop distribution theory for non-Donsker U-processes based on dyadic network data \citep{cattaneo2024uniform}. 
There are also many other early applications of Yurinskii's coupling: \citet{dudley1983invariance} and \citet{dehling1983limit} establish invariance principles for Banach space-valued random variables, and \citet{lecam1988} and \citet{sheehy1992uniform} obtain uniform Donsker results for empirical processes, to name just a few. This chapter presents a new Yurinskii coupling which encompasses and improves upon all of the results previously available in the literature, offering four new features: % \begin{enumerate}[label=(\roman*),leftmargin=*] \item \label{it:yurinskii_contribution_approximate_martingale} It applies to vector-valued \textit{approximate martingale} data. \item \label{it:yurinskii_contribution_gaussian_mixture} It allows for a \textit{Gaussian mixture} coupling distribution. \item \label{it:yurinskii_contribution_degeneracy} It imposes \textit{no restrictions on degeneracy} of the data covariance matrix. \item \label{it:yurinskii_contribution_third_order} It establishes a \textit{third-order} coupling to improve the approximation in certain situations. \end{enumerate} % Closest to our work are the unpublished manuscript by \citet{belloni2018high} and the recent paper by \citet{li2020uniform}, which both investigated distribution theory for martingale data using Yurinskii's coupling and related methods. Specifically, \citet{li2020uniform} established a Gaussian $\ell^2$-norm Yurinskii coupling for mixingales and martingales under the assumption that the covariance structure has a minimum eigenvalue bounded away from zero. As formally demonstrated in this chapter (Section~\ref{sec:yurinskii_kde}), such eigenvalue assumptions can be prohibitively strong in practically relevant applications. In contrast, our Yurinskii coupling does not impose any restrictions on covariance degeneracy \ref{it:yurinskii_contribution_degeneracy}, in addition to offering several other new features not present in \citet{li2020uniform}, including \ref{it:yurinskii_contribution_approximate_martingale}, \ref{it:yurinskii_contribution_gaussian_mixture}, \ref{it:yurinskii_contribution_third_order}, and applicability to general $\ell^p$-norms. In addition, we correct a slight technical inaccuracy in their proof relating to the derivation of bounds in probability (Remark \ref{rem:yurinskii_coupling_bounds_probability}). \citet{belloni2018high} did not establish a Yurinskii coupling for martingales, but rather a central limit theorem for smooth functions of high-dimensional martingales using the celebrated second-order Lindeberg method \citep[see][and references therein]{chatterjee2006generalization}, explicitly accounting for covariance degeneracy. As a consequence, their result could be leveraged to deduce a Yurinskii coupling for martingales with additional, non-trivial technical work (see Section~\ref{sec:yurinskii_app_proofs} in Appendix~\ref{app:yurinskii} for details). Nevertheless, a Yurinskii coupling derived from \citet{belloni2018high} would not feature \ref{it:yurinskii_contribution_approximate_martingale}, \ref{it:yurinskii_contribution_gaussian_mixture}, \ref{it:yurinskii_contribution_third_order}, or general $\ell^p$-norms, as our results do. We discuss further the connections between our work and the related literature in the upcoming sections, both when introducing our main theoretical results and when presenting the examples and statistical applications. 
The most general coupling result of this chapter (Theorem~\ref{thm:yurinskii_sa_dependent}) is presented in Section~\ref{sec:yurinskii_main_results}, where we also specialize it to a slightly weaker yet more user-friendly formulation (Proposition~\ref{pro:yurinskii_sa_simplified}). Our Yurinskii coupling for approximate martingales is a strict generalization of all previous Yurinskii couplings available in the literature, offering a Gaussian mixture strong approximation for approximate martingale vectors in $\ell^p$-norm, with an improved rate of approximation when the third moments of the data are negligible, and with no assumptions on the spectrum of the data covariance matrix. A key technical innovation underlying the proof of Theorem~\ref{thm:yurinskii_sa_dependent} is that we explicitly account for the possibility that the minimum eigenvalue of the variance may be zero, or its lower bound may be unknown, with the argument proceeding using a carefully tailored regularization. Establishing a coupling to a Gaussian mixture distribution is achieved by an appropriate conditioning argument, leveraging a conditional version of Strassen's theorem established by \citet{chen2020jackknife}, along with some related technical work detailed in Section~\ref{sec:yurinskii_app_proofs}. A third-order coupling is obtained via a modification of a standard smoothing technique for Borel sets from classical versions of Yurinskii's coupling, enabling improved approximation errors whenever third moments are negligible. In Proposition~\ref{pro:yurinskii_sa_simplified}, we explicitly tune the parameters of the aforementioned regularization to obtain a simpler, parameter-free version of Yurinskii's coupling for approximate martingales, again offering Gaussian mixture coupling distributions and an improved third-order approximation error. This specialization of our main result takes an agnostic approach to potential singularities in the data covariance matrix and, as such, may be improved in specific applications where additional knowledge of the covariance structure is available. Section~\ref{sec:yurinskii_main_results} also presents some further refinements when additional structure is imposed, deriving Yurinskii couplings for mixingales, martingales, and independent data as Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep}, respectively. We take the opportunity to discuss and correct in Remark~\ref{rem:yurinskii_coupling_bounds_probability} a technical issue which is often neglected \citep{pollard2002user, li2020uniform} when using Yurinskii's coupling to derive bounds in probability. Section~\ref{sec:yurinskii_factor} presents a stylized example portraying the relevance of our main technical results in the context of canonical factor models, illustrating the importance of each of our new Yurinskii coupling features \ref{it:yurinskii_contribution_approximate_martingale}--% \ref{it:yurinskii_contribution_third_order}. Section~\ref{sec:yurinskii_emp_proc} considers a substantive application of our main results: strong approximation of martingale empirical processes. We begin with the motivating example of canonical kernel density estimation, demonstrating how Yurinskii's coupling can be applied, and showing in Lemma~\ref{lem:yurinskii_kde_eigenvalue} why it is essential that we do not place any conditions on the minimum eigenvalue of the variance matrix \ref{it:yurinskii_contribution_degeneracy}. 
We then present a general-purpose strong approximation for martingale empirical processes in Proposition~\ref{pro:yurinskii_emp_proc}, combining classical results in the empirical process literature \citep{van1996weak} with our Corollary~\ref{cor:yurinskii_sa_martingale}. This statement appears to be the first of its kind for martingale data, and when specialized to independent (and not necessarily identically distributed) data, it is shown to be superior to the best known comparable strong approximation result available in the literature \citep{berthet2006revisiting}. Our improvement comes from using Yurinskii's coupling for the $\ell^\infty$-norm, where \citet{berthet2006revisiting} apply Zaitsev's coupling \citep{zaitsev1987estimates, zaitsev1987gaussian} with the larger $\ell^2$-norm. Section~\ref{sec:yurinskii_nonparametric} further illustrates the applicability of our results through two examples in nonparametric regression estimation. Firstly, we deduce a strong approximation for partitioning-based least squares series estimators with time series data, applying Corollary~\ref{cor:yurinskii_sa_martingale} directly and additionally imposing only a mild mixing condition on the regressors. We show that our Yurinskii coupling for martingale vectors delivers the same distributional approximation rate as the best known result for independent data, and discuss how this can be leveraged to yield a feasible statistical inference procedure. We also show that if the residuals have vanishing conditional third moment, an improved rate of Gaussian approximation can be established. Secondly, we deduce a strong approximation for local polynomial estimators with time series data, using our result on martingale empirical processes (Proposition~\ref{pro:yurinskii_emp_proc}) and again imposing a mixing assumption. Appealing to empirical process theory is essential here as, in contrast with series estimators, local polynomials do not possess certain additive separability properties. The bandwidth restrictions we require are relatively mild, and, as far as we know, they have not been improved upon even with independent data. Section \ref{sec:yurinskii_conclusion} concludes the chapter. All proofs are collected in Appendix~\ref{app:yurinskii}, which also includes other technical lemmas of potential independent interest, alongside some further results on applications of our theory to deriving high-dimensional central limit theorems for martingales in Section~\ref{sec:yurinskii_app_high_dim_clt}. \subsection{Notation} We write $\|x\|_p$ for $p\in[1,\infty]$ to denote the $\ell^p$-norm if $x$ is a (possibly random) vector or the induced operator $\ell^p$--$\ell^p$-norm if $x$ is a matrix. For $X$ a real-valued random variable and an Orlicz function $\psi$, we use $\vvvert X \vvvert_\psi$ to denote the Orlicz $\psi$-norm \citep[Section~2.2]{van1996weak} and $\vvvert X \vvvert_p$ for the $L^p(\P)$-norm where $p\in [1,\infty]$. For a matrix $M$, we write $\|M\|_{\max}$ for the maximum absolute entry and $\|M\|_\rF$ for the Frobenius norm. We denote positive semi-definiteness by $M \succeq 0$ and write $I_d$ for the $d \times d$ identity matrix. For scalar sequences $x_n$ and $y_n$, we write $x_n \lesssim y_n$ if there exists a positive constant $C$ such that $|x_n| \leq C |y_n|$ for sufficiently large $n$. We write $x_n \asymp y_n$ to indicate both $x_n \lesssim y_n$ and $y_n \lesssim x_n$. 
Similarly, for random variables $X_n$ and $Y_n$, we write $X_n \lesssim_\P Y_n$ if for every $\varepsilon > 0$ there exists a positive constant $C$ such that $\P(|X_n| > C |Y_n|) \leq \varepsilon$, and write $X_n \to_\P X$ for limits in probability. For real numbers $a$ and $b$ we use $a \vee b = \max\{a,b\}$. We write $\kappa \in \N^d$ for a multi-index, where $\N = \{0, 1, 2, \ldots\}$, and define $|\kappa| = \sum_{j=1}^d \kappa_j$ and $x^\kappa = \prod_{j=1}^d x_j^{\kappa_j}$ for $x \in \R^d$, and $\kappa! = \prod_{j=1}^{d} \kappa_j !$. Since our results concern couplings, some statements must be made on a new or enlarged probability space. We omit the details of this for clarity of notation, but technicalities are handled by the Vorob'ev--Berkes--Philipp Theorem~\citep[Theorem~1.1.10]{dudley1999uniform}.

\section{Main results}
\label{sec:yurinskii_main_results}

We begin with our most general result: an $\ell^p$-norm Yurinskii coupling of a sum of vector-valued approximate martingale differences to a Gaussian mixture-distributed random vector. The general result is presented in Theorem~\ref{thm:yurinskii_sa_dependent}, while Proposition~\ref{pro:yurinskii_sa_simplified} gives a simplified and slightly weaker version which is easier to use in applications. We then further specialize Proposition~\ref{pro:yurinskii_sa_simplified} to three scenarios with successively stronger assumptions, namely mixingales, martingales, and independent data in Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} respectively. In each case we allow for possibly random quadratic variations (cf.\ mixing convergence), thereby establishing a Gaussian mixture coupling in the general setting. In Remark~\ref{rem:yurinskii_coupling_bounds_probability} we comment on and correct an often overlooked technicality relating to the derivation of bounds in probability from Yurinskii's coupling. As a first illustration of the power of our generalized $\ell^p$-norm Yurinskii coupling, we present in Section~\ref{sec:yurinskii_factor} a simple factor model example relating to all three of the aforementioned scenarios.

\begin{theorem}[Strong approximation for vector-valued approximate martingales]
\label{thm:yurinskii_sa_dependent}
Take a complete probability space with a countably generated filtration $\cH_0, \ldots, \cH_n$ for $n \geq 1$, supporting the $\R^d$-valued square-integrable variables $X_1, \ldots, X_n$. Let $S = \sum_{i=1}^n X_i$ and define
%
\begin{align*}
\tilde X_i &= \sum_{r=1}^n \big(\E[X_{r} \mid \cH_{i}] - \E[X_{r} \mid \cH_{i-1}]\big) & &\text{and} &U &= \sum_{i=1}^{n} \big( X_i - \E[ X_i \mid \cH_n] + \E[ X_i \mid \cH_0 ] \big).
\end{align*}
%
Let $V_i = \Var[\tilde X_i \mid \cH_{i-1}]$ and define $\Omega = \sum_{i=1}^n V_i - \Sigma$ where $\Sigma$ is an almost surely positive semi-definite $\cH_0$-measurable $d \times d$ matrix.
Then, for each $\eta > 0$ and $p \in [1,\infty]$, there exists, on an enlarged probability space, an $\R^d$-valued random vector $T$ with $T \mid \cH_0 \sim \cN(0, \Sigma)$ and % \begin{align} \label{eq:yurinskii_sa_dependent} \P\big(\|S-T\|_p > 6\eta\big) &\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big) + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \nonumber \\ &\quad+ \inf_{M \succeq 0} \Big\{ 2 \P\big(\Omega \npreceq M\big) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\Big\} +\P\big(\|U\|_p>\eta\big), \end{align} % where $Z, Z_1,\dots ,Z_n$ are i.i.d.\ standard Gaussian random variables on $\R^d$ independent of $\cH_n$, the second infimum is taken over all positive semi-definite $d \times d$ non-random matrices $M$, % \begin{align*} \beta_{p,k} &= \sum_{i=1}^n \E\left[\| \tilde X_i \|^k_2 \| \tilde X_i \|_p + \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], &\pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| \E [ \tilde X_i^\kappa \mid \cH_{i-1} ] \big| \Big] \end{align*} % for $k \in \{2, 3\}$, with $\pi_3 = \infty$ if the associated conditional expectation does not exist, and with % \begin{align*} \delta_p(M,\eta) &= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right), \\ \varepsilon_p(M, \eta) &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right). \end{align*} \end{theorem} This theorem offers four novel contributions to the literature on coupling theory and strong approximation, as discussed in the introduction. % approximate martingales Firstly \ref{it:yurinskii_contribution_approximate_martingale}, it allows for approximate vector-valued martingales, with the variables $\tilde X_i$ forming martingale differences with respect to $\cH_i$ by construction, and $U$ quantifying the associated martingale approximation error. Such martingale approximation techniques for sequences of dependent random vectors are well established and have been used in a range of scenarios: see, for example, \citet{wu2004martingale}, \citet{dedecker2007weak}, \citet{zhao2008martingale}, \citet{peligrad2010conditional}, \citet{atchade2014martingale}, \citet{cuny2014martingale}, \citet{magda2018martingale}, and references therein. In Section~\ref{sec:yurinskii_mixingales} we demonstrate how this approximation can be established in practice by restricting our general theorem to the special case of mixingales, while the upcoming example in Section~\ref{sec:yurinskii_factor} provides an illustration in the context of auto-regressive factor models. % Gaussian mixture Secondly \ref{it:yurinskii_contribution_gaussian_mixture}, Theorem~\ref{thm:yurinskii_sa_dependent} allows for the resulting coupling variable $T$ to follow a multivariate Gaussian distribution only conditionally, and thus we offer a useful analog of mixing convergence in the context of strong approximation. To be more precise, the random matrix $\sum_{i=1}^{n} V_i$ is the quadratic variation of the constructed martingale $\sum_{i=1}^n \tilde X_i$, and we approximate it using the $\cH_0$-measurable random matrix $\Sigma$. This yields the coupling variable $T \mid \cH_0 \sim \cN(0, \Sigma)$, which can alternatively be written as $T=\Sigma^{1/2} Z$ with $Z \sim \cN(0,I_d)$ independent of $\cH_0$. 
The errors in this quadratic variation approximation are accounted for by the terms $\P(\Omega \npreceq M)$, $\delta_p(M, \eta)$, and $\varepsilon_p(M, \eta)$, utilizing a regularization argument through the free matrix parameter $M$. If a non-random $\Sigma$ is used, then $T$ is unconditionally Gaussian, and one can take $\cH_0$ to be the trivial $\sigma$-algebra. As demonstrated in our proof, our approach to establishing a mixing approximation is different from naively taking an unconditional version of Yurinskii's coupling and applying it conditionally on $\cH_0$, which will not deliver the same coupling as in Theorem~\ref{thm:yurinskii_sa_dependent} for a few reasons. To begin with, we explicitly indicate in the conditions of Theorem~\ref{thm:yurinskii_sa_dependent} where conditioning is required. Next, our error of approximation is given unconditionally, involving only marginal expectations and probabilities. Finally, we provide a rigorous account of the construction of the conditionally Gaussian coupling variable $T$ via a conditional version of Strassen's theorem \citep{chen2020jackknife}. Section~\ref{sec:yurinskii_martingales} illustrates how a strong approximation akin to mixing convergence can arise when the data forms an exact martingale, and Section~\ref{sec:yurinskii_factor} gives a simple example relating to factor modeling in statistics and data science. % remove lower bound on minimum eigenvalue As a third contribution to the literature \ref{it:yurinskii_contribution_degeneracy}, and of particular importance for applications, Theorem~\ref{thm:yurinskii_sa_dependent} makes no requirements on the minimum eigenvalue of the quadratic variation of the approximating martingale sequence. Instead, our proof technique employs a careful regularization scheme designed to account for any such exact or approximate rank degeneracy in $\Sigma$. This capability is fundamental in some applications, a fact which we illustrate in Section \ref{sec:yurinskii_kde} by demonstrating the significant improvements in strong approximation errors delivered by Theorem~\ref{thm:yurinskii_sa_dependent} relative to those obtained using prior results in the literature. % matching third moments Finally \ref{it:yurinskii_contribution_third_order}, Theorem~\ref{thm:yurinskii_sa_dependent} gives a third-order strong approximation alongside the usual second-order version considered in all prior literature. More precisely, we observe that an analog of the term $\beta_{p,2}$ is present in the classical Yurinskii coupling and comes from a Lindeberg telescoping sum argument, replacing random variables by Gaussians with the same mean and variance to match the first and second moments. Whenever the third moments of $\tilde X_i$ are negligible (quantified by $\pi_3$), this moment-matching argument can be extended to third-order terms, giving a new term $\beta_{p,3}$. In certain settings, such as when the data is symmetrically distributed around zero, using $\beta_{p,3}$ rather than $\beta_{p,2}$ can give smaller approximation errors in the coupling given in \eqref{eq:yurinskii_sa_dependent}. Such a refinement can be viewed as a strong approximation counterpart to classical Edgeworth expansion methods. We illustrate this phenomenon in our upcoming applications to nonparametric inference (Section~\ref{sec:yurinskii_nonparametric}). 
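To make the moment-matching logic of \ref{it:yurinskii_contribution_third_order} explicit, observe that $\E[\tilde X_i \mid \cH_{i-1}] = 0 = \E[V_i^{1/2} Z_i \mid \cH_{i-1}]$ and $\Var[\tilde X_i \mid \cH_{i-1}] = V_i = \Var[V_i^{1/2} Z_i \mid \cH_{i-1}]$, so the first and second conditional moments always match, while for the third moments
%
\begin{align*}
\E\big[ (V_i^{1/2} Z_i)^\kappa \bigm\vert \cH_{i-1} \big] = 0
\qquad \text{for all } |\kappa| = 3,
\end{align*}
%
since odd moments of a centered Gaussian vanish. The third-order discrepancy in the telescoping expansion is therefore controlled by the conditional third moments of $\tilde X_i$ alone, which is precisely what $\pi_3$ measures; when $\pi_3 = 0$ the expansion can be pushed one order further, and the resulting fourth-order remainder is what generates $\beta_{p,3}$.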
\subsection{User-friendly formulation of the main result}%
The result in Theorem~\ref{thm:yurinskii_sa_dependent} is given in a somewhat implicit manner, involving infima over the free parameters $t > 0$ and $M \succeq 0$, and it is not clear how to compute these in general. In the upcoming Proposition~\ref{pro:yurinskii_sa_simplified}, we set $M = \nu^2 I_d$ and approximately optimize over $t > 0$ and $\nu > 0$, resulting in a simplified and slightly weaker version of our main general result. In specific applications, where there is additional knowledge of the quadratic variation structure, other choices of regularization schemes may be more appropriate. Nonetheless, the choice $M = \nu^2 I_d$ leads to arguably the principal result of our work, due to its simplicity and utility in statistical applications. For convenience, define the functions $\phi_p : \N \to \R$ for $p \in [1, \infty]$,
%
\begin{align*}
\phi_p(d) =
\begin{cases}
\sqrt{pd^{2/p} } & \text{ if } p \in [1,\infty), \\
\sqrt{2\log 2d} & \text{ if } p =\infty,
\end{cases}
\end{align*}
%
which are related to tail probabilities of the $\ell^p$-norm of a standard Gaussian.

\begin{proposition}[Simplified strong approximation for approximate martingales]%
\label{pro:yurinskii_sa_simplified}
Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}. For each $\eta > 0$ and $p \in [1,\infty]$, there exists a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big)
&\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}
+ 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}
+\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
If further $\pi_3 = 0$ then
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big)
&\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}
+ 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}
+\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
\end{proposition}

Proposition~\ref{pro:yurinskii_sa_simplified} makes clear the potential benefit of a third-order coupling when $\pi_3 = 0$, as in this case the bound features $\beta_{p,3}^{1/4}$ rather than $\beta_{p,2}^{1/3}$. If $\pi_3$ is small but non-zero, an analogous result can easily be derived by adjusting the optimal choices of $t$ and $\nu$, but we omit this for clarity of notation. In applications (see Section~\ref{sec:yurinskii_series}), this reduction of the exponent can provide a significant improvement in terms of the dependence of the bound on the sample size $n$, the dimension $d$, and other problem-specific quantities. When using our results for strong approximation, it is usual to set $p = \infty$ to bound the maximum discrepancy over the entries of a vector (to construct uniform confidence sets, for example). In this setting, the factor $\phi_\infty(d) = \sqrt{2 \log 2d}$ grows only logarithmically in the dimension, reflecting the sub-Gaussian tail of the $\ell^\infty$-norm of a standard Gaussian vector. The remaining term depends on $\E[\|\Omega\|_2]$ and requires that the matrix $\Sigma$ be a good approximation of $\sum_{i=1}^{n} V_i$, while remaining $\cH_0$-measurable. In some applications (such as factor modeling; see Section~\ref{sec:yurinskii_factor}), it can be shown that the quadratic variation $\sum_{i=1}^n V_i$ remains random and $\cH_0$-measurable even in large samples, giving a natural choice for $\Sigma$.
In the next few sections, we continue to refine Proposition~\ref{pro:yurinskii_sa_simplified}, presenting a sequence of results with increasingly strict assumptions on the dependence structure of the data $X_i$. These allow us to demonstrate the broad applicability of our main results, providing more explicit bounds in settings which are likely to be of special interest. In particular, we consider mixingales, martingales, and independent data, comparing our derived results with those in the existing literature. \subsection{Mixingales} \label{sec:yurinskii_mixingales} In our first refinement, we provide a natural method for bounding the martingale approximation error term $U$. Suppose that $X_i$ form an $\ell^p$-mixingale in $L^1(\P)$ in the sense that there exist non-negative $c_1, \ldots, c_n$ and $\zeta_0, \ldots, \zeta_n$ such that for all $1 \leq i \leq n$ and $0 \leq r \leq i$, % \begin{align} \label{eq:yurinskii_mixingale_1} \E \left[ \left\| \E \left[ X_i \mid \cH_{i-r} \right] \right\|_p \right] &\leq c_i \zeta_r, \end{align} % and for all $1 \leq i \leq n$ and $0 \leq r \leq n-i$, % \begin{align} \label{eq:yurinskii_mixingale_2} \E \left[ \big\| X_i - \E \big[ X_i \mid \cH_{i+r} \big] \big\|_p \right] &\leq c_i \zeta_{r+1}. \end{align} % These conditions are satisfied, for example, if $X_i$ are integrable strongly $\alpha$-mixing random variables \citep{mcleish1975invariance}, or if $X_i$ are generated by an auto-regressive or auto-regressive moving average process (see Section~\ref{sec:yurinskii_factor}), among many other possibilities \citep{bradley2005basic}. Then, in the notation of Theorem~\ref{thm:yurinskii_sa_dependent}, we have by Markov's inequality that % \begin{align*} \P \left( \|U\|_p > \frac{\eta}{6} \right) &\leq \frac{6}{\eta} \sum_{i=1}^{n} \E \left[ \big\| X_i - \E \left[ X_i \mid \cH_n \right] \big\|_p + \big\| \E \left[ X_i \mid \cH_0 \right] \big\|_p \right] \leq \frac{\zeta}{\eta}, \end{align*} % with $\zeta = 6 \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. Combining Proposition~\ref{pro:yurinskii_sa_simplified} with this martingale error bound yields the following result for mixingales. % \begin{corollary}[Strong approximation for vector-valued mixingales]% \label{cor:yurinskii_sa_mixingale} Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, and suppose the mixingale conditions \eqref{eq:yurinskii_mixingale_1} and \eqref{eq:yurinskii_mixingale_2} hold. For each $\eta > 0$ and $p \in [1,\infty]$ there is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} + \frac{\zeta}{\eta}. \end{align*} % If further $\pi_3 = 0$ then % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} + \frac{\zeta}{\eta}. \end{align*} % \end{corollary} The closest antecedent to Corollary~\ref{cor:yurinskii_sa_mixingale} is found in \citet[Theorem~4]{li2020uniform}, who also considered Yurinskii's coupling for mixingales. 
Our result improves on this work in the following manner: it removes any requirements on the minimum eigenvalue of the quadratic variation of the mixingale sequence; it allows for general $\ell^p$-norms with $p\in[1,\infty]$; it establishes a coupling to a multivariate Gaussian mixture distribution in general; and it permits third-order couplings (when $\pi_3=0$). These improvements have important practical implications as demonstrated in Sections \ref{sec:yurinskii_factor} and \ref{sec:yurinskii_nonparametric}, where significantly better coupling approximation errors are obtained for a variety of statistical applications. On the technical side, our result is rigorously established using a conditional version of Strassen's theorem \citep{chen2020jackknife}, a carefully crafted regularization argument, and a third-order Lindeberg method \citep[see][and references therein, for more discussion on the standard second-order Lindeberg method]{chatterjee2006generalization}. Furthermore, as explained in Remark~\ref{rem:yurinskii_coupling_bounds_probability}, we clarify a technical issue in \citet{li2020uniform} surrounding the derivation of valid probability bounds for $\|S-T\|_p$. Corollary~\ref{cor:yurinskii_sa_mixingale} focuses on mixingales for simplicity, but, as previously discussed, any method for constructing a martingale approximation $\tilde X_i$ and bounding the resulting error $U$ could be used instead in Proposition~\ref{pro:yurinskii_sa_simplified} to derive a similar result. \subsection{Martingales} \label{sec:yurinskii_martingales} For our second refinement, suppose that $X_i$ form martingale differences with respect to $\cH_i$. In this case, $\E[X_i \mid \cH_n] = X_i$ and $\E[X_i \mid \cH_0] = 0$, so $U = 0$, and the martingale approximation error term vanishes. Applying Proposition~\ref{pro:yurinskii_sa_simplified} in this setting directly yields the following result. % \begin{corollary}[Strong approximation for vector-valued martingales]% \label{cor:yurinskii_sa_martingale} With the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, suppose that $X_i$ is $\cH_i$-measurable and satisfies $\E[X_i \mid \cH_{i-1}] = 0$ for $1 \leq i \leq n$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, there is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with % \begin{align} \label{eq:yurinskii_sa_martingale_order_2} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}. \end{align} % If further $\pi_3 = 0$ then % \begin{align} \label{eq:yurinskii_sa_martingale_order_3} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}. \end{align} % \end{corollary} The closest antecedents to Corollary~\ref{cor:yurinskii_sa_martingale} are \citet{belloni2018high} and \citet{li2020uniform}, who also implicitly or explicitly considered Yurinskii's coupling for martingales.
More specifically, \citet[Theorem~1]{li2020uniform} established an explicit $\ell^2$-norm Yurinskii coupling for martingales under a strong assumption on the minimum eigenvalue of the martingale quadratic variation, while \citet[Theorem~2.1]{belloni2018high} established a central limit theorem for vector-valued martingale sequences employing the standard second-order Lindeberg method. Their proof could thus be adapted to deduce a Yurinskii coupling for martingales, with the help of a conditional version of Strassen's theorem \citep{chen2020jackknife} and some additional nontrivial technical work. Corollary~\ref{cor:yurinskii_sa_martingale} improves over this prior work as follows. With respect to \citet{li2020uniform}, our result establishes an $\ell^p$-norm Gaussian mixture Yurinskii coupling for martingales without any requirements on the minimum eigenvalue of the martingale quadratic variation, and permits a third-order coupling if $\pi_3=0$. The first probability bound \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} gives the same rate of strong approximation as that in Theorem~1 of \citet{li2020uniform} when $p=2$, with non-random $\Sigma$, and when the eigenvalues of a normalized version of $\Sigma$ are bounded away from zero. In Section~\ref{sec:yurinskii_kde} we demonstrate the crucial importance of removing this eigenvalue lower bound restriction in applications involving nonparametric kernel estimators, while in Section~\ref{sec:yurinskii_series} we demonstrate how the availability of a third-order coupling \eqref{eq:yurinskii_sa_martingale_order_3} can give improved approximation rates in applications involving nonparametric series estimators with conditionally symmetrically distributed residual errors. Finally, our technical work improves on \citet{li2020uniform} in two respects: % \begin{inlineroman} \item we employ a conditional version of Strassen's theorem (see Lemma~\ref{lem:yurinskii_app_strassen} in the appendix) to appropriately handle the conditioning arguments; and \item we deduce valid probability bounds for $\|S-T\|_p$, as the following Remark~\ref{rem:yurinskii_coupling_bounds_probability} makes clear. \end{inlineroman} \begin{remark}[Yurinskii's coupling and bounds in probability] \label{rem:yurinskii_coupling_bounds_probability} Given a sequence of random vectors $S_n$, Yurinskii's method provides a coupling in the following form: for each $n$ and any $\eta > 0$, there exists a random vector $T_n$ with $\P\big(\|S_n - T_n\| > \eta\big) < r_n(\eta)$, where $r_n(\eta)$ is the approximation error. Crucially, each coupling variable $T_n$ is a function of the desired approximation level $\eta$ and, as such, deducing bounds in probability on $\|S_n - T_n\|$ requires some extra care. One option is to select a sequence $R_n \to \infty$ and note that $\P\big(\|S_n - T_n\| > r_n^{-1}(1 / R_n)\big) < 1 / R_n \to 0$ and hence $\|S_n - T_n\| \lesssim_\P r_n^{-1}(1 / R_n)$. In this case, $T_n$ depends on the choice of $R_n$, which can in turn typically be chosen to diverge slowly enough to cause no issues in applications. \end{remark} Technicalities akin to those outlined in Remark~\ref{rem:yurinskii_coupling_bounds_probability} have been variously addressed and overlooked in the prior literature. \citet[Chapter 10.4, Example 16]{pollard2002user} apparently misses this subtlety, providing an inaccurate bound in probability based on the Yurinskii coupling.
\citet{li2020uniform} seem to make the same mistake in the proof of their Lemma~A2, which invalidates the conclusion of their Theorem~1. In contrast, \citet{belloni2015some} and \citet{belloni2019conditional} directly provide bounds in $o_\P$ instead of $O_\P$, circumventing these issues in a manner similar to our approach involving a diverging sequence $R_n$. To see how this phenomenon applies to our main results, observe that the second-order martingale coupling given as \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} implies that for any $R_n \to \infty$, % \begin{align*} \|S - T\|_p \lesssim_\P \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n + \E[\|\Omega\|_2]^{1/2} \phi_p(d) R_n. \end{align*} % This bound is comparable to that obtained by \citet[Theorem~1]{li2020uniform} with $p=2$, albeit with their formulation missing the $R_n$ correction terms. In Section~\ref{sec:yurinskii_series} we further discuss their (amended) result in the setting of nonparametric series estimation. Our approach using $p = \infty$ obtains superior distributional approximation rates, alongside exhibiting various other improvements such as the aforementioned third-order coupling. Turning to the comparison with \citet{belloni2018high}, our Corollary~\ref{cor:yurinskii_sa_martingale} again offers the same improvements, with the only exception being that the authors did account for the implications of a possibly vanishing minimum eigenvalue. However, their results exclusively concern high-dimensional central limit theorems for vector-valued martingales, and therefore while their findings could in principle enable the derivation of a result similar to our Corollary~\ref{cor:yurinskii_sa_martingale}, this would require substantial additional technical work in multiple respects (see Appendix~\ref{app:yurinskii}): % \begin{inlineroman} \item a correct application of a conditional version of Strassen's theorem (Lemma~\ref{lem:yurinskii_app_strassen}); \item the development of a third-order Borel set smoothing technique and associated $\ell^p$-norm moment control (Lemmas \ref{lem:yurinskii_app_smooth_approximation}, \ref{lem:yurinskii_app_gaussian_useful}, and \ref{lem:yurinskii_app_gaussian_pnorm}); \item a careful truncation scheme to account for $\Omega\npreceq0$; and \item a valid third-order Lindeberg argument (Lemma \ref{lem:yurinskii_app_sa_martingale}), among others. \end{inlineroman} \subsection{Independence} As a final refinement, suppose that $X_i$ are independent and zero-mean conditionally on $\cH_0$, and take $\cH_i$ to be the $\sigma$-algebra generated by $X_1, \ldots, X_i$ and $\cH_0$ for $1 \leq i \leq n$. Then, taking $\Sigma = \sum_{i=1}^n V_i$ gives $\Omega = 0$, and hence Corollary~\ref{cor:yurinskii_sa_martingale} immediately yields the following result. % \begin{corollary}[Strong approximation for sums of independent vectors]% \label{cor:yurinskii_sa_indep} Take the setup of Theorem~\ref{thm:yurinskii_sa_dependent}, and let $X_i$ be independent given $\cH_0$, with $\E[X_i \mid \cH_0] = 0$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, with $\Sigma = \sum_{i=1}^n V_i$, there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with % \begin{align} \label{eq:yurinskii_sa_indep_order_2} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}. \end{align} % If further $\pi_3 = 0$ then % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}.
\end{align*} % \end{corollary} Taking $\cH_0$ to be trivial, \eqref{eq:yurinskii_sa_indep_order_2} provides an $\ell^p$-norm approximation analogous to that presented in \citet{belloni2019conditional}. By further restricting to $p=2$, we recover the original Yurinskii coupling as presented in \citet[Theorem~1]{lecam1988} and \citet[Theorem~10]{pollard2002user}. Thus, in the independent data setting, our result improves on prior work as follows: \begin{inlineroman} \item it establishes a coupling to a multivariate Gaussian mixture distribution; and \item it permits a third-order coupling if $\pi_3=0$. \end{inlineroman} \subsection{Stylized example: factor modeling} \label{sec:yurinskii_factor} In this section, we present a simple statistical example of how our improvements over prior coupling results can have important theoretical and practical implications. Consider the stylized factor model % \begin{align*} X_i = L f_i + \varepsilon_i, \qquad 1 \leq i \leq n, \end{align*} % with random variables $L$ taking values in $\R^{d \times m}$, $f_i$ in $\R^m$, and $\varepsilon_i$ in $\R^d$. We interpret $f_i$ as a latent factor variable and $L$ as a random factor loading, with idiosyncratic disturbances $\varepsilon_i$. See \citet{fan2020statistical}, and references therein, for a textbook review of factor analysis in statistics and econometrics. We employ the above factor model to give a first illustration of the applicability of our main result Theorem~\ref{thm:yurinskii_sa_dependent}, the user-friendly Proposition~\ref{pro:yurinskii_sa_simplified}, and their specialized Corollaries~\ref{cor:yurinskii_sa_mixingale}--\ref{cor:yurinskii_sa_indep}. We consider three different sets of conditions to demonstrate the applicability of each of our corollaries for mixingales, martingales, and independent data, respectively. We assume throughout that $(\varepsilon_1, \ldots, \varepsilon_n)$ has zero mean and finite variance, and that $(\varepsilon_1, \ldots, \varepsilon_n)$ is independent of $L$ and $(f_1, \ldots, f_n)$. Let $\cH_i$ be the $\sigma$-algebra generated by $L$, $(f_1, \ldots, f_i)$, and $(\varepsilon_1, \ldots, \varepsilon_i)$, with $\cH_0$ the $\sigma$-algebra generated by $L$ alone. \begin{itemize} \item \emph{Independent data}. Suppose that the factors $(f_1, \ldots, f_n)$ are independent conditional on $L$ and satisfy $\E [ f_i \mid L ] = 0$. Then, since the $X_i$ are independent conditional on $\cH_0$ with $\E [ X_i \mid \cH_0 ] = \E [ L f_i + \varepsilon_i \mid L ] = 0$, we can apply Corollary~\ref{cor:yurinskii_sa_indep} to $\sum_{i=1}^n X_i$. In general, we will obtain a coupling variable which has the Gaussian mixture distribution $T \mid \cH_0 \sim \cN(0, \Sigma)$ where $\Sigma= \sum_{i=1}^n (L\Var[f_i \mid L]L^\T +\Var[\varepsilon_i])$. In the special case where $L$ is non-random and $\cH_0$ is trivial, the coupling is Gaussian. Further, if $f_i\mid L$ and $\varepsilon_i$ are symmetric about zero and bounded, then $\pi_3=0$, and the coupling is improved. \item \emph{Martingales}. Suppose instead that we impose only a martingale condition on the latent factor variables, so that $\E \left[ f_i \mid L, f_1, \ldots, f_{i-1} \right] = 0$. Then $\E [ X_i \mid \cH_{i-1} ] = L\, \E \left[ f_i \mid \cH_{i-1} \right] + \E \left[ \varepsilon_i \mid \cH_{i-1} \right] = 0$ and Corollary~\ref{cor:yurinskii_sa_martingale} is applicable to $\sum_{i=1}^n X_i$. The preceding comments on Gaussian mixture distributions and third-order couplings continue to apply. \item \emph{Mixingales}.
Finally, assume that the factors follow the auto-regressive model $f_i = A f_{i-1} + u_i$ where $A \in \R^{m \times m}$ is non-random and $(u_1, \ldots, u_n)$ are zero-mean, independent, and independent of $(\varepsilon_1, \ldots, \varepsilon_n)$. Then $\E \left[ f_i \mid f_0 \right] = A^i f_0$, so taking $p \in [1, \infty]$ we see that $\E \big[ \| \E [ f_i \mid f_0 ] \|_p \big] = \E \big[ \| A^i f_0 \|_p \big] \leq \|A\|_p^i\,\E [ \|f_0\|_p ]$, while $f_i - \E [ f_i \mid \cH_n ] = 0$ since $f_i$ is $\cH_n$-measurable. Thus, whenever $\|A\|_p < 1$, the geometric sum formula implies that we can apply the mixingale result from Corollary~\ref{cor:yurinskii_sa_mixingale} to $\sum_{i=1}^n X_i$. The conclusions on Gaussian mixture distributions and third-order couplings parallel the previous cases. % \end{itemize} This simple application to factor modeling gives a preliminary illustration of the power of our main results, encompassing settings which could not be handled by employing Yurinskii couplings available in the existing literature. Even with independent data, we offer new Yurinskii couplings to Gaussian mixture distributions (due to the presence of the common random factor loading $L$), which could be further improved whenever the factors and residuals possess symmetric (conditional) distributions. Furthermore, our results do not impose any restrictions on the minimum eigenvalue of $\Sigma$, thereby allowing for more general factor structures. These improvements are maintained in the martingale, mixingale, and weakly dependent stationary data settings. \section{Strong approximation for martingale empirical processes}% \label{sec:yurinskii_emp_proc} In this section, we demonstrate how our main results can be applied to some more substantive problems in statistics. Having until this point studied only finite-dimensional (albeit potentially high-dimensional) random vectors, we now turn our attention to infinite-dimensional stochastic processes. Specifically, we consider empirical processes of the form $S(f) = \sum_{i=1}^{n} f(X_i)$ for $f \in \cF$ a problem-specific class of real-valued functions, where each $f(X_i)$ forms a martingale difference sequence with respect to an appropriate filtration. We construct (conditionally) Gaussian processes $T(f)$ for which an upper bound on the uniform coupling error $\sup_{f \in \cF} |S(f) - T(f)|$ is precisely quantified. We control the complexity of $\cF$ using metric entropy under Orlicz norms. The novel strong approximation results which we present concern the entire martingale empirical process $(S(f):f \in \cF)$, as opposed to just the scalar supremum of the empirical process, $\sup_{f \in \cF} |S(f)|$. This distinction has been carefully noted by \citet{chernozhukov2014gaussian}, who studied Gaussian approximation of empirical process suprema in the independent data setting and wrote (p.~1565): ``A related but different problem is that of approximating \textit{whole} empirical processes by a sequence of Gaussian processes in the sup-norm. This problem is more difficult than [approximating the supremum of the empirical process].'' Indeed, the results we establish in this section give a strong approximation of the entire empirical process by a sequence of Gaussian mixture processes in the supremum norm, when the data has a martingale difference structure (cf.\ Corollary \ref{cor:yurinskii_sa_martingale}).
Our results can be further generalized to approximate martingale empirical processes (cf.\ Corollary \ref{cor:yurinskii_sa_mixingale}), but to reduce notation and the technical burden we do not consider this extension. \subsection{Motivating example: kernel density estimation} \label{sec:yurinskii_kde} We begin with a brief study of a canonical example of an empirical process which is non-Donsker (thus precluding the use of uniform central limit theorems) due to the presence of a function class whose complexity increases with the sample size: the kernel density estimator with i.i.d.\ scalar data. We give an overview of our general strategy for strong approximation of stochastic processes via discretization, and show explicitly in Lemma~\ref{lem:yurinskii_kde_eigenvalue} how it is crucial that we do not impose lower bounds on the eigenvalues of the discretized covariance matrix. Detailed calculations for this section are relegated to Appendix~\ref{app:yurinskii} for conciseness. Let $X_1, \ldots, X_n$ be i.i.d.\ $\Unif[0,1]$, take the Gaussian kernel $K(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$, and let $h \in (0,1]$ be a bandwidth. Then, for $a \in (0,1/4]$ and $x \in \cX = [a, 1-a]$ to avoid boundary issues, the kernel density estimator of the true density function $g(x) = 1$ is % \begin{align*} \hat g(x) &= \frac{1}{n} \sum_{i=1}^{n} K_h( X_i - x), \qquad K_h(u) = \frac{1}{h} K\left( \frac{u}{h} \right). \end{align*} % Consider establishing a strong approximation for the stochastic process $(\hat g(x)-\E [ \hat g(x) ] : x\in\cX)$ which is, upon rescaling, non-Donsker whenever the bandwidth decreases to zero in large samples. To match notation with the upcoming general result for empirical processes, set $f_x(u) = \frac{1}{n} (K_h( u - x) - \E[K_h( X_i - x)])$ so that $S(x) \vcentcolon= S(f_x) = \hat g(x)-\E [ \hat g(x) ]$. The next step is standard: a mesh separates the local oscillations of the processes from the finite-dimensional coupling. For $\delta \in (0,1/2)$, set $N = \left\lfloor 1 + \frac{1 - 2a}{\delta} \right\rfloor$ and $\cX_\delta = (a + (j-1)\delta : 1 \leq j \leq N)$. Letting $T(x)$ be the approximating stochastic process to be constructed, consider the decomposition % \begin{align*} \sup_{x \in \cX} \big|S(x) - T(x)\big| &\leq \sup_{|x-x'| \leq \delta} \big|S(x) - S(x') \big| + \max_{x \in \cX_\delta} |S(x) - T(x)| + \sup_{|x-x'| \leq \delta} \big|T(x) - T(x')\big|. \end{align*} % Writing $S(\cX_\delta)$ for $\big(S(x) : x \in \cX_\delta\big)\in \R^N$ and noting that this is a sum of i.i.d.\ random vectors, we apply Corollary~\ref{cor:yurinskii_sa_indep}, since $\max_{x \in \cX_\delta} |S(x) - T(x)| = \| S(\cX_\delta) - T(\cX_\delta) \|_\infty$. We obtain that for each $\eta > 0$ there is a Gaussian vector $T(\cX_\delta)$ with the same covariance matrix as $S(\cX_\delta)$ satisfying % \begin{align*} \P\left( \|S(\cX_\delta) - T(\cX_\delta)\|_\infty > \eta \right) &\leq 31 \left( \frac{N \log 2 N}{\eta^3 n^2 h^2} \right)^{1/3} \end{align*} % assuming that $1/h \geq \log 2 N$. By the Vorob'ev--Berkes--Philipp theorem \citep[Theorem~1.1.10]{dudley1999uniform}, $T(\cX_\delta)$ extends to a Gaussian process $T(x)$ defined for all $x \in \cX$ and with the same covariance structure as $S(x)$.
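For intuition, here is a rough accounting of where this bound comes from (a heuristic sketch with constants suppressed; the detailed calculation is in Appendix~\ref{app:yurinskii}). Writing $\xi_i = \big( f_x(X_i) : x \in \cX_\delta \big) \in \R^N$ for the $i$th summand of $S(\cX_\delta)$, we have $\|\xi_i\|_\infty \lesssim \frac{1}{n h}$ and $\E \big[ f_x(X_i)^2 \big] \lesssim \frac{1}{n^2 h}$, so the leading part of $\beta_{\infty,2}$ in Corollary~\ref{cor:yurinskii_sa_indep} with $p = \infty$ obeys % \begin{align*} \sum_{i=1}^{n} \E \left[ \|\xi_i\|_2^2 \|\xi_i\|_\infty \right] \lesssim n \cdot \frac{N}{n^2 h} \cdot \frac{1}{n h} = \frac{N}{n^2 h^2}, \qquad\text{whence}\qquad 24 \left( \frac{\beta_{\infty,2} \phi_\infty(N)^2}{\eta^3} \right)^{1/3} \lesssim \left( \frac{N \log 2 N}{\eta^3 n^2 h^2} \right)^{1/3}, \end{align*} % using $\phi_\infty(N)^2 = 2 \log 2N$; the remaining (Gaussian) part of $\beta_{\infty,2}$ can be shown to be of no larger order when $1/h \geq \log 2 N$.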
Next, chaining with the Bernstein--Orlicz and sub-Gaussian norms \citep[Section~2.2]{van1996weak} shows that if $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$, % \begin{align*} \sup_{|x-x'| \leq \delta} \big|S(x) - S(x') \big| &\lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}} \quad\text{and}\quad \sup_{|x-x'| \leq \delta} \big|T(x) - T(x')\big| \lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}}. \end{align*} % Finally, for any $R_n\to\infty$ (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), the resulting bound on the coupling error is % \begin{align*} \sup_{x \in \cX} \big| S(x) - T(x) \big| &\lesssim_\P \left( \frac{N \log 2N}{n^2 h^2} \right)^{1/3} R_n + \delta \sqrt{\frac{\log n}{n h^3}}, \end{align*} % where the mesh size $\delta$ can then be approximately optimized to obtain the tightest possible strong approximation. The discretization strategy outlined above is at the core of the proof of our upcoming Proposition~\ref{pro:yurinskii_emp_proc}. Since we will consider martingale empirical processes, our proof will rely on Corollary~\ref{cor:yurinskii_sa_martingale}, which, unlike the martingale Yurinskii coupling established by \citet{li2020uniform}, does not require a lower bound on the minimum eigenvalue of $\Sigma$. Using the simple kernel density example just discussed, we now demonstrate precisely the crucial importance of removing such eigenvalue conditions. The following Lemma~\ref{lem:yurinskii_kde_eigenvalue} shows that the discretized covariance matrix $\Sigma = n h\Var[S(\cX_\delta)]$ has exponentially small eigenvalues, which in turn will negatively affect the strong approximation bound if the \citet{li2020uniform} coupling were to be used instead of the results in this dissertation. \begin{lemma}[Minimum eigenvalue of a kernel density estimator covariance matrix]% \label{lem:yurinskii_kde_eigenvalue} % The minimum eigenvalue of $\Sigma=n h\Var[S(\cX_\delta)] \in \R^{N \times N}$ satisfies the upper bound % \begin{align*} \lambda_{\min}(\Sigma) &\leq 2 e^{-h^2/\delta^2} + \frac{h}{\pi a \delta} e^{-a^2 / h^2}. \end{align*} \end{lemma} % Figure~\ref{fig:yurinskii_min_eig} shows how the upper bound in Lemma \ref{lem:yurinskii_kde_eigenvalue} captures the behavior of the simulated minimum eigenvalue of $\Sigma$. In particular, the smallest eigenvalue decays exponentially fast in the discretization level $\delta$ and the bandwidth $h$. As seen in the calculations above, the coupling rate depends on $\delta / h$, while the bias will generally depend on $h$, implying that both $\delta$ and $h$ must converge to zero to ensure valid statistical inference. In general, this will lead to $\Sigma$ possessing extremely small eigenvalues, rendering strong approximation approaches such as that of \citet{li2020uniform} ineffective in such scenarios. % \begin{figure}[t] \centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/sim_2.pdf} \caption{$h = 0.03$} \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/sim_1.pdf} \caption{$h = 0.01$} \end{subfigure} \caption[Minimum eigenvalue of the kernel density covariance matrix]{ Upper bounds on the minimum eigenvalue of the discretized covariance matrix in kernel density estimation, with $n=100$ and $a = 0.2$. Simulated: the kernel density estimator is simulated, resampling the data $100$ times to estimate its covariance.
Computing matrix: the minimum eigenvalue of the limiting covariance matrix $\Sigma$ is computed explicitly. Upper bound: the bound derived in Lemma~\ref{lem:yurinskii_kde_eigenvalue} is shown. } \label{fig:yurinskii_min_eig} \end{figure} The discussion in this section focuses on the strong approximation of the centered process $\hat g(x)-\E [ \hat g(x) ]$. In practice, the goal is often rather to approximate the feasible process $\hat g(x)- g(x)$. The difference between these is captured by the smoothing bias $\E [ \hat g(x) ] - g(x)$, which is straightforward to control in this case with $\sup_{x \in \cX} \big| \E [ \hat g(x) ] - g(x) \big| \lesssim \frac{h}{a} e^{-a^2 / (2 h^2)}$. See Section \ref{sec:yurinskii_nonparametric} for further comments. \subsection{General result for martingale empirical processes} We now give our general result on a strong approximation for martingale empirical processes, obtained by applying the first result \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} with $p=\infty$ to a discretization of the empirical process, as in Section~\ref{sec:yurinskii_kde}. We then control the increments in the stochastic processes using chaining with Orlicz norms, but note that other tools are available, including generalized entropy with bracketing \citep{geer2000empirical} and sequential symmetrization \citep{rakhlin2015sequential}. A class of functions is said to be \emph{pointwise measurable} if it contains a countable subclass which is dense under the pointwise convergence topology. For a finite class $\cF$, write $\cF(x) = \big(f(x) : f \in \cF\big)$. Define the set of Orlicz functions % \begin{align*} \Psi &= \left\{ \psi: [0, \infty) \to [0, \infty) \text{ convex increasing, } \psi(0) = 0,\ \limsup_{x,y \to \infty} \tfrac{\psi(x) \psi(y)}{\psi(C x y)} < \infty \text{ for some } C > 0 \right\} \end{align*} % and, for real-valued $Y$, the Orlicz norm $\vvvert Y \vvvert_\psi = \inf \left\{ C > 0: \E \left[ \psi(|Y|/C) \right] \leq 1 \right\}$ as in \citet[Section~2.2]{van1996weak}. \begin{proposition}[Strong approximation for martingale empirical processes]% \label{pro:yurinskii_emp_proc} Let $X_i$ be random variables for $1 \leq i \leq n$ taking values in a measurable space $\cX$, and $\cF$ be a pointwise measurable class of functions from $\cX$ to $\R$. Let $\cH_0, \ldots, \cH_n$ be a filtration such that each $X_i$ is $\cH_i$-measurable, with $\cH_0$ the trivial $\sigma$-algebra, and suppose that $\E[f(X_i) \mid \cH_{i-1}] = 0$ for all $f \in \cF$. Define $S(f) = \sum_{i=1}^n f(X_i)$ for $f\in\cF$ and let $\Sigma: \cF \times \cF \to \R$ be an almost surely positive semi-definite $\cH_0$-measurable random function.
Suppose that for a non-random metric $d$ on $\cF$, constant $L$, and $\psi \in \Psi$, % \begin{align}% \label{eq:yurinskii_emp_proc_var} \Sigma(f,f) - 2\Sigma(f,f') + \Sigma(f',f') + \bigvvvert S(f) - S(f') \bigvvvert_\psi^2 &\leq L^2 d(f,f')^2 \quad \text{a.s.} \end{align} % Then for each $\eta > 0$ there is a process $T(f)$ which, conditional on $\cH_0$, is zero-mean and Gaussian, satisfying $\E\big[ T(f) T(f') \mid \cH_0 \big] = \Sigma(f,f')$ for all $f, f' \in \cF$, and for all $t > 0$ has % \begin{align*} &\P\left( \sup_{f \in \cF} \big| S(f) - T(f) \big| \geq C_\psi(t + \eta) \right) \leq C_\psi \inf_{\delta > 0} \inf_{\cF_\delta} \Bigg\{ \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } \\ &\qquad\quad+ \left(\frac{\sqrt{\log 2 |\cF_\delta|} \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} + \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) \Bigg\} \end{align*} % where $\cF_\delta$ is any finite $\delta$-cover of $(\cF,d)$ and $C_\psi$ is a constant depending only on $\psi$, with % \begin{align*} \beta_\delta &= \sum_{i=1}^n \E\left[ \|\cF_\delta(X_i)\|^2_2\|\cF_\delta(X_i)\|_\infty + \|V_i(\cF_\delta)^{1/2}Z_i\|^2_2 \|V_i(\cF_\delta)^{1/2}Z_i\|_\infty \right], \\ V_i(\cF_\delta) &= \E\big[\cF_\delta(X_i) \cF_\delta(X_i)^\T \mid \cH_{i-1} \big], \hspace*{27.7mm} \Omega_\delta = \sum_{i=1}^n V_i(\cF_\delta) - \Sigma(\cF_\delta), \\ J_\psi(\delta) &= \int_0^\delta \psi^{-1}\big( N_\varepsilon \big) \diff{\varepsilon} + \delta \psi^{-1} \big( N_\delta^2 \big), \hspace*{19mm} J_2(\delta) = \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon}, \end{align*} % where $N_\delta = N(\delta, \cF, d)$ is the $\delta$-covering number of $(\cF, d)$ and $Z_i$ are i.i.d.\ $\cN\big(0, I_{|\cF_\delta|}\big)$ independent of $\cH_n$. If $\cF_\delta$ is a minimal $\delta$-cover of $(\cF, d)$, then $|\cF_\delta| = N_\delta$. \end{proposition} Proposition~\ref{pro:yurinskii_emp_proc} is given in a rather general form to accommodate a range of different settings and applications. In particular, consider the following well-known Orlicz functions. % \begin{description} \item[Polynomial:] $\psi(x) = x^a$ for $a \geq 2$ has $\vvvert X \vvvert_2 \leq \vvvert X \vvvert_\psi$ and $\sqrt{\log x} \leq \sqrt{a} \psi^{-1}(x)$. \item[Exponential:] $\psi(x) = \exp(x^a) - 1$ for $a \in [1,2]$ has $\vvvert X \vvvert_2 \leq 2\vvvert X \vvvert_\psi$ and $\sqrt{\log x} \leq \psi^{-1}(x)$. \item[Bernstein:] $\psi(x) = \exp \Big( \Big(\frac{\sqrt{1+2ax}-1}{a}\Big)^{2} \Big)-1$ for $a > 0$ has $\vvvert X \vvvert_2 \leq (1+a)\vvvert X \vvvert_\psi$ \\ and $\sqrt{\log x}~\leq~\psi^{-1}(x)$. \end{description} % For these Orlicz functions and when $\Sigma(f, f') = \Cov[S(f), S(f')]$ is non-random, the terms involving $\Sigma$ in \eqref{eq:yurinskii_emp_proc_var} can be controlled by the Orlicz $\psi$-norm term; similarly, $J_2$ is bounded by $J_\psi$. Further, $C_\psi$ can be replaced by a universal constant $C$ which does not depend on the parameter $a$. See Section~2.2 in \citet{van1996weak} for details. If the conditional third moments of $f(X_i)$ given $\cH_{i-1}$ are all zero (if $f$ and $X_i$ are appropriately symmetric, for example), then the second inequality in Corollary~\ref{cor:yurinskii_sa_martingale} can be applied to obtain a tighter coupling inequality; the details of this are omitted for brevity, and the proof would proceed in exactly the same manner. 
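To illustrate how these quantities behave in a typical case (a routine specialization of the definitions above, not a new result), take the exponential Orlicz function with $a = 2$, namely $\psi(x) = \exp(x^2) - 1$, so that $\psi^{-1}(x) = \sqrt{\log(1+x)}$. Then % \begin{align*} J_\psi(\delta) = \int_0^\delta \sqrt{\log\big(1 + N_\varepsilon\big)} \diff{\varepsilon} + \delta \sqrt{\log\big(1 + N_\delta^2\big)} \qquad\text{and}\qquad \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} = \left( \exp\left(\frac{t^2}{L^2 J_\psi(\delta)^2}\right) - 1 \right)^{-1}, \end{align*} % so $J_\psi(\delta)$ is comparable to the entropy integral $J_2(\delta)$ up to constants whenever $N_\delta \geq 2$, and both fluctuation terms in Proposition~\ref{pro:yurinskii_emp_proc} exhibit Gaussian-type tails in $t$.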
In general, however, Proposition~\ref{pro:yurinskii_emp_proc} allows for a random covariance function, yielding a coupling to a stochastic process that is Gaussian only conditionally. Such a process can equivalently be viewed as a mixture of Gaussian processes, writing $T=\Sigma^{1/2} Z$ with an operator square root and where $Z$ is a Gaussian white noise on $\cF$ independent of $\cH_0$. This extension is in contrast with much of the existing strong approximation and empirical process literature, which tends to focus on couplings and weak convergence results with marginally Gaussian processes \citep{settati2009gaussian,chernozhukov2016empirical}. A similar approach was taken by \citet{berthet2006revisiting}, who used a Gaussian coupling due to \citet{zaitsev1987estimates,zaitsev1987gaussian} along with a discretization method to obtain strong approximations for empirical processes with independent data. They handled fluctuations in the stochastic processes with uniform $L^2$ covering numbers and bracketing numbers, whereas we opt instead for chaining with Orlicz norms. Our version using the martingale Yurinskii coupling can improve upon theirs in approximation rate even for independent data in certain circumstances. Suppose the setup of Proposition~1 in \citet{berthet2006revisiting}; that is, $X_1, \ldots, X_n$ are i.i.d.\ and $\sup_{f \in \cF} \|f\|_\infty \leq M$, with the VC-type assumption $\sup_\Q N(\varepsilon, \cF, d_\Q) \leq c_0 \varepsilon^{-\nu_0}$ where $d_\Q(f,f')^2 = \E_\Q\big[(f-f')^2\big]$ for a measure $\Q$ on $\cX$ and $M, c_0, \nu_0$ are constants. Using uniform $L^2$ covering numbers rather than Orlicz chaining in our Proposition~\ref{pro:yurinskii_emp_proc} gives the following. Firstly, as $X_i$ are i.i.d., take $\Sigma(f, f') = \Cov[S(f), S(f')]$ so $\Omega_\delta = 0$. Let $\cF_\delta$ be a minimal $\delta$-cover of $(\cF, d_\P)$ with cardinality $N_\delta \lesssim \delta^{-\nu_0}$ where $\delta \to 0$. It is easy to show that $\beta_\delta \lesssim n \delta^{-\nu_0} \sqrt{\log(1/\delta)}$. Theorem~2.2.8 and Theorem~2.14.1 in \citet{van1996weak} then give % \begin{align*} \E\left[ \sup_{d_\P(f,f') \leq \delta} \Big( |S(f) - S(f')| + |T(f) - T(f')| \Big) \right] &\lesssim \sup_\Q \int_0^\delta \sqrt{n \log N(\varepsilon, \cF, d_\Q)} \diff{\varepsilon} \\ &\lesssim \delta \sqrt{n\log(1/\delta)}, \end{align*} % where we used the VC-type property to bound the entropy integral. So by our Proposition~\ref{pro:yurinskii_emp_proc}, for any sequence $R_n \to \infty$ (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), % \begin{align*} \sup_{f \in \cF} \big| S(f) - T(f) \big| &\lesssim_\P n^{1/3} \delta^{-\nu_0/3} \sqrt{\log(1/\delta)} R_n + \delta \sqrt{n\log(1/\delta)} \lesssim_\P n^{\frac{2+\nu_0}{6+2\nu_0}} \sqrt{\log n} R_n, \end{align*} % where we minimized over $\delta$ in the last step, taking $\delta \asymp n^{-1/(6+2\nu_0)}$. \citet[Proposition~1]{berthet2006revisiting} achieved % \begin{align*} \sup_{f \in \cF} \big| S(f) - T(f) \big| &\lesssim_\P n^{\frac{5\nu_0}{4+10\nu_0}} (\log n)^{\frac{4+5\nu_0}{4+10\nu_0}}, \end{align*} % showing that our approach achieves a better approximation rate whenever $\nu_0 > 4/3$. In particular, our method is superior in richer function classes with larger VC-type dimension. For example, if $\cF$ is smoothly parameterized by $\theta \in \Theta \subseteq \R^d$ where $\Theta$ contains an open set, then $\nu_0 > 4/3$ corresponds to $d \geq 2$ and our rate is better as soon as the parameter space is more than one-dimensional.
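For completeness, the claimed threshold is elementary arithmetic on the exponents of $n$ (ignoring the logarithmic factors and $R_n$): our rate beats theirs if and only if % \begin{align*} \frac{2+\nu_0}{6+2\nu_0} < \frac{5\nu_0}{4+10\nu_0} \quad\Longleftrightarrow\quad (2+\nu_0)(4+10\nu_0) < 5\nu_0(6+2\nu_0) \quad\Longleftrightarrow\quad 8 + 24\nu_0 < 30\nu_0 \quad\Longleftrightarrow\quad \nu_0 > \frac{4}{3}, \end{align*} % since the $10\nu_0^2$ terms cancel and both denominators are positive.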
The difference in approximation rate is due to Zaitsev's coupling having better dependence on the sample size but worse dependence on the dimension. In particular, Zaitsev's coupling is stated only in $\ell^2$-norm and hence \citet[Equation~5.3]{berthet2006revisiting} are compelled to use the inequality $\|\cdot\|_\infty \leq \|\cdot\|_2$ in the coupling step, a bound which is loose when the dimension of the vectors (here on the order of $\delta^{-\nu_0}$) is even moderately large. We use the fact that our version of Yurinskii's coupling applies directly to the supremum norm, giving sharper dependence on the dimension. In Section~\ref{sec:yurinskii_local_poly} we apply Proposition~\ref{pro:yurinskii_emp_proc} to obtain strong approximations for local polynomial estimators in the nonparametric regression setting. In contrast with the series estimators of the upcoming Section~\ref{sec:yurinskii_series}, local polynomial estimators are not linearly separable and hence cannot be analyzed directly using the finite-dimensional Corollary~\ref{cor:yurinskii_sa_martingale}. \section{Applications to nonparametric regression} \label{sec:yurinskii_nonparametric} We illustrate the applicability of our previous strong approximation results with two substantial and classical examples in nonparametric regression estimation. Firstly, we present an analysis of partitioning-based series estimators, where we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly due to an intrinsic linear separability property. Secondly, we consider local polynomial estimators, this time using Proposition~\ref{pro:yurinskii_emp_proc} due to a non-linearly separable martingale empirical process. \subsection{Partitioning-based series estimators} \label{sec:yurinskii_series} Partitioning-based least squares methods are essential tools for estimation and inference in nonparametric regression, encompassing splines, piecewise polynomials, compactly supported wavelets, and decision trees as special cases. See \citet{cattaneo2020large} for further details and references throughout this section. We illustrate the usefulness of Corollary~\ref{cor:yurinskii_sa_martingale} by deriving a Gaussian strong approximation for partitioning series estimators based on multivariate martingale data. Proposition~\ref{pro:yurinskii_series} shows that, by imposing an additional mild $\alpha$-mixing condition to control the time series dependence of the regressors, we achieve the best rate of strong approximation known for independent data. Consider the nonparametric regression setup with martingale difference residuals defined by $Y_i = \mu(W_i) + \varepsilon_i$ for $1 \leq i \leq n$, where the regressors $W_i$ have compact connected support $\cW \subseteq \R^m$, $\cH_i$ is the $\sigma$-algebra generated by $(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, $\E[\varepsilon_i \mid \cH_{i-1}] = 0$, and $\mu: \cW \to \R$ is the estimand. Let $p(w)$ be a $k$-dimensional vector of bounded basis functions on $\cW$ which are locally supported on a quasi-uniform partition \citep[Assumption~2]{cattaneo2020large}. Under minimal regularity conditions, the least-squares partitioning-based series estimator is $\hat\mu(w) = p(w)^{\T} \hat H^{-1} \sum_{i=1}^n p(W_i) Y_i$ with $\hat H = \sum_{i=1}^n p(W_i) p(W_i)^\T$. The approximation power of the estimator $\hat\mu(w)$ derives from letting $k\to\infty$ as $n\to\infty$.
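As a concrete illustration of this formula (a simple special case, anticipating the Haar-type bases discussed next): if $p(w)$ collects the indicators of $k$ disjoint cells $C_1, \ldots, C_k$ partitioning $\cW$, then $\hat H$ is diagonal with entries $N_j = \sum_{i=1}^{n} \I\{W_i \in C_j\}$, and whenever each $N_j \geq 1$ the estimator reduces to a local cell average, % \begin{align*} \hat\mu(w) = \sum_{j=1}^{k} \frac{\I\{w \in C_j\}}{N_j} \sum_{i=1}^{n} \I\{W_i \in C_j\} Y_i. \end{align*} %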
The assumptions made on $p(w)$ are mild enough to accommodate splines, wavelets, piecewise polynomials, and certain types of decision trees. For such a tree, $p(w)$ comprises indicator functions over $k$ axis-aligned rectangles forming a partition of $\cW$ (a Haar basis), provided that the partitions are constructed using independent data (e.g., with sample splitting). Our goal is to approximate the law of the stochastic process $(\hat\mu(w)-\mu(w):w\in\cW)$, which upon rescaling is typically not asymptotically tight as $k \to \infty$ and thus does not converge weakly. Nevertheless, exploiting the intrinsic linearity of the estimator $\hat\mu(w)$, we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly to construct a Gaussian strong approximation. Specifically, we write % \begin{equation*} \hat\mu(w) - \mu(w) = p(w)^\T H^{-1} S + p(w)^\T \big(\hat H^{-1} - H^{-1}\big) S + \Bias(w), \end{equation*} % where $H= \sum_{i=1}^n \E\left[p(W_i) p(W_i)^\T\right]$ is the expected outer product matrix, $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ is the score vector, and $\Bias(w) = p(w)^{\T} \hat H^{-1}\sum_{i=1}^n p(W_i) \mu(W_i) - \mu(w)$. Under some mild time series restrictions and a stationarity assumption, it is not difficult to show (see Section~\ref{sec:yurinskii_app_proofs}) that $\|\hat H - H\|_1 \lesssim_\P \sqrt{n k}$ and $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$ for some $\gamma>0$, depending on the specific structure of the basis functions, the dimension $m$ of the regressors, and the smoothness of the regression function $\mu$. It remains to study the $k$-dimensional mean-zero martingale $S$ by applying Corollary~\ref{cor:yurinskii_sa_martingale} with $X_i=p(W_i) \varepsilon_i$. Controlling the convergence of the quadratic variation term $\E[\|\Omega\|_2]$ requires some time series dependence assumptions; we impose an $\alpha$-mixing condition on $(W_1, \ldots, W_n)$ for illustration \citep{bradley2005basic}. \begin{proposition}[Strong approximation for partitioning series estimators]% \label{pro:yurinskii_series} % Consider the nonparametric regression setup described above and further assume the following: % \begin{enumerate}[label=(\roman*)] \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is strictly stationary. \item $W_1, \ldots, W_n$ is $\alpha$-mixing with mixing coefficients satisfying $\sum_{j=1}^\infty \alpha(j) < \infty$. \item $W_i$ has a Lebesgue density on $\cW$ which is bounded above and away from zero. \item $\E\big[|\varepsilon_i|^3 \big] < \infty$ and $\E\big[\varepsilon_i^2 \mid \cH_{i-1}\big]=\sigma^2(W_i)$ is bounded away from zero. \item $p(w)$ is a basis with $k$ features satisfying Assumptions~2 and~3 in \citet{cattaneo2020large}. \end{enumerate} % Then, for any sequence $R_n \to \infty$, there is a zero-mean Gaussian process $G(w)$ indexed on $\cW$ with $\Var[G(w)] \asymp\frac{k}{n}$ satisfying $\Cov[G(w), G(w')] = \Cov[p(w)^\T H^{-1} S,\, p(w')^\T H^{-1} S]$ and % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} R_n + \sup_{w \in \cW} |\Bias(w)| \end{align*} % assuming the number of basis functions satisfies $k^3 / n \to 0$. If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^3 (\log k)^2}{n} \right)^{1/4} R_n + \sup_{w \in \cW} |\Bias(w)|.
\end{align*} % \end{proposition} The core concept in the proof of Proposition~\ref{pro:yurinskii_series} is to apply Corollary~\ref{cor:yurinskii_sa_martingale} with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ and $p=\infty$ to construct $T \sim \cN\big(0, \Var[S]\big)$ such that $\|S - T \|_\infty$ is small, and then to set $G(w) = p(w)^\T H^{-1} T$. So long as the bias can be appropriately controlled, this result allows for uniform inference procedures such as uniform confidence bands or shape specification testing. The condition $k^3 / n \to 0$ is the same (up to logs) as that imposed by \citet{cattaneo2020large} for i.i.d.\ data, which gives the best known strong approximation rate for this problem. Thus, Proposition~\ref{pro:yurinskii_series} gives the same best approximation rate for $\alpha$-mixing time series data, without requiring any extra restrictions. Our results improve substantially on \citet[Theorem~1]{li2020uniform}: using the notation of our Corollary~\ref{cor:yurinskii_sa_martingale}, and with any sequence $R_n \to \infty$, a valid (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}) version of their martingale Yurinskii coupling is % \begin{align*} \|S-T\|_2 \lesssim_\P d^{1/2} r^{1/2}_n + (B_n d)^{1/3} R_n, \end{align*} % where $B_n = \sum_{i=1}^n \E[\|X_i\|_2^3]$ and $r_n$ is a term controlling the convergence of the quadratic variation, playing a similar role to our term $\E[\|\Omega\|_2]$. Under the assumptions of our Proposition~\ref{pro:yurinskii_series}, applying this result with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ yields a rate no better than $\|S-T\|_2 \lesssim_\P (n k)^{1/3} R_n$. As such, they attain a rate of strong approximation no faster than % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^5}{n} \right)^{1/6} R_n + \sup_{w \in \cW} |\Bias(w)|. \end{align*} % Hence, for this approach to yield a valid strong approximation, the number of basis functions must satisfy $k^5/n \to 0$, a more restrictive assumption than our $k^3 / n \to 0$ (up to logs). This difference is due to \citet{li2020uniform} using the $\ell^2$-norm version of Yurinskii's coupling rather than the recently established $\ell^\infty$ version. Further, our approach allows for an improved rate of distributional approximation whenever the residuals have zero conditional third moment. To illustrate the statistical applicability of Proposition~\ref{pro:yurinskii_series}, consider constructing a feasible uniform confidence band for the regression function $\mu$, using standardization and Studentization for statistical power improvements. We assume throughout that the bias is negligible. Proposition~\ref{pro:yurinskii_series} and anti-concentration for Gaussian suprema \citep[Corollary~2.1]{chernozhukov2014anti} yield a distributional approximation for the supremum statistic whenever $k^3(\log n)^6 / n \to 0$, giving % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| &\to 0, \end{align*} % where $\rho(w,w') = \E[G(w)G(w')]$.
Further, by a Gaussian--Gaussian comparison result \citep[Lemma~3.1]{chernozhukov2013gaussian} and anti-concentration, we show (see the proof of Proposition~\ref{pro:yurinskii_series}) that with $\bW = (W_1, \ldots, W_n)$ and $\bY = (Y_1, \ldots, Y_n)$, % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \biggm| \bW, \bY \right) \right| &\to_\P 0, \end{align*} % where $\hat G(w)$ is a zero-mean Gaussian process conditional on $\bW$ and $\bY$ with conditional covariance function $\hat\rho(w,w') =\E\big[\hat G(w) \hat G(w') \mid \bW, \bY \big] = p(w)^\T \hat H^{-1} \hat V \hat H^{-1}p(w')$ for some estimator $\hat V$ satisfying $\frac{k (\log n)^2}{n} \big\|\hat V-\Var[S]\big\|_2 \to_\P 0$. For example, one could use the plug-in estimator $\hat V=\sum_{i=1}^n p(W_i) p(W_i)^\T \hat{\sigma}^2(W_i)$ where $\hat{\sigma}^2(w)$ satisfies $(\log n)^2 \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$. This leads to the following feasible and asymptotically valid $100(1-\tau)\%$ uniform confidence band for partitioning-based series estimators based on martingale data. \begin{proposition}[Feasible uniform confidence bands for partitioning series estimators]% \label{pro:yurinskii_series_feasible} % Assume the setup of the preceding section. Then % \begin{align*} \P\Big( \mu(w) \in \Big[ \hat\mu(w) \pm \hat q(\tau) \sqrt{\hat\rho(w,w)} \Big] \ \text{for all } w \in \cW \Big) \to 1-\tau, \end{align*} % where % \begin{align*} \hat{q}(\tau) &= \inf \left\{ t \in \R: \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \Bigm| \bW, \bY \right) \geq 1 - \tau \right\} \end{align*} % is the conditional $(1-\tau)$-quantile of the supremum of the Studentized Gaussian process. This can be estimated by resampling the conditional law of $\hat G(w) \mid \bW, \bY$ with a discretization of $w \in \cW$. \end{proposition} \subsection{Local polynomial estimators} \label{sec:yurinskii_local_poly} As a second application, we consider nonparametric regression estimation with martingale data employing local polynomial methods \citep{fan1996local}. In contrast with the partitioning-based series methods of Section~\ref{sec:yurinskii_series}, local polynomials induce stochastic processes which are not linearly separable, allowing us to showcase the empirical process result given in Proposition \ref{pro:yurinskii_emp_proc}. As before, suppose that $Y_i = \mu(W_i) + \varepsilon_i$ for $1 \leq i \leq n$, where $W_i$ has compact connected support $\cW \subseteq \R^m$, $\cH_i$ is the $\sigma$-algebra generated by $(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, $\E[\varepsilon_i \mid \cH_{i-1}] = 0$, and $\mu: \cW \to \R$ is the estimand. Let $K$ be a kernel function on $\R^m$ and $K_h(w) = h^{-m} K(w/h)$ for some bandwidth $h > 0$. Take a fixed polynomial order $\gamma \geq 0$ and let $k = (m+\gamma)!/(m!\gamma!)$ be the number of monomials up to order $\gamma$. Using multi-index notation, let $p(w)$ be the $k$-dimensional vector collecting the monomials $w^{\kappa}/\kappa!$ for $0 \leq |\kappa| \leq \gamma$, and set $p_h(w) = p(w/h)$.
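For instance (a simple instance of these definitions), with $m = 2$ regressors and polynomial order $\gamma = 2$, the multi-indices $\kappa$ with $|\kappa| \leq 2$ are $(0,0)$, $(1,0)$, $(0,1)$, $(2,0)$, $(1,1)$, and $(0,2)$, so that % \begin{align*} k = \frac{(m+\gamma)!}{m!\,\gamma!} = \frac{4!}{2!\,2!} = 6 \qquad\text{and}\qquad p(w) = \left( 1,\; w_1,\; w_2,\; \frac{w_1^2}{2},\; w_1 w_2,\; \frac{w_2^2}{2} \right)^\T. \end{align*} %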
The local polynomial regression estimator of $\mu(w)$ is, with $e_1 = (1, 0, \ldots, 0)^\T \in \R^k$ the first standard unit vector, % \begin{align*} \hat{\mu}(w) &= e_1^\T\hat{\beta}(w) &\text{where} & &\hat{\beta}(w) &= \argmin_{\beta \in \R^{k}} \sum_{i=1}^n \left(Y_i - p_h(W_i-w)^\T \beta \right)^2 K_h(W_i-w). \end{align*} Our goal is again to approximate the distribution of the entire stochastic process, $(\hat{\mu}(w)-\mu(w):w\in\cW)$, which upon rescaling is non-Donsker if $h \to 0$, and decomposes as follows: % \begin{align*} \hat{\mu}(w)-\mu(w) &= e_1^\T H(w)^{-1} S(w) + e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) + \Bias(w) \end{align*} % where $\hat H(w) = \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T$, $H(w) = \E \big[ \hat H(w) \big]$, $S(w)= \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i$, and $\Bias(w) = e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w)$. A key feature of local polynomial regression is that both $\hat H(w)$ and $S(w)$ are functions of the evaluation point $w\in\cW$; contrast this with the partitioning-based series estimator discussed in Section~\ref{sec:yurinskii_series}, for which neither $\hat H$ nor $S$ depend on $w$. Therefore we use Proposition \ref{pro:yurinskii_emp_proc} to obtain a Gaussian strong approximation for the martingale empirical process directly. Under mild regularity conditions, including stationarity for simplicity and an $\alpha$-mixing assumption on the time-dependence of the data, we show that $\sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 \lesssim_\P \sqrt{n h^{-2m}\log n}$. Further, $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P h^\gamma$ provided that the regression function is sufficiently smooth. It remains to analyze the martingale empirical process given by $\big(e_1^\T H(w)^{-1} S(w) : w\in\cW\big)$ via Proposition \ref{pro:yurinskii_emp_proc} by setting % \begin{align*} \cF = \left\{ (W_i, \varepsilon_i) \mapsto e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \varepsilon_i : w \in \cW \right\}. \end{align*} % With this approach, we obtain the following result. \begin{proposition}[Strong approximation for local polynomial estimators]% \label{pro:yurinskii_local_poly} Under the nonparametric regression setup described above, assume further that % \begin{enumerate}[label=(\roman*)] \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is strictly stationary. \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is $\alpha$-mixing with mixing coefficients $\alpha(j) \leq e^{-2 j / C_\alpha}$ for some $C_\alpha > 0$. \item $W_i$ has a Lebesgue density on $\cW$ which is bounded above and away from zero. \item $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ for some $C_\varepsilon > 0$ and $\E\left[\varepsilon^2_i \mid \cH_{i-1}\right]=\sigma^2(W_i)$ is bounded away from zero. \item $K$ is a non-negative Lipschitz compactly supported kernel with $\int K(w) \diff{w} = 1$. \end{enumerate} % Then for any $R_n \to \infty$, there is a zero-mean Gaussian process $T(w)$ on $\cW$ with $\Var[T(w)] \asymp\frac{1}{n h^m}$ satisfying $\Cov[T(w), T(w')] = \Cov[e_1^\T H(w)^{-1} S(w),\, e_1^\T H(w')^{-1} S(w')]$ and % \begin{align*} \sup_{w \in \cW} \left|\hat \mu(w) - \mu(w) - T(w) \right| &\lesssim_\P \frac{R_n}{\sqrt{n h^m}} \left( \frac{(\log n)^{m+4}}{n h^{3m}} \right)^{\frac{1}{2m+6}} + \sup_{w \in \cW} |\Bias(w)|, \end{align*} % provided that the bandwidth sequence satisfies $n h^{3m} \to \infty$.
% \end{proposition} If the residuals further satisfy $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$, then a third-order Yurinskii coupling delivers an improved rate of strong approximation for Proposition~\ref{pro:yurinskii_local_poly}; this is omitted here for brevity. For completeness, the proof of Proposition~\ref{pro:yurinskii_local_poly} verifies that if the regression function $\mu(w)$ is $\gamma$ times continuously differentiable on $\cW$ then $\sup_w |\Bias(w)| \lesssim_\P h^\gamma$. Further, the assumption that $p(w)$ is a vector of monomials is unnecessary in general; any collection of bounded linearly independent functions which exhibit appropriate approximation power will suffice \citep{eggermont2009maximum}. As such, we can encompass local splines and wavelets, as well as polynomials, and also choose whether or not to include interactions between the regressor variables. The bandwidth restriction of $n h^{3m} \to \infty$ is analogous to that imposed in Proposition~\ref{pro:yurinskii_series} for partitioning-based series estimators, and as far as we know, has not been improved upon for non-i.i.d.\ data. Applying an anti-concentration result for Gaussian process suprema, such as Corollary~2.1 in \citet{chernozhukov2014anti}, allows one to write a Kolmogorov--Smirnov bound comparing the law of $\sup_{w \in \cW}|\hat\mu(w) - \mu(w)|$ to that of $\sup_{w \in \cW}|T(w)|$. With an appropriate covariance estimator, we can further replace $T(w)$ by a feasible version $\hat T(w)$ or its Studentized counterpart, enabling procedures for uniform inference analogous to the confidence bands constructed in Section~\ref{sec:yurinskii_series}. We omit the details of this to conserve space but note that our assumptions on $W_i$ and $\varepsilon_i$ ensure that Studentization is possible even when the discretized covariance matrix has small eigenvalues (Section~\ref{sec:yurinskii_kde}), as we normalize only by the diagonal entries. \citet[Remark~3.1]{chernozhukov2014gaussian} achieve better rates for approximating the supremum of the $t$-process based on i.i.d.\ data in Kolmogorov--Smirnov distance by bypassing the step where we first approximate the entire stochastic process (see Section~\ref{sec:yurinskii_emp_proc} for a discussion). Nonetheless, our approach targeting the entire process allows for a potential future treatment of other functionals as well as the supremum. We finally remark that in this setting of kernel-based local empirical processes, it is essential that our initial strong approximation result (Corollary~\ref{cor:yurinskii_sa_martingale}) does not impose a lower bound on the eigenvalues of the variance matrix $\Sigma$. This effect was demonstrated by Lemma \ref{lem:yurinskii_kde_eigenvalue}, Figure~\ref{fig:yurinskii_min_eig}, and their surrounding discussion in Section~\ref{sec:yurinskii_kde}. As such, the result of \citet{li2020uniform} is unsuited for this application, even in its simplest formulation, due to the strong minimum eigenvalue assumption. \section{Conclusion} \label{sec:yurinskii_conclusion} In this chapter we introduced as our main result a new version of Yurinskii's coupling which strictly generalizes all previously known forms of the result. Our formulation gave a Gaussian mixture coupling for approximate martingale vectors in $\ell^p$-norm where $1 \leq p \leq \infty$, with no restrictions on the minimum eigenvalues of the associated covariance matrices. 
We further showed how to obtain an improved approximation whenever third moments of the data are negligible. We demonstrated the applicability of this main result by first deriving a user-friendly version, and then specializing it to mixingales, martingales, and independent data, illustrating the benefits with a collection of simple factor models. We then considered the problem of constructing uniform strong approximations for martingale empirical processes, demonstrating how our new Yurinskii coupling can be employed in a stochastic process setting. As substantive illustrative applications of our theory to some well-established problems in statistical methodology, we showed how to use our coupling results for both vector-valued and empirical process-valued martingales in developing uniform inference procedures for partitioning-based series estimators and local polynomial models in nonparametric regression. At each stage we addressed issues of feasibility, compared our work with the existing literature, and provided implementable statistical inference procedures. The work in this chapter is based on \citet{cattaneo2022yurinskii}. \appendix \chapter{Supplement to Inference with Mondrian Random Forests} \label{app:mondrian} In this section we present the full proofs of all our results, and also state some useful preliminary and intermediate technical lemmas, along with some further properties of the Mondrian process not required for our primary analysis. See Section~\ref{sec:mondrian_overview_proofs} in the main text for an overview of the main proof strategies and a discussion of the challenges involved. We use the following simplified notation for convenience, whenever it is appropriate. We write $\I_{i b}(x) = \I \left\{ X_i \in T_b(x) \right\}$ and $N_b(x) = \sum_{i=1}^{n} \I_{i b}(x)$, as well as $\I_b(x) = \I \left\{ N_b(x) \geq 1 \right\}$. \section{Preliminary lemmas} We begin by bounding the maximum size of any cell in a Mondrian forest containing $x$. This result is used regularly throughout many of our other proofs, and captures the ``localizing'' behavior of the Mondrian random forest estimator, showing that Mondrian cells have side lengths at most on the order of $1/\lambda$. \begin{lemma}[Upper bound on the largest cell in a Mondrian forest]% \label{lem:mondrian_app_largest_cell} % Let $T_1, \ldots, T_B \sim \cM\big([0,1]^d, \lambda\big)$ and take $x \in (0,1)^d$. Then for all $t > 0$ % \begin{align*} \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \geq \frac{t}{\lambda} \right) &\leq 2dB e^{-t/2}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell}] % We use the distribution of the Mondrian cell shape \citep[Proposition~1]{mourtada2020minimax}. We have $|T_b(x)_j| = \left( \frac{E_{bj1}}{\lambda} \wedge x_j \right) + \left( \frac{E_{bj2}}{\lambda} \wedge (1-x_j) \right)$ where $E_{bj1}$ and $E_{bj2}$ are i.i.d.\ $\Exp(1)$ variables for $1 \leq b \leq B$ and $1 \leq j \leq d$. Thus $|T_b(x)_j| \leq \frac{E_{bj1} + E_{bj2}}{\lambda}$ so by a union bound % \begin{align*} \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \geq \frac{t}{\lambda} \right) &\leq \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} (E_{bj1} \vee E_{bj2}) \geq \frac{t}{2} \right) \\ &\leq 2dB\, \P \left( E_{bj1} \geq \frac{t}{2} \right) \leq 2dB e^{-t/2}. \end{align*} % \end{proof} Next is another localization result, showing that the union of the cells $T_b(x)$ containing $x$ does not contain ``too many'' samples $X_i$.
Next is another localization result, showing that the union of the cells $T_b(x)$ containing $x$ does not contain ``too many'' samples $X_i$. Thus the Mondrian random forest estimator fitted at $x$ depends only on $n/\lambda^d$ (the effective sample size) data points, up to logarithmic terms.

\begin{lemma}[Upper bound on the number of active data points]%
\label{lem:mondrian_app_active_data}
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, and define $N_{\cup}(x) = \sum_{i=1}^{n} \I \left\{ X_i \in \bigcup_{b=1}^{B} T_b(x) \right\}$. Then for $t > 0$ and sufficiently large $n$, with $\|f\|_\infty = \sup_{x \in [0,1]^d} f(x)$,
%
\begin{align*}
\P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right)
&\leq 4 d B e^{-t/4}.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_active_data}]
Note $N_\cup(x) \sim \Bin\left(n, \int_{\bigcup_{b=1}^{B} T_b(x)} f(s) \diff s \right) \leq \Bin\left(n, 2^d \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j|^d \|f\|_\infty \right)$ conditionally on $\bT$. If $N \sim \Bin(n,p)$ then, by Bernstein's inequality, $\P\left( N \geq (1 + t) n p\right) \leq \exp\left(-\frac{t^2 n^2 p^2 / 2}{n p(1-p) + t n p / 3}\right) \leq \exp\left(-\frac{3t^2 n p}{6 + 2t}\right)$. Thus for $t \geq 2$,
%
\begin{align*}
\P \left( N_{\cup}(x) > (1+t) n \frac{2^d t^d}{\lambda^d} \|f\|_\infty \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \leq \frac{t}{\lambda} \right)
&\leq \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right).
\end{align*}
%
By Lemma~\ref{lem:mondrian_app_largest_cell}, $\P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| > \frac{t}{\lambda} \right) \leq 2 d B e^{-t/2}$. Hence
%
\begin{align*}
&\P \left( N_{\cup}(x) > 2^{d+1} t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) \\
&\quad\leq \P \left( N_{\cup}(x) > 2 t n \frac{2^d t^d}{\lambda^d} \|f\|_\infty \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \leq \frac{t}{\lambda} \right) + \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| > \frac{t}{\lambda} \right) \\
&\quad\leq \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right) + 2 d B e^{-t/2}.
\end{align*}
%
Replacing $t$ by $t/2$ gives that for sufficiently large $n$ such that $n / \lambda^d \geq 1$,
%
\begin{align*}
\P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right)
&\leq 4 d B e^{-t/4}.
\end{align*}
%
\end{proof}

Next we give a series of results culminating in a generalized moment bound for the denominator appearing in the Mondrian random forest estimator. We begin by providing a moment bound for the truncated inverse binomial distribution, which will be useful for controlling $\frac{\I_b(x)}{N_b(x)} \leq 1 \wedge \frac{1}{N_b(x)}$ because, conditional on $T_b$, we have $N_b(x) \sim \Bin \left( n, \int_{T_b(x)} f(s) \diff s \right)$. Our constants could be significantly suboptimal but they are sufficient for our applications.

\begin{lemma}[An inverse moment bound for the binomial distribution]%
\label{lem:mondrian_app_binomial_bound}
For $n \geq 1$ and $p \in [0,1]$, let $N \sim \Bin(n, p)$ and $a_1, \ldots, a_k \geq 0$. Then
%
\begin{align*}
\E\left[ \prod_{j=1}^k \left( 1 \wedge \frac{1}{N + a_j} \right) \right]
&\leq (9k)^k \prod_{j=1}^k \left( 1 \wedge \frac{1}{n p + a_j} \right).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_bound}]
By Bernstein's inequality, $\P\left( N \leq n p - t \right) \leq \exp\left(-\frac{t^2/2}{n p(1-p) + t/3}\right) \leq \exp\left(-\frac{3t^2}{6n p + 2t}\right)$. Therefore we have $\P\left( N \leq n p/4 \right) \leq \exp\left(-\frac{27 n^2 p^2 / 16}{6n p + 3 n p / 2}\right) = e^{-9 n p / 40}$.
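To clarify the partitioning step which follows, note that on the event $\{ N \leq n p / 4 \}$ each factor $1 \wedge \frac{1}{N + a_j}$ is at most $\frac{1}{1 \vee a_j}$, while on the complement we may replace $N$ by $n p / 4$. In the simplest case $k = 1$ this reads
%
\begin{align*}
\E\left[ 1 \wedge \frac{1}{N + a_1} \right]
&\leq \P \left( N \leq \frac{n p}{4} \right) \frac{1}{1 \vee a_1}
+ \frac{1}{1 \vee \left( \frac{n p}{4} + a_1 \right)}
\leq \frac{e^{-9 n p / 40}}{1 \vee a_1}
+ \frac{1}{1 \vee \left( \frac{n p}{4} + a_1 \right)};
\end{align*}
%
the general case is the first inequality in the display below.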
Partitioning by this event gives
%
\begin{align*}
\E\left[ \prod_{j=1}^k \left( 1 \wedge \frac{1}{N + a_j} \right) \right]
&\leq e^{-9 n p / 40} \prod_{j=1}^k \frac{1}{1 \vee a_j} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\
&\leq \prod_{j=1}^k \frac{1}{\frac{9 n p}{40k} + (1 \vee a_j)} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\
&\leq \prod_{j=1}^k \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\
&\leq 2 \prod_{j=1}^k \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)}
\leq 2 \prod_{j=1}^k \frac{40k/9}{1 \vee \left(n p + a_j\right)} \\
&\leq (9k)^k \prod_{j=1}^k \left( 1 \wedge \frac{1}{n p + a_j} \right).
\end{align*}
\end{proof}

Our next result is probably the most technically involved, allowing one to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ by the corresponding moments of (products of) $\frac{1}{n |T_b(x)|}$, again based on the heuristic that $N_b(x)$ is conditionally binomial so concentrates around its conditional expectation $n \int_{T_b(x)} f(s) \diff s \asymp n |T_b(x)|$. By independence of the trees, the latter expected products then factorize since the dependence on the data $X_i$ has been eliminated. The proof is complicated, and relies on the following induction procedure. First we consider the common refinement consisting of the subcells $\cR$ generated by all possible intersections of $T_b(x)$ over the selected trees (say $T_{b}(x), T_{b'}(x), T_{b''}(x)$, though there could be arbitrarily many). Note that $N_b(x)$ is the sum of the number of samples $X_i$ in each such subcell in $\cR$. We then apply Lemma~\ref{lem:mondrian_app_binomial_bound} repeatedly to each subcell in $\cR$ in turn, replacing the number of samples $X_i$ in that subcell with its volume multiplied by $n$, and controlling the error incurred at each step. We record the subcells which have been ``checked'' in this manner using the class $\cD \subseteq \cR$ and proceed by finite induction, beginning with $\cD = \emptyset$ and ending at $\cD = \cR$.

\begin{lemma}[Generalized moment bound for Mondrian random forest denominators]%
\label{lem:mondrian_app_moment_denominator}
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ be independent and $k_b \geq 1$ for $1 \leq b \leq B_0$. Then with $k = \sum_{b=1}^{B_0} k_b$, for sufficiently large $n$,
%
\begin{align*}
\E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right]
&\leq \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} \prod_{b=1}^{B_0} \E \left[ 1 \wedge \frac{1}{(n |T_b(x)|)^{k_b}} \right].
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_denominator}]
Define the common refinement of $\left\{ T_b(x) : 1 \leq b \leq {B_0} \right\}$ as the class of sets
%
\begin{align*}
\cR &= \left\{ \bigcap_{b=1}^{B_0} D_b : D_b \in \big\{ T_b(x), T_b(x)^{\comp} \big\} \right\} \bigsetminus \left\{ \emptyset,\, \bigcap_{b=1}^{B_0} T_b(x)^\comp \right\}
\end{align*}
%
and let $\cD \subseteq \cR$. We will proceed by induction on the elements of $\cD$, which represents the subcells we have checked, starting from $\cD = \emptyset$ and finishing at $\cD = \cR$.
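For example, when $B_0 = 2$ the refinement consists of the non-empty members of
%
\begin{align*}
\left\{ T_1(x) \cap T_2(x),\ T_1(x) \cap T_2(x)^{\comp},\ T_1(x)^{\comp} \cap T_2(x) \right\},
\end{align*}
%
and the number of samples in $T_1(x)$ splits as the number in $T_1(x) \cap T_2(x)$ plus the number in $T_1(x) \cap T_2(x)^{\comp}$.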
For $D \in \cR$ let $\cA(D) = \left\{ 1 \leq b \leq {B_0} : D \subseteq T_b(x) \right\}$ be the indices of the trees which are active on subcell $D$, and for $1 \leq b \leq {B_0}$ let $\cA(b) = \left\{ D \in \cR : D \subseteq T_b(x) \right\}$ be the subcells which are contained in $T_b(x)$, so that $b \in \cA(D) \iff D \in \cA(b)$. For a subcell $D \in \cR$, write $N_b(D) = \sum_{i=1}^{n} \I \left\{ X_i \in D \right\}$ so that $N_b(x) = \sum_{D \in \cA(b)} N_b(D)$. Note that for any $D \in \cR \setminus \cD$, % \begin{align*} &\E \left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right] \\ &\quad= \E \left[ \prod_{b \notin \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right. \\ &\left. \qquad \times\,\E\left[ \prod_{b \in \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right.\right. \\ &\left.\left. \quad\qquad\qquad\biggm| \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right] \right]. \end{align*} % Now the inner conditional expectation is over $N_b(D)$ only. Since $f$ is bounded away from zero, % \begin{align*} N_b(D) &\sim \Bin\left( n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ \frac{\int_{D} f(s) \diff s} {1 - \int_{\bigcup \left( \cR \setminus \cD \right) \setminus D} f(s) \diff s} \right) \\ &\geq \Bin\left( n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ |D| \inf_{x \in [0,1]^d} f(x) \right) \end{align*} % conditional on $\bT$ and $N_b(D') : D' \in \cR \setminus (\cD \cup \{D\})$. For sufficiently large $t$ by Lemma~\ref{lem:mondrian_app_active_data} % \begin{align*} \P \left( \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D') > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) &\leq \P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) \leq 4 d B_0 e^{-t/4}. \end{align*} % Thus $N_b(D) \geq \Bin(n/2, |D| \inf_x f(x))$ conditional on $\left\{ \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right\}$ with probability at least $1 - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}$. So by Lemma~\ref{lem:mondrian_app_binomial_bound}, % \begin{align*} &\E \Bigg[ \prod_{b \in \cA(D)} \! \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \biggm| \! \bT, N_b(D')\! : D' \in \cR \setminus \! (\cD \cup \{D\}) \Bigg] \\ &\quad\leq \E \! \left[ \prod_{b \in \cA(D)} \frac{(9k)^{k_b}}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n |D| \inf_x f(x) / 2 + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b}} \right] \\ &\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \\ &\quad\leq \left( \frac{18k}{\inf_x f(x)} \right)^k \! \E \! \left[ \prod_{b \in \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} |D'| \right)^{k_b}} \right] \\ &\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. 
\end{align*}
%
Therefore plugging this back into the marginal expectation yields
%
\begin{align*}
&\E\left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right] \\
&\quad\leq \left( \frac{18k}{\inf_x f(x)} \right)^k \E \left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} |D'| \right)^{k_b}} \right] \\
&\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}.
\end{align*}
%
Now we apply induction, starting with $\cD = \emptyset$ and adding $D \in \cR \setminus \cD$ to $\cD$ until $\cD = \cR$. This takes at most $|\cR| \leq 2^{B_0}$ steps and yields
%
\begin{align*}
\E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right]
&\leq \E\left[ \prod_{b=1}^{B_0} \frac{1}{1 \vee N_b(x)^{k_b}} \right]
= \E\left[ \prod_{b=1}^{B_0} \frac{1}{1 \vee \left( \sum_{D \in \cA(b)} N_b(D) \right)^{k_b}} \right] \leq \cdots \\
&\leq \left( \frac{18k}{\inf_x f(x)} \right)^{2^{B_0} k} \left( \prod_{b=1}^{B_0} \,\E \left[ \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} \right] + 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \right),
\end{align*}
%
where the expectation factorizes due to independence of $T_b(x)$. The last step is to remove the trailing exponential term. To do this, note that by Jensen's inequality,
%
\begin{align*}
\prod_{b=1}^{B_0} \,\E \left[ \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} \right]
&\geq \prod_{b=1}^{B_0} \frac{1}{\E \left[ 1 \vee (n |T_b(x)|)^{k_b} \right]}
\geq \prod_{b=1}^{B_0} \frac{1}{n^{k_b}}
= n^{-k}
\geq 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}
\end{align*}
%
for sufficiently large $n$ because $B_0$, $d$, and $k$ are fixed while $\log \lambda \gtrsim \log n$.
\end{proof}

Now that moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ have been bounded by moments of (products of) $\frac{1}{n |T_b(x)|}$, we establish further explicit bounds for these in the next result. Note that the problem has been reduced to determining properties of Mondrian cells, so once again we return to the exact cell shape distribution given by \citet{mourtada2020minimax}, and evaluate the appropriate expectations by integration. Note that the truncation by taking the minimum with one inside the expectation is essential here, as otherwise the second moment of the inverse Mondrian cell volume is not even finite. As such, there is a ``penalty'' of $\log n$ when bounding truncated second moments, and the upper bound for the $k$th moment is significantly larger than the naive prediction of $(\lambda^d / n)^k$ whenever $k \geq 3$. This ``small cell'' phenomenon, in which the inverse volumes of Mondrian cells have heavy tails, is a recurring challenge.

\begin{lemma}[Inverse moments of the volume of a Mondrian cell]%
\label{lem:mondrian_app_moment_cell}
Suppose Assumption~\ref{ass:mondrian_estimator} holds and let $T \sim \cM\big([0,1]^d, \lambda\big)$. Then for sufficiently large $n$,
%
\begin{align*}
\E\left[ 1 \wedge \frac{1}{(n |T(x)|)^k} \right]
&\leq \left( \frac{\lambda^d}{n} \right)^{\I \left\{ k = 1 \right\}} \left( \frac{3 \lambda^{2d} \log n}{n^2} \right)^{\I \left\{ k \geq 2 \right\}} \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)}.
\end{align*}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_moment_cell}]
By \citet[Proposition~1]{mourtada2020minimax}, $|T(x)| = \prod_{j=1}^{d} \left( \left(\frac{1}{\lambda} E_{j1} \right) \wedge x_j + \left( \frac{1}{\lambda} E_{j2} \right) \wedge (1-x_j) \right)$ where $E_{j1}$ and $E_{j2}$ are mutually independent $\Exp(1)$ random variables. Thus, for a truncation point $t \in (0, 1)$, integrating against the joint density of these exponential variables bounds $\E \big[ \frac{1}{1 \vee (n |T(x)|)^k} \big]$ for $k \in \{1, 2\}$ in terms of $t$, while for $k > 2$ we use $\frac{1}{1 \vee (n |T(x)|)^k} \leq \frac{1}{1 \vee (n |T(x)|)^{k-1}}$ to reduce $k$. Now if $k = 1$ we let $t \to 0$, giving
%
\begin{align*}
\E \left[ \frac{1}{1 \vee (n |T(x)|)} \right]
&\leq \frac{\lambda^d}{n} \prod_{j=1}^d \frac{1}{x_j(1-x_j)},
\end{align*}
%
and if $k = 2$ then we set $t = 1/n^2$ so that for sufficiently large $n$,
%
\begin{align*}
\E \left[ \frac{1}{1 \vee (n |T(x)|)^2} \right]
&\leq \frac{d}{n^2} + \frac{2 \lambda^{2d} \log n}{n^2} \prod_{j=1}^d \frac{1}{x_j(1-x_j)}
\leq \frac{3 \lambda^{2d} \log n}{n^2} \prod_{j=1}^d \frac{1}{x_j(1-x_j)}.
\end{align*}
%
Lower bounds which match up to constants for the first moment and up to logarithmic terms for the second moment are obtained as $\E \left[ 1 \wedge \frac{1}{(n|T(x)|)^2} \right] \geq \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]^2$ by Jensen's inequality, and
%
\begin{align*}
\E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]
&\geq \frac{1}{1 + n \E \left[ |T(x)| \right]}
\geq \frac{1}{1 + 2^d n / \lambda^d}
\gtrsim \frac{\lambda^d}{n}.
\end{align*}
\end{proof}

The endeavor to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ is concluded with the next result, combining the previous two lemmas to give a bound without expectations on the right-hand side.

\begin{lemma}[Simplified generalized moment bound for Mondrian forest denominators]%
\label{lem:mondrian_app_simple_moment_denominator}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ and $k_b \geq 1$ for $1 \leq b \leq B_0$. Then with $k = \sum_{b=1}^{B_0} k_b$,
%
\begin{align*}
&\E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right] \\
&\quad\leq \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} \left( \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)} \right)^{B_0} \prod_{b=1}^{B_0} \left( \frac{\lambda^d}{n} \right)^{\I \left\{ k_b = 1 \right\}} \left( \frac{\lambda^{2d} \log n}{n^2} \right)^{\I \left\{ k_b \geq 2 \right\}}
\end{align*}
%
for sufficiently large $n$.
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_simple_moment_denominator}]
This follows directly from Lemmas~\ref{lem:mondrian_app_moment_denominator} and \ref{lem:mondrian_app_moment_cell}.
\end{proof}

Our final preliminary lemma is concerned with further properties of the inverse truncated binomial distribution, again with the aim of analyzing $\frac{\I_b(x)}{N_b(x)}$. This time, instead of merely upper bounding the moments, we aim to give convergence results for those moments, again in terms of moments of $\frac{1}{n |T_b(x)|}$. Here we only need to handle the first and second moments, so this result does not strictly generalize Lemma~\ref{lem:mondrian_app_binomial_bound} except in simple cases. The proof is by Taylor's theorem and the Cauchy--Schwarz inequality, using explicit expressions for moments of the binomial distribution and bounds from Lemma~\ref{lem:mondrian_app_binomial_bound}.

\begin{lemma}[Expectation inequalities for the binomial distribution]%
\label{lem:mondrian_app_binomial_expectation}
Let $N \sim \Bin(n, p)$ and take $a, b \geq 1$.
Then
%
\begin{align*}
0 &\leq \E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a} \leq \frac{2^{19}}{(n p+a)^2}, \\
0 &\leq \E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)} \leq \frac{2^{27}}{(n p +a)(n p +b)} \left( \frac{1}{n p + a} + \frac{1}{n p + b} \right).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_expectation}]
For the first result, Taylor's theorem with Lagrange remainder for $N \mapsto \frac{1}{N+a}$ around $n p$ gives
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right]
&= \E \left[ \frac{1}{n p+a} - \frac{N - n p}{(n p+a)^2} + \frac{(N - n p)^2}{(\xi+a)^3} \right]
\end{align*}
%
for some $\xi$ between $n p$ and $N$. The second term in the expectation is zero-mean, showing the non-negativity part, and the Cauchy--Schwarz inequality for the remaining term gives
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a}
&\leq \E \left[ \frac{(N - n p)^2}{(n p+a)^3} + \frac{(N - n p)^2}{(N+a)^3} \right] \\
&\leq \frac{\E\big[(N - n p)^2\big]}{(n p+a)^3} + \sqrt{ \E\big[(N - n p)^4\big] \E \left[ \frac{1}{(N+a)^6} \right]}.
\end{align*}
%
Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a}
&\leq \frac{n p}{(n p+a)^3} + \sqrt{\frac{54^6 n p(1+3 n p)}{(n p + a)^6}}
\leq \frac{2^{19}}{(n p+a)^2}.
\end{align*}
%
For the second result, Taylor's theorem applied to $N \mapsto \frac{1}{(N+a)(N+b)}$ around $n p$ gives
%
\begin{align*}
\E \left[ \frac{1}{(N+a)(N+b)} \right]
&= \E \left[ \frac{1}{(n p+a)(n p + b)} - \frac{(N - n p)(2 n p + a + b)}{(n p + a)^2 (n p + b)^2} \right] \\
&\quad+ \E \left[ \frac{(N - n p)^2}{(\xi+a)(\xi+b)} \left( \frac{1}{(\xi + a)^2} + \frac{1}{(\xi + a)(\xi + b)} + \frac{1}{(\xi + b)^2} \right) \right]
\end{align*}
%
for some $\xi$ between $n p$ and $N$. The second term on the right is zero-mean, showing non-negativity, and applying the Cauchy--Schwarz inequality to the remaining term gives
%
\begin{align*}
&\E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)} \\
&\quad\leq \E \left[ \frac{2 (N - n p)^2}{(N+a)(N+b)} \left( \frac{1}{(N + a)^2} + \frac{1}{(N + b)^2} \right) \right] \\
&\qquad+ \E \left[ \frac{2 (N - n p)^2}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right) \right] \\
&\quad\leq \sqrt{ 4 \E \left[ (N - n p)^4 \right] \E \left[ \frac{1}{(N + a)^6 (N+b)^2} + \frac{1}{(N + b)^6 (N+a)^2} \right]} \\
&\qquad+ \frac{2 \E\big[(N - n p)^2\big]}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right).
\end{align*}
%
Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
%
\begin{align*}
\E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)}
&\leq \sqrt{ \frac{4n p (1 + 3n p) \cdot 72^8}{(n p + a)^2 (n p + b)^2} \left( \frac{1}{(n p + a)^4} + \frac{1}{(n p + b)^4} \right)} \\
&\quad+ \frac{2 n p}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right) \\
&\leq \frac{2^{27}}{(n p + a) (n p + b)} \left( \frac{1}{n p + a} + \frac{1}{n p + b} \right).
\end{align*}
%
\end{proof}

\section{Proofs of main results}
\label{sec:mondrian_app_proofs}

\subsection{Mondrian random forests}

We give rigorous proofs of the central limit theorem, bias characterization, and variance estimation results for the Mondrian random forest estimator without debiasing.
See Section~\ref{sec:mondrian_overview_proofs} in the main text for details on our approaches to these proofs.

\begin{proof}[Theorem~\ref{thm:mondrian_clt}]
This follows from the debiased version (Theorem~\ref{thm:mondrian_clt_debiased}) with $J=0$, $a_0 = 1$, and $\omega_0 = 1$.
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_bias}]

\proofparagraph{removing the dependence on the trees}

By measurability and with $\mu(X_i) = \E[Y_i \mid X_i]$ almost surely,
%
\begin{align*}
\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x)
&= \frac{1}{B} \sum_{b=1}^B \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)}.
\end{align*}
%
Conditional on $\bX$, the terms in the outer sum depend only on $T_b$ and so are i.i.d. As $\mu$ is Lipschitz,
%
\begin{align*}
&\Var \big[ \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x) \mid \bX \big]
\leq \frac{1}{B} \E \left[ \left( \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)} \right)^2 \Bigm| \bX \right] \\
&\quad\lesssim \frac{1}{B} \E \left[ \max_{1 \leq i \leq n} \big\| X_i - x \big\|_2^2 \left( \sum_{i=1}^n \frac{\I_{i b}(x)}{N_b(x)} \right)^2 \Bigm| \bX \right]
\lesssim \frac{1}{B} \sum_{j=1}^{d} \E \left[ |T(x)_j|^2 \right]
\lesssim \frac{1}{\lambda^2 B},
\end{align*}
%
using the law of $T(x)_j$ from \citet[Proposition~1]{mourtada2020minimax}. By Chebyshev's inequality,
%
\begin{align*}
\big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \mid \bX \right] \big|
&\lesssim_\P \frac{1}{\lambda \sqrt B}.
\end{align*}

\proofparagraph{showing the conditional bias converges in probability}

Now $\E \left[ \hat\mu(x) \mid \bX \right]$ is a non-linear function of the i.i.d.\ random variables $X_i$, so we use the Efron--Stein inequality \citep{efron1981jackknife} to bound its variance. Let $\tilde X_{i j} = X_i$ if $i \neq j$ and be an independent copy of $X_j$, denoted $\tilde X_j$, if $i = j$. Write $\tilde \bX_j = (\tilde X_{1j}, \ldots, \tilde X_{n j})$ and similarly $\tilde \I_{i j b}(x) = \I \big\{ \tilde X_{i j} \in T_b(x) \big\}$ and $\tilde N_{j b}(x) = \sum_{i=1}^{n} \tilde \I_{i j b}(x)$.
%
\begin{align}
\nonumber
&\Var \left[ \sum_{i=1}^{n} \big( \mu(X_i) - \mu(x) \big) \E \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] \right] \\
\nonumber
&\quad\leq \frac{1}{2} \sum_{j=1}^{n} \E \! \left[ \! \left( \sum_{i=1}^{n} \big( \mu(X_i) - \mu(x) \big) \E \! \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] - \sum_{i=1}^{n} \left( \mu(\tilde X_{i j}) - \mu(x) \right) \E \! \left[ \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} \Bigm| \tilde \bX_j \right] \right)^{\! \! 2} \right] \\
\nonumber
&\quad\leq \frac{1}{2} \sum_{j=1}^{n} \E \left[ \left( \sum_{i=1}^{n} \left( \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)} - \left( \mu(\tilde X_{i j}) - \mu(x) \right) \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} \right) \right)^2 \right] \\
\nonumber
&\quad\leq \sum_{j=1}^{n} \E \left[ \left( \sum_{i \neq j} \big( \mu(X_i) - \mu(x) \big) \left( \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} \right) \right)^{\!\!2} \, \right] \\
\label{eq:mondrian_app_bias_efron_stein}
&\qquad+ 2 \sum_{j=1}^{n} \E \left[ \left( \mu(X_j) - \mu(x) \right)^2 \frac{\I_{j b}(x)}{N_b(x)^2} \right].
\end{align}
%
For the first term in \eqref{eq:mondrian_app_bias_efron_stein} to be non-zero, we must have $|N_b(x) - \tilde N_{j b}(x)| = 1$. Writing $N_{-j b}(x) = \sum_{i \neq j} \I_{i b}(x)$, assume by symmetry that $\tilde N_{j b}(x) = N_{-j b}(x)$ and $N_b(x) = N_{-j b}(x) + 1$, and $\I_{j b}(x) = 1$.
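Indeed, in this configuration the change in each summand can be computed exactly: for $i \neq j$ with $\I_{i b}(x) = 1$ we have $N_{-j b}(x) \geq 1$, and
%
\begin{align*}
\frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)}
&= \I_{i b}(x) \left( \frac{1}{N_{-j b}(x) + 1} - \frac{1}{N_{-j b}(x)} \right)
= - \frac{\I_{i b}(x)}{N_{-j b}(x) \left( N_{-j b}(x) + 1 \right)},
\end{align*}
%
which is the ratio appearing in the next bound.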
As $f$ is bounded and $\mu$ is Lipschitz, writing $\I_{-j b}(x) = \I \left\{ N_{-j b}(x) \geq 1 \right\}$, % \begin{align*} &\sum_{j=1}^{n} \E \left[ \left( \sum_{i \neq j} \left( \mu(X_i) - \mu(x) \right) \left( \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} \right) \right)^{\! 2} \, \right] \\ &\quad\lesssim \sum_{j=1}^{n} \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \left( \frac{\sum_{i \neq j}\I_{i b}(x) \I_{j b}(x)} {N_{-j b}(x)(N_{-j b}(x) + 1)} \right)^2 \right] \lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \frac{\I_{b}(x)}{N_{b}(x)} \right]. \end{align*} % For $t > 0$, partition by $\left\{ \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right\}$ and apply Lemma~\ref{lem:mondrian_app_largest_cell} and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}: % \begin{align*} \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \frac{\I_{b}(x)}{N_{b}(x)} \right] &\leq \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right) + (t / \lambda)^2\, \E \left[ \frac{\I_{b}(x)}{N_{b}(x)} \right] \\ &\lesssim e^{-t/2} + \left( \frac{t}{\lambda} \right)^2 \frac{\lambda^d}{n} \lesssim \frac{1}{n^2} + \frac{(\log n)^2}{\lambda^2} \frac{\lambda^d}{n} \lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n}, \end{align*} % where we set $t = 4 \log n$. For the second term in \eqref{eq:mondrian_app_bias_efron_stein} we have % \begin{align*} \sum_{j=1}^{n} \E \left[ \left( \mu(X_j) - \mu(x) \right)^2 \frac{\I_{j b}(x)}{N_b(x)^2} \right] &\lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^{2} \frac{\I_{b}(x)}{N_b(x)} \right] \lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n} \end{align*} % in the same manner. Hence % \begin{align*} \Var \left[ \sum_{i=1}^{n} \left( \mu(X_i) - \mu(x) \right) \E \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] \right] &\lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n}, \end{align*} % and so by Chebyshev's inequality, % \begin{align*} \big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \right] \big| &\lesssim_\P \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{ \frac{\lambda^{d}}{n} }. \end{align*} \proofparagraph{computing the limiting bias} It remains to compute the limit of $\E \left[ \hat \mu(x) \right] - \mu(x)$. Let $\bX_{-i} = (X_1, \ldots, X_{i-1}, X_{i+1}, \ldots, X_n)$ and $N_{-i b}(x) = \sum_{j=1}^n \I\{j \neq i\} \I\{X_j \in T_b(x)\}$. Then % \begin{align*} &\E \left[ \hat \mu(x) \right] - \mu(x) = \E \left[ \sum_{i=1}^{n} \left( \mu(X_i) - \mu(x) \right) \frac{\I_{i b}(x)}{N_b(x)} \right] \\ &\quad= \sum_{i=1}^{n} \E \left[ \E \left[ \frac{\left( \mu(X_i) - \mu(x) \right)\I_{i b}(x)} {N_{-i b}(x) + 1} \bigm| \bT, \bX_{-i} \right] \right] = n \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {N_{-i b}(x) + 1} \right]. \end{align*} % By Lemma~\ref{lem:mondrian_app_binomial_expectation}, as $N_{-i b}(x) \sim \Bin\left(n-1, \int_{T_b(x)} f(s) \diff s \right)$ given $\bT$ and $f$ is bounded below, % \begin{align*} \left| \E \! \left[ \frac{1}{N_{-i b}(x) + 1} \Bigm| \bT \right] - \frac{1}{(n-1) \! \int_{T_b(x)} \! f(s) \diff s + 1} \right| &\lesssim \frac{1}{n^2 \! \left( \int_{T_b(x)} f(s) \diff s \right)^2} \wedge 1 \lesssim \frac{1}{n^2 |T_b(x)|^2} \wedge 1, \end{align*} % and also % \begin{align*} \left| \frac{1}{(n-1) \int_{T_b(x)} f(s) \diff s + 1} - \frac{1}{n \int_{T_b(x)} f(s) \diff s} \right| &\lesssim \frac{1}{n^2 \left( \int_{T_b(x)} f(s) \diff s\right)^2} \wedge 1 \lesssim \frac{1}{n^2 |T_b(x)|^2} \wedge 1. 
\end{align*} % So by Lemmas~\ref{lem:mondrian_app_largest_cell} and \ref{lem:mondrian_app_moment_cell}, since $f$ is Lipschitz and bounded, using Cauchy--Schwarz, % \begin{align*} &\left| \E \left[ \hat \mu(x) \right] - \mu(x) - \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {\int_{T_b(x)} f(s) \diff s} \right] \right| \lesssim \E \left[ \frac{n \int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {n^2 |T_b(x)|^2 \vee 1} \right] \\ &\qquad\lesssim \E \left[ \frac{\max_{1 \leq l \leq d} |T_b(x)_l| } {n |T_b(x)| \vee 1} \right] \\ &\qquad\lesssim \frac{2 \log n}{\lambda} \, \E \left[ \frac{1}{n |T_b(x)| \vee 1} \right] + \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| > \frac{2 \log n}{\lambda} \right)^{1/2} \E \left[ \frac{1} {n^2 |T_b(x)|^2 \vee 1} \right]^{1/2} \\ &\qquad\lesssim \frac{\log n}{\lambda} \, \frac{\lambda^d}{n} + \frac{d}{n} \frac{\lambda^d \sqrt{\log n}}{n} \lesssim \frac{\log n}{\lambda} \, \frac{\lambda^d}{n}. \end{align*} % Next set $A = \frac{1}{f(x) |T_b(x)|} \int_{T_b(x)} (f(s) - f(x)) \diff s \geq \inf_{s \in [0,1]^d} \frac{f(s)}{f(x)} - 1$. Use the Maclaurin series of $\frac{1}{1+x}$ up to order $\flbeta$ to see $\frac{1}{1 + A} = \sum_{k=0}^{\flbeta} (-1)^k A^k + O \left( A^{\flbeta + 1} \right)$. Hence % \begin{align*} &\E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {\int_{T_b(x)} f(s) \diff s} \right] = \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x) |{T_b(x)}|} \frac{1}{1 + A} \right] \\ &\quad= \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x) |{T_b(x)}|} \left( \sum_{k=0}^{\flbeta} (-1)^k A^k + O \left( |A|^{\flbeta + 1} \right) \right) \right]. \end{align*} % Note that since $f$ and $\mu$ are Lipschitz, and by integrating the tail probability given in Lemma~\ref{lem:mondrian_app_largest_cell}, the Maclaurin remainder term is bounded by % \begin{align*} &\E \left[ \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {f(x) |{T_b(x)}|} |A|^{\flbeta + 1} \right] \\ &\qquad= \E \left[ \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {f(x) |{T_b(x)}|} \left( \frac{1}{f(x) |{T_b(x)}|} \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^{\flbeta + 1} \right] \\ &\qquad\lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^{\flbeta+2} \right] = \int_{0}^{\infty} \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| \geq t^{\frac{1}{\flbeta+2}} \right) \diff t \leq \int_{0}^{\infty} 2 d e^{- \lambda t^{\frac{1}{\flbeta+2}} / 2} \diff t \\ &\qquad= \frac{2^{\flbeta + 3} d (\flbeta + 2)! } {\lambda^{\flbeta + 2}} \lesssim \frac{1}{\lambda^{\beta}}, \end{align*} % since $\int_0^\infty e^{-a x^{1/k}} \diff x = a^{-k} k!$. To summarize the progress so far, we have % \begin{align*} &\left| \E \left[ \hat \mu(x) \right] - \mu(x) - \sum_{k=0}^{\flbeta} (-1)^k \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x)^{k+1} |T_b(x)|^{k+1}} \left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k \right] \right| \\ &\qquad\lesssim \frac{\log n}{\lambda} \frac{\lambda^d}{n} + \frac{1}{\lambda^\beta}. \end{align*} % We evaluate the expectation. By Taylor's theorem, with $\nu$ a multi-index, as $f \in \cH^\beta$, % \begin{align*} \left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k &= \left( \sum_{|\nu| = 1}^\flbeta \frac{\partial^\nu f(x)}{\nu !} \! \int_{T_b(x)} \!\! (s - x)^\nu \diff s \right)^k + O \! \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right). 
\end{align*} % Next, by the multinomial theorem with a multi-index $u$ indexed by $\nu$ with $|\nu| \geq 1$, % \begin{align*} \left( \sum_{|\nu| = 1}^\flbeta \frac{\partial^\nu f(x)}{\nu !} \int_{T_b(x)} (s - x)^\nu \diff s \right)^k &= \sum_{|u| = k} \binom{k}{u} \left( \frac{\partial^\nu f(x)}{\nu !} \int_{T_b(x)} (s-x)^\nu \diff s \right)^u \end{align*} % where $\binom{k}{u}$ is a multinomial coefficient. By Taylor's theorem with $f, \mu \in \cH^\beta$, % \begin{align*} &\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s \\ &\quad= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \int_{T_b(x)} (s-x)^{\nu' + \nu''} \diff s + O \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right). \end{align*} % Now by integrating the tail probabilities in Lemma~\ref{lem:mondrian_app_largest_cell}, $ \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right] \lesssim \frac{1}{\lambda^\beta}$. Therefore, by Lemma~\ref{lem:mondrian_app_moment_cell}, writing $T_b(x)^\nu$ for $\int_{T_b(x)} (s-x)^\nu \diff s$, % \begin{align*} &\sum_{k=0}^{\flbeta} (-1)^k \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x)^{k+1} |T_b(x)|^{k+1}} \left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k \right] \\ &\,= \! \sum_{k=0}^{\flbeta} (-1)^k \, \E \! \left[ \! \frac{ \sum_{|\nu'|=1}^{\flbeta} \! \sum_{|\nu''|=0}^{\flbeta} \! \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} T_b(x)^{\nu' + \nu''\!\!\!} }{f(x)^{k+1} |T_b(x)|^{k+1}} \!\! \sum_{|u| = k} \! \binom{k}{u} \!\! \left( \frac{\partial^\nu f(x)}{\nu !} T_b(x)^\nu \right)^{\!\! u} \right] \! + O \! \left( \frac{1}{\lambda^\beta} \right) \\ &\,= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \sum_{|u|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u \binom{|u|}{u} \frac{(-1)^{|u|}}{f(x)^{|u|+1}} \E \left[ \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} \right] \\ &\quad+ O \left( \frac{1}{\lambda^\beta} \right) . \end{align*} % We show this is a polynomial in $1/\lambda$. For $1 \leq j \leq d$, define $E_{1j*} \sim \Exp(1) \wedge (\lambda x_j)$ and $E_{2j*} \sim \Exp(1) \wedge (\lambda (1-x_j))$ independent so $T_b(x) = \prod_{j=1}^{d} [x_j - E_{1j*} / \lambda, x_j + E_{2j*} / \lambda]$. Then % \begin{align*} T_b(x)^\nu &= \int_{T_b(x)} (s-x)^\nu \diff s = \prod_{j=1}^d \int_{x_j - E_{1j*}/\lambda}^{x_j+E_{2j*}/\lambda} (s - x_j)^{\nu_j} \diff s = \prod_{j=1}^d \int_{-E_{1j*}}^{E_{2j*}} (s / \lambda)^{\nu_j} 1/\lambda \diff s \\ &= \lambda^{-d - |\nu|} \prod_{j=1}^d \int_{-E_{1j*}}^{E_{2j*}} s^{\nu_j} \diff s = \lambda^{-d - |\nu|} \prod_{j=1}^d \frac{E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}} {\nu_j + 1}. \end{align*} % So by independence over $j$, % \begin{align} \label{eq:mondrian_app_bias_calc} &\E \left[ \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} \right] \\ \nonumber &\quad= \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \prod_{j=1}^d \E \left[ \frac{E_{2j*}^{\nu'_j + \nu''_j + 1} + (-1)^{\nu'_j + \nu''_j} E_{1j*}^{\nu'_j + \nu''_j + 1}} {(\nu'_j + \nu''_j + 1) (E_{2j*} + E_{1j*})} \frac{\left(E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}\right)^u} {(\nu_j + 1)^u (E_{2j*} + E_{1j*})^{|u|}} \right]. \end{align} % The final step is to replace $E_{1j*}$ by $E_{1j} \sim \Exp(1)$ and similarly for $E_{2j*}$. For some $C > 0$, % \begin{align*} \P \! 
\left( \bigcup_{j=1}^{d} \left( \left\{ E_{1j*} \neq E_{1j} \right\} \cup \left\{ E_{2j*} \neq E_{2j} \right\} \right) \! \right)
&\leq 2d\, \P \! \left( \Exp(1) \geq \lambda \min_{1 \leq j \leq d} (x_j \wedge (1-x_j)) \! \right)
\leq 2d e^{-C \lambda}.
\end{align*}
%
Further, the quantity inside the expectation in \eqref{eq:mondrian_app_bias_calc} is bounded almost surely by one and so the error incurred by replacing $E_{1j*}$ and $E_{2j*}$ by $E_{1j}$ and $E_{2j}$ in \eqref{eq:mondrian_app_bias_calc} is at most $2 d e^{-C \lambda} \lesssim \lambda^{-\beta}$. Thus the limiting bias is
%
\begin{align}
\nonumber
&\E \left[ \hat \mu(x) \right] - \mu(x) \\
\nonumber
&\quad= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \sum_{|u|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u \binom{|u|}{u} \frac{(-1)^{|u|}}{f(x)^{|u|+1}} \, \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \\
\nonumber
&\qquad\quad\times \prod_{j=1}^d \E \left[ \frac{E_{2j}^{\nu'_j + \nu''_j + 1} + (-1)^{\nu'_j + \nu''_j} E_{1j}^{\nu'_j + \nu''_j + 1}} {(\nu'_j + \nu''_j + 1) (E_{2j} + E_{1j})} \frac{\left(E_{2j}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j}^{\nu_j + 1}\right)^u} {(\nu_j + 1)^u (E_{2j} + E_{1j})^{|u|}} \right] \\
\label{eq:mondrian_app_bias}
&\qquad+ O \left( \frac{\log n}{\lambda} \frac{\lambda^d}{n} \right) + O \left( \frac{1}{\lambda^\beta} \right),
\end{align}
%
recalling that $u$ is a multi-index which is indexed by the multi-index $\nu$. This is a polynomial in $1/\lambda$ of degree at most $\flbeta$, with finite coefficients depending only on the derivatives up to order $\flbeta$ of $f$ and $\mu$ at $x$, since higher-order terms can be absorbed into $O(1 / \lambda^\beta)$. Now we show that the odd-degree terms in this polynomial are all zero. Note that a term is of odd degree if and only if $|\nu'| + |\nu''| + |\nu| \cdot u$ is odd. This implies that there exists $1 \leq j \leq d$ such that exactly one of $\nu'_j + \nu''_j$ and $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd. If $\nu'_j + \nu''_j$ is odd, then $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is even, so $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is even. Consider the effect of swapping $E_{1j}$ and $E_{2j}$, an operation which preserves their joint law, in each of
%
\begin{align}
\label{eq:mondrian_app_bias_odd_1}
\frac{E_{2j}^{\nu'_j + \nu''_j + 1} - (-E_{1j})^{\nu'_j + \nu''_j + 1}} {E_{2j} + E_{1j}}
\end{align}
%
and
%
\begin{align}
\label{eq:mondrian_app_bias_odd_2}
&\frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^u} {(E_{2j} + E_{1j})^{|u|}}
= \prod_{\substack{|\nu| = 1 \\ \nu_j u_\nu \text{ even}}}^\flbeta \frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} {(E_{2j} + E_{1j})^{u_\nu}}
\prod_{\substack{|\nu| = 1 \\ \nu_j u_\nu \text{ odd}}}^\flbeta \frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} {(E_{2j} + E_{1j})^{u_\nu}}.
\end{align}
%
Clearly, $\nu'_j + \nu''_j$ being odd inverts the sign of \eqref{eq:mondrian_app_bias_odd_1}. For \eqref{eq:mondrian_app_bias_odd_2}, each term in the first product has either $\nu_j$ even or $u_\nu$ even, so its sign is preserved. Every term in the second product of \eqref{eq:mondrian_app_bias_odd_2} has its sign inverted due to both $\nu_j$ and $u_\nu$ being odd, but there are an even number of terms, preserving the overall sign.
Therefore the expected product of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} is zero by symmetry. If however $\nu'_j + \nu''_j$ is even, then $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd, so $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is odd. Clearly, the sign of \eqref{eq:mondrian_app_bias_odd_1} is preserved. Again the sign of the first product in \eqref{eq:mondrian_app_bias_odd_2} is preserved, and the sign of every term in the second product of \eqref{eq:mondrian_app_bias_odd_2} is inverted. However there are now an odd number of terms in that product, so the overall sign of \eqref{eq:mondrian_app_bias_odd_2} is inverted. Therefore the expected product of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} is again zero.

\proofparagraph{calculating the second-order bias}

Next we calculate some special cases, beginning with the form of the leading second-order bias, where the exponent in $\lambda$ is $|\nu'| + |\nu''| + |\nu| \cdot u = 2$, proceeding by cases on the values of $|\nu'|$, $|\nu''|$, and $|u|$. Firstly, if $|\nu'| = 2$ then $|\nu''| = |u| = 0$. Note that if any $\nu'_j = 1$ then the expectation in \eqref{eq:mondrian_app_bias} is zero. Hence we can assume $\nu'_j \in \{0, 2\}$, yielding
%
\begin{align*}
\frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} \frac{1}{3} \E \! \left[ \frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} \right]
&\!= \frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} \frac{1}{3} \E \! \left[ E_{1j}^{2} + E_{2j}^{2} - E_{1j} E_{2j} \right]
= \frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2},
\end{align*}
%
where we used that $E_{1j}$ and $E_{2j}$ are independent $\Exp(1)$. Next we consider $|\nu'| = 1$ and $|\nu''| = 1$, so $|u| = 0$. Note that if $\nu'_j = \nu''_{j'} = 1$ with $j \neq j'$ then the expectation in \eqref{eq:mondrian_app_bias} is zero. So we need only consider $\nu'_j = \nu''_j = 1$, giving
%
\begin{align*}
\frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \frac{1}{3} \E \left[ \frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} \right]
&= \frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}.
\end{align*}
%
Finally, we have the case where $|\nu'| = 1$, $|\nu''| = 0$ and $|u|=1$. Then $u_\nu = 1$ for some $|\nu| = 1$ and zero otherwise. Note that if $\nu'_j = \nu_{j'} = 1$ with $j \neq j'$ then the expectation is zero. So we need only consider $\nu'_j = \nu_j = 1$, giving
%
\begin{align*}
&- \frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \frac{1}{4} \E \left[ \frac{(E_{2j}^2 - E_{1j}^2)^2} {(E_{2j} + E_{1j})^2} \right] \\
&\quad= - \frac{1}{4 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \E \left[ E_{1j}^2 + E_{2j}^2 - 2 E_{1j} E_{2j} \right]
= - \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}.
\end{align*}
%
Hence the second-order bias term is
%
\begin{align*}
\frac{1}{2 \lambda^2} \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} + \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}.
\end{align*}

\proofparagraph{calculating the bias if the data is uniformly distributed}

If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$ and the bias expansion from \eqref{eq:mondrian_app_bias} becomes
%
\begin{align*}
\sum_{|\nu'|=1}^{\flbeta} \lambda^{- |\nu'|} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \prod_{j=1}^d \E \left[ \frac{E_{2j}^{\nu'_j + 1} + (-1)^{\nu'_j} E_{1j}^{\nu'_j + 1}} {(\nu'_j + 1) (E_{2j} + E_{1j})} \right].
\end{align*}
%
This is zero if any $\nu_j'$ is odd, so we group these terms based on the exponent of $\lambda$ to see
%
\begin{align*}
\frac{B_r(x)}{\lambda^{2r}}
&= \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} \prod_{j=1}^d \frac{1}{2\nu_j + 1} \E \left[ \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} {E_{2j} + E_{1j}} \right].
\end{align*}
%
Since $\int_0^\infty \frac{e^{-t}}{s+t} \diff t = e^s \Gamma(0,s)$ and $\int_0^\infty s^a \Gamma(0, s) \diff s = \frac{a!}{a+1}$, with $\Gamma(0, s) = \int_s^\infty \frac{e^{-t}}{t} \diff t$ the upper incomplete gamma function, the expectation is easily calculated as
%
\begin{align*}
\E \left[ \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} {E_{2j} + E_{1j}} \right]
&= 2 \int_{0}^{\infty} s^{2\nu_j + 1} e^{-s} \int_{0}^{\infty} \frac{e^{-t}} {s + t} \diff t \diff s \\
&= 2 \int_{0}^{\infty} s^{2\nu_j + 1} \Gamma(0, s) \diff s
= \frac{(2 \nu_j + 1)!}{\nu_j + 1},
\end{align*}
%
so finally
%
\begin{align*}
\frac{B_r(x)}{\lambda^{2r}}
&= \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} \prod_{j=1}^d \frac{1}{2\nu_j + 1} \frac{(2 \nu_j + 1)!}{\nu_j + 1}
= \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) \prod_{j=1}^d \frac{1}{\nu_j + 1}.
\end{align*}
%
In particular, $B_1(x) = \frac{1}{2} \sum_{j=1}^{d} \frac{\partial^2 \mu(x)}{\partial x_j^2}$, agreeing with the second-order bias computed above once the terms involving derivatives of $f$ are set to zero.
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation}]
This follows from the debiased version in Theorem~\ref{thm:mondrian_variance_estimation_debiased} with $J=0$, $a_0 = 1$, and $\omega_0 = 1$.
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_confidence}]
%
By Theorem~\ref{thm:mondrian_bias} and Theorem~\ref{thm:mondrian_variance_estimation},
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \mu(x)}{\hat \Sigma(x)^{1/2}}
&= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} {\hat \Sigma(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \frac{\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x)} {\hat \Sigma(x)^{1/2}} \\
&= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} {\hat \Sigma(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \, O_\P \left( \frac{1}{\lambda^{\beta \wedge 2}} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right).
\end{align*}
%
The first term now converges weakly to $\cN(0,1)$ by Slutsky's theorem, Theorem~\ref{thm:mondrian_clt}, and Theorem~\ref{thm:mondrian_variance_estimation}, while the second term is $o_\P(1)$ by assumption. Validity of the confidence interval follows immediately.
%
\end{proof}
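For instance, at nominal level $1 - \alpha$, writing $q_{1-\alpha/2}$ for the corresponding standard normal quantile, the resulting confidence interval takes the form (a sketch; see the main text for the precise construction)
%
\begin{align*}
\left[ \hat\mu(x) - q_{1-\alpha/2} \sqrt{\frac{\lambda^d}{n}\, \hat \Sigma(x)},\
\hat\mu(x) + q_{1-\alpha/2} \sqrt{\frac{\lambda^d}{n}\, \hat \Sigma(x)} \right],
\end{align*}
%
whose coverage probability converges to $1 - \alpha$.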
\subsection{Debiased Mondrian random forests}

We give rigorous proofs of the central limit theorem, bias characterization, variance estimation, confidence interval validity, and minimax optimality results for the debiased Mondrian random forest estimator.

\begin{proof}[Theorem~\ref{thm:mondrian_clt_debiased}]
We use the martingale central limit theorem given by \citet[Theorem~3.2]{hall1980martingale}. For each $1 \leq i \leq n$ define $\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and $(\varepsilon_j : 1 \leq j \leq i)$, noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases weakly as $n$ increases. Let $\I_{i b r}(x) = \I\{X_i \in T_{b r}(x)\}$ where $T_{b r}(x)$ is the cell containing $x$ in tree $b$ used to construct $\hat \mu_r(x)$, and similarly let $N_{b r}(x) = \sum_{i=1}^n \I_{i b r}(x)$ and $\I_{b r}(x) = \I\{N_{b r}(x) \geq 1\}$. Define the $\cH_{n i}$-measurable and square integrable variables
%
\begin{align*}
S_i(x) &= \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)},
\end{align*}
%
which satisfy the martingale difference property $\E [ S_i(x) \mid \cH_{n (i-1)} ] = 0$. Further,
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \big( \hat\mu_\rd(x) - \E\left[ \hat\mu_\rd(x) \mid \bX, \bT \right] \big) = \sum_{i=1}^n S_i(x).
\end{align*}
%
By \citet[Theorem~3.2]{hall1980martingale} it suffices to check that
%
\begin{inlineroman}
\item $\max_i |S_i(x)| \to 0$ in probability,%
\label{it:mondrian_app_hall_prob}
\item $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and%
\label{it:mondrian_app_hall_exp}
\item $\sum_i S_i(x)^2 \to \Sigma_\rd(x)$ in probability.
\label{it:mondrian_app_hall_var}
\end{inlineroman}

\proofparagraph{checking condition \ref{it:mondrian_app_hall_prob}}
%
Since $J$ is fixed and $\E[|\varepsilon_i|^3 \mid X_i]$ is bounded, by Jensen's inequality and Lemma~\ref{lem:mondrian_app_simple_moment_denominator},
%
\begin{align*}
\E\left[\max_{1 \leq i \leq n} |S_i(x)| \right]
&= \E\left[\max_{1 \leq i \leq n} \left| \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right| \right] \\
&\leq \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[\max_{1 \leq i \leq n} \left| \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right| \right] \\
&\leq \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[ \sum_{i=1}^{n} \left( \sum_{b=1}^B \frac{\I_{i b r}(x) |\varepsilon_i|} {N_{b r}(x)} \right)^3 \right]^{1/3} \\
&= \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[ \sum_{i=1}^{n} |\varepsilon_i|^3 \sum_{b=1}^B \sum_{b'=1}^B \sum_{b''=1}^B \frac{\I_{i b r}(x) } {N_{b r}(x)} \frac{\I_{i b' r}(x) } {N_{b' r}(x)} \frac{\I_{i b'' r}(x) } {N_{b'' r}(x)} \right]^{1/3} \\
&\lesssim \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B^{2/3}} \E\left[ \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{b r}(x)} {N_{b r}(x)} \frac{\I_{b' r}(x)} {N_{b' r}(x)} \right]^{1/3} \\
&\lesssim \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B^{2/3}} \left( B^2 \frac{a_r^{2d} \lambda^{2d}}{n^2} + B \frac{a_r^{2d} \lambda^{2d} \log n}{n^2} \right)^{1/3} \\
&\lesssim \left( \frac{\lambda^d}{n} \right)^{1/6} + \left( \frac{\lambda^d}{n} \right)^{1/6} \left( \frac{\log n}{B} \right)^{1/3} \to 0.
\end{align*} \proofparagraph{checking condition \ref{it:mondrian_app_hall_exp}} % Since $\E[\varepsilon_i^2 \mid X_i]$ is bounded and by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, % \begin{align*} \E\left[\max_{1 \leq i \leq n} S_i(x)^2 \right] &= \E\left[ \max_{1 \leq i \leq n} \left( \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right)^2 \right] \\ &\leq \frac{n}{\lambda^d} \frac{1}{B^2} (J+1)^2 \max_{0 \leq r \leq J} \omega_r^2 \,\E\left[ \sum_{i=1}^{n} \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{i b r}(x) \I_{i b' r}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r}(x)} \right] \\ &\lesssim \frac{n}{\lambda^d} \max_{0 \leq r \leq J} \E\left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \right] \lesssim \frac{n}{\lambda^d} \max_{0 \leq r \leq J} \frac{a_r^d \lambda^d}{n} \lesssim 1. \end{align*} \proofparagraph{checking condition \ref{it:mondrian_app_hall_var}} Next, we have % \begin{align} \label{eq:mondrian_app_clt_condition_sum} \sum_{i=1}^n S_i(x)^2 &= \sum_{i=1}^n \left( \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right)^2 \\ &= \nonumber \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \\ \nonumber &= \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \left( \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b r'}(x)} + \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right). \end{align} % By boundedness of $\E[\varepsilon_i^2 \mid X_i]$ and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, the first term in \eqref{eq:mondrian_app_clt_condition_sum} vanishes as % \begin{align*} \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \E \left[ \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b r'}(x)} \right] &\lesssim \frac{n}{\lambda^d} \frac{1}{B^2} \max_{0 \leq r \leq J} \sum_{b=1}^B \E \left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \right] \lesssim \frac{1}{B} \to 0. 
\end{align*} % For the second term in \eqref{eq:mondrian_app_clt_condition_sum}, the law of total variance gives % \begin{align} \nonumber &\Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\ \nonumber &\quad\leq (J+1)^4 \max_{0 \leq r, r' \leq J} \omega_r \omega_{r'} \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\ \nonumber &\quad\lesssim \max_{0 \leq r, r' \leq J} \E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\ \label{eq:mondrian_app_total_variance} &\qquad+ \max_{0 \leq r, r' \leq J} \Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \end{align} % For the first term in \eqref{eq:mondrian_app_total_variance}, % \begin{align*} &\E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\ &\quad= \frac{n^2}{\lambda^{2d}} \frac{1}{B^4} \sum_{i=1}^n \sum_{j=1}^n \sum_{b=1}^B \sum_{b' \neq b} \sum_{\tilde b=1}^B \sum_{\tilde b' \neq \tilde b} \E \Bigg[ \varepsilon_i^2 \varepsilon_j^2 \left( \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} - \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX \right] \right) \\ &\qquad\quad \times \left( \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } {N_{\tilde b r}(x) N_{ \tilde b' r'}(x)} - \E \left[ \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } {N_{\tilde b r}(x) N_{\tilde b' r'}(x)} \Bigm| \bX \right] \right) \Bigg]. \end{align*} % Since $T_{b r}$ is independent of $T_{b' r'}$ given $\bX, \bY$, the summands are zero whenever $\big|\{b, b', \tilde b, \tilde b'\}\big| = 4$. Since $\E[ \varepsilon_i^2 \mid X_i]$ is bounded and by the Cauchy--Schwarz inequality and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, % \begin{align*} &\E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\ &\quad\lesssim \frac{n^2}{\lambda^{2d}} \frac{1}{B^3} \sum_{b=1}^B \sum_{b' \neq b} \E \left[ \left( \sum_{i=1}^n \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} \right)^2 \right] \lesssim \frac{n^2}{\lambda^{2d}} \frac{1}{B} \E \left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \frac{\I_{b' r'}(x)}{N_{b' r'}(x)} \right] \lesssim \frac{1}{B} \to 0. \end{align*} % For the second term in \eqref{eq:mondrian_app_total_variance}, the random variable inside the variance is a nonlinear function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, so we apply the Efron--Stein inequality \citep{efron1981jackknife}. Let $(\tilde X_{i j}, \tilde Y_{i j}) = (X_i, Y_i)$ if $i \neq j$ and be an independent copy of $(X_j, Y_j)$, denoted $(\tilde X_j, \tilde Y_j)$, if $i = j$, and define $\tilde \varepsilon_{i j} = \tilde Y_{i j} - \mu(\tilde X_{i j})$. 
Write $\tilde \I_{i j b r}(x) = \I \big\{ \tilde X_{i j} \in T_{b r}(x) \big\}$ and $\tilde \I_{j b r}(x) = \I \big\{ \tilde X_{j} \in T_{b r}(x) \big\}$, and also $\tilde N_{j b r}(x) = \sum_{i=1}^{n} \tilde \I_{i j b r}(x)$. We use the leave-one-out notation $N_{-j b r}(x) = \sum_{i \neq j} \I_{i b r}(x)$ and also write $N_{-j b r \cap b' r'}(x) = \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x)$. Since $\E[ \varepsilon_i^4 \mid X_i]$ is bounded,
%
\begin{align*}
&\Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad\leq \Var \left[ \E \left[ \frac{n}{\lambda^d} \sum_{i=1}^n \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad\leq \frac{1}{2} \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \sum_{i=1}^n \left( \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} - \frac{\tilde \I_{i j b r}(x) \tilde \I_{i j b' r'}(x) \tilde \varepsilon_{i j}^2} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right) \right)^2 \right] \\
&\quad\leq \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \left| \frac{1} {N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right| \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2 \right)^2 \right] \\
&\qquad+ \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \frac{\I_{j b r}(x) \I_{j b' r'}(x) \varepsilon_j^2} {N_{b r}(x) N_{b' r'}(x)} - \frac{\tilde \I_{j b r}(x) \tilde \I_{j b' r'}(x) \tilde \varepsilon_j^2} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right)^2 \right] \\
&\quad\lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ N_{-j b r \cap b' r'}(x)^2 \left| \frac{1} {N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right|^2 + \frac{\I_{j b r}(x) \I_{j b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right].
\end{align*}
%
For the first term in the above display, note that
%
\begin{align*}
&\left| \frac{1}{N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right| \\
&\quad\leq \frac{1}{N_{b r}(x)} \left| \frac{1} {N_{b' r'}(x)} - \frac{1} {\tilde N_{j b' r'}(x)} \right| + \frac{1}{\tilde N_{j b' r'}(x)} \left| \frac{1} {N_{b r}(x)} - \frac{1} {\tilde N_{j b r}(x)} \right| \\
&\quad\leq \frac{1}{N_{-j b r}(x)} \frac{1} {N_{-j b' r'}(x)^2} + \frac{1}{N_{-j b' r'}(x)} \frac{1} {N_{-j b r}(x)^2}
\end{align*}
%
since $|N_{b r}(x) - \tilde N_{j b r}(x)| \leq 1$ and $|N_{b' r'}(x) - \tilde N_{j b' r'}(x)| \leq 1$. Further, these terms are non-zero only on the events $\{ X_j \in T_{b r}(x) \} \cup \{ \tilde X_j \in T_{b r}(x) \}$ and $\{ X_j \in T_{b' r'}(x) \} \cup \{ \tilde X_j \in T_{b' r'}(x) \}$ respectively, so
%
\begin{align*}
&\Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\, \lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \frac{\I_{j b' r'}(x) + \tilde \I_{j b' r'}(x)}{N_{-j b r}(x)^2} \frac{N_{-j b r \cap b' r'}(x)^2} {N_{-j b' r'}(x)^4} \right. \\
&\left.
\qquad+ \frac{\I_{j b r}(x) + \tilde \I_{j b r}(x)}{N_{-j b' r'}(x)^2} \frac{N_{-j b r \cap b' r'}(x)^2} {N_{-j b r}(x)^4} + \frac{\I_{j b r}(x) \I_{j b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right] \\
&\, \lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \frac{\I_{j b r}(x) \I_{b r}(x) \I_{b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right]
\lesssim \frac{n^2}{\lambda^{2d}} \E \left[ \frac{\I_{b r}(x) \I_{b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)^2} \right] \\
&\lesssim \frac{n^2}{\lambda^{2d}} \frac{\lambda^d}{n} \frac{\lambda^{2d} \log n}{n^2}
\lesssim \frac{\lambda^d \log n}{n} \to 0,
\end{align*}
%
where we used Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So $\sum_{i=1}^{n} S_i(x)^2 - n \,\E \left[ S_i(x)^2 \right] = O_\P \left( \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right) = o_\P(1)$.

\proofparagraph{calculating the limiting variance}
%
Thus by \citet[Theorem~3.2]{hall1980martingale} we conclude that
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \big( \hat\mu_\rd(x) - \E\left[ \hat\mu_\rd(x) \mid \bX, \bT \right] \big)
&\rightsquigarrow \cN\big(0, \Sigma_\rd(x)\big)
\end{align*}
%
as $n \to \infty$, assuming that the limit
%
\begin{align*}
\Sigma_\rd(x) &= \lim_{n \to \infty} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
\end{align*}
%
exists. Now we verify this and calculate the limit. Since $J$ is fixed, it suffices to find
%
\begin{align*}
\lim_{n \to \infty} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
\end{align*}
%
for each $0 \leq r, r' \leq J$. Firstly, note that
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
&= \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \sigma^2(X_i)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&= \frac{n^2}{\lambda^d} \sigma^2(x) \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&\quad+ \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \big(\sigma^2(X_i) - \sigma^2(x) \big)} {N_{b r}(x) N_{b' r'}(x)} \right].
\end{align*}
%
Since $\sigma^2$ is Lipschitz and $\P \left(\max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right) \leq 2d e^{-t/2}$ by Lemma~\ref{lem:mondrian_app_largest_cell},
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \big|\sigma^2(X_i) - \sigma^2(x) \big|} {N_{b r}(x) N_{b' r'}(x)} \right]
&\leq 2de^{-t/2} \frac{n^2}{\lambda^d} + \frac{n^2}{\lambda^d} \frac{t}{\lambda} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&\lesssim \frac{n^2}{\lambda^d} \frac{\log n}{\lambda} \frac{\lambda^d}{n^2}
\lesssim \frac{\log n}{\lambda},
\end{align*}
%
by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, where we set $t = 4 \log n$. Therefore
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
&= \sigma^2(x) \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] + O \left( \frac{\log n}{\lambda} \right).
\end{align*} % Next, by conditioning on $T_{b r}$, $T_{b' r'}$, $N_{-i b r}(x)$, and $N_{-i b' r'}(x)$, % \begin{align*} &\E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] = \E \left[ \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} f(\xi) \diff \xi} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \\ &\quad= f(x) \, \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] + \E \left[ \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} (f(\xi) - f(x)) \diff \xi} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \\ &\quad= f(x) \, \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] + O \left( \frac{\lambda^d}{n^2} \frac{(\log n)^{d+1}}{\lambda} \right) \end{align*} % arguing using Lemma~\ref{lem:mondrian_app_largest_cell}, the Lipschitz property of $f(x)$, and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So % \begin{align*} \frac{n^2}{\lambda^d} \E \! \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \! \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \! + O \! \left( \frac{(\log n)^{d+1}}{\lambda} \right). \end{align*} % Now we apply the binomial result in Lemma~\ref{lem:mondrian_app_binomial_expectation} to approximate the expectation. With $N_{-i b' r' \setminus b r}(x) = \sum_{j \neq i} \I\{X_j \in T_{b' r'}(x) \setminus T_{b r}(x)\}$, % \begin{align*} &\E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] = \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r}(x)+1} \right. \\ &\qquad\left. \times \, \E \left[ \frac{1} {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) \right] \right]. \end{align*} % Now conditional on $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, % \begin{align*} N_{-i b' r' \setminus b r}(x) &\sim \Bin\left( n - 1 - N_{-i b r}(x), \ \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} \right). \end{align*} % We bound these parameters above and below. Firstly, by Lemma~\ref{lem:mondrian_app_active_data} with $B=1$, % \begin{align*} \P \left( N_{-i b r}(x) > t^{d+1} \frac{n}{\lambda^d} \right) &\leq 4 d e^{- t / (4 \|f\|_\infty(1 + 1/a_r))} \leq e^{- t / C} \end{align*} % for some $C > 0$ and sufficiently large $t$. 
Next, if $f$ is $L$-Lipschitz in $\ell^2$, by Lemma~\ref{lem:mondrian_app_largest_cell}, % \begin{align*} &\P \left( \left| \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} - f(x) |T_{b' r'}(x) \setminus T_{b r}(x)| \right| > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{\lambda} \right) \\ &\quad\leq \P \left( \int_{T_{b' r'}(x) \setminus T_{b r}(x)} \left| f(\xi) - f(x) \right| \diff \xi > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2 \lambda} \right) \\ &\qquad+ \P \left( \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi \cdot \int_{T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\ &\quad\leq \P \left( L d\, |T_{b' r'}(x) \setminus T_{b r}(x)| \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\ &\qquad+ \P \left( \|f\|_\infty \,|T_{b' r'}(x) \setminus T_{b r}(x)| \frac{\|f\|_\infty |T_{b r}(x)|} {1 - \|f\|_\infty |T_{b r}(x)|} > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\ &\quad\leq \P \left( \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| > \frac{t}{2\lambda L d} \right) +\P \left( |T_{b r}(x)| > \frac{t}{4\lambda \|f\|_\infty^2} \right) \\ &\quad\leq 2 d e^{-t a_r /(4L d)} + 2 d e^{-t a_r / (8 \|f\|_\infty^2)} \leq e^{-t/C}, \end{align*} % for large $t$, increasing $C$ as necessary. Thus with probability at least $1 - e^{-t/C}$, increasing $C$, % \begin{align*} N_{-i b' r' \setminus b r}(x) &\leq \Bin\left( n, \, |T_{b' r'}(x) \setminus T_{b r}(x)| \left( f(x) + \frac{t}{\lambda} \right) \right) \\ N_{-i b' r' \setminus b r}(x) &\geq \Bin\left( n \left( 1 - \frac{t^{d+1}}{\lambda^d} - \frac{1}{n} \right), \, |T_{b' r'}(x) \setminus T_{b r}(x)| \left( f(x) - \frac{t}{\lambda} \right) \right). \end{align*} % So by Lemma~\ref{lem:mondrian_app_binomial_expectation} conditionally on $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, we have with probability at least $1 - e^{-t/C}$ that % \begin{align*} &\left| \E \left[ \frac{1} {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) \right] \right. \\ &\left. \qquad- \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right| \\ &\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {\left(N_{-i b' r' \cap b r}(x) + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2}. \end{align*} % Therefore, by the same approach as the proof of Lemma~\ref{lem:mondrian_app_moment_denominator}, taking $t = 3 C \log n$, % \begin{align*} & \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right.\right. \\ &\left.\left. 
\qquad - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1)} \right] \right| \\ &\quad\lesssim \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {\left(N_{-i b' r' \cap b r}(x) + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2} \right] + e^{-t/C} \\ &\quad\lesssim \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n |T_{b r}(x)|+1} \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {(n |T_{b' r'}(x)| + 1)^2} \right] + e^{-t/C} \\ &\quad\lesssim \E \left[ \frac{1}{n} \frac{1} {(n |T_{b' r'}(x)| + 1)^2} + \frac{1}{n} \frac{t / \lambda} {n |T_{b' r'}(x)| + 1} \right] + e^{-t/C} \\ &\quad\lesssim \frac{\lambda^{2d} \log n}{n^3} + \frac{\log n}{n \lambda} \frac{\lambda^d}{n} \lesssim \frac{\lambda^d}{n^2} \left( \frac{\lambda^{d} \log n}{n} + \frac{\log n}{\lambda} \right). \end{align*} % Now apply the same argument to the other term in the expectation, to see that % \begin{align*} &\left| \E \left[ \frac{1} {N_{-i b r \cap b' r'}(x)+N_{-i b r \setminus b' r'}(x)+1} \Bigm| \bT, N_{-i b r \cap b' r'}(x), N_{-i b' r' \setminus b r}(x) \right] \right. \\ &\left. \qquad- \frac{1} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right| \\ &\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} {\left(N_{-i b r \cap b' r'}(x) + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2}. \end{align*} % with probability at least $1 - e^{-t/C}$, and so likewise again with $t = 3 C \log n$, % \begin{align*} &\frac{n^2}{\lambda^d} \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \right. \\ &\left. \quad- \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right.\right. \\ &\qquad\qquad\left.\left. \times \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \right| \\ &\lesssim \frac{n^2}{\lambda^d} \, \E \left[ \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} {\left(N_{-i b r \cap b' r'}(x) + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2} \right. \\ &\qquad\qquad\left. \times \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\ &\lesssim \frac{\lambda^d \log n}{n} + \frac{\log n}{\lambda}. \end{align*} % Thus far we have proven that % \begin{align*} &\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] = \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \\ &\quad\times \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right. \\ &\left. \qquad\qquad \times \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \\ &\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % We remove the $N_{-i b r \cap b' r'}(x)$ terms. 
With probability at least $1 - e^{-t/C}$, conditional on $\bT$, % \begin{align*} N_{-i b r \cap b' r'}(x) &\leq \Bin\left( n, \, |T_{b r}(x) \cap T_{b' r'}(x)| \left( f(x) + \frac{t}{\lambda} \right) \right), \\ N_{-i b r \cap b' r'}(x) &\geq \Bin\left( n \left( 1 - \frac{t^{d+1}}{\lambda^d} - \frac{1}{n} \right), \, |T_{b r}(x) \cap T_{b' r'}(x)| \left( f(x) - \frac{t}{\lambda} \right) \right). \end{align*} % Therefore, by Lemma~\ref{lem:mondrian_app_binomial_expectation} applied conditionally on $\bT$, with probability at least $1 - e^{-t/C}$, % \begin{align*} & \left| \E \! \left[ \frac{1} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} \! \Bigm| \! \bT \right] \right. \\ &\left. \qquad- \frac{1} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right| \\ &\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} \left( \frac{1}{n |T_{b r}(x)| + 1} + \frac{1}{n |T_{b' r'}(x)| + 1} \right). \end{align*} % Now by Lemma~\ref{lem:mondrian_app_moment_cell}, with $t = 3 C \log n$, % \begin{align*} &\frac{n^2}{\lambda^d} \left| \E \! \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} \right] \right. \\ &\left. \qquad- \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right] \right| \\ &\quad\lesssim \frac{n^2}{\lambda^d} \E \left[ |T_{b r}(x) \cap T_{b' r'}(x)| \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} \frac{1}{n |T_{b r}(x)| + 1} + \frac{1}{n |T_{b' r'}(x)| + 1} \right] \\ &\qquad+ \frac{n^2}{\lambda^d} e^{-t/C} \\ &\quad\lesssim \frac{n^2}{\lambda^d} \frac{1}{n^3} \E \left[ \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {|T_{b r}(x)| |T_{b' r'}(x)|} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\ &\quad\lesssim \frac{1}{n \lambda^d} \E \left[ \frac{1}{|T_{b r}(x)| |T_{b' r'}(x)|} \right] + \frac{t}{\lambda^{d+1}} \E \left[ \frac{1}{|T_{b r}(x)|} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\ &\quad\lesssim \frac{\lambda^d}{n} + \frac{\log n}{\lambda}. \end{align*} % This allows us to deduce that % \begin{align*} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(n f(x) |T_{b r}(x)|+1)(n f(x) |T_{b' r'}(x)|+1)} \right] \\ &\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % Now that we have reduced the limiting variance to an expression only involving the sizes of Mondrian cells, we can exploit their exact distribution to compute this expectation. 
Recall from \citet[Proposition~1]{mourtada2020minimax} that we can write % \begin{align*} |T_{b r}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r \lambda} \wedge x_j + \frac{E_{2j}}{a_r \lambda} \wedge (1 - x_j) \right), \\ |T_{b' r'}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{3j}}{a_{r'} \lambda} \wedge x_j + \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) \right), \\ |T_{b r }(x)\cap T_{b' r'}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r \lambda} \wedge \frac{E_{3j}}{a_{r'} \lambda} \wedge x_j + \frac{E_{2j}}{a_r \lambda} \wedge \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) \right) \end{align*} % where $E_{1j}$, $E_{2j}$, $E_{3j}$, and $E_{4j}$ are independent and $\Exp(1)$. Define their non-truncated versions % \begin{align*} |\tilde T_{b r}(x)| &= a_r^{-d} \lambda^{-d} \prod_{j=1}^{d} \left( E_{1j} + E_{2j} \right), \\ |\tilde T_{b' r'}(x)| &= a_{r'}^{-d} \lambda^{-d} \prod_{j=1}^{d} \left( E_{3j} + E_{4j} \right), \\ |\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)| &= \lambda^{-d} \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r} \wedge \frac{E_{3j}}{a_{r'}} + \frac{E_{2j}}{a_r} \wedge \frac{E_{4j}}{a_{r'}} \right), \end{align*} % and note that % \begin{align*} &\P \left( \big( \tilde T_{b r}(x), \tilde T_{b' r'}(x), \tilde T_{b r}(x) \cap T_{b' r'}(x) \big) \neq \big( T_{b r}(x), T_{b' r'}(x), T_{b r}(x) \cap T_{b' r'}(x) \big) \right) \\ &\,\leq \sum_{j=1}^{d} \big( \P(E_{1j} \geq a_r \lambda x_j) + \P(E_{3j} \geq a_{r'} \lambda x_j) + \P(E_{2j} \geq a_r \lambda (1 - x_j)) + \P(E_{4j} \geq a_{r'} \lambda (1 - x_j)) \big) \\ &\,\leq e^{-C \lambda} \end{align*} % for some $C > 0$ and sufficiently large $\lambda$. So by Cauchy--Schwarz and Lemma~\ref{lem:mondrian_app_moment_cell}, % \begin{align*} & \frac{n^2}{\lambda^d} \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right] - \E \left[ \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} {n f(x) |\tilde T_{b r}(x)|+1} \frac{1} {n f(x) |\tilde T_{b' r'}(x)|+1} \right] \right| \\ &\quad\lesssim \frac{n^2}{\lambda^d} e^{-C \lambda} \lesssim e^{-C \lambda / 2} \end{align*} % as $\log \lambda \gtrsim \log n$. Therefore % \begin{align*} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} \right] \\ &\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % We remove the superfluous units in the denominators. Firstly, by independence of the trees, % \begin{align*} & \frac{n^2}{\lambda^d} \left| \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} \right] - \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} \right] \right| \\ &\quad\lesssim \frac{n^2}{\lambda^d} \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {n |\tilde T_{b r}(x)|} \frac{1} {n^2 |\tilde T_{b' r'}(x)|^2} \right] \lesssim \frac{1}{n \lambda^d} \E \left[ \frac{1}{|T_{b r}(x)|} \right] \E \left[ \frac{1}{|T_{b' r'}(x)|} \right] \lesssim \frac{\lambda^d}{n}. 
\end{align*} % Secondly, we have in exactly the same manner that % \begin{align*} \frac{n^2}{\lambda^d} \left| \E \left[ \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} \right] - \E \left[ \frac{|\tilde T_{b r}(x) \cap T_{b' r'}(x)|} {n^2 f(x)^2 |\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} \right] \right| &\lesssim \frac{\lambda^d}{n}. \end{align*} % Therefore % \begin{align*} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \frac{\sigma^2(x)}{f(x)} \frac{1}{\lambda^d} \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} \right] + O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % It remains to compute this integral. By independence over $1 \leq j \leq d$, % \begin{align*} &\E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|} \right] \\ &\quad= a_r^d a_{r'}^d \lambda^d \prod_{j=1}^d \E \left[ \frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'}) + (E_{2j} a_r) \wedge (E_{4j} / a_{r'}) } { \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right)} \right] \\ &\quad= 2^d a_r^d a_{r'}^d \lambda^d \prod_{j=1}^d \E \left[ \frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'})} { \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right) } \right] \\ &\quad= 2^d a_r^d a_{r'}^d \lambda^d \prod_{j=1}^d \int_{0}^{\infty} \int_{0}^{\infty} \int_{0}^{\infty} \int_{0}^{\infty} \frac{ (t_1 / a_r) \wedge (t_3 / a_{r'}) } { \left( t_1 + t_2 \right) \left( t_3 + t_4 \right) } e^{-t_1 - t_2 - t_3 - t_4} \diff t_1 \diff t_2 \diff t_3 \diff t_4 \\ &\quad= 2^d a_r^d a_{r'}^d \lambda^d \prod_{j=1}^d \int_{0}^{\infty} \int_{0}^{\infty} ((t_1 / a_r) \wedge (t_3 / a_{r'})) e^{-t_1 - t_3} \\ &\qquad\times \left( \int_{0}^{\infty} \frac{e^{-t_2}}{t_1 + t_2} \diff t_2 \right) \left( \int_{0}^{\infty} \frac{e^{-t_4}}{t_3 + t_4} \diff t_4 \right) \diff t_1 \diff t_3 \\ &\quad= 2^d a_r^d a_{r'}^d \lambda^d \prod_{j=1}^d \int_{0}^{\infty} \int_{0}^{\infty} ((t / a_r) \wedge (s / a_{r'})) \Gamma(0, t) \Gamma(0, s) \diff t \diff s, \end{align*} % as $\int_0^\infty \frac{e^{-t}}{a + t} \diff t = e^a \Gamma(0, a)$ with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$. Now % \begin{align*} &2 \int_{0}^{\infty} \int_{0}^{\infty} ((t / a_r) \wedge (s / a_{r'})) \Gamma(0, t) \Gamma(0, s) \diff t \diff s \\ &\quad= \int_0^\infty \Gamma(0, t) \left( \frac{1}{a_{r'}} \int_0^{a_{r'} t / a_r} 2 s \Gamma(0, s) \diff{s} + \frac{t}{a_r} \int_{a_{r'} t / a_r}^\infty 2 \Gamma(0, s) \diff{s} \right) \diff{t} \\ &\quad= \int_0^\infty \Gamma(0, t) \left( \frac{t}{a_r} e^{- \frac{a_{r'}}{a_r}t} - \frac{1}{a_{r'}} e^{- \frac{a_{r'}}{a_r}t} + \frac{1}{a_{r'}} - \frac{a_{r'}}{a_r^2} t^2 \Gamma\left(0, \frac{a_{r'}}{a_r} t\right) \right) \diff{t} \\ &\quad= \frac{1}{a_r} \int_0^\infty t e^{- \frac{a_{r'}}{a_r} t} \Gamma(0, t) \diff{t} - \frac{1}{a_{r'}} \int_0^\infty e^{- \frac{a_{r'}}{a_r} t} \Gamma(0, t) \diff{t} \\ &\qquad+ \frac{1}{a_{r'}} \int_0^\infty \Gamma(0, t) \diff{t} - \frac{a_{r'}}{a_r^2} \int_0^\infty t^2 \Gamma\left(0, \frac{a_{r'}}{a_r} t\right) \Gamma(0, t) \diff{t}, \end{align*} % since $\int_0^a 2 t \Gamma(0, t) \diff t = a^2 \Gamma(0, a) - a e^{-a} -e^{-a} + 1$ and $\int_a^\infty \Gamma(0, t) \diff t = e^{-a} - a \Gamma(0, a)$. 
Next, we use % $ \int_{0}^{\infty} \Gamma(0, t) \diff t = 1$, $\int_{0}^{\infty} e^{-at} \Gamma(0, t) \diff t = \frac{\log(1+a)}{a}$, $\int_{0}^{\infty} t e^{-at} \Gamma(0, t) \diff t = \frac{\log(1+a)}{a^2} - \frac{1}{a(a+1)}$, and $\int_{0}^{\infty} t^2 \Gamma(0, t) \Gamma(0, at) \diff t = - \frac{2a^2 + a + 2}{3a^2 (a+1)} + \frac{2(a^3 + 1) \log(a+1)}{3a^3} - \frac{2 \log a}{3}$ to see % \begin{align*} &2 \int_{0}^{\infty} \int_{0}^{\infty} ((t / a_r) \wedge (s / a_{r'})) \Gamma(0, t) \Gamma(0, s) \diff t \diff s \\ &\quad= \frac{a_r \log(1+a_{r'} / a_r)}{a_{r'}^2} - \frac{a_r / a_{r'}}{a_r + a_{r'}} - \frac{a_r \log(1 + a_{r'} / a_r)}{a_{r'}^2} + \frac{1}{a_{r'}} \\ &\qquad+ \frac{2 a_{r'}^2 + a_r a_{r'} + 2 a_r^2} {3 a_r a_{r'} (a_r + a_{r'})} - \frac{2(a_{r'}^3 + a_r^3) \log(a_{r'} / a_r+1)}{3 a_r^2 a_{r'}^2} + \frac{2 a_{r'} \log (a_{r'} / a_r)}{3 a_r^2} \\ &\quad= \frac{2}{3 a_r} + \frac{2}{3 a_{r'}} - \frac{2(a_r^3 + a_{r'}^3 ) \log(a_{r'} / a_{r}+1)} {3 a_r^2 a_{r'}^2} + \frac{2 a_{r'} \log (a_{r'} / a_{r})}{3 a_r^2} \\ &\quad= \frac{2}{3 a_r} + \frac{2}{3 a_{r'}} - \frac{2 a_{r'} \log(a_{r} / a_{r'} + 1)}{3 a_r^2} - \frac{2 a_r \log(a_{r'} / a_{r} + 1)}{3 a_{r'}^2} \\ &\quad= \frac{2}{3 a_r} \left( 1 - \frac{a_{r'}}{a_r} \log\left(\frac{a_{r}}{a_{r'}} + 1\right) \right) + \frac{2}{3 a_{r'}} \left( 1 - \frac{a_r }{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right). \end{align*} % Finally, we conclude by giving the limiting variance. % \begin{align*} &\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\ &\quad= \frac{\sigma^2(x)}{f(x)} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \frac{2 a_{r'}}{3} \left( 1 - \frac{a_{r'}}{a_r} \log\left(\frac{a_r}{a_{r'}} + 1\right) \right) + \frac{2 a_r}{3} \left( 1 - \frac{a_r}{a_{r'}} \log\left(\frac{a_{r'}}{a_r} + 1\right) \right) \right)^d \\ &\qquad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % So the limit exists, and with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, the limiting variance is % \begin{align*} \Sigma_\rd(x) &= \frac{\sigma^2(x)}{f(x)} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d. \end{align*} % \end{proof} The new bias characterization with debiasing is an algebraic consequence of the original bias characterization and the construction of the debiased Mondrian random forest estimator. \begin{proof}[Theorem~\ref{thm:mondrian_bias_debiased}] By the definition of the debiased estimator and Theorem~\ref{thm:mondrian_bias}, since $J$ and $a_r$ are fixed, % \begin{align*} \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] &= \sum_{l=0}^J \omega_l \E \big[ \hat \mu_l(x) \Bigm| \bX, \bT \big] \\ &= \sum_{l=0}^J \omega_l \left( \mu(x) + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{a_l^{2r} \lambda^{2r}} \right) + O_\P \left( \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} % It remains to evaluate the first term. 
Recalling that $A_{r s} = a_{r-1}^{2 - 2s}$ and $A \omega = e_0$, we have % \begin{align*} &\sum_{l=0}^J \omega_l \left( \mu(x) + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{a_l^{2r} \lambda^{2r}} \right) \\ &\quad= \mu(x) \sum_{l=0}^J \omega_l + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{\lambda^{2r}} \sum_{l=0}^J \frac{\omega_l}{a_l^{2r}} \\ &\quad= \mu(x) (A \omega)_1 + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor \wedge J} \frac{B_r(x)}{\lambda^{2r}} (A \omega)_{r+1} + \sum_{r = (\lfloor \flbeta / 2 \rfloor \wedge J) + 1} ^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{\lambda^{2r}} \sum_{l=0}^J \frac{\omega_l}{a_l^{2r}} \\ &\quad= \mu(x) + \I\{\lfloor \flbeta / 2 \rfloor \geq J + 1\} \frac{B_{J+1}(x)}{\lambda^{2J + 2}} \sum_{l=0}^J \frac{\omega_l}{a_l^{2J + 2}} + O \left( \frac{1}{\lambda^{2J + 4}} \right) \\ &\quad= \mu(x) + \I\{2J + 2 < \beta\} \frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} + O \left( \frac{1}{\lambda^{2J + 4}} \right). \end{align*} % \end{proof} \begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation_debiased}] \proofparagraph{consistency of $\hat\sigma^2(x)$} Recall that % \begin{align} \label{eq:mondrian_app_sigma2_hat_proof} \hat\sigma^2(x) &= \frac{1}{B} \sum_{b=1}^{B} \frac{\sum_{i=1}^n Y_i^2 \, \I\{X_i \in T_b(x)\}} {\sum_{i=1}^n \I\{X_i \in T_b(x)\}} - \hat \mu(x)^2. \end{align} % The first term in \eqref{eq:mondrian_app_sigma2_hat_proof} is simply a Mondrian forest estimator of $\E[Y_i^2 \mid X_i = x] = \sigma^2(x) + \mu(x)^2$, which is bounded and Lipschitz, where $\E[Y_i^4 \mid X_i]$ is bounded almost surely. So its conditional bias is controlled by Theorem~\ref{thm:mondrian_bias} and is at most $O_\P \left( \frac{1}{\lambda} + \frac{\log n}{\lambda} \sqrt{\lambda^d / n} \right)$. Its variance is at most $\frac{\lambda^d}{n}$ by Theorem~\ref{thm:mondrian_clt_debiased}. Consistency of the second term in \eqref{eq:mondrian_app_sigma2_hat_proof} follows directly from Theorems~\ref{thm:mondrian_bias} and \ref{thm:mondrian_clt_debiased} with the same bias and variance bounds. Therefore % \begin{align*} \hat\sigma^2(x) &= \sigma^2(x) + O_\P \left( \frac{1}{\lambda} + \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} \proofparagraph{consistency of the sum} % Note that % \begin{align*} &\frac{n}{\lambda^d} \sum_{i=1}^n \left( \sum_{r=0}^J \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_{r b}(x)\}} {\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} \right)^2 \\ &\quad= \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^J \sum_{r'=0}^J \omega_r \omega_{r'} \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)}. \end{align*} % This is exactly the same as the quantity in \eqref{eq:mondrian_app_clt_condition_sum}, if we were to take $\varepsilon_i$ to be $\pm 1$ with equal probability. Thus we immediately have convergence in probability by the proof of Theorem~\ref{thm:mondrian_clt_debiased}: % \begin{align*} \frac{n}{\lambda^d} \sum_{i=1}^n \left( \sum_{r=0}^J \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_{r b}(x)\}} {\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}} \right)^2 &= \frac{n^2}{\lambda^d} \sum_{r=0}^J \sum_{r'=0}^J \omega_r \omega_{r'} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] \\ &\quad+ O_\P \left( \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right). 
\end{align*} \proofparagraph{conclusion} By the proof of Theorem~\ref{thm:mondrian_clt_debiased} with $\varepsilon_i$ being $\pm 1$ with equal probability, and by previous parts, % \begin{align*} \hat\Sigma_\rd(x) = \Sigma_\rd(x) + O_\P \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right). \end{align*} \end{proof} \begin{proof}[Theorem~\ref{thm:mondrian_confidence_debiased}] % By Theorem~\ref{thm:mondrian_bias_debiased} and Theorem~\ref{thm:mondrian_variance_estimation_debiased}, % \begin{align*} \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu_\rd(x) - \mu(x)}{\hat \Sigma_\rd(x)^{1/2}} &= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} {\hat \Sigma_\rd(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \frac{\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \mu(x)} {\hat \Sigma_\rd(x)^{1/2}} \\ &= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]} {\hat \Sigma_\rd(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \, O_\P \left( \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} % The first term converges weakly to $\cN(0,1)$ by Slutsky's theorem and Theorems~\ref{thm:mondrian_clt_debiased} and \ref{thm:mondrian_variance_estimation_debiased}, while the second is $o_\P(1)$ by assumption. Validity of the confidence interval follows. % \end{proof} \begin{proof}[Theorem~\ref{thm:mondrian_minimax}] Theorem~\ref{thm:mondrian_bias_debiased} and the proof of Theorem~\ref{thm:mondrian_clt_debiased} with $J = \lfloor \flbeta / 2 \rfloor$ gives % \begin{align*} \E \left[ \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \right] &= \E \left[ \big( \hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] \big)^2 \right] + \E \left[ \big( \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \mu(x) \big)^2 \right] \\ &\lesssim \frac{\lambda^d}{n} + \frac{1}{\lambda^{2\beta}} + \frac{1}{\lambda^2 B}. \end{align*} % We use here an $L^2$ version of Theorem~\ref{thm:mondrian_bias_debiased} which is immediate from the proof of Theorem~\ref{thm:mondrian_bias}, since we leveraged Chebyshev's inequality. Now since $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$ and $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$, % \begin{align*} \E \left[ \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \right] &\lesssim n^{-\frac{2\beta}{d + 2 \beta}}. \end{align*} \end{proof} \section{Further properties of the Mondrian process} In section, we state and prove a collection of lemmas concerning various properties of the Mondrian process. While they are not used directly in our analysis of Mondrian random forest estimators, we believe that these results, along with the techniques displayed during their proofs, may be of potential independent interest. Our analysis of Mondrian random forest estimators in the main text is for the most part conducted pointwise, in the sense that we first fix $x \in [0,1]^d$ and then analyze $\hat\mu(x)$. This means that we interact with the Mondrian process only through $T(x)$; that is, the cell in $T$ which contains the point $x$. As such, we rely only on local properties of $T$, and may consider just a single Mondrian cell. The lemmas in this section take a more global approach to analyzing the Mondrian process, and we make statements about the entire process $T$, rather than individual cells $T(x)$. 
Such results may be useful for a future investigation of the uniform properties of Mondrian forest estimators, as well as being interesting in their own right. We begin with a tail bound for the number of cells appearing in a Mondrian tree, offering a multiplicative exponential inequality which complements the exact expectation result given in \citet[Proposition~2]{mourtada2020minimax}. The resulting bound in probability is the same up to logarithmic terms, and the sharp tail decay is useful in combination with union bounds in our upcoming results. \begin{lemma}[Tail bound for the number of cells in a Mondrian tree] \label{lem:mondrian_app_cells_tail} Let $D \subseteq \R^d$ be a rectangle and $T \sim \cM(D, \lambda)$. Writing $\# T$ for the number of cells in $T$, % \begin{align*} \P\left( \# T > 3 (1 + \lambda |D|_1)^d (t + 1 + d \log(1 + \lambda |D|_1)) \right) &\leq e^{-t}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_cells_tail}] We refer to this method as the ``subcell trick'' and attribute it to \citet{mourtada2017universal}. For $\varepsilon > 0$, partition $D$ into at most $(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$ with side lengths at most $(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$. Denote the restriction of a tree $T$ to a subcell $D'$ by $T \cap D'$. Since a split in $T$ induces a split in at least one $T \cap D'$, by a union bound % \begin{align*} \P\left(\# T > t \right) &\leq \P\left(\sum_{D' \in \cD_\varepsilon} \# (T \cap D') > t \right) \leq \sum_{D' \in \cD_\varepsilon} \P\left( \# (T \cap D') > \frac{t}{\# \cD_\varepsilon} \right). \end{align*} % Now $\# (T \cap D')$ is dominated by a Yule process with parameter $|D'|_1$ stopped at time $\lambda$ \citep[proof of Lemma~2]{mourtada2017universal}, so using that fact that if $X \sim \Yule(a)$ then $\P(X_t > n) \leq (1-e^{-at})^{n-1}$, % \begin{align*} \P\left(\# T > t \right) &\leq \# \cD_\varepsilon \, (1 - e^{-\lambda |D|_1 \varepsilon})^{t / \# \cD_\varepsilon - 1} \leq (1 + 1/\varepsilon)^d (1 - e^{-\lambda |D|_1 \varepsilon})^{t (1 + 1/\varepsilon)^{-d} - 1}. \end{align*} % Set $\varepsilon = \frac{1}{\lambda |D|_1}$, note $1-1/e \leq e^{-1/3}$ and replace $t$ by $3 (1 + \lambda |D|_1)^d (t + 1 + d \log(1 + \lambda |D|_1))$: % \begin{align*} &\P\left(\# T > t \right) \leq (1 + \lambda |D|_1)^d (1 - 1/e)^{t (1 + \lambda |D|_1)^{-d} - 1} \leq 2 (1 + \lambda |D|_1)^d e^{-t (1 + \lambda |D|_1)^{-d} / 3}, \\ &\P\left(\# T > 3 (1 + \lambda |D|_1)^d (t + 1 + d \log(1 + \lambda |D|_1)) \right) \leq e^{-t}. \end{align*} % \end{proof} Next we provide a rigorous justification to the observation that the cells in a Mondrian process should have the same shape distribution, though of course they are not independent. To state and prove this result, we need a way to identify a particular cell by endowing the cells in a Mondrian tree with a natural order. \begin{definition}[Canonical order of cells in a Mondrian tree] Let $T \sim \cM(D, \lambda)$. Each cell in a fixed realization of $T$ can be described by a word from the alphabet $\{l, r\}$, where $l$ indicates the cell to the left of a split and $r$ indicates the cell to the right. For example, if there are no splits we have one cell described by the empty word. After one split there are two cells, denoted $l$ and $r$. Now suppose that the cell $r$ splits again, giving two splits and three cells, denoted $l$, $r l$, and $r r$. Define the canonical ordering of the cells of $T$ by applying the lexicographic order to their words, with $l < r$. 
Note that it does not matter which coordinate each split occurs in: in two dimensions, $l$ might refer to the ``left'' or ``bottom'' and $r$ to the ``right'' or ``top'' cell. \end{definition} \begin{lemma}[Cells in a Mondrian tree have identically distributed shapes] \label{lem:mondrian_app_cells_identically_distributed} Let $T \sim \cM(D, \lambda)$ with ordered cells $D'_1, \ldots, D'_{\# T}$. For $\varepsilon_1, \ldots, \varepsilon_d \geq 0$ and $1 \leq i \leq k$, % \begin{align*} \P\left( |D'_{i1}| \leq \varepsilon_1, \ldots, |D'_{id}| \leq \varepsilon_d, \# T = k \right) &= \P\left( |D'_{11}| \leq \varepsilon_1, \ldots, |D'_{1d}| \leq \varepsilon_d, \# T = k \right). \end{align*} % Marginalizing over $\# T$ with $E_j$ i.i.d.\ $\Exp(1)$, \citet[Proposition~1]{mourtada2020minimax} gives % \begin{align*} \P\left( |D'_{i1}| > \varepsilon_1, \ldots, |D'_{id}| > \varepsilon_d \right) &= \prod_{j=1}^d \P\left( \frac{E_j}{\lambda} \wedge |D_j| > \varepsilon_j \right) = \prod_{j=1}^d \I\{|D_j| > \varepsilon_j\} e^{-\lambda \varepsilon_j}. \end{align*} \end{lemma} We observe a version of the famous Poisson process inspection or waiting time paradox in the sizes of Mondrian cells. The above Lemma~\ref{lem:mondrian_app_cells_identically_distributed} shows that for a large enough lifetime $\lambda$, the volume of any cell $D$ has the same distribution as the volume of a corner cell, and is asymptotically $\E[|D|] \asymp \E \left[ \prod_{j=1}^{d} (E_j / \lambda) \right] = 1/\lambda^d$. This is consistent with \citet[Proposition~2]{mourtada2020minimax} who give $\E[\# T] \asymp \lambda^d$. However, if instead of selecting a cell directly, we instead select a fixed interior point $x$ and query the cell $T(x)$ which contains it, we find that $\E[|T(x)|] \asymp \E \left[ \prod_{j=1}^{d} ((E_{1j} + E_{2j}) / \lambda) \right] = 2^d/\lambda^d$, where $E_{1j}, E_{2j}$ are i.i.d.\ $\Exp(1)$, by \citet[Proposition~1]{mourtada2020minimax}. Since $T(x)$ contains $x$ by construction, a size-biasing phenomenon occurs and we see that $T(x)$ is on average larger than a typical Mondrian cell. \begin{proof}[Lemma~\ref{lem:mondrian_app_cells_identically_distributed}] Let $w$ be the word associated with the cell $D_i \in T$. Note that $i=1$ if and only if $r \notin w$, as then $D_i$ is the left child of every split. So suppose $r \in w$. Let $\tilde w$ be the word obtained by replacing all occurrences of $r$ in $w$ with an $l$. Each such replacement corresponds to a split in $T$. Let $\tilde T$ be the same process as $T$ but with the following modification: for each split where a replacement was made, change the uniform random variable $S$ (from the definition of $T$, see Section~\ref{sec:mondrian_process}) to $1-S$. Since $S$ is independent of everything else in the construction of $T$, we observe that $\tilde T \sim \cM(D, \lambda)$ also. Further, there is almost surely exactly one cell in $\tilde T$ which has the same shape as $D$, as the uniform distribution has no atoms. Denote this cell by $\tilde D$ and note that the replacements imply that its word in $\tilde T$ is $\tilde w$. Thus $\tilde D = \tilde D_1$ in $\tilde T$ and so $(|D_{i1}|, \ldots, |D_{i d}|, \# T) = (|\tilde D_{11}|, \ldots, |\tilde D_{1d}|, \# \tilde T)$. Equality of the distributions follows. \end{proof} As our next result we provide a tail bound for the size of the largest Mondrian cell. 
The cells within a Mondrian tree are of course not independent, and in fact there should intuitively be some negative correlation between their sizes, due to the fact that they must all fit within the original cell $D$. \begin{lemma}[Tail bound on largest Mondrian cell] \label{lem:mondrian_app_largest_cell_tail} Let $T \sim \cM(D, \lambda)$. For any $\varepsilon > 0$, % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq 5d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon}. \end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell_tail}] Let $D_i$ be the ordered cells of $T$ and take $k \geq 1$. By union bounds and Lemma~\ref{lem:mondrian_app_cells_identically_distributed}, % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq \sum_{l=1}^k \P\left( \max_{1 \leq i \leq l} \max_{1 \leq j \leq d} |D_{i j}| > \varepsilon, \# T = l \right) + \P\left( \# T > k \right) \\ &\leq \sum_{l=1}^k \sum_{i=1}^l \sum_{j=1}^d \P\big( |D_{i j}| > \varepsilon, \# T = l \big) + \P\left( \# T > k \right) \\ &\leq \sum_{l=1}^k l d \, \P\big( |D_{1j}| > \varepsilon, \# T = l \big) + \P\left( \# T > k \right) \\ &\leq k d \, \P\big(|D_{1 j}| > \varepsilon \big) + \P\left( \# T > k \right). \end{align*} % For the first term we use the exact distribution of $D_1$ from Lemma~\ref{lem:mondrian_app_cells_identically_distributed} and for the second term we apply Lemma~\ref{lem:mondrian_app_cells_tail}. % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq k d \, \P\big(|D_{1 j}| > \varepsilon \big) + \P\left( \# T > k \right) \\ &\leq k d \, e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-k (1 + \lambda |D|_1)^{-d} / 3}. \end{align*} % Finally, set $k = \big\lceil 3 \lambda \varepsilon (1 + \lambda |D|_1)^d \big\rceil$ and note the bound is trivial unless $\varepsilon \leq |D|_1$. % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq \big( 3 \lambda \varepsilon (1 + \lambda |D|_1)^d + 1 \big) d \, e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-\lambda \varepsilon} \\ &\leq 3d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-\lambda \varepsilon} \\ &\leq 5d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon}. \end{align*} % \end{proof} For the remainder of this section, we turn our attention to the partitions generated by Mondrian random forests. In particular, we study the refinement generated by overlaying $B$ independent Mondrian processes with possibly different lifetime parameters, and intersecting their resulting individual partitions. \begin{definition}[Partition refinement]% % Let $T_1, \ldots, T_B$ be partitions of a set. Their common refinement is % \begin{align*} \bigwedge_{b=1}^B T_b = \left\{ \bigcap_{b=1}^B D_b: D_b \in T_b \right\} \bigsetminus \left\{ \emptyset \right\}. \end{align*} % \end{definition} We begin our analysis of Mondrian forest refinements with a pair of simple inequalities for bounding the total number of refined cells in Lemma~\ref{lem:mondrian_app_refinement_inequalities}. This result does not depend on the probabilistic structure of the Mondrian process, and holds for any rectangular partitions. \begin{lemma}[Inequalities for refinements of rectangular partitions] \label{lem:mondrian_app_refinement_inequalities} Let $T_1, \ldots, T_B$ be rectangular partitions of a $d$-dimensional rectangle $D$. 
Then % \begin{align} \label{eq:mondrian_app_refinement_1} \# \bigwedge_{b=1}^B T_b &\leq \prod_{b=1}^B \# T_b, \end{align} % and for all $B \leq d$ there exist $T_b$ such that \eqref{eq:mondrian_app_refinement_1} holds with equality. If $\# T_{b j}$ denotes the number of splits made by $T_b$ in dimension $j$, then % \begin{align} \label{eq:mondrian_app_refinement_2} \# \bigwedge_{b=1}^B T_b &\leq \prod_{j=1}^d \left( 1 + \sum_{b=1}^B \# T_{b j} \right), \end{align} % and for all $B \geq d$ there exist $T_b$ such that \eqref{eq:mondrian_app_refinement_2} holds with equality. \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_inequalities}] The first inequality \eqref{eq:mondrian_app_refinement_1} follows because every cell in $\bigwedge_b T_b$ is the intersection of cells $D_b \in T_b$ for $1 \leq b \leq B$, and there at at most $\prod_{b=1}^{B} \# T_b$ ways to choose these. This bound is achievable when $B \leq d$ by setting $T_b$ to be a tree with splits only in dimension $b$, so that every such intersection of cells gives a cell in the refinement. For the second inequality \eqref{eq:mondrian_app_refinement_2}, we construct a new forest of trees. In particular, for each $1 \leq j \leq d$ define $A_j$ to be the set of locations in $D_j$ where a tree $T_b$ makes a split in dimension $j$ for some $b$. Define $T'_j$ to be a tree which has splits only in dimension $j$ and at the locations prescribed by $A_j$. Clearly, since every split in $T'_j$ comes from a split in some $T_b$ in dimension $j$, we have $\# T'_j \leq 1 + \sum_b \# T_{b j}$. Applying the first inequality to this new forest yields $\# \bigwedge_j T'_j \leq \prod_j \# T'_j \leq \prod_j \big( 1 + \sum_b \# T_{b j} \big)$. Finally, note that $\bigwedge_j T'_j$ is a refinement of $\bigwedge_b T_b$ and the result follows. This bound is achievable when $B \geq d$ by letting $T_b$ have splits only in dimension $b$ when $b \leq d$ and to be the trivial partition otherwise. % \end{proof} The inequalities in Lemma~\ref{lem:mondrian_app_refinement_inequalities} provide rather crude bounds for the number of cells in a Mondrian forest refinement as they do not take into account the random structure. Indeed, it should be clear that the ``worst case'' scenarios, involving trees which contain splits only in a single direction, should be extremely unlikely under the Mondrian law. In Lemma~\ref{lem:mondrian_app_refinement} we confirm this intuition and provide an exact value for the expected number of cells in a Mondrian refinement by direct calculation. This result strictly generalizes the single tree version provided as \citet[Proposition~2]{mourtada2020minimax}. \begin{lemma}[Expected number of cells in a Mondrian forest refinement] \label{lem:mondrian_app_refinement} Let $D$ be a $d$-dimensional rectangle and take $\lambda_b > 0$ for $1 \leq b \leq B$. Let $T_b \sim \cM(D, \lambda_b)$ be independent. Then the expected number of cells in their refinement is exactly % \begin{align*} \E\left[\# \bigwedge_{b=1}^B T_b \right] &= \prod_{j=1}^d \left( 1 + |D_j| \sum_{b=1}^B \lambda_b \right). \end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_refinement}] By \citet[Proposition~2]{mourtada2020minimax} we have the result for a single tree: % \begin{align} \label{eq:mondrian_app_single_tree} \E\left[\# T_b \right] &= \prod_{j=1}^d \left( 1 + |D_j| \lambda_b \right). \end{align} % We proceed by induction on $B$. 
By the tower law, % \begin{align*} \E\left[\# \bigwedge_{b=1}^B T_b \right] &= \E\left[ \sum_{D' \in T_B} \# \bigwedge_{b=1}^{B-1} (T_b \cap D') \right] = \E\left[ \sum_{D' \in T_B} \E\left[ \# \bigwedge_{b=1}^{B-1} (T_b \cap D') \biggm| T_B \right] \right]. \end{align*} % Now by the restriction property of Mondrian processes \citep[Fact~2]{mourtada2020minimax}, observe that $T_b \cap D' \sim \cM(D', \lambda_b)$ conditional on $T_B$. Then by the induction hypothesis, % \begin{align*} \E\left[ \# \bigwedge_{b=1}^{B-1} (T_b \cap D') \biggm| T_B \right] &= \prod_{j=1}^d \left( 1 + |D'_j| \sum_{b=1}^{B-1} \lambda_b \right) = \E\big[ \# T_{D'} \mid T_B \big] \end{align*} % where $T_{D'} \sim \cM\big(D', \sum_{b=1}^{B-1} \lambda_B\big)$ conditional on $T_B$, by the result for a single tree \eqref{eq:mondrian_app_single_tree}. The restriction property finally shows that there exist realizations of $T_{D'}$ which ensure that $\sum_{D' \in T_B} \# T_{D'}$ is equal in distribution to $\# T$, where $T \sim \cM(D, \sum_{b=1}^B \lambda_b)$, so by \eqref{eq:mondrian_app_single_tree}, % \begin{align*} \E\left[\# \bigwedge_{b=1}^B T_b \right] &= \E\left[ \sum_{D' \in T_B} \E\big[ \# T_{D'} \mid T_B \big] \right] = \E\big[\# T \big] = \prod_{j=1}^d \left( 1 + |D_j| \sum_{b=1}^B \lambda_b \right). \end{align*} % \end{proof} While the exact expectation calculation in Lemma~\ref{lem:mondrian_app_refinement} is neat, sharper control on the tail behavior of the number of cells in a Mondrian refinement is desired. Lemma~\ref{lem:mondrian_app_refinement_tail} provides this, again making use of the subcell trick to convert a crude bound based on Lemma~\ref{lem:mondrian_app_refinement_inequalities} into a useful tail inequality. We assume for simplicity that all of the lifetimes are identical. \begin{lemma}[Tail bound on the number of cells in a Mondrian forest refinement] \label{lem:mondrian_app_refinement_tail} Let $T_b \sim \cM(D, \lambda)$ be i.i.d.\ for $1 \leq b \leq B$. Then % \begin{align*} \P\left( \# \bigwedge_{b=1}^B T_b > 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d \right) &\leq 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_tail}] We begin with a coarse estimate and refine it with the subcell trick. By Lemma~\ref{lem:mondrian_app_refinement_inequalities} \eqref{eq:mondrian_app_refinement_2}, for any $t > 0$, recalling that $\# T_{b j}$ is the number of splits made by $T_b$ in dimension $j$, % \begin{align} \nonumber \P\left( \# \bigwedge_{b=1}^B T_b > t \right) &\leq \P\left( \prod_{j=1}^d \left( 1 + \sum_{b=1}^B \# T_{b j} \right) > t \right) \leq \sum_{j=1}^d \P\left( 1 + \sum_{b=1}^B \# T_{b j} > t^{1/d} \right) \\ \label{eq:mondrian_app_refinement_tail_coarse} &\leq d\, \P\left( \sum_{b=1}^B \# T_b > t^{1/d} \right) \leq d B\, \P\left( \# T_b > \frac{t^{1/d}}{B} \right). \end{align} % By the subcell trick, partition $D$ into at most $(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$ with side lengths at most $(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$. As every cell in $\bigwedge_b T_b$ corresponds to at least one cell in $\bigwedge_b (T_b \cap D')$, % \begin{align*} \P\left( \# \bigwedge_{b=1}^B T_b > t \right) &\leq \P\left( \sum_{D' \in \cD_\varepsilon} \# \bigwedge_{b=1}^B (T_b \cap D') > t \right) \leq \sum_{D' \in \cD_\varepsilon} \P\left( \# \bigwedge_{b=1}^B (T_b \cap D') > \frac{t}{\# \cD_\varepsilon} \right). 
\end{align*} % Applying the coarse estimate \eqref{eq:mondrian_app_refinement_tail_coarse} to $\# \bigwedge_b (T_b \cap D')$ gives % \begin{align*} \P\left( \# \bigwedge_{b=1}^B T_b > t \right) &\leq d B \sum_{D' \in \cD_\varepsilon} \P\left( \# (T_b \cap D') > \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} \right). \end{align*} % Now apply Lemma~\ref{lem:mondrian_app_cells_tail} and set $\varepsilon = \frac{1}{\lambda |D|_1}$ to obtain % \begin{align*} \P\left( \# \bigwedge_{b=1}^B T_b > t \right) &\leq d B \sum_{D' \in \cD_\varepsilon} \P\left( \# (T_b \cap D') > \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}} \right) \\ &\leq d B \sum_{D' \in \cD_\varepsilon} 2 (1 + \lambda |D'|_1)^d e^{- t^{1/d} \# \cD_\varepsilon^{-1/d} B^{-1} (1 + \lambda |D'|_1)^{-d} / 3} \\ &\leq 2 d B (1 + 1 / \varepsilon)^d (1 + \lambda \varepsilon |D|_1)^d e^{- t^{1/d} (1 + 1/\varepsilon)^{-1} B^{-1} (1 + \lambda \varepsilon |D|_1)^{-d} / 3} \\ &\leq 2^{d+1} d B (1 + \lambda |D|_1)^d e^{- t^{1/d} (1 + \lambda |D|_1)^{-1} B^{-1} 2^{-d} / 3}. \end{align*} % Finally, replacing $t$ by $3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d$ we have % \begin{align*} \P\left( \# \bigwedge_{b=1}^B T_b > 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d \right) &\leq 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}. \end{align*} % \end{proof} \chapter{Supplement to Dyadic Kernel Density Estimators} \label{app:kernel} This section contains complementary detailed expositions of some of our main results, along with additional technical lemmas which may be of independent interest. We also provide full proofs for all of our theoretical contributions. \section{Supplementary main results} In this first section we provide more detailed versions of some of the results presented in the main text, alongside some intermediate lemmas which were skipped for conciseness. We begin with some extra notation used throughout this appendix. For real vectors, $\|\cdot\|_p$ is the standard $\ell^p$-norm defined for $p \in [1, \infty]$. For real square matrices, $\|\cdot\|_p$ is the operator norm induced by the corresponding vector norm. In particular, $\|\cdot\|_1$ is the maximum absolute column sum, $\|\cdot\|_\infty$ is the maximum absolute row sum, and $\|\cdot\|_2$ is the maximum singular value. For real symmetric matrices, $\|\cdot\|_2$ coincides with the maximum absolute eigenvalue. We use $\|\cdot\|_{\max}$ to denote the largest absolute entry of a real matrix. For real-valued functions, $\|\cdot\|_\infty$ denotes the (essential) supremum norm. For a bounded set $\cX \subseteq \R$ and $a \geq 0$ we use $[\cX \pm a]$ to denote the compact interval $[\inf \cX - a, \ \sup \cX + a]$. For measurable subsets of $\R^d$ we use $\Leb$ to denote the Lebesgue measure, and for finite sets we use $|\cdot|$ for the cardinality. Write $\sum_i$ for $\sum_{i=1}^n$ when clear from context. Similarly, use $\sum_{i \Du \frac{t + C_1 \log n}{\sqrt{n}} \right) &\leq C_2 e^{-C_3 t}, \end{align*} % for some positive constants $C_1$, $C_2$, $C_3$, and for all $t > 0$. By integration of tail probabilities, % \begin{align*} \E\left[ \sup_{w \in \cW} \big| \sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\big| \right] &\lesssim \frac{\Du \log n}{\sqrt{n}}. \end{align*} % Further, $Z_n^{L\prime}$ has the same covariance structure as $\sqrt{n} L_n'$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\left[ Z_n^{L\prime}(w) Z_n^{L\prime}(w') \right] &= n \E\left[ L_n'(w) L_n'(w') \right]. 
\end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/2]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big| \right] &\lesssim \Du \delta_n \sqrt{\log 1/\delta_n}, \end{align*} % and has continuous trajectories. The process $Z_n^{L\prime}$ is a function only of $\bA_n'$ and some random noise which is independent of $(\bA_n', \bV_n')$. \end{lemma} \begin{lemma}[Conditional strong approximation of $E_n$] \label{lem:kernel_app_conditional_strong_approx_En} Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For $n \geq 2$ and $t_n > 0$ with $\left|\log t_n\right| \lesssim \log n$, there exists on some probability space a copy of $\big(\bA_n, \bV_n, E_n\big)$, denoted $\big(\bA_n', \bV_n', E_n'\big)$, and a process $\tilde Z^{E\prime}_n$ which is Gaussian conditional on $\bA_n'$ and mean-zero conditional on $\bA_n'$, satisfying % \begin{align*} \P\left( \sup_{w \in \cW} \big| \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) \big| > t_n \Bigm\vert \bA_n' \right) &\leq C_1 t_n^{-2} n^{-1/2} h^{-3/4} (\log n)^{3/4}, \end{align*} $\bA_n'$-almost surely for some constant $C_1 > 0$. Setting $t_n = n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ for any sequence $R_n \to \infty$ and taking an expectation gives % \begin{align*} \sup_{w \in \cW} \big| \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) \big| &\lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n. \end{align*} % Further, $\tilde Z_n^{E\prime}$ has the same conditional covariance as $\sqrt{n^2h} E_n'$ in that for all $w, w' \in \cW$, % \begin{align*} \E\left[ \tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') \bigm\vert \bA_n' \right] &= n^2h \E\left[ E_n'(w) E_n'(w') \bigm\vert \bA_n' \right]. \end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big| \right] &\lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}, \end{align*} % and has continuous trajectories. \end{lemma} \begin{lemma}[Unconditional strong approximation of $E_n$] \label{lem:kernel_app_unconditional_strong_approx_En} Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Let $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$ be defined as in Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. For each $n \geq 2$ there exists (on some probability space) a copy of $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$, denoted $\big(\bA_n'', \bV_n'', \tilde Z_n^{E\dprime}\big)$, and a centered Gaussian process $Z^{E\dprime}_n$ satisfying % \begin{align*} \E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] &\lesssim n^{-1/6} (\log n)^{2/3}. \end{align*} % Further, $Z_n^{E\dprime}$ has the same (unconditional) covariance structure as $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\left[ Z_n^{E\dprime}(w) Z_n^{E\dprime}(w') \right] &= \E\left[ \tilde Z_n^{E\dprime}(w) \tilde Z_n^{E\dprime}(w') \right] = n^2h \, \E\left[ E_n(w) E_n(w') \right]. \end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w') \big| \right] &\lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}. 
\end{align*} % Finally, $Z_n^{E\dprime}$ is independent of $\bA_n''$ and has continuous trajectories. \end{lemma} We combine these strong approximations to deduce a coupling for $\hat f_W$ in Theorem~\ref{thm:kernel_app_strong_approx_fW}, taking care with independence to ensure the approximating processes are jointly Gaussian. \begin{theorem}[Strong approximation of $\hat f_W$] \label{thm:kernel_app_strong_approx_fW} Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For each $n \geq 2$ and any sequence $R_n \to \infty$ there exists on some probability space a centered Gaussian process $Z_n^{f\prime}$ and a copy of $\hat f_W$, denoted $\hat f_W'$, satisfying % \begin{align*} &\sup_{w \in \cW} \Big| \hat f_W'(w) - \E[\hat f_W'(w)] - Z_n^{f\prime}(w) \Big| \\ &\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3}. \end{align*} % Further, $Z_n^{f\prime}$ has the same covariance structure as $\hat f_W'(w)$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\big[Z_n^{f\prime}(w) Z_n^{f\prime}(w')\big] &= \Cov\Big[ \hat f_W'(w), \hat f_W'(w') \Big] = \Sigma_n(w,w'). \end{align*} % It has continuous trajectories satisfying the following regularity property for any $\delta_n \in (0, 1/2]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \Big| Z_n^{f\prime}(w) - Z_n^{f\prime}(w') \Big| \right] &\lesssim \frac{\Du}{\sqrt n} \delta_n \sqrt{\log \frac{1}{\delta_n}} + \frac{1}{\sqrt{n^2h}} \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}. \end{align*} % \end{theorem} The main result Theorem~\ref{thm:kernel_strong_approx_Tn} now follows easily using Theorem~\ref{thm:kernel_app_strong_approx_fW}, the bias bound from Theorem~\ref{thm:kernel_bias}, and properties of $\Sigma_n$ established in Lemma~\ref{lem:kernel_variance_bounds}. \subsection{Covariance estimation} \label{sec:kernel_app_covariance_estimation} In this section we carefully construct a consistent estimator for the covariance function $\Sigma_n$. Firstly, we characterize $\Sigma_n$ in Lemma~\ref{lem:kernel_app_covariance_structure}. In Lemma~\ref{lem:kernel_app_covariance_estimation} we define the estimator and demonstrate that it converges in probability in a suitable sense. In Lemma~\ref{lem:kernel_app_alternative_covariance_estimator} we give an alternative representation which is more amenable to computation. \begin{lemma}[Covariance structure] \label{lem:kernel_app_covariance_structure} Suppose Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold. Then $\Sigma_n$, as defined in Section~\ref{sec:kernel_degeneracy}, admits the following representations, where $1 \leq i < j < r \leq n$. % \begin{align*} \Sigma_n(w,w') &= \frac{2}{n(n-1)} \,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i j},w') \big] + \frac{4(n-2)}{n(n-1)} \,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i r},w') \big] \\ &= \frac{2}{n(n-1)} \,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i j},w') \big] \\ &\quad+ \frac{4(n-2)}{n(n-1)} \,\Cov\!\big[ \E[k_h(W_{i j},w) \mid A_i], \E[k_h(W_{i j},w') \mid A_i] \big], \end{align*} % \end{lemma} \begin{lemma}[Covariance estimation] \label{lem:kernel_app_covariance_estimation} Grant Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth}, and suppose $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. 
Define % \begin{align*} S_{i j r}(w,w') &= \frac{1}{6} \Big( k_h(W_{i j},w) k_h(W_{i r},w') + k_h(W_{i j},w) k_h(W_{jr},w') + k_h(W_{i r},w) k_h(W_{i j},w') \\ &\quad+ k_h(W_{i r},w) k_h(W_{jr},w') + k_h(W_{jr},w) k_h(W_{i j},w') + k_h(W_{jr},w) k_h(W_{i r},w') \Big), \\ \hat \Sigma_n(w,w') &= \frac{4}{n^2(n-1)^2} \sum_{i 0$ on $\cW$. Then the optimization problem \eqref{eq:kernel_app_sdp} has an approximately optimal solution $\hat\Sigma_n^+$ which is uniformly entrywise-consistent for $\Sigma_n$ in the sense that % \begin{align*} \sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}. \end{align*} \end{lemma} The optimization problem \eqref{eq:kernel_app_sdp} is stated for functions rather than matrices so is infinite-dimensional. However, when restricting to finite-size matrices, Lemma~\ref{lem:kernel_app_sdp} still holds and does not depend on the size of the matrices. Furthermore, the problem then becomes a semi-definite program and so can be solved to arbitrary precision in polynomial time in the size of the matrices \citep{laurent2005semidefinite}. The Lipschitz-type constraint in the optimization problem \eqref{eq:kernel_app_sdp} ensures that $\hat \Sigma_n^+$ is sufficiently smooth and is a technicality required by some of the later proofs. In practice this constraint is readily verified. \begin{lemma}[Positive semi-definite variance estimator bounds] \label{lem:kernel_app_variance_estimator_bounds} Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold, and that $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. Then $\hat \Sigma_n^+(w,w) \geq 0$ almost surely for all $w \in \cW$ and % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} &\lesssim_\P \inf_{w \in \cW} \hat \Sigma_n^+(w,w) \leq \sup_{w \in \cW} \hat \Sigma_n^+(w,w) \lesssim_\P \frac{\Du^2}{n} + \frac{1}{n^2h}. \end{align*} \end{lemma} \subsection{Feasible uniform confidence bands} We use the strong approximation derived in Section~\ref{sec:kernel_app_strong_approx} and the positive semi-definite covariance estimator introduced in Section~\ref{sec:kernel_app_covariance_estimation} to construct feasible uniform confidence bands. We drop the prime notation for copies of processes in the interest of clarity. \begin{lemma}[Proximity of the standardized and studentized $t$-statistics] \label{lem:kernel_app_studentized_t_statistic} Let Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold, and suppose that $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. Define for $w \in \cW$ the Studentized $t$-statistic process % \begin{align*} \hat T_n(w) = \frac{\hat f_W(w) - f_W(w)} {\sqrt{\hat\Sigma_n^+(w,w)}}. \end{align*} % Then % \begin{align*} \sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| &\lesssim_\P \sqrt{\frac{\log n}{n}} \left( \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \right) \frac{1}{\Dl + 1/\sqrt{n h}}. \end{align*} \end{lemma} \begin{lemma}[Feasible Gaussian approximation of the infeasible Gaussian process] \label{lem:kernel_app_distributional_approx_feasible_gaussian} Let Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold, and suppose that $n h \gtrsim \log n$ and $f_W(w) > 0$ on $\cW$. 
Define a process $\hat Z_n^T(w)$ which, conditional on the data $\bW_n$, is conditionally mean-zero and conditionally Gaussian, and whose conditional covariance structure is % \begin{align*} \E\left[ \hat Z_n^T(w) \hat Z_n^T(w') \bigm| \bW_n \right] &= \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \end{align*} % Then the following conditional Kolmogorov--Smirnov result holds. % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \biggm\vert \bW_n \right) \right| &\lesssim_\P \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*} \end{lemma} \begin{lemma}[Feasible Gaussian approximation of the studentized $t$-statistic] \label{lem:kernel_app_feasible_gaussian_approx} Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth} and \ref{ass:kernel_rates} hold, and suppose that $f_W(w) > 0$ on $\cW$. Then % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \right| &\ll_\P 1. \end{align*} \end{lemma} These intermediate lemmas can be used to establish the valid and feasible uniform confidence bands presented in Theorem~\ref{thm:kernel_ucb} in the main text. See Section~\ref{sec:kernel_app_proofs} for details. \subsection{Counterfactual dyadic density estimation} In this section we give a detailed analysis of the counterfactual estimator of Section~\ref{sec:kernel_counterfactual}. We begin with an assumption describing the counterfactual setup. \begin{assumption}[Counterfactual data generation] \label{ass:kernel_app_counterfactual} For each $r \in \{0,1\}$, let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be as in Assumption~\ref{ass:kernel_data}. Let $X_i^r$ be finitely-supported variables, setting $\bX_n^r = (X_1^r, \ldots, X_n^r)$. Suppose that $(A_i^r, X_i^r)$ are independent over $1 \leq i \leq n$ and that $\bX_n^r$ is independent of $\bV_n^r$. Assume that $W_{i j}^r \mid X_i^r, X_j^r$ has a Lebesgue density $f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$ and that $X_i^r$ has positive probability mass function $p_X^r(x)$ on a common support $\cX$. Suppose that $(\bA_n^0, \bV_n^0, \bX_n^0)$ and $(\bA_n^1, \bV_n^1, \bX_n^1)$ are independent. \end{assumption} The counterfactual density of $W_{i j}$ in population $1$ had $X_i, X_j$ followed population $0$ is % \begin{align*} f_W^{1 \triangleright 0}(w) &= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right] = \sum_{x_1 \in \cX} \sum_{x_2 \in \cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) p_X^{1}(x_1) p_X^{1}(x_2), \end{align*} % with $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$. Define the counterfactual dyadic kernel density estimator % \begin{align*} \hat f_W^{1 \triangleright 0}(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n \hat \psi(X_i^1) \hat \psi(X_j^1) k_h(W_{i j}^1, w), \end{align*} % where $\hat\psi(x) = \I\{\hat p_X^{1}(x) > 0\}\hat p_X^{0}(x) / \hat p_X^{1}(x)$ and $\hat p_X^{r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$. 
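This estimator is straightforward to compute directly from the definitions above. The following minimal sketch (in NumPy, using an illustrative Gaussian kernel; the function and variable names are ours and not part of the formal development) evaluates $\hat f_W^{1 \triangleright 0}$ on a grid of points, assuming the dyadic outcomes $W_{i j}^1$ are stored in a symmetric array.
%
\begin{verbatim}
import numpy as np

def counterfactual_dyadic_kde(W1, X0, X1, w_grid, h):
    # W1: (n, n) symmetric array of dyadic outcomes W_ij^1 (only i < j used)
    # X0, X1: length-n covariate samples from populations 0 and 1
    # w_grid: evaluation points; h: bandwidth
    n = len(X1)
    support = np.unique(np.concatenate([X0, X1]))
    p0 = np.array([np.mean(X0 == x) for x in support])
    p1 = np.array([np.mean(X1 == x) for x in support])
    # hat psi(x) = 1{hat p1(x) > 0} * hat p0(x) / hat p1(x)
    psi_vals = np.divide(p0, p1, out=np.zeros_like(p0), where=p1 > 0)
    psi = psi_vals[np.searchsorted(support, X1)]
    # Gaussian kernel k_h(u, w) = K((u - w) / h) / h, an illustrative choice
    iu, ju = np.triu_indices(n, k=1)
    pair_w = psi[iu] * psi[ju]
    est = np.empty(len(w_grid))
    for a, w in enumerate(w_grid):
        kh = np.exp(-0.5 * ((W1[iu, ju] - w) / h) ** 2)
        kh /= np.sqrt(2.0 * np.pi) * h
        est[a] = 2.0 / (n * (n - 1)) * np.sum(pair_w * kh)
    return est
\end{verbatim}
%
Any kernel $k_h(\cdot, w)$ of the kind considered in the main text can be substituted for the Gaussian choice made here.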
Since $p_X^r(x) > 0$, % \begin{align*} \hat\psi(x) - \psi(x) &= \frac{\hat p_X^{0}(x) - p_X^0(x)}{p_X^1(x)} - \frac{p_X^0(x)}{p_X^1(x)} \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \\ &\quad+ \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \frac{\hat p_X^{1}(x) p_X^0(x) - \hat p_X^{0}(x)p_X^1(x)} {\hat p_X^{1}(x) p_X^1(x)} \\ &= \frac{1}{n} \sum_{r=1}^n \kappa(X_r^0, X_r^1, x) + O_\P\left(\frac{1}{n}\right) \end{align*} % is an asymptotic linear representation where % \begin{align*} \kappa(X_i^0, X_i^1, x) &= \frac{\I\{X_i^0 = x\} - p_X^0(x)}{p_X^1(x)} - \frac{p_X^0(x)}{p_X^1(x)} \frac{\I\{X_i^1 = x\} - p_X^1(x)}{p_X^1(x)} \end{align*} % satisfies $\E[\kappa(X_i^0, X_i^1, x)] = 0$. We now establish uniform consistency and feasible strong approximation results for the counterfactual density estimator. \begin{lemma}[Bias of $\hat f_W^{1 \triangleright 0}$] \label{lem:kernel_app_counterfactual_bias} Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. Then % \begin{align*} \sup_{w \in \cW} \big| \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - f_W^{1 \triangleright 0}(w) \big| \lesssim h^{p \wedge \beta} + \frac{1}{n}. \end{align*} \end{lemma} \begin{lemma}[Hoeffding-type decomposition for $\hat f_W^{1 \triangleright 0}$] \label{lem:kernel_app_counterfactual_hoeffding} Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. With $k_{i j} = k_h(W_{i j}^1, w)$, $\kappa_{r i} = \kappa(X_r^0, X_r^1, X_i^1)$ and $\psi_i = \psi(X_i^1)$, define the projections % \begin{align*} u &= \E\left[ k_{i j} \psi_i \psi_j \right], \\ u_i &= \frac{2}{3} \psi_i \E\left[ k_{i j} \psi_j \mid A_i^1 \right] + \frac{2}{3} \E\left[ k_{jr} \psi_j \kappa_{i r} \mid X_i^0, X_i^1 \right] - \frac{2}{3} u, \\ u_{i j} &= \frac{1}{3} \psi_i \psi_j \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_i \E\left[ k_{i r} \psi_r \mid A_i^1 \right] + \frac{1}{3} \psi_i \E\left[ k_{i r} \kappa_{jr} \mid A_i^1, X_j^0, X_j^1 \right] \\ &\quad+ \frac{1}{3} \kappa_{j i} \E\left[ k_{i r} \psi_r \mid A_i^1 \right] + \frac{1}{3} \psi_j \E\left[ k_{jr} \psi_r \mid A_j^1 \right] + \frac{1}{3} \psi_j \E\left[ k_{jr} \kappa_{i r} \mid X_i^0, X_i^1, A_j^1 \right] \\ &\quad+ \frac{1}{3} \kappa_{i j} \E\left[ k_{jr} \psi_r \mid A_j^1 \right] - u_i - u_j + u, \\ u_{i j r} &= \frac{1}{3} \psi_i \psi_j \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_i \kappa_{r j} \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_j \kappa_{r i} \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] \\ &\quad+ \frac{1}{3} \psi_i \psi_r \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] + \frac{1}{3} \psi_i \kappa_{jr} \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] + \frac{1}{3} \psi_r \kappa_{j i} \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] \\ &\quad+ \frac{1}{3} \psi_j \psi_r \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] + \frac{1}{3} \psi_j \kappa_{i r} \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] + \frac{1}{3} \psi_r \kappa_{i j} \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] \\ &\quad- u_{i j} - u_{i r} - u_{jr} + u_i + u_j + u_r - u, \\ v_{i j r} &= \frac{1}{3} k_{i j} \big(\psi_i \psi_j +\psi_i \kappa_{r j} +\psi_j \kappa_{r i} \big) + \frac{1}{3} k_{i r} \big(\psi_i \psi_r +\psi_i \kappa_{jr} +\psi_r \kappa_{j i} \big) \\ &\quad+ \frac{1}{3} k_{jr} \big(\psi_j \psi_r +\psi_j \kappa_{i r} +\psi_r \kappa_{i j} \big). 
\end{align*}
%
With $l_i^{1 \triangleright 0}(w) = u_i$ and $e_{i j r}^{1 \triangleright 0}(w) = v_{i j r} - u_{i j r}$, set
%
\begin{align*}
L_n^{1 \triangleright 0}(w) &= \frac{3}{n} \sum_{i=1}^n l_i^{1 \triangleright 0}(w) &\text{and} & &E_n^{1 \triangleright 0}(w) &= \frac{6}{n(n-1)(n-2)} \sum_{i=1}^{n-2} \sum_{j=i+1}^{n-1} \sum_{r=j+1}^n e_{i j r}^{1 \triangleright 0}(w).
\end{align*}
%
Then the following Hoeffding-type decomposition holds, where the $O_\P(1/n)$ term is uniform in $w \in \cW$:
%
\begin{align*}
\hat f_W^{1 \triangleright 0}(w) = \E\big[\hat f_W^{1 \triangleright 0}(w)\big] + L_n^{1 \triangleright 0}(w) + E_n^{1 \triangleright 0}(w) + O_\P\left( \frac{1}{n} \right).
\end{align*}
%
Further, the stochastic processes $L_n^{1 \triangleright 0}$ and $E_n^{1 \triangleright 0}$ are mean-zero and orthogonal in $L^2(\P)$. Define the upper and lower degeneracy constants as
%
\begin{align*}
\Du^{1 \triangleright 0} &= \limsup_{n \to \infty} \sup_{w \in \cW} \Var\big[ l_i^{1 \triangleright 0}(w) \big]^{1/2} &\text{and}& & \Dl^{1 \triangleright 0} &= \liminf_{n \to \infty} \inf_{w \in \cW} \Var\big[ l_i^{1 \triangleright 0}(w) \big]^{1/2}.
\end{align*}
\end{lemma}

\begin{lemma}[Uniform consistency of $\hat f_W^{1 \triangleright 0}$]
\label{lem:kernel_app_counterfactual_uniform_consistency}
Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. Then
%
\begin{align*}
\E\left[ \sup_{w \in \cW} \big|\hat f_W^{1 \triangleright 0}(w) - f_W^{1 \triangleright 0}(w) \big| \right] &\lesssim h^{p \wedge \beta} + \frac{\Du^{1 \triangleright 0}}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}.
\end{align*}
\end{lemma}

\begin{lemma}[Strong approximation of $\hat f_W^{1 \triangleright 0}$]
\label{lem:kernel_app_counterfactual_sa}
Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. On an appropriately enlarged probability space and for any sequence $R_n \to \infty$, there exists a mean-zero Gaussian process $Z_n^{f, 1 \triangleright 0}$ with the same covariance structure as $\hat f_W^{1 \triangleright 0}$ satisfying
%
\begin{align*}
&\sup_{w \in \cW} \left| \hat f_W^{1 \triangleright 0}(w) - \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - Z_n^{f, 1 \triangleright 0}(w) \right| \\
&\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3}.
\end{align*}
\end{lemma}
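To gauge the size of this coupling error, consider the purely illustrative choices $h \asymp n^{-1/5}$ and $R_n = \log n$. Then
%
\begin{align*}
n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n &\asymp n^{-43/40} (\log n)^{11/8} &\text{and}& & n^{-7/6} h^{-1/2} (\log n)^{2/3} &\asymp n^{-16/15} (\log n)^{2/3},
\end{align*}
%
both of which are dominated by the first term, so the coupling holds at the rate $n^{-1} \log n$.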
\begin{lemma}[Counterfactual covariance structure]
\label{lem:kernel_app_counterfactual_covariance_structure}
Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. Writing $k_{i j}'$ for $k_h(W_{i j}^1, w')$ etc., the counterfactual covariance function is
%
\begin{align*}
&\Sigma_n^{1 \triangleright 0}(w,w') = \Cov\left[ \hat f_W^{1 \triangleright 0}(w), \hat f_W^{1 \triangleright 0}(w') \right] \\
&\quad= \frac{4}{n} \E\left[ \Big( \psi_i \E\big[ k_{i j} \psi_j \mid A_i^1 \big] + \E\left[ k_{r j} \psi_r \kappa_{i j} \mid X_i^0, X_i^1 \right] \Big) \right. \\
&\left. \qquad\qquad\quad \times \Big( \psi_i \E\big[ k_{i j}' \psi_j \mid A_i^1 \big] + \E\left[ k_{r j}' \psi_r \kappa_{i j} \mid X_i^0, X_i^1 \right] \Big) \right] \\
&\qquad+ \frac{2}{n^2} \E\left[ k_{i j} k_{i j}' \psi_i^2 \psi_j^2 \right] - \frac{4}{n} \E\left[ k_{i j} \psi_i \psi_j \right] \E\left[ k_{i j}' \psi_i \psi_j \right] + O\left( \frac{1}{n^{3/2}} + \frac{1}{\sqrt{n^4h}} \right).
\end{align*}
\end{lemma}

\begin{lemma}[Gaussian approximation of the standardized counterfactual $t$-statistic]
\label{lem:kernel_app_counterfactual_infeasible_t_statistic}
Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold, and suppose $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$. Define
%
\begin{align*}
T_n^{1 \triangleright 0}(w) &= \frac{\hat f_W^{1 \triangleright 0}(w) - f_W^{1 \triangleright 0}(w)} {\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}} \quad\text{and}\quad Z_n^{T, 1 \triangleright 0}(w) = \frac{Z_n^{f, 1 \triangleright 0}(w)} {\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}.
\end{align*}
%
Then with $R_n \to \infty$ as in Lemma~\ref{lem:kernel_app_counterfactual_sa},
%
\begin{align*}
&\sup_{w \in \cW} \left| T_n^{1 \triangleright 0}(w) - Z_n^{T, 1 \triangleright 0}(w) \right| \\
&\quad\lesssim_\P \frac{ n^{-1/2} \log n + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-2/3} h^{-1/2} (\log n)^{2/3} + n^{1/2} h^{p \wedge \beta}} {\Dl^{1 \triangleright 0} + 1/\sqrt{n h}}.
\end{align*}
\end{lemma}

\begin{theorem}[Infeasible counterfactual uniform confidence bands]
\label{thm:kernel_app_counterfactual_infeasible_ucb}
Let Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, \ref{ass:kernel_rates}, and \ref{ass:kernel_app_counterfactual} hold and suppose that $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$. Let $\alpha \in (0,1)$ be a confidence level and define $q^{1 \triangleright 0}_{1-\alpha}$ as the quantile satisfying
%
\begin{align*}
\P\left( \sup_{w \in \cW} \left| Z_n^{T,1 \triangleright 0}(w) \right| \leq q^{1 \triangleright 0}_{1-\alpha} \right) &= 1 - \alpha.
\end{align*}
%
Then
%
\begin{align*}
\P\left( f_W^{1 \triangleright 0}(w) \in \left[ \hat f_W^{1 \triangleright 0}(w) \pm q^{1 \triangleright 0}_{1-\alpha} \sqrt{\Sigma_n^{1 \triangleright 0}(w,w)} \, \right] \, \textup{for all } w \in \cW \right) \to 1 - \alpha.
\end{align*}
\end{theorem}
%
We propose an estimator for the counterfactual covariance function $\Sigma_n^{1 \triangleright 0}$. First let
%
\begin{align*}
\hat\kappa(X_i^0, X_i^1, x) &= \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} - \frac{\hat p_X^0(x)}{\hat p_X^1(x)} \frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)},
\end{align*}
%
and define the leave-out conditional expectation estimators
%
\begin{align*}
S_i^{1 \triangleright 0}(w) &= \hat\E\left[ k_h(W_{i j}^1,w) \psi(X_j^1) \mid A_i^1 \right] \\
&= \frac{1}{n-1} \left( \sum_{j=1}^{i-1} k_h(W_{j i}^1,w) \hat\psi(X_j^1) + \sum_{j=i+1}^n k_h(W_{i j}^1,w) \hat\psi(X_j^1) \right), \\
\tilde S_i^{1 \triangleright 0}(w) &= \hat\E\left[ k_h(W_{r j}^1,w) \psi(X_r^1) \kappa(X_i^0, X_i^1, X_j^1) \mid X_i^0, X_i^1 \right] \\
&= \frac{1}{n-1} \sum_{j=1}^n \I\{j \neq i\} \hat\kappa(X_i^0, X_i^1, X_j^1) S_j^{1 \triangleright 0}(w).
\end{align*}
%
Then set
%
\begin{align*}
\hat\Sigma_n^{1 \triangleright 0}(w,w') &= \frac{4}{n^2} \sum_{i=1}^n \left( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) + \tilde S_i^{1 \triangleright 0}(w) \right) \left( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') + \tilde S_i^{1 \triangleright 0}(w') \right) \\
&\quad- \frac{4}{n^3(n-1)} \sum_{i<j} k_h(W_{i j}^1,w) \, k_h(W_{i j}^1,w') \, \hat\psi(X_i^1)^2 \, \hat\psi(X_j^1)^2 \\
&\quad- \frac{4}{n} \, \hat f_W^{1 \triangleright 0}(w) \, \hat f_W^{1 \triangleright 0}(w').
\end{align*}
%
Substituting $\hat\Sigma_n^{1 \triangleright 0}$ for $\Sigma_n^{1 \triangleright 0}$ and arguing as for $\hat f_W$ renders the confidence bands of Theorem~\ref{thm:kernel_app_counterfactual_infeasible_ucb} feasible.

\subsection{Maximal inequalities}

We next present maximal inequalities for empirical processes of independent but not necessarily identically distributed (i.n.i.d.) random variables, which are used throughout the subsequent proofs.

\begin{lemma}[A maximal inequality for i.n.i.d.\ empirical processes]
\label{lem:kernel_app_maximal_entropy}
Let $X_1, \ldots, X_n$ be independent random variables taking values in a measurable space $(S, \cS)$ with distributions $\P_1, \ldots, \P_n$, and write $\bar\P = \frac{1}{n} \sum_{i=1}^n \P_i$ for their average distribution. Let $\cF$ be a pointwise measurable class of functions from $S$ to $\R$ with a measurable envelope $F$, and define the empirical process
%
\begin{align*}
G_n(f) &= \frac{1}{\sqrt n} \sum_{i=1}^n \Big( f(X_i) - \E\big[f(X_i)\big] \Big).
\end{align*}
%
Let $\sigma > 0$ satisfy $\sup_{f \in \cF} \|f\|_{\bar\P,2} \leq \sigma \leq \|F\|_{\bar\P,2}$ and $M = \max_{1 \leq i \leq n} F(X_i)$.
Then with $\delta = \sigma / \|F\|_{\bar\P,2} \in (0,1]$,
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &\lesssim \|F\|_{\bar\P,2} \, J\big(\delta, \cF, F \big) + \frac{\|M\|_{\P,2} \, J(\delta, \cF, F)^2}{\delta^2 \sqrt{n}},
\end{align*}
%
where $\lesssim$ is up to a universal constant, and $J(\delta, \cF, F)$ is the covering integral
%
\begin{align*}
J\big(\delta, \cF, F\big) &= \int_0^\delta \sqrt{1 + \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} \diff{\varepsilon},
\end{align*}
%
with the supremum taken over finite discrete probability measures $\Q$ on $(S, \cS)$.
\end{lemma}

\begin{lemma}[A VC class maximal inequality for i.n.i.d.\ empirical processes]
\label{lem:kernel_app_maximal_vc_inid}
Assume the same setup as in Lemma~\ref{lem:kernel_app_maximal_entropy}, and suppose that $\cF$ forms a VC-type class in that
%
\begin{align*}
\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) &\leq (C_1/\varepsilon)^{C_2}
\end{align*}
%
for all $\varepsilon \in (0,1]$, for some constants $C_1 \geq e$ (where $e$ is the base of the natural logarithm) and $C_2 \geq 1$. Then for $\delta \in (0,1]$ we have the covering integral bound
%
$J\big(\delta, \cF, F\big) \leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}$,
%
and so by Lemma~\ref{lem:kernel_app_maximal_entropy}, up to a universal constant,
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &\lesssim \sigma \sqrt{C_2 \log (C_1/\delta)} + \frac{\|M\|_{\P,2} C_2 \log(C_1/\delta)}{\sqrt{n}} \\
&\lesssim \sigma \sqrt{C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} + \frac{\|M\|_{\P,2} C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} {\sqrt{n}}.
\end{align*}
%
\end{lemma}

\subsection{Strong approximation results}

Next we provide two strong approximation results. The first is a corollary of the KMT approximation \citep{komlos1975approximation} which applies to bounded-variation functions of i.i.d.\ variables. The second is an extension of the Yurinskii coupling \citep{belloni2019conditional} which applies to Lipschitz functions of i.n.i.d.\ variables.

\begin{lemma}[A KMT approximation corollary]
\label{lem:kernel_app_kmt_corollary}
For $n \geq 1$ let $X_1, \ldots, X_n$ be i.i.d.\ real-valued random variables and $g_n: \R \times \R \to \R$ be a function satisfying the total variation bound $\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV < \infty$. Then on some probability space there exist independent copies of $X_1, \ldots, X_n$, denoted $X_1', \ldots, X_n'$, and a mean-zero Gaussian process $Z_n(x)$ such that if we define the empirical process
%
\begin{align*}
G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n \Big(g_n(X_i',x) - \E\big[g_n(X_i',x)\big]\Big),
\end{align*}
%
then for some universal positive constants $C_1$, $C_2$, and $C_3$,
%
\begin{align*}
\P\left( \sup_{x \in \R} \big|G_n(x) - Z_n(x)\big| > \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV \, \frac{t + C_1 \log n}{\sqrt n} \right) \leq C_2 e^{-C_3 t}.
\end{align*}
%
Further, $Z_n$ has the same covariance structure as $G_n$ in the sense that for all $x,\, x' \in \R$,
%
\begin{align*}
\E\big[Z_n(x) Z_n(x')\big] = \E\big[G_n(x) G_n(x')\big].
\end{align*}
%
By independently sampling from the law of $Z_n$ conditional on $X_1', \ldots, X_n'$, we can assume that $Z_n$ is a function only of $X_1', \ldots, X_n'$ and some independent random noise.
\end{lemma}
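As a check on the normalization, this lemma contains the classical KMT empirical process coupling as a special case: taking $g_n(u, x) = \I\{u \leq x\}$, for which $\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV = 1$, the process $G_n$ is the standard empirical process of $X_1', \ldots, X_n'$, and choosing $t \asymp \log n$ gives
%
\begin{align*}
\sup_{x \in \R} \big| G_n(x) - Z_n(x) \big| &\lesssim_\P \frac{\log n}{\sqrt n},
\end{align*}
%
where $Z_n$ has the covariance structure of a Brownian bridge evaluated at the distribution function of the $X_i$.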
\begin{lemma}[Yurinskii coupling for Lipschitz i.n.i.d.\ empirical processes]
\label{lem:kernel_app_yurinskii_corollary}
For $n \geq 2$ let $X_1, \dots, X_n$ be independent but not necessarily identically distributed (i.n.i.d.) random variables taking values in a measurable space $(S, \cS)$ and let $\cX_n \subseteq \R$ be a compact interval with $\left|\log \Leb(\cX_n)\right| \leq C_1 \log n$ where $C_1 > 0$ is a constant. Let $g_n$ be measurable on $S \times \cX_n$ satisfying $\sup_{\xi \in S} \sup_{x \in \cX_n} |g_n(\xi, x)| \leq M_n$ and $\sup_{x \in \cX_n} \max_{1 \leq i \leq n} \Var[g_n(X_i, x)] \leq \sigma_n^2$, with $\left|\log M_n\right| \leq C_1 \log n$ and $\left|\log \sigma_n^2\right| \leq C_1 \log n$. Suppose that $g_n$ satisfies the following uniform Lipschitz condition:
%
\begin{align*}
\sup_{\xi \in S} \sup_{x,x' \in \cX_n} \left| \frac{g_n(\xi, x) - g_n(\xi, x')} {x-x'} \right| \leq l_{n,\infty},
\end{align*}
%
and also the following $L^2$ Lipschitz condition:
%
\begin{align*}
\sup_{x,x' \in \cX_n} \E\left[ \frac{1}{n} \sum_{i=1}^n \left| \frac{g_n(X_i, x) - g_n(X_i, x')} {x-x'} \right|^2 \right]^{1/2} \leq l_{n,2},
\end{align*}
%
where $0 < l_{n,2} \leq l_{n,\infty}$, $\left|\log l_{n,2}\right| \leq C_1 \log n$, and $\left|\log l_{n,\infty}\right| \leq C_1 \log n$. Then for any $t_n > 0$ with $\left|\log t_n\right| \leq C_1 \log n$, there is a probability space carrying independent copies of $X_1, \ldots, X_n$ denoted $X_1', \ldots, X_n'$ and a mean-zero Gaussian process $Z_n(x)$ such that if we define the empirical process
%
$G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n \big( g_n(X'_i,x) - \E[g_n(X'_i,x)] \big)$,
%
then
%
\begin{align*}
&\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\
&\quad\leq \frac{ C_2 \sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n} \sqrt{M_n + \sigma_n\sqrt{\log n}} }{n^{1/4} t_n^2} \sqrt{ l_{n,2} \sqrt{\log n} + \frac{l_{n,\infty}}{\sqrt n} \log n},
\end{align*}
%
where $C_2 > 0$ is a constant depending only on $C_1$. Further, $Z_n$ has the same covariance structure as $G_n$ in the sense that for all $x, x' \in \cX_n$,
%
\begin{align*}
\E\big[Z_n(x) Z_n(x')\big] = \E\big[G_n(x) G_n(x')\big].
\end{align*}
\end{lemma}

\subsection{The Vorob'ev--Berkes--Philipp theorem}

We present a generalization of the Vorob'ev--Berkes--Philipp theorem \citep{dudley1999uniform} which allows one to ``glue'' multiple random variables or stochastic processes onto the same probability space, while preserving some pairwise distributions. We begin with some definitions.

\begin{definition}[Tree]
A \emph{tree} is a finite undirected graph which is connected and contains no cycles or self-loops.
\end{definition}

\begin{definition}[Polish Borel probability space]
A \emph{Polish Borel probability space} is a triple $(\cX, \cF, \P)$, where $\cX$ is a Polish space (a topological space metrizable by a complete separable metric), $\cF$ is the Borel $\sigma$-algebra induced on $\cX$ by its topology, and $\P$ is a probability measure on $(\cX, \cF)$. Important examples of Polish spaces include $\R^d$ and the Skorokhod space $\cD[0,1]^d$ for $d \geq 1$. In particular, one can consider vectors of real-valued random variables or stochastic processes indexed by compact subsets of $\R^d$ which have almost surely continuous trajectories.
\end{definition}

\begin{definition}[Projection of a law]
Let $(\cX_1, \cF_1)$ and $(\cX_2, \cF_2)$ be measurable spaces, and let $\P_{12}$ be a law on the product space $(\cX_1 \times \cX_2, \cF_1 \otimes \cF_2)$.
The \emph{projection} of $\P_{12}$ onto $\cX_1$ is the law $\P_1$ defined on $(\cX_1, \cF_1)$ by $\P_1 = \P_{12} \circ \pi_1^{-1}$, where $\pi_1(x_1, x_2) = x_1$ is the first-coordinate projection.
\end{definition}

\begin{lemma}[Vorob'ev--Berkes--Philipp theorem, tree form]
\label{lem:kernel_app_vbp}
Let $\cT$ be a tree with vertex set $\cV = \{1, \ldots, n\}$ and edge set $\cE$. Suppose that attached to each vertex $i$ is a Polish Borel probability space $(\cX_i, \cF_i, \P_i)$. Suppose that attached to each edge $(i,j) \in \cE$ (where $i < j$) is a Polish Borel probability space $(\cX_i \times \cX_j, \cF_i \otimes \cF_j, \P_{i j})$, where the projections of $\P_{i j}$ onto $\cX_i$ and $\cX_j$ are $\P_i$ and $\P_j$ respectively. Then there exists a probability space carrying random variables $X_1, \ldots, X_n$ such that $X_i$ has law $\P_i$ for each $i \in \cV$ and $(X_i, X_j)$ has law $\P_{i j}$ for each $(i,j) \in \cE$.
\end{lemma}

\begin{remark}[Applications of the Vorob'ev--Berkes--Philipp theorem]
Lemma~\ref{lem:kernel_app_vbp} can be used to couple random variables and stochastic processes in the following ways.
%
\begin{enumerate}[label=(\roman*)]
\item Let $X_1$ and $X_2$ be random variables, possibly defined on different probability spaces, taking values in Polish spaces, and let $\P_{12}$ be any law whose marginals are the laws of $X_1$ and $X_2$. Taking $\cT$ to be the tree with the single edge $(1,2)$ shows that there exist copies $X_1'$ and $X_2'$ on a common probability space with $(X_1', X_2')$ distributed according to $\P_{12}$.
\item For some $t > 0$ and $r_1, \ldots, r_{n-1} > 0$, let $X_1, \ldots, X_n$ be stochastic processes with trajectories in $\cD[0,1]$, possibly defined on different probability spaces, and suppose that for each $1 \leq i \leq n-1$ there exist copies $X_i'$ and $X_{i+1}'$ of $X_i$ and $X_{i+1}$ on a common probability space satisfying $\P\big(\|X_{i+1}' - X_i'\| > t\big) \leq r_i$ for each $1 \leq i \leq n-1$, where $\|\cdot\|$ is a norm on $\cD[0,1]$. Then there exist copies of $X_1, \ldots, X_n$ denoted $X_1'', \ldots, X_n''$ satisfying $\P\big(\|X_{i+1}'' - X_i''\| > t\big) \leq r_i$ for each $1 \leq i \leq n-1$. That is, all of the inequalities can be satisfied simultaneously on the same probability space.
\end{enumerate}
\end{remark}

\section{Proofs}
\label{sec:kernel_app_proofs}

We present full proofs of all the results stated in Chapter~\ref{ch:kernel} and Appendix~\ref{app:kernel}.

\subsection{Preliminary lemmas}

In this section we list some results in probability and U-statistic theory which are used in the proofs of our main results. Other auxiliary lemmas will be introduced when they are needed.

\begin{lemma}[Bernstein's inequality for independent random variables]
\label{lem:kernel_app_bernstein}
Let $X_1, \ldots, X_n$ be independent real-valued random variables with $\E[X_i] = 0$, $|X_i| \leq M$, and $\E[X_i^2] \leq \sigma^2$, where $M$ and $\sigma$ are non-random. Then for all $t>0$,
%
\begin{align*}
\P \left( \left| \frac{1}{n} \sum_{i=1}^n X_i \right| \geq t \right) \leq 2 \exp \left( - \frac{t^2 n} {2 \sigma^2 + \frac{2}{3} M t} \right).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_bernstein}]
See for example Lemma~2.2.9 in~\citet{van1996weak}.
\end{proof}

\begin{lemma}[The matrix Bernstein inequality]
\label{lem:kernel_app_matrix_bernstein}
For $1 \leq i \leq n$ let $X_i$ be independent symmetric $d \times d$ real random matrices with expected values $\mu_i = \E[X_i]$. Suppose that $\|X_i - \mu_i\|_2 \leq M$ almost surely for all $1 \leq i \leq n$ where $M$ is non-random, and define $\sigma^2 = \big\| \sum_i \E[(X_i - \mu_i)^2] \big\|_2$. Then there exists a universal constant $C > 0$ such that for any $t > 0$ and $q \geq 1$,
%
\begin{align*}
\P\left( \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2 \geq 2 \sigma \sqrt{t} + \frac{4}{3} M t \right) &\leq 2 d e^{-t}, \\
\E\left[ \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2^q \right]^{1/q} &\leq C \sigma \sqrt{q + \log 2d} + C M (q + \log 2d).
\end{align*}
%
Another simplified version of this is as follows: suppose that $\|X_i\|_2 \leq M$ almost surely, so that $\|X_i - \mu_i\|_2 \leq 2M$. Then since $\sigma^2 \leq 4 n M^2$, we have
%
\begin{align*}
\P\left( \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2 \geq 4M \big(t + \sqrt{n t}\big) \right) &\leq 2 d e^{-t}, \\
\E\left[ \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2^q \right]^{1/q} &\leq C M \big(q + \log 2d + \sqrt{n(q + \log 2d)}\big).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_matrix_bernstein}]
See Lemma~3.2 in \citet{minsker2019moment}.
\end{proof}
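As a simple illustration of the simplified moment bound (a heuristic aside, not itself used in the proofs), taking $q = 1$ and dividing by $n$ shows that for matrices satisfying $\|X_i\|_2 \leq M$ almost surely,
%
\begin{align*}
\E\left[ \bigg\| \frac{1}{n} \sum_{i=1}^n \big( X_i - \mu_i \big) \bigg\|_2 \right] &\lesssim M \sqrt{\frac{\log 2d}{n}} \qquad \text{whenever } n \gtrsim \log 2d,
\end{align*}
%
recovering the usual root-$n$ concentration rate for averages of bounded random matrices, with the dimension entering only logarithmically.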
\begin{lemma}[A maximal inequality for Gaussian vectors]
\label{lem:kernel_app_gaussian_vector_maximal}
Take $n \geq 2$. Let $X_i \sim \cN(0, \sigma_i^2)$ for $1 \leq i \leq n$ with $\sigma_i^2 \leq \sigma^2$. Then
%
\begin{align}
\label{eq:kernel_app_gaussian_vector_maximal}
\E\left[ \max_{1 \leq i \leq n} X_i \right] &\leq \sigma \sqrt{2 \log n}, \\
\label{eq:kernel_app_gaussian_vector_maximal_abs}
\E\left[ \max_{1 \leq i \leq n} |X_i| \right] &\leq 2 \sigma \sqrt{\log n}.
\end{align}
%
If $\Sigma_1$ and $\Sigma_2$ are constant positive semi-definite $n \times n$ matrices and $N \sim \cN(0,I_n)$, then
%
\begin{align}
\label{eq:kernel_app_gaussian_difference_psd}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq 2 \sqrt{\log n} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2^{1/2}.
\end{align}
%
If further $\Sigma_1$ is positive definite, then
%
\begin{align}
\label{eq:kernel_app_gaussian_difference_pd}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq \sqrt{\log n} \, \lambda_{\min}(\Sigma_1)^{-1/2} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2.
\end{align}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_vector_maximal}]
For $t > 0$, Jensen's inequality on the concave logarithm function gives
%
\begin{align*}
\E\left[ \max_{1 \leq i \leq n} X_i \right] &= \frac{1}{t} \E\left[ \log \exp \max_{1 \leq i \leq n} t X_i \right] \leq \frac{1}{t} \log \E\left[ \exp \max_{1 \leq i \leq n} t X_i \right] \leq \frac{1}{t} \log \sum_{i=1}^n \E\left[ \exp t X_i \right] \\
&= \frac{1}{t} \log \sum_{i=1}^n \exp \left( \frac{t^2 \sigma_i^2}{2} \right) \leq \frac{1}{t} \log n + \frac{t \sigma^2}{2},
\end{align*}
%
by the Gaussian moment generating function. Minimizing with $t = \sqrt{2 \log n} / \sigma$ yields \eqref{eq:kernel_app_gaussian_vector_maximal}. For \eqref{eq:kernel_app_gaussian_vector_maximal_abs}, we use the symmetry of the Gaussian distribution:
%
\begin{align*}
\E\left[ \max_{1 \leq i \leq n} |X_i| \right] &= \E\left[ \max_{1 \leq i \leq n} \{X_i, -X_i\} \right] \leq \sigma \sqrt{2 \log 2n} \leq 2 \sigma \sqrt{\log n}.
\end{align*}
%
For \eqref{eq:kernel_app_gaussian_difference_psd} and \eqref{eq:kernel_app_gaussian_difference_pd}, note that $\Sigma_1^{1/2} N - \Sigma_2^{1/2} N$ is Gaussian with covariance matrix $\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. The variances of its components are the diagonal elements of this matrix, namely
%
\begin{align*}
\sigma_i^2 &= \Var\big[ \big(\Sigma_1^{1/2} N - \Sigma_2^{1/2} N\big)_i \big] = \Big(\big( \Sigma_1^{1/2} - \Sigma_2^{1/2} \big)^2\Big)_{ii}.
\end{align*}
%
Note that if $e_i$ is the $i$th standard unit basis vector, then for any real symmetric matrix $A$, we have $e_i^\T A^2 e_i = (A^2)_{ii}$, so in particular $(A^2)_{ii} \leq \|A\|_2^2$. Therefore
%
\begin{align*}
\sigma_i^2 &\leq \big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2^2 =\vcentcolon \sigma^2.
\end{align*}
%
Applying \eqref{eq:kernel_app_gaussian_vector_maximal_abs} then gives
%
\begin{align*}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq 2 \sqrt{\log n} \, \big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2.
\end{align*}
%
By Theorem~X.1.1 in \citet{bhatia1997matrix}, we can deduce
%
\begin{align*}
\big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2 &\leq \big\| \Sigma_1 - \Sigma_2 \big\|_2^{1/2},
\end{align*}
%
giving \eqref{eq:kernel_app_gaussian_difference_psd}. If $\Sigma_1$ is positive definite, Theorem~X.3.8 in \citet{bhatia1997matrix} gives \eqref{eq:kernel_app_gaussian_difference_pd}:
%
\begin{align*}
\big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2 &\leq \frac{1}{2} \lambda_{\min}(\Sigma_1)^{-1/2} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2.
\end{align*} % \end{proof} \begin{lemma}[Maximal inequalities for Gaussian processes] \label{lem:kernel_app_gaussian_process_maximal} Let $Z$ be a separable mean-zero Gaussian process indexed by $x \in \cX$. Recall that $Z$ is separable for example if $\cX$ is Polish and $Z$ has continuous trajectories. Define its covariance structure on $\cX \times \cX$ by $\Sigma(x, x') = \E[Z(x) Z(x')]$, and the corresponding semimetric on $\cX$ by % \begin{align*} \rho(x,x') &= \E\big[\big(Z(x) - Z(x')\big)^2\big]^{1/2} = \big(\Sigma(x,x) - 2 \Sigma(x,x') + \Sigma(x',x')\big)^{1/2}. \end{align*} % Let $N(\varepsilon, \cX, \rho)$ denote the $\varepsilon$-covering number of $\cX$ with respect to the semimetric $\rho$. Define $\sigma = \sup_x \Sigma(x,x)^{1/2}$. Then there exists a universal constant $C > 0$ such that for any $\delta > 0$, % \begin{align*} \E\left[ \sup_{x \in \cX} |Z(x)| \right] &\leq C \sigma + C \int_0^{2\sigma} \sqrt{\log N(\varepsilon, \cX, \rho)} \diff{\varepsilon}, \\ \E\left[ \sup_{\rho(x,x') \leq \delta} |Z(x) - Z(x')| \right] &\leq C \int_0^{\delta} \sqrt{\log N(\varepsilon, \cX, \rho)} \diff{\varepsilon}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_process_maximal}] See Corollary~2.2.8 in \citet{van1996weak}, noting that for any $x,x' \in \cX$, we have $\E[|Z(x)|] \lesssim \sigma$ and $\rho(x,x') \leq 2\sigma$, implying that $\log N(\varepsilon, \cX, \rho) = 0$ for all $\varepsilon > 2 \sigma$. \end{proof} \begin{lemma}[Anti-concentration for Gaussian process absolute suprema] \label{lem:kernel_app_anticoncentration} Let $Z$ be a separable mean-zero Gaussian process indexed by a semimetric space $\cX$ with $\E[Z(x)^2] = 1$ for all $x \in \cX$. Then for any $\varepsilon > 0$, % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{x \in \cX} \big| Z(x) \big| - t \right| \leq \varepsilon \right) &\leq 4 \varepsilon \left( 1 + \E\left[ \sup_{x \in \cX} \big| Z(x) \big| \right] \right). \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_anticoncentration}] See Corollary~2.1 in \citet{chernozhukov2014anti}. \end{proof} \begin{lemma}[No slowest rate of convergence in probability] \label{lem:kernel_app_slow_convergence} Let $X_n$ be a sequence of real-valued random variables with $X_n = o_\P(1)$. Then there exists a deterministic sequence $\varepsilon_n \to 0$ such that $\P\big(|X_n| > \varepsilon_n\big) \leq \varepsilon_n$ for all $n \geq 1$. \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_slow_convergence}] Define the following deterministic sequence for $k \geq 1$. % \begin{align*} \tau_k &= \sup \big\{ n \geq 1: \P\big(|X_n| > 1/k\big) > 1/k \big\} \vee (\tau_{k-1} +1) \end{align*} % with $\tau_0 = 0$. Since $X_n = o_\P(1)$, each $\tau_k$ is finite and so we can define $\varepsilon_n = \frac{1}{k}$ where $\tau_k < n \leq \tau_{k+1}$. Then, noting that $\varepsilon_n \to 0$, we have $\P\big(|X_n| > \varepsilon_n\big) = \P\big(|X_n| > 1/k\big) \leq 1/k = \varepsilon_n$. \end{proof} \begin{lemma}[General second-order Hoeffding-type decomposition] \label{lem:kernel_app_general_hoeffding} Let $\cU$ be a vector space. Let $u_{i j} \in \cU$ be defined for $1 \leq i, j \leq n$ and $i \neq j$. Suppose that $u_{i j} = u_{j i}$ for all $i,j$. 
Then for any $u_i \in \cU$ (for $1 \leq i \leq n$) and any $u \in \cU$, the following decomposition holds:
%
\begin{align*}
\sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u\big) &= 2(n-1) \sum_{i=1}^n \big(u_i - u\big) + \sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u_i - u_j + u\big).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_general_hoeffding}]
We compute the left-hand side minus the right-hand side, beginning by observing that all of the $u_{i j}$ and $u$ terms clearly cancel.
%
\begin{align*}
&\sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u\big) - 2(n-1) \sum_{i=1}^n \big(u_i - u\big) - \sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u_i - u_j + u\big) \\
&\qquad= - 2(n-1) \sum_{i=1}^n u_i - \sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(- u_i - u_j\big) = - 2(n-1) \sum_{i=1}^n u_i + \sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n u_i + \sum_{j=1}^n \sum_{\substack{i=1 \\ i \neq j}}^n u_j \\
&\qquad= - 2(n-1) \sum_{i=1}^n u_i + (n-1) \sum_{i=1}^n u_i + (n-1) \sum_{j=1}^n u_j = 0.
\end{align*}
\end{proof}

\begin{lemma}[A U-statistic concentration inequality]
\label{lem:kernel_app_ustat_concentration}
Let $(S,\cS)$ be a measurable space and $X_1, \ldots, X_n$ be i.i.d.\ $S$-valued random variables. Let $H: S^m \to \R$ be a function of $m$ variables satisfying the symmetry property $H(x_1, \ldots, x_m) = H(x_{\tau (1)}, \ldots, x_{\tau (m)})$ for any permutation $\tau$ of $\{1, \ldots, m\}$. Suppose also that $\E[H(X_1, \ldots, X_m)] = 0$. Let $M = \|H\|_\infty$ and $\sigma^2 = \E\big[\E[H(X_1, \ldots, X_m) \mid X_1]^2\big]$. Define the U-statistic
%
\begin{align*}
U_n &= \frac{m!(n-m)!}{n!} \sum_{1 \leq i_1 < \cdots < i_m \leq n} H(X_{i_1}, \ldots, X_{i_m}).
\end{align*}
%
Then for any $t > 0$, with $C_1(m)$, $C_2(m)$ positive constants depending only on $m$,
%
\begin{align*}
\P\left( |U_n| > t \right) &\leq 4 \exp \left( - \frac{n t^2}{C_1(m) \sigma^2 + C_2(m) M t} \right).
\end{align*}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_concentration}]
See Theorem~2 in \citet{arcones1995bernstein}.
\end{proof}

\begin{lemma}[A second-order U-process maximal inequality]
\label{lem:kernel_app_uprocess_maximal}
Let $X_1, \ldots, X_n$ be i.i.d.\ random variables taking values in a measurable space $(S, \cS)$ with distribution $\P$. Let $\cF$ be a class of measurable functions from $S \times S$ to $\R$ which is also pointwise measurable. Define the degenerate second-order U-process
%
\begin{align*}
U_n(f) &= \frac{2}{n(n-1)} \sum_{i<j} f(X_i, X_j),
\end{align*}
%
where each $f \in \cF$ is assumed degenerate in the sense that $\E[f(X_1, x_2)] = 0$ for all $x_2 \in S$. Let $F$ be a measurable envelope for $\cF$, and suppose that $\cF$ is a VC-type class with respect to $F$, in that $\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) \leq (C_1/\varepsilon)^{C_2}$ for all $\varepsilon \in (0,1]$, with constants $C_1 \geq e$ and $C_2 \geq 1$. Let $\sigma > 0$ be any deterministic value satisfying $\sup_{f \in \cF} \|f\|_{\P,2} \leq \sigma \leq \|F\|_{\P,2}$, and define the random variable $M = \max_{i,j} |F(X_i, X_j)|$. Then there exists a universal constant $C_3 > 0$ satisfying
%
\begin{align*}
n \E\left[ \sup_{f \in \cF} \big| U_n(f) \big| \right] &\leq C_3 \sigma \Big( C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) \Big) + \frac{C_3 \|M\|_{\P,2}}{\sqrt{n}} \Big( C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) \Big)^2.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_uprocess_maximal}]
Apply Corollary~5.3 from \citet{chen2020jackknife} with the order of the U-statistic fixed at $r=2$, and with $k=2$.
\end{proof}
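As a sanity check on Lemma~\ref{lem:kernel_app_uprocess_maximal}, taking the crude choice $\sigma = \|F\|_{\P,2}$ gives
%
\begin{align*}
\E\left[ \sup_{f \in \cF} \big| U_n(f) \big| \right] &\lesssim \frac{\|F\|_{\P,2} \, C_2 \log C_1}{n} + \frac{\|M\|_{\P,2} \, (C_2 \log C_1)^2}{n^{3/2}},
\end{align*}
%
so degenerate second-order U-processes over VC-type classes concentrate at the rate $1/n$, in contrast with the $1/\sqrt n$ rate for the empirical processes of Lemma~\ref{lem:kernel_app_maximal_vc_inid}; smaller admissible values of $\sigma$ sharpen the bound through the logarithmic terms.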
\begin{lemma}[A U-statistic matrix concentration inequality]
\label{lem:kernel_app_ustat_matrix_concentration}
Let $X_1, \ldots, X_n$ be i.i.d.\ random variables taking values in a measurable space $(S, \cS)$. Suppose $H: S^2 \to \R^{d \times d}$ is a measurable matrix-valued function of two variables satisfying the following:
%
\begin{enumerate}[label=(\roman*)]
\item $H(X_1, X_2)$ is an almost surely symmetric matrix.
\item $\|H(X_1, X_2)\|_2 \leq M$ almost surely.
\item $H$ is a symmetric function in its arguments in that $H(X_1, X_2) = H(X_2, X_1)$.
\item $H$ is degenerate in the sense that $\E[H(X_1, x_2)] = 0$ for all $x_2 \in S$.
\end{enumerate}
%
Let $U_n = \sum_i \sum_{j \neq i} H(X_i, X_j)$ be a U-statistic, and define the variance-type constant
%
\begin{align*}
\sigma^2 &= \E\left[ \left\| \E\left[ H(X_i, X_j)^2 \mid X_j \right] \right\|_2 \right].
\end{align*}
%
Then for a universal constant $C > 0$ and for all $t > 0$,
%
\begin{align*}
\P\left( \|U_n\|_2 \geq C \sigma n (t + \log d) + C M \sqrt{n} (t + \log d)^{3/2} \right) &\leq C e^{-t}.
\end{align*}
%
By Jensen's inequality, $\sigma^2 \leq \E[ \| H(X_i, X_j)^2 \|_2 ] = \E[ \| H(X_i, X_j) \|_2^2 ] \leq M^2$, giving the simpler
%
\begin{align*}
\P\left( \|U_n\|_2 \geq 2 C M n (t + \log d)^{3/2} \right) &\leq C e^{-t}.
\end{align*}
%
From this last inequality we deduce a moment bound by integration of tail probabilities:
%
\begin{align*}
\E\left[ \|U_n\|_2 \right] &\lesssim M n (\log d)^{3/2}.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}]
We apply results from \citet{minsker2019moment}.

\proofparagraph{decoupling}

Let $\bar U_n = \sum_{i=1}^n \sum_{j=1}^n H(X_i^{(1)}, X_j^{(2)})$ be a decoupled matrix U-statistic, where $X^{(1)}$ and $X^{(2)}$ are i.i.d.\ copies of the sequence $X_1, \ldots, X_n$. By Lemma~5.2 in \citet{minsker2019moment}, since we are only stating this result for degenerate U-statistics of order 2, there exists a universal constant $D_2$ such that for any $t > 0$, we have
%
\begin{align*}
\P\left( \|U_n\|_2 \geq t \right) &\leq D_2 \P\left( \|\bar U_n\|_2 \geq t / D_2 \right).
\end{align*}

\proofparagraph{concentration of the decoupled U-statistic}

By Equation~11 in \citet{minsker2019moment}, we have the following concentration inequality for decoupled degenerate U-statistics. For some universal constant $C_1$ and for any $t > 0$,
%
\begin{align*}
\P\left( \|\bar U_n\|_2 \geq C_1 \sigma n (t + \log d) + C_1 M \sqrt{n} (t + \log d)^{3/2} \right) &\leq e^{-t}.
\end{align*}

\proofparagraph{concentration of the original U-statistic}

Hence we have
%
\begin{align*}
&\P\left( \|U_n\|_2 \geq C_1 D_2 \sigma n (t + \log d) + C_1 D_2 M \sqrt{n} (t + \log d)^{3/2} \right) \\
&\quad\leq D_2 \P\left( \|\bar U_n\|_2 \geq C_1 \sigma n (t + \log d) + C_1 M \sqrt{n} (t + \log d)^{3/2} \right) \leq D_2 e^{-t}.
\end{align*}
%
The main result follows by setting $C = C_1 + C_1 D_2$.

\proofparagraph{moment bound}

We now obtain a moment bound for the simplified version. We already have that
%
\begin{align*}
\P\left( \|U_n\|_2 \geq 2 C M n (t + \log d)^{3/2} \right) &\leq C e^{-t}.
\end{align*}
%
This implies that for any $t \geq \log d$, we have
%
\begin{align*}
\P\left( \|U_n\|_2 \geq 8 C M n t^{3/2} \right) &\leq C e^{-t}.
\end{align*}
%
Defining $s = 8 C M n t^{3/2}$ so $t = \left( \frac{s}{8C M n} \right)^{2/3}$ shows that for any $s \geq 8C M n(\log d)^{3/2}$,
%
\begin{align*}
\P\left( \|U_n\|_2 \geq s \right) &\leq C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}.
\end{align*}
%
Hence the moment bound is obtained:
%
\begin{align*}
\E\left[ \|U_n\|_2 \right] &= \int_0^\infty \P\left( \|U_n\|_2 \geq s \right) \diff{s} \\
&= \int_0^{8C M n(\log d)^{3/2}} \P\left( \|U_n\|_2 \geq s \right) \diff{s} + \int_{8C M n(\log d)^{3/2}}^\infty \P\left( \|U_n\|_2 \geq s \right) \diff{s} \\
&\leq 8C M n(\log d)^{3/2} + \int_0^\infty C e^{-\left( \frac{s}{8C M n} \right)^{2/3}} \diff{s} \\
&= 8C M n(\log d)^{3/2} + 8C^2 M n \int_0^\infty e^{-u^{2/3}} \diff{u} \lesssim Mn(\log d)^{3/2},
\end{align*}
%
where we substituted $u = s / (8 C M n)$ in the final integral.
\end{proof}

\subsection{Technical lemmas}

Before presenting the proof of Lemma~\ref{lem:kernel_app_maximal_entropy}, we give some auxiliary lemmas; namely a symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), a Rademacher contraction principle (Lemma~\ref{lem:kernel_app_contraction}), and a Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}). Recall that the Rademacher distribution places probability mass of $1/2$ on each of the points $-1$ and $1$.

\begin{lemma}[A symmetrization inequality for i.n.i.d.\ variables]
\label{lem:kernel_app_symmetrization}
Let $(S, \cS)$ be a measurable space and $\cF$ a class of Borel-measurable functions from $S$ to $\R$ which is pointwise measurable (i.e.\ it contains a countable dense subset under pointwise convergence). Let $X_1, \ldots, X_n$ be independent but not necessarily identically distributed $S$-valued random variables. Let $a_1, \ldots, a_n$ be arbitrary points in $S$ and $\phi$ a non-negative non-decreasing convex function from $\R$ to $\R$. Define $\varepsilon_1, \ldots, \varepsilon_n$ as independent Rademacher random variables, independent of $X_1, \ldots, X_n$. Then
%
\begin{align*}
\E \left[ \phi \left( \sup_{f \in \cF} \left| \sum_{i=1}^n \Big( f(X_i) - \E[f(X_i)] \Big) \right| \right) \right] &\leq \E \left[ \phi \left( 2 \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i \Big( f(X_i) - a_i \Big) \right| \right) \right].
\end{align*}
%
Note that in particular this holds with $a_i = 0$ and also holds with $\phi(t) = t \vee 0$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_symmetrization}]
See Lemma~2.3.6 in \citet{van1996weak}.
%
\end{proof}

\begin{lemma}[A Rademacher contraction principle]
\label{lem:kernel_app_contraction}
Let $\varepsilon_1, \ldots, \varepsilon_n$ be independent Rademacher random variables and $\cT$ be a bounded subset of $\R^n$. Define $M = \sup_{t \in \cT} \max_{1 \leq i \leq n} |t_i|$. Then, noting that the supremum is measurable because $\cT$ is a subset of a separable metric space and is therefore itself separable,
%
\begin{align*}
\E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] &\leq 4M \, \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right].
\end{align*}
%
This gives the following corollary. Let $X_1, \ldots, X_n$ be mutually independent and also independent of $\varepsilon_1, \ldots, \varepsilon_n$. Let $\cF$ be a pointwise measurable class of functions from a measurable space $(S, \cS)$ to $\R$, with measurable envelope $F$. Define $M = \max_i F(X_i)$. Then we obtain
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] &\leq 4 \E \left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right].
\end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_contraction}] Apply Theorem~4.12 from \citet{ledoux1991probability} with $F$ the identity function and % \begin{align*} \psi_i(s) = \psi(s) &= \min \left( \frac{s^2}{2M}, \frac{M}{2} \right). \end{align*} % This is a weak contraction (i.e.\ 1-Lipschitz) because it is continuous, differentiable on $(-M,M)$ with derivative bounded by $|\psi'(s)| \leq |s|/M \leq 1$, and constant outside $(-M,M)$. Note that since $|t_i| \leq M$ by definition, we have $\psi_i(t_i) = t_i^2 / (2M)$. Hence by Theorem~4.12 from \citet{ledoux1991probability}, % \begin{align*} \E \left[ F \left( \frac{1}{2} \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i \psi_i(t_i) \right| \right) \right] &\leq \E \left[ F \left( \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right) \right], \\ \E \left[ \frac{1}{2} \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i \frac{t_i^2}{2M} \right| \right] &\leq \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right], \\ \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] &\leq 4M \, \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right]. \end{align*} % For the corollary, set $\cT = \left\{\big(f(X_1), \ldots, f(X_n)\big) : f \in \cF\right\}$. For a fixed realization $X_1, \ldots, X_n$, % \begin{align*} \E_\varepsilon \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] &= \E_\varepsilon \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] \\ &\leq 4 \E_\varepsilon \left[ M \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right] = 4 \E_\varepsilon \left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right]. \end{align*} % Taking an expectation over $X_1, \ldots, X_n$ and applying Fubini's theorem yields the result. \end{proof} \begin{lemma}[A Hoffmann--J{\o}rgensen inequality] \label{lem:kernel_app_hoffmann} Let $(S, \cS)$ be a measurable space and $X_1, \ldots, X_n$ be $S$-valued random variables. Suppose that $\cF$ is a pointwise measurable class of functions from $S$ to $\R$ with finite envelope $F$. Let $\varepsilon_1, \ldots, \varepsilon_n$ be independent Rademacher variables independent of $X_1, \ldots, X_n$. For $q \in (1, \infty)$, % \begin{align*} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^q \right]^{1/q} \right), \end{align*} % where $C_q$ is a positive constant depending only on $q$. \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_hoffmann}] We use Talagrand's formulation of a Hoffmann--J{\o}rgensen inequality. Consider the independent $\ell^\infty(\cF)$-valued random functionals $u_i$ defined by $u_i(f) = \varepsilon_i f(X_i)$, where $\ell^\infty(\cF)$ is the Banach space of bounded functions from $\cF$ to $\R$, equipped with the norm $\|u\|_\cF = \sup_{f \in \cF} |u(f)|$. 
Then Remark~3.4 in \citet{kwapien1991hypercontraction} gives % \begin{align*} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n u_i(f) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n u_i(f) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \left| u_i(f) \right|^q \right]^{1/q} \right) \\ \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^q \right]^{1/q} \right). \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_maximal_entropy}] We follow the proof of Theorem~5.2 from \citet{chernozhukov2014gaussian}, using our i.n.i.d.\ versions of the symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), Rademacher contraction principle (Lemma~\ref{lem:kernel_app_contraction}), and Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}). Without loss of generality, we may assume that $J(1, \cF, F) < \infty$ as otherwise there is nothing to prove, and that $F > 0$ everywhere on $S$. Let $\P_n = n^{-1} \sum_i \delta_{X_i}$ be the empirical distribution of $X_i$, and define the empirical variance bound $\sigma_n^2 = \sup_\cF n^{-1} \sum_i f(X_i)^2$. By the i.n.i.d.\ symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), % \begin{align*} \E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &= \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \Big( f(X_i) - \E[f(X_i)] \Big) \right| \right] \leq \frac{2}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right], \end{align*} % where $\varepsilon_1, \ldots, \varepsilon_n$ are independent Rademacher random variables, independent of $X_1, \ldots, X_n$. Then the standard entropy integral inequality from the proof of Theorem~5.2 in the supplemental materials for \citet{chernozhukov2014gaussian} gives for a universal constant $C_1 > 0$, % \begin{align*} \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \Bigm\vert X_1, \ldots, X_n \right] &\leq C_1 \|F\|_{\P_n,2} \, J(\sigma_n / \|F\|_{\P_n,2}, \cF, F). \end{align*} % Taking marginal expectations and applying Jensen's inequality along with a convexity result for the covering integral, as in Lemma~A.2 in \citet{chernozhukov2014gaussian}, gives % \begin{align*} Z &\vcentcolon= \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] \leq C_1 \|F\|_{\bar\P,2} \, J(\E[\sigma_n^2]^{1/2} / \|F\|_{\bar\P,2}, \cF, F). 
\end{align*}
%
Now use symmetrization (Lemma~\ref{lem:kernel_app_symmetrization}), the contraction principle (Lemma~\ref{lem:kernel_app_contraction}), the Cauchy--Schwarz inequality, and the Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}) to deduce that
%
\begin{align*}
\E[\sigma_n^2] &= \E\left[ \sup_{f \in \cF} \frac{1}{n} \sum_{i=1}^n f(X_i)^2 \right] \leq \sup_{f \in \cF} \E_{\bar\P} \left[ f(X_i)^2 \right] + \frac{1}{n} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n f(X_i)^2 - \E \left[ f(X_i)^2 \right] \right| \right] \\
&\leq \sigma^2 + \frac{2}{n} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] \leq \sigma^2 + \frac{8}{n} \E\left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] \\
&\leq \sigma^2 + \frac{8}{n} \E\left[ M^2 \right]^{1/2} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right|^2 \right]^{1/2} \\
&\leq \sigma^2 + \frac{8}{n} \|M\|_{\P,2} \, C_2 \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^2 \right]^{1/2} \right) \\
&\leq \sigma^2 + \frac{8C_2}{n} \|M\|_{\P,2} \, \left( \sqrt{n} Z + \|M\|_{\P,2} \right) \lesssim \sigma^2 + \frac{\|M\|_{\P,2} Z}{\sqrt n} + \frac{\|M\|_{\P,2}^2}{n},
\end{align*}
%
where $\lesssim$ indicates a bound up to a universal constant. Hence taking a square root we see that, following the notation from the proof of Theorem~5.2 in the supplemental materials to \citet{chernozhukov2014gaussian},
%
\begin{align*}
\sqrt{\E[\sigma_n^2]} &\lesssim \sigma + \|M\|_{\P,2}^{1/2} Z^{1/2} n^{-1/4} + \|M\|_{\P,2} n^{-1/2} \lesssim \|F\|_{\bar\P,2} \left( \Delta \vee \sqrt{DZ} \right),
\end{align*}
%
where $\Delta^2 = \|F\|_{\bar\P,2}^{-2} \big(\sigma^2 \vee (\|M\|_{\P,2}^2 / n) \big) \geq \delta^2$ and $D = \|M\|_{\P,2} n^{-1/2} \|F\|_{\bar\P,2}^{-2}$. Thus returning to our bound on $Z$, we now have
%
\begin{align*}
Z &\lesssim \|F\|_{\bar\P,2} \, J(\Delta \vee \sqrt{DZ}, \cF, F).
\end{align*}
%
The final steps proceed as in the proof of Theorem~5.2 from \citet{chernozhukov2014gaussian}, considering cases separately for $\Delta \geq \sqrt{DZ}$ and $\Delta < \sqrt{DZ}$, and applying convexity properties of the entropy integral $J$.
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_vc_inid}]
We assume the VC-type condition
%
$\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) \leq (C_1/\varepsilon)^{C_2}$
%
for all $\varepsilon \in (0,1]$, with constants $C_1 \geq e$ and $C_2 \geq 1$. Hence for $\delta \in (0,1]$, the entropy integral can be bounded as
%
\begin{align*}
J\big(\delta, \cF, F\big) &= \int_0^\delta \sqrt{1 + \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} \diff{\varepsilon} \leq \int_0^\delta \sqrt{1 + C_2 \log (C_1/\varepsilon)} \diff{\varepsilon} \\
&\leq \int_0^\delta \left( 1 + \sqrt{C_2 \log (C_1/\varepsilon)} \right) \diff{\varepsilon} = \delta + \sqrt{C_2} \int_0^\delta \sqrt{\log (C_1/\varepsilon)} \diff{\varepsilon} \\
&\leq \delta + \sqrt{\frac{C_2}{\log (C_1/\delta)}} \int_0^\delta \log (C_1/\varepsilon) \diff{\varepsilon} = \delta + \sqrt{\frac{C_2}{\log (C_1/\delta)}} \big( \delta + \delta \log (C_1/\delta) \big) \\
&\leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}.
\end{align*}
%
The remaining bounds now follow by Lemma~\ref{lem:kernel_app_maximal_entropy}.
\end{proof}
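To illustrate how Lemma~\ref{lem:kernel_app_maximal_vc_inid} is typically applied, consider as a heuristic sketch a kernel-type class $\cF = \{ k_h(\cdot, w) : w \in \cW \}$ with constant envelope $F \asymp 1/h$, for which $\sigma^2 \asymp 1/h$, $\|F\|_{\bar\P,2} \asymp \|M\|_{\P,2} \asymp 1/h$, and $\log\big(C_1 \|F\|_{\bar\P,2}/\sigma\big) \lesssim \log n$ for polynomial bandwidths. The lemma then yields
%
\begin{align*}
\E \left[ \sup_{w \in \cW} \big| G_n\big(k_h(\cdot, w)\big) \big| \right] &\lesssim \sqrt{\frac{\log n}{h}} + \frac{\log n}{\sqrt{n}\, h},
\end{align*}
%
which upon dividing by $\sqrt n$ gives the familiar uniform rate $\sqrt{\log n / (n h)}$ for kernel density estimation, plus a higher-order term.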
Before proving Lemma~\ref{lem:kernel_app_kmt_corollary}, we give a bounded-variation characterization (Lemma~\ref{lem:kernel_app_bv_characterization}).

\begin{lemma}[A characterization of bounded-variation functions]
\label{lem:kernel_app_bv_characterization}
Let $\cV_1$ be the class of real-valued functions on $[0,1]$ which are 0 at 1 and have total variation bounded by 1. Also define the class of half-interval indicator functions $\cI = \{\I[0,t]: t \in [0,1]\}$. For any topological vector space $\cX$, define the symmetric convex hull of a subset $\cY \subseteq \cX$ as
%
\begin{align*}
\symconv \cY &= \left\{ \sum_{i=1}^n \lambda_i y_i : \sum_{i=1}^n \lambda_i = 1, \ \lambda_i \geq 0, \ y_i \in \cY \cup -\cY, \ n \in \N \right\}.
\end{align*}
%
Denote its closure by $\overline\symconv \ \cY$. Under the pointwise convergence topology, $\cV_1 \subseteq \overline\symconv \ \cI$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_bv_characterization}]
Firstly, let $\cD \subseteq \cV_1$ be the class of real-valued functions on $[0,1]$ which are 0 at 1, have total variation exactly 1, and are weakly monotone decreasing. Then for $g \in \cD$ we have $\|g\|_\TV = g(0) = 1$. Let $S = \{s_1, s_2, \dots\} \subseteq [0,1]$ be the (at most countable) set of discontinuity points of $g$. We want to find a sequence of convex combinations of elements of $\cI$ which converges pointwise to $g$. To do this, first define the sequence of meshes
%
\begin{align*}
A_n = \{s_k : 1 \leq k \leq n\} \cup \{k/n : 0 \leq k \leq n\},
\end{align*}
%
which satisfies $\bigcup_n A_n = S \cup ([0,1] \cap \Q)$. Endow $A_n$ with the ordering induced by the canonical order on $\R$, giving $A_n = \{a_1, \ldots, a_{|A_n|}\}$ with $a_1 < \cdots < a_{|A_n|}$, and define the sequence of functions
%
\begin{align*}
g_n(x) = \sum_{k = 1}^{|A_n|-1} \I[0,a_k] \big( g(a_k) - g(a_{k+1}) \big),
\end{align*}
%
where clearly $\I[0, a_k] \in \cI$, $g(a_k) - g(a_{k+1}) \geq 0$, and $\sum_{k = 1}^{|A_n|-1} \big( g(a_k) - g(a_{k+1}) \big) = g(0) - g(1) = 1$. Therefore $g_n$ is a convex combination of elements of $\cI$. Further, note that for $a_k \in A_n$,
%
\begin{align*}
g_n(a_k) = \sum_{j = k}^{|A_n|-1} \big( g(a_j) - g(a_{j+1}) \big) = g(a_k) - g(a_{|A_n|}) = g(a_k) - g(1) = g(a_k).
\end{align*}
%
Hence if $x \in S$, then eventually $x \in A_n$ so $g_n(x) \to g(x)$. Alternatively, if $x \not\in S$, then $g$ is continuous at $x$. But $g_n \to g$ on the dense set $\bigcup_n A_n$, and each $g_n$ is weakly decreasing, so also $g_n(x) \to g(x)$. Hence $g_n \to g$ pointwise on $[0,1]$.

Now take $f \in \cV_1$. By the Jordan decomposition for total variation functions \citep{royden1988real}, we can write $f = f^+ - f^-$, with $f^+$ and $f^-$ weakly decreasing, $f^+(1) = f^-(1) = 0$, and $\|f^+\|_\TV + \|f^-\|_\TV = \|f\|_\TV$. Supposing that both $\|f^+\|_\TV$ and $\|f^-\|_\TV$ are strictly positive, let $g_n^+$ approximate the unit-variation function $f^+/\|f^+\|_\TV$ and $g_n^-$ approximate $f^-/\|f^-\|_\TV$ as above. Then since trivially
%
\begin{align*}
f = \|f^+\|_\TV f^+ / \|f^+\|_\TV - \|f^-\|_\TV f^- / \|f^-\|_\TV + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV\big) \cdot 0,
\end{align*}
%
we have that the convex combination
%
\begin{align*}
g_n^+ \|f^+\|_\TV - g_n^- \|f^-\|_\TV + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV\big) \cdot 0
\end{align*}
%
converges pointwise to $f$. This also holds if either of the total variations $\|f^\pm\|_\TV$ are zero, since then the corresponding sequence $g_n^\pm$ need not be defined.
Now note that each of $g_n^+$, $-g_n^-$, and $0$ is in $\symconv \cI$, so $f \in \overline\symconv \ \cI$ under pointwise convergence.
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_kmt_corollary}]
We follow the Gaussian approximation method given in Section~2 of \citet{gine2004kernel}. The KMT approximation theorem \citep{komlos1975approximation} asserts the existence of a probability space carrying $n$ i.i.d.\ uniform random variables $\xi_1, \ldots, \xi_n \sim \Unif[0,1]$ and a standard Brownian motion $\{B_n(s) : s \in [0,1]\}$ such that if
%
\begin{align*}
\alpha_n(s) &\vcentcolon= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( \I\{\xi_i \leq s\} - s \big), &\beta_n(s) &\vcentcolon= B_n(s) - s B_n(1),
\end{align*}
%
then for some universal positive constants $C_1$, $C_2$, $C_3$, and for all $t > 0$,
%
\begin{align*}
\P\left( \sup_{s \in [0,1]} \big| \alpha_n(s) - \beta_n(s) \big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \leq C_2 e^{-C_3 t}.
\end{align*}
%
We can view $\alpha_n$ and $\beta_n$ as random functionals defined on the class of half-interval indicator functions $\cI = \big\{\I[0,s]: s \in [0,1]\big\}$ in the following way.
%
\begin{align*}
\alpha_n(\I[0,s]) &= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( \I[0,s](\xi_i) - \E[\I[0,s](\xi_i)] \big), \\
\beta_n(\I[0,s]) &= \int_0^1 \I[0,s](u) \diff{B_n(u)} - B_n(1) \int_0^1 \I[0,s](u) \diff{u},
\end{align*}
%
where the integrals are defined as It{\^o} and Riemann--Stieltjes integrals in the usual way for stochastic integration against semimartingales \citep[Chapter~5]{legall2016brownian}. Now we extend their definitions to the class $\cV_1$ of functions on $[0,1]$ which are 0 at 1 and have total variation bounded by 1. This is achieved by noting that by Lemma~\ref{lem:kernel_app_bv_characterization}, we have $\cV_1 \subseteq \overline\symconv \ \cI$ where $\overline{\symconv} \ \cI$ is the smallest symmetric convex class containing $\cI$ which is closed under pointwise convergence. Thus by the dominated convergence theorem, every function in $\cV_1$ is approximated in $L^2$ by finite convex combinations of functions in $\pm\cI$, and the extension to $g \in \cV_1$ follows by linearity and $L^2$ convergence of (stochastic) integrals:
%
\begin{align*}
\alpha_n(g) &= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( g(\xi_i) - \E[g(\xi_i)] \big), &\beta_n(g) &= \int_0^1 g(s) \diff{B_n(s)} - B_n(1) \int_0^1 g(s) \diff{s}.
\end{align*}
%
Now we show that the norm induced on $(\alpha_n - \beta_n)$ by the function class $\cV_1$ is a.s.\ identical to the supremum norm. Writing the sums as integrals and using integration by parts for finite-variation Lebesgue--Stieltjes and It\^o integrals, and recalling that $g(1) = \alpha_n(0) = B_n(0) = 0$,
%
\begin{align*}
\sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| &= \sup_{g \in \cV_1} \left| \int_0^1 g(s) \diff{\alpha_n(s)} - \int_0^1 g(s) \diff{B_n(s)} + B_n(1) \int_0^1 g(s) \diff{s} \right| \\
&= \sup_{g \in \cV_1} \left| \int_0^1 \alpha_n(s) \diff{g(s)} - \int_0^1 B_n(s) \diff{g(s)} + B_n(1) \int_0^1 s \diff{g(s)} \right| \\
&= \sup_{g \in \cV_1} \left| \int_0^1 \big(\alpha_n(s) - \beta_n(s)\big) \diff{g(s)} \right| = \sup_{s \in [0,1]} \big| \alpha_n(s) - \beta_n(s) \big|,
\end{align*}
%
where in the last line the upper bound is because $\|g\|_\TV \leq 1$, and the lower bound is by taking $g_\varepsilon = \pm \I[0,s_\varepsilon]$ where $|\alpha_n(s_\varepsilon) - \beta_n(s_\varepsilon)| \geq \sup_s |\alpha_n(s) - \beta_n(s)| - \varepsilon$.
Hence we obtain % \begin{align} \label{eq:kernel_app_kmt_concentration} \P\left( \sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \leq C_2 e^{-C_3 t}. \end{align} % Now define $V_n = \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV$, noting that if $V_n = 0$ then the result is trivially true by setting $Z_n = 0$. Let $F_X$ be the common c.d.f.\ of $X_i$, and define the quantile function $F_X^{-1}(s) = \inf \{u: F_X(u) \geq s\}$ for $s \in [0,1]$, writing $\inf \emptyset = \infty$ and $\inf \R = -\infty$. Consider the function class % \begin{align*} \cG_n = \big\{ V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) : x \in \R \big\}, \end{align*} % noting that $g_n(\cdot,x)$ is finite-variation so $g_n(\pm \infty, x)$ can be interpreted as the relevant limit. By monotonicity of $F_X$ and the definition of $V_n$, the members of $\cG_n$ have total variation of at most $1$ and are 0 at 1, implying that $\cG_n \subseteq \cV_1$. Noting that $\alpha_n$ and $\beta_n$ are random linear operators which a.s.\ annihilate constant functions, define % \begin{align*} Z_n(x) &= \beta_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) = V_n \beta_n \Big( V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) \Big), \end{align*} % which is a mean-zero continuous Gaussian process. Its covariance structure is % \begin{align*} &\E[Z_n(x) Z_n(x')] \\ &= \E\bigg[ \left( \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \right) \\ &\quad\times \left( \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \right) \bigg] \\ &= \E\left[ \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} \right] \\ &\quad- \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \ \E\left[ B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} \right] \\ &\quad- \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ \E\left[ B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} \right] \\ &\quad+ \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ \E\left[ B_n(1)^2 \right] \\ &= \int_0^1 g_n\big(F_X^{-1}(s),x\big) g_n\big(F_X^{-1}(s),x'\big) \diff{s} - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \\ &= \E\Big[ g_n\big(F_X^{-1}(\xi_i), x\big) g_n\big(F_X^{-1}(\xi_i), x'\big) \Big] - \E\Big[ g_n\big(F_X^{-1}(\xi_i), x\big) \Big] \E\Big[ g_n\big(F_X^{-1}(\xi_i), x'\big) \Big] \\ &= \E\Big[ g_n\big(X_i, x\big) g_n\big(X_i, x'\big) \Big] - \E\Big[ g_n\big(X_i, x\big) \Big] \E\Big[ g_n\big(X_i, x'\big) \Big] = \E\big[ G_n(x) G_n(x') \big] \end{align*} % as desired, by the It\^o isometry for stochastic integrals, writing $B_n(1) = \int_0^1 \diff{B_n(s)}$; and noting that $F_X^{-1}(\xi_i)$ has the same distribution as $X_i$. Finally, note that % \begin{align*} G_n(x) &= \alpha_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) = V_n \alpha_n \Big( V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) \Big), \end{align*} % and so by \eqref{eq:kernel_app_kmt_concentration} % \begin{align*} \P\left( \sup_{x \in \R} \Big|G_n(x) - Z_n(x)\Big| > V_n \frac{t + C_1 \log n}{\sqrt n} \right) &\leq \P\left( \sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \\ &\leq C_2 e^{-C_3 t}. 
\end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_yurinskii_corollary}] Take $0 < \delta_n \leq \Leb(\cX_n)$ and let $\cX_n^\delta = \big\{ x_1, \dots, x_{|\cX_n^\delta|}\big\}$ be a $\delta_n$-covering of $\cX_n$ with cardinality $|\cX_n^\delta| \leq \Leb(\cX_n)/\delta_n$. Suppose that $\left|\log \delta_n\right| \lesssim C_1 \log n$ up to a universal constant. We first use the Yurinskii coupling to construct a Gaussian process $Z_n$ which is close to $G_n$ on this finite cover. Then we bound the fluctuations in $G_n$ and in $Z_n$ using entropy methods. \proofparagraph{Yurinskii coupling} Define the i.n.i.d.\ and mean-zero variables % \begin{align*} h_i(x) &= \frac{1}{\sqrt n} \Big( g_n(X_i', x) - \E[g_n(X_i', x)] \Big), \end{align*} % where $X_1', \ldots, X_n'$ are independent copies of $X_1, \ldots, X_n$ on some new probability space, so that we have $G_n(x) = \sum_{i=1}^n h_i(x)$ in distribution. Also define the length-$|\cX_n^\delta|$ random vector % \begin{align*} h_i^\delta &= \big( h_i(x): x \in \cX_n^\delta \big). \end{align*} % By an extension of Yurinskii's coupling to general norms \citep[supplemental materials, Lemma~38]{belloni2019conditional}, there exists on the new probability space a Gaussian length-$|\cX_n^\delta|$ vector $Z_n^\delta$ which is mean-zero and with the same covariance structure as $ \sum_{i=1}^n h_i^\delta $ satisfying % \begin{align*} \P\left( \bigg\| \sum_{i=1}^n h_i^\delta - Z_n^\delta \bigg\|_\infty > 3 t_n \right) \leq \min_{s > 0} \left( 2 \P\big( \|N\|_\infty > s) + \frac{\beta s^2}{t_n^3} \right), \end{align*} % where % \begin{align*} \beta = \sum_{i=1}^n \Big( \E\big[\|h_i^\delta\|_2^2 \, \|h_i^\delta\|_\infty \big] + \E\big[\|z_i\|_2^2 \, \|z_i\|_\infty \big] \Big), \end{align*} % with $z_i \sim \cN(0, \Var[h_i^\delta])$ independent and $N \sim \cN(0, I_{|\cX_n^\delta|})$. By the bounds on $g_n$, % \begin{align*} \E\big[\|h_i^\delta\|_2^2 \, \|h_i^\delta\|_\infty \, \big] \leq \frac{M_n}{\sqrt n} \E\big[\|h_i^\delta\|_2^2 \, \big] = \frac{M_n}{\sqrt n} \sum_{x \in \cX_n^\delta} \E\big[h_i(x)^2 \, \big] \leq \frac{M_n}{\sqrt n} \frac{|\cX_n^\delta| \sigma_n^2}{n} \leq \frac{M_n \sigma_n^2 \Leb(\cX_n)}{n^{3/2}\delta_n}. \end{align*} % By the fourth moment bound for Gaussian variables, % \begin{align*} \E\big[ \|z_i\|_2^4 \, \big] &\leq |\cX_n^\delta| \, \E\big[ \|z_i\|_4^4 \big] \leq |\cX_n^\delta|^2 \, \max_j \E\big[ (z_i^{(j)})^4 \big] \leq 3 |\cX_n^\delta|^2 \, \max_j \E\big[ (z_i^{(j)})^2 \big]^2 \\ &= 3 |\cX_n^\delta|^2 \, \max_{x \in \cX_n^\delta} \E\big[ h_i(x)^2 \big]^2 \leq \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2} . \end{align*} % Also by Jensen's inequality and for $|\cX_n^\delta| \geq 2$, assuming $C_1 > 1$ without loss of generality, % \begin{align*} \E\big[ \|z_i\|_\infty^2 \big] &\leq \frac{4 \sigma_n^2}{n} \log \E\big[ e^{\|z_i\|_\infty^2 / (4\sigma_n^2/n)} \big] \leq \frac{4 \sigma_n^2}{n} \log \E\left[ \sum_{j=1}^{|\cX_n^\delta|} e^{(z_i^{(j)})^2 / (4\sigma_n^2/n)} \right] \leq \frac{4\sigma_n^2}{n} \log \big(2|\cX_n^\delta|\big) \\ &\leq \frac{4\sigma_n^2}{n} \left( \log 2 + \log \Leb(\cX_n) - \log \delta_n \right) \leq \frac{12 C_1 \sigma_n^2 \log n}{n}, \end{align*} % where we used the moment generating function of a $\chi_1^2$ random variable. 
Therefore we can apply the Cauchy--Schwarz inequality to obtain % \begin{align*} \E\big[\|z_i\|_2^2 \, \|z_i\|_\infty \big] &\leq \sqrt{ \E\big[\|z_i\|_2^4 \big]} \sqrt{ \E\big[ \|z_i\|_\infty^2 \big]} \leq \sqrt{ \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2}} \sqrt{ \frac{12 C_1 \sigma_n^2 \log n}{n} } \\ &\leq \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}}{n^{3/2} \delta_n}. \end{align*} % Now summing over the $n$ samples gives % \begin{align*} \beta \leq \frac{M_n \sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} + \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}} {\sqrt n \delta_n} = \frac{\sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} \Big(M_n + 6\sigma_n \sqrt{C_1 \log n}\Big). \end{align*} % By a union bound and Gaussian tail probabilities, we have that $\P\big( \|N\|_\infty > s) \leq 2|\cX_n^\delta| e^{-s^2/2}$. Thus we get the following Yurinskii coupling inequality for all $s > 0$: % \begin{align*} \P\left( \bigg\| \sum_{i=1}^n h_i^\delta - Z_n^\delta \bigg\|_\infty > t_n \right) &\leq \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). \end{align*} % Note that $Z_n^\delta$ now extends by the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to a mean-zero Gaussian process $Z_n$ on the compact interval $\cX_n$ with covariance structure % \begin{align*} \E\big[ Z_n(x) Z_n(x') \big] = \E\big[ G_n(x) G_n(x') \big], \end{align*} % satisfying for any $s' > 0$ % \begin{align*} &\P\left( \sup_{x \in \cX_n^\delta} \big| G_n(x) - Z_n(x) \big| > t_n \right) \leq \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big). \end{align*} \proofparagraph{regularity of $G_n$} Next we bound the fluctuations in the empirical process $G_n$. Consider the following classes of functions on $S$ and their associated (constant) envelope functions. By continuity of $g_n$, each class is pointwise measurable (to see this, restrict the index sets to rationals). % \begin{align*} \cG_n &= \big\{ g_n(\cdot, x): x \in \cX_n \big\}, &\Env(\cG_n) &= M_n, \\ \cG_n^\delta &= \big\{ g_n(\cdot, x) - g_n(\cdot, x'): x, x' \in \cX_n, |x-x'| \leq \delta_n \big\}, &\Env(\cG_n^\delta) &= l_{n,\infty} \delta_n. \end{align*} % We first show these are VC-type. By the uniform Lipschitz assumption, % \begin{align*} \big\| g_n(\cdot, x) - g_n(\cdot, x') \big\|_\infty &\leq l_{n,\infty} |x-x'| \end{align*} % for all $x,x' \in \cX_n$. Therefore, with $\Q$ ranging over the finitely-supported distributions on $(S, \cS)$, noting that any $\|\cdot\|_\infty$-cover is a $\rho_\Q$-cover, % \begin{align*} \sup_\Q N\big(\cG_n, \rho_\Q, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) &\leq N\big(\cG_n, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) \leq N\big(\cX_n, |\cdot|, \varepsilon \!\Leb(\cX_n)\big) \leq 1/\varepsilon. \end{align*} % Replacing $\varepsilon$ by $\varepsilon M_n/(l_{n,\infty} \Leb(\cX_n))$ gives % \begin{align*} \sup_\Q N\big(\cG_n, \rho_\Q, \varepsilon M_n \big) &\leq \frac{l_{n,\infty} \Leb(\cX_n)}{\varepsilon M_n}, \end{align*} % and so $\cG_n$ is a VC-type class. To see that $\cG_n^\delta$ is also a VC-type class, we construct a cover in the following way. Let $\cF_n$ be an $\varepsilon$-cover for $(\cG_n, \|\cdot\|_\infty)$. By the triangle inequality, $\cF_n - \cF_n$ is a $2\varepsilon$-cover for $(\cG_n - \cG_n, \|\cdot\|_\infty)$ of cardinality at most $|\cF_n|^2$, where the subtractions are set subtractions. 
Since $\cG_n^\delta \subseteq \cG_n - \cG_n$, we see that $\cF_n - \cF_n$ is a $2\varepsilon$-external cover for $\cG_n^\delta$. Thus % \begin{align*} \sup_\Q N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) &\leq N\big(\cG_n^\delta, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) \\ &\leq N\big(\cG_n, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \Leb(\cX_n)\big)^2 \leq 1/\varepsilon^2. \end{align*} % Replacing $\varepsilon$ by $\varepsilon \delta_n/\Leb(\cX_n)$ gives % \begin{align*} \sup_\Q N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \delta_n \big) &\leq \frac{\Leb(\cX_n)^2}{\varepsilon^2 \delta_n^2} \leq (C_{1,n}/\varepsilon)^{2} \end{align*} % with $C_{1,n} = \Leb(\cX_n) / \delta_n$, demonstrating that $\cG_n^\delta$ forms a VC-type class. We now apply the maximal inequality for i.n.i.d.\ data given in Lemma~\ref{lem:kernel_app_maximal_vc_inid}. To do this, note that $\sup_{\cG_n^\delta} \|g\|_{\bar\P,2} \leq l_{n,2} \delta_n$ by the $L^2$ Lipschitz condition, and recall $\Env(\cG_n^\delta) = l_{n,\infty} \delta_n$. Therefore Lemma~\ref{lem:kernel_app_maximal_vc_inid} with $\|F\|_{\bar\P,2} = l_{n,\infty} \delta_n$, $\|M\|_{\P,2} = l_{n,\infty} \delta_n$, and $\sigma = l_{n,2} \delta_n$ gives, up to universal constants % \begin{align*} &\E\left[ \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| \right] \\ &\quad\lesssim \sigma \sqrt{2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} + \frac{\|M\|_{\P,2} 2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} {\sqrt{n}} \\ &\quad\lesssim l_{n,2} \delta_n \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{\sqrt n} C_1 \log n, \end{align*} % and hence by Markov's inequality, % \begin{align*} &\P\left( \sup_{|x-x'| \leq \delta_n} \big| G_n(x) - G_n(x') \big| > t_n \right) \\ &= \P\left( \sup_{|x-x'| \leq \delta_n} \frac{1}{\sqrt{n}} \left| \sum_{i=1}^n \Big( g_n(X_i, x) - \E[g_n(X_i, x)] - g_n(X_i, x') + \E[g_n(X_i, x')] \Big) \right| > t_n \right) \\ &= \P\left( \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| > t_n \right) \leq \frac{1}{t} \E\left[ \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| \right] \\ &\lesssim \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. \end{align*} \proofparagraph{regularity of $Z_n$} Next we bound the fluctuations in the Gaussian process $Z_n$. Let $\rho$ be the following semimetric: % \begin{align*} \rho(x, x')^2 &= \E\big[\big( Z_n(x) - Z_n(x') \big)^2\big] = \E\big[\big( G_n(x) - G_n(x') \big)^2\big] \\ &= \frac{1}{n} \sum_{i=1}^n \E\big[\big( h_i(x) - h_i(x') \big)^2\big] \leq l_{n,2}^2 \, |x - x'|^2. \end{align*} % Hence $\rho(x, x') \leq l_{n,2} \, |x - x'|$. 
By the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we obtain that % \begin{align*} &\E\bigg[ \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| \bigg] \lesssim \E\bigg[ \sup_{\rho(x,x') \leq l_{n,2} \delta_n} \big| Z_n(x) - Z_n(x') \big| \bigg] \\ &\quad\leq \int_0^{l_{n,2} \delta_n} \sqrt{\log N(\varepsilon, \cX_n, \rho)} \diff{\varepsilon} \leq \int_0^{l_{n,2} \delta_n} \sqrt{\log N(\varepsilon / l_{n,2}, \cX_n, |\cdot|)} \diff{\varepsilon} \\ &\quad\leq \int_0^{l_{n,2} \delta_n} \sqrt{\log \left( 1 + \frac{\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} \diff{\varepsilon} \leq \int_0^{l_{n,2} \delta_n} \sqrt{\log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} \diff{\varepsilon} \\ &\quad\leq \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} \int_0^{l_{n,2} \delta_n} \log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right) \diff{\varepsilon} \\ &\quad= \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} \left( l_{n,2} \delta_n \log \left( 2 \Leb(\cX_n) l_{n,2} \right) + l_{n,2} \delta_n + l_{n,2} \delta_n \log \left( \frac{1}{l_{n,2} \delta_n} \right) \right) \\ &\quad= \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} l_{n,2} \delta_n \left( 1 + \log \left( \frac{2\Leb(\cX_n)}{\delta_n} \right) \right) \lesssim l_{n,2} \delta_n \sqrt{\log \left( \frac{\Leb(\cX_n)}{\delta_n} \right)} \\ &\quad\lesssim l_{n,2} \delta_n \sqrt{C_1 \log n}, \end{align*} % where we used that $\delta_n \leq \Leb(\cX_n)$. So by Markov's inequality, % \begin{align*} \P\left( \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| > t_n \right) &\lesssim t_n^{-1} l_{n,2} \delta_n \sqrt{C_1 \log n}. \end{align*} \proofparagraph{conclusion} By the results of the previous parts, we have up to universal constants that % \begin{align*} &\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\ &\quad\leq \P\left( \sup_{x \in \cX_n^\delta} \big| G_n(x) - Z_n(x) \big| > t_n / 3 \right) + \P\left( \sup_{|x-x'| \leq \delta_n} \big| G_n(x) - G_n(x') \big| > t_n / 3 \right) \\ &\qquad+ \P\left( \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| > t_n / 3 \right) \\ &\quad\lesssim \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ &\qquad+ \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. 
\end{align*} % Choosing an approximately optimal mesh size of % \begin{align*} \delta_n &= \sqrt{ \frac{\sigma_n^2 \Leb(\cX_n) \log n}{\sqrt n t_n^3} \Big(M_n + \sigma_n \sqrt{\log n}\Big) } \Bigg/ \sqrt{ t_n^{-1} l_{n,2} \sqrt{\log n} \left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt{n}} \right) } \end{align*} % gives $\log |\delta_n| \lesssim C_1 \log n$ for a universal constant, so with $s$ a large enough multiple of $\sqrt{\log n}$, % \begin{align*} &\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\ &\quad\lesssim \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ &\qquad+ \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n \\ &\quad\lesssim \delta_n \frac{l_{n,2} \sqrt {\log n}}{t_n} \left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt n} \right) \\ &\quad\lesssim \frac{\sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n} \sqrt{M_n + \sigma_n \sqrt{\log n}}} {n^{1/4} t_n^2} \sqrt{l_{n,2} \sqrt {\log n} + \frac{l_{n,\infty}}{\sqrt n} \log n}. \end{align*} % \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_vbp}] The proof is by induction on the number of vertices in the tree. Let $\cT$ have $n$ vertices, and suppose that vertex $n$ is a leaf connected to vertex $n-1$ by an edge, relabeling the vertices if necessary. By the induction hypothesis we assume that there is a probability measure $\P^{(n-1)}$ on $\prod_{i=1}^{n-1} \cX_i$ whose projections onto $\cX_i$ are $\P_i$ and whose projections onto $\cX_i \times \cX_j$ are $\P_{i j}$, for $i,j \leq n-1$. Now apply the original Vorob'ev--Berkes--Philipp theorem, which can be found as Theorem~1.1.10 in \citet{dudley1999uniform}, to the spaces $\prod_{i=1}^{n-2} \cX_i$,\, $\cX_{n-1}$, and $\cX_n$; and to the laws $\P^{(n-1)}$ and $\P_{n-1, n}$. This gives a law $\P^{(n)}$ which agrees with $\P_i$ at every vertex by definition, and agrees with $\P_{i j}$ for all $i,j \leq n-1$. It also agrees with $\P_{n-1,n}$, and this is the only edge touching vertex $n$. Hence $\P^{(n)}$ satisfies the desired properties. \end{proof} \subsection{Main results} \label{sec:kernel_app_main} We give supplementary details for our main results on consistency, minimax optimality, strong approximation, covariance estimation, feasible inference and counterfactual estimation. We begin with a basic fact about Lipschitz functions. \begin{lemma}[Lipschitz kernels are bounded] \label{lem:kernel_app_lipschitz_kernels_bounded} Let $\cX \subseteq \R$ be a connected set. Let $f: \cX \to \R$ satisfy the Lipschitz condition $|f(x) - f(x')| \leq C |x-x'|$ for some $C > 0$ and all $x, x' \in \cX$. Suppose also that $f$ is a kernel in the sense that $\int_\cX f(x) \diff{x} = 1$. Then we have % \begin{align*} \sup_{x \in \cX} |f(x)| &\leq C \Leb(\cX) + \frac{1}{\Leb(\cX)}. \end{align*} % Now let $g: \cX \to [0,\infty)$ satisfy $|g(x) - g(x')| \leq C |x-x'|$ for some $C > 0$ and all $x, x' \in \cX$. Suppose $g$ is a sub-kernel with $\int_\cX g(x) \diff{x} \leq 1$. Then for any $M \in \big(0, \Leb(\cX)\big]$, we have % \begin{align*} \sup_{x \in \cX} f(x) &\leq C M + \frac{1}{M}. \end{align*} \end{lemma} Applying Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} to the density and kernel functions defined in Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} yields the following. 
Firstly, since $k_h(\cdot, w)$ is $C_\rL / h^2$-Lipschitz on $[w \pm h] \cap \cW$ and integrates to one, we have by the first inequality in Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} that % \begin{align*} |k_h(s,w)| &\leq \frac{2 C_\rL + 1}{h} + \frac{1}{\Leb(\cW)}. \end{align*} % Since each of $f_{W \mid AA}(\cdot \mid a,a')$, $f_{W \mid A}(\cdot \mid a)$, and $f_W$ is non-negative, and $C_\rH$-Lipschitz on $\cW$ and integrates to at most one over $\cW$, taking $M = \frac{1}{\sqrt{C_\rH}} \wedge \Leb(\cW)$ in the second inequality in Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} gives % \begin{align*} f_{W \mid AA}(w \mid a,a') &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ f_{W \mid A}(w \mid a) &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ f_W(w) &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}. \end{align*} \begin{proof}[Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}] We begin with the first inequality. Note that if $\Leb(\cX) = \infty$ there is nothing to prove. Suppose for contradiction that $|f(x)| > C \Leb(\cX) + \frac{1}{\Leb(\cX)}$ for some $x \in \cX$. If $f(x) \geq 0$ then by the Lipschitz property, for any $y \in \cX$, % \begin{align*} f(y) \geq f(x) - C|y-x| > C \Leb(\cX) + \frac{1}{\Leb(\cX)} - C\Leb(\cX) = \frac{1}{\Leb(\cX)}. \end{align*} % Similarly, if $f(x) \leq 0$ then % \begin{align*} f(y) \leq f(x) + C|y-x| < - C \Leb(\cX) - \frac{1}{\Leb(\cX)} + C\Leb(\cX) = -\frac{1}{\Leb(\cX)}. \end{align*} % But then either $\int_\cX f(x) \diff{x} > \int_\cX 1/\Leb(\cX) \diff{x} = 1$ or $\int_\cX f(x) \diff{x} < \int_\cX -1/\Leb(\cX) \diff{x} = -1 < 1$, giving a contradiction. For the second inequality, assume that $f$ is non-negative on $\cX$, and take $M \in \big(0, \Leb(\cX)\big]$. Suppose for contradiction that $f(x) > C M + \frac{1}{M}$ for some $x \in \cX$. Then by the Lipschitz property, $f(y) \geq 1/M$ for all $y$ such that $|y - x| \leq M$. Since $\cX$ is connected, we have $\Leb(\cX \cap [x \pm M]) \geq M$ and so we deduce that $\int_\cX f(x) \diff{x} > M/M = 1$ which is a contradiction. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_bias}] Begin by defining % \begin{align*} P_p(s,w) &= \sum_{r = 0}^p \frac{f_W^{(r)}(w)}{r!} {(s-w)^r} \end{align*} % for $s, w \in \cW$ as the degree-$p$ Taylor polynomial of $f_W$, centered at $w$ and evaluated at $s$. Note that for $p \leq \flbeta-1$, by Taylor's theorem with Lagrange remainder, % \begin{align*} f_W(s) - P_p(s,w) &= \frac{f_W^{(p+1)}(w')}{(p+1)!} (s-w)^{p+1} \end{align*} % for some $w'$ between $w$ and $s$. Also note that for any $p$, % \begin{align*} \int_{\cW} k_h(s,w) \big( P_p(s,w) - P_{p-1}(s,w) \big) \diff{s} &= \int_{\cW} k_h(s,w) \frac{f_W^{(p)}(w)}{p!} (s-w)^p \diff{s} = h^p b_p(w). \end{align*} % Further, by the order of the kernel, % \begin{align*} \E\big[\hat f_W(w)\big] - f_W(w) &= \int_{\cW} k_h(s,w) f_W(s) \diff{s} - f_W(w) = \int_{\cW} k_h(s,w) \big(f_W(s) - f_W(w)\big) \diff{s} \\ &= \int_{\cW} k_h(s,w) \big(f_W(s) - P_{p-1}(s,w)\big) \diff{s}. \end{align*} \proofparagraph{low-order kernel} Suppose that $p \leq \flbeta - 1$. 
Then % \begin{align*} &\sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) - h^p b_p(w) \big| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big(f_W(s) - P_{p-1}(s,w)\big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{p}(s,w) + P_{p}(s,w) - P_{p-1}(s,w) \big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{p}(s,w) \big) \diff{s} \right| = \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \frac{f_W^{(p+1)}(w')}{(p+1)!} (s-w)^{p+1} \diff{s} \right| \\ &\quad\leq \sup_{w \in \cW} \left| \int_{[w \pm h]} \frac{C_\rk}{h} \frac{C_\rH}{(p+1)!} h^{p+1} \diff{s} \right| \leq \frac{2C_\rk C_\rH}{(p+1)!} h^{p+1}. \end{align*} \proofparagraph{order of kernel matches smoothness} Suppose that $p = \flbeta$. Then % \begin{align*} &\sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) - h^p b_p(w) \big| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big(f_W(s) - P_{\flbeta - 1}(s,w)\big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) + P_{\flbeta}(s,w) - P_{\flbeta - 1}(s,w) \big) \diff{s} - h^{\flbeta} b_{\flbeta}(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) \big) \diff{s} \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \frac{f_W^{(\flbeta)}(w') - f_W^{(\flbeta)}(w)}{\flbeta!} (s-w)^{\flbeta} \diff{s} \right| \\ &\quad\leq \sup_{w \in \cW} \left| \int_{[w \pm h]} \frac{C_\rk}{h} \frac{C_\rH h^{\beta - \flbeta}}{\flbeta !} h^{\flbeta} \diff{s} \right| \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} \proofparagraph{high-order kernel} Suppose that $p \geq \flbeta+1$. Then as in the previous part % \begin{align*} \sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) \big| &= \sup_{w \in \cW} \left| \int_{[w \pm h] \cap \cW} \!\!\!\! k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) \big) \diff{s} \right| \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_hoeffding}] \proofparagraph{Hoeffding-type decomposition} \begin{align*} \hat f_W(w) - E_n(w) - \E[\hat f_W(w)] &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( \E[k_h(W_{i j},w) \mid A_i, A_j] - \E[k_h(W_{i j},w)] \Big) \\ &= \frac{1}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j \neq i} \Big( \E[k_h(W_{i j},w) \mid A_i, A_j] - \E[k_h(W_{i j},w)] \Big), \end{align*} % and apply Lemma~\ref{lem:kernel_app_general_hoeffding} with % \begin{align*} u_{i j} &= \frac{1}{n(n-1)} \E\big[k_h(W_{i j},w) \mid A_i, A_j\big], &u_i &= \frac{1}{n(n-1)} \E\big[k_h(W_{i j},w) \mid A_i\big], \\ u &= \frac{1}{n(n-1)} \E\big[k_h(W_{i j},w)\big], \end{align*} % to see % \begin{align*} \hat f_W(w) - E_n(w) - \E[\hat f_W(w)] &= \frac{2}{n} \sum_{i=1}^n \big(u_i - u\big) + \frac{1}{n(n-1)} \sum_{i=1}^n \sum_{j \neq i} \big( u_{i j} - u_i - u_j + u \big) \\ &= \frac{2}{n} \sum_{i=1}^n l_i(w) + \frac{2}{n(n-1)} \sum_{i=1}^n \sum_{j = i+1}^n q_{i j}(w) = L_n + Q_n. \end{align*} \proofparagraph{expectation and covariance of $L_n$, $Q_n$, and $E_n$} $L_n$, $Q_n$, and $E_n$ are clearly mean-zero. 
For orthogonality, note that their summands have the following properties, for any $1 \leq i < j \leq n$ and $1 \leq r < s \leq n$, and for any $w, w' \in \cW$: % \begin{align*} \E\big[ l_i(w) q_{rs}(w') \big] &= \E\big[ l_i(w) \E\big[ q_{rs}(w') \mid A_i \big] \big] = 0, \\ \E\big[ l_i(w) e_{rs}(w') \big] &= \begin{cases} \E\big[ l_i(w) \big] \E\big[ e_{rs}(w') \big], \text{ if } i \notin \{r,s\}, \\ \E\big[ l_i(w) \E\big[ e_{rs}(w') \mid A_r, A_s \big] \big], \text{ if } i \in \{r,s\}, \end{cases} \\ &= 0, \\ \E\big[ q_{i j}(w) e_{rs}(w') \big] &= \begin{cases} \E\big[ q_{i j}(w) \big] \E\big[ e_{rs}(w') \big], \text{ if } \{i,j\} \cap \{r,s\} = \emptyset, \\ \E\big[ \E\big[ q_{i j}(w) \mid A_i \big] \E\big[ e_{rs}(w') \mid A_i \big] \big], \text{ if } \{i,j\} \cap \{r,s\} = \{i\}, \\ \E\big[ \E\big[ q_{i j}(w) \mid A_j \big] \E\big[ e_{rs}(w') \mid A_j \big] \big], \text{ if } \{i,j\} \cap \{r,s\} = \{j\}, \\ \E\big[ q_{i j}(w) \E\big[ e_{rs}(w') \mid A_r, A_s \big] \big], \text{ if } \{i,j\} = \{r,s\}, \end{cases} \\ &= 0, \end{align*} % by independence of $\bA_n$ and $\bV_n$ and as $\E[q_{rs}(w) \mid A_i] = 0$ and $\E[e_{i j}(w) \mid A_i, A_j] = 0$. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_trichotomy}] \proofparagraph{total degeneracy} Suppose $\Dl = 0$, so $\Var[f_{W \mid A}(w \mid A_i)] = 0$ for all $w \in \cW$. Therefore, for all $w \in \cW$, we have $f_{W \mid A}(w) = f_W(w)$ almost surely. By taking a union over $\cW \cap \Q$ and by continuity of $f_{W \mid A}$ and $f_W$, this implies that $f_{W \mid A}(w) = f_W(w)$ for all $w \in \cW$ almost surely. Thus % \begin{align*} \E\left[ k_h(W_{i j},w) \mid A_i \right] &= \int_{\cW} k_h(s,w) f_{W \mid A}(s \mid A_i) \diff{s} = \int_{\cW} k_h(s,w) f_W(s) \diff{s} = \E\left[ k_h(W_{i j},w) \right] \end{align*} % for all $w \in \cW$ almost surely. Hence $l_i(w) = 0$ and so $L_n(w) = 0$ for all $w \in \cW$ almost surely. \proofparagraph{no degeneracy} Suppose $\Dl > 0$. As $f_{W|A}(\cdot \mid a)$ is $C_\rH$-Lipschitz for all $a \in \cA$ and since $|k_h| \leq C_\rk/h$, % \begin{align*} &\sup_{w \in \cW} \left| \E[k_h(W_{i j},w) \mid A_i] - f_{W \mid A}(w \mid A_i) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) f_{W \mid A}(s \mid A_i) \diff{s} - f_{W \mid A}(w \mid A_i) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW \cap [w \pm h]} k_h(s,w) \left( f_{W \mid A}(s \mid A_i) - f_{W \mid A}(w \mid A_i) \right) \diff{s} \right| \\ &\quad\leq 2h \frac{C_\rk}{h} C_\rH h \leq 2 C_\rk C_\rH h \end{align*} % almost surely. Therefore, since $f_{W \mid A}(w \mid a) \leq C_\rd$, we have % \begin{align*} \sup_{w \in \cW} \left| \Var\big[ \E[k_h(W_{i j},w) \mid A_i] \big] - \Var\left[ f_{W \mid A}(w \mid A_i) \right] \right| &\leq 16 C_\rk C_\rH C_\rd h \end{align*} % whenever $h$ is small enough that $2 C_\rk C_\rH h \leq C_\rd$. Thus % \begin{align*} \inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\geq \inf_{w \in \cW}\Var[f_{W \mid A}(w \mid A_i)] - 16 C_\rk C_\rH C_\rd h. \end{align*} % Therefore, if $\Dl > 0$, then eventually $\inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \Dl/2$. Finally, % \begin{align*} \inf_{w \in \cW}\Var[L_n(w)] &= \frac{4}{n} \inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \frac{2 \Dl}{n}. \end{align*} \proofparagraph{partial degeneracy} Since $f_{W \mid A}(w \mid A_i)$ is bounded by $C_\rd$ and $C_\rH$-Lipschitz in $w$, we have that $\Var[f_{W \mid A}(w \mid A_i)]$ is continuous on $\cW$. 
Thus if $\Dl = 0$, there is at least one point $w \in \cW$ for which $\Var[f_{W \mid A}(w \mid A_i)] = 0$ by compactness. Let $w$ be any such degenerate point. Then by the previous part, % \begin{align*} \Var[L_n(w)] = \frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\leq 64 C_\rk C_\rH C_\rd \frac{h}{n}. \end{align*} % If conversely $w$ is not a degenerate point then $\Var[f_{W \mid A}(w \mid A_i)] > 0$ so eventually % \begin{align*} \Var[L_n(w)] = \frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\geq \frac{2}{n} \Var[f_{W \mid A}(w \mid A_i)]. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_uniform_concentration}] We establish VC-type properties of function classes and apply empirical process theory. \proofparagraph{establishing VC-type classes} Consider the following function classes: % \begin{align*} \cF_1 &= \Big\{ W_{i j} \mapsto k_h(W_{i j},w) : w \in \cW \Big\}, \\ \cF_2 &= \Big\{ (A_i, A_j) \mapsto \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] : w \in \cW \Big\}, \\ \cF_3 &= \Big\{ A_i \mapsto \E\big[ k_h(W_{i j},w) \mid A_i \big] : w \in \cW \Big\}. \end{align*} % For $\cF_1$, take $0 < \varepsilon \leq \Leb(\cW)$ and $\cW_\varepsilon$ an $\varepsilon$-cover of $\cW$ of cardinality at most $\Leb(\cW)/\varepsilon$. As % \begin{align*} \sup_{s, w, w' \in \cW} \left| \frac{k_h(s,w) - k_h(s,w')} {w-w'} \right| &\leq \frac{C_\mathrm{L}}{h^2} \end{align*} % almost surely, we see that % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\mathrm{L}}{h^2} \varepsilon \right) &\leq N\left(\cF_1, \|\cdot\|_\infty, \frac{C_\mathrm{L}}{h^2} \varepsilon \right) \leq \frac{\Leb(\cW)}{\varepsilon}, \end{align*} % where $\Q$ ranges over Borel probability measures on $\cW$. Since $\frac{C_\rk}{h}$ is an envelope for $\cF_1$, % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\rk}{h} \varepsilon \right) &\leq \frac{C_\rL}{C_\rk} \frac{\Leb(\cW)}{h \varepsilon}. \end{align*} % Thus for all $\varepsilon \in (0,1]$, % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\rk}{h} \varepsilon \right) &\leq \frac{C_\rL}{C_\rk} \frac{\Leb(\cW) \vee 1}{h \varepsilon} \leq (C_1/(h\varepsilon))^{C_2}, \end{align*} % where $C_1 = \frac{C_\rL}{C_\rk} (\Leb(\cW) \vee 1)$ and $C_2 = 1$. Next, $\cF_2$ forms a smoothly parameterized class of functions since for $w,w' \in \cW$ we have by the uniform Lipschitz properties of $f_{W \mid AA}(\cdot \mid A_i, A_j)$ and $k_h(s, \cdot)$, with $|w-w'| \leq h$, % \begin{align*} &\left| \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] - \E\big[ k_h(W_{i j},w') \mid A_i, A_j \big] \right| \\ &\quad= \left| \int_{[w \pm h] \cap \cW} k_h(s,w) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} - \int_{[w' \pm h] \cap \cW} k_h(s,w') f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\quad= \left| \int_{[w \pm 2h] \cap \cW} \big( k_h(s,w) - k_h(s,w') \big) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\quad= \left| \int_{[w \pm 2h] \cap \cW} \big( k_h(s,w) - k_h(s,w') \big) \big( f_{W \mid AA}(s \mid A_i, A_j) - f_{W \mid AA}(w \mid A_i, A_j) \big) \diff{s} \right| \\ &\quad\leq 4h \frac{C_\rL}{h^2} |w-w'| 2 C_\rH h \leq 8 C_\rL C_\rH |w-w'| \leq C_3 |w-w'|, \end{align*} % where $C_3 = 8 C_\rL C_\rH$. The same holds for $|w-w'| > h$ as the Lipschitz property is local. By taking $\E[\, \cdot \mid A_i]$, it can be seen by the contraction property of conditional expectation that the same holds for the singly-conditioned terms: % \begin{align*} \left| \E\big[ k_h(W_{i j},w) \mid A_i \big] - \E\big[ k_h(W_{i j},w') \mid A_i \big] \right| &\leq C_3 |w-w'|. 
\end{align*} % Therefore $\cF_3$ is also smoothly parameterized in exactly the same manner. Let % \begin{align*} C_4 &= \sup_{w \in \cW} \esssup_{A_i, A_j} \big| \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] \big| \\ &= \sup_{w \in \cW} \esssup_{A_i, A_j} \left| \int_{[w \pm h] \cap \cW} k_h(s,w) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\leq 2h \frac{C_\rk}{h} C_\rd \leq 2 C_\rk C_\rd. \end{align*} % For $\varepsilon \in (0,1]$, take an $(\varepsilon C_4/C_3)$-cover of $\cW$ of cardinality at most $C_3 \Leb(\cW) / (\varepsilon C_4)$. By the above parameterization properties, this cover induces an $\varepsilon C_4$-cover for both $\cF_2$ and $\cF_3$: % \begin{align*} \sup_\Q N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) &\leq N\big(\cF_2, \|\cdot\|_\infty, \varepsilon C_4 \big) \leq C_3 \Leb(\cW) / (\varepsilon C_4), \\ \sup_\Q N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) &\leq N\big(\cF_3, \|\cdot\|_\infty, \varepsilon C_4 \big) \leq C_3 \Leb(\cW) / (\varepsilon C_4). \end{align*} % Hence $\cF_1$, $\cF_2$, and $\cF_3$ form VC-type classes with envelopes $F_1 = C_\rk / h$ and $F_2 = F_3 = C_4$: % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \varepsilon C_\rk / h \right) &\leq (C_1/(h\varepsilon))^{C_2}, &\sup_\Q N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) &\leq (C_1/\varepsilon)^{C_2}, \\ \sup_\Q N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) &\leq (C_1/\varepsilon)^{C_2}, \end{align*} % for some constants $C_1 \geq e$ and $C_2 \geq 1$, where we augment the constants if necessary. \proofparagraph{controlling $L_n$} Observe that $\sqrt{n}L_n$ is the empirical process of the i.i.d.\ variables $A_i$ indexed by $\cF_3$. We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} with $\sigma = C_4$: % \begin{align*} \E \left[ \sup_{w \in \cW} \big| \sqrt{n} L_ n(w) \big| \right] &\lesssim C_4 \sqrt{C_2 \log C_1} + \frac{C_4 C_2 \log C_1} {\sqrt{n}} \lesssim 1. \end{align*} % By Lemma~\ref{lem:kernel_trichotomy}, the left hand side is zero whenever $\Du = 0$, so we can also write % \begin{align*} \E \left[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) \big| \right] &\lesssim \Du. \end{align*} \proofparagraph{controlling $Q_n$} Observe that $n Q_n$ is the completely degenerate second-order U-process of the i.i.d.\ variables $A_i$ indexed by $\cF_2$. This function class is again uniformly bounded and VC-type, so applying the U-process maximal inequality from Lemma~\ref{lem:kernel_app_uprocess_maximal} yields with $\sigma = C_4$ % \begin{align*} \E \left[ \sup_{w \in \cW} \big| n Q_n(w) \big| \right] &\lesssim C_4 C_2 \log C_1 + \frac{C_4 (C_2 \log C_1)^2} {\sqrt{n}} \lesssim 1. \end{align*} \proofparagraph{controlling $E_n$} Conditional on $\bA_n$, note that $n E_n$ is the empirical process of the conditionally i.n.i.d.\ variables $W_{i j}$ indexed by $\cF_1$. 
We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} conditionally with % \begin{align*} \sigma^2 &= \sup_{w \in \cW} \E\Big[ \big( k_h(W_{i j},w) - \E[k_h(W_{i j},w) \mid A_i, A_j] \big)^2 \mid A_i, A_j \Big] \leq \sup_{w \in \cW} \E\Big[ k_h(W_{i j},w)^2 \mid A_i, A_j \Big] \\ &\leq \sup_{w \in \cW} \int_{[w \pm h] \cap \cW} k_h(s,w)^2 f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \leq 2h \frac{C_\rk^2}{h^2} \lesssim 1/h \end{align*} % and noting that we have a sample size of $\frac{1}{2}n(n-1)$, giving % \begin{align*} \E \left[ \sup_{w \in \cW} \big| n E_n(w) \big| \right] &\lesssim \sigma \sqrt{C_2 \log \big((C_1/h) F_1 / \sigma \big)} + \frac{F_1 C_2 \log \big((C_1/h) F_1 / \sigma\big)} {n} \\ &\lesssim \frac{1}{\sqrt h} \sqrt{C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} + \frac{(C_\rk/h) C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} {n} \\ &\lesssim \sqrt{\frac{\log 1/h}{h}} + \frac{\log \big(1/h\big)} {n h} \lesssim \sqrt{\frac{\log n}{h}}, \end{align*} % where the last line follows by the bandwidth assumption of $\frac{\log n}{n^2h} \to 0$. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_uniform_consistency}] This follows from Theorem~\ref{thm:kernel_bias} and Lemma~\ref{lem:kernel_uniform_concentration}. \end{proof} Before proving Theorem~\ref{thm:kernel_minimax} we first give a lower bound result for parametric point estimation in Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}. \begin{lemma}[A Neyman--Pearson result for Bernoulli random variables] \label{lem:kernel_app_neyman_pearson_bernoulli} Recall that the Bernoulli distribution $\Ber(\theta)$ places mass $\theta$ at $1$ and mass $1-\theta$ at $0$. Define $\P_\theta^n$ as the law of $(A_1, A_2, \ldots, A_n, V)$, where $A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, and $V$ is an $\R^d$-valued random variable for some $d \geq 1$ which is independent of the $A$ variables and with a fixed distribution that does not depend on $\theta$. Let $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$. Then for any estimator $\tilde \theta_n$ which is a function of $(A_1, A_2, \ldots, A_n, V)$ only, % \begin{align*} \P_{\theta_0}^n \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}] Let $f: \{0,1\}^n \to \{0,1\}$ be any function. Considering this function as a statistical test, the Neyman--Pearson lemma and Pinsker's inequality \citep{gine2021mathematical} give % \begin{align*} \P_{\theta_0}^n \big( f=1 \big) +\P_{\theta_{1,n}}^n \big( f=0 \big) &\geq 1- \TV\left( \P_{\theta_0}^n, \P_{\theta_{1,n}}^n \right) \geq 1- \sqrt{ \frac{1}{2} \KL \left( \P_{\theta_0}^n \bigm\| \P_{\theta_{1,n}}^n \right)} \\ &= 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right) + \frac{n}{2} \KL \left( V \bigm\| V \right)} \\ &= 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right)}, \end{align*} % where $\TV$ is the total variation distance and $\KL$ is the Kullback--Leibler divergence. In the penultimate line we used the tensorization of Kullback--Leibler divergence \citep{gine2021mathematical}, noting that the law of $V$ is fixed and hence does not contribute. We now evaluate this Kullback--Leibler divergence at the specified parameter values. 
% \begin{align*} \P_{\theta_0}^n \big( f=1 \big) +\P_{\theta_{1,n}}^n \big( f=0 \big) &\geq 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right)} \\ &= 1- \sqrt{\frac{n}{2}} \sqrt{ \theta_0 \log \frac{\theta_0}{\theta_{1,n}} + (1 - \theta_0) \log \frac{1 - \theta_0}{1 - \theta_{1,n}}} \\ &= 1- \sqrt{\frac{n}{2}} \sqrt{ \frac{1}{2} \log \frac{1/2}{1/2 + 1/\sqrt{8n}} + \frac{1}{2} \log \frac{1/2}{1/2 - 1/\sqrt{8n}}} \\ &= 1- \frac{\sqrt n}{2} \sqrt{\log \frac{1}{1 - 1/(2n)}} \geq 1- \frac{\sqrt n}{2} \sqrt{\frac{1}{n}} = \frac{1}{2}, \end{align*} % where in the penultimate line we used that $\log \frac{1}{1-x} \leq 2x$ for $x \in [0,1/2]$. Now define a test $f$ by $f = 1$ if $\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}}$ and $f=0$ otherwise, to see % \begin{align*} \P_{\theta_0}^n \left( \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} % By the triangle inequality, recalling that $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, we have % \begin{align*} \left\{ \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} \right\} &\subseteq \left\{ \left| \tilde \theta_n - \theta_0 \right| \geq \frac{1}{\sqrt{32n}} \right\} \\ \left\{ \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} \right\} &\subseteq \left\{ \left| \tilde \theta_n - \theta_{1,n} \right| \geq \frac{1}{\sqrt{32n}} \right\}. \end{align*} % Thus by the monotonicity of measures, % \begin{align*} \P_{\theta_0}^n \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_minimax}] \proofparagraph{lower bound for $\cP$} By translation and scaling of the data, we may assume without loss of generality that $\cW = [-1,1]$. We may also assume that $C_\rH \leq 1/2$, since reducing $C_\rH$ can only shrink the class of distributions. Define the dyadic distribution $\P_\theta$ with parameter $\theta \in [1/2, 1]$ as follows: $A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, while $V_{i j}$ for $1 \leq i < j \leq n$ are i.i.d.\ and independent of $\bA_n$. The distribution of $V_{i j}$ is given by its density function $f_V(v) = \frac{1}{2} + C_\rH v$ on $[-1,1]$. Finally, generate $W_{i j} = W(A_i, A_j, V_{i j}) \vcentcolon= (2 A_i A_j - 1) V_{i j}$. Note that the function $W$ does not depend on $\theta$. The conditional and marginal densities of $W_{i j}$ are for $w \in [-1,1]$ % \begin{align*} f_{W \mid AA}(w \mid A_i, A_j) &= \begin{cases} \frac{1}{2} + C_\rH w & \text{if } A_i = A_j = 1, \\ \frac{1}{2} - C_\rH w & \text{if } A_i = 0 \text{ or } A_j = 0, \\ \end{cases} \\ f_{W \mid A}(w \mid A_i) &= \begin{cases} \frac{1}{2} + (2 \theta - 1) C_\rH w & \text{if } A_i = 1, \\ \frac{1}{2} - C_\rH w & \text{if } A_i = 0 , \\ \end{cases} \\ f_W(w)&= \frac{1}{2} + (2\theta^2 - 1) C_\rH w. \end{align*} % Clearly, $f_W \in \cH^\beta_{C_\rH}(\cW)$ and $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$. Also $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV \leq 1$. Therefore $\P_\theta$ satisfies Assumption~\ref{ass:kernel_data} and so $\big\{\P_\theta : \theta \in [1/2, 1] \big\} \subseteq \cP$. Note that $f_W(1) = \frac{1}{2} + (2\theta^2 - 1) C_\rH $, so $\theta^2 = \frac{1}{2 C_\rH}(f_W(1) - 1/2 + C_\rH)$. 
Thus if $\tilde f_W$ is some density estimator depending only on the data $\bW_n$, we define the parameter estimator % \begin{align*} \tilde \theta_n^2 &\vcentcolon= \frac{1}{2 C_\rH}\left( \tilde f_W(1) - \frac{1}{2} + C_\rH \right) \vee 0. \end{align*} % This gives the inequality % \begin{align*} \big| \tilde \theta_n^2 - \theta^2 \big| &= \left| \frac{1}{2 C_\rH}\left( \tilde f_W(1) - \frac{1}{2} + C_\rH \right) \vee 0 - \frac{1}{2 C_\rH}\left( f_W(1) - \frac{1}{2} + C_\rH \right) \right| \\ &\leq \frac{1}{2 C_\rH} \sup_{w \in \cW} \left| \tilde f_W(w) - f_W(w) \right|. \end{align*} % Therefore, since also $\tilde \theta \geq 0$ and $\theta \geq \frac{1}{2}$, % \begin{align*} \big| \tilde \theta_n - \theta \big| &= \frac{\big|\tilde \theta_n^2 - \theta^2\big|} {\tilde \theta_n + \theta} \leq \frac{1}{C_\rH} \sup_{w \in \cW} \left| \tilde f_W(w) - f_W(w) \right|. \end{align*} % Now we apply the point estimation lower bound from Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}, setting $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, noting that the estimator $\tilde \theta_n$ is a function of $\bW_n$ only, thus is a function of $\bA_n$ and $\bV_n$ only and so satisfies the conditions. % \begin{align*} &\P_{\theta_0} \left( \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(0)}_W(w) \big| \geq \frac{1}{C\sqrt{n}} \right) + \P_{\theta_{1,n}} \left( \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(1)}_W(w) \big| \geq \frac{1}{C\sqrt{n}} \right) \\ &\quad\geq \P_{\theta_0} \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{C C_\rH \sqrt{n}} \right) + \P_{\theta_{1,n}} \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{C C_\rH \sqrt{n}} \right) \\ &\quad\geq \P_{\theta_0} \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}} \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}, \end{align*} % where we set $C \geq \frac{\sqrt{32}}{C_\rH}$. Therefore we deduce that % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP} \P\left( \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \geq \frac{1}{C \sqrt n} \right) \geq \frac{1}{4} \end{align*} % and so % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] \geq \frac{1}{4 C \sqrt{n}}. \end{align*} \proofparagraph{lower bound for $\cP_\rd$} For the subclass of totally degenerate distributions, we rely on the main theorem from \citet{khasminskii1978lower}. Let $\cP_0$ be the subclass of $\cP_\rd$ consisting of the distributions which satisfy $A_1 = \cdots = A_n = 0$ and $W_{i j} \vcentcolon= A_i + A_j + V_{i j} = V_{i j}$, so that $W_{i j}$ are i.i.d.\ with common density $f_W = f_V$. Define the class % \begin{align*} \cF &= \left\{ f \text{ density function on } \R, \ f \in \cH^\beta_{C_\rH}(\cW) \right\}. \end{align*} % Write $\E_f$ for the expectation under $W_{i j}$ having density $f$. Then by \citet{khasminskii1978lower}, % \begin{align*} \liminf_{n \to \infty} \inf_{\tilde f_W} \sup_{f \in \cF} \E_f\left[ \left( \frac{n^2}{\log n} \right)^{\frac{\beta}{2\beta + 1}} \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] > 0, \end{align*} % where $\tilde f_W$ is any density estimator depending only on the $\frac{1}{2}n(n-1)$ i.i.d.\ data samples $\bW_n$. Now every density function in $\cH^\beta_{C_\rH}(\cW)$ corresponds to a distribution in $\cP_0$ and therefore to a distribution in $\cP_\rd$. 
Thus for large enough $n$ and some positive constant $C$, % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] \geq \frac{1}{C} \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta + 1}}. \end{align*} \proofparagraph{upper bounds} The upper bounds follow by using a dyadic kernel density estimator $\hat f_W$ with a boundary bias-corrected Lipschitz kernel of order $p \geq \beta$ and a bandwidth of $h$. Theorem~\ref{thm:kernel_bias} gives % \begin{align*} \sup_{\P \in \cP} \sup_{w \in \cW} \big| \E_\P\big[\hat f_W(w)\big] - f_W(w) \big| \leq \frac{4C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} % Then, treating the degenerate and non-degenerate cases separately and noting that all inequalities hold uniformly over $\cP$ and $\cP_\rd$, the proof of Lemma~\ref{lem:kernel_uniform_concentration} shows that % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| \right] &\lesssim \frac{1}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| \right] &\lesssim \sqrt{\frac{\log n}{n^2h}}. \end{align*} % Thus combining these yields that % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim h^\beta + \frac{1}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim h^\beta + \sqrt{\frac{\log n}{n^2h}}. \end{align*} % Set $h = \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ and note that $\beta \geq 1$ implies that $\left(\frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} \ll \frac{1}{\sqrt n}$. So for $C > 0$, % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim \frac{1}{\sqrt n} + \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} \leq \frac{C}{\sqrt n}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\leq C\left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}}. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_covariance_structure}] We write $k_{i j}$ for $k_h(W_{i j},w)$ and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. % \begin{align*} \Sigma_n(w,w') &= \E\Big[ \big( \hat f_W(w) - \E[\hat f_W(w)] \big) \big( \hat f_W(w') - \E[\hat f_W(w')] \big) \Big] \\ &= \E\left[ \left( \frac{2}{n(n-1)} \sum_{i \Du \frac{t + C_1 \log n}{\sqrt n} \right) \leq C_2 e^{-C_3 t}. \end{align*} % Integrating tail probabilities shows that % \begin{align*} \E\left[ \sup_{w \in \cW} \Big|\sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\Big| \right] &\leq \Du \frac{C_1 \log n}{\sqrt n} + \int_0^\infty \frac{\Du}{\sqrt n} C_2 e^{-C_3 t} \diff{t} \lesssim \frac{\Du \log n}{\sqrt n}. \end{align*} % Further, $Z_n^{L\prime}$ has the same covariance structure as $G_n^{L\prime}$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\big[Z_n^{L\prime}(w) Z_n^{L\prime}(w')\big] = \E\big[G_n^{L\prime}(w) G_n^{L\prime}(w')\big], \end{align*} % and clearly $L_n'$ is equal in distribution to $L_n$. To obtain the trajectory regularity property of $Z_n^{L\prime}$, note that it was shown in the proof of Lemma~\ref{lem:kernel_uniform_concentration} that for all $w,w' \in \cW$, % \begin{align*} \left| k_h^A(A_i,w) - k_h^A(A_i,w') \right| &\leq C |w-w'| \end{align*} % for some constant $C > 0$. 
Therefore, since the $A_i$ are i.i.d., % \begin{align*} &\E\left[ \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big|^2 \right]^{1/2} = \sqrt{n} \E\left[ \big| L_n(w) - L_n(w') \big|^2 \right]^{1/2} \\ &\quad= \sqrt{n} \E\left[ \left| \frac{1}{n} \sum_{i=1}^n \Big( k_h^A(A_i,w) - k_h^A(A_i,w') - \E\big[k_h^A(A_i,w)] + \E\big[k_h^A(A_i,w')] \Big) \right|^2 \right]^{1/2} \\ &\quad= \E\left[ \Big| k_h^A(A_i,w) - k_h^A(A_i,w') - \E\big[k_h^A(A_i,w)] + \E\big[k_h^A(A_i,w')] \Big|^2 \right]^{1/2} \lesssim |w-w'|. \end{align*} % Therefore, by the regularity result for Gaussian processes in Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, with $\delta_n \in (0, 1/2]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big| \right] &\lesssim \int_0^{\delta_n} \sqrt{\log 1/\varepsilon} \diff{\varepsilon} \lesssim \delta_n \sqrt{\log 1/\delta_n} \lesssim \Du \delta_n \sqrt{\log 1/\delta_n}, \end{align*} % where the last inequality is because $Z_n^{L\prime} \equiv 0$ whenever $\Du = 0$. There is a modification of $Z_n^{L\prime}$ with continuous trajectories by Kolmogorov's continuity criterion \citep[Theorem~2.9]{legall2016brownian}. Note that $L_n'$ is $\bA_n'$-measurable and so by Lemma~\ref{lem:kernel_app_kmt_corollary} we can assume that $Z_n^{L\prime}$ depends only on $\bA_n'$ and some random noise which is independent of $(\bA_n', \bV_n')$. Finally, in order to have $\bA_n', \bV_n', L_n'$, and $Z_n^{L\prime}$ all defined on the same probability space, we note that $\bA_n$ and $\bV_n$ are random vectors while $L_n'$ and $Z_n^{L\prime}$ are stochastic processes with continuous sample paths indexed on the compact interval $\cW$. Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) allows us to ``glue'' them together in the desired way on another new probability space, giving $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$, retaining the single prime notation for clarity. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_strong_approx_Ln}] See Lemma~\ref{lem:kernel_app_strong_approx_Ln} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}] We apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditional on $\bA_n$. While this lemma is not in its current form stated for conditional distributions, the Yurinskii coupling on which it depends can be readily extended by following the proof of \citet[Lemma~38]{belloni2019conditional}, using a conditional version of Strassen's theorem \cite[Theorem~B.2]{chen2020jackknife}. Care must similarly be taken in embedding the conditionally Gaussian vectors into a conditionally Gaussian process, using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). By the mutual independence of $A_i$ and $V_{i j}$, we have that the observations $W_{i j}$ are independent (but not necessarily identically distributed) conditionally on $\bA_n$. Note that $\sup_{s,w \in \cW} |k_h(s,w)| \lesssim M_n = h^{-1}$ and $\E[k_h(W_{i j},w)^2 \mid \bA_n] \lesssim \sigma_n^2 = h^{-1}$. The following uniform Lipschitz condition holds with $l_{n,\infty} = C_\rL h^{-2}$, by the Lipschitz property of the kernels: % \begin{align*} \sup_{s,w,w' \in \cW} \left| \frac{k_h(s, w) - k_h(s, w')} {w-w'} \right| \leq l_{n,\infty}. 
\end{align*} % Also, the following $L^2$ Lipschitz condition holds uniformly with $l_{n,2} = 2 C_\rL \sqrt{C_\rd} h^{-3/2}$: % \begin{align*} &\E\big[ \big| k_h(W_{i j}, w) - k_h(W_{i j}, w') \big|^2 \mid \bA_n \big]^{1/2} \\ &\quad\leq \frac{C_\rL}{h^2} |w-w'| \left( \int_{([w \pm h] \cup [w' \pm h]) \cap \cW} f_{W \mid AA}(s \mid \bA_n) \diff{s} \right)^{1/2} \\ &\quad\leq \frac{C_\rL}{h^2} |w-w'| \sqrt{4h C_\rd} \leq l_{n,2} |w-w'|. \end{align*} % So we apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditionally on $\bA_n$ to the $\frac{1}{2}n(n-1)$ observations, noting that % \begin{align*} \sqrt{n^2h} E_n(w) = \sqrt{\frac{2 n h}{n-1}} \sqrt{\frac{2}{n(n-1)}} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( k_h(W_{i j},w) - \E[k_h(W_{i j},w) \mid A_i, A_j] \Big), \end{align*} % to deduce that for $t_n > 0$ there exist (an enlarged probability space) conditionally mean-zero and conditionally Gaussian processes $\tilde Z_n^{E\prime}(w)$ with the same conditional covariance structure as $\sqrt{n^2 h} E_n(w)$ and satisfying % \begin{align*} &\P\left( \sup_{w \in \cW} \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^{E\prime}(w) \big| > t_n \Bigm\vert \bA_n' \right) \\ &\quad= \P\left( \sup_{w \in \cW} \left| \sqrt{\frac{n(n-1)}{2}} E_n(w) - \sqrt{\frac{n-1}{2 n h}} \tilde Z_n^{E\prime}(w) \right| > \sqrt{\frac{n-1}{2 n h}} t_n \Bigm\vert \bA_n' \right) \\ &\quad\lesssim \frac{ \sigma_n \sqrt{\Leb(\cW)} \sqrt{\log n} \sqrt{M_n + \sigma_n\sqrt{\log n}} }{n^{1/2} t_n^2 / h} \sqrt{ l_{n,2} \sqrt{\log n} + \frac{l_{n,\infty}}{n} \log n} \\ &\quad\lesssim \frac{ h^{-1/2} \sqrt{\log n} \sqrt{h^{-1} + h^{-1/2} \sqrt{\log n}} }{n^{1/2} t_n^2 / h} \sqrt{ h^{-3/2} \sqrt{\log n} + \frac{h^{-2}}{n} \log n} \\ &\quad\lesssim \sqrt{\frac{\log n}{n}} \frac{ \sqrt{1 + \sqrt{h \log n}} }{t_n^2} \sqrt{ \sqrt{\frac{\log n}{h^3}} \left( 1 + \sqrt{\frac{\log n}{n^2 h}} \right) } \\ &\quad\lesssim \sqrt{\frac{\log n}{n}} \frac{ 1 }{t_n^2} \left( \frac{\log n}{h^3} \right)^{1/4} \lesssim t_n^{-2} n^{-1/2} h^{-3/4} (\log n)^{3/4}, \end{align*} % where we used $h \lesssim 1 / \log n$ and $\frac{\log n}{n^2 h} \lesssim 1$. To obtain the trajectory regularity property of $\tilde Z_n^{E\prime}$, note that for $w, w' \in \cW$, by conditional independence, % \begin{align*} &\E\left[ \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big|^2 \mid \bA_n' \right]^{1/2} = \sqrt{n^2h} \, \E\left[ \big| E_n(w) - E_n(w') \big|^2 \mid \bA_n \right]^{1/2} \\ &\quad\lesssim \sqrt{n^2h} \, \E\left[ \left| \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( k_h(W_{i j},w) - k_h(W_{i j},w') \Big) \right|^2 \Bigm\vert \bA_n \right]^{1/2} \\ &\quad\lesssim \sqrt{h} \, \E\left[ \big| k_h(W_{i j},w) - k_h(W_{i j},w') \big|^2 \bigm\vert \bA_n \right]^{1/2} \lesssim h^{-1} |w-w'|. \end{align*} % So by the regularity result for Gaussian processes in Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, with $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big| \mid \bA_n' \right] &\lesssim \int_0^{\delta_n/h} \sqrt{\log (\varepsilon^{-1} h^{-1})} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}, \end{align*} % and there exists a modification with continuous trajectories. 
Finally, in order to have $\bA_n', \bV_n', E_n'$, and $\tilde Z_n^{E\prime}$ all defined on the same probability space, we note that $\bA_n$ and $\bV_n$ are random vectors while $E_n'$ and $\tilde Z_n^{E\prime}$ are stochastic processes with continuous sample paths indexed on the compact interval $\cW$. Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) allows us to ``glue together'' $\big(\bA_n, \bV_n, E_n\big)$ and $\big(E_n', \tilde Z_n^{E\prime}\big)$ in the desired way on another new probability space, giving $\big(\bA_n', \bV_n', E_n', \tilde Z_n^{E\prime}\big)$, retaining the single prime notation for clarity. The trajectories of the conditionally Gaussian processes $\tilde Z_n^{E\prime}$ depend on the choice of $t_n$, necessitating the use of a divergent sequence $R_n$ to establish bounds in probability. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_conditional_strong_approx_En}] See Lemma~\ref{lem:kernel_app_conditional_strong_approx_En} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}] \proofparagraph{defining $Z_n^{E\dprime}$} Pick $\delta_n \to 0$ with $\log 1/\delta_n \lesssim \log n$. Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality $\Leb(\cW)/\delta_n$ which is also a $\delta_n$-packing. Let $\tilde Z_{n,\delta}^{E\prime}$ be the restriction of $\tilde Z_n^{E\prime}$ to $\cW_\delta$. Let $\tilde \Sigma_n^E(w, w') = \E\big[\tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') \mid \bA_n' \big]$ be the conditional covariance function of $\tilde Z_n^{E\prime}$, and define $\Sigma_n^E(w,w') = \E\big[\tilde \Sigma_n^E(w,w')\big]$. Let $\tilde \Sigma^E_{n,\delta}$ and $\Sigma^E_{n,\delta}$ be the restriction matrices of $\tilde \Sigma^E_n$ and $\Sigma^E_n$ to $\cW_\delta \times \cW_\delta$, noting that, as (conditional) covariance matrices, these are (almost surely) positive semi-definite. Let $N \sim \cN(0, I_{|\cW_\delta|})$ be independent of $\bA_n'$, and define using the matrix square root $\tilde Z_{n,\delta}^{E\dprime} = \big(\tilde \Sigma^E_{n,\delta})^{1/2} N$, which has the same distribution as $\tilde Z_{n,\delta}^{E\prime}$, conditional on $\bA_n'$. Extend it using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to the compact interval $\cW$, giving a conditionally Gaussian process $\tilde Z_n^{E\dprime}$ which has the same distribution as $\tilde Z_{n}^{E\prime}$, conditional on $\bA_n'$. Define $Z_{n,\delta}^{E\dprime} = \big(\Sigma^E_{n,\delta})^{1/2} N$, noting that this is independent of $\bA_n'$, and extend it using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to a Gaussian process $Z_n^{E\dprime}$ on the compact interval $\cW$, which is independent of $\bA_n'$ and has covariance structure given by $\Sigma_n^E$. \proofparagraph{closeness of $Z_n^{E\dprime}$ and $\tilde Z_n^{E\dprime}$ on the mesh} Note that conditionally on $\bA_n'$, $\tilde Z_{n,\delta}^{E\dprime} - Z_{n,\delta}^{E\dprime}$ is a length-$|\cW_\delta|$ Gaussian random vector with covariance matrix $\big( \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} - \big(\Sigma^E_{n,\delta}\big)^{1/2} \big)^2$. 
So by the Gaussian maximal inequality in Lemma~\ref{lem:kernel_app_gaussian_vector_maximal} applied conditionally on $\bA_n'$, % \begin{align*} \E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \Bigm| \bA_n' \right] &\lesssim \sqrt{\log n} \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2^{1/2}, \end{align*} % since $\log |\cW_\delta| \lesssim \log n$. Next, we apply some U-statistic theory to $\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta}$, with the aim of applying the matrix concentration result for second-order U-statistics presented in Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}. Firstly, we note that since the conditional covariance structures of $\tilde Z_n^{E\prime}$ and $\sqrt{n^2h} E_n$ are equal in distribution, we have, writing $E_n(\cW_\delta)$ for the vector $\big(E_n(w) : w \in \cW_\delta\big)$ and similarly for $k_h(W_{i j}, \cW_\delta)$, % \begin{align*} \tilde\Sigma^E_{n,\delta} &= n^2h \E[E_n(\cW_\delta) E_n(\cW_\delta)^\T \mid \bA_n] \\ &= n^2h \frac{4}{n^2(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \E\left[ \Big( k_h(W_{i j}, \cW_\delta) - \E\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \Big) \right. \\ &\qquad\left. \times\Big( k_h(W_{i j}, \cW_\delta) - \E\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \Big)^\T \bigm\vert \bA_n \right] \\ &= \frac{4h}{(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} u(A_i, A_j), \end{align*} % where we define the random $|\cW_\delta| \times |\cW_\delta|$ matrices % \begin{align*} u(A_i, A_j) &= \E\!\left[ k_h(W_{i j}, \cW_\delta) k_h(W_{i j}, \cW_\delta)^\T \mid \bA_n \right] - \E\!\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \E\!\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right]^\T. \end{align*} % Let $u(A_i) = \E[u(A_i, A_j) \mid A_i]$ and $u = \E[u(A_i, A_j)]$. The decomposition $\tilde \Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} = \tilde L +\tilde Q$ holds by Lemma~\ref{lem:kernel_app_general_hoeffding}, where % \begin{align*} \tilde L &= \frac{4h}{n-1} \sum_{i=1}^n \big( u(A_i) - u \big), &\tilde Q &= \frac{4h}{(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \big( u(A_i, A_j) - u(A_i) - u(A_j) + u \big). \end{align*} % Next, we seek an almost sure upper bound on $\|u(A_i, A_j)\|_2$. Since this is a symmetric matrix, we have by H{\"o}lder's inequality % \begin{align*} \|u(A_i, A_j)\|_2 &\leq \|u(A_i, A_j)\|_1^{1/2} \|u(A_i, A_j)\|_\infty^{1/2} = \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} |u(A_i, A_j)_{kl}|. \end{align*} % The terms on the right hand side can be bounded as follows, writing $w, w'$ for the $k$th and $l$th points in $\cW_\delta$ respectively: % \begin{align*} |u(A_i, A_j)_{kl}| &= \big| \E\left[ k_h(W_{i j}, w) k_h(W_{i j}, w') \mid \bA_n \right] - \E\left[ k_h(W_{i j}, w) \mid \bA_n \right] \E\left[ k_h(W_{i j}, w') \mid \bA_n \right] \big| \\ &\lesssim \E\left[ | k_h(W_{i j}, w) k_h(W_{i j}, w') | \mid \bA_n \right] + \E\left[ | k_h(W_{i j}, w) | \mid \bA_n \right] \E\left[ | k_h(W_{i j}, w') | \mid \bA_n \right] \\ &\lesssim h^{-1} \I\big\{ |w-w'| \leq 2h \big\} + 1 \lesssim h^{-1} \I\big\{ |k-l| \leq 2h/\delta_n \big\} + 1, \end{align*} % where we used that $|w-w'| \geq |k-l| \delta_n$ because $\cW_\delta$ is a $\delta_n$-packing. 
Hence
%
\begin{align*}
\|u(A_i, A_j)\|_2 &\leq \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} |u(A_i, A_j)_{kl}| \lesssim \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} \Big( h^{-1} \I\big\{ |k-l| \leq 2h/\delta_n \big\} + 1 \Big) \\
&\lesssim 1/\delta_n + 1/h + |\cW_\delta| \lesssim 1/\delta_n + 1/h.
\end{align*}
%
Clearly, the same bound holds for $\|u(A_i)\|_2$ and $\|u\|_2$, by Jensen's inequality. Therefore, applying the matrix Bernstein inequality (Lemma~\ref{lem:kernel_app_matrix_bernstein}) to the zero-mean matrix $\tilde L$ gives
%
\begin{align*}
\E\left[ \left\| \tilde L \right\|_2 \right] &\lesssim \frac{h}{n} \left(\frac{1}{\delta_n} + \frac{1}{h} \right) \left( \log |\cW_\delta| + \sqrt{n \log |\cW_\delta|} \right) \lesssim \left(\frac{h}{\delta_n} + 1 \right) \sqrt{\frac{\log n}{n}}.
\end{align*}
%
The matrix U-statistic concentration inequality (Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}) with $\tilde Q$ gives
%
\begin{align*}
\E\left[ \big\| \tilde Q \big\|_2 \right] &\lesssim \frac{h}{n^2} n \left(\frac{1}{\delta_n} + \frac{1}{h} \right) \left( \log |\cW_\delta| \right)^{3/2} \lesssim \left(\frac{h}{\delta_n} + 1 \right) \frac{(\log n)^{3/2}}{n}.
\end{align*}
%
Hence taking a marginal expectation and applying Jensen's inequality,
%
\begin{align*}
&\E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\
&\quad\lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2^{1/2} \right] \lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2 \right]^{1/2} \\
&\quad\lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde L + \tilde Q \right\|_2 \right]^{1/2} \lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde L \right\|_2 + \left\| \tilde Q \right\|_2 \right]^{1/2} \\
&\quad\lesssim \sqrt{\log n} \left( \left(\frac{h}{\delta_n} + 1 \right) \sqrt{\frac{\log n}{n}} + \left(\frac{h}{\delta_n} + 1 \right) \frac{(\log n)^{3/2}}{n} \right)^{1/2} \\
&\quad\lesssim \sqrt{\frac{h}{\delta_n} + 1} \frac{(\log n)^{3/4}}{n^{1/4}}.
\end{align*}

\proofparagraph{regularity of $Z_n^{E\dprime}$ and $\tilde Z_n^{E\dprime}$}

Define the semimetrics
%
\begin{align*}
\rho(w, w')^2 &= \E\left[ \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big|^2 \right], &\tilde\rho(w, w')^2 &= \E\left[ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 \mid \bA_n' \right].
\end{align*}
%
We bound $\tilde \rho$ as follows, since $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ have the same conditional covariance structure:
%
\begin{align*}
\tilde\rho(w, w') &= \E\left[ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 \mid \bA_n' \right]^{1/2} \\
&= \sqrt{n^2 h} \, \E\left[ \big|E_n(w) - E_n(w')\big|^2 \mid \bA_n' \right]^{1/2} \lesssim h^{-1} |w-w'|,
\end{align*}
%
uniformly in $\bA_n'$, where the last line was shown in the proof of Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. Note that also
%
\begin{align*}
\rho(w, w') &= \sqrt{\E[\tilde \rho(w,w')^2]} \lesssim h^{-1} |w-w'|.
\end{align*}
%
Thus Lemma~\ref{lem:kernel_app_gaussian_process_maximal} applies directly to $Z_n^{E\dprime}$ and conditionally to $\tilde Z_n^{E\dprime}$, with $\delta_n \in (0, 1/(2h)]$, demonstrating that
%
\begin{align*}
\E\left[ \sup_{|w-w'| \leq \delta_n} \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| \bigm\vert \bA_n' \right] &\lesssim \int_0^{\delta_n / h} \sqrt{\log (1 / (\varepsilon h))} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h \delta_n}}, \\
\E\left[ \sup_{|w-w'| \leq \delta_n} |Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')| \right] &\lesssim \int_0^{\delta_n / h} \sqrt{\log (1 / (\varepsilon h))} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h \delta_n}}.
\end{align*}
%
Continuity of trajectories follows from this.

\proofparagraph{conclusion}

We use the previous parts to deduce that
%
\begin{align*}
&\E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\
&\quad\lesssim \E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\
&\qquad+ \E\left[ \sup_{|w-w'| \leq \delta_n} \left\{ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| + \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big| \right\} \right] \\
&\quad\lesssim \sqrt{\frac{h}{\delta_n} + 1} \frac{(\log n)^{3/4}}{n^{1/4}} + \frac{\delta_n \sqrt{\log n}}{h}.
\end{align*}
%
Setting $\delta_n = h \left( \frac{\log n}{n} \right)^{1/6}$ gives
%
\begin{align*}
\E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] &\lesssim n^{-1/6} (\log n)^{2/3}.
\end{align*}
%
Independence of $Z_n^{E\dprime}$ and $\bA_n''$ follows by applying the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}), conditionally on $\bA_n'$, to the variables $\big(\bA_n', \tilde Z_n^{E\prime}\big)$ and $\big(\tilde Z_n^{E\dprime}, Z_n^{E\dprime}\big)$.
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_unconditional_strong_approx_En}]
See Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}.
\end{proof}

\begin{proof}[Theorem~\ref{thm:kernel_app_strong_approx_fW}]
We add together the strong approximations for the $L_n$ and $E_n$ terms, and then add an independent Gaussian process to account for the variance of $Q_n$.

\proofparagraph{gluing together the strong approximations}

Let $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$ be the strong approximation for $L_n$ derived in Lemma~\ref{lem:kernel_app_strong_approx_Ln}. Let $\big(\bA_n'', \bV_n'', E_n'', \tilde Z_n^{E\dprime}\big)$ and $\big(\bA_n''', \bV_n''', \tilde Z_n^{E\tprime}, Z_n^{E\tprime}\big)$ be the conditional and unconditional strong approximations for $E_n$ given in Lemmas~\ref{lem:kernel_app_conditional_strong_approx_En} and \ref{lem:kernel_app_unconditional_strong_approx_En} respectively. The first step is to define copies of these variables and processes on the same probability space. This is achieved by applying the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). Dropping the prime notation for clarity, we construct $\big(\bA_n, \bV_n, L_n, Z_n^L, E_n, \tilde Z_n^E, Z_n^E\big)$ with the following properties:
%
\begin{enumerate}[label=(\roman*)]
\item $\sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w)\big| \lesssim_\P n^{-1/2} \log n$,
\item $\sup_{w \in \cW} \big|\sqrt{n^2h} E_n(w) - \tilde Z^E_n(w) \big| \lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$,
\item $\sup_{w \in \cW} \big| \tilde Z^E_n(w) - Z^E_n(w) \big| \lesssim_\P n^{-1/6} (\log n)^{2/3}$,
\item $Z_n^L$ is independent of $Z_n^E$.
\end{enumerate}
%
Note that the independence of $Z_n^L$ and $Z_n^E$ follows since $Z_n^L$ depends only on $\bA_n$ and some independent random noise, while $Z_n^E$ is independent of $\bA_n$. Therefore $(Z_n^L, Z_n^E)$ are jointly Gaussian. To get the strong approximation result for $\hat f_W$, define the Gaussian process
%
\begin{align*}
Z_n^f(w) &= \frac{1}{\sqrt n} Z_n^L(w) + \frac{1}{n} Z_n^Q(w) + \frac{1}{\sqrt{n^2h}} Z_n^E(w),
\end{align*}
%
where $Z_n^Q(w)$ is a mean-zero Gaussian process independent of everything else with covariance
%
\begin{align*}
\E\big[ Z_n^Q(w) Z_n^Q(w') \big] &= n^2 \E\big[ Q_n(w) Q_n(w') \big].
\end{align*}
%
As shown in the proof of Lemma~\ref{lem:kernel_uniform_concentration}, the process $Q_n(w)$ is uniformly Lipschitz and uniformly bounded in $w$. Thus by Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we have $\E\big[\sup_{w \in \cW} |Z_n^Q(w)|\big] \lesssim 1$. Therefore the uniform approximation error is bounded by
%
\begin{align*}
& \sup_{w \in \cW} \big| \hat f_W(w) - \E[\hat f_W(w)] - Z_n^f(w) \big| \\
&\quad= \sup_{w \in \cW} \left| \frac{1}{\sqrt n} Z_n^L(w) + \frac{1}{n} Z_n^Q(w) + \frac{1}{\sqrt{n^2h}} Z_n^E(w) - \Big( L_n(w) + Q_n(w) + E_n(w) \Big) \right| \\
&\quad\leq \sup_{w \in \cW} \bigg( \frac{1}{\sqrt n} \left| Z_n^L(w) - \sqrt{n} L_n(w) \right| + \frac{1}{\sqrt{n^2h}} \left| \tilde Z_n^E(w) - \sqrt{n^2h} E_n(w) \right| \\
&\qquad+ \frac{1}{\sqrt{n^2h}} \left| Z_n^E(w) - \tilde Z_n^E(w) \right| + \big| Q_n(w) \big| + \frac{1}{n} \big| Z_n^Q(w) \big| \bigg) \\
&\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3}.
\end{align*}

\proofparagraph{covariance structure}

Since $L_n$, $Q_n$, and $E_n$ are mutually orthogonal in $L^2$ (as shown in Lemma~\ref{lem:kernel_hoeffding}), we have the following covariance structure:
%
\begin{align*}
\E\big[Z_n^f(w) Z_n^f(w')\big] &= \frac{1}{n} \E\big[ Z_n^L(w) Z_n^L(w') \big] + \frac{1}{n^2} \E\big[ Z_n^Q(w) Z_n^Q(w') \big] + \frac{1}{n^2h} \E\big[ Z_n^E(w) Z_n^E(w') \big] \\
&= \E\big[ L_n(w) L_n(w') \big] + \E\big[ Q_n(w) Q_n(w') \big] + \E\big[ E_n(w) E_n(w') \big] \\
&= \E\big[ \big(\hat f_W(w) - \E[\hat f_W(w)]\big) \big(\hat f_W(w') - \E[\hat f_W(w')]\big) \big].
\end{align*}

\proofparagraph{trajectory regularity}

The trajectory regularity of the process $Z_n^f$ follows directly by adding the regularities of the processes $\frac{1}{\sqrt n} Z_n^L$, $\frac{1}{n} Z_n^Q$, and $\frac{1}{\sqrt{n^2h}} Z_n^E$. Similarly, $Z_n^f$ has continuous trajectories.
\end{proof}

\begin{proof}[Theorem~\ref{thm:kernel_strong_approx_Tn}]
Define $Z_n^T(w) = \frac{Z_n^f(w)}{\sqrt{\Sigma_n(w,w)}}$ so that
%
\begin{align*}
\left| T_n(w) - Z_n^T(w) \right| &= \frac{\big| \hat f_W(w) - f_W(w) - Z_n^f(w) \big|}{\sqrt{\Sigma_n(w,w)}}.
\end{align*}
%
By Theorems~\ref{thm:kernel_app_strong_approx_fW} and \ref{thm:kernel_bias}, the numerator can be bounded above by
%
\begin{align*}
&\sup_{w \in \cW} \left| \hat f_W(w) - f_W(w) - Z_n^f(w) \right| \\
&\quad\leq \sup_{w \in \cW} \left| \hat f_W(w) - \E\big[\hat f_W(w)\big] - Z_n^f(w) \right| + \sup_{w \in \cW} \left| \E\big[\hat f_W(w)\big] - f_W(w) \right| \\
&\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3} + h^{p \wedge \beta}.
\end{align*}
%
By Lemma~\ref{lem:kernel_variance_bounds} with $\inf_\cW f_W(w) > 0$, the denominator is bounded below by
%
\begin{align*}
\inf_{w \in \cW} \sqrt{\Sigma_n(w,w)} &\gtrsim \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}},
\end{align*}
%
and the result follows.
\end{proof}

\begin{proof}[Theorem~\ref{thm:kernel_infeasible_ucb}]
Note that the covariance structure of $Z_n^T$ is given by
%
\begin{align*}
\Cov\big[ Z_n^T(w), Z_n^T(w') \big] &= \frac{\Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}.
\end{align*}
%
We apply an anti-concentration result to establish that all quantiles of $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ exist. To do this, we must first establish regularity properties of $Z_n^T$.

\proofparagraph{$L^2$ regularity of $Z_n^T$}

Writing $k_{i j}'$ for $k_h(W_{i j},w')$ etc., note that by Lemma~\ref{lem:kernel_app_covariance_structure},
%
\begin{align*}
&\big| \Sigma_n(w,w') - \Sigma_n(w, w'') \big| \\
&\quad= \left| \frac{2}{n(n-1)} \Cov\big[ k_{i j}, k_{i j}' \big] + \frac{4(n-2)}{n(n-1)} \Cov\big[ k_{i j}, k_{i r}' \big] \right. \\
&\left. \quad\qquad- \frac{2}{n(n-1)} \Cov\big[ k_{i j}, k_{i j}'' \big] - \frac{4(n-2)}{n(n-1)} \Cov\big[ k_{i j}, k_{i r}'' \big] \right| \\
&\quad\leq \frac{2}{n(n-1)} \Big| \Cov\big[ k_{i j}, k_{i j}' - k_{i j}'' \big] \Big| + \frac{4(n-2)}{n(n-1)} \Big| \Cov\big[ k_{i j}, k_{i r}' - k_{i r}'' \big] \Big| \\
&\quad\leq \frac{2}{n(n-1)} \|k_{i j}\|_\infty \|k_{i j}' - k_{i j}''\|_\infty + \frac{4(n-2)}{n(n-1)} \|k_{i j}\|_\infty \|k_{i r}' - k_{i r}''\|_\infty \\
&\quad\leq \frac{4}{n h^3} C_\rk C_\rL |w'-w''| \lesssim n^{-1}h^{-3} |w'-w''|
\end{align*}
%
uniformly in $w, w', w'' \in \cW$. Therefore, by Lemma~\ref{lem:kernel_variance_bounds}, with $\delta_n \leq n^{-2} h^2$, we have
%
\begin{align*}
\inf_{|w-w'| \leq \delta_n} \Sigma_n(w,w') &\gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - n^{-1} h^{-3} \delta_n \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{1}{n^3h} \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}, \\
\sup_{|w-w'| \leq \delta_n} \Sigma_n(w,w') &\lesssim \frac{\Du^2}{n} + \frac{1}{n^2h} + n^{-1} h^{-3} \delta_n \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h} + \frac{1}{n^3h} \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}.
\end{align*}
%
The $L^2$ regularity of $Z_n^T$ is
%
\begin{align*}
\E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right] &= 2 - 2 \frac{\Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}.
\end{align*}
%
Applying the elementary result that for $a,b,c > 0$,
%
\begin{align*}
1 - \frac{a}{\sqrt{b c}} &= \frac{b(c-a) + a(b-a)}{\sqrt{b c}\big(\sqrt{b c} + a\big)},
\end{align*}
%
which follows upon multiplying out, since $b(c-a) + a(b-a) = bc - a^2 = \big(\sqrt{bc} - a\big)\big(\sqrt{bc} + a\big)$, with $a = \Sigma_n(w,w')$, $b = \Sigma_n(w,w)$, and $c = \Sigma_n(w',w')$, and noting $|c-a| \lesssim n^{-1} h^{-3} |w-w'|$ and $|b-a| \lesssim n^{-1} h^{-3} |w-w'|$ and $\frac{\Dl^2}{n} + \frac{1}{n^2h} \lesssim a,b,c \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}$, yields
%
\begin{align*}
\E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right] &\lesssim \frac{(\Du^2/n + 1/(n^2h))n^{-1}h^{-3}|w-w'|}{(\Dl^2/n + 1/(n^2h))^2} \\
&\lesssim \frac{n^{-2} h^{-4}|w-w'|}{n^{-4}h^{-2}} \lesssim n^2 h^{-2} |w-w'|.
\end{align*}
%
Thus the semimetric induced by $Z_n^T$ on $\cW$ is
%
\begin{align*}
\rho(w,w') &\vcentcolon= \E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right]^{1/2} \lesssim n h^{-1} \sqrt{|w-w'|}.
\end{align*} \proofparagraph{trajectory regularity of $Z_n^T$} By the bound on $\rho$ from the previous part, we deduce the covering number bound % \begin{align*} N(\varepsilon, \cW, \rho) &\lesssim N\big( \varepsilon, \cW, n h^{-1} \sqrt{|\cdot|} \big) \lesssim N\big( n^{-1} h \varepsilon, \cW, \sqrt{|\cdot|} \big) \\ &\lesssim N\big( n^{-2} h^2 \varepsilon^2, \cW, |\cdot| \big) \lesssim n^2 h^{-2} \varepsilon^{-2}. \end{align*} % Now apply the Gaussian process regularity result from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}. % \begin{align*} \E\left[ \sup_{\rho(w,w') \leq \delta} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim \int_0^{\delta} \sqrt{\log N(\varepsilon, \cW, \rho)} \diff{\varepsilon} \lesssim \int_0^{\delta} \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} \diff{\varepsilon} \\ &\lesssim \int_0^{\delta} \left( \sqrt{\log n} + \sqrt{\log 1/\varepsilon} \right) \diff{\varepsilon} \lesssim \delta \left( \sqrt{\log n} + \sqrt{\log 1/\delta} \right), \end{align*} % and so % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim \E\left[ \sup_{\rho(w,w') \leq n h^{-1} \delta_n^{1/2}} \big| Z_n^T(w) - Z_n^T(w') \big| \right] \lesssim n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{existence of the quantile} Apply the Gaussian anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration}, noting that $Z_n^T$ is separable, mean-zero, and has unit variance: % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) &\leq 8 \varepsilon_n \left( 1 + \E\left[ \sup_{w \in \cW} \big| Z_n^T(w) \big| \right] \right). \end{align*} % To bound the supremum on the right hand side, apply the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal} with $\sigma \leq 1$ and $N(\varepsilon, \cW, \rho) \lesssim n^2 h^{-2} \varepsilon^{-2}$: % \begin{align*} \E\left[ \sup_{w \in \cW} \big|Z_n^T(w)\big| \right] &\lesssim 1 + \int_0^{2} \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} \diff{\varepsilon} \lesssim \sqrt{\log n}. \end{align*} % Therefore % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq \varepsilon \right) &\lesssim \varepsilon \sqrt{\log n}. \end{align*} % Letting $\varepsilon \to 0$ shows that the distribution function of $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ is continuous, and therefore all of its quantiles exist. \proofparagraph{validity of the infeasible uniform confidence band} Under Assumption~\ref{ass:kernel_rates} and with a sufficiently slowly diverging sequence $R_n$, the strong approximation rate established in Theorem~\ref{thm:kernel_strong_approx_Tn} is % \begin{align*} &\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ &\quad\lesssim_\P \frac{ n^{-1/2} \log n + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-2/3} h^{-1/2} (\log n)^{2/3} + n^{1/2} h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \ll \frac{1}{\sqrt{\log n}}. \end{align*} % So by Lemma~\ref{lem:kernel_app_slow_convergence}, take $\varepsilon_n$ such that % \begin{align*} \P \left( \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| > \varepsilon_n \right) &\leq \varepsilon_n \sqrt{\log n} \end{align*} % and $\varepsilon_n \sqrt{\log n} \to 0$. 
So by the previously established anti-concentration result,
%
\begin{align*}
&\P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) \\
&\quad= \P\left( \sup_{w \in \cW} \left| T_n(w) \right| \leq q_{1-\alpha} \right) \\
&\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} + \varepsilon_n \right) + \P \left( \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| > \varepsilon_n \right) \\
&\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - q_{1-\alpha} \right| \leq \varepsilon_n \right) + \varepsilon_n \sqrt{\log n} \\
&\quad\leq 1 - \alpha + 2 \varepsilon_n \sqrt{\log n}.
\end{align*}
%
The lower bound follows analogously:
%
\begin{align*}
&\P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) \\
&\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} - \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} \\
&\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - q_{1-\alpha} \right| \leq \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} \\
&\quad\geq 1 - \alpha - 2 \varepsilon_n \sqrt{\log n}.
\end{align*}
%
Finally, since $\varepsilon_n \sqrt{\log n} \to 0$, we conclude that
%
\begin{align*}
\left| \P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) - (1 - \alpha) \right| &\to 0.
\end{align*}
\end{proof}

Before proving Lemma~\ref{lem:kernel_app_covariance_estimation}, we provide the following useful concentration inequality. This is essentially a corollary of the U-statistic concentration inequality given in Theorem~3.3 in \citet{gine2000exponential}.

\begin{lemma}[A concentration inequality]
\label{lem:kernel_app_dyadic_concentration}
Let $X_{i j}$ be mutually independent for $1 \leq i < j \leq n$ taking values in a measurable space $\cX$. Let $h_1$, $h_2$ be measurable functions from $\cX$ to $\R$ satisfying the following for all $i$ and $j$.
%
\begin{align*}
\E\big[h_1(X_{i j})\big] &= 0, &\E\big[h_2(X_{i j})\big] &= 0, \\
\E\big[h_1(X_{i j})^2\big] &\leq \sigma^2, &\E\big[h_2(X_{i j})^2\big] &\leq \sigma^2, \\
\big|h_1(X_{i j})\big| &\leq M, &\big|h_2(X_{i j})\big| &\leq M.
\end{align*}
%
Consider the sum
%
\begin{align*}
S_n &= \sum_{1 \leq i < j < r \leq n} h_1(X_{i j}) h_2(X_{i r}).
\end{align*}
%
Then $S_n$ satisfies the concentration inequality
%
\begin{align*}
\P\big( |S_n| \geq t \big) &\leq C \exp\left( -\frac{1}{C} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right)
\end{align*}
%
for some universal constant $C > 0$ and for all $t>0$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_dyadic_concentration}]
We proceed in three main steps. Firstly, we write $S_n$ as a second-order U-statistic where we use double indices instead of single indices. Then we use a decoupling result to introduce extra independence. Finally, a concentration result is applied to the decoupled U-statistic.
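Though not used verbatim below, it may help to record the strength of this bound: requiring each argument of the minimum to be at least $\log n$ shows that, with probability at least $1 - C n^{-1/C}$,
%
\begin{align*}
|S_n| &\lesssim \sqrt{n^3 \sigma^4 \log n} + \sqrt{n^3 \sigma^4} \, \log n + n M \sigma (\log n)^{3/2} + M^2 (\log n)^2.
\end{align*}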
\proofparagraph{writing $S_n$ as a second-order U-statistic}

Note that we can write $S_n$ as the second-order U-statistic
%
\begin{align*}
S_n &= \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} h_{i j q r} (X_{i j}, X_{qr}),
\end{align*}
%
where
%
\begin{align*}
h_{i j q r} (a,b) &= h_1(a) h_2(b) \, \I\{i = q\} \, \I\{j < r\}.
\end{align*}

\proofparagraph{decoupling}

By a decoupling inequality for second-order U-statistics, there exists a universal constant $C_1 > 0$ satisfying
%
$\P\big( |S_n| \geq t \big) \leq C_1 \P\big( C_1 |\tilde S_n| \geq t \big)$,
%
where
%
$\tilde S_n = \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} h_{i j q r} (X_{i j}, X'_{qr})$,
%
with $(X'_{i j})$ an independent copy of $(X_{i j})$.

\proofparagraph{U-statistic concentration}

The U-statistic kernel $h_{i j q r}(X_{i j}, X'_{qr})$ is totally degenerate in that
%
$ \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X_{i j}] = \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X'_{qr}] = 0$.
%
Define and bound the following quantities:
%
\pagebreak
%
\begin{align*}
A &= \max_{i j q r} \|h_{i j q r}(X_{i j}, X'_{qr})\|_\infty \leq M^2, \\
B &= \max \left\{ \left\| \sum_{1 \leq i < j \leq n} \E\Big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \mid X_{i j} \Big] \right\|_\infty, \left\| \sum_{1 \leq q < r \leq n} \E\Big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \mid X'_{qr} \Big] \right\|_\infty \right\}^{1/2} \\
&= \max \left\{ \left\| \sum_{1 \leq i < j \leq n} h_1(X_{i j})^2 \E\big[ h_2(X_{qr}')^2 \big] \I\{i = q\} \I\{j < r\} \right\|_\infty, \left\| \sum_{1 \leq q < r \leq n} \E\big[ h_1(X_{i j})^2 \big] h_2(X_{qr}')^2 \I\{i = q\} \I\{j < r\} \right\|_\infty \right\}^{1/2} \\
&\leq \big( n M^2 \sigma^2 \big)^{1/2} \leq n M \sigma, \\
C^2 &= \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} \E\Big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \Big] \leq n^3 \sigma^4, \\
D &\leq C \leq \sqrt{n^3 \sigma^4},
\end{align*}
%
where $D$ denotes the bilinear-form norm appearing in that result, bounded by $C$ via the Cauchy--Schwarz inequality. Then by Theorem~3.3 in \citet{gine2000exponential}, for some universal constant $C_2 > 0$ and for all $t > 0$,
%
\begin{align*}
\P\left( |\tilde S_n| \geq t \right) &\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{C^2}, \frac{t}{D}, \frac{t^{2/3}}{B^{2/3}}, \frac{t^{1/2}}{A^{1/2}} \right\} \right) \\
&\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right).
\end{align*}

\proofparagraph{conclusion}

By the previous parts and absorbing constants into a new constant $C > 0$, we therefore have
%
\begin{align*}
\P\left( |S_n| \geq t \right) &\leq C_1 \P\left( C_1 |\tilde S_n| \geq t \right) \\
&\leq C_1 C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{n^3 \sigma^4 C_1^2}, \frac{t}{\sqrt{n^3 \sigma^4 C_1}}, \frac{t^{2/3}}{(n M \sigma C_1)^{2/3}}, \frac{t^{1/2}}{M C_1^{1/2}} \right\} \right) \\
&\leq C \exp\left( -\frac{1}{C} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right).
\end{align*}
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_estimation}]
Throughout this proof we will write $k_{i j}$ for $k_h(W_{i j},w)$ and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. Similarly, we write $S_{i j r}$ to denote $S_{i j r}(w,w')$. The estimand and estimator are reproduced below for clarity.
%
\begin{align*}
\Sigma_n(w,w') &= \frac{2}{n(n-1)} \E[k_{i j} k_{i j}'] + \frac{4(n-2)}{n(n-1)} \E[k_{i j} k_{i r}'] - \frac{4n-6}{n(n-1)} \E[k_{i j}] \E[k_{i j}'], \\
\hat \Sigma_n(w,w') &= \frac{2}{n(n-1)} \frac{2}{n(n-1)} \sum_{i<j} k_{i j} k_{i j}' + \frac{4(n-2)}{n(n-1)} \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r} - \frac{4n-6}{n(n-1)} \hat f_W(w) \hat f_W(w').
\end{align*}

\proofparagraph{concentration of the $\hat f_W(w) \hat f_W(w')$ term}

Let $\cF$ be the class of functions of $A_i$ given by the summands of the projections $\sqrt{n} L_n(w)$, normalized by $\sqrt{\Sigma_n(w,w)}$ and indexed by $w \in \cW$. Since $\inf_{w \in \cW} \Sigma_n(w,w) > 0$ and since $n h \gtrsim \log n$, the class $\cF$ has a constant envelope function given by $F(a) \lesssim \sqrt{n h}$. Clearly, $M = \sup_a F(a) \lesssim \sqrt{n h}$. Also by definition of $\Sigma_n$ and orthogonality of $L_n$, $Q_n$, and $E_n$, we have $\sup_{f \in \cF} \E[f(A_i)^2] \leq \sigma^2 = 1$. To verify a VC-type condition on $\cF$ we need to establish the regularity of the process; such a condition amounts to a polynomial-in-$1/\varepsilon$ bound on the covering numbers of $\cF$, which the Lipschitz bound below provides.
By Lipschitz properties of $L_n$ and $\Sigma_n$ derived in the proofs of Lemma~\ref{lem:kernel_uniform_concentration} and Theorem~\ref{thm:kernel_infeasible_ucb} respectively, we have
%
\begin{align*}
\left| \frac{L_n(w)}{\sqrt{\Sigma_n(w,w)}} - \frac{L_n(w')}{\sqrt{\Sigma_n(w',w')}} \right| &\lesssim \frac{\big|L_n(w) - L_n(w')\big|}{\sqrt{\Sigma_n(w,w)}} + \left| L_n(w') \right| \left| \frac{1}{\sqrt{\Sigma_n(w,w)}} - \frac{1}{\sqrt{\Sigma_n(w',w')}} \right| \\
&\lesssim \sqrt{n^2h} |w-w'| + \left| \frac{\Sigma_n(w,w) - \Sigma_n(w',w')}{\Sigma_n(w,w)\sqrt{\Sigma_n(w',w')}} \right| \\
&\lesssim \sqrt{n^2h} |w-w'| + (n^2h)^{3/2} \left| \Sigma_n(w,w) - \Sigma_n(w',w') \right| \\
&\lesssim \sqrt{n^2h} |w-w'| + (n^2h)^{3/2} n^{-1} h^{-3} |w-w'| \lesssim n^4 |w-w'|,
\end{align*}
%
uniformly over $w,w' \in \cW$. By compactness of $\cW$ we have the covering number bound
%
$N(\cF, \|\cdot\|_\infty, \varepsilon) \lesssim N(\cW, |\cdot|, n^{-4} \varepsilon) \lesssim n^4 \varepsilon^{-1}$.
%
Thus by Lemma~\ref{lem:kernel_app_maximal_vc_inid},
%
\begin{align*}
\E \left[ \sup_{w \in \cW} \left| \frac{L_n(w)}{\sqrt{\Sigma_n(w,w)}} \right| \right] &\lesssim \sqrt{\log n} + \frac{\sqrt{n h} \log n}{\sqrt{n}} \lesssim \sqrt{\log n}.
\end{align*}
%
Therefore
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{ \hat f_W(w) \hat f_W(w') - \E\big[k_{i j}\big] \E\big[k_{i j}'\big]}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \sqrt{\log n}.
\end{align*}

\proofparagraph{decomposition of the $S_{i j r}$ term}

We first decompose the $S_{i j r}$ term into two parts, and obtain a pointwise concentration result for each. This is extended to a uniform concentration result by considering the regularity of the covariance estimator process. Note that $\E[S_{i j r}] = \E[k_{i j} k_{i r}']$, and hence
%
\begin{align*}
\frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \big( S_{i j r} - \E[S_{i j r}] \big) &= \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r}^{(1)} + \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r}^{(2)},
\end{align*}
%
where $S_{i j r}^{(1)} = S_{i j r} - \E[S_{i j r} \mid \bA_n]$ and $S_{i j r}^{(2)} = \E[S_{i j r} \mid \bA_n] - \E[S_{i j r}]$.

\proofparagraph{pointwise concentration of the $S_{i j r}^{(1)}$ term}

By symmetry of $S_{i j r}$ in its indices, it suffices to treat the representative summand $k_{i j} k_{i r}'$. Since $k_{i j}$ and $k_{i r}'$ are conditionally independent given $\bA_n$, centering conditionally on $\bA_n$ yields
%
\begin{align}
&\frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \Big( k_{i j} k_{i r}' - \E[ k_{i j} k_{i r}' \mid \bA_n] \Big) \nonumber \\
&\quad= \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \Big( k_{i r}' - \E[k_{i r}' \mid \bA_n] \Big) \label{eq:kernel_app_Sijr1_decomp1} \\
&\qquad+ \frac{2}{(n-1)(n-2)} \sum_{i=1}^{n-2} \sum_{j=i+1}^{n-1} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \cdot \frac{3}{n} \sum_{r=j+1}^{n} \E[k_{i r}' \mid \bA_n] \label{eq:kernel_app_Sijr1_decomp2} \\
&\qquad+ \frac{2}{(n-1)(n-2)} \sum_{i=1}^{n-2} \sum_{r=i+2}^{n} \Big( k_{i r}' - \E[k_{i r}' \mid \bA_n] \Big) \cdot \frac{3}{n} \sum_{j=i+1}^{r-1} \E[k_{i j} \mid \bA_n]. \label{eq:kernel_app_Sijr1_decomp3}
\end{align}
%
For the term in \eqref{eq:kernel_app_Sijr1_decomp1}, we apply Lemma~\ref{lem:kernel_app_dyadic_concentration} conditionally on $\bA_n$, with the conditionally centered kernels playing the roles of $h_1$ and $h_2$; the bounds $\E[k_{i j}^2 \mid \bA_n] \lesssim h^{-1}$ and $|k_{i j}| \lesssim h^{-1}$ permit the choices $\sigma^2 \lesssim h^{-1}$ and $M \lesssim h^{-1}$, giving for some constant $C_1 > 0$:
%
\begin{align*}
&\P\left( \left| \sum_{i<j<r} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \Big( k_{i r}' - \E[k_{i r}' \mid \bA_n] \Big) \right| > t \biggm\vert \bA_n \right) \\
&\quad\leq C_1 \exp\left( -\frac{1}{C_1} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right) \\
&\quad\leq C_1 \exp\left( -\frac{1}{C_1} \min \left\{ \frac{t^2 h^2}{n^3}, \frac{t h}{\sqrt{n^3}}, \frac{t^{2/3} h}{n^{2/3}}, t^{1/2} h \right\} \right),
\end{align*}
%
and therefore with $t \geq 1$ and since $n h \gtrsim \log n$, introducing and adjusting a new constant $C_2$ where necessary,
%
\begin{align*}
&\P\left( \left| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \Big( k_{i r}' - \E[k_{i r}' \mid \bA_n] \Big) \right| > t \frac{\log n}{\sqrt{n^3 h^2}} \Bigm\vert \bA_n \right) \\
&\quad\leq \P\left( \left| \sum_{i<j<r} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \Big( k_{i r}' - \E[k_{i r}' \mid \bA_n] \Big) \right| > t n^{3/2} h^{-1} \log n / 24 \Bigm\vert \bA_n \right) \\
&\quad\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ (t \log n)^2, t \log n, (t \log n)^{2/3} (n h)^{1/3}, (t n h \log n)^{1/2} n^{1/4} \right\} \right) \\
&\quad\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ t \log n, t \log n, t^{2/3} \log n, t^{1/2} n^{1/4} \log n \right\} \right) \\
&\quad= C_2 \exp\left( -\frac{t^{2/3} \log n}{C_2} \right) = C_2 n^{-t^{2/3} / C_2}.
\end{align*}
%
Now for the term in \eqref{eq:kernel_app_Sijr1_decomp2}, note that $\frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n]$ is $\bA_n$-measurable and bounded uniformly in $i,j$.
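To spell out this uniform boundedness, using the conditional kernel moment bound $\E[|k_h(W_{i r}, w')| \mid \bA_n] \lesssim 1$, as in the proof of Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En},
%
\begin{align*}
\Bigg| \frac{3}{n} \sum_{r=j+1}^{n} \E[k_{i r}' \mid \bA_n] \Bigg| &\leq \frac{3}{n} \sum_{r=j+1}^{n} \E\big[ |k_{i r}'| \bigm\vert \bA_n \big] \lesssim \frac{3(n-j)}{n} \lesssim 1.
\end{align*}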
Also, using the previously established conditional variance and almost sure bounds on $k_{i j}$, Bernstein's inequality (Lemma~\ref{lem:kernel_app_bernstein}) applied conditionally gives for some constant $C_3 > 0$
%
\begin{align*}
&\P\left( \Bigg| \frac{2}{(n-1)(n-2)} \sum_{i=1}^{n-2} \sum_{j=i+1}^{n-1} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \cdot \frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n] \Bigg| > t \sqrt{\frac{\log n}{n^2h}} \Bigm\vert \bA_n \right) \\
&\qquad\leq 2 \exp \left( - \frac{t^2 n^2 \log n / (n^2h)}{C_3/(2h) + C_3 t \sqrt{\log n / (n^2h)} / (2h)} \right) \\
&\qquad= 2 \exp \left( - \frac{t^2 \log n}{C_3/2 + C_3 t \sqrt{\log n / (n^2h)} / 2} \right) \leq 2 \exp \left( - \frac{t^2 \log n}{C_3} \right) = 2 n^{-t^2 / C_3}.
\end{align*}
%
The term in \eqref{eq:kernel_app_Sijr1_decomp3} is controlled in exactly the same way. Putting these together, noting the symmetry in $i,j,r$ and taking a marginal expectation, we obtain the unconditional pointwise concentration inequality
%
\begin{align*}
\P\left( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r}^{(1)} \Bigg| > t \frac{\log n}{\sqrt{n^3h^2}} + t \sqrt{\frac{\log n}{n^2h}} \right) &\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)}.
\end{align*}
%
Multiplying by $\big(\Sigma_n(w,w) + \Sigma_n(w',w')\big)^{-1/2} \lesssim \sqrt{n^2h}$ gives (adjusting constants if necessary)
%
\begin{align*}
&\P\left( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \frac{S_{i j r}^{(1)}}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \frac{\log n}{\sqrt{n h}} + t \sqrt{\log n} \right) \\
&\quad\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)}.
\end{align*}

\proofparagraph{pointwise concentration of the $S_{i j r}^{(2)}$ term}

We apply the U-statistic concentration inequality from Lemma~\ref{lem:kernel_app_ustat_concentration}. Note that the terms $\E[S_{i j r} \mid \bA_n]$ are permutation-symmetric functions of the random variables $A_i, A_j$, and $A_r$ only, making $S_{i j r}^{(2)}$ the summands of a (non-degenerate) mean-zero third-order U-statistic. While we could apply a third-order Hoeffding decomposition here to achieve degeneracy, it is unnecessary as Lemma~\ref{lem:kernel_app_ustat_concentration} is general enough to deal with the non-degenerate case directly. The quantity of interest here is
%
\begin{align*}
\frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r}^{(2)},
\end{align*}
%
whose summands satisfy $\big| S_{i j r}^{(2)} \big| \lesssim 1$, by conditional independence of the kernel factors given $\bA_n$, and $\E\big[ (S_{i j r}^{(2)})^2 \big] \lesssim n \Sigma_n(w,w) + n \Sigma_n(w',w')$, by the covariance structure in Lemma~\ref{lem:kernel_app_covariance_structure}. Hence
%
\begin{align*}
&\P\left( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} S_{i j r}^{(2)} \Bigg| > t \sqrt{\log n} \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')} \right) \\
&\quad\leq 4 \exp \left( - \frac{n t^2 (\Sigma_n(w,w) + \Sigma_n(w',w')) \log n}{C_4 (n\Sigma_n(w,w) + n\Sigma_n(w',w')) + C_4 t \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}\sqrt{\log n}} \right) \\
&\quad\leq 4 \exp \left( - \frac{t^2 \log n}{C_4 + C_4 t (\Sigma_n(w,w) + \Sigma_n(w',w'))^{-1/2} \sqrt{\log n} / n} \right) \\
&\quad\leq 4 \exp \left( - \frac{t^2 \log n}{C_4 + C_4 t \sqrt{h}} \right) \leq 4 n^{-t^2 / C_4}
\end{align*}
%
for some universal constant $C_4 > 0$ (which may change from line to line), since the order of this U-statistic is fixed at three.

\proofparagraph{concentration of the $S_{i j r}$ term on a mesh}

Pick $\delta_n \to 0$ with $\log 1/\delta_n \lesssim \log n$. Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality $O(1/\delta_n)$. Then $\cW_\delta \times \cW_\delta$ is a $2\delta_n$-covering of $\cW \times \cW$ with cardinality $O(1/\delta_n^2)$, under the Manhattan metric $d\big((w_1, w_1'), (w_2, w_2')\big) = |w_1 - w_2| + |w_1' - w_2'|$. By the previous parts, we have that for fixed $w$ and $w'$:
%
\begin{align*}
&\P\Bigg( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \frac{S_{i j r} - \E[S_{i j r}]}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \frac{\log n}{\sqrt{n h}} + 2t \sqrt{\log n} \Bigg) \\
&\quad\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)} + 4 n^{-t^2 / C_4}.
\end{align*}
%
Taking a union bound over $\cW_\delta \times \cW_\delta$, noting that $n h \gtrsim \log n$ and adjusting constants gives
%
\begin{align*}
&\P\Bigg( \sup_{w, w' \in \cW_\delta} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \frac{S_{i j r} - \E[S_{i j r}]}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \sqrt{\log n} \Bigg) \\
&\quad\lesssim \delta_n^{-2} \Big( C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)} + 4 n^{-t^2 / C_4} \Big) \lesssim \delta_n^{-2} n^{-t^{2/3} / C_5},
\end{align*}
%
for some constant $C_5 > 0$.

\proofparagraph{regularity of the $S_{i j r}$ term}

Next we bound the fluctuations in $S_{i j r}(w,w')$. Writing $k_{i j}(w)$ for $k_h(W_{i j},w)$, note that
%
\begin{align*}
\big| k_{i j}(w_1) k_{i r}(w_1') - k_{i j}(w_2) k_{i r}(w_2') \big| &\lesssim \frac{1}{h} \big| k_{i j}(w_1) - k_{i j}(w_2) \big| + \frac{1}{h} \big| k_{i r}(w_1') - k_{i r}(w_2') \big| \\
&\lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big),
\end{align*}
%
by the Lipschitz property of the kernel, and similarly for the other summands in $S_{i j r}$. Therefore,
%
\begin{align*}
\sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| S_{i j r}(w_1, w_1') - S_{i j r}(w_2, w_2') \big| &\lesssim \delta_n h^{-3}.
\end{align*}
%
Also as noted in the proof of Theorem~\ref{thm:kernel_infeasible_ucb},
%
\begin{align*}
\sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| \Sigma_n(w_1,w_1') - \Sigma_n(w_2, w_2') \big| &\lesssim \delta_n n^{-1}h^{-3}.
\end{align*}
%
Therefore, since $\sqrt{\Sigma_n(w,w)} \gtrsim 1/\sqrt{n^2h}$ and $|S_{i j r}| \lesssim h^{-2}$, using the identity $\frac{a}{\sqrt b} - \frac{c}{\sqrt d} = \frac{a-c}{\sqrt b} + c \frac{d-b}{\sqrt{b d} \big(\sqrt{b} + \sqrt{d}\big)}$,
%
\begin{align*}
&\sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \left| \frac{S_{i j r}(w_1, w_1')}{\sqrt{\Sigma_n(w_1,w_1) + \Sigma_n(w_1',w_1')}} - \frac{S_{i j r}(w_2, w_2')}{\sqrt{\Sigma_n(w_2,w_2) + \Sigma_n(w_2',w_2')}} \right| \\
&\quad\lesssim \delta_n h^{-3} \sqrt{n^2h} + h^{-2} \delta_n n^{-1} h^{-3} (n^2h)^{3/2} \lesssim \delta_n n h^{-5/2} + \delta_n n^{2} h^{-7/2} \lesssim \delta_n n^{6},
\end{align*}
%
where in the last line we use that $1/h \lesssim n$.

\proofparagraph{uniform concentration of the $S_{i j r}$ term}

By setting $\delta_n = n^{-6} \sqrt{\log n}$, the fluctuations can be at most $\sqrt{\log n}$, so we have for $t \geq 1$
%
\begin{align*}
&\P\Bigg( \sup_{w, w' \in \cW} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \frac{S_{i j r} - \E[S_{i j r}]}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > 2t \sqrt{\log n} \Bigg) \\
&\quad\lesssim \delta_n^{-2} n^{-t^{2/3} / C_5} \lesssim n^{12-t^{2/3} / C_5}.
\end{align*}
%
This converges to zero for any sufficiently large $t$, so
%
\begin{align*}
\sup_{w, w' \in \cW} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i<j<r} \frac{S_{i j r} - \E[S_{i j r}]}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| &\lesssim_\P \sqrt{\log n}.
\end{align*}

\proofparagraph{decomposition of the $k_{i j} k_{i j}'$ term}

Analogously to the $S_{i j r}$ term, decompose
%
\begin{align*}
k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] &= S_{i j}^{(1)} + S_{i j}^{(2)}, &S_{i j}^{(1)} &= k_{i j} k_{i j}' - \E[k_{i j} k_{i j}' \mid \bA_n], &S_{i j}^{(2)} &= \E[k_{i j} k_{i j}' \mid \bA_n] - \E[k_{i j} k_{i j}'].
\end{align*}

\proofparagraph{pointwise concentration of the $S_{i j}^{(1)}$ term}

Conditional on $\bA_n$, the summands $S_{i j}^{(1)}$ are independent and mean-zero, with $\E\big[ (S_{i j}^{(1)})^2 \mid \bA_n \big] \lesssim h^{-3}$ and $\big| S_{i j}^{(1)} \big| \lesssim h^{-2}$. Hence by Bernstein's inequality (Lemma~\ref{lem:kernel_app_bernstein}) applied conditionally,
%
\begin{align*}
&\P\left( \Bigg| \frac{2}{n(n-1)} \sum_{i<j} S_{i j}^{(1)} \Bigg| > t \sqrt{\frac{\log n}{n^2h^3}} \Bigm\vert \bA_n \right) \\
&\quad\leq 2 \exp\left( - \frac{t^2 n^2 \log n / (n^2h^3)}{C_6 h^{-3} / 2 + C_6 t h^{-2} \sqrt{\log n / (n^2h^3)} / 2} \right) \\
&\quad\leq 2 \exp\left( - \frac{t^2 \log n}{C_6 / 2 + C_6 t \sqrt{\log n / (n^2h)} / 2} \right) \leq 2 \exp\left( - \frac{t^2 \log n}{C_6} \right) = 2 n^{-t^2 / C_6},
\end{align*}
%
where $C_6$ is a universal positive constant.

\proofparagraph{pointwise concentration of the $S_{i j}^{(2)}$ term}

We apply the U-statistic concentration inequality from Lemma~\ref{lem:kernel_app_ustat_concentration}. Note that $S_{i j}^{(2)}$ are permutation-symmetric functions of the random variables $A_i$ and $A_j$ only, making them the summands of a (non-degenerate) mean-zero second-order U-statistic.
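To see the scale of these summands, a short computation using the kernel bounds $\|k_{i j}'\|_\infty \lesssim h^{-1}$ and $\E[|k_{i j}| \mid \bA_n] \lesssim 1$ gives
%
\begin{align*}
\big| \E[k_{i j} k_{i j}' \mid \bA_n] \big| &\leq \|k_{i j}'\|_\infty \, \E\big[ |k_{i j}| \bigm\vert \bA_n \big] \lesssim h^{-1},
\end{align*}
%
and the same bound holds for $\E[k_{i j} k_{i j}']$ by taking a marginal expectation.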
Note that $\big|S_{i j}^{(2)}\big| \lesssim h^{-1}$ and so trivially $\E\big[\E[S_{i j}^{(2)} \mid A_i ]^2 \big] \lesssim h^{-2}$. Thus by Lemma~\ref{lem:kernel_app_ustat_concentration}, since the order of this U-statistic is fixed at two, for some universal positive constant $C_7$ we have
%
\begin{align*}
\P\left( \Bigg| \frac{2}{n(n-1)} \sum_{i<j} S_{i j}^{(2)} \Bigg| > t \sqrt{\frac{\log n}{n h^2}} \right) &\leq 2 \exp\left( - \frac{t^2 n \log n / (n h^2)}{C_7 h^{-2} / 2 + C_7 t h^{-1} \sqrt{\log n / (n h^2)} / 2} \right) \\
&\leq 2 \exp\left( - \frac{t^2 \log n}{C_7 / 2 + C_7 t \sqrt{\log n / n} / 2} \right) \\
&\leq 2 \exp\left( - \frac{t^2 \log n}{C_7} \right) = 2 n^{-t^2 / C_7}.
\end{align*}

\proofparagraph{concentration of the $k_{i j}k_{i j}'$ term on a mesh}

As before, use a union bound on the mesh $\cW_\delta \times \cW_\delta$.
%
\begin{align*}
&\P\left( \sup_{w,w' \in \cW_\delta} \left| \frac{2}{n(n-1)} \sum_{i<j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + t \sqrt{\frac{\log n}{n h^2}} \right) \\
&\ \leq \P\!\left( \!\sup_{w,w' \in \cW_\delta} \Bigg| \frac{2}{n(n-1)} \sum_{i<j} S_{i j}^{(1)} \Bigg| > t \sqrt{\frac{\log n}{n^2h^3}} \right) \! + \P\!\left( \!\sup_{w,w' \in \cW_\delta} \Bigg| \frac{2}{n(n-1)} \sum_{i<j} S_{i j}^{(2)} \Bigg| > t \sqrt{\frac{\log n}{n h^2}} \right) \\
&\ \lesssim \delta_n^{-2} n^{-t^2 / C_6} + \delta_n^{-2} n^{-t^2 / C_7}.
\end{align*}

\proofparagraph{regularity of the $k_{i j}k_{i j}'$ term}

As for the $S_{i j r}$ term,
%
$\big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| \lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big)$.

\proofparagraph{uniform concentration of the $k_{i j}k_{i j}'$ term}

Setting $\delta_n = h^3\sqrt{\log n / (n h^2)}$, the fluctuations are at most $\sqrt{\log n / (n h^2)}$, so for $t \geq 1$
%
\begin{align*}
&\P\left( \sup_{w,w' \in \cW} \left| \frac{2}{n(n-1)} \sum_{i<j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + 2t \sqrt{\frac{\log n}{n h^2}} \right) \\
&\quad\leq \P\left( \sup_{w,w' \in \cW_\delta} \left| \frac{2}{n(n-1)} \sum_{i<j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + t \sqrt{\frac{\log n}{n h^2}} \right) \\
&\qquad+ \P\left( \sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| > t \sqrt{\frac{\log n}{n h^2}} \right) \\
&\quad\lesssim \delta_n^{-2} n^{-t^2 / C_6} + \delta_n^{-2} n^{-t^2 / C_7} \lesssim n^{1-t^2 / C_6} h^{-4} + n^{1-t^2 / C_7} h^{-4} \lesssim n^{5-t^2 / C_8},
\end{align*}
%
where $C_8 > 0$ is a constant and in the last line we use $1/h \lesssim n$. This converges to zero for any sufficiently large $t$, so by Lemma~\ref{lem:kernel_variance_bounds} we have
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{2}{n(n-1)} \sum_{i<j} \frac{ k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] }{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \sqrt{n^2h} \left( \sqrt{\frac{\log n}{n^2h^3}} + \sqrt{\frac{\log n}{n h^2}} \right) \lesssim \frac{\sqrt{\log n}}{h} + \sqrt{\frac{n \log n}{h}}.
\end{align*}

\proofparagraph{conclusion}

Recalling the weights in the definitions of $\Sigma_n$ and $\hat \Sigma_n$, and using $1/h \lesssim n$, the previous parts combine to give
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{2}{n(n-1)} \left( \frac{\sqrt{\log n}}{h} + \sqrt{\frac{n \log n}{h}} \right) + \frac{4(n-2)}{n(n-1)} \sqrt{\log n} + \frac{4n-6}{n(n-1)} \sqrt{\log n} \\
&\lesssim \frac{\sqrt{\log n}}{n}.
\end{align*}
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_sdp}]
Define the objective function
%
\begin{align*}
\objective(M) &= \sup_{w,w' \in \cW} \left| \frac{M(w,w') - \hat \Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right|
\end{align*}
%
over the feasible set of positive semi-definite functions $M$ on $\cW \times \cW$, and let $\objective^*$ be the infimum of $\objective$ over this set. Since $\Sigma_n$ itself is feasible, Lemma~\ref{lem:kernel_app_covariance_estimation} shows that $\objective^* \leq \objective(\Sigma_n) \lesssim_\P \frac{\sqrt{\log n}}{n}$. The infimum need not be attained, but for any $\varepsilon > 0$ there exists a feasible function $M_\varepsilon$ with $\objective(M_\varepsilon) \leq \objective^* + \varepsilon$, and we call such a solution $\varepsilon$-optimal. Let $\hat \Sigma_n^+$ be an $n^{-1}$-optimal solution. Then
%
\begin{align*}
\objective(\hat \Sigma_n^+) &\leq \objective^* + n^{-1} \leq \objective(\Sigma_n) + n^{-1}.
\end{align*}
%
Thus by the triangle inequality,
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\leq \objective(\hat \Sigma_n^+) + \objective(\Sigma_n) \leq 2 \, \objective(\Sigma_n) + n^{-1} \lesssim_\P \frac{\sqrt{\log n}}{n}.
\end{align*}
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_variance_estimator_bounds}]
Since $\hat \Sigma_n^+$ is positive semi-definite, we must have $\hat \Sigma_n^+(w,w) \geq 0$.
Now Lemma~\ref{lem:kernel_app_sdp} implies that for all $\varepsilon \in (0,1)$ there exists a $C_\varepsilon$ such that % \begin{align*} &\P\left( \Sigma_n(w,w) - C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)} \leq \hat \Sigma_n^+(w,w) \right. \\ &\left. \qquad\leq \Sigma_n(w,w) + C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)}, \quad \forall w \in \cW \right) \geq 1-\varepsilon. \end{align*} % Consider the function $g_a(t) = t - a \sqrt{t}$ and note that it is increasing on $\{t \geq a^2/4\}$. Applying this with $t = \Sigma_n(w,w)$ and $a = \frac{\sqrt{\log n}}{n}$, noting that by Lemma~\ref{lem:kernel_variance_bounds} we have $t = \Sigma_n(w,w) \gtrsim \frac{1}{n^2h} \gg \frac{\log n}{4n^2} = a^2/4$, shows that for $n$ large enough, % \begin{align*} \inf_{w \in \cW} \Sigma_n(w,w) - \frac{\sqrt{\log n}}{n} \sqrt{\inf_{w \in \cW} \Sigma_n(w,w)} \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \sup_{w \in \cW} \Sigma_n(w,w) + \frac{\sqrt{\log n}}{n} \sqrt{\sup_{w \in \cW} \Sigma_n(w,w)}. \end{align*} % Applying the bounds from Lemma~\ref{lem:kernel_variance_bounds} yields % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{\sqrt{\log n}}{n} \left( \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \frac{\Du^2}{n} + \frac{1}{n^2h} + \frac{\sqrt{\log n}}{n} \left( \frac{\Du}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) \end{align*} % and so % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w) \leq \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \frac{\Du^2}{n} + \frac{1}{n^2h}. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_sdp}] See Lemma~\ref{lem:kernel_app_covariance_estimation} and Lemma~\ref{lem:kernel_app_sdp}. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_studentized_t_statistic}] % We have % \begin{align*} &\sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| = \sup_{w \in \cW} \bigg\{ \left| \hat f_W(w) - f_W(w) \right| \cdot \bigg| \frac{1} {\hat\Sigma_n^+(w,w)^{1/2}} - \frac{1}{\Sigma_n(w,w)^{1/2}} \bigg| \bigg\} \\ &\quad\leq \sup_{w \in \cW} \left| \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} {\sqrt{\Sigma_n(w,w)}} + \frac{\E\big[\hat f_W(w)\big] - f_W(w)} {\sqrt{\Sigma_n(w,w)}} \right| \cdot \sup_{w \in \cW} \left| \frac{\hat\Sigma_n^+(w,w) - \Sigma_n(w,w)} {\sqrt{\Sigma_n(w,w) \hat\Sigma_n^+(w,w)}} \right|. \end{align*} % Now from the proof of Lemma~\ref{lem:kernel_app_covariance_estimation} we have that $\sup_{w \in \cW} \left| \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} {\sqrt{\Sigma_n(w,w)}} \right| \lesssim_\P \sqrt{\log n}$, while Theorem~\ref{thm:kernel_bias} gives $\sup_{w \in \cW} \big| \E\big[\hat f_W(w)\big] - f_W(w) \big| \lesssim h^{p \wedge \beta}$. By Lemma~\ref{lem:kernel_variance_bounds}, note that $\sup_{w \in \cW} \Sigma_n(w,w)^{-1/2} \lesssim \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$, and $\sup_{w \in \cW} \hat \Sigma_n^+(w,w)^{-1/2} \lesssim_\P \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$ by Lemma~\ref{lem:kernel_app_variance_estimator_bounds}. 
Thus, applying Lemma~\ref{lem:kernel_app_sdp} to control the covariance estimation error,
%
\begin{align*}
\sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| &\lesssim_\P \left( \sqrt{\log n} + \frac{h^{p \wedge \beta}}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \right) \frac{\sqrt{\log n}}{n} \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \\
&\lesssim_\P \sqrt{\frac{\log n}{n}} \left( \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}}{\Dl + 1/\sqrt{n h}} \right) \frac{1}{\Dl + 1/\sqrt{n h}}.
\end{align*}
\end{proof}

\begin{proof}[%
Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}]
Firstly, the process $\hat Z_n^T$ exists because $\hat \Sigma_n^+(w,w')$, and therefore also $\frac{\hat \Sigma_n^+(w,w')}{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$, are positive semi-definite functions, so the Kolmogorov consistency theorem applies \citep{gine2021mathematical}. To obtain the desired Kolmogorov--Smirnov result we discretize and use the Gaussian--Gaussian comparison result found in Lemma~3.1 in \citet{chernozhukov2013gaussian}.

\proofparagraph{bounding the covariance discrepancy}

Define the maximum discrepancy in the (conditional) covariances of $\hat Z_n^T$ and $Z_n^T$ by
%
\begin{align*}
\Delta &\vcentcolon= \sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w')}{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \frac{\Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} \right|.
\end{align*}
%
This variable can be bounded in probability in the following manner. First note that by the Cauchy--Schwarz inequality for covariances, $|\Sigma_n(w,w')| \leq \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}$. Hence
%
\begin{align*}
\Delta &\leq \sup_{w, w' \in \cW} \left\{ \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right| + \left| \frac{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')} - \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right| \right\} \\
&\leq \sup_{w, w' \in \cW} \left\{ \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')}{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| \right\} \\
&\quad+ \sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w)\hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')}{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right|.
\end{align*}
%
For the first term, note that $\inf_{w \in \cW} \hat \Sigma_n^+(w,w) \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}$ by Lemma~\ref{lem:kernel_app_variance_estimator_bounds} and also $\sup_{w \in \cW} \left|\frac{\hat \Sigma_n(w,w)}{\Sigma_n(w,w)} - 1\right| \lesssim_\P \sqrt{h \log n}$ by the proof of Lemma~\ref{lem:kernel_app_sdp}. Thus by Lemma~\ref{lem:kernel_app_sdp},
%
\begin{align*}
&\sup_{w, w' \in \cW} \left\{ \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')}{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| \right\} \\
&\quad\lesssim_\P \frac{\sqrt{\log n}}{n} \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}.
\end{align*} % For the second term, we have by the same bounds % \begin{align*} &\sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right| \\ &\quad\leq \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big| \hat \Sigma_n^+(w',w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right\} \\ &\qquad+ \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big| \Sigma_n(w,w)} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right\} \\ &\quad\leq \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big|} {\sqrt{\Sigma_n(w,w)}} \frac{\sqrt{\hat \Sigma_n^+(w',w')}} {\sqrt{\hat \Sigma_n^+(w,w) \Sigma_n(w',w')}} \right\} \\ &\qquad+ \!\sup_{w, w' \in \cW}\! \left\{ \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big|} {\sqrt{\Sigma_n(w',w')}} \frac{\sqrt{\Sigma_n(w,w)}} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right\} \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}. \end{align*} % Therefore $\Delta \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}$. \proofparagraph{Gaussian comparison on a mesh} Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality $O(1/\delta_n)$, where $1/\delta_n$ is at most polynomial in $n$. The scaled (conditionally) Gaussian processes $Z_n^T$ and $\hat Z_n^T$ both have pointwise (conditional) variances of 1. Therefore, by Lemma~3.1 in \citet{chernozhukov2013gaussian}, % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW_\delta} Z_n^T(w) \leq t \right) - \P\left( \sup_{w \in \cW_\delta} \hat Z_n^T(w) \leq t \Bigm\vert \bW_n \right) \right| &\lesssim \Delta^{1/3} \Big( 1 \vee \log \frac{1}{\Delta \delta_n} \Big)^{2/3} \end{align*} % uniformly in the data. By the previous part and since $x (\log 1/x)^2$ is increasing on $\big(0, e^{-2}\big)$, % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW_\delta} Z_n^T(w) \leq t \right) - \P\left( \sup_{w \in \cW_\delta} \hat Z_n^T(w) \leq t \Bigm\vert \bW_n \right) \right| \\ &\quad\lesssim_\P \left( \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}} \right)^{1/3} (\log n)^{2/3} \lesssim_\P \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*} \proofparagraph{trajectory regularity of $Z_n^T$} In the proof of Theorem~\ref{thm:kernel_infeasible_ucb} we established that $Z_n^T$ satisfies the regularity property % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{conditional $L^2$ regularity of $\hat Z_n^T$} By Lemma~\ref{lem:kernel_app_sdp}, with $n h \gtrsim \log n$, we have uniformly in $w,w'$, % \begin{align*} \big| \hat \Sigma_n^+(w,w') - \hat \Sigma_n^+(w,w) \big| &\lesssim n^{-1} h^{-3} |w-w'|. \end{align*} % Taking $\delta_n \leq n^{-2} h^2$, Lemma~\ref{lem:kernel_app_variance_estimator_bounds} gives % \begin{align*} \inf_{|w-w'| \leq \delta_n} \hat \Sigma_n^+(w,w') \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - n^{-1} h^{-3} \delta_n \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{1}{n^3h} \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}. 
\end{align*} % The conditional $L^2$ regularity of $\hat Z_n^T$ is % \begin{align*} \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right] &= 2 - 2 \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}. \end{align*} % Applying the same elementary result as for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb} yields % \begin{align*} \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right] &\lesssim_\P n^2 h^{-2} |w-w'|. \end{align*} % Thus the conditional semimetric induced by $\hat Z_n^T$ on $\cW$ is % \begin{align*} \hat\rho(w,w') &\vcentcolon= \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right]^{1/2} \lesssim_\P n h^{-1} \sqrt{|w-w'|}. \end{align*} \proofparagraph{conditional trajectory regularity of $\hat Z_n^T$} As for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, we apply Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, now conditionally, to obtain % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \Bigm\vert \bW_n \right] &\lesssim_\P n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{uniform Gaussian comparison} Now we use the trajectory regularity properties to extend the Gaussian--Gaussian comparison result from a finite mesh to all of $\cW$. Write the previously established approximation rate as % \begin{align*} r_n &= \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*} % Take $\varepsilon_n > 0$ and observe that uniformly in $t \in \R$, % \begin{align*} &\P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW_\delta} \big| \hat Z_n^T(w) \big| \leq t + \varepsilon_n \Bigm\vert \bW_n \right) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW_\delta} \big| Z_n^T(w) \big| \leq t + \varepsilon_n \right) + O_\P(r_n) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t + 2\varepsilon_n \right) + O_\P(r_n) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| Z_n^T(w) - Z_n^T(w') \right| \geq \varepsilon_n \right) \\ &\qquad+ \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t + 2\varepsilon_n \right) + O_\P(r_n) + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) \\ &\qquad+ O_\P(r_n) + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). 
\end{align*}
%
The converse inequality is obtained analogously as follows:
%
\begin{align*}
&\P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) \\
&\quad\geq \P\left( \sup_{w \in \cW_\delta} \big| \hat Z_n^T(w) \big| \leq t - \varepsilon_n \Bigm\vert \bW_n \right) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\
&\quad\geq \P\left( \sup_{w \in \cW_\delta} \big| Z_n^T(w) \big| \leq t - \varepsilon_n \right) - O_\P(r_n) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\
&\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t - 2\varepsilon_n \right) - O_\P(r_n) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| Z_n^T(w) - Z_n^T(w') \right| \geq \varepsilon_n \right) \\
&\qquad- \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\
&\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t - 2\varepsilon_n \right) - O_\P(r_n) - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\
&\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) \\
&\qquad- O_\P(r_n) - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}).
\end{align*}
%
Combining these uniform upper and lower bounds gives
%
\begin{align*}
&\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| \\
&\qquad\lesssim_\P \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) + r_n + \varepsilon_n^{-1} n h^{-1} \delta_n^{1/2} \sqrt{\log n}.
\end{align*}
%
For the remaining term, apply anti-concentration for $Z_n^T$ from the proof of Theorem~\ref{thm:kernel_infeasible_ucb}:
%
\begin{align*}
\sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq \varepsilon \right) &\lesssim \varepsilon \sqrt{\log n}.
\end{align*}
%
Therefore
%
\begin{align*}
&\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| \\
&\qquad\lesssim_\P \varepsilon_n \sqrt{\log n} + r_n + \varepsilon_n^{-1} n h^{-1} \delta_n^{1/2} \sqrt{\log n}.
\end{align*}
%
Taking $\varepsilon_n = r_n / \sqrt{\log n}$ and then $\delta_n = n^{-2} h^2 r_n^2 \varepsilon_n^2 / \log n$ yields
%
\begin{align*}
\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| &\lesssim_\P r_n = \frac{n^{-1/6}(\log n)^{5/6}}{\Dl^{1/3} + (n h)^{-1/6}}.
\end{align*}
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}]

\proofparagraph{Kolmogorov--Smirnov approximation}

Let $Z_n^T$ and $\hat Z_n^T$ be defined as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. Write
%
\begin{align*}
r_n &= \frac{n^{-1/6}(\log n)^{5/6}}{\Dl^{1/3} + (n h)^{-1/6}}
\end{align*}
%
for the rate of approximation from Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}.
For any $\varepsilon_n > 0$ and uniformly in $t \in \R$: % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t - \varepsilon_n \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) + \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) \\ &\qquad+ \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) + \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) + \varepsilon_n \sqrt{\log n} + O_\P(r_n), \end{align*} % where in the last line we used the anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration} applied to $Z_n^T$, as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. The corresponding lower bound is as follows: % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t + \varepsilon_n \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) \\ &\qquad- \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} - O_\P(r_n). \end{align*} \proofparagraph{$t$-statistic approximation} To control the remaining term, note that by Theorem~\ref{thm:kernel_strong_approx_Tn} and Lemma~\ref{lem:kernel_app_studentized_t_statistic}, % \begin{align*} &\sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \\ &\quad\leq \sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| + \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ &\quad\lesssim_\P \sqrt{\frac{\log n}{n}} \left( \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \right) \frac{1}{\Dl + 1/\sqrt{n h}} \\ &\qquad+ \frac{ n^{-1/2} \log n + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-2/3} h^{-1/2} (\log n)^{2/3} + n^{1/2} h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \end{align*} % and denote this last quantity by $r_n'$. Then for any $\varepsilon_n \gg r_n'$, we have % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \right| &\lesssim_\P \varepsilon_n \sqrt{\log n} + r_n + o(1). \end{align*} \proofparagraph{rate analysis} This rate is $o_\P(1)$ with an appropriate choice of $\varepsilon_n$ whenever $r_n \to 0$ and $r_n' \sqrt{\log n} \to 0$, by Lemma~\ref{lem:kernel_app_slow_convergence}, along with a slowly diverging sequence $R_n$. Explicitly, we require the following. 
% \begin{align*} \frac{n^{-1/2} (\log n)^{3/2}}{\Dl + 1/\sqrt{n h}} &\to 0, &\frac{h^{p \wedge \beta} \log n}{\Dl^2 + (n h)^{-1}} &\to 0, \\ \frac{n^{-1/2} (\log n)^{3/2}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} {\Dl + 1/\sqrt{n h}} &\to 0, \\ \frac{n^{-2/3} h^{-1/2} (\log n)^{7/6}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} {\Dl + 1/\sqrt{n h}} &\to 0, \\ \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}} &\to 0. \end{align*} % Using the fact that $h \lesssim n^{-\varepsilon}$ for some $\varepsilon > 0$ and removing trivial statements leaves us with % \begin{align*} \frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} {\Dl + 1/\sqrt{n h}} &\to 0. \end{align*} % We analyze these based on the degeneracy and verify that they hold under Assumption~\ref{ass:kernel_rates}. % \begin{enumerate}[label=(\roman*)] \item No degeneracy: if $\Dl > 0$ then we need % \begin{align*} n^{-3/4} h^{-7/8} (\log n)^{7/8} &\to 0, &n^{1/2} h^{p \wedge \beta} (\log n)^{1/2} &\to 0. \end{align*} % These reduce to $n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$. \item Partial or total degeneracy: if $\Dl = 0$ then we need % \begin{align*} n^{-1/4} h^{-3/8} (\log n)^{7/8} &\to 0, &n h^{(p \wedge \beta) + 1/2} (\log n)^{1/2} &\to 0. \end{align*} % These reduce to $n^{-2/3} (\log n)^{7/3} \ll h \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. % \end{enumerate} \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_ucb}] \proofparagraph{existence of the conditional quantile} We argue as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}, now also conditioning on the data. In particular, using the anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration}, the regularity property of $\hat Z_n^T$, and the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we see that for any $\varepsilon > 0$, % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| - t \right| \leq 2\varepsilon \Bigm\vert \bW_n \right) &\leq 8 \varepsilon \left( 1 + \E\left[ \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \Bigm\vert \bW_n \right] \right) \lesssim \varepsilon \sqrt{\log n}. \end{align*} % Thus letting $\varepsilon \to 0$ shows that the conditional distribution function of $\sup_{w \in \cW} \big|\hat Z_n^T(w)\big|$ is continuous, and therefore all of its conditional quantiles exist. \proofparagraph{validity of the confidence band} Define the following (conditional) distribution functions. % \begin{align*} F_Z(t \mid \bW_n) &= \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right), &F_T(t) &= \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right), \end{align*} % along with their well-defined right-quantile functions, % \begin{align*} F_Z^{-1}(p \mid \bW_n) &= \sup \big\{ t \in \R \, : \, F_Z(t \mid \bW_n) = p \big\}, &F_T^{-1}(p) &= \sup \big\{ t \in \R \, : \, F_T(t) = p \big\}. \end{align*} % Note that $t \leq F_Z^{-1}(p \mid \bW_n)$ if and only if $F_Z(t \mid \bW_n) \leq p$. Take $\alpha \in (0,1)$ and define the quantile $\hat q_{1-\alpha} = F_Z^{-1}(1-\alpha \mid \bW_n)$, so that $F_Z(\hat q_{1-\alpha} \mid \bW_n) = 1-\alpha$. By Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}, % \begin{align*} \sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| &= o_\P(1). 
\end{align*}
%
Thus by Lemma~\ref{lem:kernel_app_slow_convergence}, this can be replaced by
%
\begin{align*}
\P\left( \sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| > \varepsilon_n \right)
&\leq \varepsilon_n
\end{align*}
%
for some $\varepsilon_n \to 0$. Therefore
%
\begin{align*}
\P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq \hat q_{1-\alpha} \right)
&= \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right|
\leq F_Z^{-1}(1-\alpha \mid \bW_n) \right) \\
&= \P\left( F_Z\left( \sup_{w \in \cW} \left| \hat T_n(w) \right|
\Bigm\vert \bW_n \right) \leq 1 - \alpha \right) \\
&\leq \P\left( F_T\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \right)
\leq 1 - \alpha + \varepsilon_n \right)
+ \varepsilon_n
\leq 1 - \alpha + 3\varepsilon_n,
\end{align*}
%
where we used the fact that for any real-valued random variable $X$ with distribution function $F$, we have $\big|\P\big(F(X) \leq t\big) - t\big| \leq \Delta$, where $\Delta$ is the size of the largest jump discontinuity in $F$. By uniform integrability, $\sup_{t \in \R} \big| F_Z(t) - F_T(t) \big| = o(\varepsilon_n)$. Since $F_Z$ has no jumps, we must have $\Delta \leq \varepsilon_n$ for $F_T$. Finally, a lower bound is constructed in an analogous manner, giving
%
\begin{align*}
\P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq \hat q_{1-\alpha} \right)
&\geq 1 - \alpha - 3\varepsilon_n.
\end{align*}
%
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_counterfactual_bias}]
Writing $k_{i j} = k_h(W_{i j}^1, w)$, $\psi_i = \psi(X_i^1)$, $\hat\psi_i = \hat\psi(X_i^1)$, and $\kappa_{i j} = \kappa(X_i^0, X_i^1, X_j^1)$, expand
%
\begin{align*}
\E\big[\hat f_W^{1 \triangleright 0}(w)\big]
&= \E\left[ \frac{2}{n(n-1)} \sum_{i < j} \hat\psi_i \hat\psi_j \, k_{i j} \right],
\end{align*}
%
and the result follows upon replacing each $\hat\psi_i$ by $\psi_i$ and controlling the resulting error terms with $\kappa_{i j}$.
\end{proof}

\subsection{Preliminary lemmas}

We begin with a conditional version of Strassen's theorem.

\begin{lemma}[Conditional Strassen theorem]%
\label{lem:yurinskii_app_strassen}
Let $X$ be an $\R^d$-valued random variable and $\cH'$ a countably generated sub-$\sigma$-algebra on the same probability space, and let $F(\cdot \mid \cH')$ be a conditional distribution on $\R^d$. Suppose that for some $\rho > 0$, $\eta > 0$ and $p \in [1, \infty]$, with $\E^*$ the outer expectation,
%
\begin{align*}
\E^* \left[ \sup_{A \in \cB(\R^d)}
\Big\{ \P \big( X \in A \mid \cH' \big)
- F \big( A_p^\eta \mid \cH' \big) \Big\} \right]
\leq \rho,
\end{align*}
%
where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$ and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$. Then there exists an $\R^d$-valued random variable $Y$ with $Y \mid \cH' \sim F(\cdot \mid \cH')$ and $\P \left( \|X-Y\|_p > \eta \right) \leq \rho$.
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:yurinskii_app_strassen}]
By Theorem~B.2 in \citet{chen2020jackknife}, noting that $\cH'$ is countably generated and using the metric induced by the $\ell^p$-norm.
\end{proof}

Next, we present in Lemma~\ref{lem:yurinskii_app_smooth_approximation} an analytic result concerning the smooth approximation of Borel set indicator functions, similar to that given in \citet[Lemma~39]{belloni2019conditional}.

\begin{lemma}[Smooth approximation of Borel indicator functions]%
\label{lem:yurinskii_app_smooth_approximation}
Let $A \subseteq \R^d$ be a Borel set and $Z \sim \cN(0, I_d)$. For $\sigma, \eta > 0$ and $p \in [1, \infty]$, define
%
\begin{align*}
g_{A\eta}(x) &= \left( 1 - \frac{\|x-A^\eta\|_p}{\eta} \right) \vee 0
& &\text{and}
&f_{A\eta\sigma}(x) &= \E\big[g_{A\eta}(x + \sigma Z) \big].
\end{align*}
%
Then $f_{A\eta\sigma}$ is infinitely differentiable and with $\varepsilon = \P(\|Z\|_p > \eta / \sigma)$, for all $k \geq 0$, any multi-index $\kappa = (\kappa_1,\dots, \kappa_d)\in\N^d$, and all $x,y \in \R^d$, we have $|\partial^\kappa f_{A\eta\sigma}(x)| \leq \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}$ and
%
\begin{align*}
&\Bigg| f_{A\eta\sigma}(x+y) - \sum_{|\kappa| = 0}^k \frac{1}{\kappa!}
\partial^\kappa f_{A\eta\sigma}(x) y^\kappa \Bigg|
\leq \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}, \\
&(1 - \varepsilon) \I\big\{x \in A\big\}
\leq f_{A\eta\sigma}(x)
\leq \varepsilon + (1 - \varepsilon) \I\big\{x \in A^{3\eta}\big\}.
\end{align*}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:yurinskii_app_smooth_approximation}]
Drop subscripts on $g_{A\eta}$ and $f_{A \eta \sigma}$. By Taylor's theorem with Lagrange remainder, for some $t \in [0,1]$,
%
\begin{align*}
\Bigg| f(x + y) - \sum_{|\kappa|=0}^{k} \frac{1}{\kappa!} \partial^{\kappa} f(x) y^\kappa \Bigg|
\leq \Bigg| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!}
\big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \Bigg|.
\end{align*}
%
Now with $\phi(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$,
%
\begin{align*}
f(x)
&= \E\big[g(x + \sigma Z) \big]
= \int_{\R^d} g(x + \sigma u) \prod_{j=1}^{d} \phi(u_j) \diff u
= \frac{1}{\sigma^d} \int_{\R^d} g(u) \prod_{j=1}^{d}
\phi \left( \frac{u_j-x_j}{\sigma} \right) \diff u
\end{align*}
%
and since the integrand is bounded, we exchange differentiation and integration to compute
%
\begin{align}
\nonumber
\partial^\kappa f(x)
&= \frac{1}{\sigma^{d+|\kappa|}} \int_{\R^d} g(u) \prod_{j=1}^{d}
\partial^{\kappa_j} \phi \left( \frac{u_j-x_j}{\sigma} \right) \diff u
= \left( \frac{-1}{\sigma} \right)^{|\kappa|}
\int_{\R^d} g(x + \sigma u) \prod_{j=1}^{d}
\partial^{\kappa_j} \phi(u_j) \diff u \\
\label{eq:yurinskii_app_smoothing_derivative}
&= \left( \frac{-1}{\sigma} \right)^{|\kappa|}
\E \Bigg[ g(x + \sigma Z) \prod_{j=1}^{d}
\frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg],
\end{align}
%
where $Z \sim \cN(0, I_d)$. Recalling that $|g(x)| \leq 1$ and applying the Cauchy--Schwarz inequality,
%
\begin{align*}
\left| \partial^\kappa f(x) \right|
&\leq \frac{1}{\sigma^{|\kappa|}} \prod_{j=1}^{d}
\E \left[ \left( \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \right)^2 \right]^{1/2}
\leq \frac{1}{\sigma^{|\kappa|}} \prod_{j=1}^{d} \sqrt{\kappa_j!}
= \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}},
\end{align*}
%
since $\partial^{\kappa_j} \phi(x) = (-1)^{\kappa_j} H_{\kappa_j}(x) \phi(x)$, where $H_{\kappa_j}$ is the probabilists' Hermite polynomial of degree $\kappa_j$, whose expected square against the standard Gaussian measure is $\E\big[H_{\kappa_j}(Z_j)^2\big] = \kappa_j!$. By the reverse triangle inequality, $|g(x + t y) - g(x)| \leq t \|y\|_p / \eta$, so by \eqref{eq:yurinskii_app_smoothing_derivative},
%
\begin{align*}
&\left| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!}
\big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \right| \\
&\quad= \left| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \frac{1}{\sigma^{|\kappa|}}
\E \Bigg[ \big( g(x + t y + \sigma Z) - g(x + \sigma Z) \big)
\prod_{j=1}^{d} \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg] \right| \\
&\quad\leq \frac{t \|y\|_p}{\sigma^k \eta} \,
\E \left[ \Bigg| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!}
\prod_{j=1}^{d} \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg| \right].
\end{align*} % Therefore, by the Cauchy--Schwarz inequality, % \begin{align*} &\Bigg( \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \Bigg)^2 \leq \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} \, \E \left[ \Bigg( \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \prod_{j=1}^{d} \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} \Bigg)^2 \right] \\ &\quad= \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} \sum_{|\kappa|=k} \sum_{|\kappa'|=k} \frac{y^{\kappa + \kappa'}}{\kappa! \kappa'!} \prod_{j=1}^{d} \, \E \left[ \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} \frac{\partial^{\kappa'_j} \phi(Z_j)}{\phi(Z_j)} \right]. \end{align*} % Orthogonality of Hermite polynomials gives zero if $\kappa_j \neq \kappa'_j$. By the multinomial theorem, % \begin{align*} \left| f(x + y) - \sum_{|\kappa|=0}^{k} \frac{1}{\kappa!} \partial^{\kappa} f(x) y^\kappa \right| &\leq \frac{\|y\|_p}{\sigma^k \eta} \Bigg( \sum_{|\kappa|=k} \frac{y^{2 \kappa}}{\kappa!} \Bigg)^{1/2} \leq \frac{\|y\|_p}{\sigma^k \eta \sqrt{k!}} \Bigg( \sum_{|\kappa|=k} \frac{k!}{\kappa!} y^{2 \kappa} \Bigg)^{1/2} \\ &\leq \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}. \end{align*} % For the final result, since $f(x) = \E \left[ g(x + \sigma Z) \right]$ and $\I\big\{x \in A^\eta\big\}\leq g(x)\leq \I\big\{x \in A^{2\eta}\big\}$, % \begin{align*} f(x) &\leq \P \left( x + \sigma Z \in A^{2 \eta} \right) \\ &\leq \P \left( \|Z\|_p > \frac{\eta}{\sigma} \right) + \I \left\{ x \in A^{3 \eta} \right\} \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) = \varepsilon + (1 - \varepsilon) \I \left\{ x \in A^{3 \eta} \right\}, \\ f(x) &\geq \P \left( x + \sigma Z \in A^{\eta} \right) \geq \I \left\{ x \in A \right\} \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) = (1 - \varepsilon) \I \left\{ x \in A \right\}. \end{align*} % \end{proof} We provide a useful Gaussian inequality in Lemma~\ref{lem:yurinskii_app_gaussian_useful} which helps bound the $\beta_{\infty,k}$ moment terms appearing in several places throughout the analysis. \begin{lemma}[A useful Gaussian inequality]% \label{lem:yurinskii_app_gaussian_useful} Let $X \sim \cN(0, \Sigma)$ where $\sigma_j^2 = \Sigma_{j j} \leq \sigma^2$ for all $1 \leq j \leq d$. Then % \begin{align*} \E\left[ \|X\|_2^2 \|X\|_\infty \right] &\leq 4 \sigma \sqrt{\log 2d} \,\sum_{j=1}^d \sigma_j^2 &&\text{and} &\E\left[ \|X\|_2^3 \|X\|_\infty \right] &\leq 8 \sigma \sqrt{\log 2d} \,\bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^{3/2}. \end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_useful}] By Cauchy--Schwarz, with $k \in \{2,3\}$, we have $\E\left[\|X\|_2^{k} \|X\|_\infty \right] \leq \E\big[\|X\|_2^{2k} \big]^{1/2} \E\big[\|X\|_\infty^2 \big]^{1/2}$. For the first term, by H{\"o}lder's inequality and the even moments of the normal distribution, % \begin{align*} \E\big[\|X\|_2^4 \big] &= \E\Bigg[ \bigg( \sum_{j=1}^d X_j^2 \bigg)^2 \Bigg] = \sum_{j=1}^d \sum_{k=1}^d \E\big[ X_j^2 X_k^2 \big] \leq \bigg( \sum_{j=1}^d \E\big[X_j^4 \big]^{\frac{1}{2}} \bigg)^2 = 3 \bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^2, \\ \E\big[\|X\|_2^6 \big] &= \sum_{j=1}^d \sum_{k=1}^d \sum_{l=1}^d \E\big[ X_j^2 X_k^2 X_l^2 \big] \leq \bigg( \sum_{j=1}^d \E\big[X_j^6 \big]^{\frac{1}{3}} \bigg)^3 = 15 \bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^3. 
\end{align*} % For the second term, by Jensen's inequality and the $\chi^2$ moment generating function, % \begin{align*} \E\big[\|X\|_\infty^2 \big] &= \E\left[ \max_{1 \leq j \leq d} X_j^2 \right] \leq 4 \sigma^2 \log \sum_{j=1}^d \E\Big[ e^{X_j^2 / (4\sigma^2)} \Big] \leq 4 \sigma^2 \log \sum_{j=1}^d \sqrt{2} \leq 4 \sigma^2 \log 2 d. \end{align*} % \end{proof} We provide an $\ell^p$-norm tail probability bound for Gaussian variables in Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, motivating the definition of the term $\phi_p(d)$. \begin{lemma}[Gaussian \texorpdfstring{$\ell^p$}{lp}-norm bound]% \label{lem:yurinskii_app_gaussian_pnorm} Let $X \sim \cN(0, \Sigma)$ where $\Sigma \in \R^{d \times d}$ is a positive semi-definite matrix. Then we have that $\E\left[ \|X\|_p \right] \leq \phi_p(d) \max_{1 \leq j \leq d} \sqrt{\Sigma_{j j}}$ with $\phi_p(d) = \sqrt{pd^{2/p} }$ for $p \in [1,\infty)$ and $\phi_\infty(d) = \sqrt{2\log 2d}$. \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}] For $p \in [1, \infty)$, as each $X_j$ is Gaussian, we have $\big(\E\big[|X_j|^p\big]\big)^{1/p} \leq \sqrt{p\, \E[X_j^2]} = \sqrt{p \Sigma_{j j}}$. So % \begin{align*} \E\big[\|X\|_p\big] &\leq \Bigg(\sum_{j=1}^d \E \big[ |X_j|^p \big] \Bigg)^{1/p} \leq \Bigg(\sum_{j=1}^d p^{p/2} \Sigma_{j j}^{p/2} \Bigg)^{1/p} \leq \sqrt{p d^{2/p}} \max_{1\leq j\leq d} \sqrt{\Sigma_{j j}} \end{align*} % by Jensen's inequality. For $p=\infty$, with $\sigma^2 = \max_j \Sigma_{j j}$, for $t>0$, % \begin{align*} \E\big[\|X\|_\infty \big] &\leq t \log \sum_{j=1}^d \E\Big[ e^{|X_j| / t} \Big] \leq t \log \sum_{j=1}^d \E\Big[ 2 e^{X_j / t} \Big] \leq t \log \Big(2 d e^{\sigma^2/(2t^2)}\Big) \leq t \log 2 d + \frac{\sigma^2}{2t}, \end{align*} % again by Jensen's inequality. Setting $t = \frac{\sigma}{\sqrt{2 \log 2d}}$ gives $\E\big[\|X\|_\infty \big] \leq \sigma \sqrt{2 \log 2d}$. % \end{proof} We give a Gaussian--Gaussian $\ell^p$-norm approximation as Lemma~\ref{lem:yurinskii_app_feasible_gaussian}, useful for ensuring approximations remain valid upon substituting an estimator for the true variance matrix. \begin{lemma}[Gaussian--Gaussian approximation in \texorpdfstring{$\ell^p$}{lp}-norm]% \label{lem:yurinskii_app_feasible_gaussian} Let $\Sigma_1, \Sigma_2 \in \R^{d \times d}$ be positive semi-definite and take $Z \sim \cN(0, I_d)$. For $p \in [1, \infty]$ we have % \begin{align*} \P\left( \left\| \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z \right\|_p > t \right) &\leq 2 d \exp \left( \frac{-t^2} {2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} \right). \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_feasible_gaussian}] Let $\Sigma \in \R^{d \times d}$ be positive semi-definite and write $\sigma^2_j = \Sigma_{j j} $. For $p \in [1, \infty)$ by a union bound and Gaussian tail probabilities, % \begin{align*} \P\left(\big\| \Sigma^{1/2} Z \big\|_p > t \right) &= \P\Bigg( \sum_{j=1}^d \left| \left( \Sigma^{1/2} Z \right)_j \right|^p > t^p \Bigg) \leq \sum_{j=1}^d \P\Bigg( \left| \left( \Sigma^{1/2} Z \right)_j \right|^p > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} \Bigg) \\ &= \sum_{j=1}^d \P\Bigg( \left| \sigma_j Z_j \right|^p > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} \Bigg) = \sum_{j=1}^d \P\left( \left| Z_j \right| > \frac{t}{\|\sigma\|_p} \right) \leq 2 d \, \exp\left( \frac{-t^2}{2 \|\sigma\|_p^2} \right). 
\end{align*}
%
The same result holds for $p = \infty$ since
%
\begin{align*}
\P\left(\big\| \Sigma^{1/2} Z \big\|_\infty > t \right)
&= \P\left( \max_{1 \leq j \leq d} \left| \left( \Sigma^{1/2} Z \right)_j \right| > t \right)
\leq \sum_{j=1}^d \P\left( \left| \left( \Sigma^{1/2} Z \right)_j \right| > t \right) \\
&= \sum_{j=1}^d \P\left( \left| \sigma_j Z_j \right| > t \right)
\leq 2 \sum_{j=1}^d \exp\left( \frac{-t^2}{2 \sigma_j^2} \right)
\leq 2 d \exp\left( \frac{-t^2}{2 \|\sigma\|_\infty^2} \right).
\end{align*}
%
Now we apply this to the matrix $\Sigma = \big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. For $p \in [1, \infty)$,
%
\begin{align*}
\|\sigma\|_p^p
&= \sum_{j=1}^d (\Sigma_{j j})^{p/2}
= \sum_{j=1}^d \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2}
\leq d \max_{1 \leq j \leq d} \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} \\
&\leq d \, \Big\|\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big\|_2^{p/2}
= d \, \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^p.
\end{align*}
%
Similarly, for $p = \infty$ we have
%
\begin{align*}
\|\sigma\|_\infty
&= \max_{1 \leq j \leq d} (\Sigma_{j j})^{1/2}
= \max_{1 \leq j \leq d} \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{1/2}
\leq \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2.
\end{align*}
%
Thus for all $p \in [1, \infty]$ we have $\|\sigma\|_p \leq d^{1/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2$, with $d^{1/\infty} = 1$. Hence
%
\begin{align*}
\P\left( \left\| \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z \right\|_p > t \right)
&\leq 2 d \exp \left( \frac{-t^2}{2 \|\sigma\|_p^2} \right)
\leq 2 d \exp \left( \frac{-t^2}
{2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} \right).
\end{align*}
%
\end{proof}

We give a variance bound and an exponential inequality for $\alpha$-mixing variables.

\begin{lemma}[Variance bounds for \texorpdfstring{$\alpha$}{alpha}-mixing random variables]
\label{lem:yurinskii_app_variance_mixing}
Let $X_1, \ldots, X_n$ be real-valued $\alpha$-mixing random variables with mixing coefficients $\alpha(j)$. Then
%
\begin{enumerate}[label=(\roman*)]
\item \label{it:yurinskii_app_variance_mixing_bounded}
If for constants $M_i$ we have $|X_i| \leq M_i$ a.s.\ then
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
&\leq 4 \sum_{j=1}^\infty \alpha(j) \sum_{i=1}^n M_i^2.
\end{align*}
\item \label{it:yurinskii_app_variance_mixing_exponential}
If $\alpha(j) \leq e^{-2j / C_\alpha}$ then for any $r>2$ there is a constant $C_r$ depending only on $r$ with
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
&\leq C_r C_\alpha \sum_{i=1}^n \E\big[|X_i|^r\big]^{2/r}.
\end{align*}
\end{enumerate}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:yurinskii_app_variance_mixing}]
Define $\alpha^{-1}(t) = \inf\{j \in \N : \alpha(j) \leq t\}$ and $Q_i(t) = \inf\{s \in \R : \P(|X_i| > s) \leq t\}$. By Corollary~1.1 in \citet{rio2017asymptotic} and H{\"o}lder's inequality for $r > 2$,
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
&\leq 4 \sum_{i=1}^n \int_0^1 \alpha^{-1}(t) Q_i(t)^2 \diff{t} \\
&\leq 4 \sum_{i=1}^n \left( \int_0^1 \alpha^{-1}(t)^{\frac{r}{r-2}} \diff{t} \right)^{\frac{r-2}{r}}
\left( \int_0^1 |Q_i(t)|^r \diff{t} \right)^{\frac{2}{r}}.
\end{align*}
%
Now note that if $U \sim \Unif[0,1]$ then $Q_i(U)$ has the same distribution as $|X_i|$. Therefore
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
&\leq 4 \left( \int_0^1 \alpha^{-1}(t)^{\frac r{r-2}} \diff{t} \right)^{\frac{r-2}r}
\sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r}.
\end{align*}
%
If $\alpha(j) \leq e^{-2j/C_\alpha}$ then $\alpha^{-1}(t) \leq \frac{-C_\alpha \log t}{2}$ so, for some constant $C_r$ depending only on $r$,
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
\leq 2 C_\alpha \left( \int_0^1 (-\log t)^{\frac r{r-2}} \diff{t} \right)^{\frac{r-2} r}
\sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r}
\leq C_r C_\alpha \sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r}.
\end{align*}
%
Alternatively, if for constants $M_i$ we have $|X_i| \leq M_i$ a.s.\ then
%
\begin{align*}
\Var\left[ \sum_{i=1}^n X_i \right]
&\leq 4 \int_0^1 \alpha^{-1}(t) \diff{t} \sum_{i=1}^n M_i^2
\leq 4 \sum_{j=1}^\infty \alpha(j) \sum_{i=1}^n M_i^2.
\end{align*}
%
\end{proof}

\begin{lemma}[Exponential concentration inequalities for \texorpdfstring{$\alpha$}{alpha}-mixing random variables]
\label{lem:yurinskii_app_exponential_mixing}
Let $X_1, \ldots, X_n$ be zero-mean real-valued variables with $\alpha$-mixing coefficients $\alpha(j) \leq e^{-2 j / C_\alpha}$.
\begin{enumerate}[label=(\roman*)]
\item \label{it:yurinskii_app_exponential_mixing_bounded}
Suppose $|X_i| \leq M$ a.s.\ for $1 \leq i \leq n$. Then there is a constant $C_1$ such that for all $t > 0$,
%
\begin{align*}
\P\left( \left| \sum_{i=1}^n X_i \right|
> C_1 M \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right)
&\leq C_1 e^{-t}.
\end{align*}
%
\item \label{it:yurinskii_app_exponential_mixing_bernstein}
If further $\sup_{1 \leq i \leq n} \sum_{j=1}^n |\Cov[X_i, X_j]| \leq \sigma^2$, then there is a constant $C_2$ such that for all $t > 0$,
%
\begin{align*}
\P\left( \left| \sum_{i=1}^n X_i \right|
\geq C_2 \big( (\sigma \sqrt n + M) \sqrt t + M (\log n)^2 t \big) \right)
&\leq C_2 e^{-t}.
\end{align*}
\end{enumerate}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:yurinskii_app_exponential_mixing}]
\begin{enumerate}[label=(\roman*)]
\item By Theorem~1 in \citet{merlevede2009bernstein},
%
\begin{align*}
\P\left( \left| \sum_{i=1}^n X_i \right| > t \right)
&\leq \exp\left( -\frac{C_1 t^2}{n M^2 + Mt (\log n)(\log\log n)} \right).
\end{align*}
%
Replace $t$ by $M \sqrt{n t} + M (\log n)(\log \log n) t$.
\item By Theorem~2 in \citet{merlevede2009bernstein},
%
\begin{align*}
\P\left( \left| \sum_{i=1}^n X_i \right| > t \right)
&\leq \exp\left( -\frac{C_2 t^2}{n\sigma^2 + M^2 + Mt (\log n)^2} \right).
\end{align*}
%
Replace $t$ by $\sigma \sqrt n \sqrt t + M \sqrt t + M (\log n)^2 t$.
\end{enumerate}
%
\end{proof}

\subsection{Main results}

To establish Theorem~\ref{thm:yurinskii_sa_dependent}, we first give the analogous result for martingales as Lemma~\ref{lem:yurinskii_app_sa_martingale}. Our approach is similar to that used in modern versions of Yurinskii's coupling for independent data, as in Theorem~1 in \citet{lecam1988} and Theorem~10 in Chapter~10 of \citet{pollard2002user}. The proof of Lemma~\ref{lem:yurinskii_app_sa_martingale} relies on constructing a ``modified'' martingale, which is close to the original martingale, but which has an $\cH_0$-measurable terminal quadratic variation.

\begin{lemma}[Strong approximation for vector-valued martingales]%
\label{lem:yurinskii_app_sa_martingale}
Let $X_1, \ldots, X_n$ be $\R^d$-valued square-integrable random vectors adapted to a countably generated filtration $\cH_0, \ldots, \cH_n$. Suppose that $\E[X_i \mid \cH_{i-1}] = 0$ for all $1 \leq i \leq n$ and define $S = \sum_{i=1}^n X_i$. Let $V_i = \Var[X_i \mid \cH_{i-1}]$ and $\Omega = \sum_{i=1}^n V_i - \Sigma$ where $\Sigma$ is a positive semi-definite $\cH_0$-measurable $d \times d$ random matrix.
For each $\eta > 0$ and $p \in [1,\infty]$ there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with
%
\begin{align*}
\P\big(\|S-T\|_p > 5\eta\big)
&\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big)
+ \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\
&\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\},
\end{align*}
%
where the second infimum is over all positive semi-definite $d \times d$ non-random matrices, and
%
\begin{align*}
\beta_{p,k} &= \sum_{i=1}^n \E\left[\| X_i \|^k_2 \| X_i \|_p
+ \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right],
\qquad\gamma(M) = \P\big(\Omega \npreceq M\big), \\
\delta_p(M,\eta) &= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right),
\qquad\pi_3 = \sum_{i=1}^{n} \sum_{|\kappa| = 3}
\E \Big[ \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right] \big| \Big], \\
\varepsilon_p(M, \eta) &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right),
\end{align*}
%
for $k \in \{2,3\}$, with $Z, Z_1,\dots ,Z_n$ i.i.d.\ standard Gaussian on $\R^d$ independent of $\cH_n$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:yurinskii_app_sa_martingale}]
\proofparagraph{constructing a modified martingale}
Take $M \succeq 0$ a fixed positive semi-definite $d \times d$ matrix. We start by constructing a new martingale based on $S$ whose terminal quadratic variation is $\Sigma + M$. Take $m \geq 1$ and define
%
\begin{align*}
H_k &= \Sigma + M - \sum_{i=1}^{k} V_i,
\qquad\qquad\qquad\qquad
\tau = \sup \big\{ k\in\{0,1,\dots,n\} : H_k \succeq 0 \big\}, \\
\tilde X_i &= X_i\I\{i \leq \tau\}
+ \frac{1}{\sqrt{m}} H_\tau^{1/2} Z_i\I\{n+1 \leq i \leq n+m\},
\qquad\qquad
\tilde S = \sum_{i=1}^{n+m} \tilde X_i,
\end{align*}
%
where $Z_{n+1}, \ldots, Z_{n+m}$ is an i.i.d.\ sequence of standard Gaussian vectors in $\R^d$ independent of $\cH_n$, noting that $H_0 = \Sigma + M \succeq 0$ a.s. Define the filtration $\tilde \cH_0, \ldots, \tilde \cH_{n+m}$, where $\tilde \cH_i = \cH_i$ for $0 \leq i \leq n$ and is the $\sigma$-algebra generated by $\cH_n$ and $Z_{n+1}, \dots, Z_{i}$ for $n+1 \leq i\leq n+m$. Observe that $\tau$ is a stopping time with respect to $\tilde\cH_i$ because $H_{i+1} - H_i = -V_{i+1} \preceq 0$ almost surely, so $\{\tau \leq i\} = \{H_{i+1} \nsucceq 0\}$ for $0 \leq i < n$. Define $\tilde V_i = \Var\big[\tilde X_i \mid \tilde\cH_{i-1}\big]$, so that $\tilde V_i = V_i \I\{i \leq \tau\}$ for $1 \leq i \leq n$ and $\tilde V_i = \frac{1}{m} H_\tau$ for $n+1 \leq i \leq n+m$, whence $\sum_{i=1}^{n+m} \tilde V_i = \sum_{i=1}^{\tau} V_i + H_\tau = \Sigma + M$.

\proofparagraph{approximating the original martingale}
Note that $\{\Omega \preceq M\} = \{H_n \succeq 0\} = \{\tau = n\}$, and on this event $S - \tilde S = -H_n^{1/2} Z$, where $Z = \frac{1}{\sqrt{m}} \sum_{i=n+1}^{n+m} Z_i \sim \cN(0, I_d)$ and $H_n = M - \Omega$. Hence $\P\big( \| S - \tilde S \|_p > \eta \big) \leq \P\big( \| H_n^{1/2} Z \|_p > \eta,\, \Omega \preceq M \big) + \P\big( \Omega \npreceq M \big)$, so
%
\begin{align}
\label{eq:yurinskii_app_approx_modified_original}
\P\big( \| S - \tilde S \|_p > \eta\big)
&\leq 2 \P\big(\Omega \npreceq M \big)
+ \P\big( \| (M-\Omega)^{1/2}Z \|_p > \eta,\, \Omega \preceq M \big)
= 2 \gamma(M) + \varepsilon_p(M, \eta).
\end{align}

\proofparagraph{strong approximation of the modified martingale}
Let $\tilde Z_1, \ldots, \tilde Z_{n+m}$ be i.i.d.\ $\cN(0, I_d)$ and independent of $\tilde \cH_{n+m}$. Define $\check X_i = \tilde V_i^{1/2} \tilde Z_i$ and $\check S = \sum_{i=1}^{n+m} \check X_i$. Fix a Borel set $A \subseteq \R^d$ and $\sigma, \eta > 0$ and let $f = f_{A\eta\sigma}$ be the function defined in Lemma~\ref{lem:yurinskii_app_smooth_approximation}. By the Lindeberg method, write the telescoping sum
%
\begin{align*}
\E\Big[f\big(\tilde S\big) - f\big(\check S\big) \mid \cH_0 \Big]
&= \sum_{i=1}^{n+m} \E\Big[ f\big(Y_i + \tilde X_i\big)
- f\big(Y_i + \check X_i\big) \mid \cH_0 \Big]
\end{align*}
%
where $Y_i = \sum_{j=1}^{i-1} \tilde X_j + \sum_{j=i+1}^{n+m} \check X_j$.
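For intuition, with $n + m = 2$ this telescoping identity (which holds pathwise, before taking conditional expectations) reads
%
\begin{align*}
f\big(\tilde X_1 + \tilde X_2\big) - f\big(\check X_1 + \check X_2\big)
&= \Big( f\big(\check X_2 + \tilde X_1\big) - f\big(\check X_2 + \check X_1\big) \Big)
+ \Big( f\big(\tilde X_1 + \tilde X_2\big) - f\big(\tilde X_1 + \check X_2\big) \Big),
\end{align*}
%
so that each summand swaps a single $\check X_i$ for $\tilde X_i$ while all other coordinates are held fixed.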
By Lemma~\ref{lem:yurinskii_app_smooth_approximation} we have for $k \geq 0$ % \begin{align*} &\Bigg| \E\big[ f(Y_i + \tilde X_i) - f(Y_i + \check X_i) \mid \cH_0 \big] - \sum_{|\kappa| = 0}^k \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \Bigg| \\ &\quad\leq \frac{1}{\sigma^k \eta \sqrt{k!}} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^k + \|\check X_i\|_p \|\check X_i\|_2^k \bigm| \cH_0 \right]. \end{align*} % With $k \in \{2, 3\}$, we bound each summand. With $|\kappa| = 0$ we have $\tilde X_i^\kappa = \check X_i^\kappa$, so consider $|\kappa| = 1$. Noting that $\sum_{i=1}^{n+m} \tilde V_i = \Sigma + M$, define % \begin{align*} \tilde Y_i &= \sum_{j=1}^{i-1} \tilde X_j + \tilde Z_i \Bigg(\sum_{j=i+1}^{n+m} \tilde V_j\Bigg)^{1/2} = \sum_{j=1}^{i-1} \tilde X_j + \tilde Z_i \Bigg(\Sigma + M - \sum_{j=1}^{i} \tilde V_j\Bigg)^{1/2} \end{align*} % and let $\check \cH_i$ be the $\sigma$-algebra generated by $\tilde \cH_{i-1}$ and $\tilde Z_i$. Note that $\tilde Y_i$ is $\check \cH_i$-measurable and that $Y_i$ and $\tilde Y_i$ have the same distribution conditional on $\tilde \cH_{n+m}$. So % \begin{align*} &\sum_{|\kappa| = 1} \frac{1}{\kappa!} \E\left[ \partial^\kappa f(Y_i) \big( \tilde X_i^\kappa - \check X_i^\kappa \big) \bigm| \cH_0 \right] = \E \left[ \nabla f(Y_i)^\T \big( \tilde X_i - \tilde V_i^{1/2} \tilde Z_i \big) \bigm| \cH_0 \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \tilde X_i \bigm| \cH_0 \right] - \E \left[ \nabla f(Y_i)^\T \tilde V_i^{1/2} \tilde Z_i \bigm| \cH_0 \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \E \left[ \tilde X_i \mid \check \cH_i \right] \bigm| \cH_0 \right] - \E \left[ \tilde Z_i \right] \E \left[ \nabla f(Y_i)^\T \tilde V_i^{1/2} \bigm| \cH_0 \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \E \left[ \tilde X_i \mid \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] - 0 = 0. \end{align*} % Next, if $|\kappa| = 2$ then % \begin{align*} &\sum_{|\kappa| = 2} \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \\ &\quad= \frac{1}{2} \E \left[ \tilde X_i^\T \nabla^2 f(Y_i) \tilde X_i - \tilde Z_i^\T \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} \tilde Z_i \bigm| \cH_0 \right] \\ &\quad= \frac{1}{2} \E \left[ \E \left[ \Tr \nabla^2 f(\tilde Y_i) \tilde X_i \tilde X_i^\T \bigm| \check \cH_i \right] \bigm| \cH_0 \right] - \frac{1}{2} \E \left[ \Tr \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} \bigm| \cH_0 \right] \E \left[ \tilde Z_i \tilde Z_i^\T \right] \\ &\quad= \frac{1}{2} \E \left[ \Tr \nabla^2 f(Y_i) \E \left[ \tilde X_i \tilde X_i^\T \bigm| \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] - \frac{1}{2} \E \left[ \Tr \nabla^2 f(Y_i) \tilde V_i \bigm| \cH_0 \right] = 0. 
\end{align*}
%
Finally, if $|\kappa| = 3$, then since $\check X_i \sim \cN(0, \tilde V_i)$ conditional on $\tilde \cH_{n+m}$, we have by symmetry of the Gaussian distribution and Lemma~\ref{lem:yurinskii_app_smooth_approximation},
%
\begin{align*}
& \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!}
\E \left[ \partial^\kappa f(Y_i)
\left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \right| \\
&\quad= \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!}
\left( \E \left[ \partial^\kappa f(\tilde Y_i)
\E \left[ \tilde X_i^\kappa \mid \check \cH_i \right] \bigm| \cH_0 \right]
- \E \left[ \partial^\kappa f(Y_i) \,
\E \left[ \check X_i^\kappa \bigm| \tilde \cH_{n+m} \right] \bigm| \cH_0 \right] \right) \right| \\
&\quad= \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!}
\E \left[ \partial^\kappa f(Y_i) \,
\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] \right|
\leq \frac{1}{\sigma^3} \sum_{|\kappa| = 3}
\E \left[ \left| \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \right| \bigm| \cH_0 \right].
\end{align*}
%
Combining these and summing over $i$ with $k=2$ shows
%
\begin{align*}
\E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right]
&\leq \frac{1}{\sigma^2 \eta \sqrt{2}} \sum_{i=1}^{n+m}
\E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^2
+ \|\check X_i\|_p \|\check X_i\|_2^2 \bigm| \cH_0 \right].
\end{align*}
%
On the other hand, taking $k = 3$ gives
%
\begin{align*}
\E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right]
&\leq \frac{1}{\sigma^3 \eta \sqrt{6}} \sum_{i=1}^{n+m}
\E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^3
+ \|\check X_i\|_p \|\check X_i\|_2^3 \bigm| \cH_0 \right] \\
&\quad+ \frac{1}{\sigma^3} \sum_{i=1}^{n+m} \sum_{|\kappa| = 3}
\E \left[ \left| \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \right| \bigm| \cH_0 \right].
\end{align*}
%
For $1 \leq i \leq n$ we have $\|\tilde X_i\| \leq \|X_i\|$ and $\|\check X_i\| \leq \|V_i^{1/2} \tilde Z_i\|$. For $n+1 \leq i \leq n+m$ we have $\tilde X_i = H_\tau^{1/2} Z_i / \sqrt m$ and $\check X_i = H_\tau^{1/2} \tilde Z_i / \sqrt m$ which are equal in distribution given $\cH_0$. So with
%
\begin{align*}
\tilde \beta_{p,k} &= \sum_{i=1}^{n}
\E \left[ \|X_i\|_p \|X_i\|_2^k
+ \|V_i^{1/2} Z_i\|_p \|V_i^{1/2} Z_i\|_2^k \bigm| \cH_0 \right],
\end{align*}
%
we have, since $k \in \{2,3\}$,
%
\begin{align*}
&\sum_{i=1}^{n+m} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^k
+ \|\check X_i\|_p \|\check X_i\|_2^k \bigm| \cH_0 \right]
\leq \tilde\beta_{p,k}
+ \frac{2}{\sqrt m} \E \left[ \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k \bigm| \cH_0 \right].
\end{align*}
%
Since $H_i$ is weakly decreasing under the semi-definite partial order, we have $H_\tau \preceq H_0 = \Sigma + M$ implying that $|(H_\tau)_{j j}| \leq \|\Sigma + M\|_{\max}$ and $\E\big[|(H_\tau^{1/2} Z)_j|^3 \mid \cH_0 \big] \leq \sqrt{8/\pi}\, \|\Sigma + M\|_{\max}^{3/2}$. Hence as $p \geq 1$ and $k \in \{2,3\}$,
%
\begin{align*}
\E\left[ \|H_\tau^{1/2}Z\|_p \|H_\tau^{1/2}Z\|_2^k \bigm| \cH_0 \right]
&\leq \E\left[\|H_\tau^{1/2} Z\|_1^{k+1} \bigm| \cH_0 \right]
\leq d^{k+1} \max_{1\leq j\leq d}
\E\left[|(H_\tau^{1/2} Z)_j|^{k+1} \bigm| \cH_0 \right] \\
&\leq 3 d^4 \, \|\Sigma + M\|_{\max}^{(k+1)/2}
\leq 6 d^4 \, \|\Sigma \|_{\max}^{(k+1)/2} + 6 d^4 \, \|M\|_{\max}^{(k+1)/2}.
\end{align*}
%
Assuming some $X_i$ is not identically zero so the result is non-trivial, and supposing that $\Sigma$ is bounded a.s.\ (replacing $\Sigma$ by $\Sigma \cdot \I\{\|\Sigma\|_{\max} \leq C\}$ for an appropriately large $C$ if necessary), take $m$ large enough that
%
\begin{align}
\label{eq:yurinskii_app_bound_extra_terms}
\frac{2}{\sqrt m} \E \left[ \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k \bigm| \cH_0 \right]
\leq \frac{1}{4} \beta_{p,k}.
\end{align}
%
Further, if $|\kappa| = 3$ then $\big|\E \big[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \big]\big| \leq \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right]\big|$ for $1 \leq i \leq n$ while by symmetry of the Gaussian distribution $\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] = 0$ for $n+1 \leq i \leq n+m$. Hence with
%
\begin{align*}
\tilde \pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3}
\E \Big[ \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right] \big| \mid \cH_0 \Big],
\end{align*}
%
we have
%
\begin{align*}
\E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right]
&\leq \min \left\{ \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta}
+ \frac{\beta_{p,2}}{4 \sigma^2 \eta},
\frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta}
+ \frac{\beta_{p,3}}{4 \sigma^3 \eta}
+ \frac{\tilde \pi_3}{\sigma^3} \right\}.
\end{align*}
%
Along with Lemma~\ref{lem:yurinskii_app_smooth_approximation}, and with $\sigma = \eta / t$ and $\varepsilon = \P(\|Z\|_p > t)$, we conclude that
%
\begin{align*}
&\P(\tilde S \in A \mid \cH_0)
= \E\big[\I\{\tilde S \in A\} - f(\tilde S) \mid \cH_0 \big]
+ \E\big[f(\tilde S) - f\big(\check S\big) \mid \cH_0 \big]
+ \E \big[f\big(\check S\big) \mid \cH_0 \big] \\
&\,\leq \varepsilon\P(\tilde S \in A \mid \cH_0)
+ \min \! \left\{ \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta}
+ \frac{\beta_{p,2}}{4 \sigma^2 \eta},
\frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta}
+ \frac{\beta_{p,3}}{4 \sigma^3 \eta}
+ \frac{\tilde \pi_3}{\sigma^3} \right\}
+ \varepsilon + (1 - \varepsilon) \P\big(\check S \in A_p^{3\eta} \mid \cH_0 \big) \\
&\,\leq \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big)
+ 2 \P(\|Z\|_p > t)
+ \min\!\left\{ \frac{3 \tilde \beta_{p,2} t^2}{4 \eta^3}
+ \frac{\beta_{p,2} t^2}{4 \eta^3},
\frac{3 \tilde \beta_{p,3} t^3}{4 \eta^4}
+ \frac{\beta_{p,3} t^3}{4 \eta^4}
+ \frac{\tilde \pi_3 t^3}{\eta^3} \right\}.
\end{align*}
%
Taking a supremum and an outer expectation yields, with $\beta_{p,k} = \E\big[\tilde \beta_{p,k}\big]$ and $\pi_3 = \E[\tilde \pi_3]$,
%
\begin{align*}
&\E^* \left[ \sup_{A \in \cB(\R^d)}
\left\{ \P(\tilde S \in A \mid \cH_0)
- \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big) \right\} \right] \\
&\quad\leq 2 \P(\|Z\|_p > t)
+ \min \left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\}.
\end{align*}
%
Finally, since $\check S = \sum_{i=1}^{n+m} \tilde V_i^{1/2} \tilde Z_i \sim \cN(0,\Sigma + M)$ conditional on $\cH_0$, the conditional Strassen theorem in Lemma~\ref{lem:yurinskii_app_strassen} ensures the existence of $\tilde S$ and $\tilde T \mid \cH_0 \sim \cN(0, \Sigma + M)$ such that
%
\begin{align}
\label{eq:yurinskii_app_approx_modified_martingale}
\P\left(\|\tilde S-\tilde T\|_p>3\eta\right)
&\leq \inf_{t>0} \left\{ 2 \P(\|Z\|_p > t)
+ \min \left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\},
\end{align}
%
since the infimum is attained by continuity of $\|Z\|_p$.
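To spell out the conditional Gaussianity of $\check S$ used in this final step: conditional on $\tilde \cH_{n+m}$, the summands $\check X_i = \tilde V_i^{1/2} \tilde Z_i$ are independent centered Gaussians, so
%
\begin{align*}
\check S \mid \tilde \cH_{n+m}
\sim \cN\Bigg( 0, \sum_{i=1}^{n+m} \tilde V_i \Bigg)
= \cN(0, \Sigma + M),
\end{align*}
%
and since $\Sigma + M$ is $\cH_0$-measurable, the same conditional law holds given $\cH_0$.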
\proofparagraph{conclusion}
We show how to write $\tilde T = (\Sigma + M)^{1/2} W$ where $W \sim \cN(0,I_d)$ and use this representation to construct $T \mid \cH_0 \sim \cN(0, \Sigma)$. By the spectral theorem, let $\Sigma + M = U \Lambda U^\T$ where $U$ is a $d \times d$ orthogonal random matrix and $\Lambda$ is a diagonal $d \times d$ random matrix with diagonal entries satisfying $\lambda_1 \geq \cdots \geq \lambda_r > 0$ and $\lambda_{r+1} = \cdots = \lambda_d = 0$ where $r = \rank (\Sigma + M)$. Let $\Lambda^+$ be the Moore--Penrose pseudo-inverse of $\Lambda$ (obtained by inverting its non-zero elements) and define $W = U (\Lambda^+)^{1/2} U^\T \tilde T + U \tilde W$, where the first $r$ elements of $\tilde W$ are zero and the last $d-r$ elements are i.i.d.\ $\cN(0,1)$ independent of $\tilde T$. Then, it is easy to check that $W \sim \cN(0, I_d)$ and that $\tilde T = (\Sigma + M)^{1/2} W$. Now define $T = \Sigma^{1/2} W$ so
%
\begin{equation}%
\label{eq:yurinskii_app_approx_target}
\P\big(\|T - \tilde T\|_p > \eta\big)
= \P\big(\big\|\big((\Sigma + M)^{1/2} - \Sigma^{1/2} \big) W \big\|_p>\eta \big)
= \delta_p(M, \eta).
\end{equation}
%
Finally, \eqref{eq:yurinskii_app_approx_modified_original}, \eqref{eq:yurinskii_app_approx_modified_martingale}, \eqref{eq:yurinskii_app_approx_target}, the triangle inequality, and a union bound conclude the proof, since by taking an infimum over $M \succeq 0$ (possibly reducing the constant of $1/4$ in \eqref{eq:yurinskii_app_bound_extra_terms} to account for this infimum being potentially unattainable),
%
\begin{align*}
\P\big(\|S-T\|_p > 5\eta\big)
&\leq \P\big(\|\tilde S - \tilde T \|_p > 3\eta \big)
+\P\big(\|S - \tilde S \|_p > \eta\big)
+\P\big(\|T - \tilde T \|_p > \eta\big) \\
&\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big)
+ \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\
&\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\}.
\end{align*}
%
\end{proof}

Lemma~\ref{lem:yurinskii_app_sa_martingale} and the martingale approximation immediately yield Theorem~\ref{thm:yurinskii_sa_dependent}.

\begin{proof}[Theorem~\ref{thm:yurinskii_sa_dependent}]
Apply Lemma~\ref{lem:yurinskii_app_sa_martingale} to the martingale $\sum_{i=1}^{n} \tilde X_i$, noting that $S - \sum_{i=1}^{n} \tilde X_i = U$.
\end{proof}

Bounding the quantities in Theorem~\ref{thm:yurinskii_sa_dependent} gives a user-friendly version as Proposition~\ref{pro:yurinskii_sa_simplified}.

\begin{proof}[Proposition~\ref{pro:yurinskii_sa_simplified}]
Set $M = \nu^2 I_d$ and bound the terms appearing in the main inequality of Theorem~\ref{thm:yurinskii_sa_dependent}.

\proofparagraph{bounding $\P( \|Z\|_p > t )$}
By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, we have $\P( \|Z\|_p > t ) \leq \E[\|Z\|_p] / t \leq \phi_p(d) / t$.

\proofparagraph{bounding $\gamma(M)$}
With $M = \nu^2 I_d$, by Markov's inequality, $\gamma(M) = \P\big(\Omega \npreceq M\big) \leq \P\big(\|\Omega\|_2 > \nu^2 \big) \leq \nu^{-2} \E[\|\Omega\|_2]$.

\proofparagraph{bounding $\delta_p(M, \eta)$}
By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, using $\max_j |M_{j j}| \leq \|M\|_2$ for $M \succeq 0$,
%
\begin{align*}
\delta_{p}(M,\eta)
&= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right)
\leq \frac{\phi_p(d)} {\eta}
\E \left[ \big\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\big\|_2 \right].
\end{align*}
%
Since $M = \nu^2 I_d$, the matrices $\Sigma + M$ and $\Sigma$ share eigenvectors, so
%
\begin{align*}
\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\|_2
&= \max_{1 \leq j \leq d}
\left| \sqrt{\lambda_j(\Sigma) + \nu^2} - \sqrt{\lambda_j(\Sigma)} \right|
\leq \nu
\end{align*}
%
and hence $\delta_{p}(M,\eta) \leq \phi_p(d)\nu / \eta$.

\proofparagraph{bounding $\varepsilon_p(M, \eta)$}
Note that $(M -\Omega)^{1/2}Z$ is a centered Gaussian conditional on $\cH_n$, on the event $\{\Omega \preceq M\}$. We thus have by Markov's inequality, Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, and Jensen's inequality that
%
\begin{align*}
\varepsilon_p(M, \eta)
&= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right)
\leq \frac{1}{\eta} \E\left[ \I\{\Omega \preceq M\}
\E\left[ \big\| (M - \Omega)^{1/2} Z \big\|_p \mid \cH_n \right] \right] \\
&\leq \frac{\phi_p(d)}{\eta} \E\left[ \I\{\Omega \preceq M\}
\max_{1 \leq j \leq d} \sqrt{(M - \Omega)_{j j}} \right]
\leq \frac{\phi_p(d)}{\eta} \E\left[ \sqrt{\|M - \Omega\|_2} \right] \\
&\leq \frac{\phi_p(d)}{\eta} \E\left[ \sqrt{\|\Omega\|_2} + \nu \right]
\leq \frac{\phi_p(d)}{\eta} \left(\sqrt{\E[\|\Omega\|_2]} + \nu \right).
\end{align*}
%
Thus by Theorem~\ref{thm:yurinskii_sa_dependent} and the previous parts,
%
\begin{align*}
\P\big(\|S-T\|_p > 6\eta\big)
&\leq \inf_{t>0} \left\{ 2 \P\big(\|Z\|_p>t\big)
+ \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\
&\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\}
+\P\big(\|U\|_p>\eta\big) \\
&\leq \inf_{t>0} \left\{ \frac{2 \phi_p(d)}{t}
+ \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3},
\frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\
&\quad+ \inf_{\nu > 0} \left\{ \frac{2\E \left[ \|\Omega\|_2 \right]}{\nu^2}
+ \frac{2 \phi_p(d) \nu}{\eta} \right\}
+ \frac{\phi_p(d) \sqrt{\E \left[ \|\Omega\|_2 \right]}}{\eta}
+\P\big(\|U\|_p>\eta\big).
\end{align*}
%
Set $t = 2^{1/3} \phi_p(d)^{1/3} \beta_{p,2}^{-1/3} \eta$ and $\nu = \E[\|\Omega\|_2]^{1/3} \phi_p(d)^{-1/3} \eta^{1/3}$, then replace $\eta$ with $\eta / 6$ to see
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big)
&\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}
+ 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}
+\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
Whenever $\pi_3 = 0$ we can set $t = 2^{1/4} \phi_p(d)^{1/4} \beta_{p,3}^{-1/4} \eta$, and with $\nu$ as above we obtain
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big)
&\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}
+ 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}
+\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
\end{proof}

After establishing Proposition~\ref{pro:yurinskii_sa_simplified}, Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} follow easily.

\begin{proof}[Corollary~\ref{cor:yurinskii_sa_mixingale}]
By Proposition~\ref{pro:yurinskii_sa_simplified} with $\P ( \|U\|_p > \frac{\eta}{6} ) \leq \frac{6}{\eta} \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$.
\end{proof}

\begin{proof}[Corollary~\ref{cor:yurinskii_sa_martingale}]
By Proposition~\ref{pro:yurinskii_sa_simplified} with $U=0$ a.s.
\end{proof}

\begin{proof}[Corollary~\ref{cor:yurinskii_sa_indep}]
By Corollary~\ref{cor:yurinskii_sa_martingale} with $\Omega=0$ a.s.
\end{proof}

We conclude this section with a discussion expanding on the comments made in Remark~\ref{rem:yurinskii_coupling_bounds_probability} on deriving bounds in probability from Yurinskii's coupling. Consider for illustration the independent data second-order result given in Corollary~\ref{cor:yurinskii_sa_indep}: for each $\eta > 0$, there exists $T_n \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying
%
\begin{align*}
\P\big(\|S_n-T_n\|_p > \eta\big)
&\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3},
\end{align*}
%
where here we make explicit the dependence on the sample size $n$ for clarity. The naive approach to converting this into a probability bound for $\|S_n-T_n\|_p$ is to select $\eta$ to ensure the right-hand side is of order $1$, arguing that the probability can then be made arbitrarily small by taking, in this case, $\eta$ to be a large enough multiple of $\beta_{p,2}^{1/3} \phi_p(d)^{2/3}$. However, the somewhat subtle mistake lies in neglecting the fact that the realization of the coupling variable $T_n$ will in general depend on $\eta$, rendering the resulting bound invalid. As an explicit example of this phenomenon, take $1 < \eta < n$ and suppose $\|S_n - T_n(\eta)\| = \eta$ with probability $1 - 1/\eta$ and $\|S_n - T_n(\eta)\| = n$ with probability $1/\eta$. Then $\P\big(\|S_n - T_n(\eta)\| > \eta\big) = 1/\eta$ but it is not true for any $\eta$ that $\|S_n - T_n(\eta)\| \lesssim_\P 1$. We propose in Remark~\ref{rem:yurinskii_coupling_bounds_probability} the following fix. Instead of selecting $\eta$ to ensure the right-hand side is of order $1$, we instead choose it so the bound converges (slowly) to zero. This is easily achieved by taking the naive and incorrect bound and multiplying by some divergent sequence $R_n$. The resulting inequality reads, in the case of Corollary~\ref{cor:yurinskii_sa_indep} with $\eta = \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n$,
%
\begin{align*}
\P\Big(\|S_n-T_n\|_p > \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n \Big)
&\leq \frac{24}{R_n} \to 0.
\end{align*}
%
We thus recover, for the price of a rate which is slower by an arbitrarily small amount, a valid upper bound in probability, as we can immediately conclude that
%
\begin{align*}
\|S_n-T_n\|_p \lesssim_\P \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n.
\end{align*}

\subsection{Strong approximation for martingale empirical processes}

We begin by presenting some calculations omitted from the main text relating to the motivating example of kernel density estimation with i.i.d.\ data. First, the bias is bounded as
%
\begin{align*}
\big| \E \big[ \hat g(x) \big] - g(x) \big|
&= \left| \int_{\frac{-x}{h}}^{\frac{1-x}{h}} K(\xi) \diff \xi - 1 \right|
\leq 2 \int_{\frac{a}{h}}^\infty \frac{1}{\sqrt{2 \pi}} e^{-\frac{\xi^2}{2}} \diff \xi
\leq \frac{h}{a} \sqrt{\frac{2}{\pi}} e^{-\frac{a^2}{2 h^2}}.
\end{align*}
%
Next, we do the calculations necessary to apply Corollary~\ref{cor:yurinskii_sa_indep}. Define $k_{i j} = \frac{1}{n h} K \left( \frac{X_i - x_j}{h} \right)$ and $k_i = (k_{i j} : 1 \leq j \leq N)$. Then $\|k_i\|_\infty \leq \frac{1}{n h \sqrt{2 \pi}}$ a.s.\ and $\E[\|k_i\|_2^2] \leq \frac{N}{n^2 h} \int_{-\infty}^\infty K(\xi)^2 \diff \xi \leq \frac{N}{2 n^2 h \sqrt{\pi}}$.
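Both bounds follow from elementary properties of the standard Gaussian kernel $K(\xi) = \frac{1}{\sqrt{2\pi}} e^{-\xi^2/2}$, namely
%
\begin{align*}
\sup_{\xi \in \R} K(\xi) = \frac{1}{\sqrt{2 \pi}}
\qquad\text{and}\qquad
\int_{-\infty}^\infty K(\xi)^2 \diff \xi
= \frac{1}{2 \pi} \int_{-\infty}^\infty e^{-\xi^2} \diff \xi
= \frac{1}{2 \sqrt{\pi}},
\end{align*}
%
the first giving the $\|k_i\|_\infty$ bound directly and the second giving the second-moment bound after the change of variables $\xi = (u - x_j)/h$.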
Let $V = \Var[k_i] \in \R^{N \times N}$, so assuming that $1/h \geq \log 2 N$, by Lemma~\ref{lem:yurinskii_app_gaussian_useful} we bound
%
\begin{align*}
\beta_{\infty,2}
&= n \E\left[\| k_i \|^2_2 \| k_i \|_\infty \right]
+ n \E \left[ \|V^{1/2} Z \|^2_2 \|V^{1/2} Z \|_\infty \right]
\leq \frac{N}{\sqrt{8} n^2 h^2 \pi}
+ \frac{4 N \sqrt{\log 2 N}}{\sqrt{8} n^2 h^{3/2} \pi^{3/4}}
\leq \frac{N}{n^2 h^2}.
\end{align*}
%
Finally, we verify the stochastic continuity bounds. By the Lipschitz property of $K$, it is easy to show that for $x,x' \in \cX$ we have $\left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) - \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right| \lesssim \frac{|x-x'|}{h^2}$ almost surely, and also that $\E \Big[ \left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) - \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right|^2 \Big] \lesssim \frac{|x-x'|^2}{h^3}$. By chaining with the Bernstein--Orlicz norm and polynomial covering numbers,
%
\begin{align*}
\sup_{|x-x'| \leq \delta} \big\|S(x) - S(x')\big\|_\infty
\lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}}
\end{align*}
%
whenever $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$. By a Gaussian process maximal inequality \citep[Corollary~2.2.8]{van1996weak} the same bound holds for $T(x)$ with
%
\begin{align*}
\sup_{|x-x'| \leq \delta} \big\|T(x) - T(x')\big\|_\infty
\lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}}.
\end{align*}

\begin{proof}[Lemma~\ref{lem:yurinskii_kde_eigenvalue}]
For $x, x' \in [a, 1-a]$, the scaled covariance function of this nonparametric estimator is
%
\begin{align*}
n h\, \Cov\big[\hat g(x), \hat g(x')\big]
&= \frac{1}{h} \E \left[ K \left( \frac{X_i - x}{h} \right)
K \left( \frac{X_i - x'}{h} \right) \right] \\
&\quad- \frac{1}{h} \E \left[ K \left( \frac{X_i - x}{h} \right) \right]
\E \left[ K \left( \frac{X_i - x'}{h} \right) \right] \\
&= \frac{1}{2 \pi} \int_{\frac{-x}{h}}^{\frac{1-x}{h}}
\exp \left( - \frac{t^2}{2} \right)
\exp \left( - \frac{1}{2} \left( t + \frac{x - x'}{h} \right)^2 \right) \diff t
- h I(x) I(x')
\end{align*}
%
where $I(x) = \frac{1}{\sqrt{2 \pi}} \int_{-x/h}^{(1-x)/h} e^{-t^2/2} \diff t$. Completing the square and a substitution gives
%
\begin{align*}
n h\, \Cov\big[\hat g(x), \hat g(x')\big]
&= \frac{1}{2 \pi}
\exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right)
\int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} \exp \left(-t^2\right) \diff t
- h I(x) I(x').
\end{align*}
%
Now we show that since $x, x'$ are not too close to the boundary of $[0,1]$, the limits in the above integral can be replaced by $\pm \infty$. Note that $\frac{-x-x'}{2h} \leq \frac{-a}{h}$ and $\frac{2-x-x'}{2h} \geq \frac{a}{h}$ so
%
\begin{align*}
\int_{-\infty}^{\infty} \exp \left(-t^2\right) \diff t
- \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} \exp \left(-t^2\right) \diff t
\leq 2 \int_{a/h}^\infty \exp \left(-t^2\right) \diff t
\leq \frac{h}{a} \exp \left(- \frac{a^2}{h^2}\right).
\end{align*}
%
Therefore, since $\int_{-\infty}^{\infty} e^{-t^2} \diff t = \sqrt \pi$,
%
\begin{align*}
\left| n h\, \Cov\big[\hat g(x), \hat g(x')\big]
- \frac{1}{2 \sqrt \pi}
\exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right)
+ h I(x) I(x') \right|
\leq \frac{h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right).
\end{align*}
%
Define the $N \times N$ matrix $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} \exp \left( - \frac{1}{4} \left( \frac{x_i-x_j}{h} \right)^2 \right)$.
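Recall also the Rayleigh quotient characterization of the smallest eigenvalue: for a symmetric matrix $A \in \R^{N \times N}$,
%
\begin{align*}
\lambda_{\min}(A) &= \inf_{b \in \R^N,\, b \neq 0} \frac{b^\T A b}{b^\T b},
\end{align*}
%
so test vectors supported on at most $N$ points in the display below yield upper bounds on $\lambda_{\min}(\tilde\Sigma)$, up to the factor $\frac{1}{2\sqrt{\pi}}$.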
By \citet[Proposition~2.4, Proposition~2.5, and Equation~2.10]{baxter1994norm}, with $\cB_k = \big\{b \in \R^\Z : \sum_{i \in \Z} \I\{b_i \neq 0\} \leq k \big\}$, % \begin{align*} \inf_{k \in \N} \inf_{b \in \R^k} \frac{\sum_{i=1}^k \sum_{j=1}^k b_i b_j \, e^{-\lambda(i-j)^2}} {\sum_{i=1}^k b_i^2} = \sqrt{\frac{\pi}{\lambda}} \sum_{i=-\infty}^{\infty} \exp \left( - \frac{(\pi e + 2 \pi i)^2}{4 \lambda} \right). \end{align*} % We use Riemann sums, noting that $\pi e + 2 \pi x = 0$ at $x = -e/2 \approx -1.359$. Consider the substitutions $\Z \cap (-\infty, -3] \mapsto (-\infty, -2]$, $\{-2, -1\} \mapsto \{-2, -1\}$, and $\Z \cap [0, \infty) \mapsto [-1, \infty)$. % \begin{align*} \sum_{i \in \Z} e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} &\leq \int_{-\infty}^{-2} e^{ - (\pi e + 2 \pi x)^2/4 \lambda} \diff x + e^{- (\pi e - 4 \pi)^2/4 \lambda} \\ &\quad+ e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} + \int_{-1}^{\infty} e^{ -(\pi e + 2 \pi x)^2 / 4 \lambda} \diff x. \end{align*} % Now use the substitution $t = \frac{\pi e + 2 \pi x}{2 \sqrt \lambda}$ and suppose $\lambda < 1$, yielding % \begin{align*} \sum_{i \in \Z} e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} &\leq \frac{\sqrt \lambda}{\pi} \int_{-\infty}^{\frac{\pi e - 4 \pi}{2 \sqrt \lambda}} e^{-t^2} \diff t + e^{- (\pi e - 4 \pi)^2/4 \lambda} + e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} + \frac{\sqrt \lambda}{\pi} \int_{\frac{\pi e - 2 \pi}{2 \sqrt \lambda}}^{\infty} e^{-t^2} \diff t \\ &\leq \left( 1 + \frac{1}{\pi} \frac{\lambda}{4 \pi - \pi e} \right) e^{-(\pi e - 4 \pi)^2 / 4 \lambda} + \left( 1 + \frac{1}{\pi} \frac{\lambda}{\pi e - 2 \pi} \right) e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \\ &\leq \frac{13}{12} e^{-(\pi e - 4 \pi)^2 / 4 \lambda} + \frac{8}{7} e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \leq \frac{9}{4} \exp \left( - \frac{5}{4 \lambda} \right). \end{align*} % Therefore % \begin{align*} \inf_{k \in \N} \inf_{b \in \cB_k} \frac{\sum_{i \in \Z} \sum_{j \in \Z} b_i b_j \, e^{-\lambda(i-j)^2}} {\sum_{i \in \Z} b_i^2} < \frac{4}{\sqrt \lambda} \exp \left( - \frac{5}{4 \lambda} \right) < 4 e^{-1/\lambda}. \end{align*} % From this and since $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} e^{-\lambda(i-j)^2}$ with $\lambda = \frac{1}{4(N-1)^2 h^2} \leq \frac{\delta^2}{h^2}$, for each $h$ and some $\delta \leq h$, we have $\lambda_{\min}(\tilde\Sigma) \leq 2 e^{-h^2/\delta^2}$. Recall that % \begin{align*} \left| \Sigma_{i j} - \tilde\Sigma_{i j} + h I(x_i) I(x_j) \right| \leq \frac{h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right). \end{align*} % For any positive semi-definite $N \times N$ matrices $A$ and $B$ and vector $v$ we have $\lambda_{\min}(A - v v^\T) \leq \lambda_{\min}(A)$ and $\lambda_{\min}(B) \leq \lambda_{\min}(A) + \|B-A\|_2 \leq \lambda_{\min}(A) + N \|B-A\|_{\max}$. Hence with $I_i = I(x_i)$, % \begin{align*} \lambda_{\min}(\Sigma) &\leq \lambda_{\min}(\tilde\Sigma - h I I^\T) + \frac{N h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right) \leq 2 e^{-h^2/\delta^2} + \frac{h}{\pi a \delta} e^{-a^2 / h^2}. \end{align*} \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_emp_proc}] Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$. Using a union bound, we can write % \begin{align*} &\P\left(\sup_{f \in \cF} \big| S(f) - T(f) \big| \geq 2t + \eta \right) \leq \P\left(\sup_{f \in \cF_\delta} \big| S(f) - T(f) \big| \geq \eta \right) \\ &\qquad\qquad+ \P\left(\sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \geq t \right) + \P\left(\sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \geq t \right). 
\end{align*}

\proofparagraph{bounding the difference on $\cF_\delta$}
We apply Corollary~\ref{cor:yurinskii_sa_martingale} with $p = \infty$ to the martingale difference sequence $\cF_\delta(X_i) = \big(f(X_i) : f \in \cF_\delta\big)$ which takes values in $\R^{|\cF_\delta|}$. Square integrability can be assumed, as otherwise $\beta_\delta = \infty$ and the bound is trivial. Note $\sum_{i=1}^n \cF_\delta(X_i) = S(\cF_\delta)$ and $\phi_\infty(\cF_\delta) \leq \sqrt{2 \log 2 |\cF_\delta|}$. Therefore there exists a conditionally Gaussian vector $T(\cF_\delta)$ with the same covariance structure as $S(\cF_\delta)$ conditional on $\cH_0$ satisfying
%
\begin{align*}
\P\left( \sup_{f \in \cF_\delta} \big| S(f) - T(f) \big| \geq \eta \right)
&\leq \frac{24\beta_\delta^{\frac{1}{3}} (2\log 2 |\cF_\delta|)^{\frac{1}{3}}}{\eta}
+ 17\left(\frac{\sqrt{2 \log 2 |\cF_\delta|}
\sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{\frac{2}{3}}.
\end{align*}

\proofparagraph{bounding the fluctuations in $S(f)$}
Since $\big\| S(f) - S(f') \big\|_\psi \leq L d(f,f')$, by Theorem~2.2.4 in \citet{van1996weak}
%
\begin{align*}
\left\| \sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \right\|_\psi
&\leq C_\psi L \left( \int_0^\delta \psi^{-1}(N_\varepsilon) \diff{\varepsilon}
+ \delta \psi^{-1}(N_\delta^2) \right)
= C_\psi L J_\psi(\delta).
\end{align*}
%
Then, by Markov's inequality and the definition of the Orlicz norm,
%
\begin{align*}
\P\left( \sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \geq t \right)
&\leq \psi\left(\frac{t}{C_\psi L J_\psi(\delta)} \right)^{-1}.
\end{align*}

\proofparagraph{bounding the fluctuations in $T(f)$}
By the Vorob'ev--Berkes--Philipp theorem \citep{dudley1999uniform}, $T(\cF_\delta)$ extends to a conditionally Gaussian process $T(f)$. Firstly, since $\bigvvvert T(f) - T(f') \bigvvvert_2 \leq L d(f,f')$ conditionally on $\cH_0$, and $T(f)$ is a conditional Gaussian process, we have $\big\| T(f) - T(f') \big\|_{\psi_2} \leq 2 L d(f,f')$ conditional on $\cH_0$ by \citet[Chapter~2.2, Complement~1]{van1996weak}, where $\psi_2(x) = \exp(x^2) - 1$. Thus, again by Theorem~2.2.4 in \citet{van1996weak}, conditioning on $\cH_0$,
%
\begin{align*}
\left\| \sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \right\|_{\psi_2}
&\leq C_1 L \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon}
= C_1 L J_2(\delta)
\end{align*}
%
for some universal constant $C_1 > 0$, where we used $\psi_2^{-1}(x) = \sqrt{\log(1+x)}$ and monotonicity of covering numbers. Then by Markov's inequality and the definition of the Orlicz norm,
%
\begin{align*}
\P\left( \sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \geq t \right)
&\leq \left( \exp\left( \frac{t^2}{C_1^2 L^2 J_2(\delta)^2} \right) - 1 \right)^{-1} \!\wedge 1
\leq 2 \exp\left( \frac{-t^2}{C_1^2 L^2 J_2(\delta)^2} \right).
\end{align*}
%
\proofparagraph{conclusion}
The result follows by scaling $t$ and $\eta$ and enlarging constants if necessary.
%
\end{proof}

\subsection{Applications to nonparametric regression}

\begin{proof}[Proposition~\ref{pro:yurinskii_series}]
Proceed according to the decomposition in Section~\ref{sec:yurinskii_series}. By stationarity and Lemma~SA-2.1 in \citet{cattaneo2020large}, we have $\sup_w \|p(w)\|_1 \lesssim 1$ and also $\|H\|_1 \lesssim n/k$ and $\|H^{-1}\|_1 \lesssim k/n$.

\proofparagraph{bounding $\beta_{\infty,2}$ and $\beta_{\infty,3}$}
Set $X_i = p(W_i) \varepsilon_i$ so $S = \sum_{i=1}^n X_i$, and set $\sigma^2_i = \sigma^2(W_i)$ and $V_i = \Var[X_i \mid \cH_{i-1}] = \sigma_i^2 p(W_i) p(W_i)^\T$.
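The displayed formula for $V_i$ is a one-line conditional-variance computation: given that the filtration makes $W_i$ measurable with respect to $\cH_{i-1}$ (as in the series setup) and $\E[\varepsilon_i \mid \cH_{i-1}] = 0$ with $\E[\varepsilon_i^2 \mid \cH_{i-1}] = \sigma^2(W_i)$,
%
\begin{align*}
\Var[X_i \mid \cH_{i-1}]
= p(W_i) p(W_i)^\T \, \E\big[\varepsilon_i^2 \mid \cH_{i-1}\big]
= \sigma_i^2 \, p(W_i) p(W_i)^\T.
\end{align*}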
Recall from Corollary~\ref{cor:yurinskii_sa_martingale} that for $r \in \{2,3\}$,
%
\begin{align*}
\beta_{\infty,r} = \sum_{i=1}^n \E\left[\| X_i \|^r_2 \| X_i \|_\infty
+ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right]
\end{align*}
%
with $Z_i \sim \cN(0, I_k)$ i.i.d.\ and independent of $V_i$. For the first term, we use $\sup_w \|p(w)\|_2 \lesssim 1$ and the bounded conditional moments of $\varepsilon_i$:
%
\begin{align*}
\E\left[ \| X_i \|^r_2 \| X_i \|_\infty \right]
&\leq \E\left[ |\varepsilon_i|^{r+1} \| p(W_i) \|^{r+1}_2 \right]
\lesssim 1.
\end{align*}
%
For the second term, apply Lemma~\ref{lem:yurinskii_app_gaussian_useful} conditionally on $\cH_n$ with $\sup_w \|p(w)\|_2 \lesssim 1$ to see
%
\begin{align*}
&\E\left[ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right]
\lesssim \sqrt{\log 2k} \
\E\left[ \max_{1 \leq j \leq k} (V_i)_{j j}^{1/2}
\bigg( \sum_{j=1}^k (V_i)_{j j} \bigg)^{r/2} \right] \\
&\quad\lesssim \sqrt{\log 2k} \
\E\left[ \sigma_i^{r+1} \max_{1 \leq j \leq k} p(W_i)_j
\bigg( \sum_{j=1}^k p(W_i)_{j}^2 \bigg)^{r/2} \right]
\lesssim \sqrt{\log 2k} \ \E\left[ \sigma_i^{r+1} \right]
\lesssim \sqrt{\log 2k}.
\end{align*}
%
Putting these together yields
%
$\beta_{\infty,2} \lesssim n \sqrt{\log 2k}$ and $\beta_{\infty,3} \lesssim n \sqrt{\log 2k}$.

\proofparagraph{bounding $\Omega$}
Set $\Omega = \sum_{i=1}^n \big(V_i - \E[V_i] \big)$ so
%
\begin{align*}
\Omega &= \sum_{i=1}^n \big(\sigma_i^2 p(W_i)p(W_i)^\T
- \E\left[ \sigma_i^2 p(W_i)p(W_i)^\T \right]\big).
\end{align*}
%
Observe that $\Omega_{j l}$ is the sum of a zero-mean strictly stationary $\alpha$-mixing sequence and so $\E[\Omega_{j l}^2] \lesssim n$ by Lemma~\ref{lem:yurinskii_app_variance_mixing}%
\ref{it:yurinskii_app_variance_mixing_bounded}. Since the basis functions satisfy Assumption~3 in \citet{cattaneo2020large}, $\Omega$ has a bounded number of non-zero entries in each row, so by Jensen's inequality
%
\begin{align*}
\E\left[ \|\Omega\|_2 \right]
&\leq \E\left[ \|\Omega\|_\rF \right]
\leq \left( \sum_{j=1}^k \sum_{l=1}^k \E\left[ \Omega_{j l}^2 \right] \right)^{1/2}
\lesssim \sqrt{n k}.
\end{align*}
%
\proofparagraph{strong approximation}
By Corollary~\ref{cor:yurinskii_sa_martingale} and the previous parts, with any sequence $R_n \to \infty$,
%
\begin{align*}
\|S - T \|_\infty
&\lesssim_\P \beta_{\infty,2}^{1/3} (\log 2k)^{1/3} R_n
+ \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n \\
&\lesssim_\P n^{1/3} \sqrt{\log 2k} R_n
+ (n k)^{1/4} \sqrt{\log 2k} R_n.
\end{align*}
%
If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then the third-order version of Corollary~\ref{cor:yurinskii_sa_martingale} applies since
%
\begin{align*}
\pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3}
\E \Big[ \big| \E [ X_i^\kappa \mid \cH_{i-1} ] \big| \Big]
= \sum_{i=1}^{n} \sum_{|\kappa| = 3}
\E \Big[ \big| p(W_i)^\kappa \, \E [ \varepsilon_i^3 \mid \cH_{i-1} ] \big| \Big]
= 0,
\end{align*}
%
giving
%
\begin{align*}
\|S - T \|_\infty
&\lesssim_\P \beta_{\infty,3}^{1/4} (\log 2k)^{3/8} R_n
+ \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n
\lesssim_\P (n k)^{1/4} \sqrt{\log 2k} R_n.
\end{align*}
%
By H{\"o}lder's inequality and with $\|H^{-1}\|_1 \lesssim k/n$ we have
%
\begin{align*}
\sup_{w \in \cW} \left| p(w)^\T H^{-1} S - p(w)^\T H^{-1} T \right|
&\leq \sup_{w \in \cW} \|p(w)\|_1 \|H^{-1}\|_1 \| S - T \|_\infty
\lesssim n^{-1} k \| S - T \|_\infty.
\end{align*}

\proofparagraph{convergence of $\hat H$}
We have $\hat H - H = \sum_{i=1}^n \big(p(W_i)p(W_i)^\T - \E\left[ p(W_i)p(W_i)^\T \right]\big)$.
Observe that $(\hat H - H)_{j l}$ is the sum of a zero-mean strictly stationary $\alpha$-mixing sequence and so $\E[(\hat H - H)_{j l}^2] \lesssim n$ by Lemma~\ref{lem:yurinskii_app_variance_mixing}% \ref{it:yurinskii_app_variance_mixing_bounded}. Since the basis functions satisfy Assumption~3 in \citet{cattaneo2020large}, $\hat H-H$ has a bounded number of non-zero entries in each row and so by Jensen's inequality % \begin{align*} \E\left[ \|\hat H-H\|_1 \right] &= \E\left[ \max_{1 \leq i \leq k} \sum_{j=1}^k \big|(\hat H-H)_{i j}\big| \right] \leq \E\left[ \sum_{1 \leq i \leq k} \Bigg( \sum_{j=1}^k |(\hat H-H)_{i j}| \Bigg)^2 \right]^{\frac{1}{2}} \lesssim \sqrt{n k}. \end{align*} \proofparagraph{bounding the matrix term} Note $\|\hat H^{-1}\|_1 \leq \|H^{-1}\|_1 + \|\hat H^{-1}\|_1 \|\hat H-H\|_1 \|H^{-1}\|_1$ so by the previous part, we deduce % \begin{align*} \|\hat H^{-1}\|_1 \leq \frac{\|H^{-1}\|_1} {1 - \|\hat H-H\|_1 \|H^{-1}\|_1} \lesssim_\P \frac{k/n} {1 - \sqrt{n k}\, k/n} \lesssim_\P \frac{k}{n} \end{align*} % as $k^3 / n \to 0$. Note that by the martingale structure, since $p(W_i)$ is bounded and supported on a region with volume at most of the order $1/k$, and as $W_i$ has a Lebesgue density, % \begin{align*} \Var[T_j] &= \Var[S_j] = \Var\left[ \sum_{i=1}^n \varepsilon_i p(W_i)_j \right] = \sum_{i=1}^n \E\left[ \sigma_i^2 p(W_i)_j^2 \right] \lesssim \frac{n}{k}. \end{align*} % So by the Gaussian maximal inequality in Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, $\|T\|_\infty \lesssim_\P \sqrt{\frac{n \log 2k}{k}}$. Since $k^3/n \to 0$, % \begin{align*} \sup_{w \in \cW} \left| p(w)^\T (\hat H^{-1} - H^{-1}) S \right| &\leq \sup_{w \in \cW} \|p(w)^\T\|_1 \|\hat H^{-1}\|_1 \|\hat H - H\|_1 \|H^{-1}\|_1 \|S - T\|_\infty \\ &\quad+ \sup_{w \in \cW} \|p(w)^\T\|_1 \|\hat H^{-1}\|_1 \|\hat H - H\|_1 \|H^{-1}\|_1 \|T\|_\infty \\ &\lesssim_\P \frac{k^2}{n^2} \sqrt{n k} \!\left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) \!+ \frac{k^2}{n^2} \sqrt{n k} \sqrt{\frac{n \log 2k}{k}} \\ &\lesssim_\P \frac{k^2}{n} \sqrt{\log 2k}. \end{align*} % \proofparagraph{conclusion of the main result} By the previous parts, with $G(w) = p(w)^\T H^{-1} T$, % \begin{align*} &\sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - p(w)^\T H^{-1} T \right| \\ &\quad= \sup_{w \in \cW} \left| p(w)^\T H^{-1} (S - T) + p(w)^\T (\hat H^{-1} - H^{-1}) S + \Bias(w) \right| \\ &\quad\lesssim_\P \frac{k}{n} \|S - T\|_\infty + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P \frac{k}{n} \left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) R_n + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P n^{-2/3} k \sqrt{\log 2k} R_n + n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P n^{-2/3} k \sqrt{\log 2k} R_n + \sup_{w \in \cW} |\Bias(w)| \end{align*} % since $k^3/n \to 0$. If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - p(w)^\T H^{-1} T \right| &\lesssim_\P \frac{k}{n} \|S - T\|_\infty + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\lesssim_\P n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n + \sup_{w \in \cW} |\Bias(w)|. \end{align*} % Finally, we verify the variance bounds for the Gaussian process. 
With $\sigma^2(w)$ bounded above, % \begin{align*} \Var[G(w)] &= p(w)^\T H^{-1} \Var\left[ \sum_{i=1}^n p(W_i) \varepsilon_i \right] H^{-1} p(w) \\ &= p(w)^\T H^{-1} \E\left[\sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) \right] H^{-1} p(w) \\ &\lesssim \|p(w)\|_2^2 \|H^{-1}\|_2^2 \|H\|_2 \lesssim k/n. \end{align*} % Similarly, since $\sigma^2(w)$ is bounded away from zero, % \begin{align*} \Var[G(w)] &\gtrsim \|p(w)\|_2^2 \|H^{-1}\|_2^2 \|H^{-1}\|_2^{-1} \gtrsim k/n. \end{align*} \proofparagraph{bounding the bias} We delegate the task of carefully deriving bounds on the bias to \citet{cattaneo2020large}, who provide a high-level assumption on the approximation error (their Assumption~4) and use it to derive bias bounds (their Section~3) of the form $\sup_{w \in \cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$. This assumption is then verified for B-splines, wavelets, and piecewise polynomials in their supplemental appendix. \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_series_feasible}] \proofparagraph{infeasible supremum approximation} Provided that the bias is negligible, for all $s > 0$ we have % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| \\ &\quad\leq \sup_{t \in \R} \P\left( t \leq \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t + s \right) + \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)-G(w)}{\sqrt{\rho(w,w)}} \right| > s \right). \end{align*} % By the Gaussian anti-concentration result given as Corollary~2.1 in \citet{chernozhukov2014anti} applied to a discretization of $\cW$, the first term is at most $s \sqrt{\log n}$ up to a constant factor, and the second term converges to zero whenever $\frac{1}{s} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} \to 0$. Thus a suitable value of $s$ exists whenever $\frac{k^3(\log n)^6}{n} \to 0$. \proofparagraph{feasible supremum approximation} By \citet[Lemma~3.1]{chernozhukov2013gaussian} and discretization, with $\rho(w,w') = \E[\hat\rho(w,w')]$, % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \biggm| \bW, \bY \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| \\ &\quad\lesssim_\P \sup_{w,w' \in \cW} \left| \frac{\hat\rho(w,w')} {\sqrt{\hat\rho(w,w)\hat\rho(w',w')}} - \frac{\rho(w,w')} {\sqrt{\rho(w,w)\rho(w',w')}} \right|^{1/3} (\log n)^{2/3} \\ &\quad\lesssim_\P \left(\frac n k \right)^{1/3} \sup_{w,w' \in \cW} |\hat\rho(w,w') - \rho(w,w')|^{1/3} (\log n)^{2/3} \\ &\quad\lesssim_\P \left( \frac{n (\log n)^2}{k} \right)^{1/3} \sup_{w,w' \in \cW} \left| p(w)^\T \hat H^{-1} \left( \hat{V}[S] - \Var[S] \right) \hat H^{-1} p(w') \right|^{1/3} \\ &\quad\lesssim_\P \left( \frac{k (\log n)^2}{n} \right)^{1/3} \left\| \hat{V}[S] - \Var[S] \right\|_2^{1/3}, \end{align*} % and vanishes in probability whenever $\frac{k (\log n)^2}{n} \big\| \hat{V}[S] - \Var[S] \big\|_2 \to_\P 0$.
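Concretely, the plug-in variance estimator appearing below replaces the conditional variance function with an estimate $\hat\sigma^2$; as can be read off the next display, the estimator and its target are % \begin{align*} \hat{V}[S] &= \sum_{i=1}^n p(W_i) p(W_i)^\T \hat\sigma^2(W_i), & \Var[S] &= n \E\left[ p(W_i) p(W_i)^\T \sigma^2(W_i) \right], \end{align*} % the latter by stationarity and the martingale difference structure of $p(W_i) \varepsilon_i$.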
For the plug-in estimator, % \begin{align*} &\left\| \hat{V}[S] - \Var[S] \right\|_2 = \left\| \sum_{i=1}^n p(W_i) p(W_i)^\T \hat\sigma^2(W_i) - n \E\left[ p(W_i) p(W_i)^\T \sigma^2(W_i) \right] \right\|_2 \\ &\quad\lesssim_\P \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \, \big\| \hat H \big\|_2 \\ &\qquad+ \left\| \sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) - n \E\left[ p(W_i) p(W_i)^\T \sigma^2(W_i) \right] \right\|_2 \\ &\quad\lesssim_\P \frac{n}{k} \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| + \sqrt{n k}, \end{align*} % where the second term is bounded by the same argument used to bound $\|\hat H - H\|_1$. Thus, the feasible approximation is valid whenever $(\log n)^2 \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$ and $\frac{k^3 (\log n)^4}{n} \to 0$. The validity of the uniform confidence band follows immediately. % \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_local_poly}] We apply Proposition~\ref{pro:yurinskii_emp_proc} with the metric $d(f_w, f_{w'}) = \|w-w'\|_2$ and the function class % \begin{align*} \cF &= \left\{ (W_i, \varepsilon_i) \mapsto e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \varepsilon_i :\ w \in \cW \right\}, \end{align*} % with $\psi$ chosen as a suitable Bernstein Orlicz function. \proofparagraph{bounding $H(w)^{-1}$} Recall that $H(w) = \sum_{i=1}^n \E[K_h(W_i-w) p_h(W_i-w)p_h(W_i-w)^\T]$ and let $a(w) \in \R^k$ with $\|a(w)\|_2 = 1$. Since the density of $W_i$ is bounded away from zero on $\cW$, % \begin{align*} a(w)^\T H(w) a(w) &= n \E\left[ \big( a(w)^\T p_h(W_i-w) \big)^2 K_h(W_i-w) \right] \\ &\gtrsim n \int_\cW \big( a(w)^\T p_h(u-w) \big)^2 K_h(u-w) \diff{u} \gtrsim n \int_{\frac{\cW-w}{h}} \big( a(w)^\T p(u) \big)^2 K(u) \diff{u}. \end{align*} % This integral is continuous in $a(w)$ on the compact set $\{\|a(w)\|_2 = 1\}$, and $p(u)$ forms a polynomial basis, so $a(w)^\T p(u)$ has finitely many zeroes. Since $K(u)$ is compactly supported and $h \to 0$, the above integral is eventually strictly positive for all $w \in \cW$, and hence is bounded below uniformly in $w \in \cW$ by a positive constant. Therefore $\sup_{w \in \cW} \|H(w)^{-1}\|_2 \lesssim 1/n$. \proofparagraph{bounding $\beta_\delta$} Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$ with cardinality $|\cF_\delta| \asymp \delta^{-m}$ and let $\cF_\delta(W_i, \varepsilon_i) = \big(f(W_i, \varepsilon_i) : f\in \cF_\delta\big)$. Define the truncated errors $\tilde\varepsilon_i = \varepsilon_i\I\{-a \log n \leq \varepsilon_i \leq b \log n\}$ and note that $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ implies that $\P(\exists i: \tilde\varepsilon_i \neq \varepsilon_i) \lesssim n^{1-(a \wedge b)/C_\varepsilon}$. Hence, by choosing $a$ and $b$ large enough, with high probability, we can replace all $\varepsilon_i$ by $\tilde\varepsilon_i$. Further, it is always possible to increase either $a$ or $b$ along with some randomization to ensure that $\E[\tilde\varepsilon_i] = 0$.
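To make the truncation probability explicit, a short Markov inequality computation (a sketch, using only the assumed exponential moment bound and a union bound over $i \leq n$) gives % \begin{align*} \P(\exists i : \tilde\varepsilon_i \neq \varepsilon_i) &\leq \sum_{i=1}^n \P\big( |\varepsilon_i| > (a \wedge b) \log n \big) \leq n \, \E\big[ e^{|\varepsilon_i|/C_\varepsilon} \big] e^{-(a \wedge b) (\log n)/C_\varepsilon} \lesssim n^{1-(a \wedge b)/C_\varepsilon}, \end{align*} % matching the rate stated above.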
Since $K$ is bounded and compactly supported, $W_i$ has a bounded density, and $|\tilde\varepsilon_i| \lesssim \log n$, % \begin{align*} \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2 &= \E\left[ \left| e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \tilde\varepsilon_i \right|^2 \right]^{1/2} \\ &\leq \E\left[ \|H(w)^{-1}\|_2^2 K_h(W_i-w)^2 \|p_h(W_i-w)\|_2^2 \sigma^2(W_i) \right]^{1/2} \\ &\lesssim n^{-1} \E\left[ K_h(W_i-w)^2 \right]^{1/2} \lesssim n^{-1} h^{-m / 2}, \\ \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty &\leq \bigvvvert \|H(w)^{-1}\|_2 K_h(W_i-w) \|p_h(W_i-w)\|_2 |\tilde\varepsilon_i| \bigvvvert_\infty \\ &\lesssim n^{-1} \bigvvvert K_h(W_i-w) \bigvvvert_\infty \log n \lesssim n^{-1} h^{-m} \log n. \end{align*} % Therefore % \begin{align*} \E\left[ \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty \right] &\leq \!\sum_{f\in\cF_\delta} \!\bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2^2 \max_{f\in\cF_\delta} \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty \!\lesssim n^{-3} \delta^{-m} h^{-2m} \log n. \end{align*} % Let $V_i(\cF_\delta) = \E\big[\cF_\delta(W_i, \tilde\varepsilon_i) \cF_\delta(W_i, \tilde\varepsilon_i)^\T \mid \cH_{i-1}\big]$ and $Z_i \sim \cN(0, I_{|\cF_\delta|})$ be i.i.d.\ and independent of $\cH_n$. Note that $V_i(f,f) = \E[f(W_i, \tilde\varepsilon_i)^2 \mid \cH_{i-1}] \lesssim n^{-2} h^{-2m}$ and $\E[V_i(f,f)] = \E[f(W_i, \tilde\varepsilon_i)^2] \lesssim n^{-2} h^{-m}$. Thus by Lemma~\ref{lem:yurinskii_app_gaussian_useful}, % \begin{align*} \E\left[ \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \right] &= \E\left[ \E\left[ \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \mid \cH_n \right] \right] \\ &\leq 4 \sqrt{\log 2|\cF_\delta|} \,\E\Bigg[ \max_{f \in \cF_\delta} \sqrt{V_i(f,f)} \sum_{f \in \cF_\delta} V_i(f,f) \Bigg] \\ &\lesssim n^{-3} h^{-2m} \delta^{-m} \sqrt{\log(1/\delta)}. \end{align*} % Thus since $\log(1/\delta) \asymp \log(1/h) \asymp\log n$, % \begin{align*} \beta_\delta &= \sum_{i=1}^n \E\left[ \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty + \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \right] \lesssim \frac{\log n} {n^2 h^{2m} \delta^m}. \end{align*} \proofparagraph{bounding $\Omega_\delta$} Let $C_K>0$ be the radius of an $\ell^2$-ball containing the support of $K$ and note that % \begin{align*} \left| V_i(f,f') \right| &= \Big| \E\Big[ e_1^\T H(w)^{-1} p_h(W_i-w) e_1^\T H(w')^{-1} p_h(W_i-w') \\ &\qquad\times K_h(W_i-w) K_h(W_i-w') \tilde\varepsilon_i^2 \Bigm| \cH_{i-1} \Big] \Big| \\ &\lesssim n^{-2} K_h(W_i-w) K_h(W_i-w') \\ &\lesssim n^{-2} h^{-m} K_h(W_i-w) \I\{\|w-w'\|_2 \leq 2 C_K h\}. \end{align*} % Since the $W_i$ are $\alpha$-mixing with $\alpha(j) < e^{-2j / C_\alpha}$, Lemma~\ref{lem:yurinskii_app_variance_mixing}% \ref{it:yurinskii_app_variance_mixing_exponential} with $r=3$ gives % \begin{align*} &\Var\left[ \sum_{i=1}^n V_i(f,f') \right] \\ &\quad\lesssim \sum_{i=1}^n \E\left[ |V_i(f,f')|^3 \right] ^{2/3} \lesssim n^{-3} h^{-2m} \E\left[ K_h(W_i-w)^3 \right] ^{2/3} \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ &\quad\lesssim n^{-3} h^{-2m} (h^{-2m})^{2/3} \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ &\quad\lesssim n^{-3} h^{-10m/3} \I\{\|w-w'\|_2 \leq 2 C_K h\}.
\end{align*} % Therefore, by Jensen's inequality, % \begin{align*} \E\big[ \|\Omega_\delta\|_2 \big] &\leq \E\big[ \|\Omega_\delta\|_\rF \big] \leq \E\Bigg[ \sum_{f,f' \in \cF_\delta} (\Omega_\delta)_{f,f'}^2 \Bigg]^{1/2} \leq \Bigg( \sum_{f,f' \in \cF_\delta} \Var\left[ \sum_{i=1}^n V_i(f,f') \right] \Bigg)^{1/2} \\ &\lesssim n^{-3/2} h^{-5m/3} \Bigg( \sum_{f,f' \in \cF_\delta} \I\{\|w-w'\|_2 \leq 2 C_K h\} \Bigg)^{1/2} \\ &\lesssim n^{-3/2} h^{-5m/3} \big(h^{m} \delta^{-2m} \big)^{1/2} \lesssim n^{-3/2} h^{-7m/6} \delta^{-m}. \end{align*} % Note that we could have used $\|\cdot\|_1$ rather than $\|\cdot\|_\rF$, but this term is negligible either way. \proofparagraph{regularity of the stochastic processes} For each $f, f' \in \cF$, define the mean-zero and $\alpha$-mixing random variables % \begin{align*} u_i(f,f') &= e_1^\T \big( H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') \big) \tilde\varepsilon_i. \end{align*} % Note that for all $1 \leq j \leq k$, by the Lipschitz property of the kernel and monomials, % \begin{align*} &\left| K_h(W_i-w) - K_h(W_i-w') \right| \\ &\quad\lesssim h^{-m-1} \|w-w'\|_2 \big( \I\{\|W_i-w\|_2 \leq C_K h\} + \I\{\|W_i-w'\|_2 \leq C_K h\} \big), \\ &\left| p_h(W_i-w)_j - p_h(W_i-w')_j \right| \lesssim h^{-1} \|w-w'\|_2. \end{align*} % We deduce that for any $1 \leq j,l \leq k$, % \begin{align*} \big| H(w)_{j l} - H(w')_{j l} \big| &= \big| n \E\big[ K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \\ &\qquad- K_h(W_i-w') p_h(W_i-w')_j p_h(W_i-w')_l \big] \big| \\ &\leq n\E\left[ \left| K_h(W_i-w) - K_h(W_i-w') \right| \left| p_h(W_i-w)_j p_h(W_i-w)_l \right| \right] \\ &\quad+ n\E\left[ \left| p_h(W_i-w)_j - p_h(W_i-w')_j \right| \left| K_h(W_i-w') p_h(W_i-w)_l \right| \right] \\ &\quad+ n\E\left[ \left| p_h(W_i-w)_l - p_h(W_i-w')_l \right| \left| K_h(W_i-w') p_h(W_i-w')_j \right| \right] \\ &\lesssim n h^{-1}\|w-w'\|_2. \end{align*} % Therefore, as the dimension of the matrix $H(w)$ is fixed, % \begin{align*} \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 &\leq \big\| H(w)^{-1}\big\|_2 \big\| H(w')^{-1}\big\|_2 \big\| H(w) - H(w') \big\|_2 \lesssim \frac{\|w-w'\|_2}{n h}. \end{align*} % Hence % \begin{align*} \big| u_i(f,f') \big| &\leq \big\| H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') \big\|_2 |\tilde\varepsilon_i| \\ &\leq \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 \big\| K_h(W_i-w) p_h(W_i-w) \tilde\varepsilon_i \big\|_2 \\ &\quad+ \big| K_h(W_i-w) - K_h(W_i-w') \big| \big\| H(w')^{-1} p_h(W_i-w) \tilde\varepsilon_i \big\|_2 \\ &\quad+ \big\| p_h(W_i-w) - p_h(W_i-w') \big\|_2 \big\| H(w')^{-1} K_h(W_i-w') \tilde\varepsilon_i \big\|_2 \\ &\lesssim \frac{\|w-w'\|_2}{n h} \big| K_h(W_i-w) \tilde\varepsilon_i \big| + \frac{1}{n} \big| K_h(W_i-w) - K_h(W_i-w') \big| \,|\tilde\varepsilon_i| \\ &\lesssim \frac{\|w-w'\|_2 \log n}{n h^{m+1}}, \end{align*} % and from the penultimate line, we also deduce that % \begin{align*} \Var[u_i(f,f')] &\lesssim \frac{\|w-w'\|_2^2}{n^2h^2} \E\left[ K_h(W_i-w)^2 \sigma^2(W_i) \right] \\ &\quad+ \frac{1}{n^2} \E\left[ \big( K_h(W_i-w) - K_h(W_i-w') \big)^2 \sigma^2(W_i) \right] \lesssim \frac{\|w-w'\|_2^2}{n^2h^{m+2}}.
\end{align*} % Further, $\E[u_i(f,f') u_j(f,f')] = 0$ for $i \neq j$, so by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bernstein}, for a constant $C_1>0$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_i(f,f') \Big| \geq \frac{C_1 \|w-w'\|_2}{\sqrt n h^{m/2+1}} \left( \sqrt{t} + \sqrt{\frac{(\log n)^2}{n h^m}} \sqrt t + \sqrt{\frac{(\log n)^6}{n h^m}} t \right) \right) &\leq C_1 e^{-t}. \end{align*} % Therefore, adjusting the constant if necessary and since $n h^{m} \gtrsim (\log n)^7$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_i(f,f') \Big| \geq \frac{C_1 \|w-w'\|_2}{\sqrt{n} h^{m/2+1}} \left( \sqrt{t} + \frac{t}{\sqrt{\log n}} \right) \right) &\leq C_1 e^{-t}. \end{align*} % \Citet[Lemma~2]{van2013bernstein} with $\psi(x) = \exp\Big(\big(\sqrt{1+2 x / \sqrt{\log n}}-1 \big)^2 \log n \Big)-1$ now shows that % \begin{align*} \Bigvvvert \sum_{i=1}^n u_i(f,f') \Bigvvvert_\psi &\lesssim \frac{\|w-w'\|_2}{\sqrt{n} h^{m/2+1}} \end{align*} % so we take $L = \frac{1}{\sqrt{n} h^{m/2+1}}$. Noting $\psi^{-1}(t) = \sqrt{\log(1+t)} + \frac{\log(1+t)}{2\sqrt{\log n}}$ and $N_\delta \lesssim \delta^{-m}$, % \begin{align*} J_\psi(\delta) &= \int_0^\delta \psi^{-1}\big( N_\varepsilon \big) \diff{\varepsilon} + \delta \psi^{-1} \big( N_\delta^2 \big) \lesssim \frac{\delta \log(1/\delta)}{\sqrt{\log n}} + \delta \sqrt{\log(1/\delta)} \lesssim \delta \sqrt{\log n}, \\ J_2(\delta) &= \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon} \lesssim \delta \sqrt{\log(1/\delta)} \lesssim \delta \sqrt{\log n}. \end{align*} \proofparagraph{strong approximation} Recalling that $\tilde\varepsilon_i = \varepsilon_i$ for all $i$ with high probability, by Proposition~\ref{pro:yurinskii_emp_proc}, for all $t, \eta > 0$ there exists a zero-mean Gaussian process $T(w)$ satisfying % \begin{align*} \E\left[ \left(\sum_{i=1}^n f_w(W_i, \varepsilon_i)\right) \left(\sum_{i=1}^n f_{w'}(W_i, \varepsilon_i)\right) \right] &= \E\big[ T(w) T(w') \big] \end{align*} % for all $w, w' \in \cW$ and % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| \geq C_\psi(t + \eta) \right) \\ &\quad\leq C_\psi \inf_{\delta > 0} \inf_{\cF_\delta} \Bigg\{ \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } + \left(\frac{\sqrt{\log 2 |\cF_\delta|} \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) \Bigg\} \\ &\quad\leq C_\psi \Bigg\{ \frac{ \left(\frac{\log n} {n^2 h^{2m} \delta^{m}} \right)^{1/3} (\log n)^{1/3}}{\eta } + \left(\frac{\sqrt{\log n} \sqrt{n^{-3/2} h^{-7m/6} \delta^{-m}} }{\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t}{\frac{1}{\sqrt{n} h^{m/2+1}} J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{ \left( \frac{1}{\sqrt{n} h^{m/2+1}} \right)^2 J_2(\delta)^2}\right) \Bigg\} \\ &\quad\leq C_\psi \Bigg\{ \frac{ (\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3} \eta} + \left(\frac{ n^{-3/4} h^{-7m/12} \delta^{-m/2} \sqrt{\log n}} {\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t\sqrt{n} h^{m/2+1}} {\delta \sqrt{\log n}}\right)^{-1} + \exp\left(\frac{-t^2n h^{m+2}} {\delta^2 \log n}\right) \Bigg\}.
\end{align*} % Noting $\psi(x) \geq e^{x^2/4}$ for $x \leq 4 \sqrt{\log n}$, any $R_n \to \infty$ gives the probability bound % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| &\lesssim_\P \frac{(\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3}} R_n + \frac{\sqrt{\log n}}{n^{3/4} h^{7m/12} \delta^{m/2}} R_n + \frac{\delta \sqrt{\log n}} {\sqrt{n} h^{m/2+1}}. \end{align*} % Optimizing over $\delta$ gives $\delta \asymp \left(\frac{\log n}{n h^{m-6}}\right)^{\frac{1}{2m+6}} = h \left( \frac{\log n}{n h^{3m}} \right)^{\frac{1}{2m+6}}$ and so % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| &\lesssim_\P \left( \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} \right)^{\frac{1}{2m+6}} R_n. \end{align*} \proofparagraph{convergence of $\hat H(w)$} For $1 \leq j,l \leq k$ define the zero-mean random variables % \begin{align*} u_{i j l}(w) &= K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l - \E\big[K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \big] \end{align*} % and note that $|u_{i j l}(w)| \lesssim h^{-m}$. By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_2 > 0$ and all $t > 0$, % \begin{align*} \P\left( \left| \sum_{i=1}^n u_{i j l}(w) \right| > C_2 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right) &\leq C_2 e^{-t}. \end{align*} % Further, note that by Lipschitz properties, % \begin{align*} \left| \sum_{i=1}^n u_{i j l}(w) - \sum_{i=1}^n u_{i j l}(w') \right| &\lesssim h^{-m-1} \|w-w'\|_2 \end{align*} % so there is a $\delta$-cover of $(\cW, \|\cdot\|_2)$ with size at most $n^a \delta^{-a}$ for some $a > 0$. Adjusting $C_2$, % \begin{align*} \P\left( \sup_{w \in \cW} \left| \sum_{i=1}^n u_{i j l}(w) \right| > C_2 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) + C_2 h^{-m-1} \delta \right) &\leq C_2 n^a \delta^{-a} e^{-t} \end{align*} % and hence % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n u_{i j l}(w) \right| &\lesssim_\P h^{-m} \sqrt{n \log n} + h^{-m} (\log n)^3 \lesssim_\P \sqrt{\frac{n \log n}{h^{2m}}}. \end{align*} % Therefore % \begin{align*} \sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 &\lesssim_\P \sqrt{\frac{n \log n}{h^{2m}}}. \end{align*} \proofparagraph{bounding the matrix term} Firstly, note that since $\sqrt{\frac{\log n}{n h^{2m}}} \to 0$, we have that uniformly in $w \in \cW$ % \begin{align*} \|\hat H(w)^{-1}\|_2 \leq \frac{\|H(w)^{-1}\|_2} {1 - \|\hat H(w)-H(w)\|_2 \|H(w)^{-1}\|_2} &\lesssim_\P \frac{1/n} {1 - \sqrt{\frac{n \log n}{h^{2m}}} \frac{1}{n}} \lesssim_\P \frac{1}{n}. \end{align*} % Therefore % \begin{align*} &\sup_{w \in \cW} \big| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \big| \leq \sup_{w \in \cW} \big\|\hat H(w)^{-1} - H(w)^{-1}\big\|_2 \|S(w)\|_2 \\ &\quad\leq \sup_{w \in \cW} \big\|\hat H(w)^{-1}\big\|_2 \big\|H(w)^{-1}\big\|_2 \big\|\hat H(w) - H(w)\big\|_2 \|S(w)\|_2 \lesssim_\P \sqrt{\frac{\log n}{n^3 h^{2m}}} \sup_{w \in \cW} \|S(w)\|_2. \end{align*} % Now for $1 \leq j \leq k$ write $u_{i j}(w) = K_h(W_i-w) p_h(W_i-w)_j \tilde \varepsilon_i$ so that $S(w)_j = \sum_{i=1}^n u_{i j}(w)$ with high probability. Note that $u_{i j}(w)$ are zero-mean with $\Cov[u_{i j}(w), u_{i' j}(w)] = 0$ for $ i \neq i'$. Also $|u_{i j}(w)| \lesssim h^{-m} \log n$ and $\Var[u_{i j}(w)] \lesssim h^{-m}$. 
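The two bounds in the last sentence follow from a direct calculation; we sketch the variance bound, writing $f_W$ for the (bounded) Lebesgue density of $W_i$ and assuming the usual normalization $K_h(\cdot) = h^{-m} K(\cdot/h)$, consistent with the bounds $\E[K_h(W_i-w)^2] \lesssim h^{-m}$ used above: % \begin{align*} \Var[u_{i j}(w)] &\leq \E\left[ K_h(W_i-w)^2 p_h(W_i-w)_j^2 \tilde\varepsilon_i^2 \right] \lesssim \int K_h(u-w)^2 f_W(u) \diff{u} \lesssim h^{-m} \int K(v)^2 \diff{v} \lesssim h^{-m}, \end{align*} % using $\tilde\varepsilon_i^2 \leq \varepsilon_i^2$, $\E[\varepsilon_i^2 \mid W_i] = \sigma^2(W_i) \lesssim 1$, and $|p_h(W_i-w)_j| \lesssim 1$ on the support of $K_h(\cdot-w)$; the uniform bound $|u_{i j}(w)| \lesssim h^{-m} \log n$ follows similarly from $K_h \lesssim h^{-m}$ and $|\tilde\varepsilon_i| \lesssim \log n$.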
By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bernstein} for a constant $C_3>0$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_{i j}(w) \Big| \geq C_3 \big( (h^{-m/2} \sqrt n + h^{-m} \log n) \sqrt t + h^{-m} (\log n)^3 t \big) \right) &\leq C_3 e^{-t}, \\ \P\left( \Big| \sum_{i=1}^n u_{i j}(w) \Big| > C_3 \left( \sqrt{\frac{tn}{h^{m}}} + \frac{t(\log n)^3}{h^{m}} \right) \right) &\leq C_3 e^{-t}, \end{align*} % where we used $n h^{m} \gtrsim (\log n)^2$ and adjusted the constant if necessary. As before, $u_{i j}(w)$ is Lipschitz in $w$ with a constant which is at most polynomial in $n$, so for some $a>0$ % \begin{align*} \P\left( \sup_{w \in \cW} \Big| \sum_{i=1}^n u_{i j}(w) \Big| > C_3 \left( \sqrt{\frac{tn}{h^{m}}} + \frac{t(\log n)^3}{h^{m}} \right) \right) &\leq C_3 n^a e^{-t}, \\ \sup_{w \in \cW} \|S(w)\|_2 \lesssim_\P \sqrt{\frac{n \log n}{h^{m}}} + \frac{(\log n)^4}{h^{m}} &\lesssim_\P \sqrt{\frac{n \log n}{h^{m}}} \end{align*} % as $n h^m \gtrsim (\log n)^7$. Finally, % \begin{align*} \sup_{w \in \cW} \big| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \big| &\lesssim_\P \sqrt{\frac{\log n}{n^3 h^{2m}}} \sqrt{\frac{n \log n}{h^{m}}} \lesssim_\P \frac{\log n}{\sqrt{n^2 h^{3m}}}. \end{align*} \proofparagraph{bounding the bias} Since $\mu \in \cC^\gamma$, we have, by the multivariate version of Taylor's theorem, % \begin{align*} \mu(W_i) &= \sum_{|\kappa|=0}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) (W_i-w)^\kappa + \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') (W_i-w)^\kappa \end{align*} % for some $w'$ on the line segment connecting $w$ and $W_i$. Now since $p_h(W_i-w)_1 = 1$, % \begin{align*} &e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(w) \\ &\quad= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T e_1 \mu(w) = e_1^\T e_1 \mu(w) = \mu(w). \end{align*} % Therefore % \begin{align*} \Bias(w) &= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w) \\ &= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \\ &\quad\times \Bigg( \sum_{|\kappa|=0}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) (W_i-w)^\kappa + \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') (W_i-w)^\kappa - \mu(w) \Bigg) \\ &= \sum_{|\kappa|=1}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \\ &\quad+ \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \\ &= \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa, \end{align*} % where we used that $p_h(W_i-w)$ is a vector containing monomials in $W_i-w$ of order up to $\gamma$, so $e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa = 0$ whenever $1 \leq |\kappa| \leq \gamma$. Finally, % \begin{align*} \sup_{w\in\cW} |\Bias(w)| &= \sup_{w\in\cW} \Bigg| \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \Bigg| \\ &\lesssim_\P \sup_{w\in\cW} \max_{|\kappa| = \gamma} \left| \partial^{\kappa} \mu(w') \right| \|\hat H(w)^{-1}\|_2 \Bigg\| \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \Bigg\|_2 h^\gamma \\ &\lesssim_\P \frac{h^\gamma}{n} \sup_{w\in\cW} \Bigg\| \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \Bigg\|_2. 
\end{align*} % Write $\tilde u_{i j}(w) = K_h(W_i-w)p_h(W_i-w)_j$ and note $|\tilde u_{i j}(w)| \lesssim h^{-m}$ and $\E[\tilde u_{i j}(w)] \lesssim 1$, so % \begin{align*} \P\left( \left| \sum_{i=1}^n \tilde u_{i j}(w) - \E\left[ \sum_{i=1}^n \tilde u_{i j}(w) \right] \right| > C_4 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right) &\leq C_4 e^{-t} \end{align*} % by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_4 > 0$. By Lipschitz properties, this implies % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n \tilde u_{i j}(w) \right| &\lesssim_\P n \left( 1 + \sqrt{\frac{\log n}{n h^{2m}}} \right) \lesssim_\P n. \end{align*} % Therefore $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P n h^\gamma / n \lesssim_\P h^\gamma$. \proofparagraph{conclusion} By the previous parts, % \begin{align*} \sup_{w \in \cW} \left|\hat \mu(w) - \mu(w) - T(w) \right| &\leq \sup_{w \in \cW} \left|e_1^\T H(w)^{-1} S(w) - T(w) \right| \\ &\quad+ \sup_{w \in \cW} \left| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \right| + \sup_{w \in \cW} |\Bias(w)| \\ &\lesssim_\P \left( \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} \right)^{\frac{1}{2m+6}} R_n + \frac{\log n}{\sqrt{n^2 h^{3m}}} + h^\gamma \\ &\lesssim_\P \frac{R_n}{\sqrt{n h^m}} \left( \frac{(\log n)^{m+4}}{n h^{3m}} \right)^{\frac{1}{2m+6}} + h^\gamma, \end{align*} % where the last inequality follows because $n h^{3m} \to \infty$ and $\frac{1}{2m+6} \leq \frac{1}{2}$. Finally, we verify the upper and lower bounds on the variance of the Gaussian process. Since the spectrum of $H(w)^{-1}$ is bounded above and below by constant multiples of $1/n$, % \begin{align*} \Var[T(w)] &= \Var\left[ e_1^\T H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i \right] \\ &= e_1^\T H(w)^{-1} \Var\left[ \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i \right] H(w)^{-1} e_1 \\ &\lesssim \|H(w)^{-1}\|_2^2 \max_{1 \leq j \leq k} \sum_{i=1}^n \Var\big[ K_h(W_i-w) p_h(W_i-w)_j \sigma(W_i) \big] \\ &\lesssim \frac{1}{n^2} n \frac{1}{h^m} \lesssim \frac{1}{n h^m}. \end{align*} % Similarly, $\Var[T(w)] \gtrsim \frac{1}{n h^m}$ by the same argument used to bound eigenvalues of $H(w)^{-1}$. % \end{proof} \section{High-dimensional central limit theorems for martingales}% \label{sec:yurinskii_app_high_dim_clt} We present an application of our main results to high-dimensional central limit theorems for martingales. Our main contribution here is the generality of our results, which are broadly applicable to martingale data and impose minimal extra assumptions. In exchange for this scope, we do not necessarily achieve state-of-the-art distributional approximation errors in certain special cases, such as with independent data or when restricting the class of sets over which the central limit theorem must hold. Extensions of our high-dimensional central limit theorem results to mixingales and other approximate martingales, along with third-order refinements and Gaussian mixture target distributions, are possible through methods akin to those used to establish our main results in Section~\ref{sec:yurinskii_main_results}, but we omit these for succinctness. Our approach to deriving a high-dimensional martingale central limit theorem proceeds as follows.
Firstly, the upcoming Proposition~\ref{pro:yurinskii_app_clt} uses our main result on martingale coupling (Corollary~\ref{cor:yurinskii_sa_martingale}) to reduce the problem to that of providing anti-concentration results for high-dimensional Gaussian vectors. We then demonstrate the utility of this reduction by employing a few such anti-concentration methods from the existing literature. Proposition~\ref{pro:yurinskii_app_bootstrap} gives a feasible implementation via the Gaussian multiplier bootstrap, enabling valid resampling-based inference using the resulting conditional Gaussian distribution. Finally, in Section~\ref{sec:yurinskii_app_lp} we provide an example application: distributional approximation for $\ell^p$-norms of high-dimensional martingale vectors in Kolmogorov--Smirnov distance, relying on some recent results concerning Gaussian perimetric inequalities \citep{nazarov2003maximal,kozbur2021dimension, giessing2023anti,chernozhukov2017detailed}. We now introduce some notation. Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale} and suppose $\Sigma$ is non-random. Let $\cA$ be a class of measurable subsets of $\R^d$ and take $T \sim \cN(0, \Sigma)$. For $\eta>0$ and $p \in [1, \infty]$ define the Gaussian perimetric quantity % \begin{align*} \Delta_p(\cA, \eta) &= \sup_{A\in \cA} \big\{\P(T\in A_p^\eta\setminus A) \vee \P(T\in A \setminus A_p^{-\eta})\big\}, \end{align*} % where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$, $A_p^{-\eta} = \R^d \setminus (\R^d \setminus A)_p^\eta$, and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$. Using this perimetric term allows us to convert coupling results to central limit theorems as follows. Denote by $\Gamma_p(\eta)$ the rate of strong approximation attained in Corollary~\ref{cor:yurinskii_sa_martingale}: % \begin{align*} \Gamma_p(\eta) &= 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}. \end{align*} \begin{proposition}[High-dimensional central limit theorem for martingales]% \label{pro:yurinskii_app_clt} Take the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, and $\Sigma$ non-random. For a class $\cA$ of measurable sets in $\R^d$, % \begin{equation}% \label{eq:yurinskii_app_high_dim_clt} \sup_{A\in \cA} \big|\P(S\in A) -\P(T\in A)\big| \leq \inf_{p \in [1, \infty]} \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. \end{equation} \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_clt}] This follows from Strassen's theorem (Lemma~\ref{lem:yurinskii_app_strassen}), but we provide a proof for completeness. For any $A \in \cA$, $p \in [1, \infty]$, and $\eta > 0$, the coupling $T$ from Corollary~\ref{cor:yurinskii_sa_martingale} satisfies $\P(\|S - T\|_p > \eta) \leq \Gamma_p(\eta)$. Moreover, % \begin{align*} \P(S \in A) &\leq \P(T \in A) + \P(T \in A_p^\eta \setminus A) + \P(\|S - T\|_p > \eta) \end{align*} % and applying this to $\R^d \setminus A$ gives % \begin{align*} \P(S\in A) &= 1 - \P(S\in \R^d \setminus A) \\ &\geq 1 - \P(T \in \R^d \setminus A) - \P(T \in (\R^d \setminus A)_p^\eta \setminus (\R^d \setminus A)) - \P(\|S - T\|_p > \eta) \\ &= \P(T \in A) - \P(T \in A \setminus A_p^{-\eta}) - \P(\|S - T\|_p > \eta). \end{align*} % Since this holds for all $p \in [1, \infty]$, % \begin{align*} \sup_{A\in \cA} \big|\P(S\in A) -\P(T\in A)\big| &\leq \sup_{A \in \cA} \big\{\P(T \in A_p^\eta\setminus A) \vee \P(T \in A \setminus A_p^{-\eta})\big\} + \P(\|S - T\|_p > \eta) \\ &\leq \inf_{p \in [1, \infty]} \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}.
\end{align*} % \end{proof} The term $\Delta_p(\cA, \eta)$ in \eqref{eq:yurinskii_app_high_dim_clt} is a Gaussian anti-concentration quantity so it depends on the law of $S$ only through the covariance matrix $\Sigma$. A few results are available in the literature for bounding this term. For instance, with $\cA = \cC = \{A \subseteq \R^d : A \text{ convex}\}$, \citet{nazarov2003maximal} showed % \begin{equation}% \label{eq:yurinskii_app_convex_anticonc} \Delta_2(\cC, \eta) \asymp \eta\sqrt{\|\Sigma^{-1}\|_{\rF}}, \end{equation} % whenever $\Sigma$ is invertible. Proposition~\ref{pro:yurinskii_app_clt} with $p=2$ and \eqref{eq:yurinskii_app_convex_anticonc} yields for convex sets % \begin{align*} \sup_{A\in \cC} \big|\P(S\in A) -\P(T\in A)\big| &\lesssim \inf_{\eta > 0} \left\{ \left(\frac{\beta_{2,2} d}{\eta^3}\right)^{1/3} + \left(\frac{\E[\|\Omega \|_2] d}{\eta^2}\right)^{1/3} + \eta \sqrt{\|\Sigma^{-1}\|_\rF} \right\}. \end{align*} Alternatively, one can take $\cA = \cR$, the class of axis-aligned rectangles in $\R^d$. By Nazarov's Gaussian perimetric inequality \citep{nazarov2003maximal,chernozhukov2017central}, % \begin{align}% \label{eq:yurinskii_app_rect_anticonc} \Delta_\infty(\cR, \eta) \leq \frac{\eta (\sqrt{2\log d} + 2)}{\sigma_{\min}} \end{align} % whenever $\min_j \, \Sigma_{j j} \geq \sigma_{\min}^2$ for some $\sigma_{\min}>0$. Proposition~\ref{pro:yurinskii_app_clt} with $p = \infty$ and \eqref{eq:yurinskii_app_rect_anticonc} yields % \begin{align*}% &\sup_{A\in \cR} \big|\P(S\in A) -\P(T\in A)\big| \lesssim \inf_{\eta > 0} \left\{ \left(\frac{\beta_{\infty,2} \log 2d}{\eta^3}\right)^{1/3} + \left(\frac{\E[\|\Omega \|_2] \log 2d}{\eta^2}\right)^{1/3} + \frac{\eta \sqrt{\log 2d}}{\sigma_{\min}} \right\}. \end{align*} % In situations where $\liminf_n \min_j \, \Sigma_{j j} = 0$, it may be possible in certain cases to regularize the minimum variance away from zero and then apply a Gaussian--Gaussian rectangular approximation result such as Lemma~2.1 from \citet{chernozhukov2023nearly}. \begin{remark}[Comparisons with the literature] The literature on high-dimensional central limit theorems has developed rapidly in recent years \citep[see][and references therein]{% zhai2018high,% koike2021notes,% buzun2022strong,% lopes2022central,% chernozhukov2023nearly% }, particularly for the special case of sums of independent random vectors on the rectangular sets $\cR$. % Our corresponding results are somewhat weaker in their dependence on the dimension than, for example, \citet[Theorem~2.1]{chernozhukov2023nearly}. This is an inherent issue due to our approach of first considering the class of all Borel sets and only afterwards specializing to the smaller class $\cR$, whereas sharper results in the literature directly target the Kolmogorov--Smirnov distance via Stein's method and Slepian interpolation. \end{remark} Next, we present a version of Proposition~\ref{pro:yurinskii_app_clt} in which the covariance matrix $\Sigma$ is replaced by an estimator $\hat \Sigma$. This ensures that the associated conditionally Gaussian vector is feasible and can be resampled, allowing Monte Carlo quantile estimation via a Gaussian multiplier bootstrap. \begin{proposition}[Bootstrap central limit theorem for martingales]% \label{pro:yurinskii_app_bootstrap} Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, with $\Sigma$ non-random, and let $\hat \Sigma$ be an $\bX$-measurable random $d \times d$ positive semi-definite matrix, where $\bX = (X_1, \ldots, X_n)$.
For a class $\cA$ of measurable subsets of $\R^d$, % \begin{align*} &\sup_{A\in \cA} \left| \P\big(S \in A\big) - \P\big(\hat \Sigma^{1/2} Z \in A \bigm| \bX \big) \right| \\ &\quad\leq \inf_{p \in [1,\infty]} \inf_{\eta>0} \left\{ \Gamma_p(\eta) + 2 \Delta_p(\cA, \eta) + 2d \exp\left(\frac{-\eta^2} {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} \right) \right\}, \end{align*} % where $Z \sim \cN(0,I_d)$ is independent of $\bX$. \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_bootstrap}] Since $T = \Sigma^{1/2} Z$ is independent of $\bX$, % \begin{align*} &\left| \P\big(S \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right| \\ &\quad\leq \left| \P\big(S \in A\big) - \P\big(T \in A\big) \right| +\left| \P\big(\Sigma^{1/2} Z \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right|. \end{align*} % The first term is bounded by Proposition~\ref{pro:yurinskii_app_clt} and the second by Lemma~\ref{lem:yurinskii_app_feasible_gaussian} conditional on $\bX$, giving % \begin{align*} &\left| \P\big(S \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right| \\ &\quad\leq \Gamma_p(\eta) + \Delta_p(\cA, \eta) + \Delta_{p'}(\cA, \eta') + 2 d \exp \left( \frac{-\eta'^2} {2 d^{2/p'} \big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} \right) \end{align*} % for all $A \in \cA$ and any $p, p' \in [1, \infty]$ and $\eta, \eta' > 0$. Taking a supremum over $A$ and infima over $p$ and $\eta$, with the choices $p' = p$ and $\eta' = \eta$, yields the result; these choices are made only to simplify the statement. % \end{proof} A natural choice for $\hat\Sigma$ in certain situations is the sample covariance matrix $\sum_{i=1}^n X_i X_i^\T$, or a correlation-corrected variant thereof. In general, whenever $\hat \Sigma$ does not depend on unknown quantities, one can sample from the law of $\hat T = \hat\Sigma^{1/2} Z$ conditional on $\bX$ to approximate the distribution of $S$. Proposition~\ref{pro:yurinskii_app_bootstrap} verifies that this Gaussian multiplier bootstrap approach is valid whenever $\hat\Sigma$ and $\Sigma$ are sufficiently close. To this end, Theorem~X.1.1 in \citet{bhatia1997matrix} gives $\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 \leq \big\|\hat\Sigma - \Sigma\big\|_2^{1/2}$ and Problem~X.5.5 in the same reference gives $\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 \leq \big\|\Sigma^{-1/2}\big\|_2 \big\|\hat\Sigma - \Sigma\big\|_2$ when $\Sigma$ is invertible. The latter often gives a tighter bound when the minimum eigenvalue of $\Sigma$ can be bounded away from zero, and consistency of $\hat \Sigma$ can be established using a range of matrix concentration inequalities. In Section~\ref{sec:yurinskii_app_lp} we apply Proposition~\ref{pro:yurinskii_app_clt} to the special case of approximating the distribution of the $\ell^p$-norm of a high-dimensional martingale. Proposition~\ref{pro:yurinskii_app_bootstrap} is then used to ensure that feasible distributional approximations are also available. \subsection{Application: distributional approximation of martingale \texorpdfstring{$\ell^p$}{lp}-norms} \label{sec:yurinskii_app_lp} In empirical applications, including nonparametric significance tests \citep{lopes2020bootstrapping} and nearest neighbor search procedures \citep{biau2015high}, an estimator or test statistic can be expressed under the null hypothesis as the $\ell^p$-norm of a zero-mean martingale for some $p \in [1, \infty]$.
In the notation of Corollary~\ref{cor:yurinskii_sa_martingale}, it is of interest to bound Kolmogorov--Smirnov quantities of the form $\sup_{t \geq 0} \big| \P( \|S\|_p \leq t) - \P( \|T\|_p \leq t) \big|$. Let $\cB_p$ be the class of closed $\ell^p$-balls in $\R^d$ centered at the origin and set $\Delta_p(\eta) \vcentcolon= \Delta_p(\cB_p, \eta) = \sup_{t \geq 0} \P( t < \|T\|_p \leq t + \eta )$. \begin{proposition}[Distributional approximation of martingale $\ell^p$-norms] \label{pro:yurinskii_app_application_lp} Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, with $\Sigma$ non-random. Then for $T \sim \cN(0, \Sigma)$, % \begin{equation}% \label{eq:yurinskii_app_application_lp} \sup_{t \geq 0} \big| \P( \|S\|_p \leq t ) - \P\left( \|T\|_p \leq t \right) \big| \leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. \end{equation} % \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_application_lp}] Applying Proposition~\ref{pro:yurinskii_app_clt} with $\cA=\cB_p$ gives % \begin{align*} \sup_{t \geq 0} \big| \P( \|S\|_p \leq t ) - \P\left( \|T\|_p \leq t \right) \big| &= \sup_{A\in \cB_p} \big|\P(S\in A) -\P(T\in A)\big| \\ &\leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cB_p, \eta) \big\} \leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. \end{align*} % \end{proof} The right-hand side of \eqref{eq:yurinskii_app_application_lp} can be controlled in various ways. % In the case of $p=\infty$, note that $\ell^\infty$-balls are rectangles so $\cB_\infty\subseteq \cR$ and \eqref{eq:yurinskii_app_rect_anticonc} applies, giving $\Delta_\infty(\eta) \leq \eta (\sqrt{2\log d} + 2) / \sigma_{\min}$ whenever $\min_j \Sigma_{j j} \geq \sigma_{\min}^2$. Alternatively, \citet[Theorem~1]{giessing2023anti} provides $\Delta_\infty(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_\infty] + \eta^2}$. By H{\"o}lder duality of $\ell^p$-norms, we can write $\|T\|_p = \sup_{\|u\|_q \leq 1} u^\T T$ where $1/p + 1/q = 1$. Applying the Gaussian process anti-concentration result of \citet[Theorem~2]{giessing2023anti} yields the more general $\Delta_p(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_p] + \eta^2}$. Thus, the problem can be reduced to that of bounding $\Var\left[\|T\|_p\right]$, with techniques for doing so discussed in \citet[Section~4]{giessing2023anti}. Alongside the $\ell^p$-norms, other functionals can be analyzed in this manner, including the maximum and other order statistics \citep{kozbur2021dimension,giessing2023anti}. To conduct inference in this setting, we must feasibly approximate the quantiles of $\|T\|_p$. To that end, take a significance level $\tau\in(0,1)$ and set % $\hat q_p(\tau) = \inf \big\{t \in \R: \P(\|\hat T\|_p \leq t \mid \bX) \geq \tau \}$ where $\hat T \mid \bX \sim \cN(0, \hat\Sigma)$, % with $\hat\Sigma$ any $\bX$-measurable positive semi-definite estimator of $\Sigma$. Note that for the canonical estimator $\hat\Sigma = \sum_{i=1}^n X_i X_i^\T$ we can write $\hat T =\sum_{i=1}^n X_i Z_i$ with $Z_1,\dots,Z_n$ i.i.d.\ standard Gaussian independent of $\bX$, yielding the Gaussian multiplier bootstrap. 
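To see that this multiplier construction has the intended conditional law, note that given $\bX$ the vector $\hat T$ is a linear combination of independent standard Gaussians (a one-line verification): % \begin{align*} \E\big[ \hat T \bigm| \bX \big] &= \sum_{i=1}^n X_i \E[Z_i] = 0, & \Var\big[ \hat T \bigm| \bX \big] &= \sum_{i=1}^n X_i X_i^\T \Var[Z_i] = \hat\Sigma, \end{align*} % so indeed $\hat T \mid \bX \sim \cN(0, \hat\Sigma)$.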
Now assuming the law of $\|\hat T\|_p \mid \bX$ has no atoms, we can apply Proposition~\ref{pro:yurinskii_app_bootstrap} to see % \begin{align*} &\sup_{\tau\in(0,1)} \big|\P\left(\|S\|_p \leq \hat q_p(\tau)\right) - \tau \big| \leq \E\left[ \sup_{t \geq 0} \big| \P(\|S\|_p \leq t) - \P(\|\hat T\|_p \leq t \mid \bX) \big| \right] \\ &\qquad\leq \inf_{\eta>0} \left\{ \Gamma_p(\eta) + 2 \Delta_p(\eta) + 2d\, \E\left[ \exp\left(\frac{-\eta^2} {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2}\right) \right] \right\}, \end{align*} % and hence the bootstrap is valid whenever $\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2$ is sufficiently small. See the preceding discussion regarding methods for bounding this object. \begin{remark}[One-dimensional distributional approximations] In our application to distributional approximation of $\ell^p$-norms, the object of interest $\|S\|_p$ is a one-dimensional functional of the high-dimensional martingale; contrast this with the more general Proposition~\ref{pro:yurinskii_app_clt} which directly considers the $d$-dimensional random vector $S$. As such, our coupling-based approach may be improved in certain settings by applying a more carefully tailored smoothing argument. For example, \citet{belloni2018high} employ a ``log sum exponential'' bound \citep[see also][]{chernozhukov2013gaussian} for the maximum statistic $\max_{1 \leq j \leq d} S_j$ along with a coupling due to \citet{chernozhukov2014gaussian} to attain an improved dependence on the dimension. Naturally, their approach does not permit the formulation of high-dimensional central limit theorems over arbitrary classes of Borel sets as in our Proposition~\ref{pro:yurinskii_app_clt}. \end{remark} \clearpage \addcontentsline{toc}{chapter}{Bibliography} \bibliographystyle{phd_dissertation} \bibliography{refs} \end{document} tex-fmt-0.5.2/tests/source/phd_dissertation_refs.bib000066400000000000000000001107321473573253500226260ustar00rootroot00000000000000@article{aldous1981representations, author = {Aldous, David J}, journal = {Journal of Multivariate Analysis}, number = {4}, pages = {581--598}, title = {Representations for partially exchangeable arrays of random variables}, volume = {11}, year = {1981}, } @inproceedings{anastasiou2019normal, title = {Normal approximation for stochastic gradient descent via non-asymptotic rates of martingale {CLT}}, author = {Anastasiou, Andreas and Balasubramanian, Krishnakumar and Erdogdu, Murat A}, booktitle = {Conference on Learning Theory}, pages = {115--137}, year = {2019}, organization = {Proceedings of Machine Learning Research} } @article{arcones1993limit, title = {Limit theorems for {U}-processes}, author = {Arcones, Miguel A and Gin{\'e}, Evarist}, journal = {Annals of Probability}, pages = {1494--1542}, year = {1993}, } @article{arcones1995bernstein, author = {Arcones, Miguel A}, journal = {Statistics \& Probability Letters}, number = {3}, pages = {239--247}, title = {A {Bernstein}-type inequality for {U}-statistics and {U}-processes}, volume = {22}, year = {1995}, } @inproceedings{arnould2023interpolation, title = {Is interpolation benign for random forest regression?}, author = {Arnould, Ludovic and Boyer, Claire and Scornet, Erwan}, booktitle = {International Conference on Artificial Intelligence and Statistics}, pages = {5493--5548}, year = {2023}, organization = {Proceedings of Machine Learning Research}, } @article{atchade2014martingale, title = {A martingale decomposition for quadratic forms of {Markov} chains (with applications)}, author = 
{Atchad{\'e}, Yves F and Cattaneo, Matias D}, journal = {Stochastic Processes and their Applications}, volume = {124}, number = {1}, pages = {646--677}, year = {2014}, } @article{baxter1994norm, title = {Norm estimates for inverses of {Toeplitz} distance matrices}, author = {Baxter, Brad J. C.}, journal = {Journal of Approximation Theory}, volume = {79}, number = {2}, pages = {222--242}, year = {1994}, } @article{belloni2015some, title = {Some new asymptotic theory for least squares series: Pointwise and uniform results}, author = {Belloni, Alexandre and Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Journal of Econometrics}, volume = {186}, number = {2}, pages = {345--366}, year = {2015}, } @article{belloni2018high, title = {A high dimensional central limit theorem for martingales, with applications to context tree models}, author = {Belloni, Alexandre and Oliveira, Roberto I}, journal = {Preprint}, note = {\arxiv{1809.02741}}, year = {2018} } @article{belloni2019conditional, author = {Belloni, Alexandre and Chernozhukov, Victor and Chetverikov, Denis and Fern{\'a}ndez-Val, Iv{\'a}n}, journal = {Journal of Econometrics}, number = {1}, pages = {4--29}, title = {Conditional quantile processes based on series or many regressors}, volume = {213}, year = {2019}, } @article{berthet2006revisiting, title = {Revisiting two strong approximation results of {Dudley} and {Philipp}}, author = {Berthet, Philippe and Mason, David M}, journal = {Lecture Notes--Monograph Series}, pages = {155--172}, volume = {51}, year = {2006}, note = {High Dimensional Probability}, } @book{bhatia1997matrix, author = {Bhatia, Rajendra}, publisher = {Springer}, address = {New York, NY}, series = {Graduate Texts in Mathematics}, title = {Matrix Analysis}, volume = {169}, year = {1997}, } @article{biau2012analysis, title = {Analysis of a random forests model}, author = {Biau, G{\'e}rard}, journal = {Journal of Machine Learning Research}, volume = {13}, pages = {1063--1095}, year = {2012}, } @incollection{biau2015high, title = {High-Dimensional $p$-Norms}, author = {Biau, G{\'e}rard and Mason, David M}, booktitle = {Mathematical Statistics and Limit Theorems}, editor = {Marc Hallin and David M Mason and Dietmar Pfeifer and Josef G. Steinebach}, pages = {21--40}, year = {2015}, publisher = {Springer} } @article{birge2001alternative, author = {Birg{\'e}, Lucien}, journal = {Lecture Notes--Monograph Series}, pages = {113--133}, title = {An alternative point of view on {Lepski}'s method}, volume = {36}, year = {2001}, note = {State of the Art in Probability and Statistics} } @book{boucheron2013concentration, title = {Concentration Inequalities: A Nonasymptotic Theory of Independence}, author = {Boucheron, St{\'e}phane and Lugosi, G{\'a}bor and Massart, Pascal}, year = {2013}, publisher = {Oxford University Press}, } @article{bradley2005basic, title = {Basic Properties of Strong Mixing Conditions. {A} survey and Some Open Questions}, author = {Bradley, Richard C}, journal = {Probability Surveys}, volume = {2}, pages = {107--144}, year = {2005} } @article{breiman2001random, title = {Random forests}, author = {Breiman, Leo}, journal = {Machine learning}, volume = {45}, pages = {5--32}, year = {2001}, } @misc{bureau2017daily, author = {{Bureau of Meteorology, Australian Government}}, title = {Daily Weather Observations}, year = {2017}, note = {\href{http://www.bom.gov.au/climate/data/} {\texttt{http://www.bom.gov.au/climate/data/}}. 
Accessed October 2023}, } @inproceedings{buzun2022strong, title = {Strong {Gaussian} Approximation for the Sum of Random Vectors}, author = {Buzun, Nazar and Shvetsov, Nikolay and Dylov, Dmitry V}, booktitle = {Conference on Learning Theory}, volume = {178}, pages = {1693--1715}, year = {2022}, organization = {Proceedings of Machine Learning Research} } @article{calonico2018effect, author = {Calonico, Sebastian and Matias D. Cattaneo and Max H. Farrell}, journal = {Journal of the American Statistical Association}, number = {522}, pages = {767--779}, title = {On the Effect of Bias Estimation on Coverage Accuracy in Nonparametric Inference}, volume = {113}, year = {2018}, } @article{calonico2022coverage, author = {Calonico, Sebastian and Matias D. Cattaneo and Max H. Farrell}, journal = {Bernoulli}, volume = {28}, number = {4}, pages = {2998--3022}, title = {Coverage Error Optimal Confidence Intervals for Local Polynomial Regression}, year = {2022}, } @inproceedings{caruana2004ensemble, title = {Ensemble selection from libraries of models}, author = {Caruana, Rich and Niculescu-Mizil, Alexandru and Crew, Geoff and Ksikes, Alex}, booktitle = {Proceedings of the Twenty-First International Conference on Machine Learning}, pages = {18}, year = {2004} } @article{cattaneo2020large, author = {Matias D. Cattaneo and Max H. Farrell and Yingjie Feng}, title = {{Large sample properties of partitioning-based series estimators}}, volume = {48}, journal = {Annals of Statistics}, number = {3}, pages = {1718--1741}, keywords = {Nonparametric regression, robust bias correction, series methods, sieve methods, strong approximation, tuning parameter selection, uniform inference}, year = {2020}, } @article{cattaneo2022yurinskii, author = {Cattaneo, Matias Damian and Masini, Ricardo Pereira and Underwood, William George}, title = {{Yurinskii's} Coupling for Martingales}, year = {2022}, journal = {Preprint}, note = {\arxiv{2210.00362}} } @article{cattaneo2023inference, author = {Cattaneo, Matias Damian and Klusowski, Jason M and Underwood, William George}, title = {Inference with {Mondrian} Random Forests}, journal = {Preprint}, year = {2023}, note = {\arxiv{2310.09702}} } @article{cattaneo2024uniform, author = {Cattaneo, Matias Damian and Feng, Yingjie and Underwood, William George}, title = {Uniform Inference for Kernel Density Estimators with Dyadic Data}, year = {2024}, journal = {Journal of the American Statistical Association}, volume = {forthcoming}, } @article{chatterjee2006generalization, title = {A generalization of the {Lindeberg} principle}, author = {Chatterjee, Sourav}, journal = {Annals of Probability}, volume = {34}, number = {6}, pages = {2061--2076}, year = {2006} } @article{chen2020jackknife, title = {Jackknife multiplier bootstrap: finite sample approximations to the {U}-process supremum with applications}, author = {Chen, Xiaohui and Kato, Kengo}, journal = {Probability Theory and Related Fields}, volume = {176}, number = {3}, pages = {1097--1163}, year = {2020}, } @article{chernozhukov2013gaussian, title = {Gaussian approximations and multiplier bootstrap for maxima of sums of high-dimensional random vectors}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {41}, number = {6}, pages = {2786--2819}, year = {2013}, } @article{chernozhukov2013inference, title = {Inference on counterfactual distributions}, author = {Chernozhukov, Victor and Fern{\'a}ndez-Val, Iv{\'a}n and Melly, Blaise}, journal = {Econometrica}, volume = {81}, 
number = {6}, pages = {2205--2268}, year = {2013}, } @article{chernozhukov2014anti, title = {Anti-concentration and honest, adaptive confidence bands}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {42}, number = {5}, pages = {1787--1818}, year = {2014}, } @article{chernozhukov2014gaussian, title = {Gaussian approximation of suprema of empirical processes}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {42}, number = {4}, pages = {1564--1597}, year = {2014}, } @article{chernozhukov2016empirical, title = {Empirical and multiplier bootstraps for suprema of empirical processes of increasing complexity, and related {Gaussian} couplings}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Stochastic Processes and their Applications}, volume = {126}, number = {12}, pages = {3632--3651}, year = {2016}, } @article{chernozhukov2017central, author = {Victor Chernozhukov and Denis Chetverikov and Kengo Kato}, title = {{Central limit theorems and bootstrap in high dimensions}}, volume = {45}, journal = {Annals of Probability}, number = {4}, pages = {2309--2352}, year = {2017}, } @article{chernozhukov2017detailed, title = {Detailed proof of {Nazarov}'s inequality}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Preprint}, note = {\arxiv{1711.10696}}, year = {2017} } @article{chernozhukov2023nearly, title = {Nearly optimal central limit theorem and bootstrap approximations in high dimensions}, author = {Chernozhukov, Victor and Chetverikov, Denis and Koike, Yuta}, journal = {Annals of Applied Probability}, volume = {33}, number = {3}, pages = {2374--2425}, year = {2023} } @article{chi2022asymptotic, title = {Asymptotic Properties of High-Dimensional Random Forests}, author = {Chi, Chien-Ming and Vossler, Patrick and Fan, Yingying and Lv, Jinchi}, volume = {50}, journal = {Annals of Statistics}, number = {6}, pages = {3415--3438}, year = {2022} } @article{chiang2020empirical, title = {Empirical likelihood and uniform convergence rates for dyadic kernel density estimation}, author = {Harold D. Chiang and Bing Yang Tan}, journal = {Journal of Business and Economic Statistics}, volume = {41}, number = {3}, pages = {906--914}, year = {2023}, } @article{chiang2022inference, author = {Harold D. 
Chiang and Kengo Kato and Yuya Sasaki}, journal = {Journal of the American Statistical Association}, title = {Inference for High-Dimensional Exchangeable Arrays}, volume = {118}, number = {543}, pages = {1595--1605}, year = {2023}, } @article{cuny2014martingale, title = {On martingale approximations and the quenched weak invariance principle}, author = {Cuny, Christophe and Merlev{\`e}de, Florence}, journal = {Annals of Probability}, volume = {42}, number = {2}, pages = {760--793}, year = {2014}, } @article{davezies2021exchangeable, author = {Laurent Davezies and Xavier D'Haultf{\oe}uille and Yannick Guyonvarch}, journal = {Annals of Statistics}, number = {2}, pages = {845--862}, title = {Empirical process results for exchangeable arrays}, volume = {49}, year = {2021}, } @article{dedecker2007weak, title = {On the weak invariance principle for non-adapted sequences under projective criteria}, author = {Dedecker, J{\'e}r{\^o}me and Merlev{\`e}de, Florence and Voln{\`y}, Dalibor}, journal = {Journal of Theoretical Probability}, volume = {20}, pages = {971--1004}, year = {2007}, } @article{dehling1983limit, title = {Limit theorems for sums of weakly dependent {Banach} space valued random variables}, author = {Dehling, Herold}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {63}, number = {3}, pages = {393--432}, year = {1983}, } @article{delapena1995decoupling, author = {de la Pe{\~n}a, Victor H and Montgomery-Smith, Stephen J}, journal = {Annals of Probability}, number = {2}, pages = {806--816}, title = {Decoupling inequalities for the tail probabilities of multivariate {U}-statistics}, volume = {23}, year = {1995}, } @article{dinardo1996distribution, title = {Labor Market Institutions and the Distribution of Wages, 1973--1992: A Semiparametric Approach}, author = {John DiNardo and Nicole M Fortin and Thomas Lemieux}, journal = {Econometrica}, volume = {64}, number = {5}, pages = {1001--1044}, year = {1996} } @article{dudley1983invariance, title = {Invariance principles for sums of {Banach} space valued random elements and empirical processes}, author = {Dudley, R. M. and Philipp, Walter}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {62}, number = {4}, pages = {509--552}, year = {1983}, } @book{dudley1999uniform, author = {Dudley, R. M.}, publisher = {Cambridge University Press}, series = {Cambridge Studies in Advanced Mathematics}, title = {Uniform Central Limit Theorems}, year = {1999}, } @article{duroux2018impact, title = {Impact of subsampling and tree depth on random forests}, author = {Duroux, Roxane and Scornet, Erwan}, journal = {ESAIM: Probability and Statistics}, volume = {22}, pages = {96--128}, year = {2018}, } @article{efron1981jackknife, title = {The jackknife estimate of variance}, author = {Efron, Bradley and Stein, Charles}, journal = {Annals of Statistics}, pages = {586--596}, year = {1981}, } @book{eggermont2009maximum, title = {Maximum Penalized Likelihood Estimation: Volume II: Regression}, author = {Eggermont, Paul P B and LaRiccia, Vincent N}, series = {Springer Series in Statistics}, year = {2009}, publisher = {Springer}, address = {New York, NY}, } @book{fan1996local, author = {Fan, J. and I.
Gijbels}, title = {Local Polynomial Modelling and Its Applications}, series = {Monographs on Statistics and Applied Probability}, volume = {66}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, year = {1996} } @book{fan2020statistical, title = {Statistical Foundations of Data Science}, series = {Data Science Series}, author = {Fan, Jianqing and Li, Runze and Zhang, Cun-Hui and Zou, Hui}, year = {2020}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, } @article{friedberg2020local, title = {Local linear forests}, author = {Friedberg, Rina and Tibshirani, Julie and Athey, Susan and Wager, Stefan}, journal = {Journal of Computational and Graphical Statistics}, volume = {30}, number = {2}, pages = {503--517}, year = {2020}, } @article{gao2021minimax, author = {Gao, Chao and Ma, Zongming}, journal = {Statistical Science}, number = {1}, pages = {16--33}, title = {Minimax rates in network analysis: Graphon estimation, community detection and hypothesis testing}, volume = {36}, year = {2021}, } @article{gao2022towards, title = {Towards convergence rate analysis of random forests for classification}, author = {Gao, Wei and Xu, Fan and Zhou, Zhi-Hua}, journal = {Artificial Intelligence}, volume = {313}, pages = {103788}, year = {2022}, } @book{geer2000empirical, title = {Empirical Processes in {M}-Estimation}, author = {Sara A van de Geer}, volume = {6}, year = {2000}, publisher = {Cambridge University Press}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, } @article{giessing2023anti, title = {Anti-concentration of Suprema of {Gaussian} Processes and {Gaussian} Order Statistics}, author = {Giessing, Alexander}, journal = {Preprint}, note = {\arxiv{2310.12119}}, year = {2023} } @incollection{gine2000exponential, author = {Gin{\'e}, Evarist and Lata{\l}a, Rafa{\l} and Zinn, Joel}, booktitle = {High Dimensional Probability II}, pages = {13--38}, publisher = {Birkh{\"a}user}, address = {Boston, MA}, title = {Exponential and moment inequalities for {U}-statistics}, year = {2000}, editor = {Evarist Gin{\'e} and David M Mason and Jon A Wellner}, } @article{gine2004kernel, author = {Gin{\'e}, Evarist and Koltchinskii, Vladimir and Sakhanenko, Lyudmila}, journal = {Probability Theory and Related Fields}, number = {2}, pages = {167--198}, title = {Kernel density estimators: convergence in distribution for weighted sup-norms}, volume = {130}, year = {2004}, } @article{gine2010confidence, author = {Gin{\'e}, Evarist and Nickl, Richard}, journal = {Annals of Statistics}, number = {2}, pages = {1122--1170}, title = {Confidence bands in density estimation}, volume = {38}, year = {2010}, } @book{gine2021mathematical, author = {Gin{\'e}, Evarist and Nickl, Richard}, publisher = {Cambridge University Press}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, title = {Mathematical Foundations of Infinite-Dimensional Statistical Models}, year = {2021}, } @incollection{graham2020network, author = {Graham, Bryan S}, booktitle = {Handbook of Econometrics}, pages = {111--218}, publisher = {Elsevier}, title = {Network data}, volume = {7}, year = {2020}, editor = {Steven N Durlauf and Lars Peter Hansen and James J. 
Heckman and Rosa L Matzkin}, } @techreport{graham2021minimax, author = {Graham, Bryan S and Niu, Fengshi and Powell, James L}, institution = {National Bureau of Economic Research}, title = {Minimax Risk and Uniform Convergence Rates for Nonparametric Dyadic Regression}, year = {2021}, } @article{graham2024kernel, title = {Kernel density estimation for undirected dyadic data}, author = {Graham, Bryan S and Niu, Fengshi and Powell, James L}, journal = {Journal of Econometrics}, volume = {240}, number = {2}, year = {2024}, } @book{hall1980martingale, title = {Martingale Limit Theory and its Application}, author = {Hall, Peter and Heyde, Christopher C}, year = {1980}, publisher = {Academic Press}, address = {New York, NY}, } @article{hall1992effect, author = {Hall, Peter}, journal = {Annals of Statistics}, volume = {20}, number = {2}, pages = {675--694}, title = {Effect of bias estimation on coverage accuracy of bootstrap confidence intervals for a probability density}, year = {1992}, } @article{hall2001bootstrapping, author = {Hall, Peter and Kang, Kee-Hoon}, journal = {Annals of Statistics}, number = {5}, pages = {1443--1468}, title = {Bootstrapping nonparametric density estimators with empirically chosen bandwidths}, volume = {29}, year = {2001}, } @incollection{head2014gravity, title = {Gravity equations: Workhorse, toolkit, and cookbook}, author = {Head, Keith and Mayer, Thierry}, booktitle = {Handbook of International Economics}, volume = {4}, pages = {131--195}, year = {2014}, publisher = {Elsevier}, editor = {Gita Gopinath and Elhanan Helpman and Kenneth Rogoff}, } @article{hoover1979relations, author = {Hoover, Douglas N}, journal = {Preprint, Institute for Advanced Study, Princeton, NJ}, title = {Relations on probability spaces and arrays of random variables}, year = {1979}, } @article{huang2003local, title = {Local asymptotics for polynomial spline regression}, author = {Huang, Jianhua Z}, journal = {Annals of Statistics}, volume = {31}, number = {5}, pages = {1600--1635}, year = {2003}, } @book{kenny2020dyadic, title = {Dyadic Data Analysis}, author = {Kenny, David A and Kashy, Deborah A and Cook, William L}, year = {2020}, series = {Methodology in the Social Sciences Series}, publisher = {Guilford Press} } @article{khasminskii1978lower, author = {Khasminskii, Rafail Z}, journal = {Theory of Probability and its Applications}, number = {4}, pages = {794--798}, title = {A lower bound on the risks of nonparametric estimates of densities in the uniform metric}, volume = {23}, year = {1978}, } @inproceedings{klusowski2021sharp, title = {Sharp analysis of a simple model for random forests}, author = {Klusowski, Jason M}, booktitle = {International Conference on Artificial Intelligence and Statistics}, pages = {757--765}, year = {2021}, organization = {Proceedings of Machine Learning Research} } @article{klusowski2024large, title = {Large scale prediction with decision trees}, author = {Klusowski, Jason M and Tian, Peter M}, journal = {Journal of the American Statistical Association}, pages = {525--537}, volume = {119}, number = {545}, year = {2024}, } @article{koike2021notes, title = {Notes on the dimension dependence in high-dimensional central limit theorems for hyperrectangles}, author = {Koike, Yuta}, journal = {Japanese Journal of Statistics and Data Science}, volume = {4}, pages = {257--297}, year = {2021}, } @book{kolaczyk2009statistical, author = {Kolaczyk, Eric D}, year = {2009}, title = {Statistical Analysis of Network Data: Methods and Models}, series = {Springer Series in
Statistics}, publisher = {Springer}, address = {New York, NY}, } @article{komlos1975approximation, author = {Koml{\'o}s, J{\'a}nos and Major, P{\'e}ter and Tusn{\'a}dy, G{\'a}bor}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, number = {1-2}, pages = {111--131}, title = {An approximation of partial sums of independent {RV}s, and the sample {DF}. {I}}, volume = {32}, year = {1975}, } @article{kozbur2021dimension, title = {Dimension-Free Anticoncentration Bounds for {Gaussian} Order Statistics with Discussion of Applications to Multiple Testing}, author = {Kozbur, Damian}, journal = {Preprint}, note = {\arxiv{2107.10766}}, year = {2021} } @article{kwapien1991hypercontraction, author = {Kwapien, Stanislaw and Szulga, Jerzy}, journal = {Annals of Probability}, number = {1}, pages = {369--379}, title = {Hypercontraction methods in moment inequalities for series of independent random variables in normed spaces}, volume = {19}, year = {1991}, } @article{lakshminarayanan2014mondrian, title = {Mondrian forests: Efficient online random forests}, author = {Lakshminarayanan, Balaji and Roy, Daniel M and Teh, Yee Whye}, journal = {Advances in Neural Information Processing Systems}, volume = {27}, year = {2014} } @inproceedings{lakshminarayanan2016mondrian, title = {Mondrian forests for large-scale regression when uncertainty matters}, author = {Lakshminarayanan, Balaji and Roy, Daniel M and Teh, Yee Whye}, booktitle = {Artificial Intelligence and Statistics}, pages = {1478--1487}, year = {2016}, organization = {Proceedings of Machine Learning Research} } @incollection{laurent2005semidefinite, author = {Monique Laurent and Franz Rendl}, booktitle = {Discrete Optimization}, pages = {393--514}, publisher = {Elsevier}, series = {Handbooks in Operations Research and Management Science}, title = {Semidefinite Programming and Integer Programming}, volume = {12}, year = {2005}, editor = {K Aardal and G L Nemhauser and R Weismantel}, } @techreport{lecam1988, author = {Le Cam, L}, title = {On the {Prokhorov} distance between the empirical process and the associated {Gaussian} bridge}, institution = {University of California, Berkeley}, year = {1988} } @book{ledoux1991probability, author = {Ledoux, Michel and Talagrand, Michel}, publisher = {Springer}, series = {Classics in Mathematics}, address = {Berlin, Heidelberg}, title = {Probability in Banach Spaces}, year = {1991}, } @book{legall2016brownian, author = {Le Gall, Jean-Fran{\c{c}}ois}, publisher = {Springer}, address = {Berlin, Heidelberg}, title = {Brownian Motion, Martingales, and Stochastic Calculus}, series = {Graduate Texts in Mathematics}, volume = {274}, year = {2016}, } @article{lepskii1992asymptotically, author = {Lepskii, O V}, journal = {Theory of Probability \& its Applications}, number = {4}, pages = {682--697}, title = {Asymptotically minimax adaptive estimation. {I}: Upper bounds. 
Optimally adaptive estimates}, volume = {36}, year = {1992}, } @article{li2020uniform, title = {Uniform nonparametric inference for time series}, journal = {Journal of Econometrics}, volume = {219}, number = {1}, pages = {38--51}, year = {2020}, author = {Jia Li and Zhipeng Liao} } @article{lopes2020bootstrapping, title = {Bootstrapping max statistics in high dimensions: Near-parametric rates under weak variance decay and application to functional and multinomial data}, author = {Lopes, Miles E and Lin, Zhenhua and M{\"u}ller, Hans-Georg}, journal = {Annals of Statistics}, volume = {48}, number = {2}, pages = {1214--1229}, year = {2020}, } @article{lopes2022central, title = {Central limit theorem and bootstrap approximation in high dimensions: Near $1/n$ rates via implicit smoothing}, author = {Lopes, Miles E}, journal = {Annals of Statistics}, volume = {50}, number = {5}, pages = {2492--2513}, year = {2022}, } @article{luke2007network, title = {Network analysis in public health: history, methods, and applications}, author = {Luke, Douglas A and Harris, Jenine K}, journal = {Annual Review of Public Health}, volume = {28}, pages = {69--93}, year = {2007}, } @inproceedings{ma2020isolation, title = {Isolation {Mondrian} forest for batch and online anomaly detection}, author = {Ma, Haoran and Ghojogh, Benyamin and Samad, Maria N and Zheng, Dongyu and Crowley, Mark}, booktitle = {2020 IEEE International Conference on Systems, Man, and Cybernetics}, pages = {3051--3058}, year = {2020}, organization = {Institute of Electrical and Electronics Engineers}, } @article{magda2018martingale, title = {Martingale approximations for random fields}, author = {Peligrad, Magda and Zhang, Na}, journal = {Electronic Communications in Probability}, volume = {23}, number = {28}, pages = {1--9}, year = {2018} } @article{matsushita2021jackknife, author = {Matsushita, Yukitoshi and Otsu, Taisuke}, journal = {Biometrika}, number = {3}, pages = {661--674}, title = {Jackknife empirical likelihood: small bandwidth, sparse network and high-dimensional asymptotics}, volume = {108}, year = {2021}, } @article{mcleish1975invariance, title = {Invariance principles for dependent variables}, author = {McLeish, Don L}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {32}, number = {3}, pages = {165--178}, year = {1975}, } @incollection{merlevede2009bernstein, title = {Bernstein inequality and moderate deviations under strong mixing conditions}, author = {Merlev{\`e}de, Florence and Peligrad, Magda and Rio, Emmanuel}, booktitle = {High Dimensional Probability V, the Luminy volume}, pages = {273--292}, year = {2009}, publisher = {Institute of Mathematical Statistics}, editor = {Christian Houdr{\'e} and Vladimir Koltchinskii and David M Mason and Magda Peligrad}, } @article{minsker2019moment, author = {Minsker, Stanislav and Wei, Xiaohan}, journal = {Electronic Journal of Probability}, number = {133}, pages = {1--32}, title = {Moment inequalities for matrix-valued {U}-statistics of order 2}, volume = {24}, year = {2019}, } @manual{mosek, author = {{MOSEK ApS}}, title = {The {MOSEK} {Optimizer} {API} for {C} manual.
Version 9.3}, year = {2021}, } @article{mourtada2017universal, title = {Universal consistency and minimax rates for online {Mondrian} forests}, author = {Mourtada, Jaouad and Ga{\"\i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Advances in Neural Information Processing Systems}, volume = {30}, year = {2017} } @article{mourtada2020minimax, title = {Minimax optimal rates for {Mondrian} trees and forests}, author = {Mourtada, Jaouad and Ga{\"\i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Annals of Statistics}, volume = {48}, number = {4}, pages = {2253--2276}, year = {2020}, } @article{mourtada2021amf, title = {{AMF}: Aggregated {Mondrian} forests for online learning}, author = {Mourtada, Jaouad and Ga{\"\i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, volume = {83}, number = {3}, pages = {505--533}, year = {2021}, } @incollection{nazarov2003maximal, title = {On the Maximal Perimeter of a Convex Set in $\mathbb{R}^n$ with Respect to a {Gaussian} Measure}, author = {Nazarov, Fedor}, booktitle = {Geometric Aspects of Functional Analysis}, pages = {169--187}, year = {2003}, publisher = {Springer}, editor = {Vitali D Milman and Gideon Schechtman}, } @article{oreilly2022stochastic, title = {Stochastic geometry to generalize the {Mondrian} process}, author = {O'Reilly, Eliza and Tran, Ngoc Mai}, journal = {SIAM Journal on Mathematics of Data Science}, volume = {4}, number = {2}, pages = {531--552}, year = {2022}, } @incollection{peligrad2010conditional, title = {Conditional central limit theorem via martingale approximation}, author = {Peligrad, M}, booktitle = {Dependence in Probability, Analysis and Number Theory, volume in memory of Walter Philipp}, pages = {295--311}, year = {2010}, publisher = {Kendrick Press}, editor = {Istvan Berkes and Richard C Bradley and Herold Dehling and Magda Peligrad and Robert Tichy}, } @book{pollard2002user, author = {Pollard, David}, publisher = {Cambridge University Press}, title = {A User's Guide to Measure Theoretic Probability}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, year = {2002}, } @article{rakhlin2015sequential, title = {Sequential complexities and uniform martingale laws of large numbers}, author = {Rakhlin, Alexander and Sridharan, Karthik and Tewari, Ambuj}, journal = {Probability Theory and Related Fields}, volume = {161}, number = {1}, pages = {111--153}, year = {2015}, } @article{ray2021bernstein, title = {On the {Bernstein}--von {Mises} theorem for the {Dirichlet} process}, author = {Ray, Kolyan and van der Vaart, Aad}, journal = {Electronic Journal of Statistics}, volume = {15}, number = {1}, pages = {2224--2246}, year = {2021}, } @book{rio2017asymptotic, title = {Asymptotic Theory of Weakly Dependent Random Processes}, series = {Probability Theory and Stochastic Modelling}, author = {Rio, Emmanuel}, volume = {80}, year = {2017}, publisher = {Springer}, address = {Berlin, Heidelberg}, } @inproceedings{roy2008mondrian, title = {The {Mondrian} Process}, author = {Roy, Daniel M and Teh, Yee Whye}, booktitle = {Neural Information Processing Systems}, volume = {21}, year = {2008} } @book{royden1988real, author = {Royden, Halsey Lawrence and Fitzpatrick, Patrick}, publisher = {Macmillan}, address = {New York, NY}, title = {Real Analysis}, year = {1988}, } @article{schucany1977improvement, title = {Improvement of kernel type density estimators}, author = {Schucany, William R and Sommers, John P}, journal = {Journal of the
American Statistical Association}, volume = {72}, number = {358}, pages = {420--423}, year = {1977}, } @article{scillitoe2021uncertainty, title = {Uncertainty quantification for data-driven turbulence modelling with {Mondrian} forests}, author = {Scillitoe, Ashley and Seshadri, Pranay and Girolami, Mark}, journal = {Journal of Computational Physics}, volume = {430}, pages = {110116}, year = {2021}, } @article{scornet2015consistency, author = {Erwan Scornet and G{\'e}rard Biau and Jean-Philippe Vert}, journal = {Annals of Statistics}, keywords = {Additive model, consistency, Dimension reduction, random forests, Randomization, Sparsity}, number = {4}, pages = {1716--1741}, title = {Consistency of random forests}, volume = {43}, year = {2015}, } @article{settati2009gaussian, title = {Gaussian approximation of the empirical process under random entropy conditions}, author = {Settati, Adel}, journal = {Stochastic Processes and their Applications}, volume = {119}, number = {5}, pages = {1541--1560}, year = {2009}, } @article{sheehy1992uniform, title = {Uniform {Donsker} classes of functions}, author = {Sheehy, Anne and Wellner, Jon A}, journal = {Annals of Probability}, volume = {20}, number = {4}, pages = {1983--2030}, year = {1992}, } @book{simonoff1996smoothing, title = {Smoothing Methods in Statistics}, author = {Simonoff, Jeffrey S}, series = {Springer Series in Statistics}, year = {1996}, publisher = {Springer Science}, address = {New York, NY}, } @article{stone1982optimal, title = {Optimal global rates of convergence for nonparametric regression}, author = {Stone, Charles J}, journal = {Annals of Statistics}, pages = {1040--1053}, year = {1982}, } @book{van1996weak, title = {Weak Convergence and Empirical Processes}, author = {van der Vaart, Aad Willem and Wellner, Jon August}, year = {1996}, series = {Springer Series in Statistics}, publisher = {Springer}, address = {New York, NY}, } @article{van2013bernstein, title = {The {Bernstein}--{Orlicz} norm and deviation inequalities}, author = {van de Geer, Sara and Lederer, Johannes}, journal = {Probability Theory and Related Fields}, volume = {157}, number = {1}, pages = {225--250}, year = {2013}, } @inproceedings{vicuna2021reducing, title = {Reducing numerical precision preserves classification accuracy in {Mondrian} Forests}, author = {Vicuna, Marc and Khannouz, Martin and Kiar, Gregory and Chatelain, Yohan and Glatard, Tristan}, booktitle = {2021 IEEE International Conference on Big Data}, pages = {2785--2790}, year = {2021}, organization = {Institute of Electrical and Electronics Engineers}, } @book{wand1994kernel, author = {Wand, Matt P and Jones, M Chris}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, title = {Kernel Smoothing}, year = {1994}, series = {Monographs on Statistics and Applied Probability}, volume = {60}, } @article{wu2004martingale, title = {Martingale approximations for sums of stationary processes}, author = {Wu, Wei Biao and Woodroofe, Michael}, journal = {Annals of Probability}, volume = {32}, number = {2}, pages = {1674--1690}, year = {2004} } @article{yurinskii1978error, author = {Yurinskii, Vadim Vladimirovich}, journal = {Theory of Probability \& its Applications}, number = {2}, pages = {236--247}, title = {On the error of the {Gaussian} approximation for convolutions}, volume = {22}, year = {1978}, } @article{zaitsev1987estimates, title = {Estimates of the {L}{\'e}vy--{Prokhorov} distance in the multivariate central limit theorem for random variables with finite exponential moments}, author = 
{Zaitsev, A Yu}, journal = {Theory of Probability \& Its Applications}, volume = {31}, number = {2}, pages = {203--220}, year = {1987}, } @article{zaitsev1987gaussian, title = {On the {Gaussian} approximation of convolutions under multidimensional analogues of {S.\ N.\ Bernstein's} inequality conditions}, author = {Zaitsev, A Yu}, journal = {Probability Theory and Related Fields}, volume = {74}, number = {4}, pages = {535--566}, year = {1987}, } @article{zhai2018high, title = {A high-dimensional {CLT} in $\mathcal{W}_2$ distance with near optimal convergence rate}, author = {Zhai, Alex}, journal = {Probability Theory and Related Fields}, volume = {170}, number = {3}, pages = {821--845}, year = {2018}, } @article{zhao2008martingale, title = {On martingale approximations}, author = {Zhao, Ou and Woodroofe, Michael}, journal = {Annals of Applied Probability}, volume = {18}, number = {5}, pages = {1831--1847}, year = {2008} } @article{zhou2019deep, title = {Deep forest}, author = {Zhou, Zhi-Hua and Feng, Ji}, journal = {National Science Review}, volume = {6}, number = {1}, pages = {74--86}, year = {2019}, } tex-fmt-0.5.2/tests/source/puthesis.cls000066400000000000000000000065021473573253500201340ustar00rootroot00000000000000\NeedsTeXFormat{LaTeX2e} \ProvidesClass{puthesis} \RequirePackage{setspace} \RequirePackage{xcolor} \def\current@color{ Black} \newcounter{subyear} \setcounter{subyear}{\number\year} \def\submitted#1{\gdef\@submitted{#1}} \def\@submittedyear{\ifnum\month>10 \stepcounter{subyear}\thesubyear \else\thesubyear\fi} \def\@submittedmonth{\ifnum\month>10 January\else\ifnum\month>8 November \else\ifnum\month>6 September\else May\fi\fi\fi} \def\adviser#1{\gdef\@adviser{#1}} \long\def\@abstract{\@latex@error{No \noexpand\abstract given}\@ehc} \newcommand*{\frontmatter}{ %\pagenumbering{roman} } \newcommand*{\mainmatter}{ %\pagenumbering{arabic} } \newcommand*{\makelot}{} \newcommand*{\makelof}{} \newcommand*{\makelos}{} \newcommand*{\begincmd}{ \doublespacing \frontmatter\maketitlepage\makecopyrightpage\makeabstract \makeacknowledgments\makededication\tableofcontents\clearpage \makelot\clearpage\makelof\clearpage\makelos \clearpage\mainmatter} \def\@submitted{\@submittedmonth~\@submittedyear} \def\@dept{Operations Research and Financial Engineering} \def\@deptpref{Department of} \def\departmentprefix#1{\gdef\@deptpref{#1}} \def\department#1{\gdef\@dept{#1}} \long\def\acknowledgments#1{\gdef\@acknowledgments{#1}} \def\dedication#1{\gdef\@dedication{#1}} \newcommand{\maketitlepage}{{ \thispagestyle{empty} \sc \vspace*{0in} \begin{center} \LARGE \@title \end{center} \vspace{.6in} \begin{center} \@author \end{center} \vspace{.6in} \begin{center} A Dissertation \\ Presented to the Faculty \\ of Princeton University \\ in Candidacy for the Degree \\ of Doctor of Philosophy \end{center} \vspace{.3in} \begin{center} Recommended for Acceptance \\ by the \@deptpref \\ \@dept \\ Adviser: \@adviser \end{center} \vspace{.3in} \begin{center} \@submitted \end{center} \clearpage }} \newcommand*{\makecopyrightpage}{ \thispagestyle{empty} \vspace*{0in} \begin{center} \copyright\ Copyright by \@author, \number\year. \\ All rights reserved. 
\end{center} \clearpage} \newcommand*{\makeabstract}{ \newpage \addcontentsline{toc}{section}{Abstract} \begin{center} \Large \textbf{Abstract} \end{center} \@abstract \clearpage } \def\makeacknowledgments{ \ifx\@acknowledgments\undefined \else \addcontentsline{toc}{section}{Acknowledgments} \begin{center} \Large \textbf{Acknowledgments} \end{center} \@acknowledgments \clearpage \fi } \def\makededication{ \ifx\@dedication\undefined \else \vspace*{1.5in} \begin{flushright} \@dedication \end{flushright} \clearpage \fi } \DeclareOption{myorder}{ \renewcommand*{\begincmd}{\doublespacing}} \DeclareOption{lot}{\renewcommand*{\makelot}{ \addcontentsline{toc}{section}{List of Tables}\listoftables}} \DeclareOption{lof}{\renewcommand*{\makelof}{ \addcontentsline{toc}{section}{List of Figures and Tables}\listoffigures}} \DeclareOption{los}{ \renewcommand*{\makelos}{ \RequirePackage{losymbol} \section*{List of Symbols\@mkboth {LIST OF SYMBOLS}{LIST OF SYMBOLS}} \@starttoc{los} \addcontentsline{toc}{section}{List of Symbols} } } \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions \LoadClass{report} \setlength{\oddsidemargin}{0.2in} \setlength{\evensidemargin}{0.2in} \setlength{\topmargin}{0in} \setlength{\headheight}{0in} \setlength{\headsep}{0in} \setlength{\textheight}{8.9in} \setlength{\textwidth}{6.1in} \setlength{\footskip}{0.5in} \long\def\abstract#1{\gdef\@abstract{#1}} \AtBeginDocument{\begincmd} \endinput tex-fmt-0.5.2/tests/source/quiver.sty000066400000000000000000000031461473573253500176420ustar00rootroot00000000000000% *** quiver *** % A package for drawing commutative diagrams exported from https://q.uiver.app. % % This package is currently a wrapper around the `tikz-cd` package, importing necessary TikZ % libraries, and defining a new TikZ style for curves of a fixed height. % % Version: 1.4.2 % Authors: % - varkor (https://github.com/varkor) % - AndréC (https://tex.stackexchange.com/users/138900/andr%C3%A9c) \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{quiver}[2021/01/11 quiver] % `tikz-cd` is necessary to draw commutative diagrams. \RequirePackage{tikz-cd} % `amssymb` is necessary for `\lrcorner` and `\ulcorner`. \RequirePackage{amssymb} % `calc` is necessary to draw curved arrows. \usetikzlibrary{calc} % `pathmorphing` is necessary to draw squiggly arrows. \usetikzlibrary{decorations.pathmorphing} % A TikZ style for curved arrows of a fixed height, due to AndréC. \tikzset{curve/.style={settings={#1},to path={(\tikztostart) .. controls ($(\tikztostart)!\pv{pos}!(\tikztotarget)!\pv{height}!270:(\tikztotarget)$) % tex-fmt: skip and ($(\tikztostart)!1-\pv{pos}!(\tikztotarget)!\pv{height}!270:(\tikztotarget)$) % tex-fmt: skip .. (\tikztotarget)\tikztonodes}}, settings/.code={\tikzset{quiver/.cd,#1} \def\pv##1{\pgfkeysvalueof{/tikz/quiver/##1}}}, quiver/.cd,pos/.initial=0.35,height/.initial=0} % TikZ arrowhead/tail styles. \tikzset{tail reversed/.code={\pgfsetarrowsstart{tikzcd to}}} \tikzset{2tail/.code={\pgfsetarrowsstart{Implies[reversed]}}} \tikzset{2tail reversed/.code={\pgfsetarrowsstart{Implies}}} % TikZ arrow styles. 
\tikzset{no body/.style={/tikz/dash pattern=on 0 off 1mm}} \endinput tex-fmt-0.5.2/tests/source/readme.tex000066400000000000000000000002571473573253500175450ustar00rootroot00000000000000\documentclass{article} \begin{document} \begin{itemize} \item Lists with items over multiple lines \end{itemize} \begin{equation} E = m c^2 \end{equation} \end{document} tex-fmt-0.5.2/tests/source/sections.tex000066400000000000000000000007151473573253500201360ustar00rootroot00000000000000\documentclass{book} \begin{document} \section{Section test} Sectioning commands should be moved to their own lines.\subsection{Result} Even if there is more than one.\subsection{Result 2} Also \section*{A} unnumbered sectioning commands \subsection*{B} should be split onto their own lines, even if there \subsubsection*{C} is more than one. All of this \part{D} should also hold \part*{E} for parts \chapter{F} and chapters \chapter*{G}. \end{document} tex-fmt-0.5.2/tests/source/short_document.tex000066400000000000000000000020761473573253500213460ustar00rootroot00000000000000\documentclass{article} \usepackage{amsmath} \usepackage{amsthm} \newtheorem{theorem}{Theorem} \title{Testing \texttt{tex-fmt}} \author{William G.\ Underwood} \begin{document} \maketitle \begin{align} E = m c^2 \\ 1 + 2 + (3 + 4) + (5 + 6 + 7 + 8) + (9 + 10 + 11 + 12 + 13 + 14) \end{align} \begin{itemize} \item Item one % trailing comment with ]) brackets \item Item two on multiple lines \item Item three \begin{itemize} \item Subitem one of item two % this line has trailing spaces \item Subitem two of item two \end{itemize} \item Item four % trailing comment % with [( brackets \item \end{itemize} \begin{theorem}[Pythagoras]% \label{thm:pythagoras} For a right triangle with hypotenuse $c$ and other sides $a$ and $b$, we have % \begin{align*} a^2 + b^2 = c^2 \end{align*} % % some comments \end{theorem} This line contains \emph{emphasized} text. \emph{This line contains only emphasized text, and is broken over two lines}. \emph{This line contains only emphasized text, and is broken over three lines}. \end{document} % This file ends with trailing newlines tex-fmt-0.5.2/tests/source/tikz_network.sty000066400000000000000000001515171473573253500210660ustar00rootroot00000000000000% ============================================================================ % File : tikz-network.sty -- Library for plotting networks in TikZ % Author : Juergen Hackl % Creation : 2017-02-28 % Time-stamp: % Version : 1.0 (2018-07-30) % % Copyright (c) 2018 Juergen Hackl % % This program is free software: you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation, either version 3 of the License, or % (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program. If not, see <https://www.gnu.org/licenses/>.
% ============================================================================ \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{tikz-network}[2018/07/30 tikz-network v1.0] %============================================================================= % Used packages %============================================================================= \RequirePackage{etex} \RequirePackage{xifthen} \RequirePackage{xkeyval}[2005/11/25] \RequirePackage{tikz} \RequirePackage{datatool} \RequirePackage{graphicx} \usetikzlibrary{arrows} \usetikzlibrary{positioning} \usetikzlibrary{3d} \usetikzlibrary{fit} \usetikzlibrary{calc} \usetikzlibrary{backgrounds} \usetikzlibrary{arrows.meta} \usetikzlibrary{shapes.geometric} %============================================================================= %============================================================================= % Predefined variables %============================================================================= %<---------------------------------------------------------------------------> % Vertex %<---------------------------------------------------------------------------> \definecolor{vertexfill}{HTML}{abd7e6} \newcommand*{\DefaultUnit}{cm} \newcommand*{\DistanceScale}{1} \newcommand*{\VertexShape}{circle} \newcommand*{\VertexInnerSep}{2pt} \newcommand*{\VertexOuterSep}{0pt} \newcommand*{\VertexMinSize}{0.6\DefaultUnit} \newcommand*{\VertexLineWidth}{1pt} \newcommand*{\VertexLineColor}{black} \newcommand*{\VertexLineOpacity}{1} \newcommand*{\VertexTextColor}{black} \newcommand*{\VertexFillColor}{vertexfill} \newcommand*{\VertexFillOpacity}{1} \newcommand*{\VertexTextFont}{\scriptsize}%\tiny} \newcommand*{\VertexTextRotation}{0} \newcommand*{\VertexTextOpacity}{1} %<---------------------------------------------------------------------------> % Edge %<---------------------------------------------------------------------------> \newcommand*{\EdgeArrow}{-latex} \newcommand*{\EdgeLineWidth}{1.5pt} \newcommand*{\EdgeColor}{black!75} \newcommand*{\EdgeOpacity}{1} \newcommand*{\EdgeTextFillColor}{white} \newcommand*{\EdgeTextFillOpacity}{1} \newcommand*{\EdgeInnerSep}{0pt} \newcommand*{\EdgeOuterSep}{1pt} \newcommand*{\EdgeTextRotation}{0} \newcommand*{\EdgeTextOpacity}{1} \newcommand*{\EdgeTextFont}{\scriptsize} %<---------------------------------------------------------------------------> % Plane %<---------------------------------------------------------------------------> \newcommand*{\PlaneLineWidth}{1.5pt} \newcommand*{\PlaneLineColor}{black} \newcommand*{\PlaneLineOpacity}{1} \newcommand*{\PlaneGridLineWidth}{.5pt} \newcommand*{\PlaneGridColor}{black} \newcommand*{\PlaneGridOpacity}{.5} \newcommand*{\PlaneFillColor}{vertexfill} \newcommand*{\PlaneFillOpacity}{.3} \newcommand*{\PlaneWidth}{5\DefaultUnit} \newcommand*{\PlaneHeight}{5\DefaultUnit} %<---------------------------------------------------------------------------> % Text %<---------------------------------------------------------------------------> \newcommand*{\TextInnerSep}{2pt} \newcommand*{\TextOuterSep}{0pt} \newcommand*{\TextFont}{\normalsize} \newcommand*{\TextColor}{black} \newcommand*{\TextRotation}{0} \newcommand*{\TextOpacity}{1} %<---------------------------------------------------------------------------> % Network %<---------------------------------------------------------------------------> \newcommand*{\NetworkLayerDistance}{-2} \newcommand*{\xAngle}{-12} \newcommand*{\xLength}{1} \newcommand*{\yAngle}{37} \newcommand*{\yLength}{1} \newcommand*{\zAngle}{90} 
\newcommand*{\zLength}{1} \tikzset{edge canvas/.style={}} \tikzset{multilayer 2d/.style={y={(0:1cm)},x={(90:1cm)},z={(90:0cm)},every node/.append style={transform shape},}} \def\Origin{\draw [->] (0,0,0) -- (2,0,0) node [at end, right] {$y$}; \draw [->] (0,0,0) -- (0,2,0) node [at end, right] {$x$}; \draw [->] (0,0,0) -- (0,0,2) node [at end, left] {$z$};} %============================================================================= % Predefined Styles %============================================================================= %<---------------------------------------------------------------------------> % Init Default Vertex Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DVS} {Shape}{} \define@cmdkey [NW] {DVS} {MinSize}{} \define@cmdkey [NW] {DVS} {LineWidth}{} \define@cmdkey [NW] {DVS} {LineColor}{} \define@cmdkey [NW] {DVS} {LineOpacity}{} \define@cmdkey [NW] {DVS} {FillColor}{} \define@cmdkey [NW] {DVS} {FillOpacity}{} \define@cmdkey [NW] {DVS} {TextColor}{} \define@cmdkey [NW] {DVS} {TextFont}{} \define@cmdkey [NW] {DVS} {TextRotation}{} \define@cmdkey [NW] {DVS} {TextOpacity}{} \define@cmdkey [NW] {DVS} {InnerSep}{} \define@cmdkey [NW] {DVS} {OuterSep}{} \presetkeys [NW] {DVS} { Shape = \VertexShape, MinSize = \VertexMinSize, LineWidth = \VertexLineWidth, LineColor = \VertexLineColor, FillColor = \VertexFillColor, LineOpacity = \VertexLineOpacity, FillOpacity = \VertexFillOpacity, InnerSep = \VertexInnerSep, OuterSep = \VertexOuterSep, TextColor = \VertexTextColor, TextRotation = \VertexTextRotation, TextOpacity = \VertexTextOpacity, TextFont = \VertexTextFont }{} %<---------------------------------------------------------------------------> % Init Default Edge Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DES} {Arrow}{} \define@cmdkey [NW] {DES} {LineWidth}{} \define@cmdkey [NW] {DES} {Color}{} \define@cmdkey [NW] {DES} {Opacity}{} \define@cmdkey [NW] {DES} {TextFillColor}{} \define@cmdkey [NW] {DES} {TextFillOpacity}{} \define@cmdkey [NW] {DES} {TextFont}{} \define@cmdkey [NW] {DES} {TextRotation}{} \define@cmdkey [NW] {DES} {TextOpacity}{} \define@cmdkey [NW] {DES} {InnerSep}{} \define@cmdkey [NW] {DES} {OuterSep}{} \presetkeys [NW] {DES} { Arrow = \EdgeArrow, LineWidth = \EdgeLineWidth, Color = \EdgeColor, Opacity = \EdgeOpacity, TextFillColor = \EdgeTextFillColor, TextFillOpacity = \EdgeTextFillOpacity, InnerSep = \EdgeInnerSep, OuterSep = \EdgeOuterSep, TextRotation = \EdgeTextRotation, TextOpacity = \EdgeTextOpacity, TextFont = \EdgeTextFont }{} %<---------------------------------------------------------------------------> % Init Default Plane Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DPS} {LineWidth}{} \define@cmdkey [NW] {DPS} {LineColor}{} \define@cmdkey [NW] {DPS} {LineOpacity}{} \define@cmdkey [NW] {DPS} {GridLineWidth}{} \define@cmdkey [NW] {DPS} {GridColor}{} \define@cmdkey [NW] {DPS} {GridOpacity}{} \define@cmdkey [NW] {DPS} {FillColor}{} \define@cmdkey [NW] {DPS} {FillOpacity}{} \presetkeys [NW] {DPS} { LineWidth = \PlaneLineWidth, LineColor = \PlaneLineColor, LineOpacity = \PlaneLineOpacity, GridLineWidth = \PlaneGridLineWidth, GridColor = \PlaneGridColor, GridOpacity = \PlaneGridOpacity, FillColor = \PlaneFillColor, FillOpacity = \PlaneFillOpacity }{} %<---------------------------------------------------------------------------> % Init Default Text Style 
%<---------------------------------------------------------------------------> \define@cmdkey [NW] {DTS} {InnerSep}{} \define@cmdkey [NW] {DTS} {OuterSep}{} \define@cmdkey [NW] {DTS} {TextFont}{} \define@cmdkey [NW] {DTS} {TextColor}{} \define@cmdkey [NW] {DTS} {TextRotation}{} \define@cmdkey [NW] {DTS} {TextOpacity}{} \presetkeys [NW] {DTS} { InnerSep = \TextInnerSep, OuterSep = \TextOuterSep, TextFont = \TextFont, TextColor = \TextColor, TextRotation = \TextRotation, TextOpacity = \TextOpacity }{} %<---------------------------------------------------------------------------> % Init Default Coordinates 3D %<---------------------------------------------------------------------------> \define@cmdkey [NW] {COS} {xAngle}{} \define@cmdkey [NW] {COS} {xLength}{} \define@cmdkey [NW] {COS} {yAngle}{} \define@cmdkey [NW] {COS} {yLength}{} \define@cmdkey [NW] {COS} {zAngle}{} \define@cmdkey [NW] {COS} {zLength}{} \presetkeys [NW] {COS} { xAngle = \xAngle, xLength = \xLength, yAngle = \yAngle, yLength = \yLength, zAngle = \zAngle, zLength = \zLength }{} %<---------------------------------------------------------------------------> % Default Style %<---------------------------------------------------------------------------> \newcommand*{\SetVertexStyle}[1][]{\NW@SetVertexStyleDefault[#1]}% \def\NW@SetVertexStyleDefault[#1]{% \setkeys[NW]{DVS}{#1}% \tikzset{VertexStyle/.style = { draw, shape = \cmdNW@DVS@Shape, color = \cmdNW@DVS@LineColor, fill = \cmdNW@DVS@FillColor, inner sep = \cmdNW@DVS@InnerSep, outer sep = \cmdNW@DVS@OuterSep, minimum size = \cmdNW@DVS@MinSize, line width = \cmdNW@DVS@LineWidth, font = \cmdNW@DVS@TextFont, fill opacity = \cmdNW@DVS@FillOpacity, draw opacity = \cmdNW@DVS@LineOpacity }} \tikzset{LabelStyle/.style={ \cmdNW@DVS@TextColor, font = \cmdNW@DVS@TextFont, rotate = \cmdNW@DVS@TextRotation, opacity = \cmdNW@DVS@TextOpacity, }} }% \newcommand*{\SetEdgeStyle}[1][]{\NW@SetEdgeStyleDefault[#1]}% \def\NW@SetEdgeStyleDefault[#1]{% \setkeys[NW]{DES}{#1}% \tikzset{EdgeStyle/.style = {\cmdNW@DES@Arrow, line width = \cmdNW@DES@LineWidth, color = \cmdNW@DES@Color, opacity = \cmdNW@DES@Opacity }} \tikzset{EdgeLabelStyle/.style={circle, fill = \cmdNW@DES@TextFillColor, fill opacity = \cmdNW@DES@TextFillOpacity, inner sep = \cmdNW@DES@InnerSep, outer sep = \cmdNW@DES@OuterSep, rotate = \cmdNW@DES@TextRotation, text opacity = \cmdNW@DES@TextOpacity, font = \cmdNW@DES@TextFont }} }% \newcommand*{\SetPlaneStyle}[1][]{\NW@SetPlaneStyleDefault[#1]}% \def\NW@SetPlaneStyleDefault[#1]{% \setkeys[NW]{DPS}{#1}% \tikzset{PlaneBorderStyle/.style = {draw, line width = \cmdNW@DPS@LineWidth, color = \cmdNW@DPS@LineColor, draw opacity = \cmdNW@DPS@LineOpacity }} \tikzset{PlaneFillStyle/.style = { fill = \cmdNW@DPS@FillColor, fill opacity = \cmdNW@DPS@FillOpacity }} \tikzset{PlaneGridStyle/.style = {draw, line width = \cmdNW@DPS@GridLineWidth, color = \cmdNW@DPS@GridColor, opacity = \cmdNW@DPS@GridOpacity }} }% \newcommand*{\SetTextStyle}[1][]{\NW@SetTextStyleDefault[#1]}% \def\NW@SetTextStyleDefault[#1]{% \setkeys[NW]{DTS}{#1}% \tikzset{TextStyle/.style = { inner sep = \cmdNW@DTS@InnerSep, outer sep = \cmdNW@DTS@OuterSep, color = \cmdNW@DTS@TextColor, rotate = \cmdNW@DTS@TextRotation, text opacity = \cmdNW@DTS@TextOpacity, font = \cmdNW@DTS@TextFont }} }% \tikzset{ multilayer/.code={% \ifthenelse{\equal{#1}{3d}}{ \tikzset{edge canvas/.style={canvas is yx plane at z=0}} \tikzset{multilayer 3d} }{ \tikzset{edge canvas/.style={}} \tikzset{multilayer 2d} } }, } 
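%<--------------------------------------------------------------------------->
% Note: the `multilayer' key defined above is the switch between the flat 2d
% canvas and the angled 3d canvas. A minimal commented sketch of its intended
% use in a document (illustrative only, not executed here):
%
%   \begin{tikzpicture}[multilayer=3d]
%     % drawing commands here are projected with the x/y/z unit vectors
%     % installed by the `multilayer 3d' style
%   \end{tikzpicture}
%
%   \begin{tikzpicture}[multilayer=2d]
%     % drawing commands here stay in the plain 2d coordinate system
%   \end{tikzpicture}
%<--------------------------------------------------------------------------->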
\newcommand*{\SetCoordinates}[1][]{\NW@SetCoordinates[#1]}% \def\NW@SetCoordinates[#1]{% \setkeys[NW]{COS}{#1}% \tikzset{multilayer 3d/.style={ y={(\cmdNW@COS@xAngle:\cmdNW@COS@xLength \DefaultUnit)}, x={(\cmdNW@COS@yAngle:\cmdNW@COS@yLength \DefaultUnit)}, z={(\cmdNW@COS@zAngle:\cmdNW@COS@zLength \DefaultUnit)}, every node/.append style={transform shape}, }} %\tikzset{edge canvas/.style={canvas is yx plane at z=0}} }% %<---------------------------------------------------------------------------> % Apply default settings %<---------------------------------------------------------------------------> \SetCoordinates \SetVertexStyle \SetEdgeStyle \SetPlaneStyle \SetTextStyle %<---------------------------------------------------------------------------> % Redefine settings %<---------------------------------------------------------------------------> \newcommand*{\SetLayerDistance}[1]{\renewcommand{\NetworkLayerDistance}{#1}} \newcommand*{\SetDefaultUnit}[1]{\renewcommand{\DefaultUnit}{#1}} \newcommand*{\SetDistanceScale}[1]{\renewcommand{\DistanceScale}{#1}} \newcommand*{\SetPlaneWidth}[1]{\renewcommand{\PlaneWidth}{#1}} \newcommand*{\SetPlaneHeight}[1]{\renewcommand{\PlaneHeight}{#1}} \newcommand*{\EdgesInBG}{\presetkeys [NW] {edge} {NotInBG = false}{}} \newcommand*{\EdgesNotInBG}{\presetkeys [NW] {edge} {NotInBG = true}{}} %============================================================================= % Vertex and Edge creation %============================================================================= %<---------------------------------------------------------------------------> % Init Vertex %<---------------------------------------------------------------------------> \define@cmdkey [NW] {vertex} {x}{} \define@cmdkey [NW] {vertex} {y}{} \define@cmdkey [NW] {vertex} {label}{} \define@cmdkey [NW] {vertex} {size}{} \define@cmdkey [NW] {vertex} {color}{} \define@cmdkey [NW] {vertex} {opacity}{} \define@cmdkey [NW] {vertex} {style}{} \define@cmdkey [NW] {vertex} {layer}{} \define@cmdkey [NW] {vertex} {shape}{} \define@cmdkey [NW] {vertex} {fontsize}{} \define@cmdkey [NW] {vertex} {fontcolor}{} \define@cmdkey [NW] {vertex} {fontscale}{} \define@boolkey [NW] {vertex} {RGB}[true]{} \define@boolkey [NW] {vertex} {IdAsLabel}[true]{} \define@boolkey [NW] {vertex} {NoLabel}[true]{} \define@boolkey [NW] {vertex} {Math}[true]{} \define@boolkey [NW] {vertex} {Network}[true]{} \define@boolkey [NW] {vertex} {Pseudo}[true]{} \define@cmdkey [NW] {vertex} {distance}{} \define@cmdkey [NW] {vertex} {position}{} \presetkeys [NW] {vertex} {Network = false,}{} %<---------------------------------------------------------------------------> % Vertex %<---------------------------------------------------------------------------> \newcommand*{\Vertex}[1][]{\@vertex[#1]}% \def\@vertex[#1]#2{% \setkeys[NW]{vertex}{#1}% % Check if Vertex is used in a network, if so no default settings are % necessary, otherwise default settings are applied. 
\ifNW@vertex@Network \cmdNW@vertex@opacity \else \setkeys[NW]{vertex}{ x = {0}, y = {0}, label = {}, size = {}, color = {}, opacity = {}, layer = {}, shape = {}, style = {}, fontsize = {}, fontcolor = {}, fontscale = {}, NoLabel = false, IdAsLabel = false, Math = false, RGB = false, Pseudo = false, distance = {0}, position = {center}, } \setkeys[NW]{vertex}{#1}% \fi \@@vertex{#2}% } \def\@@vertex#1{% \def\vstyle{VertexStyle} \begin{scope} % [ % scale=1,yshift=0,every node/.append style={yslant=0.5,xslant=-1},yslant=0.5,xslant=-1 % ] % If option NoLabel is true, no labels are printed in the network \ifNW@vertex@NoLabel \def\vertex@L{}% \def\vertex@Name{}% \else % if IdAsLabel is true, the label of the vertex is equal to the vertex id \ifNW@vertex@IdAsLabel \def\vertex@Name{#1} \def\vertex@L{\vertex@Name} % Otherwise the label is equal to the label if it is non empty \else \ifthenelse{\not\equal{\cmdNW@vertex@label}{}}{ \def\vertex@L{\cmdNW@vertex@label} \def\vertex@Name{#1} }{ \def\vertex@Name{#1} \def\vertex@L{} } \fi \fi % Check if Math is true, if so the label will be in math mode \ifNW@vertex@Math \def\vertex@Label{$\vertex@L$}% \else \def\vertex@Label{\vertex@L}% \fi % Check if the size of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@size}{}}{ \tikzset{LocalVertexSize/.style={minimum size = \cmdNW@vertex@size \DefaultUnit}} }{ \tikzset{LocalVertexSize/.style={}} } % Check if the font size of the vertex label is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@fontsize}{}}{ \tikzset{LocalVertexFontSize/.style={font = \cmdNW@vertex@fontsize}} }{ \tikzset{LocalVertexFontSize/.style={}} } % Check if the font scale of the vertex label is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@fontscale}{}}{ \tikzset{LocalVertexFontScale/.style={scale = \cmdNW@vertex@fontscale}} }{ \tikzset{LocalVertexFontScale/.style={}} } % Check if the opacity of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@opacity}{}}{ \tikzset{LocalVertexOpacity/.style={fill opacity = \cmdNW@vertex@opacity}} }{ \tikzset{LocalVertexOpacity/.style={}} } % Check if the shape of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@shape}{}}{ \tikzset{LocalVertexShape/.style={shape = \cmdNW@vertex@shape}} }{ \tikzset{LocalVertexShape/.style={}} } % Check if the color of the vertex is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the vertex entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. 
blue!50!green) \ifNW@vertex@RGB \ifthenelse{\not\equal{\cmdNW@vertex@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@vertex@color} \tikzset{LocalVertexFill/.style={fill = LocalColor}} }{ \tikzset{LocalVertexFill/.style={}} } \ifthenelse{\not\equal{\cmdNW@vertex@fontcolor}{}}{ \pgfutil@definecolor{LocalFontColor}{RGB}{\cmdNW@vertex@fontcolor} \tikzset{LocalVertexFontColor/.style={color = LocalFontColor}} }{ \tikzset{LocalVertexFontColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@vertex@color}{}}{ \tikzset{LocalVertexFill/.style={fill = \cmdNW@vertex@color}} }{ \tikzset{LocalVertexFill/.style={}} } \ifthenelse{\not\equal{\cmdNW@vertex@fontcolor}{}}{ \tikzset{LocalVertexFontColor/.style={color = \cmdNW@vertex@fontcolor}} }{ \tikzset{LocalVertexFontColor/.style={}} } \fi % Define empty style for the vertex \ifNW@vertex@Pseudo \tikzset{LocalPseudo/.style={opacity = 0}} \else \tikzset{LocalPseudo/.style={}} \fi % Define local style for the label \tikzset{LocalLabel/.style={label = {[LabelStyle, LocalVertexFontColor, LocalVertexFontSize, LocalVertexFontScale, label distance=\cmdNW@vertex@distance]\cmdNW@vertex@position:\vertex@Label}}} \ifthenelse{\equal{\cmdNW@vertex@layer}{}}{ \protected@edef\@tempa{% \noexpand\node[\vstyle,LocalVertexSize,LocalVertexOpacity, LocalVertexFill,LocalVertexShape,LocalLabel, \cmdNW@vertex@style,LocalPseudo](#1)% at (\cmdNW@vertex@x*\DistanceScale\DefaultUnit, \cmdNW@vertex@y*\DistanceScale\DefaultUnit){}}% \@tempa; }{ \begin{scope}[canvas is yx plane at z=(\cmdNW@vertex@layer-1)*\NetworkLayerDistance] \protected@edef\@tempa{% \noexpand\node[\vstyle,LocalVertexSize,LocalVertexOpacity, LocalVertexFill,LocalVertexShape,LocalLabel, \cmdNW@vertex@style,LocalPseudo](#1)% at (\cmdNW@vertex@x*\DistanceScale\DefaultUnit, \cmdNW@vertex@y*\DistanceScale\DefaultUnit){}}% \@tempa; \end{scope} } \end{scope} } %<---------------------------------------------------------------------------> % Init Edge %<---------------------------------------------------------------------------> \define@cmdkey [NW] {edge} {label}{} \define@cmdkey [NW] {edge} {lw}{} \define@cmdkey [NW] {edge} {color}{} \define@cmdkey [NW] {edge} {opacity}{} \define@cmdkey [NW] {edge} {style}{} \define@cmdkey [NW] {edge} {fontcolor}{} \define@cmdkey [NW] {edge} {fontsize}{} \define@cmdkey [NW] {edge} {fontscale}{} \define@boolkey [NW] {edge} {RGB}[true]{} \define@boolkey [NW] {edge} {Math}[true]{} \define@boolkey [NW] {edge} {Direct}[true]{} \define@boolkey [NW] {edge} {Network}[true]{} \define@cmdkey [NW] {edge} {bend}{} \define@cmdkey [NW] {edge} {position}{} \define@cmdkey [NW] {edge} {distance}{} \define@cmdkey [NW] {edge} {loopsize}{} \define@cmdkey [NW] {edge} {loopposition}{} \define@cmdkey [NW] {edge} {loopshape}{} \define@boolkey [NW] {edge} {NotInBG}[true]{} \define@cmdkey [NW] {edge} {path}{} \presetkeys [NW] {edge} {Network = false,}{} % NotInBG = false,}{} %<---------------------------------------------------------------------------> % Edge %<---------------------------------------------------------------------------> \newcommand*{\Edge}[1][]{\@edge[#1]}% \def\@edge[#1](#2)(#3){% \setkeys[NW]{edge}{#1}% % Check if Vertex is used in a network, if so no default settings are % necessary, otherwise default settings are applied. 
\ifNW@edge@Network \else \setkeys[NW]{edge}{ label = {}, lw = {}, path = {}, color = {}, opacity = {}, style = {}, fontcolor = {}, fontsize = {}, fontscale = {}, RGB = false, Math = false, Direct = false, NotInBG = false, bend = {0}, loopsize = {1\DefaultUnit}, position = {}, loopposition= {0}, loopshape = {90}, distance = {.5} } \setkeys[NW]{edge}{#1}% \fi \def\estyle{EdgeStyle} % \ifNW@edge@NotInBG \tikzset{EdgeInBG/.style={}} \else \tikzset{EdgeInBG/.style={on background layer}} \fi \begin{scope}[edge canvas,EdgeInBG] % [ % scale=1,yshift=0,every node/.append style={yslant=0.5,xslant=-1},yslant=0.5,xslant=-1 % ] % Check if Direct is true, if so use default arrow style \ifNW@edge@Direct \tikzset{LocalArrow/.style={}} \else \tikzset{LocalArrow/.style={-}} \fi % Check if the line width of the vertex is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@lw}{}}{ \tikzset{LocalEdgeLW/.style={line width = \cmdNW@edge@lw}} }{ \tikzset{LocalEdgeLW/.style={}} } % Check if the opacity of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@edge@opacity}{}}{ \tikzset{LocalEdgeOpacity/.style={opacity = \cmdNW@edge@opacity}} \tikzset{LocalTextOpacity/.style={text opacity = \cmdNW@edge@opacity}} }{ \tikzset{LocalEdgeOpacity/.style={}} \tikzset{LocalTextOpacity/.style={}} } % Check if the font size of the edge is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@fontsize}{}}{ \tikzset{LocalEdgeFontSize/.style={font = \cmdNW@edge@fontsize}} }{ \tikzset{LocalEdgeFontSize/.style={}} } % Check if the font scale of the edge is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@fontscale}{}}{ \tikzset{LocalEdgeFontScale/.style={scale = \cmdNW@edge@fontscale}} }{ \tikzset{LocalEdgeFontScale/.style={}} } % Check if the color of the vertex is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the vertex entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. 
blue!50!green) \ifNW@edge@RGB \ifthenelse{\not\equal{\cmdNW@edge@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@edge@color} \tikzset{LocalEdgeColor/.style={color = LocalColor}} }{ \tikzset{LocalEdgeColor/.style={}} } \ifthenelse{\not\equal{\cmdNW@edge@fontcolor}{}}{ \pgfutil@definecolor{LocalFontColor}{RGB}{\cmdNW@edge@fontcolor} \tikzset{LocalEdgeFontColor/.style={text = LocalFontColor}} }{ \tikzset{LocalEdgeFontColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@edge@color}{}}{ \tikzset{LocalEdgeColor/.style={color = \cmdNW@edge@color}} }{ \tikzset{LocalEdgeColor/.style={}} } \ifthenelse{\not\equal{\cmdNW@edge@fontcolor}{}}{ \tikzset{LocalEdgeFontColor/.style={text = \cmdNW@edge@fontcolor}} }{ \tikzset{LocalEdgeFontColor/.style={}} } \fi % Check if Math is true, if so the label will be in math mode \ifNW@edge@Math \def\edge@L{$\cmdNW@edge@label$}% \else \def\edge@L{\cmdNW@edge@label}% \fi % Check if a label is assigned, if so create a label variable \ifthenelse{\not\equal{\cmdNW@edge@label}{}}{ \def\edge@Label{node[EdgeLabelStyle,LocalTextOpacity,LocalEdgeFontColor, LocalEdgeFontSize,LocalEdgeFontScale,pos=\cmdNW@edge@distance, \cmdNW@edge@position]{\edge@L}} }{ \def\edge@Label{} } % Check if it is a self loop or a normal edge % Normal edge \ifthenelse{\not\equal{#2}{#3}}{ \ifthenelse{\not\equal{\cmdNW@edge@path}{}}{ \def\edge@pts{}% \@for\tmp:=\cmdNW@edge@path\do{ \edef\edge@pts{\edge@pts (\tmp) --} } \protected@edef\@tempa{% \noexpand\draw[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) -- \edge@pts (#3)} \@tempa; }{ \protected@edef\@tempa{% \noexpand\path[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) edge [bend left = \cmdNW@edge@bend] \edge@Label (#3)}% \@tempa; } }{% Self loop \protected@edef\@tempa{% \noexpand\path[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) edge [in=-\cmdNW@edge@loopshape/2+\cmdNW@edge@loopposition, out=\cmdNW@edge@loopshape/2+\cmdNW@edge@loopposition,loop, distance=\cmdNW@edge@loopsize,] \edge@Label (#3)}% \@tempa; } \end{scope} } %============================================================================= % Vertices and Edges creation %============================================================================= %<---------------------------------------------------------------------------> % Init Vertices %<---------------------------------------------------------------------------> \define@cmdkey [NW] {vertices} {layer}{} \define@cmdkey [NW] {vertices} {size}{} \define@cmdkey [NW] {vertices} {color}{} \define@cmdkey [NW] {vertices} {opacity}{} \define@cmdkey [NW] {vertices} {style}{} \define@cmdkey [NW] {vertices} {shape}{} \define@boolkey [NW] {vertices} {RGB}[true]{} \define@boolkey [NW] {vertices} {IdAsLabel}[true]{} \define@boolkey [NW] {vertices} {NoLabel}[true]{} \define@boolkey [NW] {vertices} {Math}[true]{} \define@boolkey [NW] {vertices} {Pseudo}[true]{} \presetkeys [NW] {vertices} { layer = {}, opacity = {}, size = {}, color = {}, style = {}, shape = {}, RGB = false, IdAsLabel = false, NoLabel = false, Math = false, Pseudo = false, }{} \newcommand*{\setkeysexpanded}[2]{% \expandafter\setkeysexpandedaux\expandafter{#2}{#1}} \newcommand*{\setkeysexpandedaux}[2]{% \setkeys[NW]{#2}{#1}} % \newcommand*{\setkeysexpandedx}[2]{% % \expandafter\setkeysexpandedauxx\expandafter{#2}{#1}} % \newcommand*{\setkeysexpandedauxx}[2]{% % \setkeys[NW]{#2}{#1}} 
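%<--------------------------------------------------------------------------->
% A short commented usage sketch of the \Vertex and \Edge commands defined
% above (coordinates, colors and labels are illustrative values only; every
% key used here is declared in the Init Vertex and Init Edge blocks):
%
%   \begin{tikzpicture}
%     \Vertex[x=0,y=0,label=u,Math]{u}
%     \Vertex[x=2,y=0,size=.4,color=red]{v}
%     \Edge[label=e,Math,bend=20,Direct](u)(v)
%   \end{tikzpicture}
%
% \Vertex takes the vertex id as its mandatory argument; \Edge takes the ids
% of the two endpoints and draws a self loop when the two ids coincide.
%<--------------------------------------------------------------------------->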
%<--------------------------------------------------------------------------->
% Vertices
%<--------------------------------------------------------------------------->
\newcommand*{\Vertices}[1][]{\@vertices[#1]}%
\def\@vertices[#1]#2{%
\setkeys[NW]{vertices}{#1}%
\@@vertices{#2}%
}
\def\@@vertices#1{%
% Check if the data base already exists
\DTLifdbexists{#1}{}{
  % create dummy data base to store name
  \DTLnewdb{#1}
  % delete existing vertices data base
  \DTLifdbexists{vertices}{
    \DTLgdeletedb{vertices}
  }{}
  % Load data file for vertices
  \DTLloaddb[noheader=false]{vertices}{#1}
}
% Define variables to store option values
\def\vertex@Options{}%
\def\vertex@id{}%
\def\vertex@rgbValues{}%
% Go through each row and create vertices
\DTLforeach*{vertices}{}{%
  % reset storage variable to default values
  \edef\vertex@Options{x=0,y=0,label={},size={},color={},fontcolor={},
    fontsize={},fontscale={},
    opacity={},layer={},style={},NoLabel=false,IdAsLabel=false,
    Math=false,RGB=false,Pseudo=false,distance={0},position={center},shape={},}%
  \edef\vertex@rgbValues{}%
  % Go through each row element
  \DTLforeachkeyinrow{\thisValue}{
    \DTLifeq{\dtlkey}{id}{
      % Assign vertex id to storage variable
      \edef\vertex@id{\thisValue}%
    }{
    \DTLifeq{\dtlkey}{R}{
      \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,}
    }{
    \DTLifeq{\dtlkey}{G}{
      \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,}
    }{
    \DTLifeq{\dtlkey}{B}{
      \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,}
    }{
      % Assign option to storage variable
      \edef\vertex@Options{\vertex@Options \dtlkey=\thisValue,}
    }}}}
  }
  % Add general settings for the Vertex
  % NoLabel
  \ifNW@vertices@NoLabel
    \edef\vertex@Options{\vertex@Options NoLabel=true,}
  \fi
  % Pseudo
  \ifNW@vertices@Pseudo
    \edef\vertex@Options{\vertex@Options Pseudo=true,}
  \fi
  % IdAsLabel
  \ifNW@vertices@IdAsLabel
    \edef\vertex@Options{\vertex@Options IdAsLabel=true,}
  \fi
  % Math
  \ifNW@vertices@Math
    \edef\vertex@Options{\vertex@Options Math=true,}
  \fi
  % RGB
  \ifNW@vertices@RGB
    \edef\vertex@Options{\vertex@Options RGB=true,color={\vertex@rgbValues},}
  \fi
  % opacity
  \ifthenelse{\not\equal{\cmdNW@vertices@opacity}{}}
  {
    \edef\vertex@Options{\vertex@Options opacity=\cmdNW@vertices@opacity,}
  }{}
  % size
  \ifthenelse{\not\equal{\cmdNW@vertices@size}{}}
  {
    \edef\vertex@Options{\vertex@Options size=\cmdNW@vertices@size,}
  }{}
  % shape
  \ifthenelse{\not\equal{\cmdNW@vertices@shape}{}}
  {
    \edef\vertex@Options{\vertex@Options shape=\cmdNW@vertices@shape,}
  }{}
  % color
  \ifthenelse{\not\equal{\cmdNW@vertices@color}{}}
  {
    \edef\vertex@Options{\vertex@Options color=\cmdNW@vertices@color,}
  }{}
  % style
  \ifthenelse{\not\equal{\cmdNW@vertices@style}{}}{
    \edef\vertex@Options{\vertex@Options style={\cmdNW@vertices@style},}
  }{}
  % Apply settings for the Vertex
  \setkeysexpanded{vertex}{\vertex@Options}%
  % Create Vertex (only on the requested layer, if one was given)
  \ifthenelse{\not\equal{\cmdNW@vertices@layer}{}}{
    \ifthenelse{\equal{\cmdNW@vertices@layer}{\cmdNW@vertex@layer}}{
      \Vertex[Network]{\vertex@id}
    }{}
  }{
    \Vertex[Network]{\vertex@id}
  }
}
% Delete data base
% \DTLgdeletedb{#1}
}
%<--------------------------------------------------------------------------->
% Init Edges
%<--------------------------------------------------------------------------->
\define@cmdkey [NW] {edges} {layer}{}
\define@cmdkey [NW] {edges} {vertices}{}
\define@cmdkey [NW] {edges} {style}{}
\define@cmdkey [NW] {edges} {lw}{}
\define@cmdkey [NW] {edges} {color}{}
\define@cmdkey [NW] {edges} {opacity}{}
\define@boolkey [NW] {edges} {RGB}[true]{}
\define@boolkey [NW] {edges} {Math}[true]{}
\define@boolkey [NW] {edges} {Direct}[true]{}
\define@boolkey [NW] {edges} {NoLabel}[true]{}
\define@boolkey [NW] {edges} {NotInBG}[true]{}
\presetkeys [NW] {edges} {
layer = {},
vertices = {},
style = {},
lw = {},
color = {},
opacity = {},
RGB = false,
Math = false,
Direct = false,
NoLabel = false,
NotInBG = false,
}{}
\newcounter{LayerCounter}
%<--------------------------------------------------------------------------->
% Edges
%<--------------------------------------------------------------------------->
\newcommand*{\Edges}[1][]{\@edges[#1]}%
\def\@edges[#1]#2{%
\setkeys[NW]{edges}{#1}%
\@@edges{#2}%
}
\def\@@edges#1{%
\begin{scope}
% Check if the data base already exists
\DTLifdbexists{#1}{}{
  % create dummy data base to store name
  \DTLnewdb{#1}
  % delete existing edges data base
  \DTLifdbexists{edges}{
    \DTLgdeletedb{edges}
  }{}
  % Load data file for edges
  \DTLloaddb[noheader=false]{edges}{#1}
}
% Define variables to store option values
\def\edge@Options{}%
\def\edge@u{}%
\def\edge@v{}%
\def\edge@u@layer{}%
\def\edge@v@layer{}%
\def\edge@rgbValues{}%
\def\u@layer{}%
\def\v@layer{}%
% Assign the layers which the edges are drawn from and to
\ifthenelse{\not\equal{\cmdNW@edges@layer}{}}{
  % set layer count back to 0
  \setcounter{LayerCounter}{0}
  \@for\tmp:=\cmdNW@edges@layer\do{
    \stepcounter{LayerCounter}
    \ifthenelse{\value{LayerCounter}=1}{
      \edef\u@layer{\tmp}%
    }{
      \edef\v@layer{\tmp}%
    }
  }
}{}
% Go through each row and create edges
\DTLforeach*{edges}{}{%
  % reset storage variable to default values
  \edef\edge@Options{label = {}, lw = {}, color = {}, opacity = {},
    style = {}, RGB = false, Math = false, Direct = false, NotInBG = false,
    bend = {0}, loopsize = {1\DefaultUnit}, position = {}, loopposition = {0},
    loopshape = {90}, distance = {.5}, path = {}, fontcolor = {},
    fontsize = {}, fontscale = {},}
  \edef\edge@rgbValues{}%
  % Go through each row element
  \DTLforeachkeyinrow{\thisValue}{
    \DTLifeq{\dtlkey}{u}{
      % Assign edge id to storage variable
      \edef\edge@u{\thisValue}%
    }{
    \DTLifeq{\dtlkey}{v}{
      \edef\edge@v{\thisValue}%
    }{
    \DTLifeq{\dtlkey}{R}{
      \edef\edge@rgbValues{\edge@rgbValues \thisValue,}
    }{
    \DTLifeq{\dtlkey}{G}{
      \edef\edge@rgbValues{\edge@rgbValues \thisValue,}
    }{
    \DTLifeq{\dtlkey}{B}{
      \edef\edge@rgbValues{\edge@rgbValues \thisValue,}
    }{
      % Assign option to storage variable
      \edef\edge@Options{\edge@Options \dtlkey=\thisValue,}
    }}}}}
  }
  % Add general settings for the Edges
  % NoLabel
  \ifNW@edges@NoLabel
    \edef\edge@Options{\edge@Options label={},}
  \fi
  % Direct
  \ifNW@edges@Direct
    \edef\edge@Options{\edge@Options Direct=true,}
  \fi
  % Math
  \ifNW@edges@Math
    \edef\edge@Options{\edge@Options Math=true,}
  \fi
  % RGB
  \ifNW@edges@RGB
    \edef\edge@Options{\edge@Options RGB=true,color={\edge@rgbValues},}
  \fi
  % style
  \ifthenelse{\not\equal{\cmdNW@edges@style}{}}{
    \edef\edge@Options{\edge@Options style={\cmdNW@edges@style},}
  }{}
  % lw
  \ifthenelse{\not\equal{\cmdNW@edges@lw}{}}
  {
    \edef\edge@Options{\edge@Options lw=\cmdNW@edges@lw,}
  }{}
  % color
  \ifthenelse{\not\equal{\cmdNW@edges@color}{}}
  {
    \edef\edge@Options{\edge@Options color=\cmdNW@edges@color,}
  }{}
  % opacity
  \ifthenelse{\not\equal{\cmdNW@edges@opacity}{}}
  {
    \edef\edge@Options{\edge@Options opacity=\cmdNW@edges@opacity,}
  }{}
  % NotInBG
  \ifNW@edges@NotInBG
\edef\edge@Options{\edge@Options NotInBG=true,} \fi % Apply settings for the Edge \setkeysexpanded{edge}{\edge@Options}% % Create Edge \ifthenelse{\equal{\cmdNW@edges@layer}{}}{ \Edge[Network](\edge@u)(\edge@v) }{ \ifthenelse{\not\equal{\cmdNW@edges@vertices}{}}{ \DTLifdbexists{vertices}{ \DTLgdeletedb{vertices} }{} % Load data file for vertices \DTLloaddb[noheader=false]{vertices}{\cmdNW@edges@vertices} }{} % find assigned layer to the used vertices \DTLforeach*{vertices}{\id=id,\layer=layer}{% \ifthenelse{\equal{\id}{\edge@u}}{ \edef\edge@u@layer{\layer}% \dtlbreak }{} } \DTLforeach*{vertices}{\id=id,\layer=layer}{% \ifthenelse{\equal{\id}{\edge@v}}{ \edef\edge@v@layer{\layer}% \dtlbreak }{} } % if the edge is an intra layer edge \ifthenelse{\equal{\u@layer}{\v@layer}}{ \ifthenelse{\equal{\u@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\v@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} }{ \ifthenelse{\equal{\u@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\v@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} \ifthenelse{\equal{\v@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\u@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} } } } \end{scope} % Delete data base % \DTLgdeletedb{#1} } %<---------------------------------------------------------------------------> % Init Layer %<---------------------------------------------------------------------------> \define@cmdkey [NW] {layer} {layer}{} \define@cmdkey [NW] {layer} {z}{} \define@cmdkey [NW] {layer} {opacity}{} \presetkeys [NW] {layer} { layer = {1}, opacity = {}, z = {}, }{} %<---------------------------------------------------------------------------> % Layer %<---------------------------------------------------------------------------> %\def\@layer{canvas is yx plane at z=-3,} \def\@layer[#1]#2{ \setkeys[NW]{layer}{#1} \ifthenelse{\not\equal{\cmdNW@layer@z}{}}{ \tikzset{LocalLayerZ/.style={canvas is yx plane at z=\cmdNW@layer@z}} }{ \tikzset{LocalLayerZ/.style={canvas is yx plane at z=(\cmdNW@layer@layer-1)*\NetworkLayerDistance}} } \ifthenelse{\not\equal{\cmdNW@layer@opacity}{}}{ \tikzset{LocalLayerOpacity/.style={fill opacity = \cmdNW@layer@opacity}} }{ \tikzset{LocalLayerOpacity/.style={}} } \begin{scope}[LocalLayerZ,LocalLayerOpacity] } %\newcommand*{\Layer}[1][]{\@layer[#1]}% \newenvironment{Layer}[1][]{\@layer[#1]1}{ \end{scope} } %\def\@layer[#1]#2{} % \newcommand*{\Edges}[1][]{\@edges[#1]}% % \def\@edges[#1]#2{% % \setkeys[NW]{edges}{#1}% % \@@edges{#2}% % } % \def\@@edges#1{% %<---------------------------------------------------------------------------> % Init Plane %<---------------------------------------------------------------------------> \define@cmdkey [NW] {plane} {x}{} \define@cmdkey [NW] {plane} {y}{} \define@cmdkey [NW] {plane} {width}{} \define@cmdkey [NW] {plane} {height}{} \define@cmdkey [NW] {plane} {color}{} \define@cmdkey [NW] {plane} {opacity}{} \define@cmdkey [NW] {plane} {style}{} \define@cmdkey [NW] {plane} {layer}{} \define@cmdkey [NW] {plane} {grid}{} \define@cmdkey [NW] {plane} {image}{} \define@boolkey [NW] {plane} {RGB}[true]{} \define@boolkey [NW] {plane} {InBG}[true]{} \define@boolkey [NW] {plane} {NoFill}[true]{} \define@boolkey [NW] {plane} {NoBorder}[true]{} \define@boolkey [NW] {plane} {ImageAndFill}[true]{} \presetkeys [NW] {plane} { x = {0}, y = {0}, width = {\PlaneWidth}, height = {\PlaneHeight}, color = {}, opacity = {}, style = {}, layer = {1}, grid = {}, image = {}, RGB = false, InBG = false, NoFill = false, NoBorder= false, ImageAndFill= false, }{} 
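%<--------------------------------------------------------------------------->
% Usage sketch (assumed, not taken from the package documentation; file
% names are hypothetical). The Layer environment defined above places its
% contents at z=(layer-1)*\NetworkLayerDistance unless an explicit z is
% given:
%
%   \begin{Layer}[layer=2]
%     \Vertices[layer=2]{vertices.csv}
%   \end{Layer}
%   \Edges[vertices=vertices.csv,layer={1,2}]{edges.csv}
%
% \Plane, defined next, does not need an enclosing Layer: it opens its own
% canvas scope at the height given by its layer key (default 1).
%<--------------------------------------------------------------------------->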
%<---------------------------------------------------------------------------> % Plane %<---------------------------------------------------------------------------> \newcommand*{\Plane}[1][]{\@plane[#1]}% \def\@plane[#1]{% \setkeys[NW]{plane}{#1}% \ifNW@plane@ImageAndFill \setkeys[NW]{plane}{#1}% \else \ifthenelse{\not\equal{\cmdNW@plane@image}{}}{ \setkeys[NW]{plane}{#1,NoFill} }{} \fi \@@plane% } \def\@@plane{% % Draw Plane on the Background layer \ifNW@plane@InBG \tikzset{InBGStyle/.style={on background layer}} \else \tikzset{InBGStyle/.style={}} \fi \begin{scope}[InBGStyle] % Check if the color of the plane is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the plane entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. blue!50!green) \ifNW@plane@RGB \ifthenelse{\not\equal{\cmdNW@plane@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@plane@color} \tikzset{LocalPlaneFill/.style={fill = LocalColor}} }{ \tikzset{LocalPlaneFill/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@plane@color}{}}{ \tikzset{LocalPlaneFill/.style={fill = \cmdNW@plane@color}} }{ \tikzset{LocalPlaneFill/.style={}} } \fi % Check if the opacity of the plane is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@plane@opacity}{}}{ \tikzset{LocalPlaneOpacity/.style={fill opacity = \cmdNW@plane@opacity}} }{ \tikzset{LocalPlaneOpacity/.style={}} } \begin{scope}[canvas is yx plane at z=(\cmdNW@plane@layer-1)*\NetworkLayerDistance] % Draw the fill of the Plane \ifNW@plane@NoFill \else \protected@edef\@tempa{% \noexpand\fill[PlaneFillStyle,LocalPlaneFill,LocalPlaneOpacity]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) rectangle ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; \fi % Draw image on the Plane \ifthenelse{\not\equal{\cmdNW@plane@image}{}}{ %\protected@edef\@tempa{% %\noexpand \node[inner sep=0pt,LocalPlaneOpacity] at ($(\cmdNW@plane@width/2,\cmdNW@plane@height/2)+ (\cmdNW@plane@x,\cmdNW@plane@y)$) {\includegraphics[width=\cmdNW@plane@width\DefaultUnit, height=\cmdNW@plane@height\DefaultUnit]{\cmdNW@plane@image}}; %}% %\@tempa; }{} % Draw grid on the Plane \ifthenelse{\not\equal{\cmdNW@plane@grid}{}}{ \protected@edef\@tempa{% \noexpand\draw[PlaneGridStyle,step=\cmdNW@plane@grid*\DistanceScale]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) grid ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; }{} % Draw the border of the Plane \ifNW@plane@NoBorder \else \protected@edef\@tempa{% \noexpand\draw[PlaneBorderStyle,\cmdNW@plane@style]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) rectangle ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; \fi \end{scope} \end{scope} } %<---------------------------------------------------------------------------> % Init Text %<---------------------------------------------------------------------------> \define@cmdkey [NW] {text} {x}{} \define@cmdkey [NW] {text} {y}{} \define@cmdkey [NW] {text} {layer}{} \define@cmdkey [NW] {text} {color}{} \define@cmdkey [NW] {text} {opacity}{} \define@cmdkey [NW] {text} {rotation}{} \define@cmdkey [NW] {text} {fontsize}{} \define@cmdkey [NW] {text} {anchor}{} \define@cmdkey [NW] {text} {position}{} \define@cmdkey [NW] {text} {distance}{} \define@cmdkey [NW] {text} {style}{} \define@cmdkey [NW] {text} {width}{} \define@boolkey [NW] {text} {RGB}[true]{} \presetkeys [NW] 
{text} { x = {0}, y = {0}, layer = {}, color = {}, opacity = {}, fontsize = {}, anchor = {}, position = {}, rotation = {}, distance = {0\DefaultUnit}, style = {}, width = {}, RGB = false, }{} %<---------------------------------------------------------------------------> % Text %<---------------------------------------------------------------------------> \newcommand*{\Text}[1][]{\@text[#1]}% \def\@text[#1]#2{% \setkeys[NW]{text}{#1}% \@@text{#2}% } \def\@@text#1{% % Check if the color of the text is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the text entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. blue!50!green) \ifNW@text@RGB \ifthenelse{\not\equal{\cmdNW@text@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@text@color} \tikzset{LocalTextColor/.style={color = LocalColor}} }{ \tikzset{LocalTextColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@text@color}{}}{ \tikzset{LocalTextColor/.style={color = \cmdNW@text@color}} }{ \tikzset{LocalTextColor/.style={}} } \fi % Check if the opacity of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@opacity}{}}{ \tikzset{LocalTextOpacity/.style={text opacity = \cmdNW@text@opacity}} }{ \tikzset{LocalTextOpacity/.style={}} } % Check if the rotation of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@rotation}{}}{ \tikzset{LocalTextRotation/.style={rotate = \cmdNW@text@rotation}} }{ \tikzset{LocalTextRotation/.style={}} } % Check if the font size of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@fontsize}{}}{ \tikzset{LocalTextFontSize/.style={font = \cmdNW@text@fontsize}} }{ \tikzset{LocalTextFontSize/.style={}} } % Check if the position of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@position}{}}{ \tikzset{LocalTextPosition/.style={\cmdNW@text@position = \cmdNW@text@distance}} }{ \tikzset{LocalTextPosition/.style={}} } % Check if the anchor of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@anchor}{}}{ \tikzset{LocalTextAnchor/.style={anchor = \cmdNW@text@anchor}} }{ \tikzset{LocalTextAnchor/.style={}} } % Check if the text width of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@width}{}}{ \tikzset{LocalTextWidth/.style={text width = \cmdNW@text@width}} }{ \tikzset{LocalTextWidth/.style={}} } \ifthenelse{\equal{\cmdNW@text@layer}{}}{ \protected@edef\@tempa{% \noexpand\node[TextStyle, LocalTextColor, LocalTextOpacity, LocalTextFontSize, LocalTextRotation, LocalTextPosition, LocalTextAnchor, LocalTextWidth, \cmdNW@text@style] at (\cmdNW@text@x*\DistanceScale,\cmdNW@text@y*\DistanceScale){#1} }\@tempa;% }{ \begin{scope}[canvas is yx plane at z=(\cmdNW@text@layer-1)*\NetworkLayerDistance] \protected@edef\@tempa{% \noexpand\node[TextStyle, LocalTextColor, LocalTextOpacity, LocalTextFontSize, LocalTextRotation, LocalTextPosition, LocalTextAnchor, LocalTextWidth, \cmdNW@text@style] at (\cmdNW@text@x*\DistanceScale,\cmdNW@text@y*\DistanceScale){#1} }\@tempa;% \end{scope} } } \endinput %============================================================================= % eof % % Local Variables: % mode: latex % mode: flyspell % mode: auto-fill % fill-column: 80 % TeX-master: t % End: 
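%<--------------------------------------------------------------------------->
% Usage sketch (assumed, not taken from the package documentation; the
% coordinates and values are hypothetical). \Text places a node on the base
% plane, or on a layer's canvas when the layer key is set:
%
%   \Text[x=3,y=2,fontsize=\small,rotation=45]{A caption}
%   \Text[layer=2,anchor=north,width=3cm]{Text on the upper layer}
%<--------------------------------------------------------------------------->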
tex-fmt-0.5.2/tests/source/unicode.tex000066400000000000000000000006731473573253500177400ustar00rootroot00000000000000\documentclass{article} \begin{document} This is a long line with a unicode arrow in the middle of it ↓ which should be split correctly Here an indent begins ( and should not be closed with this arrow and comment ↓% until the next parenthesis ) This line contains some French accent characters éééééééééééééééééééééééééééééé which include zero-width chars, so look narrower than they are. \end{document} tex-fmt-0.5.2/tests/source/verbatim.tex000066400000000000000000000007641473573253500201240ustar00rootroot00000000000000\documentclass{article} \usepackage{listings} \begin{document} \begin{verbatim} code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code code \item \item \item \begin{align} E = mc^2 \end{align} \end{verbatim} \begin{lstlisting}[caption={A very long and complicated caption that does not fit into one line}] Code \end{lstlisting} \end{document} tex-fmt-0.5.2/tests/source/wgu-cv.cls000066400000000000000000000036641473573253500175060ustar00rootroot00000000000000%! TeX root = WGUnderwood.tex % class \NeedsTeXFormat{LaTeX2e} \ProvidesClass{wgu-cv} % packages \LoadClass[10pt]{article} \RequirePackage[margin=1in,top=0.9in]{geometry} \RequirePackage{hyperref} %\RequirePackage{fontspec} \RequirePackage{microtype} \RequirePackage{fancyhdr} \RequirePackage{enumitem} \RequirePackage{ifthen} % variables \def\yourname#1{\def\@yourname{#1}} \def\youraddress#1{\def\@youraddress{#1}} \def\youremail#1{\def\@youremail{#1}} \def\yourwebsite#1{\def\@yourwebsite{#1}} % settings %\setmainfont{Libre Baskerville}[Scale=0.9] %\setmonofont{Source Code Pro}[Scale=0.97] \geometry{a4paper} \setlength\parindent{0pt} \bibliographystyle{abbrvnat} \pagestyle{fancy} \renewcommand{\headrulewidth}{0pt} \cfoot{\thepage} \rfoot{\today} \setlist{ leftmargin=0.5cm, topsep=0cm, partopsep=0cm, parsep=-0.04cm, % item spacing before=\vspace{0.12cm}, after=\vspace{0.08cm}, } % arxiv \newcommand{\arxiv}[1]{% \href{https://arxiv.org/abs/#1}{% \texttt{arXiv{:}{\allowbreak}#1}}% } % github \newcommand{\github}[1]{% GitHub: \href{https://github.com/#1}{% \texttt{#1}}% } % title \renewcommand{\maketitle}{% \vspace*{-1.2cm}% \begin{center}% \begin{huge}% \@yourname \\ \end{huge}% \vspace{0.5cm}% \@youraddress \\ \vspace{0.16cm}% \begin{minipage}{0.45\textwidth}% \centering% \href{mailto:\@youremail}{\nolinkurl{\@youremail}}% \end{minipage}% \begin{minipage}{0.45\textwidth}% \centering% \href{https://\@yourwebsite}{\nolinkurl{\@yourwebsite}}% \end{minipage} \end{center}% } % section \renewcommand{\section}[1]{% \vspace{0.3cm}% \par\hbox{\large\textbf{#1}\strut}% \vspace{-0.25cm}% \rule{\textwidth}{0.8pt}% \vspace{-0.15cm}% } % subsection \renewcommand{\subsection}[2]{% \vspace{0.30cm}% \textbf{#1}% \hfill{#2}% \vspace{0.03cm}% } % subsubsection \renewcommand{\subsubsection}[1]{% \linebreak \textit{#1}% \vspace{0.05cm}% } tex-fmt-0.5.2/tests/source/wrap.tex000066400000000000000000000040551473573253500172610ustar00rootroot00000000000000\documentclass{article} \begin{document} % no comment This line is too long because it has more than eighty characters inside it. Therefore it should be split. % break before comment This line is too long because it has more than eighty characters inside it. Therefore it % should be split. 
% break after spaced comment This line is too long because it has more than % eighty characters inside it. Therefore it should be split. % break after non-spaced comment This line is too long because it has more than% eighty characters inside it. Therefore it should be split. % unbreakable line Thislineistoolongbecauseithasmorethan%eightycharactersinsideit.Buttherearenospacessoitcannotbesplit. % line can be broken after 80 chars Thislineistoolongbecauseithasmorethaneightycharactersinsideitandtherearenospacesuntillater where there are some spaces so we can split this line here % long line only after indenting ( 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 123 ) % double break after comment This line has a long comment. % This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words % double break after only comment % This line is all a long comment. This comment is very long so needs to be split over three lines which is another edge case which should be checked here with all these extra words % lines containing \ This line would usually be split at the special character part with a slash\ but it's best to break the line earlier. % long lines with brackets (This line is too long because it has more than eighty characters inside it. Therefore it should be split. It also needs splitting onto multiple lines, and the middle lines should be indented due to these brackets.) % long lines with double brackets ((This line is too long because it has more than eighty characters inside it. Therefore it should be split. It also needs splitting onto multiple lines, and the middle lines should be doubly indented due to these brackets.)) \end{document} tex-fmt-0.5.2/tests/target/000077500000000000000000000000001473573253500155505ustar00rootroot00000000000000tex-fmt-0.5.2/tests/target/brackets.tex000066400000000000000000000014611473573253500200720ustar00rootroot00000000000000\documentclass{article} \begin{document} Matching brackets on a line do nothing (like this). Matching brackets on two lines also do nothing (like this longer example). Matching brackets on three lines get an indent (like this much much longer example right here on these lines). Matching brackets on more lines also get an indent (like this much much much much much longer example here). The brackets could start at the beginning of the line (so maybe they look like this). [They could be any shape of bracket] {Even braces get the same indents too} What about equations? 
They are the same: $(1 + 2 + 3)$ $(1 + 2 + 3 + 4 + 5 + 7 + 8 + 9)$ And the dollars can go anywhere as expected: $ (1 + 2 + 3 + 4 + 5 + 7 + 8 + 9) $ Note that dollars themselves are not indented \end{document} tex-fmt-0.5.2/tests/target/cam-thesis.cls000066400000000000000000000403771473573253500203230ustar00rootroot00000000000000%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Class ``cam-thesis'' %% %% Version: v0.2 %% Authors: Jean Martina, Rok Strnisa, Matej Urbas %% Date: 30/07/2008 %% %% Copyright (c) 2008-2012, Rok Strniša, Jean Martina, Matej Urbas %% License: Simplified BSD License %% License file: ./License %% Original License URL: http://www.freebsd.org/copyright/freebsd-license.html %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% General guidelines on which this class is based: %% %% http://www.cl.cam.ac.uk/local/phd/typography/ %% http://www.admin.cam.ac.uk/offices/gradstud/exams/submission/phd/format.html %% %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Class identification. %% %%%%% \NeedsTeXFormat{LaTeX2e} \ProvidesClass{cam-thesis}[2012/04/12 University of Cambridge thesis class] \typeout{} \typeout{***********************************************} \typeout{***********************************************} \typeout{} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% General Cambridge guidelines. %% %% LIMIT: 60k words (including tables and footnotes, excluding appendices, bib, %% photos, diagrams); title and section headings should be capitalized as normal %% sentences; citations should include authors' initials, and page numbers (if %% possible); double-sided printing is permissible for the soft bound version; %% however, single-sided is required for the text of the final, hard bound %% library copy (diagrams on facing pages are acceptable); always make it %% possible to create the ps file as well (required for technical reports). %% %%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Package options (see README.md for a list of options with descriptions). %% %% These options can be provided within square brackets of the `documentclass' %% command. %% %%%%% % techreport - formats the thesis as a technical report. \newif\ifcam@techreport\cam@techreportfalse \DeclareOption{techreport}{\cam@techreporttrue} % times - tells the class to use the times font. \newif\ifcam@times\cam@timesfalse \DeclareOption{times}{\cam@timestrue} % glossary - puts the glossary (after the TOC). % \newif\ifcam@glossary\cam@glossaryfalse \DeclareOption{glossary}{\cam@glossarytrue} % index - puts the index at the end of the thesis. % \newif\ifcam@index\cam@indexfalse \DeclareOption{withindex}{\cam@indextrue} % 1st year report - omits abstract/declaration % \newif\ifcam@firstyr\cam@firstyrfalse \DeclareOption{firstyr}{\cam@firstyrtrue} % 2nd year report - omits declaration % \newif\ifcam@secondyr\cam@secondyrfalse \DeclareOption{secondyr}{\cam@secondyrtrue} % backrefs - add back references % \newif\ifcam@backrefs\cam@backrefsfalse \DeclareOption{backrefs}{\cam@backrefstrue} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Using report class as base. 
%% %%%%% \PassOptionsToClass{a4paper,12pt,twoside,openright}{report} \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions\relax \LoadClass{report} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% For consistent vertical spacing %% %%%%% \raggedbottom %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Additional packages, and their options. %% %%%%% \RequirePackage{graphicx} % Required for the UC Logo (on % the title page) \RequirePackage{calc} % Used for calculating margins % and laying out the title page % Create the index \ifcam@index \RequirePackage{makeidx} \makeindex \newcommand{\printthesisindex}{% \cleardoublepage% \phantomsection% \addcontentsline{toc}{chapter}{Index}% \printindex} \fi % Create the glossary \ifcam@glossary \RequirePackage{glossaries} \makeglossaries% \newcommand{\printthesisglossary}{\printglossary[nonumberlist]} \newcommand{\cam@printthesisglossary}{% \cleardoublepage% \pagestyle{empty}% \renewcommand{\glossarypreamble}{\thispagestyle{empty}}% \printthesisglossary% } \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Page margins (suitable for J.S. Wilson & Son). %% %%%%% \newlength{\cam@topmargin} \newlength{\cam@bottommargin} \newlength{\cam@oddmargin} \newlength{\cam@evenmargin} %% Calculate and set the margins properly (with parameters that actually have %% some meaning for everyday thesis-writers). %% %% @param 1 odd side margin (inner margin). %% @param 2 even side margin (outer margin). %% @param 3 top margin. %% @param 4 bottom margin. \DeclareRobustCommand{\cam@calcpaperdims}[4]{% % MARGINS % 'Top margin' is the distance between the top of the text and the % top of the page. % 'Bottom margin' is the distance between the bottom of the footer % (the page number) and the bottom of the page. \setlength{\cam@oddmargin}{#1} % inner margin \setlength{\cam@evenmargin}{#2} % outer margin \setlength{\cam@topmargin}{#3} % top margin (the % distance from the top of the page to the top of the body text -- % the header is located between) \setlength{\cam@bottommargin}{#4} % bottom margin (the % distance from the bottom of the page to the bottom of the body % text -- the footer is located between) % Horizontal spacing \setlength{\textwidth}{\paperwidth-\cam@oddmargin-\cam@evenmargin} % text takes the remaining width (210 - inner - outer) \setlength{\oddsidemargin}{\cam@oddmargin-1in} % Counter the % LaTeX 1in margin \setlength{\evensidemargin}{\cam@evenmargin-1in} % Counter the % LaTeX 1in margin \setlength{\marginparwidth}{\cam@evenmargin-8mm} % the margin only % has 'outer' space available, so we have to make it a bit thinner. 
\setlength{\marginparsep}{3mm} % Vertical spacing \setlength{\headheight}{5mm} % The height of the box where the % heading text lives \setlength{\headsep}{5mm} % The distance between the % heading and the top of the text \setlength{\topmargin}{\cam@topmargin-\headheight-\headsep-1in} % % Counter the LaTeX 1in margin \setlength{\textheight}{\paperheight-\cam@topmargin-1.7\cam@bottommargin} % text takes the remaining height (297 - top margin - bottom margin) \setlength{\footskip}{.7\cam@bottommargin} % The distance from the % bottom of the text to the bottom of the footer } \ifcam@techreport \cam@calcpaperdims{25mm}{25mm}{20mm}{20mm} \else \cam@calcpaperdims{30mm}{20mm}{20mm}{20mm} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Variable definitions and default values: these variables should be defined by %% the user (somewhere in the preamble). For example, to put the abstract into %% the thesis, the thesis writer should type the following somewhere in the %% preamble (before the `\begin{document}` or `\frontmatter` commands are %% called): %% %% \abstract{This is my abstract.} %% %% See below (in the comments starting with 'DOCVAR: ') for a list of all % variables %% the thesis writer is expected to use. %% %%%%% % DOCVAR: abstract (The text that will be inserted into the abstract of the % thesis.) \newcommand{\@abstract}{} \renewcommand{\abstract}[1]{\renewcommand{\@abstract}{#1}} % DOCVAR: acknowledgements (The text that will be inserted into the % acknowledgments of the thesis.) \newcommand{\@acknowledgements}{} \newcommand{\acknowledgements}[1]{\renewcommand{\@acknowledgements}{#1}} % DOCVAR: college (The name of the thesis writer's college, which will appear % just below their name.) \newcommand{\@college}{} \newcommand{\college}[1]{\renewcommand{\@college}{#1}} % DOCVAR: keywords (These keywords will appear in the PDF meta-information % called `pdfkeywords`.) \newcommand{\@keywords}{} \newcommand{\keywords}[1]{\renewcommand{\@keywords}{#1}} % DOCVAR: subjectline (This subject will appear in the PDF meta-information % called `pdfsubject`.) \newcommand{\@subjectline}{} \newcommand{\subjectline}[1]{\renewcommand{\@subjectline}{#1}} % DOCVAR: submissiondate (The date of the submission of this thesis. If the % submission date is provided, it will be printed on the title page--within the % `submissionnotice` by default. Note that the thesis writer can provide their % own `submissionnotice`, in which case it is up to them whether they will use % this date in their notice.) \newif\ifcam@submissiondate\cam@submissiondatefalse \newcommand{\@submissiondate}{} \newcommand{\submissiondate}[1]{% \renewcommand{\@submissiondate}{#1}\cam@submissiondatetrue} % DOCVAR: submissionnotice (The submission notice is shown on the bottom of the % title page.) \newcommand{\@submissionnotice}{% \ifcam@firstyr First year report submitted \else \ifcam@secondyr Second year report submitted \else This dissertation is submitted \fi \fi \ifcam@submissiondate on \@submissiondate{} \fi \ifcam@firstyr in partial fulfilment of the requirements \fi \ifcam@secondyr in partial fulfilment of the requirements \fi for the degree of Doctor of Philosophy% } \newcommand{\submissionnotice}[1]{\renewcommand{\@submissionnotice}{#1}} % DOCVAR: collegeshield (The name of the file that contains the image of the % college's shield. If `collegeshield' is provided, it will be included in the % title page (just below the author's name and above the name of the college). 
\newif\ifcam@collegeshield\cam@collegeshieldfalse \newcommand{\@collegeshield}{} \newcommand{\collegeshield}[1]{% \renewcommand{\@collegeshield}{#1}\cam@collegeshieldtrue} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Chapter and section numbering %% \setcounter{secnumdepth}{3} \setcounter{tocdepth}{3} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Front matter %% %% - outside and inside front cover %% - title leaf %% Do not include the date of make! %% Institution + department. %% Names of referees. (optional) %% Degree. %% Date of submission and defense. (optional) %% Place and date of publication and publishers (and other info by them). %%%%% \newcommand{\frontmatter}{ \pagestyle{empty} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Title page components %% %%%%% \ifcam@techreport % Technical report mustn't have the custom title page (a standard one will be % prepended by the editor, see % http://www.cl.cam.ac.uk/techreports/submission.html). \else % The boxes below are all that will be displayed on the title page. They are % used to calculate exactly how much space should be left between them % (vertically). %% LOGO box \newlength{\cam@logorightnudge} \setlength{\cam@logorightnudge}{-0.5\paperwidth+12mm} \newsavebox{\cam@logo} \begin{lrbox}{\cam@logo} \hspace*{\cam@logorightnudge} %\includegraphics[width=73mm]{CollegeShields/CUni} \end{lrbox} %% THESIS TITLE box \newsavebox{\cam@title} \begin{lrbox}{\cam@title} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} \Huge% \ifcam@times\else% \bfseries% \fi% {\@title{}}% \ifcam@firstyr\\% {\vspace{5mm}\emph{\LARGE PhD Proposal}}% \fi% \ifcam@secondyr\\% {\vspace{5mm}\emph{\LARGE Dissertation Schedule}}% \fi \end{center} \end{minipage} \end{lrbox} %% COLLEGESHIELD box (optional): \ifcam@collegeshield% \newsavebox{\cam@collegeshieldbox} \begin{lrbox}{\cam@collegeshieldbox} \includegraphics[height=20mm]{\@collegeshield} \end{lrbox} \fi %% AUTHOR&COLLEGE box \newsavebox{\cam@authorcollege} \begin{lrbox}{\cam@authorcollege} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} {\large \@author{}~\\[1ex]} \ifcam@collegeshield% \vspace{2mm}{\usebox{\cam@collegeshieldbox}}\\ \fi \@college{} \end{center} \end{minipage} \end{lrbox} %% SUBMISSION NOTICE box \newsavebox{\cam@submitnotice} \begin{lrbox}{\cam@submitnotice} \begin{minipage}[c][\height][c]{.98\textwidth} \begin{center} \@submissionnotice{} \end{center} \end{minipage} \end{lrbox} % Now calculate the exact free vertical space \newlength{\cam@titlepagevspace} \setlength{\cam@titlepagevspace}{\textheight% -\totalheightof{\usebox{\cam@logo}}% -\totalheightof{\usebox{\cam@submitnotice}}% -\totalheightof{\usebox{\cam@authorcollege}}% -\totalheightof{\usebox{\cam@title}}} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Title Page: Put the components (logo, title, author, college and submit %% notice) onto the title page. %% %%%%% \begin{center} ~\vspace{.02\cam@titlepagevspace}\\ {\usebox{\cam@logo}}\\ \vspace{.28\cam@titlepagevspace} {\usebox{\cam@title}}\\ \vspace{.23\cam@titlepagevspace} {\usebox{\cam@authorcollege}}\\ \null\vfill {\usebox{\cam@submitnotice}} \end{center} \hypersetup{pdfsubject={\@subjectline},pdfkeywords={\@keywords}} \fi % Epigraph on odd page. 
(optional) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Declaration %% %%%%% \ifcam@techreport % Technical report doesn't need the declaration % (see http://www.cl.cam.ac.uk/techreports/submission.html). \else {\ifcam@firstyr % First and second yr report don't need the declaration \else \ifcam@secondyr % \else \chapter*{Declaration} \thispagestyle{empty} This dissertation is the result of my own work and includes nothing which is the outcome of work done in collaboration except as declared in the Preface and specified in the text. It is not substantially the same as any that I have submitted, or am concurrently submitting, for a degree or diploma or other qualification at the University of Cambridge or any other University or similar institution except as declared in the Preface and specified in the text. I further state that no substantial part of my dissertation has already been submitted, or is being concurrently submitted, for any such degree, diploma or other qualification at the University of Cambridge or any other University or similar institution except as declared in the Preface and specified in the text. This dissertation does not exceed the prescribed limit of 60\,000 words. % Leaving some space for the signature: \vspace{15mm} \begin{flushright} \@author{}\\ \@date{}\\ \end{flushright} \vfill \fi \fi} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Abstract %% %%%%% \ifcam@techreport% \setcounter{page}{3} \fi \ifcam@firstyr % First yr report doesn't need a standalone abstract \else \chapter*{Abstract} \thispagestyle{empty} % Cambridge thesis submission guidelines require the title and % author be in the abstract. % For more info see https://www.cambridgestudents.cam.ac.uk/your-course/examinations/graduate-exam-information/after-examination/degree-approval-and-1 % tex-fmt: skip \textbf{\large \@title} \par\vspace{0.3cm} \noindent\textit{\@author} \par\vspace{0.6cm} \@abstract{} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Acknowledgements %% %%%%% \ifcam@firstyr % First and second yr report don't need the acknowledgements \else {\ifcam@secondyr % \else \chapter*{Acknowledgements} \thispagestyle{empty} \@acknowledgements{} \fi} \fi %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Table of contents, figures, symbols and glossary. %% %%%%% % The following command prevents the page number to be displayed on the first % page of the TOC. \addtocontents{toc}{\protect\thispagestyle{empty}} \pagestyle{empty} \tableofcontents{} \ifcam@glossary% \cam@printthesisglossary \fi \cleardoublepage \pagestyle{plain} } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% Backrefs %% %%%%% \ifcam@backrefs \RequirePackage[hyperpageref]{backref} \renewcommand*{\backref}[1]{} \renewcommand*{\backrefalt}[4]{% \ifcase #1 % \or {\footnotesize Cited on page #2.}% \else {\footnotesize Cited on pages #2.}% \fi } \fi %%%%% EOF: cam-thesis.cls tex-fmt-0.5.2/tests/target/comments.tex000066400000000000000000000010121473573253500201110ustar00rootroot00000000000000\documentclass{article} \begin{document} % Comments should be indented along with other text (these parentheses make the middle line here % and this comment aligns with the text indented as usual) % Comments do not directly affect indenting, % so they can contain arbitrary brackets (((( % which may not match. 
% Similarly they might contain \begin{align} unmatched % environment tags. This is a percent sign \% and not a comment Some lines might have both \% percents % and comments \end{align} \end{document} tex-fmt-0.5.2/tests/target/cv.tex000066400000000000000000000142011473573253500167000ustar00rootroot00000000000000% !TeX program = lualatex \documentclass{wgu-cv} \yourname{William G Underwood} \youraddress{ ORFE Department, Sherrerd Hall, Charlton Street, Princeton, NJ 08544, USA } \youremail{wgu2@princeton.edu} \yourwebsite{wgunderwood.github.io} \begin{document} \maketitle \section{Employment} \subsection{Postdoctoral Research Associate in Statistics} {Jul 2024 -- Jul 2026} \subsubsection{University of Cambridge} \begin{itemize} \item Advisor: Richard Samworth, Department of Pure Mathematics and Mathematical Statistics \item Funding: European Research Council Advanced Grant 101019498 \end{itemize} \subsection{Assistant in Instruction} {Sep 2020 -- May 2024} \subsubsection{Princeton University} \begin{itemize} \item ORF 499: Senior Thesis, Spring 2024 \item ORF 498: Senior Independent Research Foundations, Fall 2023 \item SML 201: Introduction to Data Science, Fall 2023 \item ORF 363: Computing and Optimization, Spring 2023, Fall 2020 \item ORF 524: Statistical Theory and Methods, Fall 2022, Fall 2021 \item ORF 526: Probability Theory, Fall 2022 \item ORF 245: Fundamentals of Statistics, Spring 2021 \end{itemize} \section{Education} \subsection{PhD in Operations Research \& Financial Engineering} {Sep 2019 -- May 2024} \subsubsection{Princeton University} \begin{itemize} \item Dissertation: Estimation and Inference in Modern Nonparametric Statistics \item Advisor: Matias Cattaneo, Department of Operations Research \& Financial Engineering \end{itemize} \subsection{MA in Operations Research \& Financial Engineering} {Sep 2019 -- Sep 2021} \subsubsection{Princeton University} \subsection{MMath in Mathematics \& Statistics} {Oct 2015 -- Jun 2019} \subsubsection{University of Oxford} \begin{itemize} \item Dissertation: Motif-Based Spectral Clustering of Weighted Directed Networks \item Supervisor: Mihai Cucuringu, Department of Statistics \end{itemize} \section{Research \& publications} \subsection{Articles}{} \begin{itemize} \item Uniform inference for kernel density estimators with dyadic data, with M D Cattaneo and Y Feng. \emph{Journal of the American Statistical Association}, forthcoming, 2024. \arxiv{2201.05967}. \item Motif-based spectral clustering of weighted directed networks, with A Elliott and M Cucuringu. \emph{Applied Network Science}, 5(62), 2020. \arxiv{2004.01293}. \item Simple Poisson PCA: an algorithm for (sparse) feature extraction with simultaneous dimension determination, with L Smallman and A Artemiou. \emph{Computational Statistics}, 35:559--577, 2019. \end{itemize} \subsection{Preprints}{} \begin{itemize} \item Inference with Mondrian random forests, with M D Cattaneo and J M Klusowski, 2023. \\ \arxiv{2310.09702}. \item Yurinskii's coupling for martingales, with M D Cattaneo and R P Masini. \emph{Annals of Statistics}, reject and resubmit, 2023. \arxiv{2210.00362}. \end{itemize} \pagebreak \subsection{Works in progress}{} \begin{itemize} \item Higher-order extensions to the Lindeberg method, with M D Cattaneo and R P Masini. \item Adaptive Mondrian random forests, with M D Cattaneo, R Chandak and J M Klusowski. 
\end{itemize} \subsection{Presentations}{} \begin{itemize} \item Statistics Seminar, University of Pittsburgh, February 2024 \item Statistics Seminar, University of Illinois, January 2024 \item Statistics Seminar, University of Michigan, January 2024 \item PhD Poster Session, Two Sigma Investments, July 2023 \item Research Symposium, Two Sigma Investments, June 2022 \item Statistics Laboratory, Princeton University, September 2021 \end{itemize} \subsection{Software}{} \begin{itemize} \item MondrianForests: Mondrian random forests in Julia, 2023. \\ \github{wgunderwood/MondrianForests.jl} \item DyadicKDE: dyadic kernel density estimation in Julia, 2022. \\ \github{wgunderwood/DyadicKDE.jl} \item motifcluster: motif-based spectral clustering in R, Python and Julia, 2020. \\ \github{wgunderwood/motifcluster} \end{itemize} \section{Awards \& funding} \vspace{-0.22cm} \begin{itemize} \item School of Engineering and Applied Science Award for Excellence, Princeton University \hfill 2022% \item Francis Robbins Upton Fellowship in Engineering, Princeton University \hfill 2019% \item Royal Statistical Society Prize, Royal Statistical Society \& University of Oxford \hfill 2019% \item Gibbs Statistics Prize, University of Oxford \hfill 2019% \item James Fund for Mathematics Research Grant, St John's College, University of Oxford \hfill 2017% \item Casberd Scholarship, St John's College, University of Oxford \hfill 2016% \end{itemize} \section{Professional experience} \subsection{Quantitative Research Intern} {Jun 2023 -- Aug 2023} \subsubsection{Two Sigma Investments} \vspace{-0.20cm} \subsection{Machine Learning Consultant} {Oct 2018 -- Nov 2018} \subsubsection{Mercury Digital Assets} \vspace{-0.18cm} \subsection{Educational Consultant} {Feb 2018 -- Sep 2018} \subsubsection{Polaris \& Dawn} \vspace{-0.20cm} \subsection{Premium Tutor} {Feb 2016 -- Oct 2018} \subsubsection{MyTutor} \vspace{-0.20cm} \subsection{Statistics \& Machine Learning Researcher} {Aug 2017 -- Sep 2017} \subsubsection{Cardiff University} \vspace{-0.20cm} \subsection{Data Science Intern} {Jun 2017 -- Aug 2017} \subsubsection{Rolls-Royce} \vspace{-0.20cm} \subsection{Peer review}{} \emph{Econometric Theory, Journal of the American Statistical Association, Journal of Business \& Economic Statistics, Journal of Causal Inference, Journal of Econometrics, Operations Research.} \section{References} \vspace{-0.22cm} \begin{itemize} \item Matias Cattaneo, Professor, ORFE, Princeton University \item Jason Klusowski, Assistant Professor, ORFE, Princeton University \item Jianqing Fan, Professor, ORFE, Princeton University \item Ricardo Masini, Assistant Professor, Statistics, University of California, Davis \end{itemize} \end{document} tex-fmt-0.5.2/tests/target/document.tex000066400000000000000000000001461473573253500201110ustar00rootroot00000000000000\documentclass{article} \begin{document} Documents should not be globally indented. 
\end{document} tex-fmt-0.5.2/tests/target/empty.tex000066400000000000000000000000011473573253500174170ustar00rootroot00000000000000 tex-fmt-0.5.2/tests/target/environment_lines.tex000066400000000000000000000014431473573253500220320ustar00rootroot00000000000000\documentclass{article} \begin{document} \newenvironment{env1}{}{} \newenvironment{env2}{}{} \newenvironment{env3}{}{} \newenvironment{env4}{}{} % environments on separate lines \begin{env1} \begin{env2} \end{env2} \end{env1} % environments on shared lines \begin{env1} \begin{env2} \end{env2} \end{env1} % environments on shared lines with spaces \begin{env1} \begin{env2} \end{env2} \end{env1} % environments all on same line \begin{env1} \begin{env2} \end{env2} \end{env1} % with a comment \begin{env1} % environments with extra brackets \begin{env1}(a)(b \begin{env2}[c{d}e] \end{env2}[f]g) \end{env1} % environments and a long line \begin{env1} \begin{env2} \begin{env3} \begin{env4} \end{env4} \end{env3} \end{env2} \end{env1} \end{document} tex-fmt-0.5.2/tests/target/heavy_wrap.tex000066400000000000000000000013521473573253500204400ustar00rootroot00000000000000\documentclass{article} \usepackage{amsmath} \usepackage{amsthm} \newtheorem{definition}{Definition} \begin{document} \begin{definition} \begin{definition} \begin{definition} Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. \end{definition} \end{definition} \end{definition} \end{document} tex-fmt-0.5.2/tests/target/higher_categories_thesis.bib000066400000000000000000000540611473573253500232660ustar00rootroot00000000000000@software{alex_rice_2024_10964565, author = {Alex Rice}, title = {Agda formalisation of Catt}, month = apr, year = 2024, publisher = {Zenodo}, version = {thesis}, doi = {10.5281/zenodo.10964565}, url = {https://github.com/alexarice/catt-agda/tree/thesis} } @software{alex_rice_2024_10964705, author = {Alex Rice}, title = {Semistrict Catt implementation}, month = apr, year = 2024, publisher = {Zenodo}, version = {thesis}, doi = {10.5281/zenodo.10966141}, url = {https://github.com/alexarice/catt-strict/tree/thesis} } @software{sd-visualiser, author = {Hu, Nick and Rice, Alex and Tataru, Calin}, title = {\textsf{sd-visualiser}}, year = 2024, url = {https://github.com/sd-visualiser/sd-visualiser} } @unpublished{andrastalk, title= {Efficient Evaluation with Controlled Definition Unfolding}, author = {András Kovács}, year = {2024}, note= {Workshop on the Implementation of Type Systems}, URL= {https://popl24.sigplan.org/details/wits-2024-papers/8/Efficient-Evaluation-with-Controlled-Definition-Unfolding}, % tex-fmt: skip } @inbook{selinger2011survey, title = {A Survey of Graphical Languages for Monoidal Categories}, DOI = {10.1007/978-3-642-12821-9_4}, booktitle = {New Structures for Physics}, publisher = {Springer Berlin Heidelberg}, author = {Selinger, Peter}, year = {2011}, pages = {289-–355}, isbn="978-3-642-12821-9", doi="10.1007/978-3-642-12821-9_4" } @article{forest2022unifying, title={Unifying notions of pasting diagrams}, author={Forest, Simon}, journal={Higher Structures}, volume={6}, number={1}, pages={1--79}, year={2022}, doi={10.21136/HS.2022.01} } 
@unpublished{makkai2005word, title={The word problem for computads}, author={Makkai, Michael}, note={\url{https://www.math.mcgill.ca/makkai/WordProblem/WordProblemCombined.pdf}}, % tex-fmt: skip year={2005} } @phdthesis{forest2021computational, title={Computational descriptions of higher categories}, author={Forest, Simon}, year={2021}, school={Institut Polytechnique de Paris} } @unpublished{douglas2016internal, title={Internal bicategories}, author={Christopher L. Douglas and André G. Henriques}, year={2016}, eprint={1206.4284}, archivePrefix={arXiv}, primaryClass={math.CT} } @book{leinster2004higher, title={Higher operads, higher categories}, author={Leinster, Tom}, volume={298}, year={2004}, publisher={Cambridge University Press} } @unpublished{simpson1998homotopy, title={Homotopy types of strict 3-groupoids}, author={Carlos Simpson}, year={1998}, eprint={math/9810059}, archivePrefix={arXiv}, primaryClass={math.CT} } @incollection {joyal2006weak, AUTHOR = {Joyal, Andr\'{e} and Kock, Joachim}, TITLE = {Weak units and homotopy 3-types}, BOOKTITLE = {Categories in algebra, geometry and mathematical physics}, SERIES = {Contemp. Math.}, VOLUME = {431}, PAGES = {257--276}, PUBLISHER = {Amer. Math. Soc., Providence, RI}, YEAR = {2007}, ISBN = {978-0-8218-3970-6}, DOI = {10.1090/conm/431/08277}, URL = {https://doi.org/10.1090/conm/431/08277}, } @inproceedings{10.1145/237721.237728, author = {Jim, Trevor}, title = {What are principal typings and what are they good for?}, year = {1996}, isbn = {0897917693}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, url = {https://doi.org/10.1145/237721.237728}, doi = {10.1145/237721.237728}, abstract = {We demonstrate the pragmatic value of the principal typing property, a property distinct from ML's principal type property, by studying a type system with principal typings. The type system is based on rank 2 intersection types and is closely related to ML. Its principal typing property provides elegant support for separate compilation, including "smartest recompilation" and incremental type inference. Moreover, it motivates a new rule for typing recursive definitions that can type some interesting examples of polymorphic recursion.}, booktitle = {Proceedings of the 23rd ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages}, pages = {42–53}, numpages = {12}, location = {St. Petersburg Beach, Florida, USA}, series = {POPL '96} } @article{10.1145/3450952, author = {Dunfield, Jana and Krishnaswami, Neel}, title = {Bidirectional Typing}, year = {2021}, issue_date = {June 2022}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {54}, number = {5}, issn = {0360-0300}, url = {https://doi.org/10.1145/3450952}, doi = {10.1145/3450952}, abstract = {Bidirectional typing combines two modes of typing: type checking, which checks that a program satisfies a known type, and type synthesis, which determines a type from the program. Using checking enables bidirectional typing to support features for which inference is undecidable; using synthesis enables bidirectional typing to avoid the large annotation burden of explicitly typed languages. In addition, bidirectional typing improves error locality. We highlight the design principles that underlie bidirectional type systems, survey the development of bidirectional typing from the prehistoric period before Pierce and Turner’s local type inference to the present day, and provide guidance for future investigations.}, journal = {ACM Comput. 
Surv.}, month = {5}, articleno = {98}, numpages = {38}, keywords = {Type checking, type inference} } @article{abel2013normalization, title={Normalization by evaluation: Dependent types and impredicativity}, author={Abel, Andreas}, journal={Habilitation. Ludwig-Maximilians-Universit{\"a}t M{\"u}nchen}, year={2013} } @article{gratzer2019implementing, title={Implementing a modal dependent type theory}, author={Gratzer, Daniel and Sterling, Jonathan and Birkedal, Lars}, journal={Proceedings of the ACM on Programming Languages}, volume={3}, number={ICFP}, pages={1--29}, year={2019}, publisher={ACM New York, NY, USA}, doi = {10.1145/3341711} } @unpublished{hadzihasanovic2019representable, title={Representable diagrammatic sets as a model of weak higher categories}, author={Amar Hadzihasanovic}, year={2019}, eprint={1909.07639}, archivePrefix={arXiv}, primaryClass={math.CT} } @inproceedings{reutter2019high, title={High-level methods for homotopy construction in associative n-categories}, author={Reutter, David and Vicary, Jamie}, booktitle={Proceedings of the 34th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--13}, year={2019}, doi={10.1109/LICS.2019.8785895} } @unpublished{corbyn2024homotopy, title={\textsf{homotopy.io}: a proof assistant for finitely-presented globular $n$-categories}, author={Nathan Corbyn and Lukas Heidemann and Nick Hu and Chiara Sarti and Calin Tataru and Jamie Vicary}, year={2024}, eprint={2402.13179}, archivePrefix={arXiv}, primaryClass={cs.LO} } @unpublished{tataru2024theory, title={The theory and applications of anticolimits}, author={Calin Tataru and Jamie Vicary}, year={2024}, eprint={2401.17076}, archivePrefix={arXiv}, primaryClass={math.CT} } @incollection{MARTINLOF197573, title = {An Intuitionistic Theory of Types: Predicative Part}, editor = {H.E. Rose and J.C. Shepherdson}, series = {Studies in Logic and the Foundations of Mathematics}, publisher = {Elsevier}, volume = {80}, pages = {73-118}, year = {1975}, booktitle = {Logic Colloquium '73}, issn = {0049-237X}, doi = {https://doi.org/10.1016/S0049-237X(08)71945-1}, url = {https://www.sciencedirect.com/science/article/pii/S0049237X08719451}, author = {Per Martin-Löf}, abstract = {Publisher Summary The theory of types is intended to be a full-scale system for formalizing intuitionistic mathematics as developed. The language of the theory is richer than the languages of traditional intuitionistic systems in permitting proofs to appear as parts of propositions so that the propositions of the theory can express properties of proofs. There are axioms for universes that link the generation of objects and types and play somewhat the same role for the present theory as does the replacement axiom for Zermelo–Fraenkel set theory. The present theory is based on a strongly impredicative axiom that there is a type of all types in symbols. This axiom has to be abandoned, however, after it has been shown to lead to a contraction. This chapter discusses Normalization theorem, which can be strengthened in two ways: it can be made to cover open terms and it can be proved that every reduction sequence starting from an arbitrary term leads to a unique normal term after a finite number of steps. 
The definition of the notion of convertibility and the proof that an arbitrary term is convertible can no longer be separated because the type symbols and the terms are generated simultaneously.} } @article{lumsdaine2010weak, title = {Weak omega-categories from intensional type theory}, volume = {Volume 6, Issue 3}, ISSN = {1860-5974}, url = {http://dx.doi.org/10.2168/LMCS-6(3:24)2010}, DOI = {10.2168/lmcs-6(3:24)2010}, journal = {Logical Methods in Computer Science}, publisher = {Centre pour la Communication Scientifique Directe (CCSD)}, author = {Lumsdaine, Peter LeFanu}, year = {2010}, month = sep } @article{garner2011types, title={Types are weak omega-groupoids}, author={Garner, Richard and van den Berg, Benno}, journal={Proceedings of the London Mathematical Society}, volume={102}, number={2}, pages={370--394}, year={2010}, publisher={London Mathematical Society}, DOI = {10.1112/plms/pdq026} } @unpublished{dorn2021framed, title={Framed combinatorial topology}, author={Christoph Dorn and Christopher L. Douglas}, year={2021}, eprint={2112.14700}, archivePrefix={arXiv}, primaryClass={math.GT} } @unpublished{heidemann2023framed, title={Framed Combinatorial Topology with Labels in $\infty$-Categories}, author={Lukas Heidemann}, year={2023}, eprint={2305.06288}, archivePrefix={arXiv}, primaryClass={math.AT} } @article{eckmann1962group, title={Group-like structures in general categories I multiplications and comultiplications}, author={Eckmann, Beno and Hilton, Peter J}, journal={Mathematische Annalen}, volume={145}, number={3}, pages={227--255}, year={1962} } @phdthesis{brunerie2016homotopy, title={On the homotopy groups of spheres in homotopy type theory}, author={Brunerie, Guillaume}, year={2016}, school={Universit{\'e} Nice Sophia Antipolis} } @unpublished{shulman2019all, title={All $(\infty,1)$-toposes have strict univalent universes}, author={Michael Shulman}, year={2019}, eprint={1904.07004}, archivePrefix={arXiv}, primaryClass={math.AT} } @Book{hottbook, author = {The {Univalent Foundations Program}}, title = {Homotopy Type Theory: Univalent Foundations of Mathematics}, publisher = {\url{https://homotopytypetheory.org/book}}, address = {Institute for Advanced Study}, year = 2013} @incollection {hofmannstreicher, AUTHOR = {Hofmann, Martin and Streicher, Thomas}, TITLE = {The groupoid interpretation of type theory}, BOOKTITLE = {Twenty-five years of constructive type theory}, VOLUME = {36}, PAGES = {83--111}, PUBLISHER = {Oxford University Press}, YEAR = {1998}, DOI = {10.1093/oso/9780198501275.003.0008}, } @inproceedings{heidemann2022zigzag, title={Zigzag normalisation for associative n-categories}, author={Heidemann, Lukas and Reutter, David and Vicary, Jamie}, booktitle={Proceedings of the 37th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--13}, year={2022}, doi = {10.1145/3531130.3533352} } @article{Batanin2013, author = {Michael Batanin and Denis-Charles Cisinski and Mark Weber}, title = {Multitensor lifting and strictly unital higher category theory}, year = {2013}, journal = {Theory and Applications of Categories}, volume = 28, pages = {804--856} } @phdthesis{dorn2018associative, title={Associative n-categories}, author={Dorn, C}, year={2018}, school={University of Oxford} } @article {joyalcoherence, AUTHOR = {Joyal, Andr\'{e} and Kock, Joachim}, TITLE = {Coherence for weak units}, JOURNAL = {Documenta Mathematica}, VOLUME = {18}, YEAR = {2013}, PAGES = {71--110}, ISSN = {1431-0635,1431-0643}, } @incollection {cheng2007periodic, AUTHOR = {Cheng, 
Eugenia and Gurski, Nick}, TITLE = {The periodic table of {$n$}-categories for low dimensions {I}. {D}egenerate categories and degenerate bicategories}, BOOKTITLE = {Categories in algebra, geometry and mathematical physics}, SERIES = {Contemp. Math.}, VOLUME = {431}, PAGES = {143--164}, PUBLISHER = {Amer. Math. Soc., Providence, RI}, YEAR = {2007}, ISBN = {978-0-8218-3970-6}, DOI = {10.1090/conm/431/08270}, URL = {https://doi.org/10.1090/conm/431/08270} } @unpublished{cheng2007periodic2, title={The periodic table of $n$-categories for low dimensions II: degenerate tricategories}, author={Eugenia Cheng and Nick Gurski}, year={2007}, eprint={0706.2307}, archivePrefix={arXiv}, primaryClass={math.CT} } @article{Baez1995, title = {Higher-dimensional algebra and topological quantum field theory}, volume = {36}, ISSN = {1089-7658}, url = {http://dx.doi.org/10.1063/1.531236}, DOI = {10.1063/1.531236}, number = {11}, journal = {Journal of Mathematical Physics}, publisher = {AIP Publishing}, author = {Baez, John C. and Dolan, James}, year = {1995}, month = nov, pages = {6073–6105} } @BOOK{Heunen2019-jt, title = "Categories for quantum theory", author = "Heunen, Chris and Vicary, Jamie", publisher = "Oxford University Press", series = "Oxford Graduate Texts in Mathematics", month = nov, year = 2019, address = "London, England", doi = {10.1093/oso/9780198739623.001.0001} } @article{Barr1991, title = {*-Autonomous categories and linear logic}, volume = {1}, ISSN = {1469-8072}, url = {http://dx.doi.org/10.1017/S0960129500001274}, DOI = {10.1017/s0960129500001274}, number = {2}, journal = {Mathematical Structures in Computer Science}, publisher = {Cambridge University Press (CUP)}, author = {Barr, Michael}, year = {1991}, month = jul, pages = {159–178} } @book{riehl2022elements, title={Elements of \(\infty\)-Category Theory}, author={Riehl, Emily and Verity, Dominic}, volume={194}, year={2022}, publisher={Cambridge University Press}, DOI = {10.1017/9781108936880} } @article{Street2012, title = {Monoidal categories in, and linking, geometry and algebra}, volume = {19}, ISSN = {1370-1444}, url = {http://dx.doi.org/10.36045/bbms/1354031551}, DOI = {10.36045/bbms/1354031551}, number = {5}, journal = {Bulletin of the Belgian Mathematical Society - Simon Stevin}, publisher = {The Belgian Mathematical Society}, author = {Street, Ross}, year = {2012}, month = dec } @article{mellies2009categorical, title={Categorical semantics of linear logic}, author={Mellies, Paul-Andr{\'e}}, journal={Panoramas et syntheses}, volume={27}, pages={15--215}, year={2009} } @inproceedings{ghani2018compositional, title={Compositional game theory}, author={Ghani, Neil and Hedges, Jules and Winschel, Viktor and Zahn, Philipp}, booktitle={Proceedings of the 33rd annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={472--481}, year={2018}, doi = {10.1145/3209108.3209165} } @book{Bourbaki2016, title = {Topologie algébrique}, ISBN = {9783662493618}, url = {http://dx.doi.org/10.1007/978-3-662-49361-8}, DOI = {10.1007/978-3-662-49361-8}, publisher = {Springer Berlin Heidelberg}, author = {Bourbaki, N.}, year = {2016} } @article{Weber2004, title = {Generic Morphisms, Parametric Representations and Weakly Cartesian Monads.}, author = {Weber, Mark}, date = {2004}, journaltitle = {Theory and Applications of Categories}, volume = {13}, pages = {191--234}, publisher = {{Mount Allison University, Department of Mathematics and Computer Science, Sackville}}, url = {http://eudml.org/doc/124614}, langid = {english}, keywords = 
{braiding,centre,descent,endofunctor,generic morphism,higher category theory,monad,operad,parametric representation,pseudofunctor} } @article{lipparini16, title={An infinite natural sum}, author={Lipparini, Paolo}, journal={Mathematical Logic Quarterly}, DOI = {10.1002/malq.201500017}, volume={62}, number={3}, pages={249--257}, year={2016}, publisher={Wiley Online Library} } @article{newman1942theories, title={On theories with a combinatorial definition of equivalence}, author={Newman, Maxwell Herman Alexander}, journal={Annals of Mathematics}, volume={43}, number={2}, pages={223--243}, year={1942}, publisher={JSTOR} } @unpublished{maltsiniotis2010grothendieck, title={Grothendieck $\infty$-groupoids, and still another definition of $\infty$-categories}, author={Georges Maltsiniotis}, year={2010}, eprint={1009.2331}, archivePrefix={arXiv}, primaryClass={math.CT} } @unpublished{leinster2001survey, title={A Survey of Definitions of n-Category}, author={Tom Leinster}, year={2001}, eprint={math/0107188}, archivePrefix={arXiv}, primaryClass={math.CT} } @inproceedings{finster2017type, title={A type-theoretical definition of weak $\omega$-categories}, author={Finster, Eric and Mimram, Samuel}, booktitle={Proceedings of the 32nd Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2017}, doi={10.1109/LICS.2017.8005124} } @phdthesis{Ara, author={Dimitri Ara}, title={Sur les $\infty$-groupoides de {G}rothendieck et une variante $\infty$-cat\'egorique}, school={Universit\'e Paris Diderot}, year={2010} } @unpublished{PursuingStacks, author={Alexander Grothendieck}, year=1983, title={Pursuing stacks} } @phdthesis{gurski2006algebraic, title={An algebraic theory of tricategories}, author={Gurski, Michael Nicholas}, year={2006}, school={University of Chicago, Department of Mathematics} } @inproceedings{bar2017data, title={Data structures for quasistrict higher categories}, author={Bar, Krzysztof and Vicary, Jamie}, booktitle={Proceedings of the 32nd Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2017}, doi={10.1109/LICS.2017.8005147} } @book{gordon1995coherence, title={Coherence for tricategories}, author={Gordon, Robert and Power, Anthony John and Street, Ross}, volume={558}, year={1995}, publisher={American Mathematical Soc.} } @inproceedings{finster2022type, title={A type theory for strictly unital ∞-categories}, author={Finster, Eric and Reutter, David and Vicary, Jamie and Rice, Alex}, booktitle={Proceedings of the 37th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pages={1--12}, year={2022}, doi = {10.1145/3531130.3533363} } @inproceedings{finster2023strictly, title={A Syntax for Strictly Associative and Unital ∞-categories}, author={Finster, Eric and Rice, Alex and Vicary, Jamie}, booktitle={Proceedings of the 39th Annual ACM/IEEE Symposium on Logic in Computer Science (LICS)}, pubstate={forthcoming}, year={2024} } @InProceedings{cwf, author="Dybjer, Peter", editor="Berardi, Stefano and Coppo, Mario", title="Internal type theory", booktitle="Types for Proofs and Programs", year="1996", publisher="Springer Berlin Heidelberg", address="Berlin, Heidelberg", pages="120--134", abstract="We introduce categories with families as a new notion of model for a basic framework of dependent types. This notion is close to ordinary syntax and yet has a clean categorical description. We also present categories with families as a generalized algebraic theory.
Then we define categories with families formally in Martin-L{\"o}f's intensional intuitionistic type theory. Finally, we discuss the coherence problem for these internal categories with families.", isbn="978-3-540-70722-6" } @article{batanin1998computads, title={Computads for finitary monads on globular sets}, author={Batanin, Michael A}, journal={Contemporary Mathematics}, volume={230}, pages={37--58}, year={1998}, issn = {0271-4132}, publisher={American Mathematical Society} } @article{street1976limits, title={Limits indexed by category-valued 2-functors}, author={Street, Ross}, journal={Journal of Pure and Applied Algebra}, volume={8}, number={2}, pages={149--181}, year={1976}, publisher={Elsevier}, doi={10.1016/0022-4049(76)90013-X} } @article{burroni1993higher, title={Higher-dimensional word problems with applications to equational logic}, author={Burroni, Albert}, journal={Theoretical Computer Science}, volume={115}, number={1}, pages={43--62}, year={1993}, publisher={Elsevier} } @unpublished{dean2022computads, title={Computads for weak $\omega$-categories as an inductive type}, author={Christopher J. Dean and Eric Finster and Ioannis Markakis and David Reutter and Jamie Vicary}, year={2024}, eprint={2208.08719}, archivePrefix={arXiv}, primaryClass={math.CT} } @unpublished{benjamin2021globular, title={Globular weak $\omega$-categories as models of a type theory}, author={Thibaut Benjamin and Eric Finster and Samuel Mimram}, year={2024}, eprint={2106.04475}, archivePrefix={arXiv}, primaryClass={cs.LO} } @phdthesis{benjamin2020type, title={A type theoretic approach to weak $\omega$-categories and related higher structures}, author={Benjamin, Thibaut}, year={2020}, school={Institut polytechnique de Paris} } @unpublished{benjamin2024duamity, title={Opposites of weak $\omega$-categories and the suspension and hom adjunction}, author={Thibaut Benjamin and Ioannis Markakis}, year={2024}, eprint={2402.01611}, archivePrefix={arXiv}, primaryClass={math.CT} } @article{batanin1998monoidal, title={Monoidal globular categories as a natural environment for the theory of weak n-categories}, author={Batanin, Michael A}, journal={Advances in Mathematics}, volume={136}, number={1}, pages={39--103}, year={1998}, publisher={Academic Press} } tex-fmt-0.5.2/tests/target/higher_categories_thesis.tex000066400000000000000000025741441473573253500233450ustar00rootroot00000000000000\documentclass{cam-thesis} \usepackage[english]{babel} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{csquotes} %\usepackage{microtype} \usepackage[ttscale=.75]{libertine} \usepackage{dsfont} \usepackage[parfill]{parskip} % Set nicer (= less bold, less vertical spacing) mathcal font \usepackage[cal=cm]{mathalpha} % % Set up the headers and footers % \usepackage{fancyhdr} % \usepackage{ifthen} % \pagestyle{fancy} % \fancyhf{} % % Use ifthenelse to work around the fact that we wish to have % alternate headers % % but a onesided document % \fancyhead[R]{\ifthenelse{\isodd{\value{page}}}{% % \thepage\hfill\textsc{\nouppercase\leftmark}}{}} % \fancyhead[L]{\ifthenelse{\isodd{\value{page}}}{}{% % \textsc{\nouppercase\rightmark}\hfill\thepage}} % \fancyfoot{} % % Remove page numbers on the first page of a chapter % \fancypagestyle{plain}{% % \renewcommand{\headrulewidth}{0pt}% % \fancyhf{}% % } % See the excellent biblatex documentation for more information \usepackage[ backend=biber,% style=alphabetic,% block=ragged,% backref=false,% useprefix=true,% maxnames=8,% minnames=7,% minalphanames=3,% maxalphanames=4,% url=false,
eprint=true, backrefstyle=two]% {biblatex} \renewcommand{\subtitlepunct}{\addcolon\addspace} % \DefineBibliographyStrings{english}{% % bibliography = {References}, } % Enumerations and tables \usepackage{calc} \usepackage[shortlabels]{enumitem} % \setlist{nosep} \setlist[description]{font={\textnormal},labelindent=\parindent} \usepackage{booktabs} \usepackage{longtable} \usepackage[width=.8\textwidth]{caption} \captionsetup[table]{skip=1em} % Math packages \usepackage{mathtools} \usepackage{savesym} \usepackage{amsmath} \savesymbol{openbox} \usepackage{amsthm} \usepackage{thmtools} \savesymbol{Bbbk} \usepackage{amssymb} \usepackage{stmaryrd} \usepackage{bm} % \usepackage{mathabx} % % tocbibind allows us to have the toc in the toc % \usepackage[notbib,notindex]{tocbibind} % % Supposedly it should also allow us to have the index and the bibliography in % % the toc, but it has some bugs (e.g. displaying the right page number in the % % toc, but getting the wrong link with hyperref), so we disable those options % % here and use corresponding separate options for the index, index of symbols % % (nomenclature) and bibliography instead. % % % % The whole is rather finicky and it is somehow crucial that % tocbibind is loaded % % *before* imakeidx. % \usepackage{imakeidx} % \makeindex[intoc,columns=2] % \usepackage[refpage,intoc,noprefix]{nomencl} % % Set fixed width so that descriptions in the index of symbols are aligned. % \setlength{\nomlabelwidth}{5cm} % \renewcommand{\nomname}{Index of symbols} % % Make page numbers links % \renewcommand*{\pagedeclaration}[1]{\unskip, \hyperpage{#1}} % \makenomenclature% % Used in hyperref's setup, and must be loaded before tikz-cd. \usepackage[dvipsnames]{xcolor} \definecolor{Diag1}{RGB}{0,0,255} \definecolor{Diag2}{RGB}{255,0,0} \usepackage[most]{tcolorbox} \usepackage{tikz-cd} \usepackage[ colorlinks=true % Remove the boxes , linktocpage=true % Make page numbers (not section titles) links in ToC , linkcolor=NavyBlue % Colour for internal links , citecolor=Green % Colour for bibliographical citations , urlcolor=BrickRed % Colour for (external) urls ]{hyperref} \usepackage[noabbrev,capitalise]{cleveref} \newcommand{\creflastconjunction}{, and\nobreakspace} \creflabelformat{equation}{#2\textup{#1}#3} % Write Equation x.y.z % instead of Equation (x.y.z) \Crefname{judgement}{Judgement}{Judgements} \Crefname{diagram}{Diagram}{Diagrams} \Crefname{rule}{Rule}{Rules} % Label tables just like equations, theorems, definitions, etc. % % NB: This can be confusing if LaTeX does not place the table at the point of % writing (e.g. for lack of space)! 
\numberwithin{equation}{section} % Colours are as in Andrej Bauer's notes on realizability: % https://github.com/andrejbauer/notes-on-realizability \colorlet{ShadeOfPurple}{blue!5!white} \colorlet{ShadeOfYellow}{yellow!5!white} \colorlet{ShadeOfGreen} {green!5!white} \colorlet{ShadeOfBrown} {brown!10!white} % Add a blue for principles \colorlet{ShadeOfBlue}{cyan!5!white} % But we also shade proofs \colorlet{ShadeOfGray} {gray!10!white} \declaretheorem[sibling=equation]{theorem} \declaretheorem[sibling=theorem]{lemma} \declaretheorem[sibling=theorem]{proposition} \declaretheorem[sibling=theorem]{corollary} \declaretheorem[sibling=theorem,style=definition]{definition} \declaretheorem[sibling=theorem,style=remark]{example} \declaretheorem[sibling=theorem,style=remark]{remark} \declaretheorem[style=definition,name=Guiding principle for groupoids,numbered=no]{principle-groupoid} \declaretheorem[style=definition,name=Guiding principle for categories,numbered=no]{principle-category} % Now we set the shading using the tcolorbox package. % % The related thmtools' option "shaded" and the package mdframed seem to have % issues: the former does not allow for page breaks in shaded environments and % the latter puts double spacing between two shaded environments. % % Since tcolorbox puts stuff inside a minipage or \parbox (according to this % stackexchange answer: https://tex.stackexchange.com/a/250170), new % paragraphs aren't indented. We can fix this by grabbing the parindent % value and passing it to tcbset. \newlength{\normalparindent} \AtBeginDocument{\setlength{\normalparindent}{\parindent}} \newlength{\normalparskip} \AtBeginDocument{\setlength{\normalparskip}{\parskip}} \tcbset{shadedenv/.style={ colback={#1}, frame hidden, enhanced, breakable, boxsep=0pt, left=2mm, right=2mm, % LaTeX thinks this is too wide (as becomes clear from the many "Overfull % \hbox" warnings, but optically it looks spot on. 
add to width=1.1mm, enlarge left by=-0.6mm, before upper={\setlength{\parindent}{\normalparindent}% \setlength{\parskip}{\normalparskip}} }} \newcommand{\setenvcolor}[2]{% \tcolorboxenvironment{#1}{shadedenv={#2}} \addtotheorempreheadhook[#1]{\tikzcdset{background color=#2}} } % \setenvcolor{theorem}{ShadeOfPurple} \setenvcolor{lemma}{ShadeOfPurple} \setenvcolor{proposition}{ShadeOfPurple} \setenvcolor{corollary}{ShadeOfPurple} \setenvcolor{definition}{ShadeOfYellow} \setenvcolor{example}{ShadeOfGreen} \setenvcolor{remark}{ShadeOfBrown} \setenvcolor{principle-groupoid}{ShadeOfBlue} \setenvcolor{principle-category}{ShadeOfBlue} \setenvcolor{proof}{ShadeOfGray} \declaretheorem[sibling=theorem,style=remark,numbered=no]{claim} \usepackage{xspace} \usepackage{quiver} \usetikzlibrary{nfold, backgrounds, decorations.pathmorphing, positioning} \tikzcdset{column sep/smaller/.initial=0em} \tikzcdset{arrow style = tikz, diagrams={>=stealth}} \tikzcdset{Rightarrow/.append style ={nfold}} \usepackage{adjustbox} \usepackage{cellspace} \usepackage{makecell} \setlength\cellspacetoplimit{5pt} \setlength\cellspacebottomlimit{5pt} \newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}} \usepackage{ebproof} \usepackage{mathpartir} \usepackage{subcaption} \usepackage{float} \usepackage{afterpage} \usepackage{listings} \lstdefinestyle{cattstyle}{ keywordstyle=\color{Diag1}, keywordstyle=[2]\color{Diag2}, basicstyle=\ttfamily, breaklines=true, keepspaces=true, belowskip=0pt, } \lstset{style=cattstyle} \lstdefinelanguage{Catt}{ keywords=[1]{def,normalise,assert,size,in}, keywords=[2]{coh,comp,id} } \usepackage{fontspec} \usepackage{fancyvrb} %\setmonofont[Scale=0.8]{Hack Nerd Font Mono} \hfuzz=1.5pt \def\su{\textsf{su}\xspace} \def\sua{\textsf{sua}\xspace} \def\sa{\textsf{sa}\xspace} \def\Catt{\textsc{Catt}\xspace} \def\Cattsua{\textsc{Catt}\textsubscript{\sua}\xspace} \def\Cattsu{\textsc{Catt}\textsubscript{\su}\xspace} \def\Cattsa{\textsc{Catt}\textsubscript{\sa}\xspace} \def\Cattr{\textsc{Catt}\textsubscript{\(\mathcal{R}\)}\xspace} \def\Group{\textsf{Group}\xspace} \def\Reg{\textsf{Reg}\xspace} \def\Std{\textsf{Std}\xspace} \def\dr{\textsf{dr}\xspace} \def\ecr{\textsf{ecr}\xspace} \def\prune{\textsf{prune}\xspace} \def\insert{\textsf{insert}\xspace} \newcommand\id{\ensuremath{\mathsf{id}}} \newcommand\proj{\ensuremath{\mathsf{proj}}} \newcommand*{\Coh}[3]{\ensuremath\mathsf{Coh}_{(#1\,;\,#2)}[#3]} \newcommand*{\SCoh}[3]{\ensuremath\mathsf{SCoh}_{(#1\,;\,#2)}[#3]} \newcommand*{\Ctx}{\ensuremath{\mathsf{Ctx}}} \newcommand*{\Tree}{\ensuremath{\mathsf{Tree}}} \newcommand*{\Sub}{\ensuremath{\mathsf{Sub}}} \newcommand*{\Type}{\ensuremath{\mathsf{Type}}} \newcommand*{\SType}{\ensuremath{\mathsf{SType}}} \newcommand*{\Term}{\ensuremath{\mathsf{Term}}} \newcommand*{\STerm}{\ensuremath{\mathsf{STerm}}} \newcommand*{\arr}[3]{{#1 \to_{#2} #3}} \newcommand*{\sub}[1]{\ensuremath{\llbracket #1 \rrbracket}} \newcommand*{\bound}[2]{\ensuremath{\partial_{#1}({#2})}} \newcommand*{\bdry}[3]{\ensuremath{\partial_{#1}^{#2}({#3})}} \newcommand*{\incbd}[3]{\ensuremath{\delta_{#1}^{#2}({#3})}} \newcommand*{\incbdpath}[3]{\ensuremath{\mathrm{I}_{#1}^{#2}({#3})}} \newcommand*{\stdcoh}[2]{\mathcal{C}_{#1}^{#2}} \newcommand*{\stdty}[2]{\mathcal{U}_{#1}^{#2}} \newcommand*{\stdtm}[2]{\mathcal{T}_{#1}^{#2}} \newcommand*{\stdlbl}[2]{\mathcal{L}_{#1}^{#2}} \newcommand*{\unrestrict}{\mathop\downarrow} \newcommand*{\unrestrictfull}{\mathop{\downarrow\downarrow}} \newcommand*{\restrict}{\mathop\uparrow} \newcommand*{\Dyck}{\mathsf{Dyck}} 
\newcommand*{\Peak}{\mathsf{Peak}} \newcommand*{\Path}{\mathsf{Path}} \newcommand*{\MaxPath}{\mathsf{MaxPath}} \newcommand*{\SPath}{\mathsf{SPath}} \newcommand*{\SOther}{\mathsf{SOther}} \newcommand*{\Inc}{\mathsf{Inc}} \newcommand*{\UDPeak}{\Updownarrow_{\mathsf{pk}}} \newcommand*{\UpPeak}{\Uparrow_{\mathsf{pk}}} \newcommand*{\DownPeak}{\Downarrow_{\mathsf{pk}}} \newcommand*{\eval}{\mathsf{eval}} \renewcommand*{\quote}{\mathsf{quote}} \newcommand*{\red}{\rightsquigarrow} \newcommand*{\redr}{\rightsquigarrow_{\mathcal{R}}} \newcommand*{\redrts}{\leftrightsquigarrow_{\mathcal{R}}} \DeclareMathOperator{\doubleplus}{+\kern-1ex+} \newcommand\emp{{[\kern3pt]}} \newcommand*{\insertion}[3]{\ensuremath{#1\mathop{\mathord{\ll}_{#2}}#3}} \newcommand*{\insertionprime}[3]{\ensuremath{#1\mathop{\mathord{\ll'}_{#2}}#3}} \renewcommand*{\th}{\ensuremath{\mathsf{th}}} \newcommand*{\bh}{\ensuremath{\mathsf{bh}}} \newcommand*{\lh}{\ensuremath{\mathsf{lh}}} \newcommand*{\+}{\mathbin{\#}} \DeclareMathOperator*{\bighash}{\text{\LARGE \(\+\)}} \renewcommand*{\sc}{\ensuremath{\mathsf{sc}}} \newcommand*{\U}{\mathbf{U}} \DeclareMathOperator{\FV}{FV} \DeclareMathOperator{\DC}{DC} \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Supp}{Supp} \DeclareMathOperator{\replace}{replace} \DeclareMathOperator{\drop}{drop} \DeclareMathOperator{\ty}{Ty} \DeclareMathOperator{\tm}{Tm} \DeclareMathOperator{\wk}{wk} \DeclareMathOperator{\src}{src} \DeclareMathOperator{\tgt}{tgt} \DeclareMathOperator{\base}{base} \DeclareMathOperator{\N}{N} \DeclareMathOperator{\inc}{inc} \DeclareMathOperator{\fst}{fst} \DeclareMathOperator{\snd}{snd} \DeclareMathOperator{\dep}{dep} \DeclareMathOperator{\len}{len} \DeclareMathOperator{\ext}{ext} \makeatletter \providecommand{\leftsquigarrow}{% \mathrel{\mathpalette\reflect@squig\relax}% } \newcommand{\reflect@squig}[2]{% \reflectbox{$\m@th#1\rightsquigarrow$}% } \makeatother \newcommand{\olsi}[1]{\,\overline{\!{#1}}} % overline short italic \newcommand*{\module}[1]{% \href{https://alexarice.github.io/catt-agda/#1.html}{#1}} \newcommand*{\funcn}[3]{% \href{https://alexarice.github.io/catt-agda/#1.html\##2}{#3}} \newcommand*{\func}[2]{\funcn{#1}{#2}{#2}} \newlist{lemmaenum}{enumerate}{1} % should only occur inside lemma env. \setlist[lemmaenum]{label=(\roman*),ref=\thelemma(\roman*)} \crefalias{lemmaenumi}{lemma} \addbibresource{higher_categories_thesis.bib} \title{A type-theoretic approach to semistrict higher categories} %% The full name of the author (e.g.: James Smith): \author{Alex Rice} %% College affiliation: \college{Darwin College} %% College shield: %\collegeshield{CollegeShields/Darwin} %% Submission date [optional]: \submissiondate{18\textsuperscript{th} April 2024} %% Declaration date: \date{18\textsuperscript{th} April 2024} %% PDF meta-info: \subjectline{Computer Science} \keywords{category theory, higher category theory, type theory} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Abstract: %% \abstract{% Weak \(\infty\)-categories are known to be more expressive than their strict counterparts, but are more difficult to work with, as constructions in such a category involve the manipulation of explicit coherence data. This motivates the search for definitions of semistrict \(\infty\)-categories, where some, but not all, of the operations have been strictified. 
We introduce a general framework for adding definitional equality to the type theory \Catt, a type theory introduced by \citeauthor{finster2017type} whose models correspond to globular weak \(\infty\)-categories. Adding equality to this theory causes the models to exhibit \emph{semistrict} behaviour, trivialising some operations while leaving others weak. The framework consists of a generalisation of \Catt extended with an equality relation generated by an arbitrary set of equality rules \(\mathcal{R}\), which we name \Cattr. We study this framework in detail, formalising much of its metatheory in the proof assistant Agda, and studying how certain operations of \Catt behave in the presence of definitional equality. The main contribution of this thesis is to introduce two type theories, \Cattsu and \Cattsua, which are instances of this general framework. \Cattsu, short for \Catt with strict units, is a variant of \Catt where the unitor isomorphisms trivialise to identities. It is primarily generated by a reduction we call \emph{pruning}, which removes identities from composites, simplifying their structure. \Cattsua, which stands for \Catt with strict units and associators, trivialises both the associativity and unitality operations of \Catt, and is generated by a generalisation of pruning called \emph{insertion}. Insertion merges multiple composites into a single operation, flattening the structure of terms in the theory. Further, we provide reduction systems that generate the equality of \Cattsu and \Cattsua respectively, and prove that these reduction systems are strongly terminating and confluent. We therefore prove that the equality, and hence typechecking, of both theories is decidable. This is used to give an implementation of these type theories, which uses an approach inspired by normalisation by evaluation to efficiently find normal forms for terms. We further introduce a bidirectional typechecking algorithm used by the implementation which allows for terms to be defined in a convenient syntax where many arguments can be left implicit. } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Acknowledgements: %% \acknowledgements{% I would firstly like to thank everyone that I have collaborated with over the course of my PhD, both for their contributions to the work that appears in this thesis, and for their contributions to my development as a researcher. I would especially like to thank my supervisor, Jamie Vicary, whose guidance throughout was invaluable, for keeping my research on track despite the disruptions caused by the pandemic during the first years of my PhD. I would also like to thank all the friends who have been with me at any point in this journey. I particularly want to show my appreciation (and apologise) to everyone who was bombarded with technical questions throughout the writing up of this text; I thoroughly enjoyed our discussions on correct typesetting and use of the English language. Lastly, I would like to thank my family for supporting me throughout my entire education. I would not have made it to this point without them. } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Contents: %% \begin{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Title page, abstract, declaration etc.: %% - the title page (is automatically omitted in the technical report mode).
\frontmatter{} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% Thesis body: %% \chapter*{Introduction} \addcontentsline{toc}{chapter}{Introduction} The study of higher-dimensional structures is becoming more prevalent in both mathematics and computer science. \emph{Higher categories}~\cite{leinster2004higher,riehl2022elements}, a broad term for many different generalisations of categories which capture these higher-dimensional ideas, are a central tool for studying these structures. The ``higher'' nature of these categories typically corresponds to the existence of morphisms whose source and target may be other morphisms, instead of just objects. A common method of organising this data is by giving a set of \(n\)-cells for each \(n \in \mathbb{N}\). A \(0\)-cell then corresponds to the objects of an ordinary category, and the source and target of an \((n+1)\)-cell are given by \(n\)-cells. These higher categories appear in many forms, and have been organised into a periodic table of categories~\cite{cheng2007periodic,cheng2007periodic2}. Of particular interest are the \((n,k)\)-categories for \(n,k \in \mathbb{N} \cup \{\infty\}\), higher categories which contain \(m\)-cells for \(m \leq n\), and whose \(m\)-cells are invertible for \(m > k\). In mathematics, the study of \((\infty,0)\)-categories, known as \(\infty\)-groupoids, is motivated by the study of the homotopy structure of topological spaces~\cite{Bourbaki2016}, where \(0\)-cells are given by points of the topological space, \(1\)-cells by paths, and higher cells take the form of homotopies between lower cells. In computer science, many applications have been found for \((n,n)\)-categories for small \(n\), more commonly referred to as \(n\)-categories, including quantum computing~\cite{Heunen2019-jt}, logic~\cite{Barr1991,mellies2009categorical}, physics~\cite{Baez1995}, and game theory~\cite{ghani2018compositional}, among others~\cite{Street2012}. The composition of \(1\)-cells in an \(n\)-category functions identically to the composition of morphisms in a \(1\)-category; two morphisms \(f : x \to y\) and \(g : y \to z\) can be composed to form a \(1\)-cell \(f * g : x \to z\).
However, there are two distinct ways of composing \(2\)-cells, depicted by the diagrams below: % https://q.uiver.app/#q=WzAsNSxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbNSwwLCJcXGJ1bGxldCJdLFs2LDAsIlxcYnVsbGV0Il0sWzAsMSwiIiwwLHsiY3VydmUiOi01fV0sWzAsMSwiIiwyLHsiY3VydmUiOjV9XSxbMCwxXSxbMiwzLCIiLDEseyJjdXJ2ZSI6LTN9XSxbMiwzLCIiLDEseyJjdXJ2ZSI6M31dLFszLDQsIiIsMSx7ImN1cnZlIjotM31dLFszLDQsIiIsMSx7ImN1cnZlIjozfV0sWzcsNSwiXFxhbHBoYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNiw3LCJcXGJldGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzksOCwiXFxnYW1tYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTEsMTAsIlxcZGVsdGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-5, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=18pt}, from=1-5, to=1-6] \arrow[""{name=5, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-6, to=1-7] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=18pt}, from=1-6, to=1-7] \arrow["\beta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\alpha"', shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\gamma"', shorten <=5pt, shorten >=5pt, Rightarrow, from=4, to=3] \arrow["\delta"', shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \end{tikzcd} \] These diagrams mirror the concept of commutative diagrams for \(1\)-categories, where the regions of the diagram that would represent an equality have been replaced by \(2\)-cell arrows. The first of these composites composes two \(2\)-cells \(\alpha\) and \(\beta\) along a shared \(1\)-cell boundary, creating the vertical composite \(\alpha \star_1 \beta\). The second composes the \(2\)-cells \(\gamma\) and \(\delta\) along a \(0\)-cell boundary and creates the horizontal composite \(\gamma \star_0 \delta\). In higher dimensions, the pattern continues: there are \(n\) distinct ways of composing two \(n\)-cells. For each \(n\)-cell, there is also an identity \((n+1)\)-cell. Similarly to \(1\)-categories, \(n\)-categories must satisfy various laws concerning their operations. These can be roughly organised into 3 groups: \begin{itemize} \item Associativity laws: Each of the composition operations in an \(n\)-category is associative. \item Unitality laws: The identity morphisms are a left and right unit for the appropriate composition operations; schematic forms of the associativity and unitality laws are recorded below. \item Interchange laws: These laws govern the relation between different compositions on the same cells.
For any four \(2\)-cells that form the following diagram: % https://q.uiver.app/#q=WzAsNCxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzYsMF0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6NX1dLFswLDFdLFsxLDMsIiIsMix7ImN1cnZlIjotNX1dLFsxLDMsIiIsMix7ImN1cnZlIjo1fV0sWzEsM10sWzYsNCwiXFxiZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs1LDYsIlxcYWxwaGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzgsOSwiXFxnYW1tYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOSw3LCJcXGRlbHRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=30pt}, from=1-3, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-3, to=1-5] \arrow["\beta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\alpha"', shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\gamma"', shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["\delta"', shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \end{tikzcd}\] the first of the interchange laws states that the two composites below are related: \[ (\alpha \star_1 \beta) \star_0 (\gamma \star_1 \delta) \simeq (\alpha \star_0 \gamma) \star_1 (\beta \star_0 \delta)\] \end{itemize} These laws can be combined to create non-trivial emergent behaviour in a form not seen in the theory of \(1\)-categories. One critical example of this is known as the \emph{Eckmann-Hilton} argument~\cite{eckmann1962group}, which states that two scalars, morphisms from the identity to the identity, commute under composition. The argument proceeds by moving the two scalars around each other, as depicted in \cref{fig:eh}. This crucially uses both the interchange and unitality laws.
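To make the first two groups of laws concrete, we can record them schematically for vertical composition: for \(2\)-cells \(\alpha : f \to g\), \(\beta : g \to h\), and \(\gamma : h \to i\), the associativity and unitality laws read \[ (\alpha \star_1 \beta) \star_1 \gamma \simeq \alpha \star_1 (\beta \star_1 \gamma) \qquad \id(f) \star_1 \alpha \simeq \alpha \simeq \alpha \star_1 \id(g)\] with analogous laws for every composition operation, in every dimension and codimension. The nature of the relation \(\simeq\) appearing in these laws is discussed below.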
\newsavebox{\ehalpha} \savebox{\ehalpha}{\adjustbox{scale=0.8}{ \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=10pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=10pt}, from=1-2, to=1-3] \arrow["\alpha"', color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\id"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}} \newsavebox{\ehbeta} \savebox{\ehbeta}{\adjustbox{scale=0.8}{ \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=10pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-10pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=10pt}, from=1-2, to=1-3] \arrow["\id"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\beta"', color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}} \newsavebox{\ehlefttop} \savebox{\ehlefttop}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7),, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\alpha", color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehrighttop} \savebox{\ehrighttop}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\beta", color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehleftbot} \savebox{\ehleftbot}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7),, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\alpha", color=Diag1, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \newsavebox{\ehrightbot} \savebox{\ehrightbot}{ \adjustbox{scale=1}{% \begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, 
anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7),, from=1-1, to=1-2] \arrow["\beta", color=Diag2, shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\id", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}} \begin{figure}[ht] \centering \[ \begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \&\& \bullet \& \simeq \& \bullet \&\&\&\&\& \bullet \& \simeq \& \bullet \&\&\& \bullet \&\&\& \bullet \\ \\ \&\&\&\&\&\&\&\&\&\&\&\&\&\& \simeq \\ \\ \bullet \&\& \bullet \& \simeq \& \bullet \&\&\&\&\& \bullet \& \simeq \& \bullet \&\&\& \bullet \&\&\& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "\id", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "\id"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "\id"{description}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, draw=none, controls=+(90:1.8) and +(90:1.8), from=1-5, to=1-10] \arrow[""{name=4, anchor=center, inner sep=0}, draw=none, controls=+(90:-1.8) and +(90:-1.8), from=1-5, to=1-10] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-5, to=1-10] \arrow[""{name=6, anchor=center, inner sep=0}, draw=none, controls=+(90:1.8) and +(90:1.8), from=5-5, to=5-10] \arrow[""{name=7, anchor=center, inner sep=0}, draw=none, controls=+(90:-1.8) and +(90:-1.8), from=5-5, to=5-10] \arrow[""{name=8, anchor=center, inner sep=0}, from=5-5, to=5-10] \arrow[""{name=9, anchor=center, inner sep=0}, "\id", curve={height=-24pt}, from=5-1, to=5-3] \arrow[""{name=10, anchor=center, inner sep=0}, "\id"', curve={height=24pt}, from=5-1, to=5-3] \arrow[""{name=11, anchor=center, inner sep=0}, "\id"{description}, from=5-1, to=5-3] \arrow[""{name=12, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-12, to=1-15] \arrow[""{name=13, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-12, to=1-15] \arrow[""{name=14, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-15, to=1-18] \arrow[""{name=15, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-15, to=1-18] \arrow[""{name=16, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=5-12, to=5-15] \arrow[""{name=17, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=5-12, to=5-15] \arrow[""{name=18, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=5-15, to=5-18] \arrow[""{name=19, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=5-15, to=5-18] \arrow["\alpha"', color=Diag1, shorten <=3pt, shorten >=5pt, Rightarrow, from=1, to=2] \arrow["\beta"', color=Diag2, shorten <=5pt, shorten >=3pt, Rightarrow, from=2, to=0] \arrow["\beta"', color=Diag2, shorten <=3pt, shorten >=5pt, Rightarrow, from=10, to=11] \arrow["\alpha"', color=Diag1, shorten <=5pt, shorten >=3pt, Rightarrow, from=11, to=9] \arrow["\usebox{\ehalpha}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=5] \arrow["\usebox{\ehbeta}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=3] \arrow["\usebox{\ehbeta}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=7, to=8] \arrow["\usebox{\ehalpha}"{description,inner sep = 0,xshift = -1.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=8, to=6] 
\arrow["\usebox{\ehlefttop}"{description,inner sep = 0,xshift = -1.3pt, yshift = 0.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=13, to=12] \arrow["\usebox{\ehrighttop}"{description,inner sep = 0,xshift = -1.3pt,yshift = 0.2pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=15, to=14] \arrow["\usebox{\ehleftbot}"{description,inner sep = 0,xshift = -1.3pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=17, to=16] \arrow["\usebox{\ehrightbot}"{description,inner sep = 0,xshift = -1.3pt}, shorten <=3pt, shorten >=3pt, Rightarrow, from=19, to=18] \arrow[controls=+(90:1.8) and +(90:1.8), from=1-5, to=1-10] \arrow[controls=+(90:-1.8) and +(90:-1.8), from=1-5, to=1-10] \arrow[controls=+(90:1.8) and +(90:1.8), from=5-5, to=5-10] \arrow[controls=+(90:-1.8) and +(90:-1.8), from=5-5, to=5-10] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-12, to=1-15] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-12, to=1-15] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-15, to=1-18] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-15, to=1-18] \arrow[controls=+(80:1.5) and +(100:1.5), from=5-12, to=5-15] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=5-12, to=5-15] \arrow[controls=+(80:1.5) and +(100:1.5), from=5-15, to=5-18] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=5-15, to=5-18] \end{tikzcd} \] \caption{The Eckmann-Hilton argument.} \label{fig:eh} \end{figure} \paragraph{Semistrict higher categories} While we have given the types of laws that must hold in \(n\)-categories, we have not yet stated the full nature of these laws. By taking each of these laws to hold up to equality, one obtains the notion of a \emph{strict} \(n\)-category. It is often the case in category theory that equality is the incorrect notion by which to compare objects, with the coarser relation of isomorphism being preferable. In the presence of higher-dimensional cells, arrows themselves can be compared up to isomorphism. This allows the laws for an \(n\)-category to be stated with isomorphism replacing equality, giving rise to the notion of \emph{weak} \(n\)-category. In such a weak \(n\)-category, each law is given by a set of isomorphisms, which are given as part of the data of the category. For the associativity law of three \(1\)-cells \(f\), \(g\), and \(h\), an invertible \(2\)-cell known as the \emph{associator} must be given, which takes the following form: \[ \alpha_{f,g,h} : (f * g) * h \to f * (g * h)\] Similarly, the unit laws for a \(1\)-cell \(f\) are given by the \emph{left unitor} \(\lambda_f\) and the \emph{right unitor} \(\rho_f\) which take the following form: \[ \lambda_f : \id * f \to f \qquad \rho_f : f * \id \to f\] Whereas two morphisms being equal is a property of those morphisms, an isomorphism between the same morphisms is a form of data, and the choice of isomorphism may not be unique. Weak higher categories therefore contain higher \emph{coherence laws} which govern the interaction of these isomorphisms. These coherence laws can also be given as isomorphisms instead of equalities, and must satisfy their own coherence laws, leading to a tower of coherence laws. The amount of data needed to define an \(n\)-category therefore increases exponentially as \(n\) increases. In addition to the difficulty in defining a weak \(n\)-category, it is also more difficult to give proofs in a weak environment, due to the bureaucracy of working around the various coherence isomorphisms. Consider the proof of Eckmann-Hilton given in \cref{fig:eh}. 
In a weak environment, we would hope to be able to simply replace each equality by the appropriate isomorphism; however, doing so for the first equality in the proof would require us to give an isomorphism: \[ \alpha \cong \alpha * \id\] The two sides of this isomorphism have different sources and targets, and hence no such isomorphism can be given in the globular setting used in this thesis. A full proof of Eckmann-Hilton is still possible but far more involved. Weak categories are a more general notion than their strict counterparts, with every strict \(n\)-category generating a corresponding weak category by letting every coherence isomorphism be given by the identity morphism. For \(2\)-categories, the converse in fact holds: every weak \(2\)-category is equivalent to a strict \(2\)-category, allowing proofs about weak \(2\)-categories to be given by instead proving the same property for strict \(2\)-categories. This is no longer the case for \(n\)-categories where \(n \geq 3\). It was shown by \citeauthor{simpson1998homotopy}~\cite{simpson1998homotopy} that strict \(n\)-categories do not model the homotopy structure of all topological spaces, with the topological space \(S^2\) having no such interpretation. More concretely, we consider the morphism \(\mathsf{EH}_{\alpha,\beta} : \alpha \star_1 \beta \to \beta \star_1 \alpha\) generated by the Eckmann-Hilton argument for scalars \(\alpha\) and \(\beta\). In a strict \(3\)-category, this morphism is given by the identity and so: \[ \mathsf{EH}_{\alpha,\beta} \star_2 \mathsf{EH}_{\beta,\alpha} = \id\] This equality does not hold in a general weak \(3\)-category (even up to isomorphism), contradicting the claim that every weak \(3\)-category is equivalent to a strict \(3\)-category. This motivates the search for semistrict definitions of \(n\)-category: definitions in which some operations are strict, yet which do not lose the expressivity of weak \(n\)-categories. For \(3\)-categories, two such definitions have been proposed: \begin{itemize} \item \citeauthor{joyal2006weak}~\cite{joyal2006weak,joyalcoherence} define a monoidal \(2\)-category (which can be viewed as a \(3\)-category with a single \(0\)-cell) which only has weak units and unitors, and is otherwise strict. They prove that all braided monoidal categories (weak \(3\)-categories with a unique \(0\)-cell and unique \(1\)-cell) can be interpreted in this setting as the category of endomorphisms on the weak unit morphism. \item Gray-categories are a form of semistrict \(3\)-category for which all structure is strict except the interchanger, the isomorphism witnessing the interchange law. \citeauthor{gordon1995coherence}~\cite{gordon1995coherence} prove that every weak \(3\)-category is equivalent to a Gray-category. \end{itemize} It is non-trivial even to define such a notion of semistrict \(n\)-category for \(n > 3\), let alone prove that it loses no expressivity over its weak counterpart. Simpson conjectures~\cite{simpson1998homotopy} that having only the unit laws weak is sufficient to model all homotopy groupoids, \(\infty\)-groupoids arising from the homotopy of topological spaces, though it is unclear if such a definition has been given. \citeauthor{hadzihasanovic2019representable}~% \cite{hadzihasanovic2019representable} defines weak higher categories based on \emph{diagrammatic sets}.
It could be argued that such a definition can model strict interchange, though the classes of diagrams that can be composed in this theory are restricted to those that are \emph{spherical}, which disallows horizontal composites in the form stated above and makes comparison difficult. \citeauthor{Batanin2013}~\cite{Batanin2013} define a notion of \(\infty\)-category with strict units based on the language of operads. % A key axiom in this theory is \emph{disc reduction} which % states that composites trivialise over certain configurations of % cells known as discs. Definitions of semistrict \(n\)-categories which are strictly unital and associative have also been given, primarily inspired by the graphical language of \emph{string diagrams}. \citeauthor{bar2017data}~\cite{bar2017data} define \emph{quasi-strict \(4\)-categories}, where the associativity and unitality laws hold strictly. \citeauthor{dorn2018associative}~\cite{dorn2018associative} defines \emph{associative \(n\)-categories}: a notion of strictly associative and unital \(n\)-category, similarly based on geometric principles. Associative \(n\)-categories are further studied by Heidemann, Reutter, Tataru, and Vicary~\cite{reutter2019high,heidemann2022zigzag,tataru2024theory}, work which has recently led to the construction of the graphical proof assistant \textsf{homotopy.io}~\cite{corbyn2024homotopy} for manipulating higher-dimensional string diagrams. Similarly to the case for diagrammatic sets, the composition operations in these theories have a different form to those of strict \(n\)-categories, making comparison difficult. The connection between these definitions and geometry is studied by \citeauthor{dorn2021framed}~\cite{dorn2021framed} and \citeauthor{heidemann2023framed}~\cite{heidemann2023framed}. \paragraph{Type theory and higher categories} Deep links exist between higher category theory and type theory. The identity type in Martin-Löf type theory (\textsc{Mltt})~\cite{MARTINLOF197573} naturally leads to higher-dimensional structure; the identity type \(s =_A t\) can be formed for any two terms \(s\) and \(t\) of type \(A\), but this construction can be iterated since the identity type is a type itself, leading to higher identity types \(p =_{s =_A t} q\) for \(p, q: s =_A t\). Operations on this type are generated by the J-rule, an induction principle for the identity type. Independent proofs by \citeauthor{lumsdaine2010weak}~\cite{lumsdaine2010weak} and \citeauthor{garner2011types}~\cite{garner2011types} show that the J-rule is sufficient to equip identity types with the appropriate operations to form a weak \(\infty\)-groupoid. Terms of the identity type \(s =_A t\) correspond to witnesses of the fact that \(s\) and \(t\) are equal, or can even be viewed as proofs of the equality. The treatment of these proofs as objects of study in their own right is known as \emph{proof relevance}. Although the axiom of uniqueness of identity proofs (UIP), which states that any two terms of the identity type are themselves equal, is consistent with \textsc{Mltt}, \citeauthor{hofmannstreicher}~\cite{hofmannstreicher} showed that it is not provable, by constructing a model of \textsc{Mltt} where types are interpreted as \(1\)-groupoids and identity types are non-trivial. The \(\infty\)-groupoidal nature of \textsc{Mltt} is embraced in Homotopy type theory (\textsc{Hott})~\cite{hottbook}, where types are interpreted as topological spaces.
The key component of \textsc{Hott}, the \emph{univalence axiom}, which is incompatible with UIP, states that the identities between types are given by equivalences between these types, which need not be unique. The models of \textsc{Hott} are equipped with more structure than is present in an \(\infty\)-groupoid, and are given by \(\infty\)-toposes~\cite{shulman2019all}. In the appendices of his thesis~\cite{brunerie2016homotopy}, \citeauthor{brunerie2016homotopy} defines a type theory for \(\infty\)-groupoids by removing all structure from \textsc{Mltt} which does not concern the identity type. This theory constructs the identity type similarly to \textsc{Mltt}, but replaces the J-rule with a rule stating that all terms over \emph{contractible contexts} are equal. \citeauthor{finster2017type} further refine this idea to produce the type theory \Catt~\cite{finster2017type}, a type theory for weak \(\infty\)-categories, using techniques from a definition of weak \(\infty\)-categories due to \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck}, which is itself based on an earlier definition of \(\infty\)-groupoids given by \citeauthor{PursuingStacks}~\cite{PursuingStacks}. It was later shown~\cite{benjamin2021globular} that type-theoretic models of \Catt coincide with the \(\infty\)-categories defined by \citeauthor{maltsiniotis2010grothendieck}. The type theory \Catt is unusual in having no computation or equality rules. In the current work we leverage this to define new notions of semistrict \(\infty\)-category, by adding definitional equality to \Catt. This equality unifies certain terms, which correspond to operations in a weak \(\infty\)-category, causing the semistrict behaviour of the resulting theories. This thesis develops a framework for working with equality relations in \Catt, and uses this to define two new type theories, \Cattsu and \Cattsua: \begin{itemize} \item \Cattsu is a version of \Catt which is strictly unital. It is primarily generated by the \emph{pruning} reduction, a computation rule which removes unnecessary identities from more complex terms. \item \Cattsua is \Catt with strict unitors and associators. In this theory, pruning is replaced by a more general reduction which we call \emph{insertion}, which merges multiple composites into a single composite, flattening the structure of terms in the theory. We claim that the models of \Cattsua give the first algebraic definition of \(\infty\)-category in which the unitality and associativity laws hold strictly. \end{itemize} The majority of the technical content of this thesis is concerned with proving standard metatheoretic properties of these type theories. This includes defining a notion of computation for each theory, given by demonstrating the existence of a confluent and terminating reduction system, which allows these theories to be implemented. This is used to produce interpreters for both theories, allowing complex constructions to be checked mechanically. We demonstrate the utility of this by formalising a proof of the \emph{syllepsis}, a \(5\)-dimensional term witnessing a commutativity property of the Eckmann-Hilton argument. \clearpage \paragraph{Overview} We now give an overview of the content of each of the following chapters of the thesis. \begin{itemize} \item \cref{sec:background} gives an introduction to \(\infty\)-category theory. It defines strict \(\infty\)-categories, and then presents the definition of weak \(\infty\)-categories due to Maltsiniotis.
The chapter ends by presenting the type theory \Catt, as defined by \citeauthor{finster2017type}, and describing some well-known preliminary constructions in \Catt. \item \cref{cha:gener-pres-catt} introduces a general framework for studying variants of \Catt with definitional equality relations generated from a set of rules \(\mathcal{R}\), which we name \Cattr. The chapter also states various properties concerning the metatheory of \Cattr, including conditions on the set of equality rules \(\mathcal{R}\) under which the theory is well-behaved. The description of \Catt in this chapter is comprehensive and self-contained, although it lacks some of the exposition of the previous chapter. The type theory \Cattr is accompanied by an Agda formalisation, which is introduced in this chapter. \item \cref{sec:operations-catt} takes an arbitrary well-behaved variant of \Cattr, and explores various constructions that can be formed in this setting. The primary purpose of this chapter is to introduce the \emph{pruning operation} in \cref{sec:pruning}, and the \emph{insertion operation} in \cref{sec:insertion}. \cref{sec:trees,sec:structured-terms} build up theory about a certain class of contexts represented by trees, and terms that appear in these contexts. This theory is vital for a complete understanding of insertion. \item In \cref{cha:cattstrict}, the type theories \Cattsu and \Cattsua are finally defined in \cref{sec:cattsu,sec:cattsua} respectively, as variants of the framework \Cattr. Preliminary results about both theories are proved, primarily by compiling results that have been stated in the previous two chapters. The main technical contribution of this chapter is to give reduction systems for both theories, and to prove that these reduction systems are strongly terminating and globally confluent, hence making equality in these theories decidable. In \cref{sec:towards-nbe}, the decidability of equality is used to implement a typechecker for both theories \Cattsu and \Cattsua. The typechecker uses \emph{normalisation by evaluation} (NbE) to reduce terms to a canonical form where they can be checked for equality. The section discusses the interaction of NbE with \Catt, as well as the limitations of this approach in this setting. \cref{sec:models} discusses some properties of the models of these type theories, introducing a technique which we call \emph{rehydration}, ``padding out'' terms of the semistrict theory with the necessary coherences to produce a term of \Catt which is equivalent to the original term. Rehydration can be seen as a conservativity result for the semistrict theories introduced at the start of the chapter. A proof of rehydration is given for the restricted case of terms over a certain class of contexts known as ps-contexts. This partial rehydration result is sufficient to determine that the semistrictness defined by \Cattsu and \Cattsua is a property: a model of \Catt can be a model of \Cattsu or \Cattsua in at most one way. We further explore some obstructions to rehydration in a generic context. The thesis ends with a discussion of further variants of \Catt and other options for future work. \end{itemize} Although results of later chapters depend on definitions and results of the preceding chapters, a linear reading of this thesis is not essential.
A reader who is already familiar with the type theory \Catt can safely skip \cref{sec:background}, and a reader who is only interested in the type theory \Cattsu could read \cref{cha:gener-pres-catt} followed by \cref{sec:pruning,sec:cattsu}. Similarly, a reader only interested in \Cattsua can ignore any content on the pruning construction. \cref{sec:towards-nbe} may be of interest to a reader who is purely interested in the type-theoretic techniques used, and not the type theory \Catt itself. \paragraph{Statement of authorship} The type theory \Cattsu was originally developed in collaboration with Eric Finster, David Reutter, and Jamie Vicary, and was presented by the author at the Logic in Computer Science conference in 2022~\cite{finster2022type}. \Cattsua will be presented at Logic in Computer Science 2024~\cite{finster2023strictly} and was developed in collaboration with Eric Finster and Jamie Vicary. The author claims the development of the framework \Cattr and its accompanying Agda formalisation as an individual contribution, as well as the implementation of \Cattsu and \Cattsua which appears in \cref{sec:towards-nbe}. \chapter{Background} \label{sec:background} We begin with an overview of the important concepts required for the rest of the thesis. Throughout, we will assume knowledge of various basic concepts from computer science, as well as a basic knowledge of category theory (including functor categories, presheaves, and (co)limits) and type theory. The primary purpose of the following sections is to introduce weak \(\infty\)-categories. While there are many differing definitions of \(\infty\)-categories (see \cite{leinster2001survey}), we focus here on models of the type theory \Catt~\cite{finster2017type}, which are known to be equivalent to a definition of \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck} based on an earlier definition by \citeauthor{PursuingStacks}~\cite{PursuingStacks}, which we introduce in \cref{sec:weak}. In \cref{sec:type-theory-catt}, we define the type theory \Catt, following its original presentation. This section additionally serves to introduce various pieces of syntax and notation which will be used throughout the rest of the thesis. \section{Higher categories} \label{sec:higher-categories} A higher category is a generalisation of the ordinary notion of a category to allow higher-dimensional structure. This manifests as allowing the source or target of an arrow or morphism to be another morphism instead of an object. In this thesis, we are primarily concerned with \(\infty\)-categories, which are equipped with a notion of \(n\)-cell for each \(n \in \mathbb{N}\), where each \((n+1)\)-cell has a source and target \(n\)-cell. The role of objects is played by \(0\)-cells, with \(1\)-cells as the morphisms between these objects. For \(0\)-cells \(x\) and \(y\), a \(1\)-cell \(f\) with source \(x\) and target \(y\) will be drawn as: \[ \begin{tikzcd} x & y \arrow["f", from=1-1, to=1-2] \end{tikzcd} \] or may be written as \(f: x \to y\). Two cells are \emph{parallel} if they have the same source and target. For any two parallel \(n\)-cells \(f\) and \(g\), we have a set of \((n+1)\)-cells between them.
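These source and target assignments are \emph{globular}: writing \(\src\) and \(\tgt\) for them, every \(n\)-cell \(\alpha\) with \(n \geq 2\) satisfies \[ \src(\src(\alpha)) = \src(\tgt(\alpha)) \qquad \tgt(\src(\alpha)) = \tgt(\tgt(\alpha))\] so that, in particular, the source and target of any such cell are themselves parallel. We record these conditions here for orientation; they are implicit in all of the diagrams that follow.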
A \(2\)-cell \(\alpha : f \to g\) may be drawn as: \[ \begin{tikzcd} x & y \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-1, to=1-2] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd} \] A \(3\)-cell \(\gamma\) between parallel \(2\)-cells \(\alpha\) and \(\beta\) could be drawn as: \[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f", curve={height=-15pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"', curve={height=15pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "\alpha", shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow[""{name=3, anchor=center, inner sep=0}, "\beta"', shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\gamma", shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=2, to=3] \end{tikzcd} \] Just as in ordinary \(1\)-category theory, we expect to be able to compose morphisms whose boundaries are compatible. For \(1\)-cells, nothing has changed: given \(1\)-cells \(f: x \to y\) and \(g : y \to z\), we form the composite \(f * g\): \[ \begin{tikzcd} x & y & z \arrow[from=1-1, to=1-2, "f"] \arrow[from=1-2, to=1-3, "g"] \end{tikzcd} \] which has source \(x\) and target \(z\). We pause here to note that composition will be given in ``diagrammatic order'' throughout this thesis, which is the opposite of the order of function composition yet the same as the order of the arrows as drawn above. This is chosen as it will be common for us to draw higher-dimensional arrows in a diagram, and rare for us to consider categories where the higher arrows are given by functions. To avoid confusion, we use an asterisk (\(*\)) to represent composition of arrows or cells in a higher category, and will use a circle (\(\circ\)) only for function composition. In two dimensions, there is no longer a unique composition operation. For \(2\)-cells \(\alpha : f \to g\) and \(\beta : g \to h\), the composite \(\alpha *_1 \beta\) can be formed as before: % https://q.uiver.app/#q=WzAsMixbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzAsMSwiZiIsMCx7ImN1cnZlIjotNH1dLFswLDEsImgiLDIseyJjdXJ2ZSI6NH1dLFswLDEsImciLDFdLFsyLDQsIlxcYWxwaGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzQsMywiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] We refer to this composition as \emph{vertical composition}.
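Reading the boundary off the diagram above, the vertical composite of \(\alpha : f \to g\) and \(\beta : g \to h\) is a \(2\)-cell \[ \alpha *_1 \beta : f \to h\] so the shared boundary cell \(g\) does not appear in the boundary of the composite.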
The cells \(\gamma : i \to j\) and \(\delta : k \to l\) can also be composed in the following way: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTN9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6M31dLFsxLDIsIiIsMix7ImN1cnZlIjotM31dLFsxLDIsIiIsMix7ImN1cnZlIjozfV0sWzMsNCwiXFxhbHBoYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNSw2LCJcXGJldGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "j", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "i"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "l", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "k"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\gamma", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\delta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \] This composition is called the \emph{horizontal composition}, and is written \(\gamma *_0 \delta\). The subscript refers to the dimension of the shared boundary in the composition, with the \(1\)-cell \(g\) being the shared boundary in the vertical composition example and the \(0\)-cell \(y\) being the shared boundary in the horizontal composition example. The dimension of this shared boundary is the \emph{codimension} of the composition. This pattern continues with \(3\)-cells, which can be composed at codimension \(0\), \(1\), or \(2\), as depicted below: % https://q.uiver.app/#q=WzAsNyxbMiwwLCJcXGJ1bGxldCJdLFswLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbNSwwLCJcXGJ1bGxldCJdLFs2LDAsIlxcYnVsbGV0Il0sWzcsMCwiXFxidWxsZXQiXSxbOCwwLCJcXGJ1bGxldCJdLFsxLDAsIiIsMCx7ImN1cnZlIjotM31dLFsxLDAsIiIsMix7ImN1cnZlIjozfV0sWzIsMywiIiwwLHsiY3VydmUiOi00fV0sWzIsMywiIiwyLHsiY3VydmUiOjR9XSxbMiwzXSxbNCw1LCIiLDAseyJjdXJ2ZSI6LTN9XSxbNCw1LCIiLDIseyJjdXJ2ZSI6M31dLFs1LDYsIiIsMix7ImN1cnZlIjotM31dLFs1LDYsIiIsMix7ImN1cnZlIjozfV0sWzgsNywiIiwyLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDcsIiIsMCx7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDcsIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTAsMTEsIiIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTAsMTEsIiIsMCx7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMSw5LCIiLDEseyJvZmZzZXQiOi00LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzExLDksIiIsMSx7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMywxMiwiIiwyLHsib2Zmc2V0IjotMywic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxMywxMiwiIiwwLHsib2Zmc2V0IjozLCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE1LDE0LCIiLDIseyJvZmZzZXQiOi0zLCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE1LDE0LCIiLDAseyJvZmZzZXQiOjMsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTYsMTgsIlxcZ2FtbWEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzE4LDE3LCJcXGRlbHRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyMSwyMiwiXFxnYW1tYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMTksMjAsIlxcZGVsdGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzIzLDI0LCJcXGdhbW1hIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyNSwyNiwiXFxkZWx0YSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet & \bullet && \bullet & \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, 
curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-24pt}, from=1-4, to=1-6] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=24pt}, from=1-4, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, from=1-4, to=1-6] \arrow[""{name=5, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-7, to=1-8] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=18pt}, from=1-7, to=1-8] \arrow[""{name=7, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-8, to=1-9] \arrow[""{name=8, anchor=center, inner sep=0}, curve={height=18pt}, from=1-8, to=1-9] \arrow[""{name=9, anchor=center, inner sep=0}, shift left=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=10, anchor=center, inner sep=0}, shift right=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=11, anchor=center, inner sep=0}, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=12, anchor=center, inner sep=0}, shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow[""{name=13, anchor=center, inner sep=0}, shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow[""{name=14, anchor=center, inner sep=0}, shift left=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \arrow[""{name=15, anchor=center, inner sep=0}, shift right=4, shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \arrow[""{name=16, anchor=center, inner sep=0}, shift left=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \arrow[""{name=17, anchor=center, inner sep=0}, shift right=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=6, to=5] \arrow[""{name=18, anchor=center, inner sep=0}, shift left=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=8, to=7] \arrow[""{name=19, anchor=center, inner sep=0}, shift right=3, shorten <=5pt, shorten >=5pt, Rightarrow, from=8, to=7] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=9, to=11] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=11, to=10] \arrow["", shorten <=3pt, shorten >=3pt, Rightarrow, nfold=3, from=14, to=15] \arrow["", shorten <=3pt, shorten >=3pt, Rightarrow, nfold=3, from=12, to=13] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=16, to=17] \arrow["", shorten <=2pt, shorten >=2pt, Rightarrow, nfold=3, from=18, to=19] \end{tikzcd} \] where the unlabelled arrows and objects (which are written \(\bullet\)) are assumed to represent arbitrary potentially-distinct cells. For every \(n\)-cell \(x\), there is an \((n+1)\)-cell \(\id(x) : x \to x\), called the \emph{identity morphism}. Similarly to 1-categories, \(\infty\)-categories need to satisfy certain laws, which fall into 3 groups: associativity, unitality, and interchange. These laws can hold strictly, meaning that they hold up to equality, or weakly, meaning that they hold up to a higher-dimensional isomorphism. We delay the discussion of weak \(\infty\)-categories to \cref{sec:weak}, and begin with the discussion of strict \(\infty\)-categories. In these strict categories, associativity laws are the same as for 1-categories, only now a law is needed for each composition (in every dimension and codimension). Unitality is again similar to the case for 1-categories, except we again need unitality laws for each composition. We note that for lower-codimensional compositions, an iterated identity is needed. 
For example, given a \(2\)-cell \(\alpha : f \to g\), the appropriate equation for left unitality of horizontal composition is:
\[ \id(\id(x)) *_0 \alpha = \alpha \]
In general, for a unit to be cancelled, it must be iterated a number of times equal to the difference between the dimension and codimension of the composition. Interchange laws do not appear in 1-categories, and specify how compositions of different dimensions interact. The first interchange law states that, for suitable \(2\)-cells \(\alpha\), \(\beta\), \(\gamma\), and \(\delta\):
\[ (\alpha *_0 \gamma) *_1 (\beta *_0 \delta) = (\alpha *_1 \beta) *_0 (\gamma *_1 \delta)\]
This can be diagrammatically depicted as:
\newsavebox{\innertop}
\savebox{\innertop}{
\adjustbox{scale=0.8}{
\begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=12pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}}
\newsavebox{\innerbot}
\savebox{\innerbot}{
\adjustbox{scale=0.8}{
\begin{tikzcd}[ampersand replacement=\&,column sep=small] \bullet \& \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-12pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=12pt}, from=1-2, to=1-3] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["\delta", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \end{tikzcd}}}
\newsavebox{\innerleft}
\savebox{\innerleft}{
\adjustbox{scale=1}{
\begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7), from=1-1, to=1-2] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}}
\newsavebox{\innerright}
\savebox{\innerright}{
\adjustbox{scale=1}{
\begin{tikzcd}[ampersand replacement=\&,column sep=small,cramped] \bullet \& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, controls=+(80:0.7) and +(100:0.7), from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=0}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(100:-0.7) and +(80:-0.7), from=1-1, to=1-2] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \arrow["\delta", shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd}}}
\[ \begin{tikzcd}[column sep=small] \bullet &&&&& \bullet & {=} & \bullet &&& \bullet &&& \bullet \arrow[""{name=0, anchor=center, inner sep=0}, from=1-1, to=1-6] \arrow[""{name=1, anchor=center, inner sep=0}, draw=none, controls=+(90:2) and +(90:2), from=1-1, to=1-6] \arrow[""{name=2, anchor=center, inner sep=0}, draw=none,
controls=+(90:-2) and +(90:-2), from=1-1, to=1-6] \arrow[""{name=4, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-8, to=1-11] \arrow[""{name=5, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-8, to=1-11] \arrow[""{name=6, anchor=center, inner sep=0}, draw=none, controls=+(80:1.5) and +(100:1.5), from=1-11, to=1-14] \arrow[""{name=8, anchor=center, inner sep=0}, draw=none, controls=+(100:-1.5) and +(80:-1.5), from=1-11, to=1-14] \arrow["\usebox{\innertop}"{description, inner sep = 0,xshift = -1.2pt}, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\usebox{\innerbot}"{description, inner sep = 0,xshift = -1.2pt}, shorten <=4pt, shorten >=4pt, Rightarrow, from=0, to=1] \arrow[""{name=1, anchor=center, inner sep=0}, controls=+(90:2) and +(90:2), from=1-1, to=1-6] \arrow[""{name=2, anchor=center, inner sep=0}, controls=+(90:-2) and +(90:-2), from=1-1, to=1-6] \arrow["\usebox{\innerleft}"{description, inner sep = 0,xshift = -1.3pt}, shorten <=2pt, shorten >=2pt, Rightarrow, from=5, to=4] \arrow["\usebox{\innerright}"{description, inner sep = 0,xshift = -1.3pt}, shorten <=2pt, shorten >=2pt, Rightarrow, from=8, to=6] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-8, to=1-11] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-8, to=1-11] \arrow[controls=+(80:1.5) and +(100:1.5), from=1-11, to=1-14] \arrow[controls=+(100:-1.5) and +(80:-1.5), from=1-11, to=1-14] \end{tikzcd} \]
There are also interchange laws for the interaction of composition and identities: a composition of two identities is the same as an identity on the composition of the underlying cells. The \(\infty\)-categories that we study in this thesis will be globular, meaning that their cells form a globular set. A globular set can be seen as a natural extension of the data of a category, whose data can be arranged into the following diagram:
% https://q.uiver.app/#q=WzAsMixbMCwwLCJZIl0sWzEsMCwiWCJdLFswLDEsInMiLDAseyJvZmZzZXQiOi0xfV0sWzAsMSwidCIsMix7Im9mZnNldCI6MX1dXQ== % tex-fmt: skip
\[ \begin{tikzcd} M & O \arrow["s", shift left, from=1-1, to=1-2] \arrow["t"', shift right, from=1-1, to=1-2] \end{tikzcd} \]
where \(O\) is a set of objects, \(M\) is the set of all morphisms, and \(s\) and \(t\) are functions assigning each morphism to its source and target object respectively. \(2\)-cells can be added to this diagram in a natural way:
% https://q.uiver.app/#q=WzAsMyxbMSwwLCJDXzEiXSxbMiwwLCJDXzAiXSxbMCwwLCJDXzIiXSxbMCwxLCJzXzAiLDAseyJvZmZzZXQiOi0xfV0sWzAsMSwidF8wIiwyLHsib2Zmc2V0IjoxfV0sWzIsMCwic18xIiwwLHsib2Zmc2V0IjotMX1dLFsyLDAsInRfMSIsMix7Im9mZnNldCI6MX1dXQ== % tex-fmt: skip
\[ \begin{tikzcd} {C_2} & {C_1} & {C_0} \arrow["{s_0}", shift left, from=1-2, to=1-3] \arrow["{t_0}"', shift right, from=1-2, to=1-3] \arrow["{s_1}", shift left, from=1-1, to=1-2] \arrow["{t_1}"', shift right, from=1-1, to=1-2] \end{tikzcd} \]
In a globular set, the source and target of any cell must be parallel, meaning they share the same source and target. This condition is imposed by \emph{globularity conditions}. Adding these conditions and iterating the process leads to the following definition.
\begin{definition} The category of globes \(\mathbf{G}\) has objects given by the natural numbers and morphisms generated by \(\mathbf{s}_n, \mathbf{t}_n : n \to n + 1\) quotiented by the \emph{globularity conditions}:
\begin{align*} \mathbf{s}_{n+1} \circ \mathbf{s}_n &= \mathbf{t}_{n+1} \circ \mathbf{s}_n\\ \mathbf{s}_{n+1} \circ \mathbf{t}_n &= \mathbf{t}_{n+1} \circ \mathbf{t}_n \end{align*}
The category of globular sets \(\mathbf{Glob}\) is the presheaf category \([\mathbf{G}^{\mathrm{op}}, \mathbf{Set}]\). \end{definition}
Unwrapping this definition, a globular set \(G\) consists of sets \(G(n)\) for each \(n \in \mathbb{N}\), with source and target maps \(s_n, t_n : G(n+1) \to G(n)\), forming the following diagram:
\[ \begin{tikzcd} \cdots & {G(3)} & {G(2)} & {G(1)} & {G(0)} \arrow["{s_0}", shift left, from=1-4, to=1-5] \arrow["{t_0}"', shift right, from=1-4, to=1-5] \arrow["{s_1}", shift left, from=1-3, to=1-4] \arrow["{t_1}"', shift right, from=1-3, to=1-4] \arrow["{t_2}"', shift right, from=1-2, to=1-3] \arrow["{s_2}", shift left, from=1-2, to=1-3] \arrow[shift right, from=1-1, to=1-2] \arrow[shift left, from=1-1, to=1-2] \end{tikzcd} \]
and satisfying the globularity conditions. A morphism of globular sets \(F : G \to H\) is a collection of functions \(G(n) \to H(n)\) which commute with the source and target maps. Given a globular set \(G\), we will call the elements of \(G(n)\) the \(n\)-cells and write \(f : x \to y\) for an \((n+1)\)-cell \(f\) where \(s_n(f) = x\) and \(t_n(f) = y\). We further define the \(n\)-boundary operators \(\delta_n^-\) and \(\delta_n^+\), which take the source or target respectively of an \((n+k)\)-cell \(k\) times, returning an \(n\)-cell.
\begin{example} \label{ex:disc} The \(n\)-disc \(D^n\) is a finite globular set given by \(Y(n)\), where \(Y\) is the Yoneda embedding \(\mathbf{G} \to \mathbf{Glob}\). \(D^n\) has no \(k\)-cells for \(k > n\), a single \(n\)-cell \(d_n\), and two \(m\)-cells \(d_m^-\) and \(d_m^+\) for each \(m < n\). Every \((m+1)\)-cell of \(D^n\) has source \(d_m^-\) and target \(d_m^+\). The first few discs are depicted in \cref{fig:discs}. The Yoneda lemma tells us that a map of globular sets \(D^n \to G\) is the same as an \(n\)-cell of \(G\). For an \(n\)-cell \(x\) of \(G\), we let \(\{x\}\) be the unique map \(D^n \to G\) which sends \(d_n\) to \(x\).
\end{example}
\begin{figure}[h] \centering \begin{tabular}{P{3cm} P{3cm} P{3cm} P{4cm}} \(D^0\)&\(D^1\)&\(D^2\)&\(D^3\)\\ { \begin{tikzcd} d_0 \end{tikzcd} }&{ \begin{tikzcd}[ampersand replacement=\&] d_0^- \& d_0^+ \arrow[from=1-1, to=1-2, "d_1"] \end{tikzcd} }&{ \begin{tikzcd}[ampersand replacement=\&] d_0^- \& d_0^+ \arrow[""{name=0, anchor=center, inner sep=0}, "d_1^+", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "d_1^-"', curve={height=18pt}, from=1-1, to=1-2] \arrow["d_2"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \end{tikzcd} }&{ \begin{tikzcd}[ampersand replacement=\&] d_0^- \&\& d_0^+ \arrow[""{name=0, anchor=center, inner sep=0}, "d_1^+", curve={height=-25pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "d_1^-"', curve={height=25pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "d_2^-", shift left=12pt,Rightarrow, shorten <=5pt, shorten >=5pt, from=1,to=0] \arrow[""{name=3, anchor=center, inner sep=0}, "d_2^+"', shift right=12pt,Rightarrow, shorten <=5pt, shorten >=5pt, from=1,to=0] \arrow["d_3", Rightarrow, nfold = 3, shorten <=3pt, shorten >=3pt,from=2,to=3] \end{tikzcd}} \end{tabular}
\caption{The first few disc globular sets.}
\label{fig:discs}
\end{figure}
\begin{remark} Globular sets are not the only natural extension of the data of a 1-category. The form of this data in a definition of a higher category is referred to as the \emph{shape} of the cells. Notable alternatives to globular sets include simplicial sets, opetopic sets, and cubical sets. \end{remark}
We can now give the definition of a strict \(\infty\)-category.
\begin{definition} A \emph{strict \(\infty\)-category} is a globular set \(G\) with the following operations:
\begin{itemize}
\item For \(m < n\), a composition \(*_m\) taking \(n\)-cells \(f\) and \(g\) with \(\delta_m^+(f) = \delta_m^-(g)\) and producing an \(n\)-cell \(f *_m g\) with:
\begin{align*} s(f *_m g) &= \begin{cases*} s(f)&\text{if \(m = n - 1\)}\\ s(f) *_m s(g)&\text{otherwise} \end{cases*}\\ t(f *_m g) &= \begin{cases*} t(g)&\text{if \(m = n - 1\)}\\ t(f) *_m t(g)&\text{otherwise} \end{cases*} \end{align*}
\item For any \(n\)-cell \(x\), an identity \((n+1)\)-cell \(\id(x) : x \to x\).
\end{itemize}
and satisfying the following equalities:
\begin{itemize}
\item Associativity: Given \(m < n\) and \(n\)-cells \(f\), \(g\), and \(h\) with \(\delta_m^+(f) = \delta_m^-(g)\) and \(\delta_m^+(g) = \delta_m^-(h)\):
\[ (f *_m g) *_m h = f *_m (g *_m h) \]
\item Unitality: Given \(m < n\) and an \(n\)-cell \(f\):
\begin{align*} \id^{n-m}(\delta_m^-(f)) *_m f &= f\\ f *_m \id^{n-m}(\delta_m^+(f)) &= f \end{align*}
\item Composition interchange: Let \(o < m < n\) and let \(\alpha\), \(\beta\), \(\gamma\), and \(\delta\) be \(n\)-cells with
\[\delta_m^+(\alpha) = \delta_m^-(\beta)\qquad \delta_m^+(\gamma) = \delta_m^-(\delta)\qquad \delta_o^+(\alpha) = \delta_o^-(\gamma)\]
Then:
\[(\alpha *_o \gamma) *_m (\beta *_o \delta) = (\alpha *_m \beta) *_o (\gamma *_m \delta)\]
\item Identity interchange: Let \(m < n\) and \(f\) and \(g\) be \(n\)-cells with \(\delta_m^+(f) = \delta_m^-(g)\). Then:
\[\id(f) *_m \id(g) = \id(f *_m g)\]
\end{itemize}
A morphism of \(\infty\)-categories is a morphism of the underlying globular sets which preserves composition and identities.
\end{definition}
There is a clear forgetful functor from the category of strict \(\infty\)-categories to the category of globular sets, which has a left adjoint given by taking the free strict \(\infty\)-category over a globular set. We end this section with an example of a non-trivial application of the axioms of an \(\infty\)-category, known as the Eckmann-Hilton argument. The argument shows that any two scalars (morphisms from the identity to the identity) commute.
\begin{proposition}[Eckmann-Hilton] \label{prop:eh} Let \(x\) be an \(n\)-cell in an \(\infty\)-category and let \(\alpha\) and \(\beta\) be \((n+2)\)-cells with source and target \(\id(x)\). Then \(\alpha *_{n+1} \beta = \beta *_{n+1} \alpha\). \end{proposition}
\begin{proof} The cells \(\alpha\) and \(\beta\) can be manoeuvred around each other as follows:
\begin{align*} &\phantom{{}={}} \alpha *_{n+1} \beta \\ &= (\alpha *_n i) *_{n+1} (i *_n \beta)&\text{Unitality}\\ &= (\alpha *_{n+1} i) *_n (i *_{n+1} \beta)&\text{Interchange}\\ &= \alpha *_n \beta &\text{Unitality}\\ &= (i *_{n+1} \alpha) *_n (\beta *_{n+1} i)&\text{Unitality}\\ &= (i *_n \beta) *_{n+1} (\alpha *_n i)&\text{Interchange}\\ &= \beta *_{n+1} \alpha&\text{Unitality} \end{align*}
where \(i = \id(\id(x))\). \end{proof}
We give a more graphical representation of the proof in \cref{fig:eh}, which appeared in the introduction. In this proof, \(\alpha\) is moved round the left of \(\beta\), though we could equally have moved it round the right; the choice made was arbitrary.
\subsection{Pasting diagrams}
\label{sec:pasting-diagrams}
The definition of \(\infty\)-categories given in the previous section is close in spirit to the ordinary definitions of 1-categories and clearly demonstrates the different families of axioms present. However, we will see in \cref{sec:weak} that these sorts of definitions do not scale well to our eventual setting of weak higher categories. There is a special class of (finite) globular sets known as \emph{pasting diagrams}, sometimes called \emph{pasting schemes}. The elements of the free strict \(\infty\)-category on a globular set \(G\) can instead be represented by a pasting diagram equipped with a map into \(G\). To do this, it must be possible to obtain a canonical composite from each pasting diagram. Informally, we can define an \(n\)-dimensional pasting diagram to be a finite globular set which admits a unique full composite of dimension \(n\), where a full composite of a globular set \(G\) is an element of the free \(\infty\)-category over \(G\) which uses all the maximal elements. This serves as the primary intuition for the role of pasting diagrams. Pasting diagrams were used directly by \citeauthor{batanin1998monoidal}~\cite{batanin1998monoidal} to give a definition of weak \(\infty\)-categories, and will be pivotal in \cref{sec:weak} to define the variety of \(\infty\)-categories that \Catt is based on. A more in-depth discussion of pasting diagrams, representations of free strict \(\infty\)-categories using them, and their use in the definition of weak \(\infty\)-categories can be found in \citetitle{leinster2004higher}~\cite{leinster2004higher}. Before giving a more formal definition of pasting diagrams, we explore some examples and non-examples. In contrast to \citeauthor{leinster2004higher}, we consider pasting diagrams as a full subcategory of globular sets, rather than a separate category with a function sending each pasting diagram to a globular set.
The disc globular sets introduced in \cref{ex:disc} are all examples of pasting diagrams. The unique ``composite'' of these globular sets is just given by their maximal element, noting that our informal definition of composite allows a composite to consist of a single cell. The uniqueness of this composite is trivial, as the only possible operations we could apply are compositions with units, which give the same cell under the laws of an \(\infty\)-category. The diagrams used to graphically represent our composition operations (of which we recall three below) are also pasting diagrams.
\[ \begin{tikzcd} x & y & z \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \end{tikzcd} \qquad \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \qquad \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \]
The composite of these diagrams is just the composite of the two maximal cells with the appropriate codimension. We can also consider composites which are not binary composites of two cells of equal dimension. For example, the following globular set is a pasting diagram:
\[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h", from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \end{tikzcd} \]
with a composite given by \(\alpha *_0 \id(h)\). This operation is fairly common (in fact we have already seen it in \cref{prop:eh}) and is known as \emph{whiskering}. In this case we would say that the composite is given by the right whiskering of \(\alpha\) with \(h\). The 1-dimensional pasting diagrams are all given by chains of 1-cells of the form:
\[x_0 \overset{f_0}\to x_1 \overset{f_1}\to x_2 \overset{f_2}\to \cdots \overset{f_n}\to x_{n+1}\]
There are multiple ways to form a composite over these diagrams by repeated binary composition; however, these all have the same result due to associativity.
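For instance, for a chain of three \(1\)-cells the two possible bracketings agree by a single application of the associativity law:
\[ (f_0 *_0 f_1) *_0 f_2 = f_0 *_0 (f_1 *_0 f_2) \]
and any two bracketings of a longer chain can be connected by a sequence of such applications, so each of these diagrams admits a unique composite.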
Lastly, we look at the following diagram, where all the \(0\)-cells and \(1\)-cells are assumed to be distinct:
\[ \begin{tikzcd}[column sep = large] \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-2] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, curve={height=30pt}, from=1-2, to=1-3] \arrow[""{name=5, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["\alpha", shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\beta", shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\gamma", shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["\delta", shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \end{tikzcd} \]
We get a composite given by \((\alpha *_1 \beta) *_0 (\gamma *_1 \delta)\). The uniqueness of this composite is due to the interchange law. Non-examples of pasting diagrams roughly fall into two groups: those that do not admit a composite, and those that admit many distinct composites. The following three globular sets fail to admit a composite (the last is drawn in a box to emphasise that \(z\) is part of the same globular set as \(x\), \(y\), \(f\), \(g\), and \(\alpha\)):
\[ \begin{tikzcd}[column sep=large, row sep = small] & y \\ x \\ & z \arrow["f", pos=0.6, from=2-1, to=1-2] \arrow["g"', pos=0.6, from=2-1, to=3-2] \end{tikzcd} \qquad \begin{tikzcd}[column sep=large] x & y \arrow["f", curve={height=-12pt}, from=1-1, to=1-2] \arrow["g"', curve={height=12pt}, from=1-1, to=1-2] \end{tikzcd} \qquad \fbox{%
\begin{tikzcd}[column sep=scriptsize, ampersand replacement = \&] x \&\& y \& z \arrow[""{name=0, anchor=center, inner sep=0}, "f", curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"', curve={height=18pt}, from=1-1, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \end{tikzcd}} \]
The globular set with a single \(0\)-cell \(x\) and a single \(1\)-cell \(f : x \to x\) has too many composites: \(f\) and \(f *_0 f\) need not be equal in an \(\infty\)-category. To describe the free \(\infty\)-category in terms of pasting diagrams, we need to be able to extract a composite from a pasting diagram, and construct a pasting diagram from an arbitrary composite. Each pasting diagram having a unique composite solves the former issue. To be able to construct a pasting diagram from a composite, we wish to equip our set of pasting diagrams itself with the structure of an \(\infty\)-category. We therefore need our pasting diagrams to have a notion of boundary and a notion of composition. A natural candidate for composition is given by colimits, as \(\mathbf{Glob}\) has all colimits due to being a presheaf category, and so it is sufficient for our class of pasting diagrams to be closed under certain specific colimits. In fact, it is sufficient for it to be closed under a class of colimits known as \emph{globular sums}.
\begin{definition} A globular category is a category \(\mathcal{C}\) equipped with a disc functor \(D : \mathbf{G} \to \mathcal{C}\), specifying certain objects as discs in the category.
A \emph{globular sum} is a colimit of a diagram of the form:
\[ \begin{tikzcd}[column sep = tiny, row sep = tiny] {D(i_0)} && {D(i_1)} && {D(i_2)} && {D(i_n)} && {D(i_{n+1})} \\ &&&&& \cdots \\ & {D(j_0)} && {D(j_1)} &&&& {D(j_n)} \arrow["{f_0}", from=3-2, to=1-1] \arrow["{g_0}"', from=3-2, to=1-3] \arrow["{f_n}", from=3-8, to=1-7] \arrow["{g_n}"', from=3-8, to=1-9] \arrow["{f_1}", from=3-4, to=1-3] \arrow["{g_1}"', from=3-4, to=1-5] \end{tikzcd} \]
where each morphism \(f_i\) is a composite of source maps (\(D(\mathbf{s}_n)\) for some \(n\)) and each morphism \(g_i\) is a composite of target maps (\(D(\mathbf{t}_n)\) for some \(n\)). Given that the maps \(f_i\) and \(g_i\) are uniquely determined, we may write such a globular sum as:
\[ D(i_0) \amalg_{D(j_0)} D(i_1) \amalg_{D(j_1)} D(i_2) \cdots D(i_n) \amalg_{D(j_n)} D(i_{n+1})\]
A \emph{globular extension} is a globular category where all globular sums exist, and a morphism of globular extensions is a functor of the underlying categories commuting with the disc functors and preserving globular sums.
\end{definition}
We can now give our first definition of a pasting diagram.
\begin{definition} The category \(\mathbf{Glob}\) is a globular category with functor \(\mathbf{G} \to \mathbf{Glob}\) given by the Yoneda embedding. The category of \emph{pasting diagrams}, \(\mathbf{Pd}\), is the full subcategory of \(\mathbf{Glob}\) containing the globular sets which are globular sums. The boundary of an \((n+1)\)-dimensional pasting diagram is given by replacing each instance of \(D^{n+1}\) by \(D^n\) in its globular sum representation. There are two canonical maps including the boundary into the original pasting diagram, whose images give the source and target of the pasting diagram. \end{definition}
The category of pasting diagrams clearly forms a globular category, with the functor \(\mathbf{G} \to \mathbf{Pd}\) sending \(n\) to \(D^n\). It is a globular extension and is in fact the universal globular extension; it is initial in the category of globular extensions~\cite{Ara}. We finish this section with one larger example.
\begin{example} The following depicts a \(2\)-dimensional pasting diagram.
\[ \begin{tikzcd} x & y & z & w \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h"', from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "k", curve={height=-24pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "i"', curve={height=24pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "j"{description}, from=1-3, to=1-4] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=4] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=4, to=2] \end{tikzcd} \] This has the following globular sum decomposition: % https://q.uiver.app/#q=WzAsMTMsWzAsMCwieCJdLFsyLDAsInkiXSxbOCwwLCJ6Il0sWzEwLDAsInciXSxbMywxLCJ5Il0sWzQsMCwieSJdLFs2LDAsInoiXSxbNywxLCJ6Il0sWzksMF0sWzEwLDEsInoiXSxbMTIsMSwidyJdLFsxMiwwLCJ6Il0sWzE0LDAsInciXSxbMCwxLCJnIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiZiIsMix7ImN1cnZlIjozfV0sWzIsMywiaSIsMix7ImN1cnZlIjo0fV0sWzIsMywiaiIsMV0sWzUsNiwiaCIsMl0sWzQsMSwiIiwyLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzQsNSwiIiwxLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzksMTAsImoiLDFdLFsxMSwxMiwiaiIsMV0sWzExLDEyLCJrIiwxLHsiY3VydmUiOi00fV0sWzcsNiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzcsMiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoiZGFzaGVkIn19fV0sWzE0LDEzLCJcXGFscGhhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsxNSwxNiwiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFsyMSwyMiwiXFxnYW1tYSIsMCx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMjAsMywiIiwxLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwfSwibGV2ZWwiOjEsInN0eWxlIjp7ImJvZHkiOnsibmFtZSI6ImRhc2hlZCJ9fX1dLFsyMCwxMSwiIiwxLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwfSwibGV2ZWwiOjEsInN0eWxlIjp7ImJvZHkiOnsibmFtZSI6ImRhc2hlZCJ9fX1dXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep=small, row sep = small] x && y && y && z && z & {} & w && z && w \\ &&& y &&&& z &&& z && w \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "i"', curve={height=24pt}, from=1-9, to=1-11] \arrow[""{name=3, anchor=center, inner sep=0}, "j"{description}, from=1-9, to=1-11] \arrow["h"', from=1-5, to=1-7] \arrow[dashed, from=2-4, to=1-3] \arrow[dashed, from=2-4, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "j"{description}, from=2-11, to=2-13] \arrow[""{name=5, anchor=center, inner sep=0}, "j"{description}, from=1-13, to=1-15] \arrow[""{name=6, anchor=center, inner sep=0}, "k"{description}, curve={height=-24pt}, from=1-13, to=1-15] \arrow[dashed, from=2-8, to=1-7] \arrow[dashed, from=2-8, to=1-9] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=3] \arrow["\gamma", shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=6] \arrow[shorten <=6pt, dashed, from=4, to=1-11] \arrow[shorten <=6pt, dashed, from=4, to=1-13] \end{tikzcd} \] The source and target of the diagram are given by the isomorphic pasting diagrams: \[ \begin{tikzcd} x & y & z & w \arrow["f"', curve={height=18pt}, from=1-1, to=1-2] \arrow["h", from=1-2, to=1-3] \arrow["i"', curve={height=24pt}, from=1-3, to=1-4] \end{tikzcd} \qquad\text{and}\qquad \begin{tikzcd} x & y & z & w \arrow["g", curve={height=-18pt}, from=1-1, to=1-2] 
\arrow["h", from=1-2, to=1-3] \arrow["k", curve={height=-24pt}, from=1-3, to=1-4] \end{tikzcd} \] \end{example} \subsection{Weak higher categories} \label{sec:weak} The \(\infty\)-categories we have defined so far have all been strict \(\infty\)-categories, meaning that the laws are required to hold up to equality. In ordinary \(1\)-category theory, isomorphism is usually preferred over equality for comparing objects. Similarly, when we have access to higher-dimensional arrows, it follows that we can also consider isomorphisms between morphisms, and therefore consider laws such as associativity up to isomorphism instead of equality. Topological spaces provide one of the primary examples for where it is useful to consider weak laws. Given a topological space \(X\), we can define a globular set of paths and homotopies. Let the \(0\)-cells be given by points \(x\) of the topological space, let morphisms from \(x\) to \(y\) be given as paths \(I \to X\) (where \(I\) is the topological interval \([0,1]\)) which send \(0\) to \(x\) and \(1\) to \(y\), and let higher cells be given by homotopies. The natural composition of two paths \(p\) and \(q\) is the following path: \[ (p * q)(i) = \begin{cases*} p(2i)&when \(i < 0.5\)\\ q(2i-1)&when \(i \geq 0.5\) \end{cases*} \] which effectively lines up the paths end to end. Given \(3\) paths \(p\), \(q\), and \(r\), the compositions \((p * q) * r\) and \(p * (q * r)\) are not identical but are equal up to homotopy, meaning the two compositions are isomorphic. Therefore, in this case the composition \(p * q\) does not form a strict \(\infty\)-category structure, but rather a weak structure. \paragraph{Weak 2-categories} We start our exploration of weak higher categories by considering the lower dimension case of bicategories (weak \(2\)-categories). Here, interchange must still be given by a strict equality, as there are no non-trivial \(3\)-cells in a \(2\)-category. However, associativity and unitality can be given by isomorphisms known as associators and unitors: \begin{align*} \alpha_{f,g,h} &: (f *_0 g) *_0 h \to f *_0 (g *_0 h)\\ \lambda_f &: \id(x) *_0 f \to f\\ \rho_f &: f *_0 \id(y) \to f \end{align*} for \(f : x \to y\), \(g : y \to z\), and \(h : z \to w\). \begin{example} \label{ex:spans} All strict 2-categories are also bicategories. The bicategory of spans is an example of a bicategory which is not strict. Starting with a category \(\mathcal{C}\) equipped with chosen pullbacks, we define the bicategory of spans over \(\mathcal{C}\) to be: \begin{itemize} \item Objects are the same as \(\mathcal{C}\) \item Morphisms \(A\) to \(B\) are spans \(A \leftarrow C \to B\). \item A 2-morphism from \(A \leftarrow C \to B\) to \(A \leftarrow C' \to B\) is a morphism \(C \to C'\) such that the following diagram commutes: \[ \begin{tikzcd}[row sep = small] & C \\ A && B \\ & {C'} \arrow[from=1-2, to=3-2] \arrow[from=3-2, to=2-1] \arrow[from=1-2, to=2-1] \arrow[from=1-2, to=2-3] \arrow[from=3-2, to=2-3] \end{tikzcd} \] \item Compositions and identities of 2-morphisms is given by composition and identities of the underlying morphisms in \(\mathcal{C}\). \item The identity on an object \(A\) is the span \(A \leftarrow A \to A\). 
\item Given spans \(A \leftarrow D \to B\) and \(B \leftarrow E \to C\), their composite is given by the pullback: \[ \begin{tikzcd}[row sep=small] && {D \times_B E} \\ & D && E \\ A && B && C \arrow[from=2-2, to=3-1] \arrow[from=2-2, to=3-3] \arrow[from=2-4, to=3-3] \arrow[from=2-4, to=3-5] \arrow[from=1-3, to=2-2] \arrow[from=1-3, to=2-4] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=-45}, draw=none, from=1-3, to=3-3] \end{tikzcd} \] \item Associators and unitors are given by the universal property of the pullback. \end{itemize} \end{example} In general, there could be many possible isomorphisms between \((f * g) * h\) and \(f * (g * h)\), and we require that the chosen morphisms satisfy certain compatibility properties. The first is that each of the associator, left unitor, and right unitor should be a natural isomorphism. The second is a property known as \emph{coherence}, saying that any two parallel morphisms built purely from naturality moves, associators, and unitors must be equal. For bicategories it is sufficient to give two coherence laws: the triangle equality and pentagon equality. The triangle equality identifies two ways of cancelling the identity in the composite \(f * \id * g\), giving a compatibility between the left and right unitors. It is given by the following commutative diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCIoZiBcXHN0YXIgXFxpZCkgXFxzdGFyIGciXSxbMiwwLCJmIFxcc3RhciAoXFxpZCBcXHN0YXIgZykiXSxbMSwxLCJmIFxcc3RhciBnIl0sWzAsMSwiXFxhbHBoYV97ZixcXGlkLGd9Il0sWzAsMiwiXFxyaG9fZiBcXHN0YXJfMCBcXGlkKGcpIiwyXSxbMSwyLCJcXGlkKGYpXFxzdGFyXzBcXGxhbWJkYV9nIl1d % tex-fmt: skip \[ \begin{tikzcd} {(f * \id) * g} && {f * (\id * g)} \\ & {f * g} \arrow["{\alpha_{f,\id,g}}", from=1-1, to=1-3] \arrow["{\rho_f *_0 \id(g)}"', from=1-1, to=2-2] \arrow["{\id(f)*_0\lambda_g}", from=1-3, to=2-2] \end{tikzcd} \] The pentagon equation identifies two ways of associating \(((f * g) * h) * k\) to \(f * (g * (h * k))\). It is given by the diagram below: % https://q.uiver.app/#q=WzAsNSxbMSwzLCIoZiBcXHN0YXIgKGcgXFxzdGFyIGgpKSBcXHN0YXIgayJdLFswLDEsIigoZiBcXHN0YXIgZykgXFxzdGFyIGgpIFxcc3RhciBrIl0sWzIsMCwiKGYgXFxzdGFyIGcpIFxcc3RhciAoaCBcXHN0YXIgaykiXSxbNCwxLCJmIFxcc3RhciAoZyBcXHN0YXIgKGggXFxzdGFyIGspKSJdLFszLDMsImYgXFxzdGFyICgoZyBcXHN0YXIgaCkgXFxzdGFyIGspIl0sWzEsMiwiXFxhbHBoYV97ZiBcXHN0YXIgZyxoLGt9Il0sWzIsMywiXFxhbHBoYV97ZixnLGhcXHN0YXIga30iXSxbMSwwLCJcXGFscGhhX3tmLGcsaH0gXFxzdGFyXzAgXFxpZChrKSIsMl0sWzAsNCwiXFxhbHBoYV97ZixnXFxzdGFyIGgsa30iLDJdLFs0LDMsIlxcaWQoZilcXHN0YXJfMCBcXGFscGhhX3tnLGgsa30iLDJdXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep = -1.5em] && {(f * g) * (h * k)} \\ {((f * g) * h) * k} &&&& {f * (g * (h * k))} \\ \\ & {(f * (g * h)) * k} && {f * ((g * h) * k)} \arrow["{\alpha_{f * g,h,k}}", from=2-1, to=1-3] \arrow["{\alpha_{f,g,h* k}}", from=1-3, to=2-5] \arrow["{\alpha_{f,g,h} *_0 \id(k)}"', from=2-1, to=4-2] \arrow["{\alpha_{f,g* h,k}}"', from=4-2, to=4-4] \arrow["{\id(f)*_0 \alpha_{g,h,k}}"', from=4-4, to=2-5] \end{tikzcd} \] Surprisingly, these two equations are enough to give full coherence. For the example of spans from \cref{ex:spans}, these two equations follow from the uniqueness of the universal morphism. \paragraph{Weak \(\infty\)-categories} To move from weak \(2\)-categories to weak \(3\)-categories, new coherence cells for interchangers are added to replace the interchanger equalities, and new equalities must be added to specify the interaction between the interchangers and other coherence morphisms. 
Furthermore, the triangle and pentagon equations from \(2\)-categories will become isomorphisms in a weak \(3\)-category, causing more coherence equations to be added. As we move up in dimension, the number of coherence morphisms and equalities required increases exponentially. A bicategory has 11 operations (1-identity, 2-identity, 1-composition, vertical composition, horizontal composition, left unitor (and inverse), right unitor (and inverse), and associator (and inverse)), whereas a fully weak tricategory already has around 51 operations~\cite{gurski2006algebraic}. These numbers are obtained by unwrapping various subdefinitions and should be treated as approximate. Comparisons between the size of partially weak definitions can be found in~\cite{bar2017data}. Because of this complexity, we look for more uniform ways to represent the operations and axioms of an \(\infty\)-category. In this thesis, we will work with the type theory \Catt, which is based on a definition of \(\infty\)-categories due to \citeauthor{maltsiniotis2010grothendieck}~\cite{maltsiniotis2010grothendieck}, which is itself based on a definition of \(\infty\)-groupoids by \citeauthor{PursuingStacks}~\cite{PursuingStacks}. We will sketch the ideas behind these definitions here, and give a definition of \Catt in \cref{sec:type-theory-catt}. The key insight behind Grothendieck's definition is that pasting diagrams should be weakly contractible, instead of containing a unique composite. Whereas in a strict \(\infty\)-category each pasting diagram effectively has a single composite, in a weak \(\infty\)-category there can be many operations over a pasting diagram. These operations are assembled into a globular extension called a \emph{coherator}. A weak \(\infty\)-groupoid is then a presheaf on this coherator for which the opposite functor preserves globular sums (alternatively, the dual notion of globular product could be defined, and such a presheaf could be asked to preserve globular products). The objects of a coherator are given by pasting diagrams, with \(D^n\) being sent to the \(n\)-cells of the category and other pasting diagrams being sent to composable sets of cells (as determined by the preservation of globular sums). Operations over a pasting diagram \(P\) in the coherator are given by morphisms \(D^n \to P\). When we take a presheaf over this, we obtain a function that takes a \(P\)-shaped collection of cells to a single \(n\)-cell. Operations can be precomposed with source and target maps \(D^{n-1} \to D^n\) to get the source and target of an operation. To build the coherator, we start by taking the category of pasting diagrams. The ``operations'' of this category consist solely of the inclusions of discs into pasting diagrams, which correspond to picking a single element from the pasting diagram. Other operations are then built using the following guiding principle.
\begin{principle-groupoid} Let \(f\) and \(g\) be two parallel operations over a pasting diagram \(P\). Then there is an operation \(h\) over \(P\) with source \(f\) and target \(g\). \end{principle-groupoid}
We define a pair of operations \(f,g : D^n \to X\) to be \emph{parallel} if \(n = 0\), or if \(n > 0\) and both \(f \circ \mathbf{s}_{n-1} = g \circ \mathbf{s}_{n-1}\) and \(f \circ \mathbf{t}_{n-1} = g \circ \mathbf{t}_{n-1}\). A \emph{lift} for such a pair of parallel operations is an operation \(h : D^{n+1} \to X\) such that \(h \circ \mathbf{s}_{n} = f\) and \(h \circ \mathbf{t}_n = g\).
Closing under this principle then amounts to inductively adding lifts for all parallel operations, while ensuring that the category remains a globular extension. We start with some basic operations. Consider the pasting diagram \(A = D^1 \amalg_{D^0} D^1\) given by:
\[ \begin{tikzcd} x & y & z \arrow["a", from=1-1, to=1-2] \arrow["b", from=1-2, to=1-3] \end{tikzcd} \]
Our rule now tells us that, since \(x\) and \(z\) are elements of \(A\), there should be an operation returning a cell with source \(x\) and target \(z\), namely the composition of \(a\) and \(b\). In the language of coherators, there are operations \(f, g : D^0 \to A\), where \(f\) includes into the source of the first disc of \(A\), and \(g\) includes into the target of the second disc of \(A\). These are trivially parallel, and so there exists a lift \(h : D^1 \to A\), giving 1-composition. Similarly, if we take the pasting diagram with a single \(0\)-cell \(x\) and no other cells, then applying our rule with \(f,g\) both being the operation returning the element \(x\) produces an operation with source and target \(x\), the identity on \(x\). We can generate more complicated operations with this principle. Consider the pasting diagram \(B\):
\[ \begin{tikzcd} x & y & z & w \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \arrow["h", from=1-3, to=1-4] \end{tikzcd} \]
We already know the coherator contains 1-composition, and using composition and the universal property of globular sums, we can generate operations realising the compound composites \((f * g) * h\) and \(f * (g * h)\). The principle then gives us an operation returning the \(2\)-cell \((f * g) * h \to f * (g * h)\), which is of course the associator. This one principle allows us to generate all the structure we need, as well as structure that is arguably unnecessary, such as ternary compositions that did not appear in the definition of a bicategory. Unfortunately, as we have already mentioned, Grothendieck's definition is for \(\infty\)-groupoids, where everything is invertible, instead of \(\infty\)-categories in full generality, which are what we want to study in this thesis. This can be seen by taking the pasting diagram \(C\):
\[ \begin{tikzcd} x & y \arrow["f", from=1-1, to=1-2] \end{tikzcd} \]
and applying the rule with the two parallel operations returning \(y\) and \(x\) respectively, giving an operation that returns a \(1\)-cell \(f^{-1} : y \to x\), an inverse of \(f\). The rule as we have stated it is too powerful. Maltsiniotis' definition provides a solution to this problem by giving a more refined version of the principle. Whereas Grothendieck's definition treats all operations as coherences, Maltsiniotis' definition splits operations into two classes: compositions and equivalences. Both classes are obtained by restricting the classes of parallel operations that admit lifts. We begin by defining what it means for an operation to be algebraic:
\begin{definition} Let \(\mathcal{C}\) be a globular extension for which the canonical functor \(P : \mathbf{Pd} \to \mathcal{C}\) is faithful and the identity on objects. Then an operation \(f : D^n \to X\) in \(\mathcal{C}\) is \emph{algebraic} if whenever \(f = P(g) \circ f'\), we have \(g = \id\). \end{definition}
Intuitively, an operation is algebraic when it does not factor through any proper inclusion. Algebraicity is equivalent to requiring that an operation makes use of all the \emph{locally maximal} elements of the pasting diagram: those elements which do not appear in the source or target of a higher-dimensional element of the diagram.
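For example, in the pasting diagram \(B\) above, the operation \(D^1 \to B\) realising the composite \((f * g) * h\) uses each of the locally maximal cells \(f\), \(g\), and \(h\), and is algebraic. By contrast, the operation \(D^1 \to B\) which simply picks out the cell \(f\) factors through the proper inclusion of the first disc into \(B\), and so is not algebraic.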
Equivalences contain the various invertible laws of our \(\infty\)-categories such as associators, unitors, identities, and interchangers. For two operations \(f,g : D^n \to X\) to admit a lift under the rule for equivalences, they must both be algebraic. This gives the following rule:
\begin{principle-category}[Equivalences] Let \(f\) and \(g\) be two parallel operations over a pasting diagram \(P\). If both \(f\) and \(g\) use all locally maximal cells of \(P\), then there is an operation over \(P\) with source \(f\) and target \(g\). \end{principle-category}
Clearly any operations generated by this principle are invertible, as the extra condition imposed is symmetric. For compositions, we introduce the following asymmetric principle, recalling that pasting diagrams are equipped with source and target inclusions, and letting \(\partial^-(P)\) and \(\partial^+(P)\) be the images of these inclusions:
\begin{principle-category}[Composites] Let \(f\) and \(g\) be parallel operations over a (non-singleton) pasting diagram \(P\) such that \(f\) uses all locally maximal cells of \(\partial^-(P)\) and no cells outside of \(\partial^-(P)\), and \(g\) uses all locally maximal cells of \(\partial^+(P)\) and no cells outside of \(\partial^+(P)\). Then there is an operation over \(P\) with source \(f\) and target \(g\). \end{principle-category}
The condition required to form a composite can be expressed by the operation \(f : D^n \to P\) factoring into an algebraic map composed with the source inclusion into \(P\), and similarly for \(g\) with the target inclusion. It can be easily checked that the inverse operation given above does not satisfy the criteria for being an equivalence or composite. As with Grothendieck's definition, a coherator can be made by closing the globular extension of pasting diagrams under these restricted principles, and then weak \(\infty\)-categories can be defined to be presheaves on this coherator such that the opposite functor preserves globular sums.
\begin{remark} We have claimed that a coherator can be formed by closing under adding lifts to parallel operations, though this is not precise and there are actually multiple ways of performing this closure that lead to different coherators. For example, one could add the lift for 1-composition twice, to get two distinct 1-composition operations, as long as one also added a lift between these now parallel operations. Grothendieck gives a general schema for producing coherators, and conjectures that any two coherators give rise to equivalent models of \(\infty\)-categories. \end{remark}
We now turn our attention back to the proof of Eckmann-Hilton from \cref{fig:eh}. Given a \(0\)-cell \(x\) and two scalars \(\alpha, \beta : \id(x) \to \id(x)\), we expect the Eckmann-Hilton argument to give us an isomorphism in a weak higher category, rather than the equality obtained in the strict case. In fact, equalities 2, 3, and 4 in the proof can be immediately replaced by isomorphisms (interchangers and unitors). The first and last equalities, however, are more problematic: although at first we may believe that there should exist some horizontal unitor isomorphism, upon closer inspection the two compositions do not even have the same boundary and so are not parallel. The composition \(\alpha *_1 \beta\) has source and target \(\id(x)\), whereas the source of \(\alpha *_0 \id(\id(x))\) is \(\id(x) *_0 \id(x)\).
To recover the proof in a weak setting, the intermediate composites must be composed with unitors so that they all have source and target \(\id(x)\). To give equivalences for the first and last step, these unitors must be moved around with naturality moves, and at a critical point the isomorphism \(\lambda_{\id(x)} \simeq \rho_{\id(x)}\) is required. Multiple full proofs of Eckmann-Hilton will be given in \cref{sec:examples}. The proof of Eckmann-Hilton is vastly simpler in the strict case, mainly due to the presence of the equation \(\id(x) *_0 \id(x) = \id(x)\).
\subsection{Computads}
\label{sec:computads}
A free group is generated by a set, and a free category is generated by a directed graph, so it is natural to ask what the generating data for a free \(\infty\)-category is. We have already seen that a free \(\infty\)-category can be generated by a globular set, but free \(\infty\)-categories can also be generated by data that does not form a globular set. Consider the minimum data needed to state the Eckmann-Hilton principle (see \cref{fig:eh} or \cref{prop:eh}). We require a single \(0\)-cell \(x\) and two \(2\)-cells \(\alpha, \beta : \id(x) \to \id(x)\). This data does not form a globular set as, for example, the source of the \(2\)-cell \(\alpha\) is not in the generating data, but is rather an operation applied to the data. We could try to remedy this by adding a new \(1\)-cell \(f\) to the data to represent \(\id(x)\), but then the connection between \(\id(x)\) and \(f\) would be lost and \(f\) and \(\id(x)\) would be distinct in any free \(\infty\)-category generated from this data. The correct generating data for an \(\infty\)-category is a \emph{computad}. A version for 2-categories was introduced by \citeauthor{street1976limits}~\cite{street1976limits}, which allows a generating \(2\)-cell to have a composite or identity as its source or target. These were extended to strict \(\infty\)-categories by \citeauthor{burroni1993higher}~\cite{burroni1993higher} and to weak \(\infty\)-categories by \citeauthor{batanin1998computads}~\cite{batanin1998computads}, which allow the source and target of an \(n\)-cell to be any \((n-1)\)-cell of the free \(\infty\)-category generated by the lower-dimensional data. A modern approach to computads for weak \(\infty\)-categories is given by \citeauthor{dean2022computads}~\cite{dean2022computads}, which avoids much of the complexity of globular operads, relying only on (mutual) structural induction. This definition of a computad is much closer in style to (and is inspired by) the type theory \Catt, which we review in \cref{sec:type-theory-catt}.
\section{The type theory \Catt}
\label{sec:type-theory-catt}
In this section we give an overview of the dependent type theory \Catt~\cite{finster2017type}. \Catt serves as a definition of weak \(\infty\)-categories, by defining a weak \(\infty\)-category to be a model of the type theory (e.g.\ using categories with families~\cite{cwf}). In \cref{cha:gener-pres-catt}, we give a more general and comprehensive presentation of \Catt, allowing the addition of equality relations to the type theory, pre-empting \cref{cha:cattstrict}. In contrast, this section presents a version of \Catt closer to the one found in the literature, and compares its various constructions to the ideas introduced in \cref{sec:weak}.
\subsection{Syntax of \Catt}
\label{sec:syntax-catt}
\Catt has 4 classes of syntax: contexts, terms, types, and substitutions.
\begin{itemize}
\item Contexts are lists of variables, each with an associated type. We can consider contexts as finite computads, the generating data for a weak \(\infty\)-category (see \cref{sec:computads}). It is alternatively valid to consider contexts in \Catt as finitely generated \(\infty\)-categories. The set of contexts contains all finite globular sets (and hence all pasting diagrams).
\item Terms over a context \(\Gamma\) correspond to the operations from \cref{sec:weak}. A term is either a variable, corresponding to the operations which pick a single cell out of a globular set, or is generated by the unique constructor \(\mathsf{Coh}\), corresponding to the operations generated by lifting. A term over a context \(\Gamma\) can also be seen as an element of the free \(\infty\)-category generated from \(\Gamma\).
\item Types over a context \(\Gamma\) consist of a collection of terms over the same context, and contain the boundary information for a term. Types either take the form of the constructor \(\star\), the type of \(0\)-cells (which have no boundary data), or an arrow type \(\arr s A t\), where \(s\) and \(t\) are terms giving the source and target of the boundary and the type \(A\) gives lower-dimensional boundary information. This can be viewed as a directed version of the equality type \(s =_A t\) from Martin-L\"of type theory.
\item Substitutions from a context \(\Gamma\) to a context \(\Delta\) are mappings from variables of \(\Gamma\) to terms of \(\Delta\). These play the role of functors between the \(\infty\)-categories generated by \(\Gamma\) and \(\Delta\) and are also syntactically crucial for forming compound composites in the theory.
\end{itemize}
\begin{figure}[ht] \centering \begin{tabular}{Sc Sc} { \begin{prooftree} \hypo{\phantom{\Term}} \infer1{\emptyset : \Ctx} \end{prooftree} } & { \begin{prooftree} \hypo{\Gamma : \Ctx} \hypo{A : \Type_\Gamma} \infer2{\Gamma, (x : A) : \Ctx} \end{prooftree}} \\ { \begin{prooftree} \hypo{\phantom{\Term}} \infer1{\langle \rangle : \emptyset \to \Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{\sigma : \Delta \to \Gamma} \hypo{t : \Term_\Gamma} \hypo{A : \Type_\Delta} \infer3{\langle \sigma , t \rangle : \Delta, (x : A) \to \Gamma} \end{prooftree} } \\ { \begin{prooftree} \hypo{\phantom{\Type}} \infer1{\star : \Type_\Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{A : \Type_\Gamma} \hypo{s : \Term_\Gamma} \hypo{t : \Term_\Gamma} \infer3{\arr s A t : \Type_\Gamma} \end{prooftree} } \\ { \begin{prooftree} \hypo{x \in \Var(\Gamma)\vphantom{\Type}} \infer1{x : \Term_\Gamma} \end{prooftree} } & { \begin{prooftree} \hypo{\Delta : \Ctx} \hypo{A : \Type_\Delta} \hypo{\sigma : \Delta \to \Gamma} \infer3{\Coh \Delta A \sigma : \Term_\Gamma} \end{prooftree} } \end{tabular}
\vspace{-5pt}
\caption{Syntax constructions in \Catt.}
\label{fig:syntax}
\end{figure}
The rules for constructing each piece of syntax are given in \cref{fig:syntax}. To simplify the notation, we may avoid writing substitutions in a fully nested fashion, writing \(\langle \sigma , s , t \rangle\) instead of \(\langle \langle \sigma, s \rangle, t \rangle\), or \(\langle s \rangle\) instead of \(\langle \langle \rangle, s \rangle\). We may also omit the subscript in the arrow type. In contrast to the original paper on \Catt, we fibre terms, types, and substitutions over contexts, allowing us to avoid any problems with substitution only extending to a partial operation on terms.
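As a brief preview of how these constructors fit together (anticipating the typing rules presented later in this section), the binary composition of \(1\)-cells arises from the context
\[ \Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \]
of two composable arrows: the term \(\Coh \Delta {\arr x \star z} {\langle x, y, f, z, g \rangle}\) is the generic composite of \(f\) and \(g\) over \(\Delta\) itself, and replacing the substitution \(\langle x, y, f, z, g \rangle\) by other arguments produces composites of arbitrary \(1\)-cells in other contexts.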
We write \(\Ctx\) for the set of contexts, \(\Term_\Gamma\) for the set of terms in a context \(\Gamma\), \(\Type_\Gamma\) for the set of types in a context \(\Gamma\), and write \(\sigma : \Delta \to \Gamma\) when \(\sigma\) is a substitution taking variables of \(\Delta\) to terms of \(\Gamma\). In the literature, substitutions are often written as going in the opposite direction. We emphasise here that the direction of our substitution morphisms agrees with the direction of the function from variables to terms, the direction of the induced functor between the \(\infty\)-categories freely generated from the domain and codomain contexts, and the direction of arrows in a Grothendieck coherator. We write \(\equiv\) for \emph{syntactic equality}, up to renaming of variables and \(\alpha\)-equivalence. The various pieces of syntax will be considered as equal up to this relation, which can be achieved by using a de Bruijn index representation of the syntax as we present in \cref{cha:gener-pres-catt} for the formalisation. However, we continue to use named variables in the prose of the thesis to aid readability, assuming that all variables in a context are always distinct. We contrast this with the equality symbol, \(=\), which will represent the equality derived from extra equality rules we have placed on \Catt in \cref{sec:catt-with-equality}, and will be referred to as \emph{definitional equality}. The action of a substitution \(\sigma : \Delta \to \Gamma\) can be extended from variables to all terms \(t \in \Term_\Delta\), types \(A \in \Type_\Delta\), and substitutions \(\tau : \Theta \to \Delta\) by mutual recursion: \begin{align*} x \sub \sigma &= t&\text{if }(x \mapsto t) \in \sigma\\ \Coh \Theta A \tau \sub \sigma &= \Coh \Theta A {\tau \bullet \sigma}\\ \star \sub \sigma &= \star\\ \arr s A t \sub \sigma &= \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}\\ \langle \rangle \bullet \sigma &= \langle \rangle\\ \langle \tau , t \rangle \bullet \sigma &= \langle \tau \bullet \sigma , t \sub \sigma \rangle \end{align*} For every context \(\Gamma\), there is an identity substitution \(\id_\Gamma\), which sends every variable to itself; together with the composition of substitutions above, this gives a category of contexts and substitutions. The coherence constructor \(\Coh \Delta A \sigma\) allows us to construct lifts between parallel operations over pasting diagrams. The context \(\Delta\) plays the role of the pasting diagram. The type \(A\) will always be of the form \(\arr s B t\), and the terms \(s\) and \(t\) play the role of the parallel operations (with the type \(\arr s B t\) being well-formed ensuring that \(s\) and \(t\) are parallel). The substitution \(\sigma : \Delta \to \Gamma\) holds the data of a set of arguments to the coherence, allowing compound composites/operations to be formed and taking the role of composition of morphisms in the coherator. We next define the free variables of each piece of syntax. These will be used to encode the condition of an operation being algebraic from the theory of non-invertible coherators. Let \(\Var(\Gamma)\) denote the variables of \(\Gamma\). For a term \(t \in \Term_\Gamma\), a type \(A \in \Type_\Gamma\), and a substitution \(\sigma : \Delta \to \Gamma\) we define their free variables \(\FV(t), \FV(A), \FV(\sigma) \subseteq \Var(\Gamma)\) by mutual recursion.
\begin{align*} \FV(x) &= \{x\} &\text{if \(x\) is a variable}\\ \FV(\Coh \Delta A \sigma) &= \FV(\sigma)\\ \FV(\star) &= \{\}\\ \FV(\arr s A t) &= \FV(s) \cup \FV(A) \cup \FV(t)\\ \FV(\langle \rangle) &= \{\}\\ \FV(\langle \sigma , t \rangle) &= \FV(\sigma) \cup \FV(t) \end{align*} The free variables of a term are often the wrong notion to use for testing algebraicity. For example, in the context \(D^1\), the term \(d_1\) has free variables \(\{d_1\}\), whereas the unary composite of \(d_1\), \(\Coh {D^1} {\arr {d_0^-} \star {d_0^+}} {\id_{D^1}}\), has free variables \(\{d_0^-,d_0^+,d_1\}\). To remedy this, the original paper considers \(\FV(t) \cup \FV(A)\), for a term \(t\) of type \(A\). In this thesis we instead define the support of each piece of syntax as a purely syntactic construction. \begin{definition} Fix a context \(\Gamma\). The subset \(V \subseteq \Var(\Gamma)\) is \emph{downwards closed} if for all \((x : A) \in \Gamma\) we have: \[x \in V \implies \FV(A) \subseteq V\] The downwards closure of a set \(V\) in a context \(\Gamma\), \(\DC_\Gamma(V)\), can be defined by induction on the context: \begin{align*} \DC_\emptyset(\emptyset) &= \emptyset\\ \DC_{\Gamma, x : A}(V) &= \begin{cases*} \DC_\Gamma(V)&if \(x \not\in V\)\\ \{x\} \cup \DC_\Gamma(V \cup \FV(A))&if \(x \in V\)\\ \end{cases*} \end{align*} The support of a term, type, or substitution is then defined as the downwards closure of its free variables: \[ \Supp(t) = \DC_\Gamma(\FV(t))\qquad \Supp(A) = \DC_\Gamma(\FV(A))\qquad \Supp(\sigma) = \DC_\Gamma(\FV(\sigma)) \] for terms \(t \in \Term_\Gamma\), types \(A \in \Type_\Gamma\), and substitutions \(\sigma : \Delta \to \Gamma\). \end{definition} We will see later (\cref{item:supp-tm-char-2}) that for a well-formed term \(t\) of type \(A\), the support of \(t\) is equal to \(\FV(t) \cup \FV(A)\), and that \(\Supp(A) = \FV(A)\) for well-formed types. Modifying \Catt to use the support operation therefore does not change the theory. We lastly define the \emph{dimension} of types, contexts, and terms. For types this is defined recursively: \[ \dim(\star) = 0 \qquad \dim(\arr s A t) = 1 + \dim(A) \] For contexts, we define \(\dim(\Gamma)\) to be the maximum of the dimensions of the types in \(\Gamma\). For coherences \(\Coh \Gamma A \sigma\), the dimension is given by \(\dim(A)\), and for variables the dimension is given by the dimension of the associated type in the context. \subsection{Ps-contexts} \label{sec:ps-contexts} We need to be able to describe pasting diagrams within the theory \Catt. As contexts model globular sets, it is natural to treat pasting diagrams as a subset of contexts. We will build pasting diagrams by iteratively attaching discs to a context, which is done by introducing the judgements: \[ \Delta \vdash_{\mathsf{ps}} x : A \qquad \text{and}\qquad \Delta \vdash_{\mathsf{ps}} \] If the first judgement holds, then \(\Delta\) is a pasting diagram to which a disc can be attached at the variable \(x\), called a \emph{dangling variable}, which has type \(A\). The contexts \(\Delta\) for which the second judgement holds are fully formed pasting diagrams, which we call \emph{ps-contexts} (short for pasting scheme contexts). The rules for these judgements are given in \cref{fig:ps-context}. We note that these rules do not just specify which globular sets are pasting diagrams, but they also specify an ordering on the elements of the pasting diagram, ensuring that there is a unique ps-context for each pasting diagram.
For example, the following judgement holds: \begin{equation} \label[judgement]{judg:ps} (x : \star), (y : \star), (f : \arr x \star y), (z: \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} \end{equation} However, the context: \[(y : \star), (z : \star), (g : \arr y \star z), (x : \star), (f : \arr x \star y)\] represents the same globular set but is not a ps-context. \begin{figure}[ht] \centering \begin{mathpar} \inferrule{ }{(x : \star) \vdash_{\mathsf{ps}} x : \star} {(\textsc{pss})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : A}{\Gamma, (y : A), (f : \arr x A y) \vdash_{\mathsf{ps}} f : \arr x A y} {(\textsc{pse})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \arr s A t}{\Gamma \vdash_{\mathsf{ps}} t : A} {(\textsc{psd})} \and \inferrule{\Gamma \vdash_{\mathsf{ps}} x : \star}{\Gamma \vdash_{\mathsf{ps}}} {(\textsc{ps})} \end{mathpar} \caption{Rules for ps-contexts.} \label{fig:ps-context} \end{figure} \begin{example} \Cref{judg:ps} is given by the following derivation: \[ \begin{prooftree} \hypo{ } \infer1[(\textsc{pss})]{(x : \star) \vdash_{\mathsf{ps}} x : \star} \infer1[(\textsc{pse})]{(x : \star), (y : \star), (f : \arr x \star y) \vdash_{\mathsf{ps}} f : \arr x \star y} \infer1[(\textsc{psd})]{(x : \star), (y : \star), (f : \arr x \star y) \vdash_{\mathsf{ps}} y : \star} \infer1[(\textsc{pse})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} g : \arr y \star z} \infer1[(\textsc{psd})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}} z : \star} \infer1[(\textsc{ps})]{(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \vdash_{\mathsf{ps}}} \end{prooftree}\] Each application of (\textsc{pse}) extends the context with a fresh variable \(y\), together with a new variable \(f\) from the current dangling variable to \(y\). The rule (\textsc{psd}) encodes that if we can attach a variable to \(f : x \to y\), then we can also attach a variable to \(y\). The rule (\textsc{ps}) forces as many (\textsc{psd}) rules to be applied as possible before completing the derivation, ensuring that derivations of ps-contexts are unique. \end{example} We now state the following theorem, which follows immediately from~\cite[Theorem~53]{benjamin2021globular}. \begin{theorem} The set of ps-contexts is in bijection with the set of pasting diagrams. \end{theorem} In order to use ps-contexts as our notion of pasting diagram, we need to be able to identify the source and target variables of each ps-context. This will be done by specifying the dimension \(i\) source and target of each ps-context. More precisely, for each ps-context \(\Gamma\) and \(i \in \mathbb{N}\), we define a ps-context \(\bound i \Gamma\) and subcontext inclusions: \[ \incbd i - \Gamma : \bound i \Gamma \to \Gamma \qquad \text{and}\qquad \incbd i + \Gamma : \bound i \Gamma \to \Gamma\] Intuitively, the context \(\bound i \Gamma\) can be constructed by removing any variables of dimension greater than \(i\) from \(\Gamma\), and quotienting the dimension \(i\) variables by the (symmetric transitive closure of the) relation \(x \sim y\) if there exists an \(f : x \to y\). The inclusions then send this quotiented variable to the variable appearing first in the equivalence class for the source inclusion, and the variable appearing last in the class for the target inclusion.
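For instance, for the ps-context of \cref{judg:ps}, which we call \(\Delta\), taking \(i = 0\) removes the \(1\)-cells \(f\) and \(g\) and identifies \(x \sim y \sim z\), giving (as may be checked against the recursive definition below):
\[\bound 0 \Delta = (x : \star) \qquad \incbd 0 - \Delta = \langle x \rangle \qquad \incbd 0 + \Delta = \langle z \rangle\]
with the source inclusion picking out the first variable \(x\) of the equivalence class, and the target inclusion the last variable \(z\).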
These contexts and substitutions can be defined by recursion on the context \(\Gamma\): \begin{align*} \bound i {(x : \star)} &= {(x : \star)}\\ \bound i {\Gamma, (y : A), (f : \arr x A y)} &= \begin{cases*} \bound i \Gamma&if \(i \leq \dim(A)\)\\ \bound i \Gamma, (y : A), (f : \arr x A y)&otherwise \end{cases*}\\ \incbd i \epsilon {(x : \star)} &= \langle x \rangle\\ \incbd i \epsilon {\Gamma, (y : A) , (f : \arr x A y)} &= \begin{cases*} \mathrlap{\incbd i \epsilon \Gamma}{\phantom{\bound i \Gamma, (y : A), (f : \arr x A y)}}&if \(i < \dim(A)\)\\ \incbd i - \Gamma&if \(i = \dim(A)\) and \(\epsilon = -\)\\ \replace(\incbd i + \Gamma, y)&if \(i = \dim(A)\) and \(\epsilon = +\)\\ \langle \incbd i \epsilon \Gamma, y, f \rangle &otherwise \end{cases*} \end{align*} where \(\epsilon \in \{-,+\}\) and \(\replace(\langle \sigma, s \rangle, t) = \langle \sigma, t \rangle\). As it will be common to take the boundary of \(\Gamma\) at the dimension below the dimension of \(\Gamma\) itself, we write \[\incbd {} \epsilon \Gamma = \incbd {\dim(\Gamma) - 1} \epsilon \Gamma\] when \(\dim(\Gamma)\) is not zero. In the original \Catt paper, these inclusion substitutions are not given and instead the source and target variables are given directly as subcontexts. It can be easily checked that the free variables of the inclusions are equal to the subcontexts, and that the free variable sets of these inclusions are downwards closed. It is known, e.g.\ from~\cite[Lemma~55]{benjamin2021globular}, that these constructions agree with the constructions of the source and target pasting diagrams in \cref{sec:pasting-diagrams}. We state the following well-known result (see~\cite{finster2017type}) about isomorphisms between pasting contexts. \begin{proposition} \label{prop:ps-context-iso} Let \(\Gamma\) and \(\Delta\) be ps-contexts and suppose \(\sigma : \Gamma \to \Delta\) is an isomorphism. Then \(\Gamma \equiv \Delta\) and \(\sigma\) is the identity substitution. \end{proposition} \subsection{Typing for \Catt} \label{sec:typing-catt} We now have all the prerequisites in place to state the typing rules for \Catt. These take the form of 4 judgements (not including the judgements for ps-contexts introduced in \cref{sec:ps-contexts}): \begin{alignat*}{2} &\Gamma \vdash&\qquad&\text{\(\Gamma \in \Ctx\) is a well-formed context.}\\ &\Gamma \vdash A&&\text{\(A \in \Type_\Gamma\) is a well-formed type in context \(\Gamma\).}\\ &\Gamma \vdash t : A &&\text{\(t \in \Term_\Gamma\) is a well-formed term of type \(A \in \Type_\Gamma\).}\\ &\Gamma \vdash \sigma : \Delta &&\text{\(\sigma : \Delta \to \Gamma\) is a well-formed substitution.} \end{alignat*} The typing rules for these judgements are then given in \cref{fig:catt-typing}. As most of these are standard we draw attention to a couple of the key rules. The rule for arrow types ensures that both the source and target of the arrow themselves have the same type, namely the one given in the subscript of the arrow. This effectively ensures the globular nature of the type theory, as given a term \(f : \arr s {\arr x A y} t\), both the source of the source and source of the target are \(x\), and both the target of the source and target of the target are \(y\). 
\begin{figure}[ht] \centering \begin{mathpar} \inferrule{ }{\emptyset \vdash} \and \inferrule{\Gamma \vdash\\ \Gamma \vdash A}{\Gamma, (x : A) \vdash} \and \inferrule{ }{\Gamma \vdash \star} \and \inferrule{\Gamma \vdash s : A \\ \Gamma \vdash A \\ \Gamma \vdash t : A}{\Gamma \vdash \arr s A t} \\ \inferrule{ }{\Gamma \vdash \langle\rangle : \emptyset} \and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash t : A\sub\sigma}{\Gamma \vdash \langle \sigma , t \rangle : \Delta, (x : A)} \and \inferrule{(x : A) \in \Gamma}{\Gamma \vdash x : A} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\\dim(\Delta) \neq 0\\\Supp(s) = \Supp(\incbd {} - \Delta)\\\Supp(t) = \Supp(\incbd {} + \Delta)}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\\Supp(s) = \Supp(t) = \Var(\Delta)}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \end{mathpar} \caption{Typing rules for \Catt.} \label{fig:catt-typing} \end{figure} There are two rules given for typing coherences, corresponding to the two guiding principles for categories from \cref{sec:weak}. The first rule allows composites to be typed and the second allows equivalences to be typed. In both, the ps-context \(\Delta\) corresponds to the pasting diagram \(P\), the terms \(s\) and \(t\) correspond to the operations \(f\) and \(g\) over \(P\) (with the judgement \(\Delta \vdash \arr s A t\) enforcing that they are parallel), and the conditions involving support give the remaining side conditions. By a straightforward mutual induction we can prove that application of substitution to terms, types, and other substitutions preserves typing. Therefore, the \emph{syntactic category} of \Catt can be formed, which contains well-formed contexts as objects and well-formed substitutions between these contexts as morphisms; by an abuse of notation, we also call this category \textsf{Catt}. There is a full subcategory \(\mathsf{Catt}^{\mathsf{ps}}\), which only contains the contexts which are ps-contexts. \begin{theorem} The category \(\mathsf{Catt}^{\mathsf{ps}}\) is a coherator for \(\infty\)-categories. \end{theorem} \begin{proof} Follows from \cite[Theorem~73]{benjamin2021globular}, noting that the opposite convention for substitution is used in that paper. \end{proof} Thus, we immediately get that a presheaf over \(\mathsf{Catt}^{\mathsf{ps}}\) which preserves globular products is an \(\infty\)-category (using the Maltsiniotis definition). Further, presheaves of this form are equivalent to type-theoretic models of \Catt by \cite[Theorem~88]{benjamin2021globular}, meaning type-theoretic models of \Catt are \(\infty\)-categories. \subsection{Basic constructions} \label{sec:basic-constructions} We now introduce some basic categorical operations in order to give some early examples of terms in \Catt. Suppose we have terms \(a : \arr s \star t\) and \(b : \arr t \star u\) in some context \(\Gamma\). Then the ps-context \[ \Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] from \cref{judg:ps} can be used to form the 1-composite: \[ a *_0 b = \Coh \Delta {\arr x \star z} {\langle s, t, a, u, b \rangle}\] It is often not necessary to give all the terms in a substitution, especially when the substitution is from a pasting diagram (or more generally a globular set).
In these cases it is sufficient to give terms for the \emph{locally maximal} variables of the context, those that do not appear as the source or target of another variable. For \(\Delta\), the locally maximal variables are \(f\) and \(g\), and so it suffices to give the substitution above as \(\langle a , b \rangle\), with the rest of the terms being inferable. The disc contexts \(D^n\) can be formed in \Catt as the analogue of the disc globular sets given in \cref{ex:disc} and satisfy the property that a substitution from a disc context \(D^n\) contains the same data as a term and an \(n\)-dimensional type. Given a term \(t\) of type \(A\) in context \(\Gamma\), we write this substitution \(\{A,t\} : D^{\dim(A)} \to \Gamma\). All disc contexts are ps-contexts. Using these, the identity can be formed on a term \(t\) of type \(A\) in \(\Gamma\): \[\id(A,t) = \Coh {D^n} {\arr {d_n} {} {d_n}} {\{A, t\}}\] where \(\dim(A) = n\), which is typed using the rule for equivalences. The structure of this term changes for different values of \(n\), and we will relate these different terms in \cref{sec:suspension}. As before, the non-locally maximal elements of a substitution can be inferred, and so we may write \(\id(t)\) or \(\{t\}\) when the type \(A\) is inferable. In \Catt, all types are inferable, though later when we consider semistrict variations of \Catt it may be necessary to specify the exact type we are using up to syntactic equality. \paragraph{Standard coherences} The composite and identity above form part of a more general collection of coherences, which we call \emph{standard coherences}. \begin{definition} Given a pasting diagram \(\Delta\), we mutually define for all \(n\) the \emph{standard coherence} \(\stdcoh\Delta n\), the \emph{standard term} \(\stdtm \Delta n\), and the \emph{standard type} \(\stdty \Delta n\): \begin{alignat*}{2} &\stdcoh \Delta n &&= \Coh \Delta {\stdty \Delta n} {\id_\Delta}\\ &\stdtm \Delta n &&= \begin{cases} d_n &\text{when \(\Delta\) is the disc \(D^n\)}\\ \stdcoh \Delta n &\text{otherwise} \end{cases}\\ &\stdty \Delta 0 &&= \star\\ &\stdty \Delta {n+1} &&= \arr {\stdtm {\bound n \Delta} n \sub {\incbd n - \Delta}} {\stdty \Delta n} {\stdtm {\bound n \Delta} n \sub {\incbd n + \Delta}} \end{alignat*} The standard type takes the standard term over each boundary of \(\Delta\), includes these all back into \(\Delta\) and assembles them into a type. When \(n = \dim(\Delta)\) we will refer to the standard coherence as the \emph{standard composite}. \end{definition} Intuitively, the standard coherence \(\stdcoh \Delta n\) is the canonical composite in dimension \(n\) of the pasting diagram \(\Delta\). To form the coherence, a type is needed, for which the standard type \(\stdty \Delta n\) is used. The standard term \(\stdtm \Delta n\) is used as a variant of the standard coherence which special-cases disc contexts. This avoids the standard type containing unary composites and allows standard composites (of non-disc contexts) to be normal forms of the reduction systems that will be described in \cref{cha:cattstrict}. It is immediate that the composite of \(1\)-cells \(a *_0 b\) is given by \(\stdcoh \Delta 1\sub{\langle a , b \rangle}\) and the identity on a term \(t\) of dimension \(n\) is given by \(\stdcoh {D^n} {n+1}\sub{\{t\}}\). This construction can be used to generate all the composites in the definition of a strict \(\infty\)-category.
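To check this intuition against the definitions, consider again the ps-context \(\Delta\) from \cref{judg:ps}. Its \(0\)-boundary \(\bound 0 \Delta = (x : \star)\) is the disc \(D^0\) up to renaming, with inclusions \(\langle x \rangle\) and \(\langle z \rangle\), so unwinding the mutual recursion gives:
\[\stdty \Delta 1 = \arr x \star z \qquad \stdcoh \Delta 1 = \Coh \Delta {\arr x \star z} {\id_\Delta}\]
which is precisely the coherence used to define \(a *_0 b\) above.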
For example the vertical composite of \(2\)-cells is the standard composite over the context given by the diagram: \[ \begin{tikzcd} x && y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["\alpha", shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] and the horizontal composite of \(2\)-cells is the standard composite over: \[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha", shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta", shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \] We note that the standard type over the above diagram has source \(f * h\) and target \(g * i\); these are themselves standard composites, demonstrating the mutually recursive behaviour of these constructions. \begin{remark} Above we gave two ps-contexts by drawing a diagram of the globular set that they represent. Ps-contexts fix the order that variables occur in and as such the mapping from ps-contexts to globular sets is injective. The use of diagrams to define ps-contexts is therefore unambiguous. \end{remark} \paragraph{Further examples} The substitution component of a coherence allows operations to be combined into compound operations. Consider the (ps-)context given by the following diagram: \[\Gamma = \begin{tikzcd} s & t & u & v \arrow["a", from=1-1, to=1-2] \arrow["b", from=1-2, to=1-3] \arrow["c", from=1-3, to=1-4] \end{tikzcd} \] There are (at least) 3 ways to compose together the elements of this context. We could take the unbiased ternary composite \(a * b * c = \stdcoh \Gamma 1\sub{\langle a, b, c\rangle}\), but could also construct either biased composite: \begin{align*} (a * b) * c &= \stdcoh \Delta 1\sub{\langle \stdcoh \Delta 1\sub{\langle a,b\rangle}, c\rangle}\\ a * (b * c) &= \stdcoh \Delta 1\sub{\langle a, \stdcoh \Delta 1\sub{\langle b, c\rangle}\rangle} \end{align*} Using the equivalence typing rule, we can relate these biased composites with the following term: \[ \alpha_{a,b,c} = \Coh \Gamma {\arr {(a * b) * c} {} {a * (b * c)}} {\id_\Gamma}\] which is the associator. Similarly, for a term \(f : \arr x \star y\), unitors can be formed over the disc context \(D^1\) using the equivalence rule: \begin{align*} \lambda_f &= \Coh {D^1} {\arr {\id(d_0^-) * d_1} {} {d_1}} {\{f\}}\\ \rho_f &= \Coh {D^1} {\arr {d_1 * \id(d_0^+)} {} {d_1}} {\{f\}} \end{align*} The remainder of the operations for a 2-category can be defined similarly, as each displays the equivalence of two terms built over a pasting diagram. We observe that both the unitors and associator (as well as any coherence typed with the equivalence rule) are trivially invertible. \subsection{Suspension} \label{sec:suspension} To end this section, we introduce the meta-operation of \emph{suspension}, as described for \Catt by \citeauthor{benjamin2020type}~\cite{benjamin2020type}.
Suspension takes any piece of syntax as input and produces one with a dimension one higher. It can be used as an aid to defining operations in \Catt, but will also form a key part of the formal development of the constructions described in \cref{sec:operations-catt}. Suspension is inspired by the identically named operation on topological spaces. Given a topological space \(X\), its suspension \(\Sigma X\) is formed by quotienting the space \(X \times [0,1]\) by the relation that identifies all points of the form \((x,0)\) for \(x \in X\), and separately identifies all points of the form \((x,1)\) for \(x \in X\). The suspension of a space \(X\) can alternatively be viewed as the space containing two distinguished points \(N\) and \(S\), and a path from \(N\) to \(S\) for each point \(x \in X\). The names \(N\) and \(S\) stand for north and south, as the suspension of a circle can be visualised as a globe, with \(N\) and \(S\) being the north and south pole and each of the paths between them being a meridian. A similar operation can be applied to globular sets. Given a globular set \(G\), its suspension \(\Sigma G\) is obtained by shifting the dimension of every \(n\)-cell up by one (making it into an \((n+1)\)-cell), adding two new \(0\)-cells \(N\) and \(S\), and letting the source of every \(1\)-cell be \(N\) and the target be \(S\). The globularity conditions for this construction can be quickly verified. This construction extends to all computads~\cite{benjamin2024duamity}, and can be defined in \Catt by mutually defining the operation on contexts, types, terms, and substitutions. \begin{definition} For contexts \(\Gamma \in \Ctx\), types \(A \in \Type_\Gamma\), terms \(t \in \Term_\Gamma\), and substitutions \(\sigma : \Delta \to \Gamma\), we define their \emph{suspensions} \(\Sigma(\Gamma) \in \Ctx\), \(\Sigma(A) \in \Type_{\Sigma(\Gamma)}\), \(\Sigma(t)\in \Term_{\Sigma(\Gamma)}\), and \(\Sigma(\sigma) : \Sigma(\Delta) \to \Sigma(\Gamma)\) by mutual recursion. \begin{align*} \Sigma (\emptyset) &= (N : \star), (S : \star) &\Sigma (\Gamma, (x : A)) &= \Sigma \Gamma, (x : \Sigma A)\\ \Sigma (\star) &= \arr N \star S &\Sigma (\arr s A t) &= \arr {\Sigma s} {\Sigma A} {\Sigma t}\\ \Sigma(\langle \rangle) &= \langle N, S \rangle &\Sigma(\langle \sigma, t \rangle) &= \langle \Sigma(\sigma), \Sigma(t) \rangle\\ \Sigma (x) &= x &\Sigma (\Coh \Delta A \sigma) &= \Coh {\Sigma(\Delta)} {\Sigma(A)} {\Sigma(\sigma)} \end{align*} where \(x\) is a variable of \(\Gamma\). \end{definition} The dimension shift of suspension is driven by the cases for types, especially the case for the base type \(\star\), which returns a type of dimension \(1\), namely \(\arr N \star S\), using the two new variables \(N\) and \(S\). We note that the suspension of any ps-context is also a ps-context, and in general the suspension of any piece of well-formed \Catt syntax is again well-formed. These results are given in \cite[Section~3.2]{benjamin2020type}, but will be proved in \cref{sec:ruleset} in more generality. We can now investigate the action of suspension on the operations we have already defined. Take the context: \[ (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] used in \cref{sec:basic-constructions} to generate 1-composition.
Applying suspension to this context gives: \[ \begin{tikzcd} N && S \arrow[""{name=0, anchor=center, inner sep=0}, "x"', curve={height=24pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "z", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "y"{description}, from=1-1, to=1-3] \arrow["f"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["g"', shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] which is the context used to generate vertical 2-composition. Furthermore, applying suspension directly to the 1-composition operation yields the vertical 2-composition operation. The suspension of each disc context \(D^n\) is (up to \(\alpha\)-renaming) \(D^{n+1}\). It can be checked that applying suspension to the identity operation for \(n\)-dimensional terms returns the identity operation for \((n+1)\)-dimensional terms. Repeating this logic, all identity operations can be obtained as iterated suspensions of the identity for \(0\)-cells. The following more general result about standard coherences holds: \begin{proposition} The following syntactic equalities hold: \[\Sigma(\stdcoh \Delta n) \equiv \stdcoh {\Sigma(\Delta)} {n+1}\qquad \Sigma(\stdtm \Delta n) \equiv \stdtm {\Sigma(\Delta)} {n+1}\qquad \Sigma(\stdty \Delta n) \equiv \stdty {\Sigma(\Delta)} {n+1}\] for all ps-contexts \(\Delta\) and \(n \in \mathbb{N}\). \end{proposition} The proofs of these results are deferred to \cref{sec:operations-catt}, where we will have more tools for dealing with these constructions. \chapter{A formalised presentation of \Catt with equality} \label{cha:gener-pres-catt} The main purpose of this chapter will be to define the family of type theories \Cattr, which extend the base type theory \Catt with a specified set \(\mathcal{R}\) of equality rules. These equality rules equate various terms of the theory, which unifies the corresponding operations in their models, allowing us in \cref{cha:cattstrict} to generate type theories that model semistrict categories, categories where some but not all structure is strictified. This chapter will also introduce the Agda formalisation~\cite{alex_rice_2024_10964565} which accompanies this thesis and compiles with Agda v2.6.4 and standard library v2.0. The formalisation implements the syntax and typing judgements of \Cattr, and contains proofs of most results in this chapter and \cref{sec:operations-catt}. By formalising \Cattr, instead of the more specific type theories \Cattsu and \Cattsua introduced in \cref{sec:cattsu,sec:cattsua}, the formalisation of many results can be applied to both type theories. This also allows these results to be applied to any future type theories of this form. A dependency graph of the formalisation is given in \cref{fig:dep-graph}, and an online version of this graph can be found at \url{https://alexarice.github.io/catt-agda/dep-graph.svg} for which each node is a clickable link to an HTML version of the code. This graph was generated by processing the dependency graph output of Agda with the tool \textsf{sd-visualiser}~\cite{sd-visualiser}. \section{Extended substitution} \label{sec:extend-subst} \Cattr uses the same syntax as \Catt with one exception. In \Cattr we make a natural generalisation to substitutions, which will allow more operations to be defined for working with the suspension operation introduced in \cref{sec:suspension}.
Unfortunately, the full utility of this generalisation will not be realised until \cref{sec:structured-terms}, but we choose to introduce it here as it forms a core part of the syntax, and requires little modification to the rules of the type theory. We recall that the suspension operation \(\Sigma\) acts on contexts, substitutions, types, and terms. Given a substitution \(\sigma : \Delta \to \Gamma\), its suspension \(\Sigma(\sigma)\) has domain \(\Sigma(\Delta)\) and codomain \(\Sigma(\Gamma)\). When we define trees and tree labellings in \cref{sec:operations-catt}, which will be used to define the insertion operation in \cref{sec:insertion}, we will need to be able to define substitutions from suspended contexts to arbitrary contexts. More generally, we would like to be able to describe substitutions of the form: \[ \Sigma^n(\Delta) \to \Gamma\] where \(\Sigma^n(\Delta)\) is the operation that applies suspension \(n\) times to \(\Delta\). Consider the data contained in a substitution \(\tau : \Sigma(\Delta) \to \Gamma\). There are two terms \(N \sub \tau\) and \(S \sub \tau\) of type \(\star\), and then a term for each variable of \(\Delta\). Temporarily ignoring the typing conditions for substitutions, we see that the data is equivalent to a substitution from \(\Delta\) to \(\Gamma\) and two additional terms. If we now consider a substitution \(\tau : \Sigma(\Sigma(\Delta)) \to \Gamma\), we notice that there is a term in \(\Gamma\) for each variable of \(\Delta\), as well as two terms \(s = N \sub \tau\) and \(t = S \sub \tau\) for the outer suspension and terms \(u = N' \sub \tau\) and \(v = S' \sub \tau\) for the inner suspension. As before, the terms \(s\) and \(t\) should have type \(\star\), but the terms \(u\) and \(v\) should have type \(\arr s \star t\). We note that this is the exact condition needed for \(\arr u {\arr s \star t} v\) to be a well-formed type. This motivates the notion of an \emph{extended substitution}, which is obtained by equipping a substitution with a type. We have not yet determined the typing conditions required on the substitution part of these extended substitutions. We return to the example of a substitution \(\tau : \Sigma^2(\Delta) \to \Gamma\), and suppose that \(\Delta\) has a variable \(x\) of type \(\star\). In \(\Sigma^2(\Delta)\), \(x\) has the type \(\arr {N'} {\arr N \star S} {S'}\), and so \(x\) should be sent to a term of type \(\arr u {\arr s \star t} v\), the type portion of the extended substitution. In a substitution \(\sigma : \Delta \to \Gamma\), \(x\) would be sent to a term of type \(\star \sub \sigma\), which suggests that the application of substitution to types should be redefined so that \(\star \sub \sigma\) is the type part of the extended substitution. This one change to the application of substitution to types is sufficient to generalise from substitutions to extended substitutions. An extended substitution \(\sigma : \Delta \to \Gamma\) then has the following intuition: the substitution part specifies where each variable in \(\Delta\) should be sent, and the type part specifies where the base type \(\star\) should be sent. The other cases for the application of substitution extend this to all terms, types, and (extended) substitutions as before. The extended substitution \(\sigma\) then represents a standard substitution from \(\Sigma^n(\Delta)\) to \(\Gamma\), where \(n\) is the dimension of the type part of \(\sigma\). Hence, a regular substitution can be recovered as an extended substitution with type part \(\star\).
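As a small example, take \(\Delta = (x : \star)\), so that \(\Sigma(\Delta) = (N : \star), (S : \star), (x : \arr N \star S)\). A substitution \(\tau : \Sigma(\Delta) \to \Gamma\) sending \(N \mapsto s\), \(S \mapsto t\), and \(x \mapsto f\) corresponds to the extended substitution with type part \(\arr s \star t\) and substitution part sending \(x \mapsto f\): the type part records where the base type \(\star\) is sent, and the substitution part records where the variables of \(\Delta\) are sent.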
We modify the syntax of \Catt as follows, and will refer to these extended substitutions simply as substitutions, as extended substitutions are a direct generalisation of substitutions, and the notion of substitution is still recoverable by setting the type part to \(\star\): \begin{itemize} \item Substitutions will now be fibred over a type of their codomain context, which we will write \(\sigma : \arr \Delta A \Gamma\) where \(A \in \Type_\Gamma\). We note that this allows us to specify that \(\sigma\) is a regular substitution by writing \(\sigma : \arr \Delta \star \Gamma\). \item The constructor \(\langle\rangle\) is removed, and is replaced by the constructor \(\langle A \rangle : \arr \emptyset A \Gamma\), where \(A \in \Type_\Gamma\). Adding a term to a substitution preserves the type of the substitution. As before we may write a substitution \(\langle \langle \langle A \rangle, s \rangle, t \rangle\) as \(\langle A , s, t\rangle\). We let \(\FV(\langle A \rangle) = \FV(A)\). \item An operation \(\ty(\sigma)\) is introduced that returns the type portion of a substitution. For \(\sigma : \arr \Delta A \Gamma\), we have \(\ty(\sigma) = A\). \item Coherences \(\Coh \Delta A \sigma \in \Term_\Gamma\) are restricted so that \(\sigma\) is a regular substitution. In other words, \(\ty(\sigma)\) must be \(\star\) for \(\sigma\) to appear in a coherence. While this condition could be dropped, it is convenient to keep the same operations as \Catt. \end{itemize} To witness the equivalence of extended substitutions \(\Delta \to \Gamma\) and regular substitutions \(\Sigma^n(\Delta) \to \Gamma\), we introduce new operations. \begin{definition} For a substitution \(\sigma : \arr {\Delta} {\arr s A t} \Gamma\), we define its \emph{unrestriction}: \[\unrestrict\sigma : \arr {\Sigma(\Delta)} A \Gamma\] by induction on the length of \(\Delta\): \begin{align*} \unrestrict \langle \arr s A t \rangle &= \langle A, s, t \rangle\\ \unrestrict \langle \sigma' , u \rangle &= \langle \unrestrict \sigma' , u \rangle \end{align*} The unrestrict operation simply moves two terms from the type part of the substitution into the main body of the substitution. \end{definition} To define the second operation, we first need to specify the changes to application of substitution: \begin{itemize} \item The composition of substitutions takes substitutions \(\sigma : \arr \Theta A \Delta\) and \(\tau : \arr \Delta B \Gamma\) to a substitution \(\sigma \bullet \tau : \arr \Theta {A \sub \tau} \Gamma\). \item For a substitution \(\sigma : \arr \Delta A \Gamma\), we define \(\star \sub{\sigma} = A\). \item As the substitution in a coherence must have type \(\star\), we define the application of an extended substitution \(\tau : \arr \Delta {\arr s A t} \Gamma\) to a coherence as: \[ \Coh \Theta A \sigma \sub \tau = \Coh {\Sigma(\Theta)} {\Sigma(A)} {\Sigma(\sigma)} \sub {\unrestrict \tau}\] The case for applying a regular substitution to a coherence remains unchanged. \end{itemize} We can now define an inverse to the unrestriction operation.
\begin{definition} For a substitution \(\sigma : \arr {\Sigma(\Delta)} A \Gamma\), its \emph{restriction} \[ \restrict \sigma : \arr \Delta {\arr {N \sub \sigma} A {S \sub \sigma}} \Gamma \] is defined by induction on the length of \(\Delta\): \begin{align*} \restrict \langle A, s, t \rangle &= \langle \arr s A t \rangle\\ \restrict \langle \sigma', u \rangle &= \langle \restrict \sigma', u \rangle \end{align*} Inversely to the unrestrict operation, the restrict operation moves two terms into the type part of the substitution. \end{definition} As restriction and unrestriction cancel each other, the suspension of the substitution \(\sigma : \arr \Delta \star \Gamma\) can be factored as \((\unrestrict \circ (\restrict \circ \Sigma)) (\sigma)\). We observe that the second part of this composition, \(\restrict \circ \Sigma\), is the operation that simply applies the suspension to each term in the substitution as well as the type of the substitution. This motivates the final definition of this section. \begin{definition} Let the \emph{restricted suspension} of a substitution \(\sigma : \arr \Delta A \Gamma\) be a substitution \[\Sigma'(\sigma) : \arr \Delta {\Sigma(A)} {\Sigma(\Gamma)}\] defined inductively by the equations: \begin{align*} \Sigma'(\langle A \rangle) &= \langle \Sigma(A)\rangle \\ \Sigma'(\langle \sigma' , t \rangle) &= \langle \Sigma'(\sigma'), \Sigma(t) \rangle \end{align*} The suspension of a substitution \(\tau : \arr \Delta \star \Gamma\) can be defined by \(\Sigma(\tau) = \unrestrict\Sigma'(\tau)\). \end{definition} For the rest of the thesis and the formalisation, the suspension on a substitution is defined as the composition of unrestriction and restricted suspension. \section[\texorpdfstring{\Cattr}{Cattr}: \Catt with equality]{\boldmath\texorpdfstring{\Cattr}{Cattr}: \Catt with equality} \label{sec:catt-with-equality} This section will define the type theory \Cattr, a variation of \Catt with specified equality rules. This section, in addition to the following sections in this chapter, will be used to motivate certain choices in the formalisation. All the preliminary definitions as well as syntax, typing, and equality rules are assembled in \cref{fig:cattr}. \subsection{Syntax} \label{sec:syntax} The syntax of \Cattr is based on the syntax of \Catt with the changes specified in \cref{sec:extend-subst}. This creates a dependency chain: the base syntax must be defined before suspension, and suspension before the application of substitution. In the formalisation these are defined in the following files: \begin{itemize} \item The core syntax is defined in \module{Catt.Syntax.Base}. \item Suspension is defined in \module{Catt.Suspension}. \item Other syntactic operations are defined in \module{Catt.Syntax}, which re-exports the core syntax. \end{itemize} To avoid any issues with \(\alpha\)-equivalence, especially as we have terms that contain contexts, we work with de Bruijn indices throughout the formalisation. This means that a context is simply a vector of types, a fixed-length list, which is given a nicer syntax. Variables are then simply bounded natural numbers, represented by the sets \(\mathsf{Fin}_n\), where \(\mathsf{Fin}_n\) is the set \(\{0,\dots,n-1\}\). Given a context \(A , B , C\), the variables over this context are simply \(\mathsf{var\ 0}\), which has type \(C\), \(\mathsf{var\ 1}\), which has type \(B\), and \(\mathsf{var\ 2}\), with type \(A\).
We note that \(3\) is not in \(\mathsf{Fin}_3\), and so \(\mathsf{var\ 3}\) is not a term of this context. Hence, we do not need to deal with unknown variables when applying substitutions. We will still make use of variable names in this text to aid readability, and will ignore any potential problems that could arise from this, knowing that the results are formalised in a setting where they do not appear. The formalisation also differs from the presentation in this text by the way that the various notions of syntax are fibred. We fibre contexts by a natural number representing their length, and then fibre terms, types, and substitutions over these lengths instead of fibring them over the contexts. We then get the following 4 syntactic classes defined as mutually inductive families, where \(\mathcal{U}\) is a type universe: \[ \funcn{Catt.Syntax.Base}{Ctx}{\Ctx} : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Ty}\Type : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Tm}\Term : \mathbb{N} \to \mathcal{U} \quad \funcn{Catt.Syntax.Base}{Sub}\Sub : (n\ m : \mathbb{N}) \to \Type_m \to \mathcal{U}\] This decision was made purely for convenience: by fibring over natural numbers instead of contexts, we sometimes avoid the need to provide explicit arguments to syntactic constructions. It comes with the drawback that the context must be provided for certain operations, such as the support of a piece of syntax, or the dimension of a term. One place an explicit argument can be avoided is when defining the weakening of a piece of syntax, an operation witnessing that for a piece of syntax living in a context \(\Gamma\), there is a copy living in \(\Gamma , A\) for any \(A\). These operations are defined in \module{Catt.Syntax} and take the following form, where we re-use the name \(\wk\) here as an abuse of notation: \[ \funcn{Catt.Syntax}{wk-tm}{\wk} : \Term_{\Gamma} \to \Term_{\Gamma, A}\quad\funcn{Catt.Syntax}{wk-ty}{\wk} : \Type_{\Gamma} \to \Type_{\Gamma, A}\quad \funcn{Catt.Syntax}{wk-sub}{\wk} : (\arr \Gamma B \Delta) \to (\arr {\Gamma} {\wk(B)} {\Delta, A}) \] If terms are fibred over contexts then this type \(A\) must often be specified, though with the fibring over context length this is no longer necessary. When using de Bruijn indices, this operation is no longer the identity on terms, as each variable must be incremented, since the index in a variable counts from the end of the context. One might ask why de Bruijn levels (which index from the start of the context) were not used instead, but this would not solve our problem as \(\mathsf{Fin}_n\) is not a subtype of \(\mathsf{Fin}_{n+1}\) in Agda. Furthermore, using de Bruijn levels would cause the substitution application introduced in \cref{sec:syntax-catt} (and expanded in \cref{sec:extend-subst}) to compute poorly, due to the way substitutions are defined. The definition of weakening is given in \cref{fig:wk}. Weakening can be used to give a short inductive definition of the identity substitution, a substitution \(\id_\Gamma : \Gamma \to \Gamma\) which sends every variable to itself. In the inductive case \(\id_{\Gamma, (x : A)}\), it is clear that the variable \(x\) should be sent to \(x\), but the constructor for substitutions also requires a substitution \(\Gamma \to \Gamma, (x : A)\). This can be obtained by weakening a recursive call to the identity on \(\Gamma\).
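Concretely, in the extended substitution syntax of \cref{sec:extend-subst}, where \(\langle \star \rangle\) is the empty substitution with type part \(\star\), this definition of the identity substitution unfolds to the equations:
\[\id_\emptyset = \langle \star \rangle \qquad \id_{\Gamma, (x : A)} = \langle \wk(\id_\Gamma), x \rangle\]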
Similarly, an inclusion \(\Gamma \to \Gamma, (x : A)\) can be defined as \(\wk(\id_\Gamma)\), and applying this substitution is the same operation as weakening. To begin proving syntactic properties of \Cattr, we need a notion of syntactic equality. This will be written \(\Gamma \equiv \Delta\) for contexts \(\Gamma\) and \(\Delta\), and similarly for terms \(s\) and \(t\), types \(A\) and \(B\), and substitutions \(\sigma\) and \(\tau\). It is given by \(\alpha\)-equivalence, and so we would hope that the formalisation could leverage de Bruijn indices and use the built-in equality type for syntactic equality. This is too restrictive, however: there will be many times when we want to compare two terms over contexts of differing length (in practice these context lengths will be propositionally equal, rather than definitionally equal). Therefore, four syntactic equality relations are defined mutually inductively on the constructors of each piece of syntax in \module{Catt.Syntax.Properties}. These definitions can easily be made heterogeneous, allowing two terms \(s : \Term_n\) and \(t : \Term_m\) to be compared. Unfortunately, using these comes at the cost of large amounts of boilerplate, as these inductively defined equalities do not come equipped with the J-rule, and so it must be manually proved that each operation respects syntactic equality. An example of such a function is \funcn{Catt.Syntax.Properties}{wk-tm-≃}{wk-tm-\(\simeq\)}, which states that the weakenings of two syntactically equal terms are syntactically equal. \module{Catt.Syntax.Properties} contains many of the basic properties about the syntax of \Cattr, including: \begin{itemize} \item Syntactic equality is decidable. \item Syntactic equality is propositional: there is at most one proof of \(s \equiv t\). \item Functoriality of suspension. \item Interaction of weakening with substitution application. We have \(\wk(s) \sub {\langle \sigma , t \rangle} \equiv s \sub \sigma\) and \(s \sub {\wk(\sigma)} \equiv \wk(s \sub \sigma)\) and analogous lemmas for the application of substitution to types and substitutions. \end{itemize} It also contains the following proposition. \begin{proposition} \label{prop:categorical} Application of substitution is associative and unital with respect to the identity substitution. More precisely, given substitutions \(\sigma : \arr \Theta A \Delta\) and \(\tau : \arr \Delta B \Gamma\), the following equalities hold: \begin{mathpar} A \sub \sigma \sub \tau \equiv A \sub {\sigma \bullet \tau} \and A \sub \id_\Theta \equiv A\\ t \sub \sigma \sub \tau \equiv t \sub {\sigma \bullet \tau} \and t \sub \id_\Theta \equiv t\\ (\mu \bullet \sigma) \bullet \tau \equiv \mu \bullet (\sigma \bullet \tau) \and \mu \bullet \id_\Theta \equiv \mu \and \id_\Xi \bullet \mu \equiv \mu \end{mathpar} for types \(A \in \Type_\Theta\), terms \(t \in \Term_\Theta\), and substitutions \(\mu : \arr \Xi C \Theta\). \end{proposition} \begin{proof} The last equation is a simple induction on \(\mu\) (and the context \(\Xi\)). Both the unitality equations and associativity equations, as with the vast majority of syntactic proofs, are given by mutual induction on types, terms, and substitutions. The only difficult case is: \[ \Coh \Theta C \mu \sub \sigma \sub \tau \equiv \Coh \Theta C \mu \sub {\sigma \bullet \tau} \] where the type part of \(\sigma: \arr \Theta A \Delta\) or \(\tau : \arr \Delta B \Gamma\) is not \(\star\).
First suppose \(B = \arr s {B'} t\) but \(A = \star\): \begin{align*} \Coh \Theta C \mu \sub \sigma \sub \tau &\equiv \Coh \Theta C {\mu \bullet \sigma} \sub \tau\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu \bullet \sigma)} \sub {\unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu) \bullet \Sigma(\sigma)} \sub {\unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\Sigma(\sigma) \bullet \unrestrict \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict (\sigma \bullet \tau)}\\ &\equiv \Coh {\Theta} {C} {\mu} \sub {\sigma \bullet \tau} \end{align*} where the second to last line is given by the property \[\unrestrict (\sigma \bullet \tau) \equiv \Sigma(\sigma) \bullet \unrestrict \tau\] which holds for all \(\sigma : \arr \Theta \star \Delta\) and is proven in \funcn{Catt.Syntax.Properties}{↓-comp}{\textsf{\(\downarrow\)-comp}}, and the line before is given by the inductive hypothesis. If instead we had \(A = \arr s {A'} t\), then: \begin{align*} \Coh \Theta C \mu \sub \sigma \sub \tau &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict \sigma} \sub \tau\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict \sigma \bullet \tau}\\ &\equiv \Coh {\Sigma(\Theta)} {\Sigma(C)} {\Sigma(\mu)} \sub {\unrestrict (\sigma \bullet \tau)}\\ &\equiv \Coh \Theta C \mu \sub {\sigma \bullet \tau} \end{align*} where we use the inductive hypothesis after applying the equality \[ \unrestrict (\sigma \bullet \tau) \equiv \unrestrict \sigma \bullet \tau \] which holds for all \(\sigma : \arr \Theta {\arr s {A'} t} \Delta\) by \funcn{Catt.Syntax.Properties}{↓-comp-higher}% {\textsf{\(\downarrow\)-comp-higher}}. \end{proof} This proposition proves that the syntax of \Cattr forms a category, which we will not name, as we will instead work with the subcategory containing well-formed contexts and substitutions, introduced in the following sections. \paragraph{Discs} We finish our discussion of the syntax of \Cattr by giving formal definitions of disc and sphere contexts, some constructions on these, and their properties. This will allow these to be used as examples in following sections, and pre-empts the use of discs in the first two equality rules that we will introduce, disc removal and endo-coherence removal. We begin with the definitions of discs, spheres, and sphere types, which can be found in \module{Catt.Discs} as \func{Catt.Discs}{Disc}, \func{Catt.Discs}{Sphere}, and \func{Catt.Discs}{sphere-type}. We write the sphere type as \(U^n\), which is intentionally close to the notation of the standard type \(\mathcal{U}_\Delta^n\), as it will turn out that these coincide. \begin{definition} We mutually define the disc contexts \(D^n\), sphere contexts \(S^n\), and sphere type \(U^n \in \Type_{S^n}\). \begin{mathpar} D^n = S^n , (d_n^- : U^n) \and S^0 = \emptyset \and S^{n+1} = D^n , (d_n^+ : \wk(U^n)) \\ U^0 = \star \and U^{n+1} = \arr {d_n^-} {\wk(\wk(U^n))} {d_n^+} \end{mathpar} We will sometimes refer to the last variable of \(D^n\) as \(d_n\) instead of \(d_n^-\), given that there is no \(d_n^+\) in the context. \end{definition} We also characterise the substitutions from a sphere or disc. These are given by \func{Catt.Discs}{sub-from-sphere} and \func{Catt.Discs}{sub-from-disc} in the formalisation. \begin{definition} Let \(A : \Type_\Gamma\) be a type and suppose \(n = \dim(A)\).
Define the substitution \(\{A\} : S^n \to \Gamma\) inductively by: \[ \{\star\} = \langle \star \rangle \qquad \{\arr s A t\} = \langle \{ A \}, s, t \rangle\] Further, given a term \(t : \Term_\Gamma\), define the substitution \(\{A,t\} : D^n \to \Gamma\) by \(\{A, t\} = \langle \{A\}, t \rangle\). \end{definition} In \module{Catt.Discs.Properties}, various facts about these constructions are proved which we list below. \begin{lemma} \label{lem:disc-prop} The following hold: \begin{lemmaenum} \item \label{item:disc-prop-dim}\(\dim(D^n) = \dim(U^n) = n\) and \(\dim(S^n) = \max(n - 1, 0)\). \item \label{item:disc-prop-susp} \(\Sigma(D^n) \equiv D^{n+1}\), \(\Sigma(S^n) \equiv S^{n+1}\), and \(\Sigma(U^n) \equiv U^{n+1}\). \item \label{item:disc-prop-wk} \(\{\wk(A)\} \equiv \wk(\{A\})\) and \(\{\wk(A), \wk(t)\} \equiv \wk(\{A,t\})\). \item \label{item:disc-prop-sub-susp} \(\{\Sigma(A)\} \equiv \Sigma(\{A\})\) and \(\{\Sigma(A),\Sigma(t)\} \equiv \Sigma(\{A,t\})\). \item \label{item:disc-prop-sub-sub} \(\{A \sub \sigma\} \equiv \{A\} \bullet \sigma\) and \(\{A \sub \sigma,t \sub \sigma\} \equiv \{A,t\}\bullet \sigma\). \item \label{item:disc-prop-sub-from} \(U^n \sub{\{A\}} \equiv A\) and hence \(\wk(U^n)\sub{\{A,t\}} \equiv A\). \item For \(\tau : S^n \to \Gamma\), \(\tau \equiv \{U^n \sub \tau\}\). \item For \(\tau : D^n \to \Gamma\), \(\tau \equiv \{\wk(U^n) \sub \tau, d_n \sub \tau\}\). \end{lemmaenum} for all \(n \in \mathbb{N}\) and appropriate \(A\), \(t\), and \(\sigma\). \end{lemma} The last two statements finish the characterisation of substitutions from spheres and discs as all such substitutions are of the form \(\{A\}\) or \(\{A,t\}\) respectively. In \module{Catt.Discs.Pasting}, it is shown that \(D^n\) is a ps-context for each \(n\). Therefore, as in \cref{sec:basic-constructions}, the identity on a term \(t\) of type \(A\) can be defined as: \[ \id(A,t) = \Coh {D^n} {\arr {d_n} {\wk(U^n)} {d_n}} {\{A,t\}} \] where \(n = \dim(A)\). Many properties of identity terms can be easily derived from \cref{lem:disc-prop}. \subsection{Typing and equality} \label{sec:typing-equality} The typing rules for \Cattr differ from those from \Catt in three key ways: \begin{enumerate} \item The fixed conditions on the supports of the source and target of a coherence have been replaced by a set of operations \(\mathcal{O}\). Instead of having two typing rules for coherences, one for equivalences and one for composites, we simply have one typing rule and specify that a coherence \(\Coh \Delta {\arr s A t} \sigma\) can be well-formed when: \[ (\Delta, \Supp(s), \Supp(t)) \in \mathcal{O} \] This will be further motivated and explained in \cref{sec:support}. \item A definitional equality is added to the system, generated by a set of equality rules \(\mathcal{R}\), which specifies pairs of terms that should be equated. The equality takes the form of three new judgements: \begin{alignat*}{2} &\Gamma \vdash A = B&\qquad&\text{\(A, B \in \Type_\Gamma\) are equal in context \(\Gamma\).}\\ &\Gamma \vdash s = t &&\text{\(s, t \in \Term_\Gamma\) are equal in context \(\Gamma\).}\\ &\Gamma \vdash \tau = \sigma &&\text{\(\tau : \Theta \to \Gamma\) and \(\sigma : \Delta \to \Gamma\) are equal.} \end{alignat*} These judgements are all mutually defined (and are in fact mutually defined with the typing judgements). We may sometimes abbreviate these judgements to \(A = B\), \(s = t\), and \(\tau = \sigma\) when the contexts of each piece of syntax are clear.
\item The typing rules are adjusted to account for this definitional equality, via the addition of a conversion rule. \end{enumerate} The conversion rule is the only additional typing rule that must be added to \Cattr, and takes the following form: \begin{mathpar} \inferrule {\Gamma \vdash s : A \and \Gamma \vdash A = B}{\Gamma \vdash s : B}\textsc{conv} \end{mathpar} allowing the type of any term to vary up to definitional equality. This rule accounts for all the semistrict behaviour in the theories we introduce in \cref{cha:cattstrict}. By adding this rule, and allowing the type of a term to vary up to definitional equality instead of syntactic equality, we allow more terms in the theory to become composable. Suppose we have terms \(f : x \to y\) and \(g : y' \to z\). In \Catt, we would not be able to form the composite \(f * g\) of these terms, as \(y\) and \(y'\) are not the same. If we now suppose that \(\Gamma \vdash y = y'\), then it will follow that \(\Gamma \vdash (x \to y) = (x \to y')\), and so using the conversion rule we get: \begin{mathpar} \inferrule{\inferrule*{\Gamma \vdash f : x \to y \and \inferrule*{\Gamma \vdash y = y'}{\Gamma \vdash (x \to y) = (x \to y')}}{\Gamma \vdash f : x \to y'} \and \Gamma \vdash g : y' \to z}{\Gamma \vdash f * g : x \to z} \end{mathpar} We remark that adding definitional equality does not simply quotient the terms of the theory, but also allows new terms to be well-formed as above. The definitional equality judgements are given by the rules in \cref{fig:equality} and appear in the formalisation alongside the typing rules in \module{Catt.Typing}. These are generated by the set of \emph{equality rules} \(\mathcal{R}\), which is a set of triples of the form \((\Gamma, s, t)\) where \(\Gamma\) is a context and \(s,t \in \Term_\Gamma\). The key inference rule for equality is then: \begin{mathpar} \inferrule{\Gamma \vdash s : A \and (\Gamma,s,t) \in \mathcal{R}}{\Gamma \vdash s = t}\textsc{rule} \end{mathpar} which says that if a triple \((\Gamma, s, t)\) is in \(\mathcal{R}\), then \(\Gamma \vdash s = t\) if \(s\) is well-formed in \(\Gamma\). The typing prerequisite forces the definitions of equality and typing to be mutually defined, and ensures that we only apply our equality rules to well-behaved terms. We note the asymmetry of this rule, in that only the left-hand side is required to be well-formed. Every rule introduced in this thesis will take the form of some reduction from the left-hand side to the right-hand side, and we will be able to prove that typing for the right-hand side follows from typing for the left-hand side for every equality we consider. The converse may not hold in general, necessitating the condition on the left-hand side. This is similar to \(\beta\)-reduction in the \(\lambda\)-calculus, where an untypeable term can reduce to a simply typed term. The remainder of the inference rules for equality simply close the equality under each constructor, and under reflexivity, symmetry, and transitivity. It is only necessary to give symmetry and transitivity rules for terms, and a reflexivity rule for variables, with these properties following for the other judgements by simple induction. \begin{lemma} The definitional equality relations on terms, types, and substitutions are equivalence relations, for any \(\mathcal{R}\). \end{lemma} \begin{proof} Proofs of these are found in \module{Catt.Typing.Properties.Base}. \end{proof} It is also possible to prove that each term has a canonical type.
\begin{definition}
The \emph{canonical type} of a term \(t : \Term_\Gamma\), \(\ty(t)\), is defined by a case split on \(t\). If \(t\) is a variable then the canonical type is the corresponding type in the context \(\Gamma\). Otherwise, if \(t \equiv \Coh \Delta A \sigma\) then the canonical type is \(A \sub \sigma\).
\end{definition}

This can be used to show that the type of a well-formed term is unique up to definitional equality, and is equal to this canonical type.
\begin{lemma}
\label{lem:ty-unique}
If \(\Gamma \vdash s : A\), then \(\Gamma \vdash s : \ty(s)\) and \(\Gamma \vdash A = \ty(s)\). Further, if \(\Gamma \vdash s : A\) and \(\Gamma \vdash s : B\) then \(\Gamma \vdash A = B\).
\end{lemma}
\begin{proof}
We prove the first part by induction on the derivation \(\Gamma \vdash s : A\). If the derivation is derived from the conversion rule applied to \(\Gamma \vdash s : B\) and \(\Gamma \vdash A = B\), then by inductive hypothesis we have \(\Gamma \vdash s : \ty(s)\) and \(\Gamma \vdash B = \ty(s)\). By transitivity, we obtain \(\Gamma \vdash A = \ty(s)\) as required. The remaining cases are immediate, as the typing rules for variables and coherences assign exactly the canonical type. The second part follows directly from applying the first part to both derivations.
\end{proof}

Using the canonical type, we can define the canonical identity on a term.
\begin{definition}
\label{def:canonical-id}
Given a term \(t : \Term_\Gamma\), let its \emph{canonical identity} be given by:
\[ \id(t) \equiv \id(\ty(t), t)\]
This construction can be iterated, and we say that a term is an \emph{iterated canonical identity} if it is of the form \(\id^k(t)\) for some \(k\).
\end{definition}

There is not much more that can be proved about the definitional equality at this point without knowing more about the rule set \(\mathcal{R}\). In \cref{sec:ruleset}, certain conditions will be imposed on the set of equality rules that will allow further lemmas to be proved in considerable generality.

\paragraph{Disc removal}
We now give our first example of an equality rule, \emph{disc removal}. Disc removal removes unary composites, replacing them with the underlying term. We recall that for every \(n\), there exists the \(n\)-dimensional disc context \(D^n\), and that given a term \(t \in \Term_\Gamma\) and \(n\)-dimensional type \(A \in \Type_\Gamma\), there exists a substitution \(\{A,t\} : D^n \to \Gamma\). The unary composite of a term \(t\) of type \(A\) of dimension \(n\) is then the coherence:
\[\Coh {D^n} {\wk(U^n)} {\{A,t\}}\]
Disc removal equates this with the term \(t\), making the following rule admissible:
\begin{mathpar}
\inferrule{\Gamma \vdash t : A \\ \Gamma \vdash A}{\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} = t}\textsc{dr}
\end{mathpar}
with the removal of the disc coherence giving this equality rule its name.

Assembling disc removal into a rule set \(\mathcal{R}\) is straightforward, as it is possible to give a purely syntactic condition with no need to refer to typing.
\begin{definition}
The \emph{disc removal rule set}, \dr, is the set consisting of the triples:
\[ (\Gamma, \Coh {D^n} {\wk(U^n)} {\{A,t\}}, t) \]
for each context \(\Gamma\), type \(A : \Type_\Gamma\), and term \(t : \Term_\Gamma\) where \(n = \dim(A)\). A set of rules \(\mathcal{R}\) \emph{contains disc removal} if \(\dr \subseteq \mathcal{R}\). Further we say that \(\mathcal{R}\) \emph{has disc removal} if the rule \textsc{dr} holds in the generated theory.
\end{definition}
The inference rule \textsc{dr} follows from \textsc{rule} and the typing properties about discs which will be given in \cref{sec:ruleset}.
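As a concrete low-dimensional instance (writing named variables for readability, where the formalisation uses de Bruijn indices), suppose \(\Gamma \vdash f : \arr x \star y\). Up to the naming of variables, \(D^1 = (d_0^- : \star), (d_0^+ : \star), (d_1 : \arr {d_0^-} \star {d_0^+})\), and the substitution \(\{\arr x \star y, f\} : D^1 \to \Gamma\) sends \(d_0^-\) to \(x\), \(d_0^+\) to \(y\), and \(d_1\) to \(f\). Disc removal then gives the equality
\[ \Gamma \vdash \Coh {D^1} {\wk(U^1)} {\{\arr x \star y, f\}} = f \]
identifying the unary composite of \(f\) with \(f\) itself.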
We draw attention to the typing premise of \textsc{rule}. If we know that the unary composite of a term \(t\) is well-formed, then it follows that \(t\) itself must have been well-formed, but we cannot infer that the term \(\Coh {D^n} {\wk(U^n)} {\{A,t\}}\) is well-formed from \(t\) being well-formed. In particular, knowing that \(t\) is well-formed does not constrain \(A\) at all without knowing that the given type \(A\) is the type of \(t\). We must therefore include an additional typing premise if we want to avoid well-formed and non-well-formed terms being equated. \afterpage{% \clearpage% flush all other floats \ifodd\value{page} \else% \expandafter\afterpage% put it on the next page if this one is odd \fi {% \begin{figure}[hbtp] \centering \fbox{% \begin{subfigure}{0.47\textwidth} \begin{mathpar} \inferrule{ }{\star : \Type_\Gamma} \and \inferrule{x \in \Var(\Gamma)} {x : \Term_\Gamma} \and \inferrule{A : \Type_\Gamma}{\langle A \rangle : \emptyset \to \Gamma} \and \inferrule{ }{\emptyset : \Ctx} \and \inferrule{\Gamma : \Ctx \\ A : \Type_\Gamma}{\Gamma, (x : A) : \Ctx} \and \inferrule{\sigma : \arr \Delta A \Gamma \\ t : \Term_\Gamma \\ B : \Type_\Delta}{\langle \sigma , t \rangle : \arr {\Delta, (x : B)} A \Gamma} \and \inferrule{A : \Type_\Gamma \\ s : \Term_\Gamma \\ t : \Term_\Gamma} {\arr s A t : \Type_\Gamma} \and \inferrule{\\\\\Delta : \Ctx \\ A : \Type_\Delta \\ \sigma : \arr \Delta \star \Gamma}{\Coh \Delta A \sigma : \Term_\Gamma} \end{mathpar} \caption{Syntax.} \end{subfigure}} \hfill \fbox{% \begin{subfigure}{0.49\textwidth} \begin{mathpar} \inferrule{ }{\emptyset \vdash} \and \inferrule{\Gamma \vdash\\ \Gamma \vdash A}{\Gamma, (x : A) \vdash} \and \inferrule{ }{\Gamma \vdash \star} \and \inferrule{\Gamma \vdash s : A \\ \Gamma \vdash A \\ \Gamma \vdash t : A}{\Gamma \vdash \arr s A t} \and \inferrule{\Gamma \vdash A}{\Gamma \vdash \langle A \rangle : \emptyset} \and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash t : A\sub\sigma}{\Gamma \vdash \langle \sigma , t \rangle : \Delta, (x : A)} \and \inferrule{(x : A) \in \Gamma}{\Gamma \vdash x : A} \and \inferrule{\Gamma \vdash t : A\\ \Gamma \vdash A = B}{\Gamma \vdash t : B} \and \inferrule{\Delta \vdash_{\mathsf{ps}}\\ \Delta \vdash \arr s A t \\ \Gamma \vdash \sigma : \Delta\\(\Delta, \Supp(s), \Supp(t)) \in \mathcal{O}}{\Gamma \vdash \Coh \Delta {\arr s A t} \sigma : \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}} \end{mathpar} \caption{Typing.} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{0.9852\textwidth} \begin{mathpar} \inferrule{\Gamma \vdash s : A \\ (\Gamma, s, t) \in \mathcal{R}}{\Gamma \vdash s = t}\textsc{rule} \and \inferrule{x \in \Var(\Gamma)}{\Gamma \vdash x = x} \and \inferrule{\Gamma \vdash s = t}{\Gamma \vdash t = s} \and \inferrule{\Gamma \vdash s = t \\ \Gamma \vdash t = u}{\Gamma \vdash s = u} \and \inferrule{\Delta \vdash A = B \\ \Gamma \vdash \sigma = \tau}{\Gamma \vdash \Coh \Delta A \sigma = \Coh \Delta B \tau} \and \inferrule{ }{\Gamma \vdash \star = \star} \and \inferrule{\Gamma \vdash s = s' \\ \Gamma \vdash t = t' \\ \Gamma \vdash A = A'}{\Gamma \vdash \arr s A t = \arr {s'} {A'} {t'}}\and \inferrule{\Gamma \vdash A = B}{\Gamma \vdash \langle A \rangle = \langle B \rangle}\and \inferrule{\Gamma \vdash \sigma = \tau \\ \Gamma \vdash s = t}{\Gamma \vdash \langle \sigma, s \rangle = \langle \tau, t \rangle} \end{mathpar} \caption{Equality.} \label{fig:equality} \end{subfigure}} \vspace{7pt} \fbox{% \begin{subfigure}{0.47\textwidth} \vspace{3.7pt} \begin{mathpar} 
\inferrule{ }{(x : \star) \vdash_{\mathsf{ps}} x : \star} \and
\inferrule{\Gamma \vdash_{\mathsf{ps}} x : A}{\Gamma, (y : A), (f : \arr x A y) \vdash_{\mathsf{ps}} f : \arr x A y} \and
\inferrule{\Gamma \vdash_{\mathsf{ps}} x : \arr s A t}{\Gamma \vdash_{\mathsf{ps}} t : A} \and
\inferrule{\Gamma \vdash_{\mathsf{ps}} x : \star}{\Gamma \vdash_{\mathsf{ps}}}
\end{mathpar}
\caption{Ps-contexts.}
\end{subfigure}}
\hfill
\fbox{%
\begin{subfigure}{0.49\textwidth}
\begin{mathpar}
\FV(\star) = \emptyset \and
\FV(\langle A \rangle) = \FV(A) \\
\FV(x) = \{x\} \text{ for }x \in \Var \\
\FV(\Coh \Delta A \sigma) = \FV(\sigma) \\
\FV(\arr s A t) = \FV(s) \cup \FV(A) \cup \FV(t) \\
\FV(\langle \sigma , t \rangle) = \FV(\sigma) \cup \FV(t)
\end{mathpar}
\caption{Free variables.}
\end{subfigure}}
\caption{\Cattr: syntax, typing, and operations.}
\label{fig:cattr}
\end{figure}
\begin{figure}
\ContinuedFloat
\fbox{%
\begin{subfigure}{1\textwidth}
\begin{align*}
\DC_\emptyset(\emptyset) &= \emptyset\\
\DC_{\Gamma, (x : A)}(V) &= \begin{cases*} \DC_\Gamma(V)&if \(x \not\in V\)\\ \{x\} \cup \DC_\Gamma(V \setminus \{x\} \cup \FV(A))&if \(x \in V\)\\ \end{cases*}\\
\Supp(t) &= \DC_\Gamma(\FV(t))\text{ for }t \in \Term_\Gamma\\
\Supp(A) &= \DC_\Gamma(\FV(A))\text{ for }A \in \Type_\Gamma\\
\Supp(\sigma) &= \DC_\Gamma(\FV(\sigma))\text{ for }\sigma : \arr {\Delta} A \Gamma
\end{align*}
\caption{Support.}
\end{subfigure}}
\vspace{7pt}
\fbox{%
\begin{subfigure}{\textwidth}
\begin{align*}
x \sub \sigma &= t\text{ if }(x \mapsto t) \in \sigma\\
\Coh \Theta A \tau \sub \sigma &= \begin{cases*} \Coh \Theta A {\tau \bullet \sigma}&if \(\dim(\ty(\sigma)) = 0\)\\ \Coh {\Sigma(\Theta)} {\Sigma(A)} {\Sigma(\tau)} \sub {\unrestrict\sigma}&otherwise \end{cases*} \\
\star \sub \sigma &= \ty(\sigma)\\
(\arr s A t) \sub \sigma &= \arr {s \sub \sigma} {A \sub \sigma} {t \sub \sigma}\\
\langle A \rangle \bullet \sigma &= \langle A \sub \sigma \rangle\\
\langle \tau , t \rangle \bullet \sigma &= \langle \tau \bullet \sigma , t \sub \sigma \rangle
\end{align*}
\caption{Substitution application.}
\end{subfigure}}
\vspace{7pt}
\fbox{%
\begin{subfigure}{0.475\textwidth}
\begin{align*}
\Sigma (\emptyset) &= (N : \star), (S : \star)\\
\Sigma (\Gamma, (x : A)) &= \Sigma \Gamma, (x : \Sigma A)\\
\Sigma (\star) &= \arr N \star S\\
\Sigma (\arr s A t) &= \arr {\Sigma s} {\Sigma A} {\Sigma t}\\
\Sigma (x) &= x\\
\Sigma (\Coh \Delta A \sigma) &= \Coh {\Sigma(\Delta)} {\Sigma(A)} {\Sigma(\sigma)}\\
\Sigma(\sigma) &= \unrestrict(\Sigma'(\sigma))\\[7.25pt]
\Sigma'(\langle A \rangle) &= \langle \Sigma(A) \rangle\\
\Sigma'(\langle \sigma, t \rangle) &= \langle \Sigma'(\sigma), \Sigma(t) \rangle\\
\unrestrict\langle \arr s A t \rangle &= \langle A , s , t \rangle\\
\unrestrict\langle \sigma, t \rangle &= \langle \unrestrict \sigma, t \rangle
\end{align*}
\caption{Suspension.}
\end{subfigure}}
\hfill
\begin{subfigure}{0.49\textwidth}
\fbox{%
\begin{subfigure}{1\textwidth}
\begin{align*}
\wk(\star) &= \star\\
\wk(\arr s A t) &= \arr {\wk(s)} {\wk(A)} {\wk(t)}\\
\wk(x) &= x\\
\wk(\Coh \Delta A \sigma) &= \Coh \Delta A {\wk(\sigma)}\\
\wk(\langle A \rangle) &= \langle \wk(A) \rangle\\
\wk(\langle \sigma, t \rangle) &= \langle \wk(\sigma), \wk(t) \rangle
\end{align*}
\caption{Weakening.}
\label{fig:wk}
\end{subfigure}}
\vspace{7pt}
\fbox{%
\begin{subfigure}{1\textwidth}
\begin{align*}
\id_\emptyset &= \langle \star \rangle\\
\id_{\Gamma, (x : A)} &= \langle \wk(\id_\Gamma), x \rangle
\end{align*}
\caption{Identity substitution.}
\end{subfigure}}
\end{subfigure}
\caption{\Cattr: syntax, typing, and
operations.}
\end{figure}
}%
}

\section[The set of operations \texorpdfstring{\(\mathcal{O}\)}{O}]{The set of operations \texorpdfstring{\boldmath\(\mathcal{O}\)}{O}}
\label{sec:support}

In \cref{sec:typing-equality}, we introduced a set of operations \(\mathcal{O}\), which allows us to vary the operations available in the theory, much like the set \(\mathcal{R}\) allows us to vary the equality rules of the theory. The set \(\mathcal{O}\) replaces the conditions on the support of the type contained in a coherence, and consists of triples of a context \(\Delta\) along with two sets \(U, V \subseteq \Var(\Delta)\). A certain type \(\arr s A t : \Type_\Delta\) is permitted to appear in a coherence exactly when \((\Delta , \Supp(s), \Supp(t))\) is an element of \(\mathcal{O}\). There are two key advantages to setting up the theory this way.
\begin{itemize}
\item A clear separation is introduced in the metatheory and formalisation between properties that are specific to the support conditions in \Catt and those that are independent of the specific support conditions present.
\item The results in the following sections can be proven generically for different variants of \Catt.
\end{itemize}
In particular, the main utility we extract in this thesis is the ability to define groupoidal versions of the various semistrict theories we define in \cref{cha:cattstrict}. By letting \(\mathcal{O}\) consist of all possible triples, the support condition is effectively removed, producing a version of \Catt closer to Grothendieck's definition of \(\infty\)-groupoid (see \cref{sec:weak}).

\subsection{Operation sets}
\label{sec:operation-sets}

As previously mentioned, an operation set \(\mathcal{O}\) consists of a collection of triples of a context \(\Delta\) and two subsets of the variables of \(\Delta\). We call a subset of the variables of a context a \emph{variable set}. In the formalisation, these variable sets are given as a list of booleans, one boolean for each variable of the context. These are given in \module{Catt.Support}, which also contains many constructions on them, including unions of these sets, subset relations, and the free variables of each piece of syntax. The variable sets of \(\Delta\) form a lattice with top element \(\Var(\Delta)\) and bottom element \(\emptyset\). The free variable constructions commute with weakening, as is proved in \module{Catt.Support.Properties} by mutual induction.

We recall the function \(\DC\) on these variable sets, given by \func{Catt.Support}{DC} in the formalisation, which produces the downwards closure of a variable set. This admits the following properties:
\begin{proposition}
\(\DC\) is an idempotent join-semilattice homomorphism: it preserves binary joins (unions), subset inclusions, and the top and bottom elements of the lattice.
\end{proposition}

We further define the application of a substitution to a variable set below.
\begin{definition}
Given a variable set \(V\) of \(\Delta\) and (regular) substitution \(\sigma : \Delta \to \Gamma\), we define the application of \(\sigma\) to \(V\), written \(V \sub \sigma\), to be a variable set of \(\Gamma\) given by:
\begin{align*}
V \sub {\langle \rangle} &= \emptyset\\
V \sub {\langle \sigma , t \rangle} &= \begin{cases*} (V \setminus \{x\}) \sub \sigma \cup \FV(t)&if \(x \in V\)\\ V \sub \sigma &otherwise \end{cases*}
\end{align*}
where \(x\) is assumed to be the last variable of \(\Delta\) in the second case.
\end{definition}

We note that when representing variable sets as a list of booleans, these definitions are given by simple inductions on the length of the context. These constructions admit the following properties.
\begin{proposition}
\label{prop:vs-sub}
Let \(\Delta\) be a context. Then the function taking a variable set \(V\) of \(\Delta\) to \(V \sub \sigma\) is a join-semilattice homomorphism for any substitution \(\sigma : \Delta \to \Gamma\). Further, for a term \(t : \Term_\Delta\), a type \(A : \Type_\Delta\), or a substitution \(\tau : \arr \Theta A \Delta\), the following equalities hold:
\begin{align*}
\FV(t \sub \sigma) &= \FV(t) \sub \sigma \\
\FV(A \sub \sigma) &= \FV(A) \sub \sigma \\
\FV(\tau \bullet \sigma) &= \FV(\tau) \sub \sigma
\end{align*}
and hence \(\Var(\Delta) \sub \sigma = \FV(\id_\Delta) \sub \sigma = \FV(\id_\Delta \bullet \sigma) = \FV(\sigma)\). For any variable set \(V \subseteq \Var(\Theta)\) we have:
\[ V \sub {\id_\Theta} = V \qquad V \sub {\tau \bullet \sigma} = V \sub \tau \sub \sigma \]
for \(\tau : \Theta \to \Delta\) and \(\sigma : \Delta \to \Gamma\).
\end{proposition}
\begin{proof}
All proofs proceed by induction on the length of the context \(\Delta\) and are given in \module{Catt.Support.Properties}.
\end{proof}

An operation set is then a subset of:
\[ \Sigma_{\Delta : \Ctx} \mathcal{P}(\Var(\Delta)) \times \mathcal{P}(\Var(\Delta)) \]
In the formalisation this is defined in \module{Catt.Ops} to be a function from a context and two variable sets of that context to a universe.
\begin{remark}
The definition of an operation set in the formalisation deviates from the presentation given here, as the version in the formalisation is proof relevant. The proof relevant definition allows us to give any type as the type of witnesses that a certain triple appears in \(\mathcal{O}\), including a type containing many distinct witnesses. If we wished to recover a definition closer to the classical set-based definition, we could enforce that this function has a universe of propositions as its codomain, instead of a universe of types, and use propositional truncations to define various versions of \(\mathcal{O}\). This is however unnecessary for any of the proofs appearing in this thesis, hence the choice of the proof relevant definition for simplicity. A similar observation will apply to the definition of equality rule sets introduced in \cref{sec:ruleset}.
\end{remark}

We can now introduce our first operation set, the operation set for groupoidal operations, which imposes no support conditions and allows all operations.
\begin{definition}
We define the \emph{groupoidal operation set} \(\Group\) as:
\[ \Group = \{ (\Delta, U, V) \mid \Delta : \Ctx, U \subseteq \Var(\Delta), V \subseteq \Var(\Delta) \} \]
We will refer to \Cattr with the operation set \(\Group\) as \emph{groupoidal \Cattr} or \emph{groupoidal \Catt} (when \(\mathcal{R} = \emptyset\)).
\end{definition}

To recover the standard definition of \Catt, we must define the boundary sets of a pasting diagram. In \cref{sec:typing-catt}, these are given as the free variables of the boundary inclusion substitutions of pasting diagrams. Here we will instead give a direct definition of the variable sets corresponding to the free variables of the substitutions, delaying the definition of boundary inclusions of pasting diagrams until \cref{sec:trees}.
\begin{definition}
Let \(\Delta\) be a ps-context.
Define the \(n\)-boundary variable sets \(\bdry n - \Delta\) and \(\bdry n + \Delta\) by induction on \(\Delta\):
\begin{align*}
\bdry n \epsilon {(x : \star)} &= \{ x \}\\
\bdry n \epsilon {\Gamma, (y : A) , (f : \arr x A y)} &= \begin{cases*} \bdry n \epsilon \Gamma&if \(n < \dim(A)\)\\ \bdry n - \Gamma&if \(n = \dim(A)\) and \(\epsilon = -\)\\ (\bdry n + \Gamma \cup \{ y \}) \setminus \{x\}&if \(n = \dim(A)\) and \(\epsilon = +\)\\ \bdry n \epsilon \Gamma \cup \{ y , f \}&otherwise \end{cases*}
\end{align*}
These boundary sets appear in the formalisation as \func{Catt.Support}{pd-bd-vs}.
\end{definition}

The following lemma is immediate:
\begin{lemma}
\label{lem:bdry-full}
If \(n \geq \dim(\Delta)\), then \(\bdry n \epsilon \Delta = \Var(\Delta)\).
\end{lemma}
\begin{proof}
A simple induction on the definition. A formalised proof appears as \func{Catt.Support.Properties}{pd-bd-vs-full} in the module \module{Catt.Support.Properties}.
\end{proof}

With this definition we can introduce the regular operation set, which recovers the regular support conditions used in the definition of \Catt.
\begin{definition}
The \emph{regular operation set} \Reg is defined to be:
\[ \Reg = \{ (\Delta, \Var(\Delta), \Var(\Delta)) \mid \Delta \vdash_{\mathsf{ps}} \} \cup \{ (\Delta, \bdry {\dim(\Delta)-1} - \Delta, \bdry {\dim(\Delta)-1} + \Delta) \mid \Delta \vdash_{\mathsf{ps}} \} \]
The first component allows equivalences to be well-formed, and the second gives the support condition for composites.
\end{definition}

The regular operation set has a more standard presentation.
\begin{proposition}
\label{prop:std-op}
Let the set \Std of standard operations be defined as:
\[ \Std = \{ (\Delta, \bdry n - \Delta, \bdry n + \Delta) \mid \Delta \vdash_{\mathsf{ps}} , n \geq \dim(\Delta) - 1 \} \]
Then \(\Std = \Reg\).
\end{proposition}
\begin{proof}
Suppose \((\Delta, U, V) \in \Reg\). If \(U = \bdry {\dim(\Delta) - 1} - \Delta\) and \(V = \bdry {\dim(\Delta) - 1} + \Delta\), then \((\Delta , U ,V)\) is trivially in \Std by letting \(n = \dim(\Delta) - 1\). If instead \(U = V = \Var(\Delta)\), then \((\Delta, U , V) \in \Std\) by letting \(n = \dim(\Delta)\) and applying \cref{lem:bdry-full}.

Conversely, assume \((\Delta, U, V) \in \Std\). Then there is \(n \geq \dim(\Delta) - 1\) with \(U = \bdry n - \Delta\) and \(V = \bdry n + \Delta\). If \(n = \dim(\Delta) - 1\) then \((\Delta, U ,V)\) is trivially in \(\Reg\), and otherwise by \cref{lem:bdry-full} we have \(U = V = \Var(\Delta)\), and so \((\Delta,U,V)\) is again an element of \Reg. Hence, \(\Reg = \Std\).
\end{proof}
This more uniform presentation is sometimes easier to work with, and will be used to prove properties of \Reg in \cref{sec:operation-properties}.
\begin{remark}
By letting \(\mathcal{O} = \emptyset\), we recover the type theory \textsf{GSeTT}~\cite{benjamin2021globular}, a type theory for globular sets.
\end{remark}

It would be possible to generalise the notion of operation set presented here by instead letting the set \(\mathcal{O}\) consist of triples \((\Delta, s,t)\) where \(s\) and \(t\) are terms over \(\Delta\) instead of variable sets over \(\Delta\). This would allow more control over which operations were allowed in the theory. As an example, we would be able to restrict the class of composites to contain only the standard composites, or even further restrict it to binary composites. This is however unnecessary to present the regular and groupoidal versions of \Cattr.
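Before moving on to properties of these operation sets, we give a small worked example of the boundary sets, using named variables for readability where the formalisation uses de Bruijn indices. Consider the ps-context \(\Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)\) for binary composition, and write \(\Delta' = (x : \star), (y : \star), (f : \arr x \star y)\). The definition unfolds to:
\begin{align*}
\bdry 0 - \Delta &= \bdry 0 - {\Delta'} = \bdry 0 - {(x : \star)} = \{x\}\\
\bdry 0 + \Delta &= (\bdry 0 + {\Delta'} \cup \{z\}) \setminus \{y\} = (((\{x\} \cup \{y\}) \setminus \{x\}) \cup \{z\}) \setminus \{y\} = \{z\}
\end{align*}
so \((\Delta, \{x\}, \{z\}) \in \Reg\), which is precisely the support condition required for the binary composite of \(f\) and \(g\).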
By only allowing the set of available operations to be specified up to the support of the contained terms, it is possible to show that a coherence being an operation is closed under equality by proving that equality preserves the support of a term.

\subsection{Operation properties}
\label{sec:operation-properties}

Currently, our set of operations is completely unconstrained, and so we are limited in the constructions that can be made in \Cattr. We therefore constrain these sets in two ways. The first enforces that our set of operations is closed under suspension, for which we need to be able to suspend variable sets. This is defined in the formalisation as \func{Catt.Suspension.Support}{susp-vs}.
\begin{definition}
Let \(\Delta\) be a context. The suspension of a variable set \(V\) over \(\Delta\) is defined to be:
\[ \Sigma(V) = \{ N , S \} \cup V \]
where \(\Sigma(V)\) is a variable set over \(\Sigma(\Delta)\).
\end{definition}
The suspension of a variable set commutes with taking the support of a piece of syntax, as shown in the next lemma.
\begin{lemma}
\label{lem:susp-vs-prop}
The following equalities hold:
\[ \Supp(\Sigma(s)) = \Sigma(\Supp(s)) \qquad \Supp(\Sigma(A)) = \Sigma(\Supp(A)) \qquad \Supp(\Sigma(\sigma)) = \Sigma(\Supp(\sigma)) \]
for term \(s : \Term_\Gamma\), type \(A : \Type_\Gamma\), and substitution \(\sigma : \arr \Delta \star \Gamma\).
\end{lemma}
\begin{proof}
All equalities hold by a mutual induction on terms, types, and substitutions, with a secondary induction on the context \(\Gamma\) for the case of the variables and the base type \(\star\). These calculations are given in \module{Catt.Suspension.Support}.
\end{proof}

We can then define our first property on operation sets.
\begin{definition}
An operation set \(\mathcal{O}\) is \emph{suspendable} if:
\[ (\Delta, U, V) \in \mathcal{O} \implies (\Sigma(\Delta), \Sigma(U), \Sigma(V)) \in \mathcal{O} \]
for \(\Delta : \Ctx\) and \(U, V \subseteq \Var(\Delta)\).
\end{definition}
The groupoidal operation set is trivially suspendable. To show that the regular operation set is suspendable, we prove the following proposition.
\begin{proposition}
Let \(\Delta\) be a ps-context. Then:
\[\Sigma(\bdry n \epsilon \Delta) = \bdry {n + 1} {\epsilon} {\Sigma(\Delta)}\]
for \(n \in \mathbb{N}\) and \(\epsilon \in \{-,+\}\).
\end{proposition}
\begin{proof}
We proceed by induction on \(\Delta\). First suppose \(\Delta = (x : \star)\). We then have:
\[ \Sigma(\bdry n \epsilon {(x : \star)}) = \Sigma(\{x\}) = \{N,S,x\} = \bdry {n + 1} {\epsilon} {\Sigma((x: \star))} \]
Now suppose that \(\Delta = \Delta', (y : A), (f : \arr x A y)\). We split into cases on \(n\), \(\dim(A)\), and \(\epsilon\):
\begin{itemize}
\item If \(n < \dim(A)\) then
\begin{align*}
\Sigma(\bdry n \epsilon \Delta) &= \Sigma(\bdry n \epsilon {\Delta'})\\
&= \bdry {n + 1} {\epsilon} {\Sigma(\Delta')} &\text{by inductive hypothesis}\\
&= \bdry {n + 1} {\epsilon} {\Sigma(\Delta)} &\text{as }n + 1 < \dim(\Sigma(A))\\
\intertext{
\item If \(n = \dim(A)\) and \(\epsilon = -\) then the proof is similar to the preceding case.
\item If \(n = \dim(A)\) and \(\epsilon = +\) then: }
\Sigma(\bdry n + \Delta) &= \Sigma((\bdry n + {\Delta'} \cup \{y\}) \setminus \{x\})\\
&= (\Sigma(\bdry n + {\Delta'}) \cup \{y\}) \setminus \{x\} \\
&= (\bdry {n+1} + {\Sigma(\Delta')} \cup \{y\}) \setminus \{x\} &\text{by inductive hypothesis}\\
&= \bdry {n+1} + {\Sigma(\Delta)} &\text{as }n + 1 = \dim(\Sigma(A))\\
\intertext{
\item If \(n > \dim(A)\) then}
\Sigma(\bdry n \epsilon \Delta) &= \Sigma((\bdry n \epsilon {\Delta'}) \cup \{y,f\})\\
&= \Sigma(\bdry n \epsilon {\Delta'}) \cup \{y,f\} \\
&= \bdry {n+1} \epsilon {\Sigma(\Delta')} \cup \{y, f\} &\text{by inductive hypothesis}\\
&= \bdry {n+1} \epsilon {\Sigma(\Delta)} &\text{as }n + 1 > \dim(\Sigma(A))
\end{align*}
\end{itemize}
Hence, the desired equality holds in all cases.
\end{proof}
\begin{corollary}
The regular operation set is suspendable.
\end{corollary}
\begin{proof}
By \cref{prop:std-op}, it suffices to show that the standard operation set is suspendable, which is clear from the above proposition.
\end{proof}

The second restriction we put on operation sets is that there are enough operations to create the standard coherences presented in \cref{sec:basic-constructions}.
\begin{definition}
An operation set \(\mathcal{O}\) \emph{contains the standard operations} if \(\Std \subseteq \mathcal{O}\).
\end{definition}
The groupoidal operation set clearly contains the standard operations, and the regular operation set also does due to \cref{prop:std-op}. The empty operation set does not contain the standard operations. We end this section with the following proposition about the support of terms in a disc.
\begin{proposition}
For \(n \in \mathbb{N}\) the following two equations hold:
\[ \bdry n - {D^{n+1}} = \Var(S^n) \cup \{d_n^-\} = \Var(D^n) \qquad \bdry n + {D^{n+1}} = \Var(S^n) \cup \{d_n^+\}\]
Further, the following equations hold:
\[\FV(U^n) = \Var(S^n) \qquad \Supp(d_n^-) = \Var(D^n) = \bdry n - {D^{n+1}} \qquad \Supp(d_n^+) = \bdry n + {D^{n+1}} \]
again for any \(n \in \mathbb{N}\).
\end{proposition}
\begin{proof}
The first equations follow by a simple case analysis, using that \(\bdry n - {D^n} = \Var(D^n)\) by \cref{lem:bdry-full,item:disc-prop-dim}. The free variables of \(U^n\) are easily calculated inductively, and the supports of \(d_n^-\) and \(d_n^+\) are easy to compute using the first parts of the proposition, and that \(\FV(U^n) \subseteq \Supp(d_n^-)\) and \(\FV(U^n) \subseteq \Supp(d_n^+)\) as the support of a term is downwards closed. These proofs are formalised in \module{Catt.Discs.Support}.
\end{proof}
\begin{corollary}
\label{cor:disc-op}
Both \((D^{n+1}, \Supp(d_n^-), \Supp(d_n^+))\) and \((D^n, \Supp(d_n), \Supp(d_n))\) are in \(\Std\) for each \(n\).
\end{corollary}

\section[The set of equality rules \texorpdfstring{\(\mathcal{R}\)}{R}]{The set of equality rules \texorpdfstring{\boldmath\(\mathcal{R}\)}{R}}
\label{sec:ruleset}

In \Cattr, the definitional equality relation is generated by a set of rules \(\mathcal{R}\) formed of triples containing a context and two terms in the context which should be made equal. In this section we discuss some operations on these equality sets and properties that they may have.
\begin{remark}
In the formalisation the set of equality rules is defined similarly to the set of operations \(\mathcal{O}\). It is defined as a function that takes a context and two terms over that context and returns a type. It is therefore proof relevant in the same way as the operation sets.
\end{remark}
The equality rule sets inherit some operations and relations just by being sets.
We can easily form the empty equality set, which allows us to recover the weak type theory \Catt, and given two equality sets we can take their union to get a type theory with equalities from both sets (we note that the equality generated by a union is in general coarser than the union of the equalities generated by the individual sets).

To aid readability when reasoning about typing and equality with multiple distinct rule sets, we may subscript the turnstile symbol in various judgements with the set of equality rules being used. For example, we may write the judgement for typing of a term \(t\) in the type theory generated from rules \(\mathcal{R}\) as
\[ \Gamma \vdash_{\mathcal{R}} t : A \]
and the corresponding judgement for the equality of two terms \(s\) and \(t\) as
\[ \Gamma \vdash_{\mathcal{R}} s = t \]
Equality rule sets can also be subsets of each other, leading to the following lemma.
\begin{lemma}
\label{lem:subset-lem}
Let \(\mathcal{R}\) and \(\mathcal{S}\) be two equality rule sets and suppose that
\[ \Gamma \vdash_{\mathcal{S}} s = t\]
for all \((\Gamma,s,t) \in \mathcal{R}\) with \(\Gamma \vdash_{\mathcal{S}} s : A\) for some \(A : \Type_\Gamma\). Then the following inference rules hold:
\begin{mathpar}
\inferrule{\Gamma \vdash_{\mathcal{R}}}{\Gamma \vdash_{\mathcal{S}}} \and
\inferrule{\Gamma \vdash_{\mathcal{R}} t : A}{\Gamma \vdash_{\mathcal{S}} t : A} \and
\inferrule{\Gamma \vdash_{\mathcal{R}} A}{\Gamma \vdash_{\mathcal{S}} A} \and
\inferrule{\Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{S}} \sigma : \Delta} \\
\inferrule{\Gamma \vdash_{\mathcal{R}} s = t}{\Gamma \vdash_{\mathcal{S}} s = t} \and
\inferrule{\Gamma \vdash_{\mathcal{R}} A = B}{\Gamma \vdash_{\mathcal{S}} A = B} \and
\inferrule{\Gamma \vdash_{\mathcal{R}} \sigma = \tau}{\Gamma \vdash_{\mathcal{S}} \sigma = \tau}
\end{mathpar}
In particular these inference rules hold when \(\mathcal{R} \subseteq \mathcal{S}\).
\end{lemma}
\begin{proof}
Follows from a simple induction. Details are given in the formalisation in module \module{Catt.Typing.Rule.Properties}.
\end{proof}
\begin{corollary}
\label{cor:catt-to-r}
Any context, term, type, or substitution that is well-formed in \Catt is also well-formed in \Cattr, for any equality set \(\mathcal{R}\).
\end{corollary}

Furthermore, we can immediately show that the application of a substitution to a piece of syntax that is well-formed in \Catt is well-formed.
\begin{lemma}
\label{lem:sub-catt}
Let \(\mathcal{R}\) be any equality rule set. Then the following inference rules hold for \(\sigma : \arr \Delta \star \Gamma\):
\begin{mathpar}
\inferrule{\Delta \vdash_\emptyset A \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} A \sub \sigma }\and
\inferrule{\Delta \vdash_\emptyset s : A \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} s \sub \sigma : A \sub \sigma } \and
\inferrule{\Delta \vdash_\emptyset \tau : \Theta \\ \Gamma \vdash_{\mathcal{R}} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}} \tau \bullet \sigma : \Theta }
\end{mathpar}
where the judgements with a subscript empty set are judgements in the theory generated by the empty rule set (judgements in \Catt).
\end{lemma}
\begin{proof}
Follows immediately from a mutual induction, using that any equality in \Catt is syntactic. The proof is formalised in \module{Catt.Typing.Properties.Base}.
\end{proof}

An arbitrary set \(\mathcal{R}\) places very few restrictions on the equality relation it generates, or on the terms that are well-formed because of it. A rule set \(\mathcal{R}\) could identify terms of different types, or identify two different variables (or even identify all variables or terms). This makes it difficult to prove much about the theory generated by an arbitrary set \(\mathcal{R}\). To this end, we introduce certain conditions that these equality rule sets can satisfy. The first three of these conditions impose certain closure properties on the set of rules \(\mathcal{R}\), each of which allows various constructions to be well-formed. We call theories that satisfy these three properties \emph{tame theories} and introduce these in \cref{sec:tame-theories}. In \cref{sec:further-conditions}, we introduce two more conditions which take the form of a property that the generated equality must satisfy.

By introducing these conditions, we can prove various metatheoretic properties about \Cattr in a modular and generic way. This will allow the re-use of many constructions and proofs about the properties of these constructions in \cref{cha:cattstrict}, where two distinct type theories for semistrict \(\infty\)-categories are given. In the following subsections, we will also show that the rule set for disc removal satisfies all these conditions. For all these conditions, we will have that if the condition holds on \(\mathcal{R}\) and on \(\mathcal{S}\) then it also holds on \(\mathcal{R}\cup \mathcal{S}\) (for the substitution and support conditions this is achieved via the parameterised variants introduced below), and so these conditions can be proved individually for each rule set that is introduced. Further, the empty set will satisfy all of these conditions vacuously, and so all proofs and constructions in the section apply to \Catt.

\subsection{Tame theories}
\label{sec:tame-theories}

Here we introduce the three core conditions on the equality rule set \(\mathcal{R}\) which we expect to hold for any reasonable choice of rule set:
\begin{itemize}
\item The \emph{weakening condition}, which allows weakening to be well-formed.
\item The \emph{suspension condition}, which allows suspension to be well-formed.
\item The \emph{substitution condition}, which implies that the application of a substitution to terms, types, and other substitutions (as substitution composition) preserves typing and equality.
\end{itemize}
We call an equality rule set \emph{tame} if it satisfies all three of these conditions, and call the corresponding theory \Cattr a \emph{tame theory}.

\paragraph{Weakening condition}
For the weakening operation to be well-formed, meaning that the weakening of a well-formed piece of syntax is itself well-formed, the following closure property must hold on the set of rules \(\mathcal{R}\).
\begin{definition}
A set of rules \(\mathcal{R}\) satisfies the \emph{weakening condition} if for all \((\Gamma,s,t) \in \mathcal{R}\) we have:
\[ ((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R} \]
for all \(A : \Type_\Gamma\).
\end{definition}
The following proposition is immediately provable by mutual induction on typing and equality. Its proof is given in \module{Catt.Typing.Properties.Weakening}.
\begin{proposition}
Let \(\mathcal{R}\) satisfy the weakening condition. Then the following inference rules are admissible in \Cattr.
\begin{mathpar}
\inferrule{\Gamma \vdash B}{\Gamma, (x : A) \vdash \wk(B)} \and
\inferrule{\Gamma \vdash s : B}{\Gamma, (x : A) \vdash \wk(s) : \wk(B)} \and
\inferrule{\Gamma \vdash \sigma : \Delta}{\Gamma, (x : A) \vdash \wk(\sigma) : \Delta}
\end{mathpar}
for types \(A,B : \Type_\Gamma\), term \(s : \Term_\Gamma\) and substitution \(\sigma : \arr \Delta C \Gamma\).
\end{proposition}
\begin{corollary}
\label{cor:id-sub-ty}
If \(\mathcal{R}\) satisfies the weakening condition then:
\[ \Gamma \vdash \id_\Gamma : \Gamma \]
for any \(\Gamma : \Ctx\).
\end{corollary}

Using only the above proposition we can immediately prove typing properties for several constructions using discs.
\begin{lemma}
\label{lem:disc-typing}
Suppose the weakening condition holds. Then the following judgements hold:
\[ S^n \vdash U^n \qquad S^n \vdash \qquad D^n \vdash \]
for all \(n \in \mathbb{N}\). Further, the following inference rules are admissible:
\begin{mathpar}
\inferrule{\Gamma \vdash A \\ n = \dim(A)} {\Gamma \vdash \{A\} : S^n} \and
\inferrule{\Gamma \vdash A \\ n = \dim(A) \\ \Gamma \vdash s : A} { \Gamma \vdash \{A,s\} : D^n} \\
\inferrule{\Gamma \vdash \{A\} : S^n}{\Gamma \vdash A} \and
\inferrule{\Gamma \vdash \{A,s\} : D^n}{\Gamma \vdash A} \and
\inferrule{\Gamma \vdash \{A,s\} : D^n}{\Gamma \vdash s : A}
\end{mathpar}
for \(A : \Type_\Gamma\) and \(s : \Term_\Gamma\).
\end{lemma}
\begin{proof}
The first three typing judgements follow from a simple mutual induction, making use of the typing of weakening.

We prove that \(\Gamma \vdash \{A\} : S^n\) by induction on \(n\) and \(A\). The base case is trivial. For the inductive step we assume that \(\Gamma \vdash \arr s A t\), with \(n = \dim(A)\), and want to show that:
\[ \Gamma \vdash \langle \{A\},s ,t \rangle : S^n, (d_n^- : U^n), (d_n^+ : \wk(U^n)) \]
The judgement \(\Gamma \vdash \{A\} : S^n\) holds by inductive hypothesis, and so it remains to show that the following two judgements hold:
\[ \Gamma \vdash s : U^n \sub {\{A\}} \qquad \Gamma \vdash t : \wk(U^n)\sub{\langle\{A\}, s\rangle} \]
As \(\Gamma \vdash \arr s A t\), we know (by case analysis on the typing derivation) that \(\Gamma \vdash s : A\) and \(\Gamma \vdash t : A\). These judgements are sufficient to finish the proof, since \(A \equiv U^n \sub {\{A\}} \equiv \wk(U^n) \sub {\langle \{A\}, s \rangle}\) by \cref{item:disc-prop-sub-from} and the interaction of weakening with substitution application.

To show that \(\Gamma \vdash A\) follows from \(\Gamma \vdash \{A\} : S^n\), we instead show that \(\Gamma \vdash U^n \sub {\{A\}}\), leveraging that typing is invariant under syntactic equality. The typing of \(U^n \sub {\{A\}}\) follows from \(U^n\) being well-formed in \Catt (as it is well-formed in any theory with the weakening condition), and \cref{lem:sub-catt}. The second to last inference rule follows trivially from the preceding one. For the last rule, we get that \(\Gamma \vdash s : U^n\sub{\{A\}}\) by case analysis on \(\Gamma \vdash \{A,s\} : D^n\), and so we are finished by the invariance of typing rules under syntactic equality.
\end{proof}

If we further have that the set of operations includes the standard operations then we get the following corollary.
\begin{corollary}
\label{cor:id-typing}
Suppose that \(\mathcal{O}\) contains the standard operations in addition to \(\mathcal{R}\) satisfying the weakening condition.
Then the following are equivalent:
\begin{itemize}
\item \(\Gamma \vdash A\) and \(\Gamma \vdash t : A\),
\item There exists some \(B: \Type_\Gamma\) such that \(\Gamma \vdash \id(A,t) : B\),
\item \(\Gamma \vdash \id(A,t) : \arr t A t\).
\end{itemize}
If we further have that \(\dim(A) \neq 0\) then the following two conditions are also equivalent:
\begin{itemize}
\item There exists some \(B: \Type_\Gamma\) such that \(\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B\),
\item \(\Gamma \vdash \Coh{D^n} {\wk(U^n)} {\{A,t\}} : A\).
\end{itemize}
where \(n = \dim(A)\).
\end{corollary}
\begin{proof}
The proof follows from \cref{lem:disc-typing,item:disc-prop-sub-from,cor:disc-op}.
\end{proof}

We end this discussion with the following \lcnamecref{prop:dr-weak}.
\begin{proposition}
\label{prop:dr-weak}
The set \dr satisfies the weakening condition.
\end{proposition}
\begin{proof}
It suffices to show that, for all \(\Gamma : \Ctx\), \(A, B : \Type_\Gamma\), and \(t : \Term_\Gamma\):
\[ ((\Gamma, (x : B)), \Coh {D^n} {\wk(U^n)} {\wk(\{A,t\})}, \wk(t)) \in \dr \]
when \(n = \dim(A)\). By \cref{item:disc-prop-wk}, \(\wk(\{A,t\}) \equiv \{ \wk(A), \wk(t)\}\) and so the triple above is clearly contained in \dr.
\end{proof}

The semistrict type theories \Cattsu and \Cattsua (which will be introduced in \cref{sec:cattsu,sec:cattsua}) will be generated by equality rule sets that are the union of multiple smaller rule sets (including disc removal). Since the weakening condition is clearly preserved under unions, we will be able to show that the rule sets generating \Cattsu and \Cattsua satisfy the weakening condition by showing that it is satisfied by each individual component.

\paragraph{Suspension condition}
For suspension, we introduce the following condition, which is similar to the corresponding condition for weakening.
\begin{definition}
A set of equality rules \(\mathcal{R}\) satisfies the \emph{suspension condition} if
\[ (\Sigma(\Gamma), \Sigma(s), \Sigma(t)) \in \mathcal{R} \]
for all \((\Gamma,s,t) \in \mathcal{R}\).
\end{definition}
If the set of operations \(\mathcal{O}\) is suspendable, then this condition is sufficient to show that the suspension of a well-formed piece of syntax is well-formed.
\begin{proposition}
Suppose \(\mathcal{O}\) is suspendable and \(\mathcal{R}\) satisfies the suspension condition. Then the following inference rules are admissible for \(\Gamma, \Delta, \Delta' : \Ctx\), \(A,B,C,D : \Type_\Gamma\), \(s,t : \Term_\Gamma\), \(\sigma : \arr \Delta C \Gamma\), and \(\tau : \arr {\Delta'} D \Gamma\).
\begin{mathpar}
\inferrule{\Gamma \vdash}{\Sigma(\Gamma) \vdash}\and
\inferrule{\Gamma \vdash A}{\Sigma(\Gamma) \vdash \Sigma(A)}\and
\inferrule{\Gamma \vdash s : A}{\Sigma(\Gamma) \vdash \Sigma(s) : \Sigma(A)}\and
\inferrule{\Gamma \vdash \sigma : \Delta}{\Sigma(\Gamma) \vdash \Sigma'(\sigma) : \Delta}\\
\inferrule{\Gamma \vdash A = B}{\Sigma(\Gamma) \vdash \Sigma(A) = \Sigma(B)}\and
\inferrule{\Gamma \vdash s = t}{\Sigma(\Gamma) \vdash \Sigma(s) = \Sigma(t)}\and
\inferrule{\Gamma \vdash \sigma = \tau}{\Sigma(\Gamma) \vdash \Sigma'(\sigma) = \Sigma'(\tau)}
\end{mathpar}
For all \(\mu : \arr \Delta {\arr s A t} \Gamma\) and \(\mu' : \arr {\Delta'} {\arr {s'} {A'} {t'}} {\Gamma}\) the following two rules are admissible:
\begin{mathpar}
\inferrule{\Gamma \vdash \mu : \Delta}{\Gamma \vdash \unrestrict \mu : \Sigma(\Delta)} \and
\inferrule{\Gamma \vdash \mu = \mu'}{\Gamma \vdash \unrestrict \mu = \unrestrict \mu'}
\end{mathpar}
and so the inference rules
\begin{mathpar}
\inferrule{\Gamma \vdash \sigma : \Delta}{\Sigma(\Gamma) \vdash \Sigma(\sigma) : \Sigma(\Delta)} \and
\inferrule{\Gamma \vdash \sigma = \tau} {\Sigma(\Gamma) \vdash \Sigma(\sigma) = \Sigma(\tau)}
\end{mathpar}
hold for \(\sigma : \arr \Delta \star \Gamma\) and \(\tau : \arr {\Delta'} \star \Gamma\).
\end{proposition}
\begin{proof}
The rules concerning the unrestriction operation follow by simple induction on the typing judgement or equality in the premise, and in fact do not need the suspension condition. The remainder of the rules follow from a routine mutual induction on all typing and equality rules, which can be found in \module{Catt.Suspension.Typing}. The suspendability of the operation set is used for the case involving the typing rule for coherences, which also makes use of \cref{lem:susp-vs-prop}. In this case, the functoriality of suspension is used to show that the coherence has the correct type. The suspension condition is used for the rule constructor of the equality of terms.
\end{proof}

Similarly to the weakening condition, the suspension condition is closed under unions of rule sets, and we can show it is satisfied by \dr, with a similar proof to the proof for weakening.
\begin{proposition}
\label{prop:dr-susp}
The set \dr satisfies the suspension condition.
\end{proposition}
\begin{proof}
It is sufficient to prove that, for all \(\Gamma : \Ctx\), \(A : \Type_\Gamma\), and \(t : \Term_\Gamma\):
\[(\Sigma(\Gamma), \Coh {\Sigma(D^n)} {\Sigma(\wk(U^n))} {\Sigma(\{A,t\})}, \Sigma(t)) \in \dr\]
when \(n = \dim(A)\). By \cref{item:disc-prop-susp}, we get that \(\Sigma(D^n) \equiv D^{n+1}\) and \(\Sigma(\wk(U^n)) \equiv \wk(\Sigma(U^n)) \equiv \wk(U^{n+1})\). By \cref{item:disc-prop-sub-susp}, \(\Sigma(\{A,t\}) \equiv \{\Sigma(A),\Sigma(t)\}\). Therefore, it is sufficient to show that:
\[(\Sigma(\Gamma), \Coh {D^{n+1}} {\wk(U^{n+1})} {\{\Sigma(A),\Sigma(t)\}}, \Sigma(t)) \in \dr\]
which is clear as \(\dim(\Sigma(A)) = \dim(A) + 1 = n+1\).
\end{proof}

\paragraph{Substitution condition}
The substitution condition takes a slightly different form to the previous two conditions. Instead of requiring that the rule set is closed under the application of arbitrary substitutions, we only ensure it is closed under well-formed substitutions.
This will not prevent us proving that typing is closed under the application of substitutions, but will be critical in proving that the supported rules construction, which will be given in \cref{def:rule-with-supp} and is used for proving the support condition, satisfies the substitution condition.
\begin{definition}
An equality rule set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-substitution condition} if:
\[ (\Gamma, s \sub \sigma, t\sub \sigma) \in \mathcal{R} \]
whenever \((\Delta, s, t) \in \mathcal{R}\) and \(\sigma : \arr \Delta \star \Gamma\) with \(\Gamma \vdash_{\mathcal{R}'} \sigma : \Delta\). We say the set \(\mathcal{R}\) satisfies the \emph{substitution condition} if it satisfies the \(\mathcal{R}\)-substitution condition.
\end{definition}
We make two comments about this definition:
\begin{itemize}
\item We only close under substitutions with type part \(\star\). It will still be possible to show that typing is preserved by arbitrary (well-formed) substitutions when this condition is combined with the suspension condition.
\item We introduce a second rule set \(\mathcal{R}'\) in the definition, which is only used for the typing premise of the substitution \(\sigma\). The reason for this is that the substitution condition is not closed under unions, and so we will instead prove that certain rule sets satisfy the \(\mathcal{R}'\)-substitution condition for an arbitrary \(\mathcal{R}'\), a condition which is closed under unions.
\end{itemize}
The substitution condition allows us to give the next proposition.
\begin{proposition}
\label{prop:sub-prop-1}
Suppose \(\mathcal{R}\) satisfies the substitution condition. For any \(\sigma : \arr \Delta \star \Gamma\), the following rules are admissible:
\begin{mathpar}
\inferrule{\Delta \vdash A \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash A \sub \sigma}\and
\inferrule{\Delta \vdash s : A \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash s \sub \sigma : A \sub \sigma}\and
\inferrule{\Delta \vdash \tau : \Theta \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \tau \bullet \sigma : \Theta}\\
\inferrule{\Delta \vdash A = B\\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash A \sub \sigma = B \sub \sigma}\and
\inferrule{\Delta \vdash s = t \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash s \sub \sigma = t \sub \sigma}\and
\inferrule{\Delta \vdash \tau = \mu \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \tau \bullet \sigma = \mu \bullet \sigma}
\end{mathpar}
If \(\mathcal{R}\) additionally satisfies the suspension condition, then all the above rules are admissible for any substitution \(\sigma : \arr \Delta B \Gamma\).
\end{proposition}
\begin{proof}
The proof for a non-extended substitution is given by another routine mutual induction in \module{Catt.Typing.Properties.Substitution}.

For an arbitrary substitution \(\sigma : \arr \Delta B \Gamma\), we also proceed by mutual induction, but for the application of the substitution to an equality of terms \(s\) and \(t\) we further split on \(B\). If \(B = \star\), then the proof for non-extended substitutions can be used. Otherwise, we have:
\begin{align*}
s \sub \sigma &\equiv \Sigma s \sub {\unrestrict \sigma}\\
&= \Sigma t \sub {\unrestrict \sigma}\\
&\equiv t \sub \sigma
\end{align*}
with the non-syntactic equality following from the preservation of equality by suspension and the inductive hypothesis. The proofs that the extended versions of these rules are admissible are found in \module{Catt.Typing.Properties.Substitution.Suspended}.
\end{proof}

We also prove that the application of a substitution respects equality in its second argument, which does not in fact need the substitution condition. This is also proved by a simple mutual induction in \module{Catt.Typing.Properties.Substitution}.
\begin{proposition}
\label{prop:sub-prop-2}
The following inference rules are admissible:
\begin{mathpar}
\inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash s \sub \sigma = s \sub \tau}\and
\inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash A \sub \sigma = A \sub \tau}\and
\inferrule{\Gamma \vdash \sigma = \tau}{\Gamma \vdash \mu \bullet \sigma = \mu \bullet \tau}
\end{mathpar}
for substitutions \(\sigma : \arr \Delta A \Gamma\), \(\tau : \arr \Delta B \Gamma\), and \(\mu : \arr \Theta C \Delta\), term \(s : \Term_\Delta\), and type \(A : \Type_\Delta\).
\end{proposition}

This allows us to define a category of well-formed syntax in \Cattr, which is well-defined by the two preceding propositions.
\begin{definition}
Suppose \(\mathcal{R}\) satisfies the substitution and weakening conditions. Then we can define the \emph{syntactic category} of \Cattr, which by an abuse of notation we call \(\mathsf{Catt}_{\mathcal{R}}\), to have:
\begin{itemize}
\item Objects given by contexts \(\Gamma\) where \(\Gamma \vdash\).
\item Morphisms \(\Delta \to \Gamma\) given by substitutions \(\sigma : \arr \Delta \star \Gamma\) where \(\Gamma \vdash \sigma : \Delta\) quotiented by the relation which equates substitutions \(\sigma\) and \(\tau\) when \(\Gamma \vdash \sigma = \tau\).
\item The identity morphism \(\Gamma \to \Gamma\) given by \(\id_\Gamma\).
\item Composition is given by \(\tau \circ \sigma = \sigma \bullet \tau\).
\end{itemize}
By \cref{cor:id-sub-ty}, the identity substitution is a well-defined morphism, and the above two propositions prove that composition is well-defined. Composition satisfies associativity and unitality by \cref{prop:categorical}.
\end{definition}

By taking the weakening of the identity substitution \(\id_\Gamma : \Gamma \to \Gamma\), we get a substitution:
\[ \proj_{\Gamma} = \wk(\id_\Gamma) : \Gamma \to \Gamma, (x : A)\]
which includes \(\Gamma\) into \(\Gamma, (x : A)\). It can be checked (and is given by \func{Catt.Syntax.Properties}{apply-project-is-wk-tm} in the formalisation) that applying this substitution to a term is the same operation as weakening the term. Using this, the following can be proved:
\begin{lemma}
Suppose \(\mathcal{R}\) satisfies the substitution condition. Then it also satisfies the weakening condition.
\end{lemma}
\begin{proof}
For \((\Gamma,s ,t) \in \mathcal{R}\) and \(A : \Type_\Gamma\), we must prove that:
\[ ((\Gamma,(x:A)),\wk(s),\wk(t)) \equiv ((\Gamma, (x : A)), s \sub {\proj_{\Gamma}}, t \sub {\proj_{\Gamma}}) \in \mathcal{R} \]
which will follow from the substitution condition if it can be proved that
\[ \Gamma, (x : A) \vdash_{\mathcal{R}} \proj_\Gamma : \Gamma \]
holds. This judgement is easy to derive when \(\mathcal{R}\) satisfies the weakening condition, but this is what we are trying to prove. Instead, since \(\emptyset\) trivially satisfies the weakening condition, \(\proj_\Gamma\) is well-formed in \Catt, and so the derivation above follows from \cref{cor:catt-to-r}.
\end{proof}

We lastly show that \dr also satisfies the substitution condition.
\begin{proposition}
\label{prop:dr-sub}
The set \dr satisfies the \(\mathcal{R}\)-substitution condition for any equality set \(\mathcal{R}\).
\end{proposition}
\begin{proof}
The proof is similar to \cref{prop:dr-weak,prop:dr-susp}, and follows from the equality \(\{A,t\} \bullet \sigma \equiv \{A \sub \sigma, t \sub \sigma\}\) which holds by \cref{item:disc-prop-sub-sub}.
\end{proof}
\begin{remark}
The proof of the substitution condition for \dr makes no use of the typing of \(\sigma\). In fact this premise is only necessary for the supported rules construction which will be given in \cref{def:rule-with-supp}.
\end{remark}

\paragraph{Tameness}
We can now define tameness.
\begin{definition}
An equality rule set \(\mathcal{R}\) is \emph{tame} if it satisfies the weakening, substitution, and suspension conditions. An operation set \(\mathcal{O}\) is \emph{tame} if it is suspendable and contains the standard operations. A theory generated by \(\mathcal{R}\) and \(\mathcal{O}\) is tame if both \(\mathcal{R}\) and \(\mathcal{O}\) are.
\end{definition}
\begin{proposition}
The set \dr is tame.
\end{proposition}

In the formalisation, each module is parameterised by the various conditions that the module needs, and where possible we avoid using unnecessary conditions. Given that every theory we will consider in this thesis is tame, and that it is hard to imagine a sensible theory that is not tame, the argument could be made that the effort put into making distinctions between these conditions is wasted or at least unnecessary. The case for including the weakening condition is especially unconvincing as it is implied by the substitution condition, which likely holds in any theory of significant interest. It is however included here as it is used in the formalisation, where its introduction is an artefact of the natural progression of this research. Accordingly, from \cref{sec:operations-catt}, we will assume that the theory we are working over is tame, and build a library of constructions and results that work in any tame theory, even when some results may not need all the conditions above.

Since we have limited use for proving properties about theories that do not satisfy the substitution condition, we could have instead enforced that all theories respect substitution by adding a constructor to the (term) equality relation that takes an equality \(\Delta \vdash s = t\) and typing relation \(\Gamma \vdash \sigma : \Delta\) to an equality \(\Gamma \vdash s \sub \sigma = t \sub \sigma\). This may remove some overhead of setting up the weakening and substitution conditions. It would also allow more minimal equality rule sets to be given, as a rule set such as disc removal could be given by
\[ \{(D^n, \Coh {D^n} {\wk(U^n)} {\id_{D^n}}, d_n) \mid n \in \mathbb{N}\} \]
On the other hand, including the extra constructor would effectively add an extra case to each inductive proof, and it is less clear how to minimise some of the equality rules that will be introduced in \cref{sec:operations-catt}. Taking either approach would likely lead to a similar development of the theory.
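To illustrate, here is a sketch of how the general form of disc removal would be recovered in such a presentation. Given \(\Gamma \vdash \{A,t\} : D^n\), the extra constructor applied to the rule above would yield \(\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\id_{D^n}} \sub {\{A,t\}} = d_n \sub {\{A,t\}}\), where:
\[ \Coh {D^n} {\wk(U^n)} {\id_{D^n}} \sub {\{A,t\}} \equiv \Coh {D^n} {\wk(U^n)} {\id_{D^n} \bullet \{A,t\}} = \Coh {D^n} {\wk(U^n)} {\{A,t\}} \qquad d_n \sub {\{A,t\}} \equiv t \]
using the unitality of substitution composition from \cref{prop:categorical}, and that \(\{A,t\} \equiv \langle \{A\}, t \rangle\) sends the top variable \(d_n\) of \(D^n\) to \(t\).

\subsection{Further conditions}
\label{sec:further-conditions}

Knowing that the theory we are working in is tame will be sufficient for giving most of the constructions and proofs in \cref{sec:operations-catt}. Here we introduce some extra conditions that instead serve to aid in the proof of metatheoretic properties of the generated theory. These conditions take the form of predicates on each rule in the equality rule sets, rather than being closure properties as the conditions for tameness were.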
\paragraph{Support condition}
The support of a term plays a central role in classifying the operations of the theory (see \cref{sec:support}). Although it is known that support is respected by syntactic equality, we have not yet shown it is preserved by definitional equality. The following condition allows this to be proved.
\begin{definition}
A set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-support condition} for an equality set \(\mathcal{R}'\) when:
\[ \Gamma \vdash_{\mathcal{R}'} s : A \implies \Supp(s) = \Supp(t) \]
for each \((\Gamma,s,t) \in \mathcal{R}\) and \(A : \Type_\Gamma\). A set \(\mathcal{R}\) satisfies the \emph{support condition} if it satisfies the \(\mathcal{R}\)-support condition.
\end{definition}
The use of support instead of free variables in this definition is critical, as we do not expect the free variables of a piece of syntax to be preserved by equality in general. As an example, we would like to have the equality:
\[ D^1 \vdash \Coh {D^1} {\wk(U^1)} {\id_{D^1}} = d_1 \]
given by disc removal, yet the free variables of each side are not equal (though the support of each side is).

We also draw attention to the typing premise. Without this, the left-hand side of each equality rule is too unconstrained (at least with how the equality rules are currently presented), and this condition would fail to hold on the equality sets we introduce in this thesis. Having this typing premise come from a separate rule set \(\mathcal{R}'\) allows the support condition to be preserved by unions of equality sets, similar to the substitution condition.

From the support condition, we immediately get the following proposition, proved by mutual induction.
\begin{proposition}
\label{prop:supp-prop}
Let \(\mathcal{R}\) satisfy the support condition. Then the following rules are admissible:
\begin{mathpar}
\inferrule{\Gamma \vdash s = t}{\Supp(s) = \Supp(t)}\and
\inferrule{\Gamma \vdash A = B}{\Supp(A) = \Supp(B)}\and
\inferrule{\Gamma \vdash \sigma = \tau}{\Supp(\sigma) = \Supp(\tau)}
\end{mathpar}
For \(s,t: \Term_\Gamma\), \(A,B : \Type_\Gamma\) and substitutions \(\sigma : \arr \Delta A \Gamma\) and \(\tau : \arr \Theta B \Gamma\).
\end{proposition}

In traditional presentations of \Catt, \(\FV(t) \cup \FV(A)\) is used instead of \(\Supp(t)\) for a term \(t\) of type \(A\). Equipped with the support condition we can now show that these are the same.
\begin{lemma}
The following hold when \(\mathcal{R}\) satisfies the support condition:
\begin{lemmaenum}
\item \(\Supp(A) = \FV(A)\) when \(\Gamma \vdash A\),
\item \label{item:supp-sub-char} \(\Supp(\sigma) = \FV(\sigma)\) when \(\Gamma \vdash \sigma : \Delta\),
\item \label{item:supp-tm-char-1} \(\Supp(t) = \Supp(A) \cup \FV(t)\) when \(\Gamma \vdash t : A\),
\item \label{item:supp-tm-char-2} \(\Supp(t) = \FV(A) \cup \FV(t) = \Supp(A) \cup \Supp(t)\) when \(\Gamma \vdash t : A\) and \(\Gamma \vdash A\).
\end{lemmaenum}
\end{lemma}
\begin{proof}
All properties are proven by a single mutual induction on the typing derivations in the premises.
\begin{enumerate}[(i)]
\item Suppose \(\Gamma \vdash A\). If \(A \equiv \star\) then \(\Supp(A) = \FV(A) = \emptyset\). Otherwise, suppose \(A \equiv \arr s B t\).
Then we have that \(\Gamma \vdash B\), \(\Gamma \vdash s : B\), and \(\Gamma \vdash t : B\) and so: \begin{align*} \Supp(A) &= \Supp(B) \cup \Supp(s) \cup \Supp(t)\\ &= \FV(B) \cup (\FV(B) \cup \FV(s)) \cup (\FV(B) \cup \FV(t))&(*)\\ &= \FV(B) \cup \FV(s) \cup \FV(t)\\ &= \FV(A) \end{align*} where the equality \((*)\) is derived from the inductive hypothesis for (i) applied to \(B\) and the inductive hypothesis for (iv) applied to \(s\) and \(t\). \item Suppose \(\Gamma \vdash \sigma : \Delta\). If \(\sigma \equiv \langle A \rangle\) then \(\Gamma \vdash A\) and so: \[\Supp(\sigma) = \Supp(A) = \FV(A) = \FV(\sigma)\] If instead \(\sigma \equiv \langle \tau, t \rangle\) and \(\Delta = \Theta, (x : A)\) then \(\Gamma \vdash \tau : \Theta\) and \(\Gamma \vdash t : A \sub \tau\) and so: \begin{align*} \Supp(\sigma) &= \Supp(\tau) \cup \Supp(t)\\ &= \Supp(\tau) \cup (\Supp(A \sub \tau) \cup \FV(t))&(*)\\ &= \DC_\Gamma(\FV(\tau) \cup \FV(A \sub \tau)) \cup \FV(t)\\ &= \Supp(\tau) \cup \FV(t)&\text{as }\FV(A \sub \tau) \subseteq \FV(\tau)\\ &= \FV(\tau) \cup \FV(t)&(\dagger)\\ &= \FV(\sigma) \end{align*} where the equality \((*)\) is derived from the inductive hypothesis for (iii) applied to \(t\) and the equality \((\dagger)\) is derived from the inductive hypothesis for (ii) applied to \(\tau\). \item Suppose \(\Gamma \vdash t : A\). We then split on the constructor used for the typing derivation: If the derivation is the result of a conversion rule applied to \(\Gamma \vdash t : B\) and \(\Gamma \vdash A = B\), then inductive hypothesis gives \(\Supp(t) = \Supp(B) \cup \FV(t)\) and \cref{prop:supp-prop} gives \(\Supp(A) = \Supp(B)\) and so \(\Supp(t) = \Supp(A) \cup \FV(t)\) as required. If the derivation is derived from the typing rule for variables, then a simple induction on the context \(\Gamma\), using that \(\Supp(\wk(A)) = \Supp(A)\), gives the required result. If the derivation is given by the typing rule for coherences then \(t \equiv \Coh \Delta B \sigma\), \(\Gamma \vdash \sigma : \Delta\), and \(A \equiv B \sub \sigma\). Therefore, \begin{align*} \Supp(t) &= \Supp(\sigma)\\ &= \DC_\Gamma(\FV(B \sub \sigma) \cup \FV(\sigma))&\text{as }\FV(B \sub \sigma) \subseteq \FV(\sigma)\\ &= \Supp(A) \cup \Supp(\sigma)\\ &= \Supp(A) \cup \FV(\sigma)&(*)\\ &= \Supp(A) \cup \FV(t) \end{align*} where the equality \((*)\) is the result of applying the inductive hypothesis for (ii) to \(\sigma\). \item If \(\Gamma \vdash t : A\) and \(\Gamma \vdash A\) then: \[ \Supp(t) = \Supp(A) \cup \FV(t) = \FV(A) \cup \FV(t) \] trivially follows from (i) and (iii) and: \[ \Supp(t) = \DC_\Gamma(\Supp(t)) = \DC_\Gamma(\FV(A) \cup \FV(t)) = \Supp(A) \cup \Supp(t) \] with the first equality resulting from the idempotency of the downwards closure operator. \end{enumerate} This proof is formalised in \module{Catt.Typing.Properties.Support}. \end{proof} \begin{corollary} \label{cor:dc-sub} Let \(\mathcal{R}\) satisfy the support condition and suppose \(\Gamma \vdash \sigma : \Delta\). Then the following equality holds: \[ \DC_\Gamma(V \sub \sigma) = \DC_\Delta(V) \sub \sigma \] for all \(V \subseteq \Var(\Delta)\); downwards closure commutes with the application of \(\sigma\) to variable sets. \end{corollary} \begin{proof} Proceed by induction on \(\Delta\). If \(\Delta \equiv \emptyset\) then the equation is trivial. Therefore, assume \(\Delta \equiv \Theta, (x : A)\) and so \(\sigma \equiv \langle \tau , t \rangle\) with \(\Gamma \vdash \tau : \Theta\) and \(\Gamma \vdash t : A \sub \tau\) by case analysis. 
We now split on whether \(x \in V\). If \(x \not\in V\) then \(\DC_\Gamma(V \sub \sigma) = \DC_\Gamma(V \sub \tau) = \DC_\Theta(V) \sub \tau = \DC_\Delta(V) \sub \tau\) with the second equality due to inductive hypothesis. Otherwise, \(x \in V\) and so letting \(U = V \setminus \{x\}\) we get the equality: \begin{align*} \DC_\Gamma(V \sub \sigma) &= \DC_\Gamma(U \sub \tau \cup \FV(t))\\ &= \DC_\Gamma(U \sub \tau) \cup \Supp(t)\\ &= \DC_\Gamma(U \sub \tau) \cup \Supp(A \sub \tau) \cup \FV(t)&(\dagger)\\ &= \DC_\Gamma(U \sub \tau) \cup \DC_\Gamma(\FV(A) \sub \tau) \cup \FV(t) \\ &= \DC_\Gamma(U \sub \tau \cup \FV(A) \sub \tau) \cup \FV(t)\\ &= \DC_\Gamma((U \cup \FV(A)) \sub \tau) \cup \FV(t)\\ &= \DC_\Theta(U \cup \FV(A)) \sub \tau \cup \FV(t)&(*)\\ &= (\{x\} \cup \DC_\Theta(U \cup \FV(A))) \sub \sigma \\ &= \DC_\Delta(V) \sub \sigma \end{align*} where equality \((*)\) is by inductive hypothesis and equality \((\dagger)\) is by \cref{item:supp-tm-char-1}. \end{proof} Unfortunately, proving that the support condition holds for most equality rule sets is not as trivial as the proofs for the tameness properties. Consider the case for disc removal, which gives rise to the equality \[ \Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} = t \] To prove the support condition for this case we need to show that: \[ \Supp(\{A,t\}) = \Supp(t) \] where we can assume that \(\Gamma \vdash t : A\). Intuitively this should hold, as the support of a substitution should be equal to the support of the locally maximal arguments, and if the derivation \(\Gamma \vdash t : A\) held in \Catt, we would be able to prove this. However, this proof (and intuition) relies on the derivation \(\Gamma \vdash_{\mathcal{R}} t : A\) holding in a theory generated by \(\mathcal{R}\) where \(\mathcal{R}\) already satisfies the support condition, without which the typing derivation offers little utility. We therefore introduce a proof strategy for showing that the support condition holds. The key insight of this strategy is to prove by induction that every equality and every typing derivation in the system is well-behaved with respect to support. Then, for the case of an equality \(\Gamma \vdash s = t\) arising from a rule \((\Gamma, s, t)\), we have \(\Gamma \vdash s : A\) as a premise and so by inductive hypothesis can assume that this typing derivation is well-behaved with respect to support. We formalise this with the following definition, called the \emph{supported rules} construction: \begin{definition} \label{def:rule-with-supp} Let \(\mathcal{R}\) be some equality rule set. The \emph{supported rules} construction applied to \(\mathcal{R}\) produces the equality rule set \(\mathcal{R}_{\mathsf{S}}\), given by: \[ \mathcal{R}_{\mathsf{S}} = \{ (\Gamma, s, t) \in \mathcal{R} \mid \Supp(s) = \Supp(t)\} \] The rule set \(\mathcal{R}_{\mathsf{S}}\) satisfies the support condition by construction. \end{definition} The proof strategy then proceeds as follows: to prove that \(\mathcal{R}\) satisfies the support condition, we instead prove that \(\mathcal{R}\) satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition, leveraging that \(\mathcal{R}_\mathsf{S}\) itself satisfies the support condition. The proof is then completed by the following lemma: \begin{lemma} \label{lem:proof-strat-supp} Let \(\mathcal{R}\) be an equality rule set that satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition. 
Then the following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash_\mathcal{R} A}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} A}\and \inferrule{\Gamma \vdash_\mathcal{R} s : A}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} s : A}\and \inferrule{\Gamma \vdash_\mathcal{R} \sigma : \Delta}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} \sigma : \Delta}\and \inferrule{\Gamma \vdash_\mathcal{R} A = B}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} A = B}\and \inferrule{\Gamma \vdash_\mathcal{R} s = t}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} s = t}\and \inferrule{\Gamma \vdash_\mathcal{R} \sigma = \tau}{\Gamma \vdash_{\mathcal{R}_\mathsf{S}} \sigma = \tau} \end{mathpar} and hence \(\mathcal{R}\) satisfies the support condition. \end{lemma} \begin{proof} The inference rules are all proven using a mutual induction on all typing and equality rules, using that \(\mathcal{R}\) satisfies the \(\mathcal{R}_\mathsf{S}\)-support condition in the case where the equality \(\Gamma \vdash s = t\) is derived from a rule \((\Gamma, s, t) \in \mathcal{R}\). This induction is formalised in \module{Catt.Support.Typing}. The set \(\mathcal{R}\) then satisfies the support condition since, if \((\Gamma,s,t) \in \mathcal{R}\) and \(\Gamma \vdash_{\mathcal{R}} s : A\), then \(\Gamma \vdash_{\mathcal{R}_{\mathsf{S}}} s : A\) holds by the first part of the lemma, and so \(\Supp(s) = \Supp(t)\) as \(\mathcal{R}\) is already known to satisfy the \(\mathcal{R}_{\mathsf{S}}\)-support condition. \end{proof} \begin{remark} The original motivation for parameterising \Catt by an arbitrary set of equality rules \(\mathcal{R}\) was not to share proofs between \Cattsu and \Cattsua, but to be able to state the supported rules construction. \end{remark} To be able to prove that \(\mathcal{R}\) satisfies the \(\mathcal{R}_{\mathsf{S}}\)-support condition, we will commonly need to know that \(\mathcal{R}_{\mathsf{S}}\) satisfies various tameness conditions, which are given by the next lemma. \begin{lemma} \label{lem:supp-sat-conds} Let \(\mathcal{R}\) be any equality set. Then \(\mathcal{R}_{\mathsf{S}}\) satisfies each of the weakening, suspension, and substitution conditions whenever \(\mathcal{R}\) satisfies the corresponding condition. \end{lemma} \begin{proof} Let \((\Gamma, s, t) \in \mathcal{R}\) be an arbitrary rule. To show \(\mathcal{R}_{\mathsf{S}}\) satisfies the weakening condition we need to show that: \[ (\Gamma, s, t) \in \mathcal{R}_{\mathsf{S}} \implies ((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R}_{\mathsf{S}} \] As \(\mathcal{R}\) satisfies the weakening condition, \(((\Gamma, (x : A)), \wk(s), \wk(t)) \in \mathcal{R}\), and by the premise of the implication we have \(\Supp(s) = \Supp(t)\). From this it follows that \(\Supp(\wk(s)) = \Supp(\wk(t))\) and so the conclusion of the implication holds. The case for suspension is similar, except we need to use the equality: \[ \Supp(\Sigma(s)) = \Sigma(\Supp(s)) = \Sigma(\Supp(t)) = \Supp(\Sigma(t)) \] derived from \cref{lem:susp-vs-prop} and \(\Supp(s) = \Supp(t)\) from the premise of the implication. For the substitution condition we need to show that: \[ \Supp(s) = \Supp(t) \implies \Supp(s \sub \sigma) = \Supp(t \sub \sigma) \] under the assumption that \(\Delta \vdash_{\mathcal{R}_\mathsf{S}} \sigma : \Gamma\). Since \(\mathcal{R}_\mathsf{S}\) satisfies the support condition, we can use \cref{cor:dc-sub} to get: \[ \Supp(s \sub \sigma) = \DC_\Delta(\FV(s) \sub \sigma) = \Supp(s) \sub \sigma = \Supp(t) \sub \sigma = \DC_\Delta(\FV(t) \sub \sigma) = \Supp(t \sub \sigma) \] as required.
\end{proof} We now prove the appropriate support condition for disc removal. \begin{proposition} \label{prop:dr-supp} Let \(\mathcal{R}\) satisfy the support and weakening conditions. Then the set \(\dr\) satisfies the \(\mathcal{R}\)-support condition. \end{proposition} \begin{proof} It is sufficient to prove, for \(t : \Term_\Gamma\), \(A, B : \Type_\Gamma\), and \(n = \dim(A)\), that: \[\Gamma \vdash_{\mathcal{R}} \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B \implies \Supp(\{A,t\}) = \Supp(t) \] Assume the premise of the implication. Then \(\Gamma \vdash_{\mathcal{R}} \{A,t\} : D^n\) by case analysis on the typing derivation, and so \(\Gamma \vdash_{\mathcal{R}} A\) and \(\Gamma \vdash_{\mathcal{R}} t : A\) by \cref{lem:disc-typing}, as \(\mathcal{R}\) satisfies the weakening condition. By a simple induction, it can be shown that \(\Supp(\{A,t\}) = \Supp(A) \cup \Supp(t)\). By \cref{item:supp-tm-char-2} we have \(\Supp(t) = \Supp(A) \cup \Supp(t)\), as \(\mathcal{R}\) satisfies the support condition, and so \(\Supp(\{A,t\}) = \Supp(t)\) as required. \end{proof} \paragraph{Preservation condition} Our last condition allows us to prove preservation, the property that typing is preserved by equality. \begin{definition} A set \(\mathcal{R}\) satisfies the \emph{\(\mathcal{R}'\)-preservation condition} for an equality set \(\mathcal{R}'\) when: \[ \Gamma \vdash_{\mathcal{R}'} s : A \implies \Gamma \vdash_{\mathcal{R}'} t : A \] for each \((\Gamma, s, t) \in \mathcal{R}\) and \(A : \Type_\Gamma\). The set \(\mathcal{R}\) satisfies the \emph{preservation condition} if it satisfies the \(\mathcal{R}\)-preservation condition. \end{definition} When a rule set \(\mathcal{R}\) has all the properties presented in this section, we are able to show preservation for the generated theory. \begin{proposition} Let \(\mathcal{R}\) be tame and satisfy the support and preservation conditions. Then the following inference rules are admissible: \begin{mathpar} \inferrule{\Gamma \vdash A\\ \Gamma \vdash A = B}{\Gamma \vdash B}\and \inferrule{\Gamma \vdash s : A\\ \Gamma \vdash s = t \\ \Gamma \vdash A = B}{\Gamma \vdash t : B}\and \inferrule{\Gamma \vdash \sigma : \Delta\\ \Gamma \vdash \sigma = \tau}{\Gamma \vdash \tau : \Delta} \end{mathpar} for \(A, B : \Type_\Gamma\), \(s,t : \Term_\Gamma\), \(\sigma : \arr \Delta A \Gamma\), and \(\tau : \arr \Delta B \Gamma\). \end{proposition} \begin{proof} We prove the following bidirectional versions of the inference rules by mutual induction on the equality derivation: \begin{alignat*}{5} &\Gamma \vdash A = B &&\implies (\Gamma \vdash A \iff \Gamma \vdash B)\\ &\Gamma \vdash s = t &&\implies (\forall A.\ \Gamma \vdash s : A \iff \Gamma \vdash t : A)\\ &\Gamma \vdash \sigma = \tau &&\implies (\Gamma \vdash \sigma : \Delta \iff \Gamma \vdash \tau : \Delta) \end{alignat*} which imply that the inference rules of the proposition are admissible (using the conversion rule for the second rule). Only the statement for terms has non-trivial cases, and so we split on the equality derivation \(\Gamma \vdash s = t\). The cases for reflexivity on variables and for transitivity are trivial, and the case for symmetry follows from the symmetry of the ``if and only if'' relation. Now suppose the equality is of the form \(\Coh \Delta A \sigma = \Coh \Delta B \tau\) and is derived from the equality rule for coherences from equalities \(\Delta \vdash A = B\) and \(\Gamma \vdash \sigma = \tau\). We prove the first direction, with the second following symmetrically.
We therefore assume we have a typing derivation \(\Gamma \vdash \Coh \Delta A \sigma : C\), and will induct on this derivation to construct a derivation of \(\Gamma \vdash \Coh \Delta B \tau : C\). \begin{itemize} \item If the derivation is constructed with the conversion rule from \(\Gamma \vdash \Coh \Delta A \sigma : D\) and \(\Gamma \vdash D = C\), then we get a derivation \(\Gamma \vdash \Coh \Delta B \tau : D\) by inductive hypothesis and can apply the conversion rule to get a derivation \(\Gamma \vdash \Coh \Delta B \tau : C\). \item If instead the derivation is constructed with the coherence rule then \(C \equiv A \sub \sigma\) and \(A \equiv \arr s {A'} t\) and therefore \(B \equiv \arr {u} {B'} {v}\) with \(\Delta \vdash s = u\) and \(\Delta \vdash t = v\). We also have that \(\Delta \vdash_{\mathsf{ps}}\), \((\Delta, \Supp(s), \Supp(t)) \in \mathcal{O}\), \(\Delta \vdash A\), and \(\Gamma \vdash \sigma : \Delta\). By the inductive hypothesis on the equality, we have \(\Delta \vdash B\) and \(\Gamma \vdash \tau : \Delta\). By \cref{prop:supp-prop}, \(\Supp(s) = \Supp(u)\) and \(\Supp(t) = \Supp(v)\) and so \((\Delta, \Supp(u), \Supp(v)) \in \mathcal{O}\). Hence, by the coherence rule we have \(\Gamma \vdash \Coh \Delta B \tau : B \sub \tau\). By \cref{prop:sub-prop-1,prop:sub-prop-2}, \(\Gamma \vdash A \sub \sigma = B \sub \tau\) and so by the conversion rule we obtain a derivation \(\Gamma \vdash \Coh \Delta B \tau : C\). \end{itemize} Finally, suppose the equality is derived from \textsc{rule}, such that \((\Gamma, s,t) \in \mathcal{R}\) and \(\Gamma \vdash s : A\). If \(\Gamma \vdash s : B\), then the preservation condition gives a derivation \(\Gamma \vdash t : B\). Conversely, if \(\Gamma \vdash t : B\), then we need to show \(\Gamma \vdash s : B\), for which it suffices to show that \(\Gamma \vdash A = B\). By applying the preservation condition to the derivation \(\Gamma \vdash s : A\), we get a derivation \(\Gamma \vdash t : A\). Then by \cref{lem:ty-unique}, we have \(\Gamma \vdash A = B\) and so the proof is complete by applying the conversion rule to the derivation \(\Gamma \vdash s : A\). \end{proof} As with the other conditions, we end this section by showing that \dr satisfies the preservation condition. \begin{proposition} \label{prop:dr-preserve} Suppose \(\mathcal{R}\) satisfies the weakening condition, and the set of operations \(\mathcal{O}\) contains the standard operations. Then \dr satisfies the \(\mathcal{R}\)-preservation condition. \end{proposition} \begin{proof} Take \((\Gamma, \Coh {D^n} {\wk(U^n)} {\{A,t\}}, t) \in \dr\) and suppose \(\Gamma \vdash \Coh {D^n} {\wk(U^n)} {\{A,t\}} : B\). Then by \cref{lem:ty-unique}: \[\Gamma \vdash B = \wk(U^n) \sub {\{A,t\}} \equiv A\] By \cref{lem:disc-typing}, \(\Gamma \vdash t : A\) and so by the conversion rule \(\Gamma \vdash t : B\) as required. \end{proof} \subsection{Endo-coherence removal} \label{sec:ecr} We conclude this chapter with a second example of a family of equality rules called \emph{endo-coherence removal}. As suggested by the name, these equalities simplify a class of terms known as endo-coherences. \begin{definition} An \emph{endo-coherence} is a coherence term \(\Coh \Delta {\arr s A s} \sigma\). \end{definition} If we consider the ps-context: \[ \Delta = (x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z) \] then we see that there are two distinct endo-coherences with source and target \(f * g\): the identity on \(f * g\), and the ``fake identity'' \(\Coh \Delta {f*g \to f*g} {\id_\Delta}\).
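As a preview of the rule introduced below, endo-coherence removal identifies this fake identity with the genuine one: since \(f * g\) has type \(\arr x \star z\) and support \(\Var(\Delta)\), the rule \textsc{ecr} (instantiated with \(\sigma = \id_\Delta\)) gives the equality \[ \Delta \vdash \Coh \Delta {f*g \to f*g} {\id_\Delta} = \id(\arr x \star z, f * g) \]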
In the type theories \Cattsu and \Cattsua introduced in \cref{sec:cattsu,sec:cattsua}, identities will be privileged, and these fake identities will be reduced to the true identity. More generally, for each term \(t\) there is a canonical endo-coherence with source and target \(t\), the identity on \(t\). Endo-coherence removal simplifies any other endo-coherence on that term to an identity. It makes the following rule admissible: \begin{mathpar} \inferrule{\Delta \vdash_{\mathsf{ps}} \\ \Delta \vdash A \\ \Delta \vdash s : A \\ \Supp(s) = \Var(\Delta) \\ \Gamma \vdash \sigma : \Delta}{\Gamma \vdash \Coh \Delta {\arr s A s} \sigma = \id(A \sub \sigma,s \sub \sigma)}\textsc{ecr} \end{mathpar} Endo-coherence removal can be assembled into the following equality rule set. \begin{definition} The \emph{endo-coherence removal set}, \ecr, is the set consisting of the triples: \[ (\Gamma, \Coh \Delta {\arr s A s} \sigma, \id(A\sub \sigma, s \sub \sigma)) \] for contexts \(\Gamma\) and \(\Delta\), \(A : \Type_\Delta\), \(s : \Term_\Delta\) with \(\Supp(s) = \Var(\Delta)\), and \(\sigma : \arr \Delta \star \Gamma\). A set of rules \(\mathcal{R}\) \emph{contains endo-coherence removal} if \(\ecr \subseteq \mathcal{R}\). We say that \(\mathcal{R}\) \emph{has endo-coherence removal} if the rule \textsc{ecr} holds in the generated theory. \end{definition} The set \ecr satisfies all the conditions introduced in this chapter, as proven in the next proposition, which concludes this chapter. \begin{proposition} \label{prop:ecr-props} Suppose the set of operations \(\mathcal{O}\) contains the standard operations. Then the set \ecr satisfies the following properties: \begin{lemmaenum} \item The set \ecr satisfies the weakening condition. \item The set \ecr satisfies the suspension condition. \item The set \ecr satisfies the \(\mathcal{R}\)-substitution condition, for any equality set \(\mathcal{R}\). \item \label{item:ecr-supp} The set \ecr satisfies the \(\mathcal{R}\)-support condition, for any equality set \(\mathcal{R}\) satisfying the support condition. \item \label{item:ecr-preserve} The set \ecr satisfies the \(\mathcal{R}\)-preservation condition, for any equality set \(\mathcal{R}\) satisfying the weakening and substitution conditions. \end{lemmaenum} \end{proposition} \begin{proof} Suppose \((\Gamma, \Coh \Delta {\arr s A s} \sigma, \id(A \sub \sigma, s \sub \sigma)) \in \ecr\). To show that the substitution condition holds, we suppose that \(\tau : \arr \Gamma \star \Theta\), and then must prove that: \[ (\Theta, \Coh \Delta {\arr s A s} {\sigma \bullet \tau}, \id(A \sub \sigma, s \sub \sigma) \sub \tau) \in \ecr \] It is immediate that: \[ (\Theta, \Coh \Delta {\arr s A s} {\sigma \bullet \tau}, \id(A \sub {\sigma \bullet \tau}, s \sub {\sigma \bullet \tau})) \in \ecr \] and so it suffices to prove that \(\id(A \sub \sigma, s \sub \sigma) \sub \tau \equiv \id(A \sub {\sigma \bullet \tau},s \sub {\sigma \bullet \tau})\), but this follows from \cref{item:disc-prop-sub-sub,prop:categorical}. The weakening condition then follows from the substitution condition.
For the suspension condition, it must be shown that: \[ (\Sigma(\Gamma), \Coh {\Sigma(\Delta)} {\arr {\Sigma(s)} {\Sigma(A)} {\Sigma(s)}} {\Sigma(\sigma)}, \Sigma(\id(A\sub \sigma, s \sub \sigma))) \in \ecr \] and so it suffices to show that \(\Supp(\Sigma(s)) = \Var(\Sigma(\Delta))\), which follows from \(\Supp(\Sigma(s)) = \Sigma(\Supp(s))\), and \[ \Sigma(\id(A \sub \sigma, s \sub \sigma)) \equiv \id(\Sigma(A) \sub {\Sigma(\sigma)}, \Sigma(s) \sub {\Sigma(\sigma)}) \] which follows from the functoriality of suspension and \cref{item:disc-prop-sub-susp,item:disc-prop-susp}. For the support condition, assume that \(\Gamma \vdash_{\mathcal{R}} \Coh \Delta {\arr s A s} \sigma : B\) for some \(B : \Type_\Gamma\) and that \(\mathcal{R}\) satisfies the support condition. Then: \begin{align*} \Supp(\Coh \Delta {\arr s A s} \sigma) &= \Supp(\sigma)\\ &= \FV(\sigma)&\text{by \cref{item:supp-sub-char}}\\ &= \Var(\Delta) \sub \sigma \\ &= \Supp(s) \sub \sigma&\text{by assumption}\\ &= (\Supp(A) \cup \Supp(s)) \sub \sigma &\text{by \cref{item:supp-tm-char-2}}\\ &= \DC_\Delta(\FV(A) \cup \FV(s)) \sub \sigma\\ &= \DC_\Gamma(\FV(A) \sub \sigma \cup \FV(s) \sub \sigma)&\text{by \cref{cor:dc-sub}}\\ &= \DC_\Gamma(\FV(A \sub \sigma) \cup \FV(s \sub \sigma))&\text{by \cref{prop:vs-sub}}\\ &= \Supp(A \sub \sigma) \cup \Supp(s \sub \sigma)\\ &= \Supp(\id(A \sub \sigma, s \sub \sigma)) \end{align*} as required. Lastly for the preservation condition, let \(\mathcal{R}\) satisfy the weakening and substitution conditions, and assume \(\Gamma \vdash \Coh \Delta {\arr s A s} {\sigma} : B\). By deconstructing the typing derivation, we must have that \(\Delta \vdash A\), \(\Delta \vdash s : A\), and \(\Gamma \vdash \sigma : \Delta\). Therefore, by \cref{prop:sub-prop-1}, \(\Gamma \vdash A \sub \sigma\) and \(\Gamma \vdash s \sub \sigma : A \sub \sigma\). Hence, by \cref{cor:id-typing}, \(\Gamma \vdash \id(A \sub \sigma, s \sub \sigma) : (\arr s A s) \sub \sigma\). It remains to prove that \(\Gamma \vdash (\arr s A s) \sub \sigma = B\), but this is immediate from \cref{lem:ty-unique}, applied to the derivation \(\Gamma \vdash \Coh \Delta {\arr s A s} \sigma : B\). \end{proof} \begin{figure}[t] \centering %\includegraphics[height=\textheight - 25pt]{test.pdf} \caption{Dependency graph of Agda formalisation.} \label{fig:dep-graph} \end{figure} \chapter{Constructions in \texorpdfstring{\boldmath\Cattr}{Cattr}} \label{sec:operations-catt} This chapter will investigate some more involved constructions that can be given in the type theory \Cattr. These constructions will be central to defining the reductions that underpin the type theories \Cattsu and \Cattsua which appear in \cref{cha:cattstrict}. We will give a definition of each construction, describe under what conditions it is well-formed, and state various properties describing the behaviour of the construction and its interaction with other constructions. For this chapter we will assume that we are working in a tame theory, as described in \cref{sec:tame-theories}. This means that all proofs in this section will hold in any variant of \Cattr such that the equality set \(\mathcal{R}\) satisfies the weakening, substitution, and suspension conditions, and the set of operations \(\mathcal{O}\) is suspendable and contains the standard operations. We will also use all the relevant proofs from \cref{sec:catt-with-equality}, without explaining exactly what condition of the set \(\mathcal{R}\) is being used. 
The formalisation is commonly more specific when specifying which conditions are necessary for each module, for example omitting the suspension condition when it is not needed for a specific construction, but for the body of this text we ignore these distinctions and simply assume that every theory we work with will be tame, as will be the case for all theories introduced in \cref{cha:cattstrict}. This chapter builds up to the following two constructions, which can be viewed as meta-operations on \Cattr. \begin{itemize} \item The \emph{pruning} operation will be introduced in \cref{sec:pruning} and is the main component of the type theory \Cattsu, defined in \cref{sec:cattsu}, a type theory for strictly unital \(\infty\)-categories. Pruning removes unnecessary identities from a term, simplifying the resulting term in the process. \item The \emph{insertion} operation will be introduced in \cref{sec:insertion}. It powers the type theory \Cattsua, a type theory for strictly unital and associative \(\infty\)-categories. Insertion merges certain arguments to a coherence into the body of the coherence itself, effectively ``inserting'' the argument into the head term. It can be viewed as a generalisation of pruning, but is a more complex construction. \end{itemize} Both pruning and insertion perform more radical modifications to the structure of a term than disc removal and endo-coherence removal, the equality rules we have seen so far. Pruning and insertion modify the pasting diagram in the coherence at the head of the term they act on. In this chapter, more combinatorial descriptions of pasting diagrams will be introduced to enable the pasting diagrams involved in these constructions to be constructed by induction. The pruning construction identifies locally maximal arguments of a coherence that are syntactically identities, and removes these arguments from the term, while also removing the component of the pasting diagram in the coherence which corresponds to this argument. Pruning could be applied to the term \(f * g * \id\), a ternary composite, to remove the identity argument and convert the ternary composite to a binary composite, returning the term \(f*g\). Insertion does not simply remove parts of a term, but flattens the structure of a term, moving data from a locally maximal argument into the head term. The motivating example for insertion is the term \(f * (g * h)\), a binary composite where one of the locally maximal arguments is itself a binary composite. Under insertion, the inner composite \(g * h\) is merged with the outer binary composite to form a single ternary composite \(f * g * h\). When a locally maximal argument is an identity, it will always be insertable, and the result of inserting the identity into the head term will be similar to pruning the same argument, motivating the viewpoint that insertion is a generalisation of pruning. At the end of this chapter, this relationship will be made precise. Insertion again performs more radical changes to the head coherence of the term than pruning, and needs to be able to merge two pasting diagrams into one along a locally maximal argument. The operation on pasting diagrams is best understood as an operation on \emph{trees}, an alternative characterisation of pasting diagrams which will be introduced in \cref{sec:trees}. Although the definition of these trees is simple, to be able to use them effectively we must be able to describe their relationship to the \Catt contexts they represent.
It will also be necessary to describe the morphisms between these trees, which correspond to substitutions between the underlying contexts, and the composition of such morphisms. Certain constructions on trees will not compute nicely with the syntax in \Catt. We therefore introduce a new notion of \emph{structured term}, an alternative syntax for \Catt which allows more complex representations of terms over contexts derived from trees. Structured terms effectively retain more information about how they are constructed, allowing constructions to compute on them in ways that are not possible on the raw syntax of \Catt. This representation of terms will be crucial in the formalisation, as it aids the proof assistant in simplifying various constructions. These structured terms are defined in \cref{sec:structured-terms}. Finally, \cref{sec:insertion} defines the constructions used in the insertion operation, using the structured syntax from the preceding section. In this section, many properties of insertion are stated, including a universal property that it satisfies. \section{Pruning} \label{sec:pruning} Pruning drives the strictly unital behaviour of \Cattsu. Unitality in \(\infty\)-categories is the property that the identity acts as a unit with respect to composition, so that composing with the unit is equivalent to the original term. If an \(\infty\)-category is strictly unital, then it exhibits this behaviour up to equality rather than equivalence. For \Catt, strict unitality means that a composition containing an identity as one of its arguments should be definitionally equal to the term with this argument removed. Pruning is the operation that removes an argument from a composition, taking a term such as \(f * g * \id\) to \(f * g\), or \(\id * f\) to the unary composite on \(f\). In the presence of strict units, it is also desirable to simplify the higher-dimensional data that witnessed the (weak) unitality in \Catt. For example, the left unitor on \(f\), given by the term: \[ \Coh {(x : \star), (y : \star), (f : \arr x \star y)} {\arr {\id(x) * f} {} {f}} {\id} \] which witnesses that composing on the left with an identity is equivalent to the original term, can be simplified to the identity on \(f\), and the triangle equations which govern the coherence laws for the unitors can also trivialise. For this reason, pruning is defined to be able to apply to any term which has an identity as a locally maximal argument. We review the definition of a locally maximal argument below. \begin{definition} In a context \(\Gamma\), a \emph{locally maximal variable} is a variable \(x\) of \(\Gamma\) that does not appear in the source or target of any other variable of \(\Gamma\). Equivalently, \(x\) is locally maximal when: \[ x \not\in \Supp(y) \] for any \(y \in \Var(\Gamma)\) with \(y \neq x\). Given a substitution \(\sigma : \Delta \to \Gamma\), a \emph{locally maximal argument} of \(\sigma\) is a term \(x \sub \sigma\) where \(x\) is a locally maximal variable of \(\Delta\).
\end{definition} \begin{example} \label{ex:lm} Consider the pasting diagram given by the following diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzIsMCwieSJdLFszLDAsInoiXSxbMCwxLCJmIiwwLHsiY3VydmUiOi01fV0sWzAsMSwiaCIsMix7ImN1cnZlIjo1fV0sWzAsMSwiZyIsMV0sWzEsMiwiaiJdLFszLDUsIlxcYWxwaGEiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNCwiXFxiZXRhIiwwLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} x && y & z \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "h", curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow["j", from=1-3, to=1-4] \arrow["\alpha", shorten <=3pt, shorten >=5pt, Rightarrow, from=0, to=2] \arrow["\beta", shorten <=5pt, shorten >=3pt, Rightarrow, from=2, to=1] \end{tikzcd} \] which corresponds to the \Catt context (written to highlight the dimension of each term): \begin{alignat*}{3} \Theta ={} &(x : \star),\\ &(y : \star),{}&&(f : x \to y),\\ &&&(g : x \to y),{}&&(\alpha : f \to g),\\ &&&(h : x \to y),&&(\beta : g \to h),\\ &(z : \star),&&(j : y \to z) \end{alignat*} The locally maximal variables of \(\Theta\) are \(\alpha\), \(\beta\), and \(j\). Note that \(j\) is locally maximal, despite not being of maximal dimension in the context. Pruning the context \(\Theta\) along the locally maximal variable \(\alpha\) removes the variables \(\alpha\) and \(g\) from the context, and must amend the type of \(\beta\) so that its source is \(f\). \end{example} To perform the pruning construction, we start with a coherence term \(\Coh \Delta A \sigma : \Term_\Gamma\), and assume that some locally maximal argument of \(\sigma\) is an identity, that is, \(x \sub \sigma \equiv \id(B,t)\) for some locally maximal variable \(x\), type \(B : \Type_\Gamma\), and term \(t : \Term_\Gamma\). We then construct the following: \begin{itemize} \item A new pasting diagram \(\Delta \sslash x\), corresponding to \(\Delta\) with the variable \(x\) and its target removed. \item A new set of arguments \(\sigma \sslash x\), consisting of the same terms as \(\sigma\) except those corresponding to \(x\) and its target. \item A projection substitution \(\pi_x : \Delta \to \Delta \sslash x\), from which a type \(A \sub {\pi_x} : \Type_{\Delta \sslash x}\) can be obtained. This projection sends \(x\) to the identity on its source, the target of \(x\) to the source of \(x\), and every other variable to itself. \end{itemize} We note that the source and target of the locally maximal variable \(x\) are well-defined, as \(x\) must be sent by \(\sigma\) to an identity, which cannot be zero-dimensional. \subsection{Dyck words} To be able to easily reason about the structures involved in pruning, we wish to define them by induction. To do this we introduce a different presentation of pasting diagrams called \emph{Dyck words}, which have a simpler inductive structure. Dyck words more directly encode the structure of the pasting diagram, and will allow us to give an inductive characterisation of the locally maximal variables of the associated context. \begin{definition} The set \(\Dyck_d\) of \emph{Dyck words} of trailing dimension \(d\) consists of lists of ``up'' and ``down'' moves, generated according to the following rules.
\begin{mathpar} \inferrule{ }{\circleddash : \Dyck_0} \and \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_d}{\mathcal{D} \Uparrow : \Dyck_{d + 1}} \and \inferrule{d : \mathbb{N} \\ \mathcal{D} : \Dyck_{d + 1}}{\mathcal{D} \Downarrow : \Dyck_d} \end{mathpar} In any prefix of a Dyck word \(\mathcal{D} : \Dyck_d\), the number of ``up'' moves (given by the constructor \(\Uparrow\)) must be greater than or equal to the number of ``down'' moves (given by the constructor \(\Downarrow\)). The difference between the number of each move in the full word is given by the trailing dimension \(d\). \end{definition} Dyck words can be given a visual interpretation as a \emph{mountain diagram}. To obtain such a diagram we start on the left-hand side, and draw a continuous line by drawing an upwards sloping segment for each \(\Uparrow\) in the word, and a downwards sloping segment for each \(\Downarrow\) in the word. An example of such a diagram is given in \cref{fig:mountain}. \begin{figure}[ht] \centering \[ \begin{tikzcd}[column sep = small, cells={inner sep = 0}, arrows={no head}] && \bullet && \bullet \\ & \bullet && \bullet && \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet \arrow[from=3-1, to=2-2] \arrow[from=2-2, to=1-3] \arrow[from=1-3, to=2-4] \arrow[from=2-4, to=1-5] \arrow[from=1-5, to=2-6] \arrow[from=2-6, to=3-7] \arrow[from=3-7, to=2-8] \arrow[from=2-8, to=3-9] \end{tikzcd}\] \caption[Mountain diagram]{Mountain diagram for \(\circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \Uparrow\, \Downarrow\, : \Dyck_0\).} \label{fig:mountain} \end{figure} The rules \(\circleddash\), \(\Uparrow\), and \(\Downarrow\) directly correspond to the rules \textsc{pss}, \textsc{pse}, and \textsc{psd} that generate the typing judgement for ps-contexts. From a Dyck word, we can directly construct the associated context by induction. \begin{definition} For a Dyck word \(\mathcal{D} : \Dyck_d\), its associated context \(\lfloor \mathcal{D} \rfloor\), associated type \(\ty_{\mathcal{D}} : \Type_{\lfloor \mathcal{D} \rfloor}\), and associated term \(\tm_{\mathcal{D}} : \Term_{\lfloor \mathcal{D} \rfloor}\) are defined by mutual induction on \(\mathcal{D}\): \begin{align*} \lfloor \circleddash \rfloor &= (x : \star)\\ \lfloor \mathcal{D} \Uparrow \rfloor &= \lfloor \mathcal{D} \rfloor, (y_{\mathcal{D}} : \ty_{\mathcal{D}}), (f_{\mathcal{D}} : \arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y_{\mathcal{D}}})\\ \lfloor \mathcal{D} \Downarrow \rfloor &= \lfloor \mathcal{D} \rfloor\\[10pt] \ty_{\circleddash} &= \star\\ \ty_{\mathcal{D}\Uparrow} &= \arr{\wk(\wk(\tm_{\mathcal{D}}))} {\wk(\wk(\ty_{\mathcal{D}}))} {y_{\mathcal{D}}}\\ \ty_{\mathcal{D} \Downarrow} &= \base(\ty_{\mathcal{D}})&\text{where }\base(\arr s A t) = A\\[10pt] \tm_{\circleddash} &= x\\ \tm_{\mathcal{D}\Uparrow} &= f_{\mathcal{D}}\\ \tm_{\mathcal{D}\Downarrow} &= \tgt(\ty_{\mathcal{D}})&\text{where }\tgt(\arr s A t) = t \end{align*} The variable names given here are used to avoid ambiguity in the definition. As we consider contexts up to \(\alpha\)-equality, we may freely change these variable names. The \(\tgt\) and \(\base\) operations are well-defined here as it may be checked by a simple induction that \(\dim(\ty_{\mathcal{D}}) = d\) for \(\mathcal{D} : \Dyck_d\), ensuring that we only apply \(\tgt\) and \(\base\) to types of strictly positive dimension. \end{definition} The tight correspondence between the rules used to construct Dyck words and ps-contexts allows an easy proof that the contexts associated to Dyck words are in fact pasting diagrams.
\begin{lemma} \label{lem:dyck-typing} For a Dyck word \(\mathcal{D} : \Dyck_d\), its associated context, type, and term are all well-formed: \[ \lfloor \mathcal{D} \rfloor \vdash \qquad \lfloor \mathcal{D} \rfloor \vdash \ty_{\mathcal{D}} \qquad \lfloor \mathcal{D} \rfloor \vdash \tm_{\mathcal{D}} : \ty_{\mathcal{D}} \] In addition to being a well-formed context, the context associated to a Dyck word is a ps-context; the following judgement holds: \[ \lfloor \mathcal{D} \rfloor \vdash_{\mathsf{ps}} \tm_{\mathcal{D}} : \ty_{\mathcal{D}} \] and so if \(\mathcal{D} : \Dyck_0\), we have \(\lfloor \mathcal{D} \rfloor \vdash_{\mathsf{ps}}\). Further, all ps-contexts are the associated context of a Dyck word. \end{lemma} \begin{proof} Due to the similarity of the rules for ps-contexts and Dyck words, this follows quickly from simple inductions, which are given in the formalisation. The proofs for the typing judgements appear in \module{Catt.Dyck.Typing} and the proofs for the ps-context judgements appear in \module{Catt.Dyck.Pasting}. \end{proof} The locally maximal variables in the context associated to a Dyck word correspond exactly to the points in the word where there is an upwards move followed immediately by a downwards move, creating a peak in the mountain diagram. These peaks can be given an inductive characterisation. \begin{definition} Let \(\mathcal{D} : \Dyck_d\) be a Dyck word. A \emph{peak} of \(\mathcal{D}\), \(p : \Peak_{\mathcal{D}}\), is inductively defined by the following rules: \begin{mathpar} \inferrule{d \in \mathbb{N} \\ \mathcal{D} : \Dyck_d}{\mathcal{D} \UDPeak : \Peak_{\mathcal{D} \Uparrow\,\Downarrow}}\and \inferrule{d \in \mathbb{N} \\ \mathcal{D} : \Dyck_d \\ p : \Peak_{\mathcal{D}}}{p \UpPeak : \Peak_{\mathcal{D} \Uparrow}}\and \inferrule{d \in \mathbb{N} \\ \mathcal{D} : \Dyck_{d+1} \\ p : \Peak_{\mathcal{D}}}{p \DownPeak : \Peak_{\mathcal{D} \Downarrow}} \end{mathpar} From each peak \(p : \Peak_{\mathcal{D}}\), a term \(\lfloor p \rfloor\) of \(\lfloor \mathcal{D} \rfloor\) can be inductively defined by: \[ \lfloor \mathcal{D} \UDPeak \rfloor = f_\mathcal{D} \qquad \lfloor p \UpPeak \rfloor = \wk(\wk(\lfloor p \rfloor)) \qquad \lfloor p \DownPeak \rfloor = \lfloor p \rfloor\] The term \(\lfloor p \rfloor\) is a locally maximal variable of \(\lfloor \mathcal{D} \rfloor\). \end{definition} \begin{example} \label{ex:dyck-peaks} Recall the ps-context \(\Theta\) from \cref{ex:lm}. This context is the associated context of the Dyck word: \[ \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \Uparrow\, \Downarrow\] for which the mountain diagram is given in \cref{fig:mountain}. The three locally maximal variables \(\alpha\), \(\beta\), and \(j\) correspond to the peaks: \[ \circleddash \Uparrow\, \UDPeak\, \UpPeak\, \DownPeak\, \DownPeak\, \UpPeak\, \DownPeak \qquad \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \UDPeak\, \DownPeak\, \UpPeak\, \DownPeak \qquad \circleddash \Uparrow\, \Uparrow\, \Downarrow\, \Uparrow\, \Downarrow\, \Downarrow\, \UDPeak \] which themselves correspond to the three peaks of the mountain diagram, with the height of each peak corresponding to the dimension of each locally maximal variable. \end{example} All disc contexts are pasting diagrams, and hence are the associated context of a Dyck word. \begin{definition} Let \(\mathcal{D}^n\) be the Dyck word with \(n\) upwards moves followed by \(n\) downwards moves. The equality \( \lfloor \mathcal{D}^n \rfloor \equiv D^n\) follows from a trivial induction.
If \(n > 0\), there is a unique peak of \(\mathcal{D}^n\), with associated term \(d_n\). \end{definition} We lastly show that a Dyck word can be suspended, which is expected as ps-contexts are closed under suspension. The various constructions associated to a suspended Dyck word are equal to the same constructions on the unsuspended Dyck word. \begin{lemma} Dyck words are closed under suspension. We define the suspension of a Dyck word \(\mathcal{D} : \Dyck_d\) to be the Dyck word \(\Sigma(\mathcal{D}) : \Dyck_{d+1}\) which is obtained by inserting an additional up move at the start of the word, or can alternatively be inductively defined by: \[ \Sigma(\circleddash) = \circleddash \Uparrow \qquad \Sigma(\mathcal{D}\Uparrow) = \Sigma(\mathcal{D})\Uparrow \qquad \Sigma(\mathcal{D}\Downarrow) = \Sigma(\mathcal{D})\Downarrow \] The following equalities hold: \[ \lfloor \Sigma(\mathcal{D}) \rfloor = \Sigma(\lfloor \mathcal{D} \rfloor) \qquad \ty_{\Sigma(\mathcal{D})} = \Sigma(\ty_{\mathcal{D}}) \qquad \tm_{\Sigma(\mathcal{D})} = \Sigma(\tm_{\mathcal{D}}) \] for each Dyck word \(\mathcal{D}\). For each peak \(p : \Peak_{\mathcal{D}}\), there is an associated peak \(\Sigma(p) : \Peak_{\Sigma(\mathcal{D})}\) which is defined similarly. \end{lemma} \begin{proof} These properties are all proved by straightforward induction on \(\mathcal{D}\). The formalised proofs appear in \module{Catt.Dyck.Properties}. \end{proof} The Dyck words presented in this section can be viewed as a more direct syntax for pasting contexts which allows induction to be performed easily. For this reason, most of the properties of Dyck words follow from routine inductions, and hence are relegated to the formalisation. The key contribution of this subsection is the characterisation of locally maximal variables as peaks, which have an easy inductive definition due to the simplicity of Dyck words. \begin{remark} All locally maximal variables of ps-contexts are identified with peaks, except for the unique variable of the singleton context. This discrepancy will make no difference for pruning, as a \(0\)-dimensional variable could never have been sent to an identity and so would never have been a candidate for pruning. \end{remark} \subsection{The pruning construction} \label{sec:prune-construction} Equipped with Dyck words, and a classification of locally maximal variables as peaks, we are now able to define each of the constructions used in the pruning operation. \begin{definition} Let \(\mathcal{D} : \Dyck_d\) be a Dyck word, and \(p : \Peak_{\mathcal{D}}\) be a peak of \(\mathcal{D}\).
The pruned Dyck word \(\mathcal{D} \sslash p : \Dyck_d\) and substitution \(\pi_p : \lfloor \mathcal{D}\rfloor \to \lfloor \mathcal{D} \sslash p \rfloor\) are then defined inductively on the peak \(p\) by the following equations: \begin{align*} \mathcal{D} \Uparrow\, \Downarrow \sslash \mathcal{D} \UDPeak &= \mathcal{D}\\ \mathcal{D} \Uparrow \sslash p \UpPeak &= (\mathcal{D} \sslash p) \Uparrow \\ \mathcal{D} \Downarrow \sslash p \DownPeak &= (\mathcal{D} \sslash p) \Downarrow \\[10pt] \pi_{\mathcal{D}\UDPeak} &= \langle \id_{\lfloor \mathcal{D} \rfloor} , \tm_{\mathcal{D}}, \id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}}) \rangle\\ \pi_{p \UpPeak} &= \langle \wk(\wk(\pi_p)) , y_{\mathcal{D}}, f_{\mathcal{D}} \rangle\\ \pi_{p \DownPeak} &= \pi_p\\ \intertext{If we further have a substitution \(\sigma : \arr {\lfloor \mathcal{D} \rfloor} \star \Gamma\) for some context \(\Gamma\), then the pruned substitution \(\sigma \sslash p : \arr {\lfloor \mathcal{D} \sslash p \rfloor} \star \Gamma\) can be formed:} \langle \sigma, s, t \rangle \sslash \mathcal{D}\UDPeak &= \sigma \\ \langle \sigma, s, t \rangle \sslash p \UpPeak &= \langle \sigma \sslash p, s, t \rangle \\ \sigma \sslash p \DownPeak &= \sigma \sslash p \end{align*} \end{definition} Each peak in a Dyck word corresponds to a consecutive upwards move and downwards move. Pruning this peak corresponds to removing these two moves, which does not change the trailing dimension of the Dyck word. The effect on the mountain diagram representation can be seen in \cref{fig:prune}. \begin{figure}[ht] \centering % https://q.uiver.app/#q=WzAsMTcsWzAsMiwiXFxidWxsZXQiXSxbMSwxLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMSwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFs1LDEsIlxcYnVsbGV0Il0sWzYsMiwiXFxidWxsZXQiXSxbNywxLCJcXGJ1bGxldCJdLFs4LDIsIlxcYnVsbGV0Il0sWzksMSwiXFxyaWdodHNxdWlnYXJyb3ciXSxbMTAsMiwiXFxidWxsZXQiXSxbMTEsMSwiXFxidWxsZXQiXSxbMTIsMCwiXFxidWxsZXQiXSxbMTMsMSwiXFxidWxsZXQiXSxbMTQsMiwiXFxidWxsZXQiXSxbMTUsMSwiXFxidWxsZXQiXSxbMTYsMiwiXFxidWxsZXQiXSxbMCwxXSxbMSwyLCIiLDAseyJjb2xvdXIiOlswLDYwLDYwXX1dLFsyLDMsIiIsMCx7ImNvbG91ciI6WzAsNjAsNjBdfV0sWzMsNF0sWzQsNV0sWzUsNl0sWzYsN10sWzcsOF0sWzEwLDExXSxbMTEsMTJdLFsxMiwxM10sWzEzLDE0XSxbMTQsMTVdLFsxNSwxNl1d % tex-fmt: skip \[ \begin{tikzcd}[column sep = small, cells={inner sep = 0}, arrows={no head}] && |[color={rgb,255:red,204;green,0;blue,14}]|\bullet && \bullet &&&&&&&& \bullet \\ & \bullet && \bullet && \bullet && \bullet && \rightsquigarrow && \bullet && \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet && \bullet &&&& \bullet && \bullet \arrow[from=3-1, to=2-2] \arrow[color={Diag2}, from=2-2, to=1-3] \arrow[color={Diag2}, from=1-3, to=2-4] \arrow[from=2-4, to=1-5] \arrow[from=1-5, to=2-6] \arrow[from=2-6, to=3-7] \arrow[from=3-7, to=2-8] \arrow[from=2-8, to=3-9] \arrow[from=3-11, to=2-12] \arrow[from=2-12, to=1-13] \arrow[from=1-13, to=2-14] \arrow[from=2-14, to=3-15] \arrow[from=3-15, to=2-16] \arrow[from=2-16, to=3-17] \end{tikzcd}\] \caption[Pruning]{Pruning of peak \(\circleddash \Uparrow\, \UDPeak\, \UpPeak\, \DownPeak\, \DownPeak\, \UpPeak\, \DownPeak\).} \label{fig:prune} \end{figure} When a peak is pruned, the locally maximal variable and its target are removed from the associated context. The substitution \(\pi_{\mathcal{D} \UDPeak}\) simply maps these two variables to \(\id(\ty_{\mathcal{D}},\tm_{\mathcal{D}})\) and \(\tm_{\mathcal{D}}\), where the Dyck term \(\tm_{\mathcal{D}}\) is the source of the locally maximal variable.
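To make the combinatorics of this definition concrete, the following Haskell fragment implements Dyck words untyped, as plain lists of moves, and pruning as deletion of an adjacent up-down pair. This is an illustrative sketch only: it ignores the trailing-dimension index, the associated contexts, and the projection \(\pi_p\), and its names are not taken from the Agda formalisation.
\begin{verbatim}
-- Dyck words as plain lists of moves, read left to right.
data Move = U | D deriving (Eq, Show)

type Dyck = [Move] -- valid when each prefix has at least as many U as D

-- A peak is a position where an up move is immediately followed by a
-- down move; pruning deletes the pair, matching the first clause of
-- the definition above and leaving the trailing dimension unchanged.
prune :: Int -> Dyck -> Maybe Dyck
prune i w = case splitAt i w of
  (pre, U : D : post) -> Just (pre ++ post)
  _                   -> Nothing

-- Pruning the first peak of the running example reproduces the
-- mountain diagrams of the figure above:
example :: Maybe Dyck
example = prune 1 [U, U, D, U, D, D, U, D]
-- example == Just [U, U, D, D, U, D]
\end{verbatim}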
Pruning a substitution simply removes the terms corresponding to the removed variables in the associated context. \begin{example} Let \(\Gamma = (x : \star), (f : \arr x \star x)\) and consider the term \(f * \id(x)\), which is given by: \[ \Coh {(a : \star), (b : \star), (c : a \to b), (d : \star), (e : b \to d)} {a \to d} {\langle x, x, f, x, \id(\star,x) \rangle} \] The context in this coherence is the associated context of the Dyck word \(\circleddash \Uparrow\,\Downarrow\,\Uparrow\,\Downarrow\), which has a peak \(\circleddash \Uparrow\,\Downarrow\,\UDPeak\) corresponding to the locally maximal variable \(e\). Since \(e\) is sent to an identity by the substitution, pruning can be applied to get: \begin{align*} \circleddash \Uparrow\,\Downarrow\,\Uparrow\,\Downarrow \sslash \circleddash \Uparrow\,\Downarrow\,\UDPeak &= \circleddash \Uparrow\, \Downarrow\\ \pi_{\circleddash \Uparrow\,\Downarrow\,\UDPeak} &= \langle a, b, c, b, \id(\star,b) \rangle\\ \langle x, x, f, x, \id(\star,x) \rangle \sslash \circleddash \Uparrow\,\Downarrow\,\UDPeak &= \langle x,x,f\rangle \end{align*} This results in the term: \[ \Coh {(a : \star), (b : \star), (c : a \to b)} {(a \to d) \sub {\langle a, b, c, b, \id(\star,b) \rangle} } {\langle x, x, f \rangle} \equiv \Coh {(a : \star), (b : \star), (c : a \to b)} {(a \to b)} {\langle x, x, f \rangle} \] which is the unary composite on \(f\). In the presence of disc removal, this term could further simplify to the variable \(f\). \end{example} With these constructions, we can define the pruning rule. \begin{definition} A term \(t\) \emph{is an identity} if \(t \equiv \id(A,s)\) for some type \(A\) and some term \(s\). The \emph{pruning rule set}, \prune, is the set consisting of the triples: \[ (\Gamma, \Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma, \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}) \] for each Dyck word \(\mathcal{D} : \Dyck_0\), peak \(p : \Peak_{\mathcal{D}}\), type \(A : \Type_{\lfloor \mathcal{D} \rfloor}\), and substitution \(\sigma : \arr {\lfloor \mathcal{D} \rfloor} \star \Gamma\) where \(\lfloor p \rfloor \sub \sigma\) is an identity. A set of rules \(\mathcal{R}\) \emph{contains pruning} if \(\prune \subseteq \mathcal{R}\). Pruning makes the following rule admissible: \begin{mathpar} \inferrule{\mathcal{D} : \Dyck_0 \\ p : \Peak_{\mathcal{D}} \\ \lfloor \mathcal{D} \rfloor \vdash A \\ \Gamma \vdash \sigma : \lfloor \mathcal{D} \rfloor \\\\ (\lfloor \mathcal{D} \rfloor, \Supp(\src(A)), \Supp(\tgt(A))) \in \mathcal{O}\\ \lfloor p \rfloor \sub \sigma \text{ is an identity}}{\Gamma \vdash \Coh {\lfloor \mathcal{D} \rfloor} A \sigma = \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}}\textsc{prune} \end{mathpar} The set \(\mathcal{R}\) \emph{has pruning} if the rule \textsc{prune} holds in the generated theory. \end{definition}
\end{lemma} \begin{proof} The proof proceeds by an induction on the peak \(p\), proving both equations simultaneously. Both equations hold by routine calculations given in \module{Catt.Dyck.Pruning.Properties} by the functions \func{Catt.Dyck.Pruning.Properties}{dyck-type-prune} and \func{Catt.Dyck.Pruning.Properties}{dyck-term-prune}. \end{proof} This allows the main typing properties of this section to be given. \begin{proposition} \label{prop:prune-ty} Let \(\mathcal{D} : \Dyck_d\) be a Dyck word and let \(p : \Peak_{\mathcal{D}}\) be a peak of this word. Then: \[ \lfloor \mathcal{D} \sslash p \rfloor \vdash \pi_p : \lfloor \mathcal{D} \rfloor \] Given a substitution \(\sigma\) with \(\Gamma \vdash \sigma : \lfloor \mathcal{D} \rfloor\), where \(\lfloor p \rfloor \sub \sigma\) is an identity, the equality and typing judgements: \[ \Gamma \vdash \sigma = \pi_p \bullet (\sigma \sslash p) \qquad \Gamma \vdash \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor \] hold. \end{proposition} \begin{proof} We prove that each judgement holds in turn by induction on the peak \(p\). For the judgement: \[ \lfloor \mathcal{D} \sslash p \rfloor \vdash \pi_p : \lfloor \mathcal{D} \rfloor \] the case when the peak is of the form \(p\DownPeak\) is trivial. The case where it is of the form \(\mathcal{D}\UDPeak\) easily follows from \cref{lem:dyck-typing,cor:id-typing}. For the case where the peak is of the form \(p\UpPeak\), it must be shown that: \[ \Delta \vdash \langle \wk(\wk(\pi_p)), y, f \rangle : \lfloor \mathcal{D} \rfloor, (y : \ty_{\mathcal{D}}), (f : \arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} y) \] where \(\Delta = \lfloor \mathcal{D} \sslash p \rfloor, (y : \ty_{\mathcal{D} \sslash p}), (f : \arr{\wk(\tm_{\mathcal{D}\sslash p})} {\wk(\ty_{\mathcal{D}\sslash p})} {y})\). This requires proofs of: \begin{align*} \Delta &\vdash \wk(\wk(\pi_p)) : \lfloor \mathcal{D} \rfloor\\ \Delta &\vdash y : \ty_{\mathcal{D}} \sub {\pi_p}\\ \Delta &\vdash f : (\arr {\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} y) \sub {\langle \wk(\pi_p), y \rangle} \end{align*} The first part follows from inductive hypothesis (and typing of weakening). The other two judgements follow from some calculation and \cref{lem:dyck-prune-prop}. For the second judgement: \[ \Gamma \vdash \sigma = \pi_p \bullet (\sigma \sslash p)\] The \(p \DownPeak\) case is again trivial. The \(p \UpPeak\) case follows easily from properties of weakening and the inductive hypothesis. For the \(\mathcal{D} \UDPeak\) case we suppose the substitution is of the form \(\langle \sigma, s, \id(A,t) \rangle\) and are required to show that: \[ \Gamma \vdash \langle \id_{\lfloor \mathcal{D} \rfloor}, \tm_{\mathcal{D}}, \id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}})\rangle \bullet \sigma = \langle \sigma, s, \id(A,t) \rangle \] It is immediate that \(\id_{\lfloor \mathcal{D} \rfloor} \bullet \sigma \equiv \sigma\) and so it remains to show that \(\Gamma \vdash \tm_{\mathcal{D}} \sub \sigma = s\) and \(\Gamma \vdash \id(\ty_{\mathcal{D}},\tm_{\mathcal{D}}) \sub \sigma = \id(A,t)\).
By deconstructing the typing derivation of \(\langle \sigma, s, \id(A,t) \rangle\), we have: \[ \Gamma \vdash \id(A,t) : (\arr{\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y}) \sub {\langle \sigma ,s \rangle} \] By \cref{cor:id-typing} and the uniqueness of typing, we must have: \[ \Gamma \vdash \arr t A t = (\arr{\wk(\tm_{\mathcal{D}})} {\wk(\ty_{\mathcal{D}})} {y}) \sub {\langle \sigma ,s \rangle} \equiv \arr {\tm_{\mathcal{D}} \sub \sigma} {\ty_{\mathcal{D}} \sub \sigma} {s} \] and so \(A = \ty_{\mathcal{D}} \sub \sigma\) and \(s = t = \tm_{\mathcal{D}} \sub \sigma\). The equality \(\id(\ty_{\mathcal{D}}, \tm_{\mathcal{D}}) \sub \sigma = \id(A,t)\) then follows, as equality is respected by the identity construction, which can be proved by a simple induction. Lastly, we consider the judgement: \[ \Gamma \vdash \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor \] The only difficult case is for the peak \(p \UpPeak\), where we can assume that the substitution is of the form \(\langle \sigma, s, t\rangle\), such that: \[ \langle \sigma, s, t\rangle \sslash p \UpPeak \equiv \langle \sigma \sslash p, s, t\rangle\] Typing for \(\sigma \sslash p\) follows from inductive hypothesis, and the typing for \(s\) and \(t\) follow from applying conversion rules to the corresponding parts of the typing derivation for \(\langle \sigma, s, t \rangle\). After some computation, the following equalities are needed for these conversion rules: \begin{align*} \Gamma &\vdash \tm_{\mathcal{D}} \sub \sigma = \tm_{\mathcal{D} \sslash p} \sub {\sigma \sslash p}\\ \Gamma &\vdash \ty_{\mathcal{D}} \sub \sigma = \ty_{\mathcal{D} \sslash p} \sub {\sigma \sslash p} \end{align*} The first is given by: \begin{align*} \tm_{\mathcal{D}} \sub \sigma &= \tm_{\mathcal{D}} \sub {\pi_p \bullet (\sigma \sslash p)}\\ &\equiv \tm_{\mathcal{D}} \sub {\pi_p} \sub {\sigma \sslash p}\\ &\equiv \tm_{\mathcal{D} \sslash p} \sub {\sigma \sslash p} \end{align*} and the second follows similarly, completing the proof. \end{proof} We next show that pruning has the expected properties on the Dyck words \(\mathcal{D}^n\), which correspond to disc contexts. \begin{proposition} \label{prop:prune-disc} Let \(n > 0\), and let \(p\) be the unique peak of \(\mathcal{D}^n\). Then: \[ \mathcal{D}^n \sslash p \equiv \mathcal{D}^{n-1} \qquad \{\arr s A t,u\} \sslash p \equiv \{A,s\}\] for all \(A,s,t,u\) where \(\dim(A) = n - 1\). \end{proposition} \begin{proof} Both properties are immediate. \end{proof} We now turn our attention to proving that the pruning equality set satisfies all the conditions from \cref{sec:ruleset}. We begin with the tameness conditions, omitting the weakening condition, as it follows from the substitution condition. \begin{proposition} \label{prop:prune-tame} For all \(\mathcal{D} : \Dyck_d\) and peaks \(p : \Peak_{\mathcal{D}}\), and substitutions \(\sigma : \lfloor \mathcal{D} \rfloor \to \Delta\) and \(\tau : \Delta \to \Gamma\) the following equality holds: \[ (\sigma \sslash p) \bullet \tau \equiv (\sigma \bullet \tau) \sslash p \] Hence, the set \prune satisfies the \(\mathcal{R}\)-substitution condition for any equality set \(\mathcal{R}\), and so also satisfies the weakening condition. Furthermore, the following equalities hold: \[\Sigma(\mathcal{D}) \sslash \Sigma(p) = \Sigma(\mathcal{D} \sslash p) \qquad \pi_{\Sigma(p)} \equiv \Sigma(\pi_p) \qquad \Sigma(\sigma \sslash p) \equiv \Sigma(\sigma) \sslash \Sigma(p)\] Therefore, the set \prune also satisfies the suspension condition, making the equality set \prune tame.
\end{proposition}
\begin{proof}
Each of these syntactic equalities is proved by an easy induction on the peak \(p\). The proofs are given in the formalisation in \module{Catt.Dyck.Pruning.Properties} as \func{Catt.Dyck.Pruning.Properties}{//s-sub}, \func{Catt.Dyck.Pruning.Properties}{prune-susp-peak}, \funcn{Catt.Dyck.Pruning.Properties}{susp-π}{susp-\(\pi\)}, and \func{Catt.Dyck.Pruning.Properties}{susp-//s}.
\end{proof}

To show that the support property holds, we must prove that \(\Supp(\sigma) = \Supp(\sigma \sslash p)\). We aim to do this by observing that \(\Supp(\sigma) = \Supp(\pi_p \bullet (\sigma \sslash p))\) and that \(\Supp(\pi_p \bullet (\sigma \sslash p)) = \Supp(\sigma \sslash p)\). By employing the proof strategy for the support condition introduced in \cref{sec:further-conditions}, the first will follow from the equality \(\sigma = \pi_p \bullet (\sigma \sslash p)\), which we may assume holds in a theory satisfying the support condition. For the second we need the following lemma.
\begin{lemma}
\label{lem:pi-bdry}
For all \(n : \mathbb{N}\), \(\epsilon \in \{-,+\}\), \(\mathcal{D} : \Dyck_d\), and \(p : \Peak_{\mathcal{D}}\):
\[ \bdry n \epsilon {\lfloor \mathcal{D} \rfloor} \sub {\pi_p} = \bdry n \epsilon {\lfloor \mathcal{D} \sslash p \rfloor} \]
and so \(\Supp(\pi_p) = \Var(\lfloor \mathcal{D} \sslash p \rfloor)\).
\end{lemma}
\begin{proof}
The main equation in this lemma is given by a long and technical induction on the peak \(p\). The details of this induction appear in the formalisation in the function \funcn{Catt.Dyck.Pruning.Support}{π-boundary-vs}{\(\pi\)-boundary-vs} which appears in the module \module{Catt.Dyck.Pruning.Support}. The equation \(\Supp(\pi_p) = \Var(\lfloor \mathcal{D} \sslash p \rfloor)\) follows from \cref{prop:vs-sub,lem:bdry-full}, by setting \(n = \dim(\lfloor \mathcal{D} \rfloor)\).
\end{proof}
We are now ready to prove that the support condition holds.
\begin{proposition}
\label{prop:prune-supp}
Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition. Then the set \prune satisfies the \(\mathcal{R}\)-support condition.
\end{proposition}
\begin{proof}
It suffices to prove that:
\[ \Supp(\Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma) = \Supp(\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}) \]
for \(\mathcal{D} : \Dyck_0\), \(p : \Peak_{\mathcal{D}}\), type \(A\), and substitution \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\), where \(\lfloor p \rfloor \sub \sigma\) is an identity and \(\Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma : B\) for some \(B\). By inspection of the typing derivation we obtain an instance of the judgement \(\Gamma \vdash_{\mathcal{R}} \sigma : \lfloor \mathcal{D} \rfloor\), and so:
\begin{align*}
\Supp(\Coh {\lfloor \mathcal{D} \rfloor} {A} \sigma) &= \Supp(\sigma)\\
&= \Supp(\pi_p \bullet (\sigma \sslash p))&(*)\\
&= \Supp(\pi_p) \sub {\sigma \sslash p}\\
&= \Var(\lfloor \mathcal{D} \sslash p \rfloor) \sub {\sigma \sslash p}&\text{by \cref{lem:pi-bdry}}\\
&= \Supp(\sigma \sslash p) \\
&= \Supp(\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p})
\end{align*}
where equality \((*)\) is derived by applying \cref{prop:supp-prop} to the equality
\[\Gamma \vdash_{\mathcal{R}} \sigma = \pi_p \bullet (\sigma \sslash p)\]
from \cref{prop:prune-ty}.
\end{proof}

To prove that the preservation condition holds, it is necessary to show that the coherence type \(A \sub{\pi_p}\) created by pruning corresponds to a valid operation. This cannot be deduced from any of the conditions that have been imposed on the operation set \(\mathcal{O}\) so far. Therefore, we introduce the following additional condition.
\begin{definition}
An operation set \(\mathcal{O}\) \emph{supports pruning} if for all \(\mathcal{D} : \Dyck_0\), \(p : \Peak_{\mathcal{D}}\), and variable sets \(U,V \subseteq \Var(\lfloor \mathcal{D} \rfloor)\) we have:
\[ (\lfloor \mathcal{D} \sslash p \rfloor, U \sub{\pi_p}, V \sub{\pi_p}) \in \mathcal{O} \]
whenever \((\lfloor \mathcal{D} \rfloor, U , V) \in \mathcal{O}\).
\end{definition}
The globular operation set trivially supports pruning. From \cref{lem:pi-bdry,prop:std-op}, it can be proved that the regular operation set supports pruning. We can now prove that the preservation condition holds.
\begin{proposition}
\label{prop:prune-preserve}
Let \(\mathcal{R}\) be a tame equality rule set and suppose the operation set \(\mathcal{O}\) supports pruning. Then the set \prune satisfies the \(\mathcal{R}\)-preservation condition.
\end{proposition}
\begin{proof}
Let \(\mathcal{D} : \Dyck_d\) be a Dyck word and \(p : \Peak_{\mathcal{D}}\) be a peak of \(\mathcal{D}\). Further suppose \(\arr s A t : \Type_{\lfloor \mathcal{D} \rfloor}\), and \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\) such that \(\lfloor p \rfloor \sub \sigma\) is an identity and:
\[ \Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \rfloor} {\arr s A t} \sigma : B\]
for some type \(B : \Type_\Gamma\). By inspection of this typing derivation we have:
\[ \lfloor \mathcal{D} \rfloor \vdash_{\mathcal{R}} \arr s A t \qquad \Gamma \vdash_{\mathcal{R}} \sigma : \lfloor \mathcal{D} \rfloor \qquad (\lfloor \mathcal{D} \rfloor, \Supp(s), \Supp(t)) \in \mathcal{O} \qquad \Gamma \vdash_{\mathcal{R}} B = (\arr s A t) \sub \sigma\]
and so by \cref{prop:prune-ty}, we have:
\[\lfloor \mathcal{D} \sslash p \rfloor \vdash_{\mathcal{R}} \pi_p : \lfloor \mathcal{D} \rfloor \qquad \Gamma \vdash_{\mathcal{R}} \sigma \sslash p : \lfloor \mathcal{D} \sslash p \rfloor\]
Therefore, as \(\mathcal{O}\) supports pruning, the following judgement holds:
\[\Gamma \vdash_{\mathcal{R}} \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {(\arr s A t) \sub {\pi_p}} {\sigma \sslash p} : (\arr s A t) \sub {\pi_p} \sub {\sigma \sslash p}\]
and so by applying the conversion rule, it suffices to show that
\[ \Gamma \vdash_{\mathcal{R}} B = (\arr s A t) \sub {\pi_p} \sub {\sigma \sslash p}\]
but this follows from the equality \(B = (\arr s A t) \sub \sigma\) and the equality \(\sigma = \pi_p \bullet (\sigma \sslash p)\) from \cref{prop:prune-ty}.
\end{proof}

We end this section with a property of pruning that will be required to prove confluence. Suppose we have a Dyck word \(\mathcal{D}\) and two distinct peaks \(p, q : \Peak_{\mathcal{D}}\). Then both peaks can be pruned from \(\mathcal{D}\) in either order. Consider the example below on the Dyck word from \cref{ex:dyck-peaks}.
\[ \begin{tikzcd}[column sep = 0.7em, row sep = scriptsize, cells={inner sep = 0,shape=circle,anchor=center}, arrows={no head}] &&&&&&&&&&& \bullet \\ &&&&&&&&&& \bullet && \bullet && |[color=Diag1]|\bullet \\ &&&&&&&&& \bullet &&&& \bullet && \bullet \\ && |[color=Diag2]|\bullet && \bullet &&&& |[color=Diag2, rotate=35]|\mathclap{\rightsquigarrow} &&&&&&&& |[color=Diag1, rotate=-35]|\mathclap{\rightsquigarrow} && \bullet \\ & \bullet && \bullet && \bullet && |[color=Diag1]|\bullet &&&&&&&&&& \bullet && \bullet \\ \bullet &&&&&& \bullet && \bullet &&&&&&&& \bullet &&&& \bullet \\ &&&&&&&& |[color=Diag1, rotate=-35]|\mathclap{\rightsquigarrow} &&& |[color=Diag2]|\bullet && \bullet &&& |[color=Diag2, rotate=35]|\mathclap{\rightsquigarrow} \\ &&&&&&&&&& \bullet && \bullet && \bullet \\ &&&&&&&&& \bullet &&&&&& \bullet \arrow[from=6-1, to=5-2] \arrow[color=Diag2, from=5-2, to=4-3] \arrow[color=Diag2, from=4-3, to=5-4] \arrow[from=5-4, to=4-5] \arrow[from=4-5, to=5-6] \arrow[from=5-6, to=6-7] \arrow[color=Diag1, from=6-7, to=5-8] \arrow[color=Diag1, from=5-8, to=6-9] \arrow[from=3-10, to=2-11] \arrow[from=2-11, to=1-12] \arrow[from=1-12, to=2-13] \arrow[from=2-13, to=3-14] \arrow[color=Diag1, from=3-14, to=2-15] \arrow[color=Diag1, from=2-15, to=3-16] \arrow[from=6-17, to=5-18] \arrow[from=5-18, to=4-19] \arrow[from=4-19, to=5-20] \arrow[from=5-20, to=6-21] \arrow[from=9-10, to=8-11] \arrow[color=Diag2, from=8-11, to=7-12] \arrow[color=Diag2, from=7-12, to=8-13] \arrow[from=8-13, to=7-14] \arrow[from=7-14, to=8-15] \arrow[from=8-15, to=9-16] \end{tikzcd} \] The following proposition proves that both peaks of the Dyck word can be pruned, and that the order in which this is done does not matter. \begin{proposition} \label{prop:prune-conf} Suppose \(\mathcal{D} : \Dyck_d\) is a Dyck word and let \(p\) and \(q\) be two distinct peaks of \(\mathcal{D}\). Then there is a peak \(q_{p}\) of \(\mathcal{D} \sslash p\) such that: \[ \lfloor q_{p} \rfloor \equiv \lfloor q \rfloor \sub {\pi_{p}}\] and a similar peak \(p_{q}\) of \(\mathcal{D} \sslash q\). Furthermore, the following equations hold syntactically: \begin{mathpar} (\mathcal{D} \sslash p) \sslash q_{p} = (\mathcal{D} \sslash q) \sslash p_{q} \and \pi_p \bullet \pi_{q_p} \equiv \pi_q \bullet \pi_{p_q} \and (\sigma \sslash p) \sslash q_{p} = (\sigma \sslash q) \sslash p_{q} \end{mathpar} where the last equation holds for any \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\). \end{proposition} \begin{proof} All proofs proceed by a simultaneous induction on both the peaks \(p\) and \(q\), and are given in \module{Catt.Dyck.Pruning.Properties} in the formalisation. The construction of the peak \(q_p\) is given by the function \func{Catt.Dyck.Pruning.Properties}{prune-peak}, the equality \(\lfloor q_p \rfloor \equiv \lfloor q \rfloor \sub {\pi_p}\) is given by \func{Catt.Dyck.Pruning.Properties}{prune-peak-prop}, and the remaining three equations are given by \func{Catt.Dyck.Pruning.Properties}{prune-conf}, \funcn{Catt.Dyck.Pruning.Properties}{π-conf}{\(\pi\)-conf}, and \func{Catt.Dyck.Pruning.Properties}{prune-sub-conf}. \end{proof} \section{Trees} \label{sec:trees} During the next sections we build up to defining the insertion operation. This operation performs larger modifications to pasting diagrams than the pruning operation, and we will again want to represent pasting diagrams differently to make the definition in \cref{sec:insertion} as natural as possible. 
It is well known that pasting diagrams correspond to planar rooted trees \cite{Weber2004,leinster2004higher,batanin1998monoidal}, which we will simply refer to as \emph{trees}, and which can be defined as follows.
\begin{definition}
A \emph{tree} \(T : \Tree\) is inductively defined to be a (possibly empty) list of trees.
\end{definition}
Throughout this section we will make use of standard operations and notations for lists. A list that contains the elements \(x_i\) for \(i\) from \(0\) to \(n\) will be written in square bracket notation as \([x_0,x_1,x_2,\dots,x_n]\). Further, we use the notation \(\emp\) for the empty list and \(\doubleplus\) for the concatenation of lists, which is associative and has the empty list as its unit. We will use the Agda-like notation of writing \(n :: ns\) for the list whose first element (the head) is \(n\) and whose remaining elements (the tail) are \(ns\). The length of a list will be given by the operation \(\len\).

We will use the notation \(\Sigma(T) = [T]\), and call \(\Sigma(T)\) the suspension of \(T\), for reasons that will become apparent once the context generated by a tree has been defined in \cref{sec:tree-contexts}. We note that it will be common to see expressions of the form \(S :: T\) where \(S\) and \(T\) are both trees. It may seem as if this were an error, and that a concatenation operation should have been given instead, but in this case we are exploiting the identification of trees and lists of trees to treat \(S\) as a tree (as an element of the list) and \(T\) as a list of trees.

We now define some common operations on trees.
\begin{definition}
\label{def:treetrunk}
The \emph{depth} of a tree \(\dep(T)\) is \(0\) if \(T\) is empty or \(1 + \max_k{\dep(T_k)}\) if \(T = [T_0,\dots,T_n]\). For a tree \(T\), its \emph{trunk height}, \(\th(T)\), is \(1 + \th(T_0)\) if \(T = [T_0]\) and \(0\) otherwise. A tree is \emph{linear} if its trunk height equals its depth. Subtrees of a tree can be indexed by a list of natural numbers \(P\), giving a subtree \(T^P\) by letting \(T^{\emp} = T\) and \(T^{k::P} = {(T_k)}^P\) if \(T = [T_0, \dots, T_n]\).
\end{definition}
As these trees represent pasting diagrams, a context can be associated to each one. To be able to make effective use of trees we will need to understand this mapping to contexts, and the associated constructions used in this mapping. One of these constructions is suspension, which we have already seen. The second is an operation known as the wedge sum, which will be introduced in \cref{sec:wedge-sums}. Both these operations are mappings from contexts to contexts which preserve ps-context derivations. We will see in \cref{sec:tree-contexts} that a further result holds: these two operations (along with the singleton context) are sufficient to generate all ps-contexts.
\begin{remark}
In the formalisation, trees are defined in \module{Catt.Tree} and take a slightly different form from the trees defined above, being defined instead as binary trees. This exploits an isomorphism between binary trees and trees with arbitrary (finite) branching. The constructors for the trees in the formalisation are called \(\mathsf{Sing}\), which stands for ``singleton'' and takes no arguments, and \(\mathsf{Join}\), which takes two trees as arguments.
The isomorphism is generated from the following rules:
\begin{mathpar}
\inferrule{ }{\mathsf{Sing} \simeq \emp}\and
\inferrule{S \simeq S' \\ T \simeq T'}{\mathsf{Join}(S,T) \simeq S' :: T'}
\end{mathpar}
Presenting trees in this way in the formalisation allows any induction to be done as a single induction over the constructors of a tree, instead of simultaneously inducting on the depth of the tree and on lists. We retain the standard presentation of trees in this text for simplicity of notation. Under the above isomorphism, this has no effect on the formal development.
\end{remark}

\subsection{Wedge sums}
\label{sec:wedge-sums}

The wedge sum, just like suspension, is an operation inspired by a similar operation on topological spaces. Given two spaces \(X\) and \(Y\) and points \(x\) of \(X\) and \(y\) of \(Y\), the space \(X \vee Y\) can be formed by taking the disjoint union of \(X\) and \(Y\) and identifying the points \(x\) and \(y\). This construction satisfies a universal property: it is the colimit of the following diagram:
\begin{equation}
\label[diagram]{diag:wedge-colimit}
\begin{tikzcd}
X && Y \\
& {\{*\}}
\arrow["x", from=2-2, to=1-1]
\arrow["y"', from=2-2, to=1-3]
\end{tikzcd}
\end{equation}
where the arrows labelled \(x\) and \(y\) send the unique point \(*\) to \(x\) and \(y\) respectively. Such a universal construction gives rise to two inclusions:
\[\inc_X : X \to X \vee Y \qquad \inc_Y : Y \to X \vee Y\]
A similar colimit can be formed in the syntactic category of \Cattr. Since the variables of a context are ordered, every (non-empty) context in \Catt is naturally bipointed. For a context \(\Gamma\), the first point is given by the first variable of the context (which must have type \(\star\)), which we name \(\fst(\Gamma)\), and the second point is given by the last \(0\)-dimensional variable in the context, which we name \(\snd(\Gamma)\). We therefore restrict the construction above to the case where the chosen point of the left context \(\Gamma\) is \(\snd(\Gamma)\) and the chosen point of the right context \(\Delta\) is \(\fst(\Delta)\). This simplifies the construction, and will be the only case we need for forming trees. We note that \(\fst(\Sigma(\Gamma)) \equiv N\) and \(\snd(\Sigma(\Gamma)) \equiv S\), as we will commonly take the wedge sums of suspended contexts.
\begin{definition}
Let \(\Gamma\) and \(\Delta\) be non-empty contexts. We then mutually define the \emph{wedge sum} \(\Gamma \vee \Delta\) and inclusions \(\inc_\Gamma : \arr \Gamma \star {\Gamma \vee \Delta}\) and \(\inc_\Delta : \arr \Delta \star {\Gamma \vee \Delta}\) by induction on the context \(\Delta\), noting that the base case is \(\Delta = (x : A)\) as \(\Delta\) is non-empty.
\begin{align*} \Gamma \vee (x : A) &= \Gamma \\ \Gamma \vee \Delta, (x : A) &= \Gamma \vee \Delta, (x : A \sub {\inc_\Delta})\\[10pt] \inc_\Gamma &= \wk^{n - 1}(\id_\Gamma) &\text{when \(\Delta\) has length \(n\)}\\[10pt] \inc_{(x : A)} &= \langle \snd(\Gamma) \rangle\\ \inc_{\Delta, (x : A)} &= \langle \wk(\inc_\Delta), x \rangle \end{align*} If we further have substitutions \(\sigma : \arr \Gamma A \Theta\) and \(\tau : \arr \Delta A \Theta\), then we can define the substitution \( \sigma \vee \tau : \arr {\Gamma \vee \Delta} A \Theta\) again by induction on \(\Delta\): \begin{align*} \sigma \vee \langle A, s \rangle &= \sigma\\ \sigma \vee \langle \tau, s \rangle &= \langle \sigma, s \rangle \end{align*} We note that no extra property is needed to define this universal map, though to show it is well-formed we will need that \(\snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau\). \end{definition} We firstly prove some basic properties required for \(\Gamma \vee \Delta\) to be the colimit of \cref{diag:wedge-colimit}. \begin{lemma} \label{lem:wedge-sum-prop} Let \(\Gamma\) and \(\Delta\) be non-empty contexts. Then: \[ \inc_{\Gamma} \vee \inc_{\Delta} \equiv \id_{\Gamma \vee \Delta} \] Further, the following equations hold: \[ \inc_{\Gamma} \bullet (\sigma \vee \tau) \equiv \sigma \qquad \inc_{\Delta} \bullet (\sigma \vee \tau) \equiv \tau \] for substitutions \(\sigma : \arr \Gamma A \Theta\) and \(\tau : \arr \Delta A \Theta\) where the second equality requires that \(\snd(\Gamma) \sub \sigma \equiv \fst(\Delta) \sub \tau\). Lastly: \[ (\sigma \vee \tau) \bullet \mu \equiv (\sigma \bullet \mu) \vee (\tau \bullet \mu) \] where \(\mu : \arr \Theta B {\Theta'}\) is another substitution. \end{lemma} \begin{proof} Proofs appear as \func{Catt.Wedge.Properties}{sub-from-wedge-prop}, \func{Catt.Wedge.Properties}{sub-from-wedge-inc-left}, \func{Catt.Wedge.Properties}{sub-from-wedge-inc-right}, and \func{Catt.Wedge.Properties}{sub-from-wedge-sub} in \module{Catt.Wedge.Properties}. \end{proof} To simplify definitions of substitutions between wedge sums of contexts, we will write substitutions diagrammatically by specifying the individual components. Consider the following diagram: % https://q.uiver.app/?q=WzAsNixbMCwyLCJcXFNpZ21hXFxHYW1tYSJdLFsxLDIsIlxcdmVlIl0sWzIsMiwiXFxTaWdtYSBcXERlbHRhIl0sWzAsMCwiXFxTaWdtYSBcXEdhbW1hJyJdLFsyLDAsIlxcU2lnbWFcXERlbHRhJyJdLFsxLDAsIlxcdmVlIl0sWzAsMywiXFxTaWdtYSBcXHNpZ21hIl0sWzIsNCwiXFxTaWdtYSBcXHRhdSJdXQ== % tex-fmt: skip \[ \begin{tikzcd}[column sep=tiny, row sep=10pt] {\Gamma'} & \vee & {\Delta'} &\vee &{\Theta'} \\ \\ \Gamma & \vee & \Delta & \arrow["{\sigma}", from=3-1, to=1-1, pos=.4] \arrow["{\tau}", from=3-3, to=1-3, pos=.4] \end{tikzcd} \] which is generated from substitutions \(\sigma : \Gamma \to \Gamma'\) and \(\tau : \Delta \to \Delta'\). A substitution \(\Gamma \vee \Delta \to \Gamma' \vee \Delta' \vee \Theta'\) can be generated by composing each arrow in the diagram with suitable inclusions so that its target is \(\Gamma' \vee \Delta' \vee \Theta'\), and then using the universal property of the wedge to map out of the source context. In the diagram above the generated substitution is: \[ ((\sigma \bullet \inc_{\Gamma'}\bullet \inc_{\Gamma' \vee \Delta'}) \vee (\tau \bullet \inc_{\Delta'}\bullet \inc_{\Gamma' \vee \Delta'})) \] To ensure these definitions are unique, the following proposition is needed: \begin{proposition} The wedge sum \(\vee\) is associative and has the singleton context \((x : \star)\) as its left and right unit. 
Given a context \(\Gamma\), the inclusions satisfy the following unitality properties:
\[ \inc_{\Gamma} : \Gamma \to \Gamma \vee (x : \star) \equiv \id_\Gamma \qquad \inc_{\Gamma} : \Gamma \to (x : \star) \vee \Gamma \equiv \id_\Gamma \]
and given substitutions \(\sigma : \arr \Gamma A \Xi\), \(\tau : \arr \Delta A \Xi\), and \(\mu : \arr \Theta A \Xi\) we have:
\[ (\sigma \vee \tau) \vee \mu \equiv \sigma \vee (\tau \vee \mu)\]
There is a unique way of including each of the contexts \(\Gamma\), \(\Delta\), and \(\Theta\) into \(\Gamma \vee \Delta \vee \Theta\); that is, there is a unique substitution \(\Gamma \to \Gamma \vee \Delta \vee \Theta\) which is built from a composite of inclusions, and similarly for \(\Delta\) and \(\Theta\).
\end{proposition}
\begin{proof}
The proofs of these are given in \module{Catt.Wedge.Properties}, and all proceed by induction on the rightmost context. The proof for the right unitality of \(\vee\) is omitted from the formalisation as it is immediate from the definitions. The uniqueness of the inclusion substitutions is given by
\begin{itemize}
\item \func{Catt.Wedge.Properties}{wedge-inc-left-assoc}, which says:
\begin{align*}
\inc_{\Gamma} \bullet \inc_{\Gamma \vee \Delta} : \Gamma \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Gamma} : \Gamma \to \Gamma \vee (\Delta \vee \Theta)\\
\intertext{ \item \func{Catt.Wedge.Properties}{wedge-incs-assoc}, which says:}
\inc_{\Delta} \bullet \inc_{\Gamma \vee \Delta} : \Delta \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Delta} \bullet \inc_{\Delta \vee \Theta} : \Delta \to \Gamma \vee (\Delta \vee \Theta)\\
\intertext{ \item \func{Catt.Wedge.Properties}{wedge-inc-right-assoc}, which says:}
\inc_{\Theta} : \Theta \to (\Gamma \vee \Delta) \vee \Theta &\equiv \inc_{\Theta} \bullet \inc_{\Delta \vee \Theta} : \Theta \to \Gamma \vee (\Delta \vee \Theta)
\end{align*}
\end{itemize}
We note that the definition of the wedge sum differs slightly in the formalisation, specifying a term \(t\) in \(\Gamma\) which takes the role of \(\snd(\Gamma)\), in order to give more computational control. By replacing the terms \(t\) in the formalisation by \(\snd(\Gamma)\) for the appropriate context \(\Gamma\), and noting that \(\snd(\Delta) \sub{\inc_{\Delta}} \equiv \snd(\Gamma \vee \Delta)\) (which can be proved by an easy induction), the results written here can be recovered.
\end{proof}

The previous proposition ensures that the diagrammatic notation for substitutions between wedge sums uniquely defines a substitution. We next show that all the constructions in this section have the expected typing properties.
\begin{lemma}
\label{lem:wedge-typing}
The following inference rules are admissible in \Cattr:
\begin{mathpar}
\inferrule{\Gamma \vdash \\ \Delta \vdash}{\Gamma \vee \Delta \vdash}\and
\inferrule{ }{\Gamma \vee \Delta \vdash \inc_{\Gamma} : \Gamma}\and
\inferrule{ }{\Gamma \vee \Delta \vdash \inc_{\Delta} : \Delta}\and
\inferrule{\Theta \vdash \snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau}{\Theta \vdash \inc_{\Delta} \bullet (\sigma \vee \tau) = \tau}\and
\inferrule{\Theta \vdash \sigma : \Gamma \\ \Theta \vdash \tau : \Delta \\ \Theta \vdash \snd(\Gamma) \sub \sigma = \fst(\Delta) \sub \tau}{\Theta \vdash \sigma \vee \tau : \Gamma \vee \Delta}\and
\inferrule{\Theta \vdash \sigma = \sigma'\\ \Theta \vdash \tau = \tau'}{\Theta \vdash \sigma \vee \tau = \sigma' \vee \tau'}
\end{mathpar}
\end{lemma}
\begin{proof}
All proofs are given in \module{Catt.Wedge.Typing}.
\end{proof}

We finally show that the wedge sum preserves pasting diagrams, the property that wedge sums were initially introduced for.
\begin{proposition}
\label{prop:wedge-ps}
The wedge sum of two ps-contexts is a ps-context: If \(\Gamma \vdash_{\mathsf{ps}}\) and \(\Delta \vdash_{\mathsf{ps}}\), then \(\Gamma \vee \Delta \vdash_{\mathsf{ps}}\).
\end{proposition}
\begin{proof}
It can first be proven that if the derivation \(\Gamma \vdash_{\mathsf{ps}}\) is generated by \(\Gamma \vdash_{\mathsf{ps}} x : \star\), then \(x \equiv \snd(\Gamma)\): one shows by induction that for all derivations \(\Gamma \vdash_{\mathsf{ps}} x : A\) with \(\dim(A) > 0\), the \(0\)-target of the type \(A\) is \(\snd(\Gamma)\), and then case splits on the original derivation. Then \(\Gamma \vdash_{\mathsf{ps}}\) implies that \(\Gamma \vdash_{\mathsf{ps}} \snd(\Gamma) : \star\).

The statement of the proposition is then obtained from the following more general statement, which is proven by induction: If \(\Gamma \vdash_{\mathsf{ps}}\) and \(\Delta \vdash_{\mathsf{ps}} x : A\), then:
\[ \Gamma \vee \Delta \vdash_{\mathsf{ps}} x \sub {\inc_{\Delta}} : A \sub {\inc_{\Delta}}\]
The base case is given by the preceding paragraph, and the other cases follow from routine calculation. These proofs are given in \module{Catt.Wedge.Pasting}.
\end{proof}
We lastly give a version of the wedge sum construction for variable sets.
\begin{definition}
Let \(\Gamma\) and \(\Delta\) be two non-empty contexts, and let \(U \subseteq \Var(\Gamma)\) and \(V \subseteq \Var(\Delta)\) be variable sets. Then define:
\[U \vee V = U \sub {\inc_\Gamma} \cup V \sub {\inc_\Delta}\]
to be a variable set of \(\Gamma \vee \Delta\).
\end{definition}

\subsection{Tree contexts}
\label{sec:tree-contexts}

We have now defined suspensions and wedge sums, and shown that both operations preserve ps-contexts. This allows us to define the context generated by a tree.
\begin{definition}
For a tree \(T\), the context \(\lfloor T \rfloor\) generated from it is defined recursively by:
\[\lfloor \emp \rfloor = D^0 \qquad \lfloor [T_0,\dots,T_n] \rfloor = \bigvee\limits_{i = 0}^n \Sigma\lfloor T_i \rfloor\]
It is immediate from this definition that \(\lfloor \Sigma(T) \rfloor \equiv \Sigma(\lfloor T \rfloor)\), \(\lfloor S \doubleplus T \rfloor \equiv \lfloor S \rfloor \vee \lfloor T \rfloor\), and that \(\dim(\lfloor T \rfloor) = \dep(T)\).
\end{definition}
We can immediately give some examples of trees and their associated contexts. The context associated to \(\emp\) is \(D^0\) by definition, and so as \(D^{n+1} \equiv \Sigma(D^n)\), all the disc contexts can easily be recovered from trees as \(D^n \equiv \lfloor \Sigma^n(\emp) \rfloor\). Each tree \(\Sigma^n(\emp)\) is linear and has depth \(n\). Trees can also be drawn graphically as follows: For a tree \([T_0,\dots,T_n]\), first recursively draw the trees \(T_i\) and lay these out in a horizontal line. Then a single point, which we call the root of the tree, is drawn underneath these subtrees, and a line is drawn between the root of the tree and the root of each subtree. An example is given in \cref{fig:tree-example}.
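To make these operations concrete, the following is a minimal executable sketch of trees and the operations of \cref{def:treetrunk}, written here in Haskell purely for illustration; the names \texttt{depth}, \texttt{trunkHeight}, \texttt{linear}, and \texttt{subtree} are our own and do not correspond to names in the formalisation.
\begin{verbatim}
-- A tree is a (possibly empty) list of trees.
newtype Tree = Tree [Tree]

-- dep(T): 0 for the empty tree, else 1 + the maximum depth of a subtree.
depth :: Tree -> Int
depth (Tree []) = 0
depth (Tree ts) = 1 + maximum (map depth ts)

-- th(T): 1 + th(T0) when T = [T0], and 0 otherwise.
trunkHeight :: Tree -> Int
trunkHeight (Tree [t]) = 1 + trunkHeight t
trunkHeight _          = 0

-- A tree is linear when its trunk height equals its depth.
linear :: Tree -> Bool
linear t = trunkHeight t == depth t

-- Suspension wraps a tree in a singleton list.
suspend :: Tree -> Tree
suspend t = Tree [t]

-- Subtree indexing T^P by a list of natural numbers P.
subtree :: Tree -> [Int] -> Maybe Tree
subtree t [] = Just t
subtree (Tree ts) (k : p)
  | 0 <= k && k < length ts = subtree (ts !! k) p
  | otherwise               = Nothing
\end{verbatim}
On the tree \([[\emp,\emp],\emp]\) of \cref{fig:tree-example}, written \texttt{Tree [Tree [Tree [], Tree []], Tree []]}, this sketch gives a depth of \(2\) and a trunk height of \(0\), so the tree is not linear, whereas each \(\Sigma^n(\emp)\) is linear of depth \(n\), in line with the equation \(\dim(\lfloor T \rfloor) = \dep(T)\) above.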
\begin{figure}[ht]
\centering
\begin{tikzpicture}[every node/.style={scale=0.6},baseline=(x11.base)]
\node [on grid](x01) {$\bullet$};
\node [above left=0.5 and 0.3 of x01, on grid] (x11) {$\bullet$};
\node [above left=0.5 and 0.25 of x11, on grid] (x21) {$\bullet$};
\node [above right=0.5 and 0.25 of x11, on grid] (x22) {$\bullet$};
\node [above right=0.5 and 0.3 of x01, on grid](x12) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\end{tikzpicture}
\qquad
\begin{tikzcd}
\bullet & \bullet & \bullet
\arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-2]
\arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-2]
\arrow[""{name=2, anchor=center, inner sep=0}, from=1-1, to=1-2]
\arrow[from=1-2, to=1-3]
\arrow[shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2]
\arrow[shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0]
\end{tikzcd}
\caption{The tree \([[\emp,\emp],\emp]\) and its generated context.}
\label{fig:tree-example}
\end{figure}

The context associated to a tree is clearly a pasting diagram, as the context is built only using the singleton context, wedge sums, and suspension. In fact, the set of contexts generated by trees is exactly the smallest set containing the singleton context and closed under wedge sums and suspensions. Further, it is proven in the formalisation module \module{Catt.Dyck.FromTree} that all pasting diagrams are generated by some tree, though this will not be needed for the formal development of our type theories.

We next introduce \emph{paths}, which can be thought of as the variables in a tree.
\begin{definition}
Let \(T\) be a tree. \emph{Paths} \(p : \Path_T\) are non-empty lists of natural numbers of the form \(q \doubleplus [n]\) such that \(q\) indexes a subtree \(T^q\) of \(T\) and \(0 \leq n \leq \len(T^q) \). For path \(p : \Path_T\), we obtain a variable of \(\lfloor T \rfloor\) by recursion on \(p\) as follows:
\begin{itemize}
\item Suppose \(p = [n]\). Let \(T = [T_0,\dots,T_k]\). It is clear that \(\lfloor T \rfloor\) has exactly \(k+2\) variables of dimension \(0\), corresponding to (the inclusion of) the first variable of each context \(\Sigma(\lfloor T_i \rfloor)\), as well as the variable corresponding to the inclusion of \(\snd(\Sigma(\lfloor T_k \rfloor))\). We then define \(\lfloor [n] \rfloor\) to be the \(n\)\textsuperscript{th} such variable, indexing from 0.
\item Let \(p = k :: q\) and \(T = [T_0,\dots,T_k,\dots]\), where \(q\) is a path of \(T_k\). Then by recursion we have a variable \(\lfloor q \rfloor\) of \(\lfloor T_k \rfloor\). This gives a variable \(\Sigma(\lfloor q \rfloor)\) of \(\Sigma(\lfloor T_k \rfloor)\) which can be included into \(\lfloor T \rfloor\) by the appropriate inclusion to get \(\lfloor p \rfloor\).
\end{itemize}
We lastly define the set of \emph{maximal paths} \(\MaxPath_T\) of \(T\) to be paths \(p \doubleplus [0] \) such that \(T^p = \emp\). Such paths correspond to locally maximal variables of \(\lfloor T \rfloor\).
\end{definition}
We now turn our attention to substitutions from a tree context \(\sigma : \lfloor T \rfloor \to \Gamma\). A substitution can be viewed as a function from the variables of its source context to terms of the target context. Therefore, a substitution \(\sigma : \lfloor T \rfloor \to \Gamma\) acts on variables of \(\lfloor T \rfloor\). However, we have seen that the more natural notion of a variable in a tree context is a path.
This motivates the following definition.
\begin{definition}
A term-labelling \(L : T \to \Gamma\) from a tree \(T\) to a context \(\Gamma\) is a pair containing a function \(\Path_T \to \Term_\Gamma\) and a type of \(\Gamma\). To apply the function component of a labelling to a path \(p\), we write \(L(p)\) or \(L[x_0,x_1,\dots]\) for a path \([x_0,x_1,\dots]\). The type component of the labelling is given by \(\ty(L)\). If \(T = [T_0,\dots,T_n]\), then there are natural projections \( L_i : T_i \to \Gamma\) given by \(L_i(p) = L(i :: p)\) and \(\ty(L_i) = \arr {L[i]} {\ty(L)} {L[i+1]}\) for \(0 \leq i \leq n\).
\end{definition}
For labellings to play the role of substitutions, a substitution \(\lfloor L \rfloor : \arr {\lfloor T \rfloor} {\ty(L)} \Gamma\) will be defined for each term-labelling \(L : T \to \Gamma\). A natural way to define this substitution is by induction on the tree \(T\), which motivates the use of extended substitutions. Suppose we start with a labelling \(L : [T_0,\dots,T_n] \to \Gamma\). To proceed, we will apply the inductive hypothesis to obtain the substitutions:
\[ \lfloor L_i \rfloor : \arr {\lfloor T_i \rfloor} {\arr {L[i]} {\ty(L)} {L[i+1]}} {\Gamma} \]
These substitutions are not regular (non-extended) substitutions, even if \(L\) has associated type \(\star\) and so itself corresponds to a regular substitution.
\begin{definition}
Let \(L : T \to \Gamma\) be a term-labelling. We define the substitution:
\[\lfloor L \rfloor : \arr {\lfloor T \rfloor} {\ty(L)} \Gamma\]
by induction on the tree \(T\) as \(\langle \ty(L), L[0] \rangle\) if \(T = \emp\) and:
\[ \unrestrict \lfloor L_0 \rfloor \vee \unrestrict \lfloor L_1 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor \]
if \(T = [T_0, \dots, T_n]\). Although it looks like the \(0\)-dimensional terms in the labelling are not used to generate the substitution, they appear in the types of the labellings \(L_i\), and so appear in the unrestricted substitutions.
\end{definition}
There are many ways of giving a more syntactic presentation of labellings. Given a tree \(T = [T_0,\dots,T_n]\), a labelling \(L : T \to \Gamma\) can be written as:
\[ t_0\{L_0\}t_1\{L_1\}t_2\cdots t_n\{L_n\}t_{n+1} : \ty(L) \]
where each \(t_i\) is the term \(L[i]\) and the sublabellings \(L_i\) have been recursively put in this syntactic bracketing format (omitting the type). The syntactic presentation contains all the information of the original labelling, which can be recovered by letting \(L[i] = t_i\) for each \(i\) and \(L[i :: p] = L_i(p)\). As an example, take the tree \(T = [[\emp,\emp], \emp]\) from \cref{fig:tree-example}, and let:
\[\Gamma = (x : \star), (f : x \to x), (\alpha : f*f \to f)\]
Then we can define the labelling \(L : T \to \Gamma\) by:
\[ L = x\bigl\{f*f\{\alpha\}f\{\id(f)\}f\bigr\}x\{f\}x : \star \]
which sends the (maximal) paths \([0,0,0]\) to \(\alpha\), \([0,1,0]\) to \(\id(f)\), and \([1,0]\) to \(f\), and has associated substitution:
\[ \lfloor L \rfloor = \langle x,x,f*f,f,\alpha,f,\id(f),x,f \rangle\]
The curly brackets notation for labellings is used instead of a typical round bracket notation to avoid clashes with notations that already use round brackets, such as \(\id(f)\). We finish this section by examining a boundary operation for trees.
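Before doing so, we give a small Haskell sketch of the bracketed presentation of term-labellings just described (again purely illustrative, with our own hypothetical names; the type component \(\ty(L)\) is omitted for brevity):
\begin{verbatim}
-- A labelling over [T0,...,Tn] has points L[0],...,L[n+1] and
-- sublabellings L_0,...,L_n; over the empty tree it is a single point.
-- Invariant: length points == length subs + 1.
data Labelling t = Labelling { points :: [t], subs :: [Labelling t] }

type Path = [Int]

-- Recover the function on paths: L[i] is the i-th point,
-- and L(i :: p) = L_i(p).
applyPath :: Labelling t -> Path -> t
applyPath l [i]     = points l !! i
applyPath l (i : p) = applyPath (subs l !! i) p
applyPath _ []      = error "paths are non-empty"

-- The example labelling x{f*f{alpha}f{id(f)}f}x{f}x from the text:
exampleL :: Labelling String
exampleL = Labelling ["x", "x", "x"]
  [ Labelling ["f*f", "f", "f"]
      [Labelling ["alpha"] [], Labelling ["id(f)"] []]
  , Labelling ["f"] []
  ]
-- applyPath exampleL [0,0,0] == "alpha"
-- applyPath exampleL [1,0]   == "f"
\end{verbatim}
Here the maximal paths \([0,0,0]\), \([0,1,0]\), and \([1,0]\) are sent to the expected terms, matching the labelling \(L\) above.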
We have already seen that for every ps-context \(\Gamma\) and \(n \in \mathbb{N}\), there are the boundary variable sets:
\[\bdry n - \Gamma \qquad \bdry n + \Gamma\]
Since \(\lfloor T \rfloor\) is a ps-context for any tree \(T\), we immediately obtain such boundary variable sets for \(\lfloor T \rfloor\). However, by recalling the definitions for the wedge sum of variable sets given in \cref{sec:wedge-sums} and the suspension of a variable set given in \cref{sec:operation-properties}, a more natural definition can be given.
\begin{definition}
For any tree \(T : \Tree\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\), we define the boundary set:
\[\bdry n \epsilon T\]
by induction on \(n\) and \(T\). If \(n = 0\), then we define:
\[\bdry 0 - T = \FV(\fst(\lfloor T \rfloor)) \qquad \bdry 0 + T = \FV(\snd(\lfloor T \rfloor))\]
Now suppose \(n\) is not \(0\). If the tree \(T\) is the singleton tree, then \(\bdry n \epsilon T = \Var(\lfloor T \rfloor)\). Now suppose that \(T = [T_0,\dots,T_k]\). We then define:
\[ \bdry n \epsilon T = \Sigma(\bdry {n-1} \epsilon {T_0}) \vee \cdots \vee \Sigma(\bdry {n-1} \epsilon {T_k})\]
with the boundary sets \(\bdry {n-1} \epsilon {T_i}\) obtained by inductive hypothesis.
\end{definition}
In the formalisation module \module{Catt.Tree.Support}, we prove that the boundary sets \(\bdry n \epsilon T\), the tree boundary, and \(\bdry n \epsilon {\lfloor T \rfloor}\), the ps-context boundary, coincide. Therefore:
\[ (\lfloor T \rfloor, \bdry n - T, \bdry n + T) \in \Std\]
for each \(n \geq \dep(T) - 1\).

\section{Structured syntax}
\label{sec:structured-terms}

We now introduce a new class of syntax named \emph{structured syntax}. Terms over tree contexts are commonly built using several of the standard constructions we have seen so far, such as paths, labellings, suspensions, and inclusions. By recording which of these constructions was used in the formation of a term, these terms can compute more usefully, which we will exploit to prove more involved lemmas about insertion in \cref{sec:insertion}. Structured syntax will be our variation on the base syntax of \Catt which records these constructions.

The key problem with the base syntax for \Catt is that term-labellings are difficult to compose. We have so far considered term-labellings of the form \(L : T \to \Gamma\), where \(\Gamma\) is an arbitrary context, but there is no reason a labelling couldn't be of the form \(M : S \to \lfloor T \rfloor\) for trees \(S\) and \(T\). We would then hope to be able to compose these labellings to get a labelling of the form:
\[ M \bullet L : S \to \Gamma \]
Such a labelling would need to send a path \(p: \Path_S\) to a term of \(\Gamma\). The only reasonable way forward is to apply \(M\) to \(p\) to get a term of \(\lfloor T \rfloor\), and then apply \(\lfloor L \rfloor\) to this term to get a term of \(\Gamma\). Unfortunately, for an arbitrary term \(t : \Term_{\lfloor T \rfloor}\) and labelling \(L : T \to \Gamma\), the term:
\[ t \sub {\lfloor L \rfloor}\]
does not have nice computational properties. We examine two examples:
\begin{itemize}
\item Suppose \(t\) was of the form \(\lfloor p \rfloor\) for some path \(p\). We would then hope that the syntactic equality:
\[ \lfloor p \rfloor \sub {\lfloor L \rfloor} \equiv L(p)\]
falls out immediately, with the left-hand side reducing to the right-hand side in the formalisation. This is not the case however, and proving that such a syntactic equality holds is non-trivial.
\item Suppose \(t \equiv \Sigma(s)\) and \(L = a\{L_1\}b : A\). Similarly to the above case, we would hope that the syntactic equality:
\[ \Sigma(s) \sub {\lfloor a\{L_1\}b : A \rfloor} \equiv s \sub {\lfloor L_1 \rfloor}\]
holds ``on the nose''. This however is not the case.
\end{itemize}
Structured terms alleviate these problems by recording that such a term \(t\) was generated from a path or generated using suspension. This allows the application of a labelling to a structured term to use this information, for example allowing the two syntactic equalities above to hold by definition. If a labelling is the ``correct'' notion of substitution from a tree, then a structured term is the ``correct'' notion of term in a tree.
\begin{definition}
Let \(\U\) be a member of \(\Ctx \uplus \Tree\), either some context \(\Gamma\) or some tree \(T\). We then define the \emph{structured syntax} classes \(\STerm_\U\) of \emph{structured terms}, \(\SType_\U\) of \emph{structured types}, and \emph{(\(\STerm\)-)labellings} \(S \to \U\) for some tree \(S\). The syntax classes for structured terms and types are generated by the following rules:
\begin{mathpar}
\inferrule{p : \Path_T}{\SPath(p) : \STerm_T}\and
\inferrule{s : \STerm_{T_i}\\ 0 \leq i \leq n}{\Inc_i(s) : \STerm_{[T_0,\dots,T_n]}} \and
\inferrule{S : \Tree\\ A : \SType_S \\ L : S \to \U}{\SCoh S A L : \STerm_\U}\and
\inferrule{t : \Term_\Gamma}{\SOther(t) : \STerm_\Gamma} \\
\inferrule{ }{\star : \SType_\U}\and
\inferrule{s : \STerm_\U \\ A : \SType_\U \\ t: \STerm_\U}{\arr s A t : \SType_\U}
\end{mathpar}
Labellings \(L : S \to \U\) are defined as pairs of a function \(\Path_S \to \STerm_\U\) and a structured type, similarly to term-labellings in \cref{sec:tree-contexts}. We note that the syntax for structured types is shared with the syntax for \Catt types, and we will be careful to make it clear which syntax we are using when necessary.
\end{definition}
Each piece of structured syntax can be converted back into the base syntax of \Catt, using many of the constructions already introduced.
\begin{definition}
Suppose \(\U : \Ctx \uplus \Tree\). Define \(\lfloor \U \rfloor\) to be \(\Gamma\) if \(\U = \Gamma\) for some context \(\Gamma\) or \(\lfloor T \rfloor\) if \(\U = T\) for some tree \(T\). Now, for a structured term \(s : \STerm_\U\), a structured type \(A : \SType_\U\), or a labelling \(L : S \to \U\), we define:
\[ \lfloor s \rfloor : \Term_{\lfloor \U \rfloor} \qquad \lfloor A \rfloor : \Type_{\lfloor \U \rfloor} \qquad \lfloor L \rfloor : \arr {\lfloor S \rfloor} {\lfloor \ty(L) \rfloor} {\lfloor \U \rfloor} \]
by the equations:
\begin{align*}
\lfloor \SPath(p) \rfloor &= \lfloor p \rfloor\\
\lfloor \Inc_i(s) \rfloor &= \Sigma(\lfloor s \rfloor) \sub {\inc_{\lfloor T_i \rfloor}}&\text{if }\Inc_i(s) : \STerm_{[T_0,\dots,T_n]}\\
\lfloor \SCoh S A L \rfloor &= \Coh {\lfloor S \rfloor} {\lfloor A \rfloor} {\id_{\lfloor S \rfloor}} \sub {\lfloor L \rfloor}\\
\lfloor \SOther(t) \rfloor &= t\\[10pt]
\lfloor \star \rfloor &= \star\\
\lfloor \arr s A t \rfloor &= \arr {\lfloor s \rfloor} {\lfloor A \rfloor} {\lfloor t \rfloor}
\end{align*}
and by defining \(\lfloor L \rfloor\) similarly to term-labellings except \(\lfloor L \rfloor = \langle \lfloor \ty(L) \rfloor, \lfloor L[0] \rfloor \rangle\) for labellings \(L : {\emp} \to {\U}\) from the singleton tree. We refer to \(\lfloor a \rfloor\), \(\lfloor A \rfloor\), and \(\lfloor L \rfloor\) as the term, type, or substitution generated by \(a\), \(A\), or \(L\).
\end{definition}
For any tree \(T\), there is an \emph{identity labelling} \(\id_T\) given by:
\[ \id_T(p) = \SPath(p) \qquad \ty(\id_T) = \star\]
The function \func{Catt.Tree.Structured.Properties}{id-label-to-sub} in the formalisation (see \module{Catt.Tree.Structured.Properties}) shows that:
\[\lfloor \id_T \rfloor = \id_{\lfloor T \rfloor}\]
The main motivation for introducing structured syntax was to be able to define a composition of labellings, which we do now by defining the application of a labelling to a structured term, structured type, or another labelling.
\begin{definition}
Let \(L : T \to \U\) be a labelling (with \(\U : \Ctx \uplus \Tree\)). We define the application of \(L\) to a structured term \(s : \STerm_{T}\), a structured type \(A : \SType_T\), and a labelling \(M : S \to T\) to give:
\[ s \sub L : \STerm_\U \qquad A \sub L : \SType_\U \qquad M \bullet L : S \to \U\]
These definitions are given by mutual recursion:
\begin{align*}
\SPath(p) \sub L &= L(p)\\
\Inc_i(s) \sub L &= s \sub {L_i}\\
\SCoh S A M \sub L &= \SCoh S A {M \bullet L}\\
\SOther(t) \sub L &= \SOther(t \sub {\lfloor L \rfloor})\\[10pt]
\star \sub L &= \ty(L)\\
(\arr s A t) \sub L &= \arr {s \sub L} {A \sub L} {t \sub L}\\[10pt]
(M \bullet L)(p) &= M(p) \sub L\\
\ty(M \bullet L) &= \ty(M) \sub L
\end{align*}
It can easily be seen that these definitions satisfy the computational properties given at the start of the section.
\end{definition}
The main theorem of this section is that the application of a labelling to a structured term is compatible with the map from structured syntax to \Catt syntax.
\begin{theorem}
\label{thm:structured-main}
For any labelling \(L : T \to \U\) and structured term \(s : \STerm_T\), structured type \(A : \SType_T\), or labelling \(M : S \to T\), we have:
\[ \lfloor s \sub L \rfloor \equiv \lfloor s \rfloor \sub {\lfloor L \rfloor} \qquad \lfloor A \sub L \rfloor \equiv \lfloor A \rfloor \sub {\lfloor L \rfloor} \qquad \lfloor M \bullet L \rfloor \equiv \lfloor M \rfloor \bullet \lfloor L \rfloor\]
\end{theorem}
\begin{proof}
We proceed by proving all statements by mutual induction. Suppose \(s : \STerm_T\) is a structured term. We split on the form of \(s\):
\begin{itemize}
\item Suppose \(s\) is of the form \(\SCoh S A M\). Then \(s \sub L\) is \(\SCoh S A {M \bullet L}\) and so the required statement follows from the inductive hypothesis for labellings.
\item Suppose \(s\) is of the form \(\SOther(t)\). Then \(\lfloor s \sub L \rfloor \equiv \lfloor \SOther (t \sub {\lfloor L \rfloor}) \rfloor \equiv t \sub {\lfloor L \rfloor} \equiv \lfloor s \rfloor \sub {\lfloor L \rfloor}\).
\item Suppose \(T = [T_0,\dots, T_n]\) and \(s\) is of the form \(\Inc_i(t)\). Then:
\begin{align*}
\lfloor \Inc_i(t) \rfloor \sub {\lfloor L \rfloor} &\equiv \Sigma(\lfloor t \rfloor)\sub {\inc_{\lfloor T_i \rfloor}} \sub {\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor}\\
&\equiv \Sigma(\lfloor t \rfloor)\sub {\inc_{\lfloor T_i \rfloor} \bullet (\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor)}\\
&\equiv \Sigma(\lfloor t \rfloor) \sub {\unrestrict \lfloor L_i \rfloor} &\text{by \cref{lem:wedge-sum-prop}}\\
&\equiv \lfloor t \rfloor \sub {\lfloor L_i \rfloor}\\
&\equiv \lfloor t \sub {L_i} \rfloor &\text{by inductive hypothesis}\\
&\equiv \lfloor \Inc_i(t) \sub L \rfloor
\end{align*}
\item Suppose \(s\) is of the form \(\SPath(p)\). If \(\lfloor p \rfloor\) is not a \(0\)-dimensional variable, then an argument similar to the preceding case can be made.
If instead \(p\) is of the form \([k]\) and \(T = [T_0,\dots,T_n]\), first suppose that \(k < n + 1\), so that \(\lfloor [k] \rfloor \equiv \fst(\lfloor T_k \rfloor) \sub {\inc_{\lfloor T_k \rfloor}}\). Then:
\begin{align*}
\lfloor [k] \rfloor \sub {\lfloor L \rfloor} &\equiv \fst(\lfloor T_k \rfloor) \sub {\inc_{\lfloor T_k \rfloor}} \sub {\unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor}\\
&\equiv \fst(\lfloor T_k \rfloor) \sub {\unrestrict \lfloor L_k \rfloor}\\
&= \lfloor L[k] \rfloor
\end{align*}
where the last equality follows from the labelling \(L_k\) having type component \(\lfloor \ty(L_k) \rfloor \equiv \arr {\lfloor L[k] \rfloor} {\lfloor \ty(L) \rfloor} {\lfloor L[k+1] \rfloor}\). The case where \(k = n+1\) is similar, using \(\snd(\lfloor T_n \rfloor)\) instead of \(\fst(\lfloor T_k \rfloor)\) (as there is no tree \(T_k\) in this case).
\end{itemize}
The case for structured types follows by a simple induction using the case for terms. We now consider the case for a labelling \(M : S \to T\). Suppose \(S = [S_0,\dots,S_n]\). Then:
\begin{align*}
\lfloor M \rfloor \bullet \lfloor L \rfloor &\equiv \left( \bigvee_i \unrestrict \lfloor M_i \rfloor \right) \bullet \lfloor L \rfloor\\
&\equiv \bigvee_i \left(\unrestrict \lfloor M_i \rfloor \bullet \lfloor L \rfloor\right)&\text{by \cref{lem:wedge-sum-prop}}\\
&\equiv \bigvee_i \unrestrict \left(\lfloor M_i \rfloor \bullet \lfloor L \rfloor\right)\\
&\equiv \bigvee_i \unrestrict \lfloor M_i \bullet L \rfloor&\text{by inductive hypothesis}\\
&\equiv \lfloor M \bullet L \rfloor
\end{align*}
with the last line following from \((M \bullet L)_i\) and \(M_i \bullet L\) being the same labelling. This concludes all cases.
\end{proof}
Structured syntax is only used as a computational aid for reasoning about the base syntax of \Catt, and therefore the desired notion of ``syntactic'' equality of structured syntax is syntactic equality of the underlying \Catt terms; that is, we say \(s \equiv t\) for structured terms \(s\) and \(t\) exactly when \(\lfloor s \rfloor \equiv \lfloor t \rfloor\). On labellings \(L, M : T \to \U\) we can instead provide the equality:
\[ L \equiv M \iff \ty(L) \equiv \ty(M) \land \forall (p : \Path_T).\ L(p) \equiv M(p)\]
and by observing the proof of \cref{thm:structured-main}, we see that this equality implies equality of the generated substitutions. It is therefore possible to derive many properties for this equality of structured terms simply by reducing all constructions used to the corresponding \Catt constructions, and using the corresponding result for the syntax of \Catt.
\begin{proposition}
Composition of labellings is associative and has a left and right unit given by the identity labelling.
\end{proposition}
\begin{proof}
Follows immediately from \cref{thm:structured-main}, the identity labelling generating the identity substitution, and the corresponding results for \Catt.
\end{proof}
Using this technique, every syntactic result about \Catt can be transported to structured syntax. Further, it is easy to prove that the equality relation is preserved by each constructor; for example, if \(L \equiv M\) and \(A \equiv B\), then \(\SCoh S A L \equiv \SCoh S B M\). To extend this, we redefine some constructions we have seen for \Catt in the previous sections, this time for structured terms.
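Before doing so, we note that the mutual recursion defining application can itself be rendered as a small Haskell sketch (illustrative only, with our own hypothetical names; the \(\SOther\) case is omitted, since it requires computing the generated substitution \(\lfloor L \rfloor\)), making visible how the \(\SPath\) and \(\Inc_i\) cases compute by definition:
\begin{verbatim}
type Path = [Int]
data Label = Label { pts :: [STerm], sub :: [Label] }

data STerm
  = SPath Path        -- a path of the source tree, used as a variable
  | Inc Int STerm     -- Inc_i(s): inclusion of a term of the i-th subtree
  | SCoh String Label -- SCoh S A M, with the tree S and type A elided

-- s[L]: apply a labelling to a structured term.
apply :: STerm -> Label -> STerm
apply (SPath p)  l = lookupL l p           -- SPath(p)[L] = L(p)
apply (Inc i s)  l = apply s (sub l !! i)  -- Inc_i(s)[L] = s[L_i]
apply (SCoh c m) l = SCoh c (compose m l)  -- SCoh S A M [L] = SCoh S A (M ; L)

-- (M ; L)(p) = M(p)[L], applied pointwise over M.
compose :: Label -> Label -> Label
compose m l = Label (map (`apply` l) (pts m)) (map (`compose` l) (sub m))

-- L(p): path lookup, as for term-labellings.
lookupL :: Label -> Path -> STerm
lookupL l [i]     = pts l !! i
lookupL l (i : p) = lookupL (sub l !! i) p
lookupL _ []      = error "paths are non-empty"
\end{verbatim}
In particular, \texttt{apply (SPath p) l} returns the looked-up term immediately, so the first motivating equality of this section holds by definition rather than by a separate proof.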
\begin{definition}
We define the suspension of a structured term \(a : \STerm_\U\) and of a structured type \(A : \SType_\U\), and the restricted suspension of a labelling \(L : T \to \U\), giving a structured term \(\Sigma(a) : \STerm_{\Sigma(\U)}\), a structured type \(\Sigma(A) : \SType_{\Sigma(\U)}\), and a labelling \(\Sigma'(L) : T \to {\Sigma(\U)}\). These are all defined by mutual induction as follows:
\begin{align*}
\Sigma(a) &= \Inc_0(a) &\text{if \(\U\) is a tree}\\
\Sigma(\SCoh S A M) &= \SCoh S A {\Sigma'(M)}&\text{if \(\U\) is a context}\\
\Sigma(\SOther(t))&= \SOther(\Sigma(t))\\[10pt]
\Sigma(\star) &= \arr N \star S&\text{if \(\U\) is a context}\\
\Sigma(\star) &= \arr {\SPath[0]} \star {\SPath[1]}&\text{otherwise}\\
\Sigma(\arr s A t) &= \arr {\Sigma(s)} {\Sigma(A)} {\Sigma(t)}\\[10pt]
\Sigma'(L)(p) &= \Sigma(L(p))\\
\ty(\Sigma'(L)) &= \Sigma(\ty(L))
\end{align*}
We further define an unrestriction operation that takes a labelling of the form \(M : T \to \U\) with \(\ty(M) \equiv \arr s A t\) and produces a labelling \(\unrestrict M : \Sigma(T) \to \U\) given by:
\[\unrestrict M = s\{M\}t : A\]
This can be used to define the full suspension of a labelling as with \Catt substitutions, by defining \(\Sigma(L)\) to be \(\unrestrict \Sigma'(L)\).
\end{definition}
A simple case analysis demonstrates that these constructions commute with \(\lfloor \_ \rfloor\). They therefore inherit the properties of the suspension on \Catt terms, types, and substitutions. We lastly recover wedge sums for structured syntax.
\begin{definition}
We have seen that the wedge sum of trees \(S\) and \(T\) is given by \(S \doubleplus T\). Letting \(S = [S_0,\dots,S_m]\) and \(T = [T_0,\dots,T_n]\), we further define inclusion labellings:
\[ \inc_S : S \to {S \doubleplus T} \qquad \inc_T : T \to {S \doubleplus T}\]
by the equations:
\begin{align*}
\inc_S([k])&\equiv \SPath[k] & \inc_S(k :: p) &\equiv \SPath(k :: p) & \ty(\inc_S) &\equiv \star\\
\inc_T([k])&\equiv \SPath[m + 1 + k] & \inc_T(k :: p) &\equiv \SPath (m + 1 + k :: p) & \ty(\inc_T) &\equiv \star
\end{align*}
and finally, we suppose \(L : S \to \U\) and \(M : T \to \U\) are labellings of the form:
\[ L \equiv s_0\{L_0\}s_1\cdots s_m\{L_m\}t_0 : A \qquad M \equiv t_0\{M_0\}t_1\cdots t_n\{M_n\}t_{n+1} : A \]
and define their concatenation to be the labelling:
\[ L\doubleplus M \equiv s_0\{L_0\}s_1\cdots s_m\{L_m\}t_0\{M_0\}t_1\cdots t_n\{M_n\}t_{n+1} : A \]
where \(L \doubleplus M : {S \doubleplus T} \to \U\).
\end{definition}
Many properties of these constructions, among others, are given in the formalisation module \module{Catt.Tree.Structured.Construct.Properties}. In particular, the diagrammatic notation for substitutions between wedge sums can be repurposed to define labellings, which will be used to define certain labellings in \cref{sec:insertion}.

It will be useful to be able to interpret all \Catt syntax as structured syntax. For terms such a mapping is trivially given by the \(\SOther\) constructor. For a type \(A\), a structured type \(\lceil A \rceil\) can be formed by a simple induction, applying the \(\SOther\) constructor to each term in the type. For substitutions, we give the following definition.
\begin{definition}
Let \(\sigma : \lfloor S \rfloor \to_A \Gamma\) be a substitution. We then define the labelling:
\[ \lceil \sigma \rceil : S \to \Gamma \]
by \(\lceil \sigma \rceil(p) = \SOther(\lfloor p \rfloor \sub \sigma)\) and \(\ty(\lceil \sigma \rceil) = \lceil A \rceil\).
\end{definition}
This construction is an inverse to generating a substitution from a labelling.
\begin{proposition}
Let \(\sigma : \lfloor S \rfloor \to_A \Gamma\) be a substitution. Then \(\lfloor \lceil \sigma \rceil \rfloor \equiv \sigma\). Further, for any labelling \(L : S \to \Gamma\), \(\lceil \lfloor L \rfloor \rceil \equiv L\).
\end{proposition}
\begin{proof}
We note that every variable of \(\lfloor S \rfloor\) is given by \(\lfloor p \rfloor\) for some path \(p\). We then have the equality:
\begin{equation*}
\lfloor p \rfloor \sub {\lfloor \lceil \sigma \rceil \rfloor} \equiv \lfloor p \sub {\lceil \sigma \rceil} \rfloor \equiv \lfloor \SOther(\lfloor p \rfloor \sub \sigma) \rfloor \equiv \lfloor p \rfloor \sub \sigma
\end{equation*}
and so \(\sigma\) and \(\lfloor \lceil \sigma \rceil \rfloor\) have the same action on each variable, and are therefore equal.

Now let \(L : S \to \Gamma\) be a labelling. Then for any path \(p\):
\[ \lceil \lfloor L \rfloor \rceil(p) \equiv \SOther(\lfloor p \rfloor \sub {\lfloor L \rfloor}) \equiv \SOther(\lfloor L(p) \rfloor) \]
and so \(\lfloor \lceil \lfloor L \rfloor \rceil(p) \rfloor \equiv \lfloor L(p) \rfloor\). Therefore, \(L \equiv \lceil \lfloor L \rfloor \rceil\) by definition.
\end{proof}

\subsection{Typing and equality}
\label{sec:typing-struct-terms}

Similarly to the definition of syntactic equality for structured syntax, we also want the equality rules for structured terms and structured types to be inherited from the equality relations on their generated terms, and so define:
\[ \U \vdash s = t \iff \lfloor \U \rfloor \vdash \lfloor s \rfloor = \lfloor t \rfloor \qquad \U \vdash A = B \iff \lfloor \U \rfloor \vdash \lfloor A \rfloor = \lfloor B \rfloor\]
For labellings, (definitional) equality can be defined similarly to the syntactic equality relation:
\[ \U \vdash L = M \iff \U \vdash \ty(L) = \ty(M) \land \forall (p : \Path_T).\ \U \vdash L(p) = M(p)\]
Using \cref{lem:wedge-typing}, it can be proven by a simple induction that equality of labellings (along with equality of their associated types) induces equality of the generated substitutions.

We also want the typing rules for \(s : \STerm_\U\) and \(A : \SType_\U\) to be inherited from the typing rules for \(\lfloor s \rfloor\) and \(\lfloor A \rfloor\). We re-use the notation for each typing judgement. For labellings, we introduce the following more natural typing judgement:
\begin{definition}
For a labelling \(L : T \to \U\), where \(\U : \Ctx \uplus \Tree\), we define the judgement:
\[ \U \vdash L : T \]
to mean that the labelling \(L\) is well-formed. This judgement is generated by the following rule:
\begin{mathpar}
\inferrule{\U \vdash L[0] : \ty(L)\quad \cdots\quad \U\vdash L[n+1] : \ty(L)\\\U\vdash L_0 : T_0\quad \cdots\quad\U\vdash L_n : T_n}{\U \vdash L : [T_0,\dots,T_n]}
\end{mathpar}
\end{definition}
Paths \(p\) can be equipped with a canonical structured type, \(\ty(p)\), as follows:
\begin{itemize}
\item For paths \([k]\), \(\ty([k]) = \star\),
\item For paths \(k :: p\) where \(p\) is a path, the type \(\ty(k :: p)\) is obtained by taking the type \(\ty(p)\), applying \(\Inc_k\) to each term, and replacing the \(\star\) type at its base by the type \(\arr {\SPath[k]} {\star} {\SPath[k+1]}\).
\end{itemize}
This can be used to prove that the identity labelling is well-formed.
\begin{proposition}
Let \(S\) be a tree. Then \(S \vdash \id_S : S\).
\end{proposition}
\begin{proof}
Let \(x\) be a list that indexes a subtree of \(S\), and define the labelling \(\mathsf{subtree}(x) : S^x \to S\) by \(\ty(\mathsf{subtree}(x)) = \ty(x \doubleplus [0])\) and \(\mathsf{subtree}(x)(p) = \SPath(x \doubleplus p)\). We then prove the more general result that \(S \vdash \mathsf{subtree}(x) : S^x\) for each \(x\), with the desired result following from the case \(x = \emp\).

If \(S^x = \emp\), then the typing judgement follows from \(S \vdash \mathsf{subtree}(x)[0] : \ty(x \doubleplus [0])\). If \(S^x = [T_0, \dots, T_n]\) then we must show that \(S \vdash \mathsf{subtree}(x)[k] : \ty(x \doubleplus [0])\), which follows from the observation that \(\ty(x \doubleplus [0]) \equiv \ty(x \doubleplus [k])\) for any \(k\), as the definition of the canonical type does not use the last element of the path. We are also required to show that \(S \vdash \mathsf{subtree}(x)_i : T_i\), but \(T_i \equiv S^{x \doubleplus [i]}\) and \(\mathsf{subtree}(x)_i \equiv \mathsf{subtree}(x \doubleplus [i])\), and so this follows from the inductive hypothesis.
\end{proof}
From this typing judgement for labellings, one can obtain a derivation of the typing judgement for the generated substitution.
\begin{proposition}
Let \(L : T \to \U\), and suppose \(\U \vdash L : T\) and \(\U \vdash \ty(L)\). Then:
\[ \lfloor \U \rfloor \vdash \lfloor L \rfloor : \lfloor T \rfloor\]
\end{proposition}
\begin{proof}
We induct on the tree \(T\), splitting into cases on whether it is empty. If it is, then by case analysis on the judgement for label typing we get:
\[ \U \vdash L[0] : \ty(L) \]
Then, writing \(A\) for \(\ty(L)\), we have \(\lfloor L \rfloor \equiv \langle \lfloor A \rfloor, \lfloor L[0] \rfloor \rangle\), and so the following derivation can be obtained:
\[
\begin{prooftree}
\infer0{\U \vdash A}
\infer1{\lfloor \U \rfloor \vdash \lfloor A \rfloor}
\infer1{\lfloor \U \rfloor \vdash \langle \lfloor A \rfloor \rangle : \emptyset}
\infer0{\U \vdash L[0] : A}
\infer1{\lfloor \U \rfloor \vdash \lfloor L[0] \rfloor : \lfloor A \rfloor}
\infer2{\lfloor \U \rfloor \vdash \langle \lfloor A \rfloor, \lfloor L[0] \rfloor \rangle : \lfloor \emp \rfloor}
\end{prooftree}
\]
Suppose instead that \(T = [T_0,\dots,T_n]\), so that:
\[ \lfloor L \rfloor \equiv \unrestrict \lfloor L_0 \rfloor \vee \cdots \vee \unrestrict \lfloor L_n \rfloor\]
From \(\U \vdash L : T\), we obtain \(\U \vdash L_i : T_i\) for each \(i \in \{0,\dots,n\}\). We further obtain \(\U \vdash L[k] : \ty(L)\) for \(0 \leq k \leq n+1\), and so:
\[\ty(L_i) \equiv \arr {L[i]} {\ty(L)} {L[i+1]}\]
is well-formed. By the inductive hypothesis we then have \(\lfloor \U \rfloor \vdash \lfloor L_i \rfloor : \lfloor T_i \rfloor\). We have for each \(i\) that \(\lfloor \ty(L_i) \rfloor\) is not the type \(\star\) and so the unrestriction \(\unrestrict \lfloor L_i \rfloor\) is well-formed. Furthermore, by construction of the unrestriction we have:
\[ \fst(\lfloor T_i \rfloor) \sub {\lfloor L_i \rfloor} \equiv \lfloor L[i] \rfloor \qquad \snd(\lfloor T_i \rfloor) \sub {\lfloor L_i \rfloor} \equiv \lfloor L[i+ 1] \rfloor\]
and so by \cref{lem:wedge-typing}, the wedge sums are well-formed, completing the proof.
\end{proof}
It can be shown that the reverse implication also holds: if \(\lfloor \U \rfloor \vdash \lfloor L \rfloor : \lfloor T \rfloor\) then \(\U \vdash L : T\). This follows as a corollary from the following proposition.
\begin{proposition}
Let \(\sigma : \arr {\lfloor T \rfloor} A \Gamma\) be a substitution with \(\Gamma \vdash \sigma : \lfloor T \rfloor\).
Then for any \(L : S \to T\) we have: \[T \vdash L : S \implies \Gamma \vdash L \bullet \lceil \sigma \rceil : S\] and hence \(\Gamma \vdash \lceil \sigma \rceil : T\) follows from letting \(L\) be the identity labelling. \end{proposition} \begin{proof} Let \(S = [S_0, \dots, S_n]\) (where we allow this list to be empty). By the definition of the typing for a labelling, it suffices to show, for each \(0 \leq i \leq n\) and \(0 \leq k \leq n + 1\), that: \[ \Gamma \vdash L[k] \sub {\lceil \sigma \rceil} : \ty(L) \sub {\lceil \sigma \rceil} \qquad \Gamma \vdash (L \bullet \lceil \sigma \rceil)_i : S_i\] The second typing judgement follows directly from the inductive hypothesis, as \((L \bullet \lceil \sigma \rceil)_i \equiv L_i \bullet \lceil \sigma \rceil\). By definition of typing for structured terms, the first judgement requires us to prove that: \[ \Gamma \vdash \lfloor L[k] \sub {\lceil \sigma \rceil} \rfloor : \lfloor \ty(L) \sub {\lceil \sigma \rceil} \rfloor\] which is equivalent to: \[ \Gamma \vdash \lfloor L[k] \rfloor \sub \sigma : \lfloor \ty(L) \rfloor \sub \sigma\] and so follows from typing being preserved by substitution. \end{proof} By these results, many of the properties enjoyed by the typing judgements in \Cattr with a tame rule set \(\mathcal{R}\) also apply to the typing judgements for structured terms. The module \module{Catt.Tree.Structured.Typing.Properties} also introduces many functions for constructing the typing judgements for structured syntax. One such function is \func{Catt.Tree.Structured.Typing.Properties}{TySCoh}, which represents the admissibility of the following rule: \begin{equation} \label[rule]{rule:scoh} \inferrule{S \vdash \arr s A t \\ \U \vdash L : S \\ \U \vdash \ty(L) \\ (\lfloor S \rfloor, \Supp(s), \Supp(t)) \in \mathcal{O}}{\U \vdash \SCoh S {\arr s A t} L} \end{equation} In keeping with the theme of this section, one could define \(\Supp(s)\) as \(\Supp(\lfloor s \rfloor)\) for a structured term \(s : \STerm_\U\). However, we choose not to do this, instead giving a definition of support for structured syntax that leverages the extra information available in the syntax. \begin{definition} For a path \(p : \Path_T\), a structured term \(s : \STerm_\U\), a structured type \(A : \SType_\U\), and a labelling \(L : S \to \U\), we define their supports \(\Supp(p)\), \(\Supp(s)\), \(\Supp(A)\), and \(\Supp(L)\) by mutual recursion: \begin{align*} \Supp([n]) &= \{\lfloor [n] \rfloor\}\\ \Supp(k :: p) &= \Sigma(\Supp(p)) \sub {\inc_{T_k}} &\text{where }T = [T_0,\dots,T_n]\\[10pt] \Supp(\SPath(p)) &= \Supp(p)\\ \Supp(\Inc_i(s)) &= \Sigma(\Supp(s)) \sub {\inc_{T_i}}&\text{where }T = [T_0,\dots,T_n]\\ \Supp(\SCoh S A L) &= \Supp(L) \cup \Supp(\ty(L))\\ \Supp(\SOther(t)) &= \Supp(t)\\[10pt] \Supp(\star) &= \emptyset\\ \Supp(\arr s A t) &= \Supp(s) \cup \Supp(A) \cup \Supp(t)\\ \Supp(L) &= \bigcup_{i=0}^{n+1} \Supp(L[i]) \cup \bigcup_{i=0}^n\Supp(L_i) \end{align*} \end{definition} We note that each of these support definitions is naturally downwards closed, and there is no need to apply a downwards closure operator as was necessary for the support of \Catt syntax. By some routine calculations given in the formalisation module \module{Catt.Tree.Structured.Support}, these support definitions are equivalent to taking the support of the generated piece of syntax.
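As a basic sanity check of these definitions (our own computation, not drawn from the formalisation), consider the unique maximal path \([0,0]\) of the tree \(D^1 = [\emp]\). Unfolding the recursion gives: \[ \Supp([0,0]) = \Sigma(\Supp([0])) \sub {\inc_{\emp}} = \Sigma(\{\lfloor [0] \rfloor\}) \sub {\inc_{\emp}} \] and, recalling that the suspension of a variable set contains both suspension points, this is the whole of \(\Var(\lfloor D^1 \rfloor)\), agreeing with the downwards-closed support of the generated variable \(\lfloor [0,0] \rfloor\).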
More precisely, the following equations hold for any path \(p\), structured term \(s\), structured type \(A\), and labelling \(L\): \begin{mathpar} \Supp(p) = \Supp(\lfloor p \rfloor) \and \Supp(s) = \Supp(\lfloor s \rfloor) \and \Supp(A) = \Supp(\lfloor A \rfloor) \and \Supp(L) \cup \Supp(\ty(L)) = \Supp(\lfloor L \rfloor) \end{mathpar} By using this notion of support, we are able to avoid a large amount of ``boilerplate'' proof. The above definition of support more closely resembles the format of structured terms, and without this definition, most proofs concerning the support of a structured term would begin by simplifying a variable set similar to \(\Supp(\lfloor s \rfloor)\) to one more similar to \(\Supp(s)\). Here, we instead give this equivalence proof once. We end this section by giving alternative equality relations for labellings, which encapsulate the idea that a substitution is fully determined by where it sends locally maximal variables. These equalities are defined as follows for labellings \(L : T \to \U\) and \(M : T \to \U\): \begin{align*} L \equiv^{\max} M &\iff \forall (p : \MaxPath_T).\ L(p) \equiv M(p)\\ \U \vdash L =^{\max} M &\iff \forall (p : \MaxPath_T).\ \U \vdash L(p) = M(p) \end{align*} so that two labellings are related exactly when their actions on maximal paths agree. The following theorem gives conditions for when the standard equality relation can be recovered from these. \begin{theorem} \label{thm:label-max-equality} Let \(L : S \to \U\) and \(M : S \to \U\) be labellings. Then the following rules are admissible: \begin{mathpar} \inferrule{\U \vdash L : S\\ \U \vdash M : S \\ L \equiv^{\max} M}{\U \vdash L = M}\and \inferrule{\U \vdash L : S\\ \U \vdash M : S \\ L \equiv^{\max} M}{\U \vdash \ty(L) = \ty(M)} \end{mathpar} If the equality rule set \(\mathcal{R}\) satisfies the preservation and support conditions, then the rules above are still admissible with \(\U \vdash L =^{\max} M\) replacing the syntactic equalities. \end{theorem} \begin{proof} We prove the results for the syntactic equality, with the results for the definitional equality following similarly, but using the preservation property instead of uniqueness of typing. We proceed by induction on the tree \(S\), proving the admissibility of both rules simultaneously. First suppose that \(S = \emp\). Then the path \([0] : \Path_{\emp}\) is maximal and so \(\U \vdash L = M\) follows by the reflexivity of equality. The second rule follows from the uniqueness of typing, as we get \(\U \vdash L[0] : \ty(L)\) and \(\U \vdash M[0] : \ty(M)\) from the premises. Now suppose that \(S = [S_0,\dots,S_n]\). By the inductive hypothesis, the following judgements hold for each \(i \in \{0,\dots,n\}\): \[ \U \vdash L_i = M_i \qquad \U \vdash \arr {L[i]} {\ty(L)} {L[i+1]} = \arr {M[i]} {\ty(M)} {M[i+1]}\] From the equalities on types, we immediately get that \(\U \vdash \ty(L) = \ty(M)\) as is required for the admissibility of the second rule, and also get that \(\U \vdash L[i] = M[i]\) for each \(0 \leq i \leq n+1\), which along with the equalities on sub-labellings above is sufficient to prove that: \[ \U \vdash L = M\] which witnesses the admissibility of the first rule. \end{proof} \subsection{Standard coherences} \label{sec:standard-coherences} In \cref{sec:background}, we gave a preliminary definition of standard coherences, the canonical coherences over a given pasting diagram.
This definition relies on inclusion substitutions from the boundary of a pasting diagram onto its source and target variables, whose definition for ps-contexts can be unpleasant to work with. In contrast, the \(n\)-boundary of a tree and its associated source and target inclusions have a natural definition by induction on the tree, where the source and target inclusions are given by labellings. We give this definition below. \begin{definition} Given dimension \(n \in \mathbb{N}\) and \(T : \Tree\), we define the \emph{\(n\)-boundary} of the tree \(\bound n T : \Tree\) by induction on \(n\) and \(T\): \begin{equation*} \bound 0 T = \emp \qquad \bound {n + 1} {[T_0, \dots, T_m]} = [\bound n {T_0}, \dots , \bound n {T_m}] \end{equation*} We further define path-to-path functions \(\incbdpath n \epsilon T : \bound n T \to T\) for \(\epsilon \in \{-,+\}\) by induction: \begin{align*} \incbdpath 0 - T ([0]) &= [0]\\ \incbdpath 0 + {[T_0, \dots, T_m]} ([0]) &= [m+1]\\ \incbdpath {n+1} \epsilon {[T_0, \dots, T_m]} ([k]) &= [k]\\ \incbdpath {n+1} \epsilon {[T_0,\dots, T_m]} (k :: p) &= k :: \incbdpath n \epsilon {T_k} (p) \end{align*} and then can define the \emph{source inclusion labelling} \(\incbd n - T : {\bound n T} \to T\) and \emph{target inclusion labelling} \(\incbd n + T : {\bound n T} \to T\) by: \[\incbd n \epsilon T(p) = \SPath(\incbdpath n \epsilon T(p)) \qquad \ty(\incbd n \epsilon T) = \star\] for each \(n\) and \(\epsilon \in \{-,+\}\). \end{definition} In the module \module{Catt.Tree.Boundary.Typing}, it is proven that: \[ T \vdash \incbd n \epsilon T : \bound n T\] for all trees \(T\), \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\). In \cref{sec:background}, the source and target variable sets were defined to be the support of the source and target inclusions. This can now be justified by the following lemma. \begin{lemma} For a dimension \(n \in \mathbb{N}\), \(T : \Tree\), and \(\epsilon \in \{-,+\}\) we have: \[ \Supp(\incbd n \epsilon T) = \bdry n \epsilon T \] \end{lemma} \begin{proof} The proof is given by the function \func{Catt.Tree.Boundary.Support}{tree-inc-label-supp} in the formalisation module \module{Catt.Tree.Boundary.Support} and proceeds by induction on \(n\) and \(T\). \end{proof} This definition also allows simple inductive proofs that the boundary inclusions satisfy the globularity conditions, which we state in the following proposition. These proofs are given in the formalisation module \module{Catt.Tree.Boundary.Properties}. \begin{proposition} \label{prop:bdry-glob} Let \(n \leq m\) and let \(T\) be a tree. Then: \[ \bound n {\bound m T} \equiv \bound n T\] Further, for \(\epsilon, \omega \in \{-,+\}\) we have: \[ \incbd n \epsilon {\bound m T} \bullet \incbd m \omega T \equiv \incbd n \epsilon T \] If instead \(n \geq \dep(T)\), then \(\bound n T \equiv T\) and \(\incbd n \epsilon T \equiv \id_T\). \end{proposition} Further, these constructions commute with suspension: the equalities \(\Sigma(\bound n T) \equiv \bound {n+1} {\Sigma(T)}\) and \(\Sigma(\incbd n \epsilon T) \equiv \incbd {n+1} \epsilon {\Sigma(T)}\) hold by definition. We now recall the definitions of standard type, standard coherence, and standard term for a tree \(T\), which are given by mutual induction: \begin{itemize} \item The \emph{standard type}, \(\stdty T n\), is an \(n\)-dimensional type where each component of the type is given by the standard term over the appropriate boundary of the tree \(T\), and then included back into \(T\) by applying the inclusion labelling.
\item The \emph{standard coherence}, \(\stdcoh T n\), is the canonical dimension \(n\) coherence term over a tree \(T\). It is formed by a single coherence constructor over \(T\) with type given by the standard type, \(\stdty T n\). \item The \emph{standard term}, \(\stdtm T n\), is a variation on the standard coherence which does not introduce unnecessary unary composites. If \(T\) is linear (and so represents a disc context), and \(n = \dep(T)\), then \(\stdtm T n\) is simply given by the unique maximal path in \(T\). Otherwise, it is given by the standard coherence \(\stdcoh T n\). \end{itemize} At the end of \cref{sec:background} it was stated that \(\Sigma(\stdtm T n) \equiv \stdtm {\Sigma(T)} {n + 1}\). Using this, the standard term can instead be defined by letting \(\stdtm \emp 0\) be \(\SPath([0])\), \(\stdtm {\Sigma(T)} {n+1}\) be \(\Sigma(\stdtm T n)\), and \(\stdtm T n\) be \(\stdcoh T n\) otherwise, which avoids the case split on the linearity of \(T\). We now define all three constructions formally using structured syntax. \begin{definition} We define the \(n\)-dimensional \emph{standard type} over a tree \(T\) as a structured type \(\stdty T n : \SType_T\), and the \(n\)-dimensional \emph{standard coherence} and \emph{standard term} over a tree \(T\) as structured terms \(\stdcoh T n, \stdtm T n : \STerm_T\) by mutual induction: \begin{align*} \stdty T 0 &= \star\\ \stdty T {n + 1} &= \arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}\\[10pt] \stdcoh T n &= \SCoh T {\stdty T n} {\id_T}\\[10pt] \stdtm T n &= \begin{cases*} \SPath([0])&if \(T = \emp\) and \(n = 0\)\\ \Inc_0(\stdtm {T_0} {n-1})&if \(n \neq 0\) and \(T = [T_0]\)\\ \stdcoh T n&\text{otherwise} \end{cases*} \end{align*} When \(n = \dep(T)\), we call the standard coherence \(\stdcoh T n\) the \emph{standard composite} of \(T\). \end{definition} We can immediately show that these standard constructions commute with suspension. \begin{lemma} \label{lem:std-susp} For tree \(T\) and \(n \in \mathbb{N}\), \(\Sigma(\stdty T n) \equiv \stdty {\Sigma(T)} {n+1}\) and \(\Sigma(\stdcoh T n) \equiv \stdcoh {\Sigma(T)} {n+1}\). \end{lemma} \begin{proof} We first consider the standard type. The case for \(n = 0\) follows immediately, so we let \(n > 0\). We then get for \(\epsilon \in \{-,+\}\): \begin{align*} \Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} \epsilon T}\right) &\equiv \Sigma(\stdtm {\bound {n-1} T} {n-1}) \sub {\Sigma(\incbd{n-1} \epsilon T)}&\text{by functoriality of suspension}\\ &\equiv \stdtm {\Sigma(\bound {n-1} T)} {n} \sub {\Sigma(\incbd{n-1} \epsilon T)}\\ &\equiv \stdtm {\bound n {\Sigma(T)}} n \sub {\incbd n \epsilon {\Sigma(T)}} \end{align*} By the inductive hypothesis \(\Sigma(\stdty T {n-1}) \equiv \stdty {\Sigma(T)} n\) and so \begin{align*} \Sigma(\stdty T n) &\equiv \arr {\Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} - T}\right)} {\Sigma(\stdty T {n-1})} {\Sigma\left(\stdtm {\bound {n-1} T} {n-1} \sub {\incbd {n-1} + T}\right)}\\ &\equiv \arr {\stdtm {\bound n {\Sigma(T)}} {n} \sub {\incbd n - {\Sigma(T)}}} {\stdty {\Sigma(T)} {n}} {\stdtm {\bound n {\Sigma(T)}} {n} \sub {\incbd n + {\Sigma(T)}}}\\ &\equiv \stdty {\Sigma(T)} {n + 1} \end{align*} as required. For the standard coherence we have: \[ \Sigma(\stdcoh T n) \equiv \SCoh {\Sigma(T)} {\Sigma(\stdty T n)} {\Sigma(\id_T)} \equiv \SCoh {\Sigma(T)} {\stdty {\Sigma(T)} {n+1}} {\id_{\Sigma(T)}} \equiv \stdcoh {\Sigma(T)} {n+1}\] following from the case for types.
\end{proof} To prove that the standard constructions are well-formed, we give a couple of lemmas. The first concerns the support of the standard term and standard coherence. \begin{lemma} \label{lem:std-supp} For a tree \(T\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\), we have: \[ \Supp\left(\stdtm {\bound n T} n \sub {\incbd n \epsilon T}\right) = \bdry n \epsilon T \qquad \Supp\left(\stdcoh {\bound n T} n \sub {\incbd n \epsilon T}\right) = \bdry n \epsilon T\] \end{lemma} \begin{proof} The case for coherences follows from the definition and the equality \[\Supp(\incbd n \epsilon T) = \bdry n \epsilon T\] For the standard term, it suffices to consider cases where the standard term and standard coherence are not equal. If \(n = 0\), then \(\bound n T \equiv \emp\), and it suffices to prove that \(\Supp([m]) = \FV(\lfloor [m] \rfloor)\), but this is immediate because \(\Supp([m]) = \Supp(\lfloor [m] \rfloor)\) and \(\lfloor [m] \rfloor\) is a variable of type \(\star\) so its support is equal to its free variables. We therefore consider the case where \(n > 0\) and \(\len(\bound n T) = 1\). The only case where this happens is if \(\len(T) = 1\) too, so assume \(T \equiv [T_0]\). Then: \begin{align*} \Supp\left(\stdtm {\bound n T} n \sub {\incbd n \epsilon T} \right) &= \Supp\left(\stdtm {\Sigma(\bound {n-1} {T_0})} n \sub {\Sigma\left(\incbd {n-1} \epsilon {T_0}\right)} \right)\\ &= \Supp\left(\Sigma\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \right) \sub {\Sigma\left(\incbd {n-1} \epsilon {T_0}\right)} \right)\\ &= \Supp\left(\Sigma\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \sub {\incbd {n-1} \epsilon {T_0}} \right) \right)\\ &= \Sigma\left(\Supp\left( \stdtm {\bound {n-1} {T_0}} {n - 1} \sub {\incbd {n-1} \epsilon {T_0}} \right) \right)\\ &= \Sigma\left( \bdry {n-1} \epsilon {T_0} \right)\\ &= \bdry n \epsilon T \end{align*} as required. \end{proof} The second lemma gives a globularity condition for the standard type. \begin{lemma} \label{lem:std-type-glob} Let \(T\) be a tree. Then: \[ \stdty T n \equiv \stdty {\bound m T} n \sub {\incbd m \epsilon T}\] for \(n \leq m\) and \(\epsilon \in \{-,+\}\). \end{lemma} \begin{proof} We induct on \(n\). If \(n = 0\) then both sides of the equation are the type \(\star\). We therefore consider the case for \(n + 1\) and so we must prove: \begin{align*} \stdty T {n+1} &\equiv \arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}\\ &\equiv \arr {\stdtm {\bound n {\bound m T}} n \sub {\incbd n - {\bound m T}} \sub {\incbd m \epsilon T}} {\stdty {\bound m T} n \sub {\incbd m \epsilon T}} {\stdtm {\bound n {\bound m T}} n \sub {\incbd n + {\bound m T}} \sub {\incbd m \epsilon T}}\\ &\equiv \stdty {\bound m T} {n+1} \sub {\incbd m \epsilon T} \end{align*} The equality \({\stdty T n} \equiv {\stdty {\bound m T} n \sub {\incbd m \epsilon T}}\) follows by the inductive hypothesis. Further, for \(\omega \in \{-,+\}\) we have by \cref{prop:bdry-glob}: \begin{align*} \stdtm {\bound n {\bound m T}} n \sub {\incbd n \omega {\bound m T}} \sub {\incbd m \epsilon T} &\equiv \stdtm {\bound n {\bound m T}} n \sub {\incbd n \omega {\bound m T} \bullet \incbd m \epsilon T} \\ &\equiv \stdtm {\bound n T} n \sub {\incbd n \omega T} \end{align*} which completes the proof. \end{proof} We can now state and prove the typing properties of standard constructions. \begin{proposition} \label{prop:standard-typing} Suppose that \(\mathcal{O}\) contains the standard operations.
Then the following rules are admissible: \begin{mathpar} \inferrule{T : \Tree\\ n \in \mathbb{N}}{T \vdash \stdty T n}\and \inferrule{T : \Tree \\ n \neq 0\\ n \geq \dep(T)}{T \vdash \stdcoh T n : \stdty T n}\and \inferrule{T : \Tree \\ n \geq \dep(T)}{T \vdash \stdtm T n : \stdty T n} \end{mathpar} \end{proposition} \begin{proof} We prove that all three rules are admissible by mutual induction. First consider the cases for types. The case when \(n = 0\) is trivial, so we consider the case for \(n + 1\). We need to show that: \[ T \vdash \arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}\] The inductive hypothesis on types gives that \(T \vdash \stdty T n\) and so we must show that: \[ T \vdash {\stdtm {\bound n T} n \sub {\incbd n \epsilon T}} : \stdty T n\] for \(\epsilon \in \{-,+\}\). By the inductive hypothesis for terms, we have \(\bound n T \vdash \stdtm {\bound n T} n : \stdty {\bound n T} n\) as we have \(\dep(\bound n T) \leq n\). As \(T \vdash \incbd n \epsilon T : \bound n T\) we have that: \[ T \vdash {\stdtm {\bound n T} n \sub {\incbd n \epsilon T}} : \stdty {\bound n T} n \sub {\incbd n \epsilon T} \] and so by \cref{lem:std-type-glob}, this case is complete. For the standard coherence, we apply \cref{rule:scoh}, using the inductive hypothesis for types. To show that \((\lfloor T \rfloor, \Supp(\src(\stdty T n)), \Supp(\tgt(\stdty T n))) \in \mathcal{O}\), we apply \cref{lem:std-supp}. For the standard term, as in previous proofs, it is sufficient to consider the cases where it is defined differently to the standard coherence. For \(n = 0\) we must have \(T = \emp\) by the condition on the depth of \(T\). Hence, \(\stdtm T n \equiv \SPath([0])\), which is well-formed as it has type \(\star \equiv \stdty T n\) as required. We now consider \(\stdtm {\Sigma(T)} {n+1} \equiv \Sigma (\stdtm T n)\). By the inductive hypothesis on dimension, \(T \vdash \stdtm T n : \stdty T n\) and so we immediately have that: \[ \Sigma(T) \vdash \stdtm {\Sigma(T)} {n + 1} : \Sigma(\stdty T n)\] and so the proof is complete by \cref{lem:std-susp}. \end{proof} The equality relations we have seen so far make heavy use of disc contexts and associated terms and types. We therefore pause to consider the form of these as structured syntax and to relate them to the standard constructions presented in this section. All disc contexts are the result of applying iterated suspensions to the singleton context, and so it follows that disc contexts correspond exactly to linear trees. By an abuse of notation we write: \[ D^n = \Sigma^n(\emp)\] As we further have that \(\Sigma(U^n) \equiv U^{n+1}\) for the sphere type \(U^n\), it can be proved by a simple induction that: \[U^n \equiv \lfloor \stdty {D^n} n \rfloor\] As we have already noted, the maximal dimension term \(d_n : \Term_{D^n}\) is given by \(\lfloor \stdtm {D^n} n \rfloor\). It is also equal to \(\lfloor p^n \rfloor\) for the unique maximal path \(p^n = \Sigma^n[0]\), which is the list containing \(n+1\) zeros. The only missing construction is an equivalent for the substitution from a disc context. From a structured term \(s : \STerm_\U\) of type \(A : \SType_\U\), there should be a labelling \(\{A,s\}\) from \(D^n\) to \(\U\), where \(n = \dim(A)\). This however proves more challenging to define, as trees and types have opposite inductive structure.
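To illustrate this mismatch concretely (a heuristic remark rather than part of the formal development), compare the two sides in dimension \(2\): the disc tree is built upwards from its base, \[ D^2 = \Sigma^2(\emp) = [[\emp]] \] whereas the corresponding sphere type \(U^2 \equiv \lfloor \stdty {D^2} 2 \rfloor\) has as its outermost constructor the arrow containing its top-dimensional source and target, with the lower-dimensional data only appearing under further deconstruction.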
For a labelling, it is natural to specify the lower-dimensional terms first and fill in higher-dimensional terms by induction, though when deconstructing a type, we first receive the highest-dimensional terms, only receiving the lower-dimensional terms by further deconstructing the type. To define the labelling \(\{A,t\}\), we first define the extension of a labelling from a linear tree, which allows us to add higher-dimensional terms to the labelling, and then use this to define the labelling from a linear tree. \begin{definition} Let \(L : D^n \to \U\) be a labelling from a linear tree, and let \(s,t : \STerm_\U\) be structured terms. The \emph{extension} of \(L\) by \(s\) and \(t\), \(\ext(L,s,t)\), is defined inductively on \(n\) by: \begin{equation*} \ty(\ext(L,s,t)) = \ty(L) \qquad \ext(L,s,t) = \begin{cases*} L[0]\,\{t\}\,s &if \(n = 0\)\\ L[0]\,\{\ext(L_0,s,t)\}\,L[1]&otherwise \end{cases*} \end{equation*} We then define the labelling \(\{A,t\}\) by induction on \(A\): \[ \{\star,t\} = (p \mapsto t) \qquad \{\arr s A t, u\} = \ext(\{A,s\},t,u) \qquad \ty(\{A,t\}) = \star\] \end{definition} These constructions all satisfy the expected typing judgements. More precisely, the following inference rules are admissible: \begin{mathpar} \inferrule{\U \vdash L : D^n\\ \U \vdash s : \stdty {D^n} n \sub L\\ \U \vdash t : \arr {p^n \sub L} {\stdty {D^n} n \sub L} {s}}{\U \vdash \ext(L,s,t) : D^{n+1}}\\ \inferrule{\U \vdash A \\ \U \vdash t : A}{\U \vdash \{A,t\} : D^{\dim(A)}} \end{mathpar} The admissibility of the above rules is routine to verify. Using these constructions, we can recover structured term definitions of the unary composite of a (structured) term \(t\) of type \(A\) of dimension \(n\) as \( \stdcoh {D^n} n \sub {\{A,t\}}\) and can define the identity of the same term \(t\) as \(\stdcoh {D^n} {n+1} \sub {\{A,t\}}\). Therefore, the rules for disc removal and endo-coherence removal can be rephrased in terms of structured syntax to get the following rules: \begin{mathpar} \inferrule{\U : \Ctx \uplus \Tree\\ \U \vdash A \\ \U \vdash t : A \\ \dim(A) = n > 0}{ \U \vdash \stdcoh {D^n} n \sub {\{A,t\}} = t}\textsc{dr'}\\ \inferrule{\U : \Ctx \uplus \Tree\\ T : \Tree \\ L : \arr T \star \U\\ n = \dim(A)\\\\ T \vdash A \\ T \vdash s : A \\ \Supp(s) = \Var(T) \\ \U \vdash L : T}{\U \vdash \SCoh T {\arr s A s} L = \stdcoh {D^n} {n+1} \sub { \{A, s\} \bullet L}}\textsc{ecr'} \end{mathpar} which are admissible if the equality rule set \(\mathcal{R}\) has disc removal or endo-coherence removal respectively. We end this section with two further results that can be proven in the presence of disc removal and endo-coherence removal. The first states that disc removal is sufficient (and necessary) to unify standard coherences and standard terms. \begin{theorem} \label{thm:std-dr} The tame equality rule set \(\mathcal{R}\) has disc removal if and only if the rule: \begin{mathpar} \inferrule{T : \Tree \\ n \in \mathbb{N}\\ n \geq \dep(T) > 0}{T \vdash \stdcoh T n = \stdtm T n} \end{mathpar} is admissible. \end{theorem} \begin{proof} We note that \(\stdcoh T n\) and \(\stdtm T n\) only differ when \(T = D^n\). If \(\mathcal{R}\) has disc removal, then for each \(n \neq 0\) we have \(\stdcoh {D^n} n = \SPath(p^n) \equiv \stdtm {D^n} n\). Conversely, if \(\stdcoh T n = \stdtm T n\) when \(n \geq \dep(T) > 0\), then \(\stdcoh {D^n} n = \stdtm {D^n} n\) for any \(n > 0\). Then as \(\mathcal{R}\) is tame, we can apply the substitution \(\{A,t\}\) to both sides of the equation to get the statement of disc removal.
\end{proof} Lastly, in the presence of endo-coherence removal, the standard coherences \(\stdcoh T n\) for which \(n > \dep(T)\) can be shown to be equal to identities. \begin{theorem} \label{thm:std-ecr} Suppose the equality rule set \(\mathcal{R}\) has endo-coherence removal. Let \(T\) be a tree and suppose \(n \geq \dep(T)\). Then: \[ T \vdash \stdcoh T {n+1} = \stdcoh {D^n} {n+1} \sub {\{\stdty T n, \stdtm T n\}} \] \end{theorem} \begin{proof} The following chain of equalities holds: \begin{align*} \stdcoh T {n+1} &\equiv \SCoh T {\arr {\stdtm {\bound n T} n \sub {\incbd n - T}} {\stdty T n} {\stdtm {\bound n T} n \sub {\incbd n + T}}} {\id_T}\\ &\equiv \SCoh T {\arr {\stdtm T n} {\stdty T n} {\stdtm T n}} {\id_T}&\text{by \cref{prop:bdry-glob}}\\ &= \stdcoh {D^n} {n+1} \sub {\{\stdty T n, \stdtm T n\}}&\text{by \textsc{ecr'}} \end{align*} where \textsc{ecr'} can be applied as \(\Supp(\stdtm T n) = \Var(T)\) by \cref{lem:std-supp}. \end{proof} Due to these two theorems, every standard term \(\stdtm T n\) with \(n \geq \dep(T)\) is equal to either the unique variable of the singleton context (when \(n = \dep(T) = 0\)), a standard composite (when \(n = \dep(T) > 0\)), or an identity (when \(n > \dep(T)\)), hence completely classifying the well-formed standard terms. \section{Insertion} \label{sec:insertion} We now introduce \emph{insertion}, the construction that powers the strictly associative behaviour of \Cattsua. Insertion incorporates part of the structure of a locally maximal argument term into the head coherence, simplifying the overall syntax of the term. Consider the composite \(f * (g * h)\). This term has two locally maximal arguments, \(f\) and \(g * h\), the second of which is a (standard) coherence. Insertion allows us to merge these two composites into one by ``inserting'' the pasting diagram of the inner coherence into the pasting diagram of the outer coherence. In the case above we will get that the term \(f * (g * h)\) is equal to the ternary composite \(f * g * h\), a term with a single coherence. As the term \((f * g) * h\) also reduces by insertion to the ternary composite, we see that both sides of the associator become equal under insertion. The action of insertion on these contexts is shown in \cref{fig:insertion}.
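Concretely, in the tree notation used throughout this section (and anticipating the definition of the inserted tree given later in this section), both binary composites of this example live over the tree \([\emp,\emp]\), and inserting the inner copy into the outer one along the branch \(P = [1]\), corresponding to the argument \(g * h\), yields: \[ \insertion {[\emp,\emp]} {[1]} {[\emp,\emp]} = [\emp] \doubleplus [\emp,\emp] = [\emp,\emp,\emp] \] which is the tree of the ternary composite.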
\begin{figure} $$ \begin{aligned} \begin{tikzpicture} \node (x) at (0,0) {$x$}; \node (y) at (1.5,0) {$y$}; \node (z) at (3,0) {$z$}; \draw [->] (x) to node [above, font=\small] {$f$} (y); \draw [->] (y) to node [above, font=\small] {$g*h$} (z); \begin{scope}[xshift=1.25cm, yshift=1.75cm, red]
%\draw [fill=red!10, draw=none] (1,0.05) ellipse (1.2cm and .6cm);
\draw [rounded corners, fill=red!7, draw=none] (-.25,-.35) rectangle +(2.5,1); \node (x2) at (0,0) {$x'$}; \node (y2) at (1,0) {$y'$}; \node (z2) at (2,0) {$z'$}; \draw [->] (x2) to node [above, font=\small] {$g$} (y2); \draw [->] (y2) to node [above, font=\small] {$h$} (z2); \end{scope} \draw [->, very thick, red] (2.25,1.25) to +(0,-.5); \end{tikzpicture} \end{aligned} \quad\leadsto\quad \begin{aligned} \begin{tikzpicture} \node (x) at (0,0) {$x \vphantom'$}; \node [red] (y) at (1,0) {$x'$}; \node [red] (z) at (2,0) {$y'$}; \node [red] (w) at (3,0) {$z'$}; \begin{scope}[xshift=.5cm, yshift=1.5cm, red] \draw [rounded corners, fill=white, draw=none] (-.25,-.35) rectangle +(2.5,1); \end{scope} \draw [->] (x) to node [above, font=\small] {$f$} (y); \draw [->, red] (y) to node [above, font=\small] {$g$} (z); \draw [->, red] (z) to node [above, font=\small] {$h$} (w); \end{tikzpicture} \end{aligned} $$ \caption{Insertion acting on the composite \(f * (g * h)\).} \label{fig:insertion} \end{figure} Insertion is an operation that is best understood with respect to trees instead of ps-contexts. Insertion merges the structure of two trees along a \emph{branch} of the first tree. \begin{definition} Let \(S\) be a tree. A \emph{branch} of \(S\) is a non-empty list of natural numbers \(P\) which indexes a linear subtree \(S^P\). From each branch \(P\), a maximal path \(\olsi P\) can be obtained by concatenating \(P\) with \(p^{\dep(S^P)}\), the unique maximal path of \(S^P\). For a branch \(P\), we further define the \emph{branch height}, \(\bh(P)\), to be one less than the length of \(P\) (noting that branches are non-empty lists), and the \emph{leaf height}, \(\lh(P)\), to be one less than the length of \(\olsi P\), which is equal to the dimension of \(\lfloor \olsi P \rfloor\). \end{definition} While each branch \(P\) uniquely determines a maximal path \(\olsi P\), the converse does not hold. There may be multiple branches of a tree which correspond to the same maximal path. Consider the tree \(T = [[[[\emp],\emp],\emp]]\). This has two distinct branches \(P = [0,0,0]\) and \(Q = [0,0,0,0]\) which both correspond to the maximal path \([0,0,0,0,0]\). We graphically depict these branches below by drawing them in blue.
\[ P = \begin{tikzpicture}[yscale=0.7,every node/.append style={scale=0.6},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \node [on grid, Diag1] at (-0.9,3)(x31) {$\bullet$}; \node [on grid] at (-0.1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw[Diag1,very thick] (x21.center) to (x31.center); \draw (x21.center) to (x32.center); \draw[Diag1,very thick] (x31.center) to (x41.center); \end{scope} \end{tikzpicture} \qquad Q = \begin{tikzpicture}[yscale=0.7,every node/.append style={scale=0.6},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \node [on grid] at (-0.9,3)(x31) {$\bullet$}; \node [on grid] at (-0.1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw (x21.center) to (x31.center); \draw (x21.center) to (x32.center); \draw[Diag1,very thick] (x31.center) to (x41.center); \end{scope} \end{tikzpicture} \] While \(P\) and \(Q\) represent the same path, they have different branch heights: the branch height of \(P\) is \(2\) while the branch height of \(Q\) is \(3\). This will cause insertions along these two branches to proceed differently (though we will see later in \cref{lem:insertion-irrel} that if both insertions are valid then the results are equivalent). The leaf height and branch height of the branch \(P\) are demonstrated in \cref{fig:leafheight}, where we also depict the trunk height of \(T\), which was defined in \cref{sec:trees}. \begin{figure} \[ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \node [on grid, Diag1] at (-0.9,3)(x31) {$\bullet$}; \node [on grid] at (-0.1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (-0.9,4)(x41) {$\bullet$}; \node [left=0 of x31.center ,on grid] {$T^P$}; \node [right=0 of x41.center ,on grid] {$\olsi P$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw[Diag1,very thick] (x21.center) to (x31.center); \draw (x21.center) to (x32.center); \draw[Diag1,very thick] (x31.center) to (x41.center); \end{scope} \node [on grid] at (-0.7,1)(th) {}; \draw [|->] (-0.7,0) to node [left] {$\th(T)$} (th.center); \draw [dotted,very thick] (th) to (x11); \node [on grid] at (-1.5,2) (bh) {}; \draw [|->] (-1.5,0) to node [left] {$\bh(P)$} (bh.center); \draw [dotted,very thick] (bh) to (x21); \node [on grid] at (-2.3,4) (lh) {}; \draw [|->] (-2.3,0) to node [left] {$\lh(P)$} (lh.center); \draw [dotted,very thick] (lh) to (x41); \end{tikzpicture} \] \caption{\label{fig:leafheight} Leaf height, branch height and trunk height.} \end{figure} Let us again consider the tree \(S = [[\emp,\emp],\emp]\) from \cref{fig:tree-example}.
This tree has three branches, corresponding to the maximal paths \([0,0,0]\), \([0,1,0]\), and \([1,0]\). We consider the action of insertion of three trees \(T\), \(T'\), and \(T''\), given below, into the branch \(P = [0,0]\), which corresponds to the first of these maximal paths. \[T = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1)(x11) {$\bullet$}; \node [on grid] at (-0.5, 2) (x21) {$\bullet$}; \node [on grid] at (0.5,2)(x22) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture} \qquad T' = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1)(x11) {$\bullet$}; \draw (x01.center) to (x11.center); \path [draw=none] (-0.5,0) to (0.5,0); \end{tikzpicture} \qquad T'' = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x21.base)] \node [on grid] at (0,1)(x11) {$\bullet$}; \node [on grid] at (-0.5, 2) (x21) {$\bullet$}; \node [on grid] at (0.5,2)(x22) {$\bullet$}; \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture} \] We first consider the insertion of \(T\) into \(S\), which returns the inserted tree \(\insertion S P {T}\), where \(P\) is drawn in blue on the diagram. \[ S = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag1] at (-0.9, 2)(x21) {$\bullet$}; \node [on grid] at (-0.1, 2)(x22) {$\bullet$}; \node [on grid] at (0.5,1)(x12) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw[Diag1, very thick] (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{scope} \end{tikzpicture} \qquad T = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5, 2) (x21) {$\bullet$}; \node [on grid] at (0.5,2)(x22) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture} \qquad \insertion S P {T} = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid, Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid, Diag2] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag2] at (-0.9, 2)(x21) {$\bullet$}; \node [on grid, Diag2] at (-0.5, 2)(x22) {$\bullet$}; \node [on grid] at (-0.1, 2)(x23) {$\bullet$}; \node [on grid] at (0.5,1)(x12) {$\bullet$}; \begin{scope}[on background layer] \draw [Diag2] (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw [Diag2] (x11.center) to (x21.center); \draw [Diag2] (x11.center) to (x22.center); \draw (x11.center) to (x23.center); \end{scope} \end{tikzpicture} \] In this case the structure of \(T\) is compatible with the point of insertion \(P\) and \(T\) can be inserted into \(S\), replacing the branch \(P\) with the appropriate part of \(T\), where this appropriate part is obtained by removing the trunk of \(T\). We now consider the insertion of \(T'\) into \(S\). Despite \(T'\) having a lower depth than \(S\), it is still insertable, forming the following tree \(\insertion S P {T'}\).
\[ S = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag1] at (-0.9, 2)(x21) {$\bullet$}; \node [on grid] at (-0.1, 2)(x22) {$\bullet$}; \node [on grid] at (0.5,1)(x12) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw[Diag1, very thick] (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{scope} \end{tikzpicture} \qquad T' = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1)(x11) {$\bullet$}; \draw (x01.center) to (x11.center); \path [draw=none] (-0.5,0) to (0.5,0); \end{tikzpicture} \qquad \insertion S P {T'} = \begin{tikzpicture}[yscale=0.7, every node/.style={scale=0.6},baseline=(x11.base)] \node [on grid,Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid,Diag2] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (-0.5, 2)(x21) {$\bullet$}; \node [on grid] at (0.5,1)(x12) {$\bullet$}; \begin{scope}[on background layer] \draw [Diag2] (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw (x11.center) to (x21.center); \end{scope} \end{tikzpicture} \] Here, the branch \(P\) is replaced by a singleton tree, which is the remaining part of \(T'\) after removing its trunk. We note that this operation is the same as pruning the locally maximal variable \(\lfloor \olsi P \rfloor\) from \(\lfloor S \rfloor\). We will see in \cref{sec:univ-prop-insert} that every instance of pruning can be represented as an instance of insertion. When we consider the insertion of \(T''\) into \(S\), it is not clear how to proceed, as there is no ``corresponding part'' of \(T''\) to replace the branch \(P\) with. In the other two cases this is obtained by removing the trunk of the tree, but \(T''\) has no trunk to remove. In this case the insertion is not possible to perform, as \(\bh(P) > \th(T'')\); the condition \(\bh(P) \leq \th(T)\) is necessary for inserting a tree \(T\). More generally we consider a (structured) coherence term \(\SCoh S A L : \STerm_\U\). To apply insertion to this term, we must first identify a branch \(P\) of \(S\) such that \(\olsi P \sub L \equiv \stdcoh T {\lh(P)} \sub M\), that is, there is a locally maximal argument of \(L\) which is a standard coherence. We then must construct the following data as part of the insertion operation: \begin{itemize} \item The \emph{inserted tree} \(\insertion S P T\), obtained by inserting \(T\) into \(S\) along the branch \(P\). We have already given some examples of this operation. \item The \emph{interior labelling} \(\iota : T \to \insertion S P T\), the inclusion of \(T\) into a copy of \(T\) living in the inserted tree. \item The \emph{exterior labelling} \(\kappa : S \to \insertion S P T\), which maps \(\olsi P\) to the standard coherence over the copy of \(T\), or more specifically \(\stdcoh T {\lh(P)} \sub \iota\), and other maximal paths to their copies in the inserted tree. \item The \emph{inserted labelling} \(\insertion L P M : \insertion S P T \to \U\), which collects the appropriate parts of \(L\) and \(M\).
\end{itemize} Using this notation, insertion yields the following equality: \[\SCoh S A L = \SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M}\] These constructions can be assembled into the following diagram, where \(n = \lh(P)\):
% https://q.uiver.app/?q=WzAsNSxbMCwwLCJEX24iXSxbMSwwLCJcXERlbHRhIl0sWzAsMSwiXFxUaGV0YSJdLFsxLDEsIlxcaW5zZXJ0aW9uIFxcRGVsdGEgeCBcXFRoZXRhIl0sWzIsMiwiXFxHYW1tYSJdLFsxLDMsIlxca2FwcGEiXSxbMiwzLCJcXGlvdGEiLDJdLFswLDEsIlxce0EseFxcfSJdLFswLDIsIlxce1xcbWF0aGNhbHtVfV9cXFRoZXRhXm4sIFxcbWF0aGNhbHtDfV9cXFRoZXRhXm5cXH0iLDJdLFsxLDQsIlxcc2lnbWEiLDAseyJjdXJ2ZSI6LTN9XSxbMiw0LCJcXHRhdSIsMix7ImN1cnZlIjoyfV0sWzMsNCwiXFxpbnNlcnRpb24gXFxzaWdtYSB4IFxcdGF1IiwxXSxbMywwLCIiLDEseyJzdHlsZSI6eyJuYW1lIjoiY29ybmVyIn19XV0=
% tex-fmt: skip
\[ \begin{tikzcd} {D^n} & S \\ T & {\insertion S P T} \\ && \U \arrow["\kappa", from=1-2, to=2-2] \arrow["\iota"', from=2-1, to=2-2] \arrow["{\{\ty(\olsi P), \olsi P\}}", from=1-1, to=1-2] \arrow["{\{\stdty T n, \stdcoh T n\}}"', from=1-1, to=2-1] \arrow["L", curve={height=-18pt}, from=1-2, to=3-3] \arrow["M"', curve={height=12pt}, from=2-1, to=3-3] \arrow["{\insertion L P M}"{description}, from=2-2, to=3-3]
% \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180,
% scale=1.5}, draw=none, from=2-2, to=1-1]
\end{tikzcd} \] It will be proven in \cref{sec:univ-prop-insert} that the square above is cocartesian, and so \(\insertion S P T\) is the pushout of \(S\) and \(T\). We now begin to define each of these constructions in turn. As we need a lot of data to perform an insertion, we will package it up to avoid repetition. \begin{definition} An \emph{insertion point} is a triple \((S,P,T)\) such that \(S\) and \(T\) are trees and \(P\) is a branch of \(S\) with \(\bh(P) \leq \th(T)\) and \(\lh(P) \geq \dep(T)\). An \emph{insertion redex} is a sextuple \((S,P,T,\U,L,M)\) such that \((S,P,T)\) is an insertion point, \(L : S \to \U\) and \(M : T \to \U\) are labellings with \(\ty(L) \equiv \ty(M) \equiv \star\), and \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\). \end{definition} We can now define the insertion operation on trees. \begin{definition}[Inserted tree] Let \((S,P,T)\) be an insertion point. Define the \emph{inserted tree} \(\insertion S P T\) by induction on the branch \(P\), noting that \(P\) is always non-empty. \begin{itemize} \item Suppose \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\). Then: \[\insertion S P T = [S_0,\dots,S_{k-1}] \doubleplus T \doubleplus [S_{k+1},\dots,S_n]\] \item Suppose \(P = k :: Q\) and again \(S = [S_0,\dots,S_k,\dots,S_n]\). We note that \(Q\) is a branch of \(S_k\) and by the condition on trunk height of \(T\) we have \(T = \Sigma(T_0)\). Then: \[\insertion S P T = [S_0,\dots,S_{k-1},(\insertion {S_k} {Q} {T_0}),S_{k+1},\dots,S_n ] \] \end{itemize} We draw attention to the condition of the trunk height of \(T\) being at least the branch height of \(P\), which is necessary for the induction to proceed. We recall that a tree is identified with a list of trees, and that in the first case of insertion \(T\) is treated as a list, and in the second case \(\insertion {S_k} {Q} {T_0}\) is treated as a single tree which forms one of the subtrees of \(\insertion S P T\). \end{definition} We now proceed to define the interior and exterior labellings, which will be done using the diagrammatic notation introduced in \cref{sec:wedge-sums}. \begin{definition}[Interior labelling] Given an insertion point \((S, P, T)\) we define the interior labelling \(\iota_{S,P,T} : T \to \insertion S P T\) by induction on~\(P\).
\begin{itemize} \item When \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\) we define \(\iota\) by \(\ty(\iota) = \star\) and:
% https://q.uiver.app/?q=WzAsNixbMCwwLCJTXzBcXHZlZVxcZG90c1xcdmVlIFNfe2stMX0iXSxbNCwwLCJTX3trKzF9IFxcdmVlIFxcZG90cyBcXHZlZSBTX24iXSxbMiwwLCJUIl0sWzMsMCwiXFx2ZWUiXSxbMSwwLCJcXHZlZSJdLFsyLDIsIlQiXSxbNSwyLCJcXGlkIl1d
% tex-fmt: skip
\[ \begin{tikzcd}[column sep=smaller,row sep=10pt] {[S_0,\dots,S_{k-1}]} & \doubleplus & T & \doubleplus & {[S_{k+1},\dots,S_n]} \\ \\ && T \arrow["\id"{font = \normalsize}, from=3-3, to=1-3] \end{tikzcd} \] \item When \(P = k :: Q\), \(S = [S_0,\dots,S_k,\dots,S_n]\), and \(T = [T_0]\) (by the trunk height condition) we define \(\iota\) by \(\ty(\iota) = \star\) and:
% https://q.uiver.app/?q=WzAsNixbMCwwLCJcXGxmbG9vciBbU18xLFxcZG90cyxTX3trLTF9XSBcXHJmbG9vciJdLFs0LDAsIlxcbGZsb29yIFtTX3trKzF9LFxcZG90cyxTX25dIFxccmZsb29yIl0sWzIsMCwiXFxTaWdtYSBcXGxmbG9vciBcXGluc2VydGlvbiB7U19rfSB7UCd9IHtUXzF9IFxccmZsb29yIl0sWzMsMCwiXFx2ZWUiXSxbMSwwLCJcXHZlZSJdLFsyLDIsIlxcU2lnbWEgXFxsZmxvb3IgVF8xIFxccmZsb29yIl0sWzUsMiwiXFxTaWdtYSBcXGlvdGFfe1NfayxQJyxUXzF9Il1d
% tex-fmt: skip
\[ \begin{tikzcd}[column sep=smaller, row sep=10pt] {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma \insertion {S_k} {Q} {T_0}} & \vee & {[S_{k+1},\dots,S_n]} \\ \\ && {\Sigma T_0} \arrow["{\Sigma \iota_{S_k,Q,T_0}}"{font = \normalsize}, from=3-3, to=1-3] \end{tikzcd} \] \end{itemize} We may drop the subscripts on \(\iota\) when they are easily inferred. \end{definition} \begin{definition}[Exterior labelling] Given an insertion point \((S, P, T)\), we define the exterior labelling \(\kappa_{S,P,T} : S \to \insertion S P T\) by induction on \(P\). \begin{itemize} \item When \(P = [k]\) and \(S = [S_0,\dots,S_k,\dots,S_n]\) we define \(\kappa\) by \(\ty(\kappa) = \star\) and:
% https://q.uiver.app/?q=WzAsMTAsWzAsMCwiXFxsZmxvb3IgW1NfMSxcXGRvdHMsU197ay0xfV0gXFxyZmxvb3IiXSxbNCwwLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFsyLDAsIlxcbGZsb29yIFQgXFxyZmxvb3IiXSxbMywwLCJcXHZlZSJdLFsxLDAsIlxcdmVlIl0sWzIsMiwiXFxTaWdtYSBcXGxmbG9vciBTX2sgXFxyZmxvb3IiXSxbMCwyLCJcXGxmbG9vciBbU18xLFxcZG90cyxTX3trLTF9XFxyZmxvb3IiXSxbMSwyLCJcXHZlZSJdLFszLDIsIlxcdmVlIl0sWzQsMiwiXFxsZmxvb3IgW1Nfe2srMX0sXFxkb3RzLFNfbl0gXFxyZmxvb3IiXSxbNSwyLCJcXHtcXG1hdGhjYWx7VX1fVF5uLCBcXG1hdGhjYWx7Q31fVF5uXFx9Il0sWzYsMCwiXFxpZCJdLFs5LDEsIlxcaWQiXV0=
% tex-fmt: skip
\[ \begin{tikzcd}[column sep=smaller,row sep = 10pt] {[S_0,\dots,S_{k-1}]} & \doubleplus & {T} & \doubleplus & {[S_{k+1},\dots,S_n]} \\ \\ {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]} \arrow["{\{\stdty T m, \stdcoh T m\}}"{font = \normalsize, pos=.4}, from=3-3, to=1-3] \arrow["\id"{font = \normalsize}, from=3-1, to=1-1] \arrow["\id"{font = \normalsize}, from=3-5, to=1-5] \end{tikzcd}\] We note that by the condition of \(P\) being a branch, \(S_k\) is linear, and so \(\Sigma \lfloor S_k \rfloor\) is the disc \(D^m\), where \(m = \dep(S_k) + 1\).
\item When \(P = k :: Q\), \(S = [S_0,\dots,S_k,\dots,S_n]\), and \(T = [T_0]\) (by the trunk height condition) we define \(\kappa\) by \(\ty(\kappa) = \star\) and:
% https://q.uiver.app/?q=WzAsMTAsWzAsMCwiXFxsZmxvb3IgW1NfMSxcXGRvdHMsU197ay0xfV0gXFxyZmxvb3IiXSxbNCwwLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFsyLDAsIlxcU2lnbWEgXFxsZmxvb3IgXFxpbnNlcnRpb24ge1Nfa30ge1AnfSB7VF8xfSBcXHJmbG9vciJdLFszLDAsIlxcdmVlIl0sWzEsMCwiXFx2ZWUiXSxbMiwyLCJcXFNpZ21hIFxcbGZsb29yIFNfayBcXHJmbG9vciJdLFswLDIsIlxcbGZsb29yIFtTXzEsXFxkb3RzLFNfe2stMX1cXHJmbG9vciJdLFsxLDIsIlxcdmVlIl0sWzMsMiwiXFx2ZWUiXSxbNCwyLCJcXGxmbG9vciBbU197aysxfSxcXGRvdHMsU19uXSBcXHJmbG9vciJdLFs1LDIsIlxcU2lnbWEgXFxrYXBwYV97U19rLFAnLFRfMX0iXSxbNiwwLCJcXGlkIl0sWzksMSwiXFxpZCJdXQ==
% tex-fmt: skip
\[ \begin{tikzcd}[column sep=smaller, row sep = 10pt] {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma \insertion {S_k} {Q} {T_0}} & \vee & {[S_{k+1},\dots,S_n]} \\ \\ {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]} \arrow["{\Sigma \kappa_{S_k,Q,T_0}}"{font=\normalsize}, from=3-3, to=1-3] \arrow["\id"{font=\normalsize}, from=3-1, to=1-1] \arrow["\id"{font=\normalsize}, from=3-5, to=1-5] \end{tikzcd}\] \end{itemize} Again the subscripts on \(\kappa\) may be dropped where they can be inferred. \end{definition} Lastly, we define the inserted labelling, the labelling out of the inserted tree. \begin{definition}[Inserted labelling] Given an insertion point \((S, P, T)\) with \(L : S \to \U\) and \(M : T \to \U\), we define the \emph{inserted labelling} \(\insertion L P M : {\insertion S P T} \to \U\). Let \[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\] and then proceed by induction on \(P\). \begin{itemize} \item Let \(P = [k]\), and \[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\] Then define \(\insertion L {[k]} M\) to be: \[s_0\{L_0\}s_1 \cdots \{L_{k-1}\}t_0\{M_0\}t_1\cdots \{M_m\}t_{m+1}\{L_{k+1}\}s_{k+2}\cdots \{L_n\}s_{n+1} : A\] \item Suppose \(P = k :: Q\) so that \[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\] Define \(\insertion L P M\) as: \[s_0\{L_0\}s_1\cdots \{L_{k-1}\}t_0\{\insertion {L_k} {Q} {M_0}\}t_1\{L_{k+1}\}s_{k+2} \cdots \{L_n\}s_{n+1} : A\] \end{itemize} \end{definition} We now proceed to prove that each of these constructions used to generate insertion is well-formed. We begin with the following small lemma. \begin{lemma} \label{lem:inserted-label-lem} Let \((S,P,T,\U,L,M)\) be an insertion redex. If we further suppose that \(\U \vdash L : S\) and \(\U \vdash M : T\), then: \[ \U \vdash \arr {L[k]} {\ty(L)} {L[k+1]} = \arr {M[0]} {\ty(M)} {M[m+1]}\] where \(k\) is the first element of \(P\) (as \(P\) is non-empty) and \(T = [T_0,\dots,T_m]\). \end{lemma} \begin{proof} From the insertion redex, we have \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\). By assumption, \(\olsi P\) is of the form \(k :: p\), where \(p\) is a path of \(S_k\) and \(S = [S_0,\dots,S_n]\), and so \[\SPath(\olsi P) \equiv \Inc_k(\SPath(p)) \] and so supposing that \(S_k \vdash \SPath(p) : A\) (as every path is well-formed), we can obtain: \[\U \vdash \SPath(\olsi P) \sub L : \Sigma(A) \sub {\inc_k} \sub L\] By \cref{prop:standard-typing}, \(\U \vdash \stdcoh T {\lh(P)} \sub M : \stdty T {\lh(P)} \sub M\).
Therefore, by uniqueness of types (using the syntactic equality from the insertion redex), we have: \[ \U \vdash \Sigma(A) \sub {\inc_k \bullet L} = \stdty T {\lh(P)} \sub M\] By truncating both sides of this equality \(\lh(P) - 1\) times we get: \[ \U \vdash \Sigma(\star) \sub {\inc_k \bullet L} = \stdty T 1 \sub M\] which after expanding definitions on both sides gives the required equality. \end{proof} The typing properties of each of the constructions involved in insertion are given in the following proposition. \begin{proposition} \label{prop:ins-typing} Let \((S,P,T)\) be an insertion point. Then: \[\insertion S P T \vdash \iota_{S,P,T} : T \qquad \insertion S P T \vdash \kappa_{S,P,T} : S\] If we further have \(\U \vdash L : S\) and \(\U \vdash M : T\) with \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\) then: \[ \U \vdash \insertion L P M : \insertion S P T\] \end{proposition} \begin{proof} The labellings \(\iota\) and \(\kappa\) are formed using constructions that have already been shown to be well-formed. We therefore focus on the typing judgement for the inserted labelling. As in the definition of the inserted labelling, we let \[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\] By inspection of the typing derivation \(\U \vdash L : S\) we have that \(\U \vdash s_i : A\) and \(\U \vdash L_i : S_i\) for each \(i\). We then proceed by induction on \(P\). \begin{itemize} \item Let \(P = [k]\) and \[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\] By \(\U \vdash M : T\), we have that \(\U \vdash t_i : B\) and \(\U \vdash M_i : T_i\) for each \(i\). Applying \cref{lem:inserted-label-lem}, we have \(\U \vdash A = B\), \(\U \vdash s_k = t_0\), and \(\U \vdash s_{k+1} = t_{m+1}\). Therefore, by applying the conversion rule, \(\U \vdash t_i : A\). To complete this case, we must show that for each \(i\): \[ \U \vdash (\insertion L P M)_i : (\insertion S P T)_i\] For most \(i\) this is trivial; however, there is a subtlety for \(i = k-1\), namely that \((\insertion L P M)_{k-1} \not\equiv L_{k-1}\), as: \[\ty((\insertion L P M)_{k-1}) \equiv \arr {s_{k-1}} A {t_0} \not\equiv \arr {s_{k-1}} A {s_k} \equiv \ty(L_{k-1})\] However, the equality \(\U \vdash s_k = t_0\) means that these two types are definitionally equal, and so the required typing derivation follows from \(\U \vdash L_{k-1} : S_{k-1}\). A similar argument is needed to prove that \(\U \vdash L_{k+1} : S_{k+1}\), completing this case. \item Suppose \(P = k :: Q\) so that \[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\] with \(\U \vdash M_0 : T_0\) and \(\U \vdash t_i : B\) for \(i \in \{0,1\}\). Then: \begin{align*} L_k(\olsi{Q}) &\equiv L(\olsi P)\\ &\equiv \stdcoh T {\lh(P)} \sub M\\ &\equiv \Sigma \left(\stdcoh {T_0} {\lh(Q)}\right) \sub M\\ &\equiv \stdcoh {T_0} {\lh(Q)} \sub {M_0} \end{align*} and so by the inductive hypothesis, we have \(\U \vdash\insertion {L_k} {Q} {M_0} : \insertion {S_k} {Q} {T_0}\). Then by a similar argument to above it can be shown that \(\insertion L P M\) is well-formed. \end{itemize} Hence, \(\U \vdash \insertion L P M : \insertion S P T\) for all branches \(P\). \end{proof} We now end this section by formally giving the equality rule set for insertion.
\begin{definition} \label{def:insertion-rule} The \emph{insertion rule set}, \insert, is the set consisting of the triples: \[ (\Gamma, \lfloor \SCoh S A L \rfloor, \lfloor \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \rfloor)\] for each insertion redex \((S,P,T,\Gamma,L,M)\) and structured type \(A\). A set of rules \(\mathcal{R}\) \emph{contains insertion} if \(\insert \subseteq \mathcal{R}\). Insertion makes the following rule admissible: \begin{equation*} \inferrule{(S,P,T,\Gamma,L,M)\text{ is an insertion redex}\\ S \vdash A \\ \Gamma \vdash L : S}{\Gamma \vdash \SCoh S A L = \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}}\textsc{insert} \end{equation*} The set \(\mathcal{R}\) \emph{has insertion} if the rule \textsc{insert} holds in the generated theory. \end{definition} \subsection{Universal property of insertion} \label{sec:univ-prop-insert} As stated in the previous section, the constructions involved in insertion arise as a pushout square. In this section, we prove this result, which we state below. Throughout this section we assume that we are working in a tame theory for which the support and preservation conditions hold. Further, we only give the maximal arguments of substitutions from a disc, as we only work with well-formed syntax up to definitional equality and so the type will always be inferable. \begin{theorem} \label{thm:univ-prop} Let \((S,P,T)\) be an insertion point. Then the following commutative square of \(\mathsf{Catt}_{\mathcal{R}}\) is cocartesian: \[ \begin{tikzcd}[column sep = large, row sep = large] {D^{\lh(P)}} & \lfloor S \rfloor \\ \lfloor T \rfloor & {\lfloor \insertion S P T \rfloor} \arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2] \arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2] \arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2] \arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \end{tikzcd} \] The context \(\lfloor \insertion S P T \rfloor\) is the pushout of \(\lfloor S \rfloor\) and \(\lfloor T \rfloor\) along the maps that send the maximal variable of \(D^{\lh(P)}\) to the locally maximal variable corresponding to the branch \(P\) and to the standard coherence over \(T\) of dimension equal to the leaf height of \(P\). \end{theorem} This theorem allows an intuitive understanding of the insertion operation: the inserted tree \(\insertion S P T\) is the result of taking the disjoint union of \(S\) and \(T\) and gluing the locally maximal variable of \(S\) corresponding to the branch \(P\) to the composite of \(T\). The original motivation for insertion was to take a term where one of the locally maximal arguments was a standard composite and flatten the structure, which aligns with the intuition given by the universal property. \begin{remark} As contexts have an interpretation as freely generated \(\infty\)-categories, and the category of \(\infty\)-categories is cocomplete, there is an \(\infty\)-category pushout of this square. It may however be surprising that this pushout is itself freely generated, and moreover freely generated by a pasting diagram. \end{remark} We work towards \cref{thm:univ-prop} by introducing a couple of lemmas. These lemmas will mostly be proven by deferring to the formalisation, using the machinery of structured terms introduced in \cref{sec:structured-terms} to simplify the computations involved.
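Before doing so, let us sanity-check the statement on the motivating example of \cref{fig:insertion} (an instantiation of the theorem, not a new result): taking \(S = T = [\emp,\emp]\) and \(P = [1]\), we have \(\lh(P) = 1\), and the square exhibits \(\lfloor [\emp,\emp,\emp] \rfloor\) as the result of gluing the second arrow of \(\lfloor S \rfloor\) to the binary composite of \(\lfloor T \rfloor\) along \(D^1\).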
We first show that the square is commutative, while also justifying the description of the exterior labelling given at the start of the section. \begin{lemma} \label{lem:iota-kappa-comm} Let \((S,P,T)\) be an insertion point. Then \(\kappa(\olsi P) \equiv \stdcoh T {\lh(P)} \sub \iota\). \end{lemma} \begin{proof} See \func{Catt.Tree.Insertion.Properties}{κ-branch-path} in \module{Catt.Tree.Insertion.Properties}. \end{proof} We next state two factorisation properties for the interior and exterior labellings. \begin{lemma} \label{lem:ins-comm-max} For an insertion redex \((S,P,T,\U,L,M)\), the following hold: \[ \iota_{S,P,T} \bullet (\insertion L P M) \equiv M \qquad \kappa_{S,P,T} \bullet (\insertion L P M) \equiv^{\max} L \] Hence, the maps \(L\) and \(M\) factor through the labellings \(\kappa\) and \(\iota\) respectively. \end{lemma} \begin{proof} See \funcn{Catt.Tree.Insertion.Properties}{4201}{ι-comm} and \funcn{Catt.Tree.Insertion.Properties}{4738}{κ-comm} in \module{Catt.Tree.Insertion.Properties}. \end{proof} We can now proceed with the proof of \cref{thm:univ-prop}. \begin{proof}[Proof of \cref{thm:univ-prop}] Let \((S,P,T)\) be an insertion point. We must first show that the candidate pushout square is in fact commutative, for which it is sufficient to show: \[ \{\ty(\olsi P), \olsi P\} \bullet \kappa \equiv^{\max} \{\stdty T {\lh(P)}, \stdcoh T {\lh(P)}\} \bullet \iota \] which follows from \cref{lem:iota-kappa-comm}. To prove that this square is cocartesian, we take two substitutions \(\sigma : \lfloor S \rfloor \to \Gamma\) and \(\tau : \lfloor T \rfloor \to \Gamma\) such that the following diagram is commutative: \[ \begin{tikzcd}[column sep = large, row sep = large] {D^{\lh(P)}} & \lfloor S \rfloor \\ \lfloor T \rfloor & {\lfloor \insertion S P T \rfloor}\\ && \Gamma \arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2] \arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2] \arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2] \arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1] \arrow["\sigma", curve={height=-18pt}, from=1-2, to=3-3] \arrow["\tau"', curve={height=12pt}, from=2-1, to=3-3] \end{tikzcd} \] We therefore have that \(\lceil \sigma \rceil\) is a labelling \(S \to \Gamma\) and \(\lceil \tau \rceil\) is a labelling \(T \to \Gamma\) with \[\Gamma \vdash \lceil \sigma \rceil(\olsi P) = \stdcoh T {\lh(P)} \sub{\lceil \tau \rceil}\] To apply \cref{lem:ins-comm-max}, we need this to be a syntactic equality. We therefore define \(M = \lceil \tau \rceil\) and \(L\) to be given by: \[ L(p) = \begin{cases*} \stdcoh T {\lh(P)} \sub M &if \(p = \olsi P\)\\ \lceil \sigma \rceil(p)&otherwise \end{cases*} \] By the equality above, \(L\) is well-formed and \(\lfloor L \rfloor = \sigma\).
We then get a well-formed map \(\lfloor \insertion L P M \rfloor\) from \(\lfloor \insertion S P T \rfloor\) to \(\Gamma\) such that the following diagram is commutative by \cref{lem:ins-comm-max}:
\[ \begin{tikzcd}[column sep = large, row sep = large] {D^{\lh(P)}} & \lfloor S \rfloor \\ \lfloor T \rfloor & {\lfloor \insertion S P T \rfloor}\\ && \Gamma \arrow["\lfloor \kappa \rfloor", from=1-2, to=2-2] \arrow["\lfloor \iota \rfloor"', from=2-1, to=2-2] \arrow["{\{\lfloor \olsi P \rfloor\}}", from=1-1, to=1-2] \arrow["{\{\lfloor \stdcoh T {\lh(P)} \rfloor\}}"', from=1-1, to=2-1] \arrow["\lfloor L \rfloor", curve={height=-18pt}, from=1-2, to=3-3] \arrow["\lfloor M \rfloor"', curve={height=12pt}, from=2-1, to=3-3] \arrow["\lfloor \insertion L P M \rfloor"{description}, from=2-2, to=3-3] \end{tikzcd} \]
The uniqueness of this morphism follows from the observation that every path of \(\insertion S P T\) is either of the form \(\iota(p)\) for some \(p : \Path_T\) or \(\kappa(q)\) for some \(q: \Path_S\).
\end{proof}
From this result we will be able to show that having insertion in a theory implies the existence of pruning. The plan will be to show that pruning satisfies a similar universal property.
\begin{proposition}
Let \(\mathcal{D} : \Dyck_0\) be a Dyck word, and let \(p\) be a peak of \(\mathcal{D}\). Then the following square is a pushout square:
\[ \begin{tikzcd}[column sep = large, row sep = large] {D^{n+1}} & \lfloor \mathcal{D} \rfloor \\ D^{n} & {\lfloor \mathcal{D} \sslash p \rfloor} \arrow["\pi_p", from=1-2, to=2-2] \arrow["{\{\src(\lfloor p \rfloor)\}}"', from=2-1, to=2-2] \arrow["{\{\lfloor p \rfloor\}}", from=1-1, to=1-2] \arrow["{\{\id(d_n)\}}"', from=1-1, to=2-1] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \end{tikzcd} \]
where \(n + 1\) is the dimension of the peak \(\lfloor p \rfloor\), and each substitution from a disc is given only by its maximal element.
\end{proposition}
\begin{proof}
As discussed in \cref{sec:prune-construction}, the substitution \(\pi_p\) sends \(\lfloor p \rfloor\) to the identity on the source of \(\lfloor p \rfloor\), which makes the square commute, as it suffices to consider the action of each substitution on \(d_{n+1}\), the maximal variable of \(D^{n+1}\). We now assume that we have substitutions \(\sigma : \lfloor \mathcal{D} \rfloor \to \Gamma\) and \(\{t\} : D^n \to \Gamma\) such that the following diagram commutes:
\[ \begin{tikzcd}[column sep = large, row sep = large] {D^{n+1}} & \lfloor \mathcal{D} \rfloor \\ D^{n} & {\lfloor \mathcal{D} \sslash p \rfloor}\\ && \Gamma \arrow["\pi_p", from=1-2, to=2-2] \arrow["{\{\src(\lfloor p \rfloor)\}}"', from=2-1, to=2-2] \arrow["{\{\lfloor p \rfloor\}}", from=1-1, to=1-2] \arrow["{\{\id(d_n)\}}"', from=1-1, to=2-1] \arrow["\sigma", curve={height=-18pt}, from=1-2, to=3-3] \arrow["{\{t\}}", curve={height=18pt}, from=2-1, to=3-3] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \end{tikzcd} \]
We immediately have that \(\lfloor p \rfloor \sub \sigma = \id(t)\). We can therefore let \(\sigma'\) be the same substitution as \(\sigma\) but with \(\lfloor p \rfloor \sub \sigma\) replaced by \(\id(t)\), and can then form the substitution:
\[ \sigma \sslash p \equiv \sigma' \sslash p : \lfloor \mathcal{D}\sslash p \rfloor \to \Gamma\]
By \cref{prop:prune-ty}, we immediately have \(\sigma = \sigma' = \pi_p \bullet \sigma \sslash p\).
The other equality follows from a diagram chase, noting that \(d_n^-\) in \(D^{n+1}\) is sent to the variable \(d_n\) in \(D^n\) by the map \(\{\id(d_n)\}\). It remains to show that the chosen universal map \(\sigma \sslash p\) is unique, but this is trivial as every variable of \(\lfloor \mathcal{D} \sslash p \rfloor\) is also a variable of \(\lfloor \mathcal{D} \rfloor\), and so the universal map is fully determined by the substitution \(\sigma\).
\end{proof}
\begin{corollary} \label{cor:insertion-pruning} Let \(\mathcal{R}\) have insertion. Then \(\mathcal{R}\) has pruning. \end{corollary}
\begin{proof}
Assume \(\mathcal{R}\) has insertion. Then take a term \(\Coh {\lfloor \mathcal{D} \rfloor} A \sigma : \Term_\Gamma\) with a peak \(p : \Peak_{\mathcal{D}}\) such that:
\[ \lfloor p \rfloor \sub \sigma \equiv \id(A,t)\]
for some term \(t\) and type \(A\) of \(\Gamma\). We then need to show that:
\[ \Gamma \vdash \Coh {\lfloor \mathcal{D} \rfloor} A \sigma = \Coh {\lfloor \mathcal{D} \sslash p\rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\]
From \(\lfloor \mathcal{D} \rfloor\) we can obtain a tree \(S\) with \(\lfloor S \rfloor \equiv \lfloor \mathcal{D} \rfloor\). Further, \(\lfloor p \rfloor\) is a locally maximal variable of \(\lfloor \mathcal{D} \rfloor\), and so there exists a branch \(P\) such that \(\lfloor \olsi P \rfloor\) is this locally maximal variable, and \(\bh(P) = \lh(P) - 1\). Then the diagram:
\[
% https://q.uiver.app/#q=WzAsMyxbMSwwLCJEXntuKzF9Il0sWzAsMSwiRF5uIl0sWzIsMSwiXFxsZmxvb3IgUyBcXHJmbG9vciJdLFswLDEsIlxceyBcXGlkKHQpIFxcfSIsMl0sWzAsMiwiXFx7IFxcbGZsb29yIHAgXFxyZmxvb3IgXFx9Il1d
% tex-fmt: skip
\begin{tikzcd} & {D^{n+1}} \\ {D^n} && {\lfloor S \rfloor} \arrow["{\{ \id(d_n) \}}"', from=1-2, to=2-1] \arrow["{\{ \lfloor p \rfloor \}}", from=1-2, to=2-3] \end{tikzcd}
\]
has two pushouts: the one given by insertion, and the one given by pruning. Therefore, we obtain an isomorphism \(\lfloor \insertion S P {D^n} \rfloor \cong \lfloor \mathcal{D} \sslash p \rfloor\). By \cref{prop:ps-context-iso}, this isomorphism must be the identity (as both pushouts exist in \textsf{Catt}), and so we can deduce that \(\pi_p = \kappa_{S,P,D^n}\) and \(\sigma \sslash p = \lfloor \insertion {\lceil \sigma \rceil} P {\{\lceil t \rceil\}} \rfloor\). Therefore, the above equality is given by an insertion along \(P\).
\end{proof}
\subsection{The insertion rule}
\label{sec:insertion-rule}
We now prove that the insertion rule set given in \cref{def:insertion-rule} satisfies the various conditions presented in \cref{sec:ruleset}. We begin with the following lemma.
\begin{lemma} \label{lem:insertion-map} Let \((S,P,T)\) be an insertion point and let \(L : S \to \U\) and \(M : T \to \U\) be labellings. Let \(f : \STerm_\U \to \STerm_{\U'}\) be any function from structured terms of \(\U\) to structured terms of \(\U'\). Then for any path \(p\) of \(\insertion S P T\) we have:
\[ f((\insertion L P M)(p)) \equiv (\insertion {(f \circ L)} P {(f \circ M)})(p)\]
where \(f \circ L\) is the result of composing \(f\) with the function component of \(L\).
\end{lemma}
\begin{proof} The proof of this follows by a simple induction on \(P\) and is given in the formalisation module \module{Catt.Tree.Insertion.Properties} by the function \func{Catt.Tree.Insertion.Properties}{label-from-insertion-map}. \end{proof}
\begin{proposition} \label{prop:insert-tame} The insertion rule set, \insert, satisfies the suspension condition.
It further satisfies the \(\mathcal{R}\)-substitution condition for any rule set \(\mathcal{R}\), and so also satisfies the weakening condition.
\end{proposition}
\begin{proof}
Let \((S,P,T,\Gamma,L,M)\) be an insertion redex and let \(A\) be a structured type of \(S\), such that:
\[ s \equiv \SCoh S A L \qquad t \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
To prove the suspension condition, we observe that \(0 :: P\) is a branch of \(\Sigma(S)\) such that \(\insertion {\Sigma(S)} {0::P} {\Sigma(T)} \equiv \Sigma(\insertion S P T)\) and \(\kappa_{\Sigma(S),0::P,\Sigma(T)} \equiv \Sigma(\kappa_{S,P,T})\) by definition. By applying \cref{lem:insertion-map} with \(f = \Sigma\), we get:
\[ \insertion {\Sigma'(L)} {P} {\Sigma'(M)} \equiv \Sigma'(\insertion L P M)\]
and so by unwrapping definitions we obtain \(\insertion {\Sigma(L)} {0 :: P} {\Sigma(M)} \equiv \Sigma(\insertion L P M)\). Therefore, we have:
\begin{align*} \Sigma(s) &\equiv \SCoh {\Sigma(S)} {\Sigma (A)} {\Sigma(L)} \\ \Sigma(t) &\equiv \SCoh {\insertion {\Sigma(S)} {0::P} {\Sigma(T)}} {\Sigma(A) \sub {\kappa_{\Sigma(S),0::P,\Sigma(T)}}} {\insertion {\Sigma(L)} {0::P} {\Sigma(M)}} \end{align*}
and so as
\[\Sigma(L)(0 :: \olsi P) \equiv \Sigma'(L)(\olsi P) \equiv \Sigma(\stdcoh T {\lh(P)} \sub M) \equiv \stdcoh {\Sigma(T)} {\lh(0::P)} \sub {\Sigma(M)}\]
we get \((\Sigma(\Gamma), \Sigma(\lfloor s \rfloor), \Sigma (\lfloor t \rfloor) ) \in \insert\) as required.
For the substitution condition we let \(\sigma : \arr \Gamma \star \Delta\) be any substitution. Then:
\[\lfloor s \rfloor \sub \sigma \equiv \lfloor \SCoh S A {L \bullet \lceil \sigma \rceil} \rfloor \qquad \lfloor t \rfloor \sub \sigma \equiv \lfloor \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {(\insertion L P M) \bullet \lceil \sigma \rceil} \rfloor\]
Again using \cref{lem:insertion-map}, this time with \(f = u \mapsto u \sub {\lceil \sigma \rceil}\), we have:
\[(\insertion L P M) \bullet \lceil \sigma \rceil \equiv \insertion {L \bullet \lceil \sigma \rceil} P {M \bullet \lceil \sigma \rceil}\]
Further, we have the equality:
\[(L \bullet \lceil \sigma \rceil)(\olsi P) \equiv L(\olsi P) \sub {\lceil \sigma \rceil} \equiv \stdcoh T {\lh(P)} \sub {M \bullet \lceil \sigma \rceil}\]
giving \((\Delta, \lfloor s \rfloor \sub \sigma, \lfloor t \rfloor \sub \sigma) \in \insert\). Hence \insert satisfies the \(\mathcal{R}\)-substitution condition for any \(\mathcal{R}\), as we made no assumption on \(\sigma\) being well-formed.
\end{proof}
We next prove the support condition for the insertion rule set. We start with the following support lemma for the exterior labelling.
\begin{lemma} \label{lem:kappa-full} Let \((S,P,T)\) be an insertion point. Then:
\[ \Supp(\kappa_{S,P,T}) = \Var(\insertion S P T)\]
That is, the exterior labelling is full.
\end{lemma}
\begin{proof} The proof proceeds by induction on \(P\); the only non-trivial case is \(P = [k]\), where we rely on \(\Supp(\{\stdty T {\lh(P)}, \stdcoh T {\lh(P)}\})\) being \(\Var(T)\). A full proof is given in the formalisation module \module{Catt.Tree.Insertion.Support}. \end{proof}
Similarly to the other rule sets introduced so far, to prove the support condition for the insertion rule set, we will take an arbitrary rule set \(\mathcal{R}\) that is tame and satisfies the support condition, and prove instead that the insertion set satisfies the \(\mathcal{R}\)-support condition.
This result can then be used as part of the strategy for proving the support condition outlined in \cref{lem:proof-strat-supp}.
\begin{proposition} \label{prop:insert-supp} Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition. Then \insert satisfies the \(\mathcal{R}\)-support condition. \end{proposition}
\begin{proof}
As in the previous proposition, let \((S,P,T,\Gamma,L,M)\) be an insertion redex and \(A\) a structured type of \(S\), such that:
\[ s \equiv \SCoh S A L \qquad t \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
We now assume that \(\Gamma \vdash_{\mathcal{R}} \lfloor s \rfloor : B\) for some \(B\) and must prove that \(\Supp(s) = \Supp(t)\). By inspecting the typing judgement, we can obtain proofs of the following typing judgements:
\[ \Gamma \vdash L : S \qquad S \vdash A \qquad \Gamma \vdash M : T\]
where the typing of \(M\) is obtained by transporting the typing of \(L(\olsi P)\) along the syntactic equality \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\). By \cref{lem:ins-comm-max}, we have:
\[ \kappa_{S,P,T} \bullet \insertion L P M \equiv^{\max} L \]
By \cref{prop:ins-typing}, both sides of this equation are well-formed and so by \cref{thm:label-max-equality}, we obtain the equality:
\[ \Gamma \vdash_{\mathcal{R}} \kappa_{S,P,T} \bullet \insertion L P M = L\]
As \(\mathcal{R}\) satisfies the support property, we get:
\begin{align*} \Supp(s) &= \Supp(L)\\ &= \Supp(\kappa_{S,P,T} \bullet \insertion L P M)\\ &= \Supp(\kappa_{S,P,T}) \sub {\insertion L P M}\\ &= \Var(\insertion S P T) \sub {\insertion L P M}&\text{by \cref{lem:kappa-full}}\\ &= \Supp(\insertion L P M) \\ &= \Supp(t) \end{align*}
and so \(\Supp(\lfloor s \rfloor) = \Supp(\lfloor t \rfloor)\) as required.
\end{proof}
Similarly to the situation in pruning, we are not able to show that the coherence over the type \(A \sub {\kappa}\) is a valid operation without knowing more about the set of operations \(\mathcal{O}\). We therefore introduce the following additional condition on the set of operations.
\begin{definition}
An operation set \(\mathcal{O}\) \emph{supports insertion} if for all insertion points \((S,P,T)\) and variable sets \(U,V \subseteq \Var(S)\) we have:
\[ (\lfloor \insertion S P T \rfloor, \lfloor U \sub {\kappa_{S,P,T}} \rfloor, \lfloor V \sub {\kappa_{S,P,T}} \rfloor) \in \mathcal{O} \]
whenever \((\lfloor S \rfloor, U, V) \in \mathcal{O}\).
\end{definition}
Using this property, we can prove the preservation condition for the insertion rule set.
\begin{proposition} \label{prop:insert-preserve} Let \(\mathcal{R}\) be a tame equality rule set and suppose the operation set \(\mathcal{O}\) supports insertion. Then the set \insert satisfies the \(\mathcal{R}\)-preservation condition. \end{proposition}
\begin{proof}
Let \((S,P,T,\Gamma,L,M)\) be an insertion redex and let \(\arr a A b\) be a structured type such that:
\[ s \equiv \SCoh S {\arr a A b} L \qquad t \equiv \SCoh {\insertion S P T} {(\arr a A b) \sub {\kappa_{S,P,T}}} {\insertion L P M} \qquad (\Gamma, \lfloor s \rfloor, \lfloor t \rfloor) \in \insert\]
We now suppose that \(\Gamma \vdash \lfloor s \rfloor : B\) and aim to prove that \(\Gamma \vdash \lfloor t \rfloor : B\).
By inspecting the typing derivation we get:
\begin{mathpar} S \vdash \arr a A b \and \Gamma \vdash L : S \and \Gamma \vdash M : T \and (\lfloor S \rfloor, \Supp(a), \Supp(b)) \in \mathcal{O} \and \Gamma \vdash (\arr a A b) \sub L = B \end{mathpar}
and so by \cref{prop:ins-typing} we have:
\[ \insertion S P T \vdash \kappa_{S,P,T} : S \qquad \Gamma \vdash \insertion L P M : \insertion S P T \]
As the operation set supports insertion, and as \(\Supp(a \sub {\kappa}) = \Supp(a) \sub \kappa\) and \(\Supp(b \sub {\kappa}) = \Supp(b) \sub \kappa\), we get:
\[ (\lfloor \insertion S P T \rfloor, \Supp(a \sub \kappa), \Supp(b \sub \kappa)) \in \mathcal{O}\]
and so we obtain:
\[ \Gamma \vdash \SCoh {\insertion S P T} {(\arr a A b) \sub \kappa} {\insertion L P M} : {(\arr a A b) \sub \kappa \sub {\insertion L P M}}\]
By \cref{lem:ins-comm-max,thm:label-max-equality}, \(\Gamma \vdash \kappa \bullet \insertion L P M = L\), and so:
\begin{align*} (\arr a A b) \sub \kappa \sub {\insertion L P M} &\equiv (\arr a A b) \sub {\kappa \bullet (\insertion L P M)}\\ &= (\arr a A b) \sub {L}\\ &= B \end{align*}
and so by applying the conversion rule we obtain \(\Gamma \vdash \lfloor t \rfloor : B\) as required.
\end{proof}
\subsection{Further properties}
\label{sec:further-properties}
It has now been proved that insertion can form part of a reasonable type theory. We now proceed to prove further properties of the insertion construction that will be critical for proving the confluence of \Cattsua in \cref{sec:cattsua}. The majority of these properties will therefore concern the interaction of insertion with other constructions and with itself. We will justify each property with up to three of the following methods:
\begin{itemize}
\item For each property, we will give a graphical depiction of the constructions involved, similar to the diagram for \cref{prop:prune-conf}, which should help build intuition for the constructions at play.
\item Where applicable, each combination of constructions will be described using the universal property from \cref{sec:univ-prop-insert}. This can be used to classify these constructions up to definitional equality.
\item As these properties are used in a confluence proof, we will need a more syntactic form than can be offered by the universal property approach. To do this we fall back to the formalisation, using the computational power of structured terms to brute-force each property.
\end{itemize}
The first two properties we consider concern the interaction of insertion with disc contexts, and will be crucial for proving confluence cases involving insertion and disc removal. Disc contexts often admit insertions, and the disc acts as a left and right unit for the insertion operation.
\paragraph{Insertion into a disc}
We begin by considering insertions into a disc. A disc context has a branch of branching height \(0\), and so if the locally maximal variable is sent to a standard coherence, then insertion can always be performed. Inserting into a disc effectively performs disc removal, replacing the entire disc with the entirety of the inner context. We illustrate this by the following diagram, where we take the branch \([0,0]\) of \(D^4\) (which we note is not the minimal branch).
\[ \insertion {
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid, Diag1] at (0,2)(x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag1,very thick] (x11.center) to (x21.center);
\draw[Diag1,very thick] (x21.center) to (x31.center);
\draw[Diag1,very thick] (x31.center) to (x41.center);
\end{scope}
\end{tikzpicture}\quad}
{[0,0]}
{\quad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid] at (-0.9,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}\qquad}
= \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.9,3)(x31) {$\bullet$};
\node [on grid] at (-0.1,3) (x32) {$\bullet$};
\node [on grid] at (-0.9,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}
\]
This property of insertion also has a simple proof by the universal property. Suppose we have the disc \(D^n\) with a branch \(P\), along which we insert a tree \(T\). Then the inserted tree is given by the following pushout.
\[ \begin{tikzcd} {D^n} & {D^n} \\ T & {\insertion {D^n} P T} \arrow["{\{\stdcoh T n\}}"', from=1-1, to=2-1] \arrow["\id", from=1-1, to=1-2] \arrow["\iota"', from=2-1, to=2-2] \arrow["\kappa", from=1-2, to=2-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \end{tikzcd} \]
By standard properties of pushouts, we have that \(\insertion {D^n} P T\) is isomorphic to \(T\). As this pushout holds in \Catt, we have a \Catt isomorphism between pasting contexts, and so by \cref{prop:ps-context-iso}, \(T = \insertion {D^n} P T\) and \(\iota = \id\). The following lemma gives syntactic versions of these properties.
\begin{lemma} \label{lem:disc-insertion-1} Let \(T\) be a tree, \(n \geq \dim(T)\), and \(P\) a branch of \(D^n\) with \(\bh(P) \leq \th(T)\). Then \(\insertion {D^n} P T = T\) and \(\iota_{D^n,P,T} \equiv \id\). Suppose further that \((D^n,P,T,\Gamma,L,M)\) is an insertion redex. Then \(\insertion L P M \equiv M\). \end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{disc-insertion}, \func{Catt.Tree.Insertion.Properties}{disc-ι}, and \func{Catt.Tree.Insertion.Properties}{disc-label-from} in the formalisation module \module{Catt.Tree.Insertion.Properties}. \end{proof}
\paragraph{Insertion of a disc}
We now consider the opposite situation, where a disc context is inserted into an arbitrary tree.
For a tree \(T\) with a branch \(P\), we can always insert the disc context \(D^{\lh(P)}\), as the trunk height condition will be satisfied by the linearity of the disc context. Inserting such a disc context makes no change to the tree \(T\), as the operation effectively replaces a branch of \(T\) (which is linear by construction) by a disc. The diagram below depicts this construction.
\[ \insertion {
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x11.center) to (x22.center);
\draw (x22.center) to (x31.center);
\draw (x22.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}\quad}
{[0,1,0,0]}
{\quad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2)(x21){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (0,4)(x41) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}\qquad}
= \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid, Diag2] at (0,1) (x11) {$\bullet$};
\node [on grid] at (-0.5,2)(x21){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x22){$\bullet$};
\node [on grid, Diag2] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw[Diag2] (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw[Diag2] (x11.center) to (x22.center);
\draw[Diag2] (x22.center) to (x31.center);
\draw (x22.center) to (x32.center);
\draw[Diag2] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\]
Similarly to insertion into a disc, the insertion of a disc can be characterised by its universal property. Take any tree \(T\) with a branch \(P\). Then, writing \(n = \lh(P)\), the tree \(\insertion T P {D^{n}}\) is given by the following pushout:
\[ \begin{tikzcd} {D^n} & T \\ {D^n} & {\insertion T P {D^n}} \arrow["{\{\olsi P\}}", from=1-1, to=1-2] \arrow["{\{\stdcoh {D^n} n\}}"', from=1-1, to=2-1] \arrow["\iota"', from=2-1, to=2-2] \arrow["\kappa", from=1-2, to=2-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \end{tikzcd} \]
The situation here is less clear than before, as the map \(D^n \to D^n\) is not the identity. However, in the presence of disc removal this map becomes equal to the identity, and in this case a similar argument can be made to determine that \(\kappa\) should be the identity and \(\insertion T P {D^{\lh(P)}}\) should be equal to the tree \(T\).
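In list notation this unit property is easily checked on a small case: taking \(T = [[\,],[\,]]\) and \(P = [0]\), so that \(\lh(P) = 1\) and \(D^{\lh(P)} = [[\,]]\), we have:
\[ \insertion {[[\,],[\,]]} {[0]} {[[\,]]} = [[\,],[\,]] \]
as the single subtree of the disc replaces the linear subtree of \(T\) at position \(0\), leaving the tree unchanged.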
The results are given in the lemma below:
\begin{lemma} \label{lem:disc-insertion-2} Let \((T,P,D^{\lh(P)},\Gamma,L,M)\) be an insertion redex. Then:
\[\insertion T P {D^{\lh(P)}} \equiv T \qquad \insertion L P M \equiv^{\mathsf{max}} L\]
We further have:
\[T \vdash_{\mathcal{R}} \kappa_{T,P,D^{\lh(P)}} =^{\mathsf{max}} \id_{T}\]
if \(\mathcal{R}\) is a (tame) equality rule set which has disc removal.
\end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-disc} and \func{Catt.Tree.Insertion.Properties}{disc-label-from-2} in the formalisation module \module{Catt.Tree.Insertion.Properties} and \funcn{Catt.Typing.Insertion.Equality}{10459}{κ-disc} in \module{Catt.Typing.Insertion.Equality}. \end{proof}
\paragraph{Insertion of an endo-coherence}
We now turn our attention to the interaction between insertion and endo-coherence removal. Unlike in \Cattsu, the locally maximal argument in an insertion redex need not be in normal form. In particular, since the only condition on the locally maximal argument is that it is a standard coherence, it may be an endo-coherence. In such a situation there are two distinct ways of applying equalities:
\begin{itemize}
\item The endo-coherence could be directly inserted into the head term.
\item The endo-coherence could be transformed into an identity on a standard coherence (see \cref{thm:std-ecr}) after which the head term could undergo two insertions, the first of which ``prunes'' the identity, and the second of which inserts the locally maximal argument of the pruned identity.
\end{itemize}
As the insertion of an identity acts in a similar way to pruning (see \cref{cor:insertion-pruning}), we re-use the notation.
\begin{definition}
Let \(S\) be a tree, and \(P\) be a branch of \(S\). Then define:
\[ S \sslash P = \insertion S P {D^{\lh(P) - 1}} \qquad \pi_P = \kappa_{S,P,D^{\lh(P) - 1}}\]
where we note that \((S,P,D^{\lh(P)-1})\) is always an insertion point.
\end{definition}
To perform the second equality path of pruning an identity followed by inserting the maximal argument of that identity, we must obtain a branch of the pruned context \(S \sslash P\). This can be done when \(\lh(P) - \bh(P) \geq 2\) by taking the same list as \(P\), as depicted in \cref{fig:pruned-branch}. We name such a branch the \emph{pruned branch}.
\begin{definition}
Let \(S\) be a tree, and \(P\) be a branch of \(S\) with \(\lh(P) - \bh(P) \geq 2\). We then define the \emph{pruned branch} \(P'\) of \(S \sslash P\) to be given by the same list as \(P\).
\end{definition}
If \(\lh(P) - \bh(P) = 1\) (noting that \(\lh(P) - \bh(P)\) cannot be zero) then pruning the branch \(P\) removes the branch entirely, and so the condition \(\lh(P) - \bh(P) \geq 2\) is necessary to form the pruned branch. It is clear that \(\bh(P') = \bh(P)\) and \(\lh(P') = \lh(P) - 1\).
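For instance, in list notation, take \(S = [[[\,]],[\,]]\) and \(P = [0]\), so that \(\lh(P) = 2\) and \(\bh(P) = 0\). Then \(D^{\lh(P) - 1} = D^1 = [[\,]]\) and:
\[ S \sslash P = \insertion S P {D^1} = [[\,],[\,]] \]
with the pruned branch \(P'\) given by the same list \([0]\), now with \(\lh(P') = 1\) and \(\bh(P') = 0\).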
\begin{figure}[ht]
\centering
\begin{subfigure}{0.45\linewidth}
\centering
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\node [on grid] at (-0.8,2)(bh) {};
\node [left=0 of bh ,on grid] {$\bh(P)$};
\node [on grid] at (-0.8,4)(lh) {};
\node [left=0 of lh ,on grid] {$\lh(P)$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\draw [<->] (bh.center) to (lh.center);
\draw [dotted, very thick] (bh) to (x21);
\draw [dotted, very thick] (lh) to (x41);
\end{tikzpicture}
\caption{Tree \(S\) and branch \(P = [1,0,0]\).}
\end{subfigure}
\begin{subfigure}{0.45\linewidth}
\centering
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\caption{Tree \(S \sslash P\) and branch \(P' = [1,0,0]\).}
\end{subfigure}
\caption{The pruned branch.}
\label{fig:pruned-branch}
\end{figure}
We also note that the path \(\olsi{P'}\) is the maximal argument of the labelling \(\iota_{S,P,D^{\lh(P) - 1}}\), the inclusion of \(D^{\lh(P) - 1}\) into \(S \sslash P\). Insertion along the pruned branch is then characterised by the following pushout.
\[ \begin{tikzcd} {D^n} & S \\ {D^{n-1}} & {S \sslash P} \\ T & {\insertion {(S \sslash P)} {P'} T} \\ &&& \U\\ &&& \U \arrow["{\{\olsi P\}}", from=1-1, to=1-2] \arrow["{\pi_P}", from=1-2, to=2-2] \arrow["{\{\olsi {P'}\}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh {D^{n-1}} n\}}"', from=1-1, to=2-1] \arrow["{\{ \stdcoh T {n-1} \}}"', from=2-1, to=3-1] \arrow["\kappa", from=2-2, to=3-2] \arrow["\iota"', from=3-1, to=3-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-2, to=1-1] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-2, to=2-1] \arrow["L", curve={height=-36pt}, from=1-2, to=4-4] \arrow["M"', curve={height=30pt}, from=3-1, to=5-4] \arrow["{\insertion L P {(\{\stdcoh T {n-1}\} \bullet M)}}"{sloped}, from=2-2, to=4-4, dashed] \arrow["{\insertion {(\insertion L P {(\{\stdcoh T {n-1}\} \bullet M)})} {P'} {M}}"'{sloped, pos=0.45}, from=3-2, to=5-4, dashed] \arrow[equal, nfold, from=4-4, to=5-4] \end{tikzcd} \]
The top pushout is from the construction of \(S \sslash P\), noting that \(\iota_{S,P,D^{\lh(P) - 1}} \equiv \{\olsi {P'}\}\).
The bottom pushout is from the construction of the insertion along the pruned branch. By the pasting lemma for pushouts, the whole outer rectangle is also a pushout along the maps \(\{\olsi P\}\) and \(\{\stdcoh {D^{n-1}} n\} \bullet \{\stdcoh T {n-1}\}\). In the presence of endo-coherence removal we have:
\[ \{\stdcoh {D^{n-1}} n\} \bullet \{\stdcoh T {n-1}\} = \{\stdcoh T n\}\]
by \cref{thm:std-ecr}, and so the outer pushout rectangle is the pushout generated by directly inserting the endo-coherence. There are two ways to form the unique map \(\insertion {(S \sslash P)} {P'} {T} \to \U\): one by the outer pushout rectangle, which gives the map \(\insertion L P M\), and one by first using the top pushout square with the maps \(L\) and \(\{\stdcoh T {n-1}\} \bullet M\) to get a map \(S \sslash P \to \U\), and then using this map with the bottom pushout square and \(M\) to get the morphisms depicted in the commutative diagram. These results appear in the next lemma.
\begin{lemma} \label{lem:pruned-bp} Suppose \(S\) has a branch \(P\) with \(\lh(P) - \bh(P) \geq 2\). Then \(\iota_{S,P,D^{\lh(P) - 1}} \equiv \{ \olsi {P'} \} \). Further suppose that \((S,P,T)\) is an insertion point. Then if the (tame) rule set \(\mathcal{R}\) has disc removal and endo-coherence removal we get:
\[\insertion {(S \sslash P)} {P'} T = \insertion S P T \qquad \U \vdash_{\mathcal{R}} \pi_P \bullet \kappa_{S \sslash P,P',T} =^{\max} \kappa_{S,P,T} \]
If we further have that \((S,P,T,\U,L,M)\) is an insertion redex then:
\[\insertion {(\insertion {L} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} {P'} {M} \equiv^{\max} \insertion L P M\]
\end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-tree-pruned-branch}, \func{Catt.Tree.Insertion.Properties}{pruned-branch-prop}, and \func{Catt.Tree.Insertion.Properties}{label-from-pruned-branch} in the formalisation module \module{Catt.Tree.Insertion.Properties}, and \funcn{Catt.Typing.Insertion.Equality}{3281}{pruned-branch-κ} in \module{Catt.Typing.Insertion.Equality}. \end{proof}
\paragraph{Branch irrelevance}
As has already been noted, a tree \(S\) may admit multiple branches \(P\) and \(Q\) which represent the same locally maximal variable, that is \(\olsi P \equiv \olsi Q\). If there is an insertion that can be applied along either branch \(P\) or \(Q\), then it does not matter which branch we choose. This can be immediately seen from the universal property: the pushout square for an insertion point \((S,P,T)\) only mentions the path \(\olsi P\) and never uses the actual branch \(P\).
\begin{lemma} \label{lem:insertion-irrel} Suppose \((S,P,T)\) and \((S,Q,T)\) are insertion points with \(\olsi P \equiv \olsi Q\). Then \(\insertion S P T \equiv \insertion S Q T\) and \(\kappa_{S,P,T} \equiv^{\mathsf{max}} \kappa_{S,Q,T}\). If we further have \(L : S \to \Gamma\) and \(M : T \to \Gamma\), then \(\insertion L P M \equiv^{\mathsf{max}} \insertion L Q M\). \end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-irrel}, \func{Catt.Tree.Insertion.Properties}{κ-irrel}, and \func{Catt.Tree.Insertion.Properties}{irrel-label-from} in the formalisation module \module{Catt.Tree.Insertion.Properties}. \end{proof}
It is natural to ask why we define branches at all, rather than identifying the points where insertion can be performed by a maximal path, implicitly taking the branch of minimal branching height.
While this could be done, it would make other confluence cases more difficult, as the branch associated to a maximal path could significantly change if a different branch is pruned from the tree.
\paragraph{Parallel insertion}
We now begin to consider the interaction of insertion with itself. In contrast to the previous case, we now consider two branches \(P\) and \(Q\) such that \(\olsi P\) and \(\olsi Q\) are not the same maximal path, in which case we say the branches \(P\) and \(Q\) are \emph{parallel}. Assume we have a tree \(S\) such that \((S, P, T)\) and \((S,Q, U)\) are insertion points. We then aim to perform both insertions, and prove that the order in which they occur is irrelevant. To do this we must form a branch of the inserted tree \(\insertion S P T\), which is intuitively given by the branch \(Q\), but such a branch must be adapted to the new inserted tree.
\begin{definition}
Let \((S, P, T)\) be an insertion point and let \(Q\) be a branch of \(S\) such that \(\olsi P \neq \olsi Q\). Then we define the branch \(\insertion Q P T\) of \(\insertion S P T\) by induction on \(P\) and \(Q\).
\begin{itemize}
\item Suppose \(P = [k]\) and \(Q = j :: x\). Then if \(j < k\) we let \(\insertion Q P T = Q\). Otherwise, we let:
\[\insertion Q P T = (j + \len(T) - 1) :: x\]
\item Suppose \(P = k :: P_2\) and \(Q = j :: x\). If \(j \neq k\) then let \(\insertion Q P T = Q\). Otherwise, both \(P_2\) and \(x\) are branches of \(S_k\) and so we let
\[\insertion Q P T = k :: \insertion x {P_2} T\]
\end{itemize}
It is clear that \(\insertion Q P T\) satisfies the condition for being a branch; a small example in list notation is given below.
\end{definition}
The maximal path associated to the branch \(\insertion Q P T\) is obtained by applying the labelling \(\kappa\) to the maximal path associated to \(Q\). That is:
\[ \olsi {\insertion Q P T} \equiv \olsi Q \sub {\kappa_{S,P,T}}\]
A graphical example of such a situation is given in \cref{fig:ins-parallel}, where we note how the right branch changes after the left-hand insertion is performed. We also note that the final trees at the bottom of the diagram are coloured slightly differently, which corresponds to the inserted labellings from these trees being different. To remedy this, we introduce a variant of the inserted labelling, which takes arguments from the head labelling instead of the argument labelling wherever possible.
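As an example of the adjusted branch, take \(S = [[\,],[[\,]]]\), \(P = [0]\), and \(T = [[\,],[\,]]\), so that \(\len(T) = 2\), and let \(Q = [1,0]\). Here \(P = [k]\) with \(k = 0\) and \(Q = j :: x\) with \(j = 1\) and \(x = [0]\), so the second clause of the first case applies and we can compute:
\[ \insertion S P T = [[\,],[\,],[[\,]]] \qquad \insertion Q P T = (1 + 2 - 1) :: [0] = [2,0] \]
so the adjusted branch reaches the same leaf that \(Q\) reached in \(S\), with its first index shifted past the two subtrees of \(T\).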
\begin{figure}[ht] \centering \newsavebox\redbase \sbox\redbase{\( \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag1] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0,2) (x22){$\bullet$}; \node [on grid] at (0.5,2) (x23){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw (x11.center) to (x23.center); \end{tikzpicture} \quad\mathop{{}_{[1,0]}\mathord{\gg}}\quad \insertion{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.5,2) (x21){$\bullet$}; \node [on grid, Diag2] at (0.5,2) (x22){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1, very thick] (x12.center) to (x21.center); \draw[Diag2, very thick] (x12.center) to (x22.center); \end{scope} \end{tikzpicture}\quad }{[1,1]}{\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture}}\)} \newsavebox\redleft \sbox{\redleft}{\(\insertion{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid, Diag1] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag1] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.6,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.2,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0.2,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.6,2) (x24){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2, very thick] (x12.center) to (x24.center); \end{scope} \end{tikzpicture}\quad }{[1,3]}{\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0.5,2) (x22){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \end{tikzpicture}}\)} \newsavebox\redright \sbox{\redright}{\( \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base),Diag1] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (-0.5,2)(x21){$\bullet$}; \node [on grid] at (0,2) (x22){$\bullet$}; \node [on grid] at (0.5,2) (x23){$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x11.center) to (x22.center); \draw (x11.center) to (x23.center); \end{tikzpicture} 
\quad\mathop{{}_{[1,0]}\mathord{\gg}}\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag2] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.5,2) (x21){$\bullet$}; \node [on grid, Diag2] at (0,2) (x22){$\bullet$}; \node [on grid, Diag2] at (0.5,2) (x23){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1, very thick] (x12.center) to (x21.center); \draw[Diag2] (x12.center) to (x22.center); \draw[Diag2] (x12.center) to (x23.center); \end{scope} \end{tikzpicture}\)} \newsavebox\redleftbot \sbox{\redleftbot}{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag2] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag2] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.8,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.4,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.4,2) (x24){$\bullet$}; \node [on grid, Diag2] at (0.8,2) (x25){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2] (x12.center) to (x24.center); \draw[Diag2] (x12.center) to (x25.center); \end{scope} \end{tikzpicture}} \newsavebox\redrightbot \sbox{\redrightbot}{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x11.base)] \node [on grid,Diag1] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid,Diag1] at (0,1)(x12){$\bullet$}; \node [on grid] at (0.5,1)(x13){$\bullet$}; \node [on grid, Diag1] at (-0.8,2) (x21){$\bullet$}; \node [on grid, Diag1] at (-0.4,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,2) (x23){$\bullet$}; \node [on grid, Diag2] at (0.4,2) (x24){$\bullet$}; \node [on grid, Diag2] at (0.8,2) (x25){$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw (x01.center) to (x13.center); \draw[Diag1] (x12.center) to (x21.center); \draw[Diag1] (x12.center) to (x22.center); \draw[Diag1] (x12.center) to (x23.center); \draw[Diag2] (x12.center) to (x24.center); \draw[Diag2] (x12.center) to (x25.center); \end{scope} \end{tikzpicture}} \begin{tikzpicture} \node(redbase) at (0,0) {\fcolorbox{gray}{white}{\usebox\redbase}}; \node(redleft) at (-4,-5){\fcolorbox{gray}{white}{\usebox\redleft}}; \node(redright) at (4,-5){\fcolorbox{gray}{white}{\usebox\redright}}; \node(redleftbot) at (-4,-9){\fcolorbox{gray}{white}{\usebox\redleftbot}}; \node(redrightbot) at (4,-9){\fcolorbox{gray}{white}{\usebox\redrightbot}}; \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redbase) to (redleft); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redbase) to (redright); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, 
segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redleft) to (redleftbot);
\draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (redright) to (redrightbot);
\end{tikzpicture}
\caption{Parallel insertions.}
\label{fig:ins-parallel}
\end{figure}
\begin{definition}
We define an alternative to the inserted labelling as follows: given an insertion point \((S, P, T)\) with \(L : S \to \U\) and \(M : T \to \U\), we define the \emph{alternative inserted labelling} \(\insertionprime L P M : {\insertion S P T} \to \U\). Let
\[ S = [S_0,\dots,S_n] \qquad L = s_0 \{L_0\}s_1 \cdots \{L_n\}s_{n+1} : A\]
and then proceed by induction on \(P\).
\begin{itemize}
\item Let \(P = [k]\), and
\[ T = [T_0,\dots,T_m] \qquad M = t_0\{M_0\}t_1 \cdots \{M_m\}t_{m+1} : B\]
Then define \(\insertionprime L {[k]} M\) to be:
\[s_0\{L_0\}s_1 \cdots \{L_{k-1}\}\mathbf{s_k}\{M_0\}t_1\cdots \{M_m\}\mathbf{s_{k+1}}\{L_{k+1}\}s_{k+2}\cdots \{L_n\}s_{n+1} : A\]
\item Suppose \(P = k :: Q\) so that
\[T = [T_0] \qquad M = t_0\{M_0\}t_1 : B\]
Define \(\insertionprime L P M\) as:
\[s_0\{L_0\}s_1\cdots \{L_{k-1}\}\mathbf{s_k}\{\insertion {L_k} {Q} {M_0}\}\mathbf{s_{k+1}}\{L_{k+1}\}s_{k+2} \cdots \{L_n\}s_{n+1} : A\]
\end{itemize}
The terms that differ from the regular inserted labelling are written in bold. In the edge case where \(M = \emp\), we arbitrarily use \(s_k\) instead of \(s_{k+1}\) for the definition of \(\insertionprime L {[k]} M\).
\end{definition}
It is immediate that the alternative inserted labelling differs from the inserted labelling only up to definitional equality.
\begin{proposition} \label{prop:insertion-prime-eq} Let \((S,P,T,\U,L,M)\) be an insertion redex. Then:
\[\insertionprime L P M = \insertion L P M\]
\end{proposition}
\begin{proof} See the function \func{Catt.Tree.Insertion.Typing}{label-from-insertion-eq} in the module \module{Catt.Tree.Insertion.Typing}. \end{proof}
We now examine the universal property of parallel insertion. This is given by the following diagram, where we insert along \(P\) first, followed by \(Q\), letting \(n = \lh(P)\) and \(m = \lh(Q)\).
\[ \begin{tikzcd}[row sep = large] & {D^n} & T \\ {D^m} & S & {\insertion S P T} \\ U && {\insertion {(\insertion S P T)} {\insertion Q P T} U} \arrow["{\{\olsi P\}}", from=1-2, to=2-2] \arrow["{\{\olsi Q\}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\{\stdcoh T n\}}", from=1-2, to=1-3] \arrow["{\iota_{S,P,T}}", from=1-3, to=2-3] \arrow["{\kappa_{S,P,T}}"', from=2-2, to=2-3] \arrow["{\kappa_{\insertion S P T,\insertion Q P T, U}}", from=2-3, to=3-3] \arrow["{\iota_{\insertion S P T, \insertion Q P T, U}}"', from=3-1, to=3-3] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-3, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-3, to=2-1] \end{tikzcd} \]
Here, the top pushout square is given by the insertion along \(P\), and the bottom square is given by the insertion along \(\insertion Q P T\), noting that:
\[ \{ \olsi Q \} \bullet \kappa_{S,P,T} \equiv \{ \olsi {\insertion Q P T}\}\]
The construction is therefore given by the colimit of the top-left border of the diagram. By a symmetric argument, it can be seen that performing the insertions in the opposite order also leads to a colimit of the same diagram. We now state the lemma that makes these ideas precise.
\begin{lemma} \label{lem:insertion-different} Let \((S,P,T)\) and \((S,Q,U)\) be insertion points such that \(\olsi P \not\equiv \olsi Q\). Then we have:
\begin{align*} \insertion {(\insertion S P T)} {\insertion Q P T} U &\equiv \insertion {(\insertion S Q U)} {\insertion P Q U} T\\ \kappa_{S,P,T} \bullet \kappa_{\insertion S P T, \insertion Q P T, U} &\equiv^{\max} \kappa_{S,Q,U} \bullet \kappa_{\insertion S Q U, \insertion P Q U, T} \intertext{Further:} \insertionprime {(\insertion L P M)} {\insertion Q P T} N &\equiv^{\max} \insertionprime {(\insertion L Q N)} {\insertion P Q U} M \end{align*}
for any insertion redexes \((S,P,T,\U,L,M)\) and \((S,Q,U,\U,L,N)\).
\end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-parallel}, \funcn{Catt.Tree.Insertion.Properties}{33917}{κ-parallel}, and \funcn{Catt.Tree.Insertion.Properties}{40709}{label-from-parallel} in the formalisation module \module{Catt.Tree.Insertion.Properties}. \end{proof}
\paragraph{Boundaries of inserted trees}
We now work towards the most complex property of insertion, the action of insertion on an insertable argument. To do this, we must first understand the action of insertion on standard coherences, which itself requires an understanding of how insertion interacts with the boundary inclusion maps of trees. There are two fundamental cases for the boundary of an inserted tree:
\begin{itemize}
\item The boundary has a low enough dimension that it is unaffected by the insertion. In this case applying the boundary to the inserted tree is the same as applying the boundary to the original tree.
\item The boundary has a high enough dimension that the boundary of the original tree still contains the insertion branch. In this case applying the boundary to the inserted tree is the same as inserting into the boundary of the original tree along this branch.
\end{itemize}
We begin with the first case. Suppose we have an insertion point \((S, P, T)\) and a dimension \(n \in \mathbb{N}\). The main criterion for the boundary having no interaction with the insertion is that:
\[ n \leq \th(T) \]
When this condition holds, taking the \(n\)-boundary of \(T\) returns a linear tree, and we have already seen that inserting linear trees has no effect on the head tree. We illustrate this case in the diagram below, where the tree \(T\) has trunk height \(3\) and we set \(n = 2\). The dashed line represents taking the boundary operation, and it is easy to see that the \(2\)-boundary of \(S\) and the \(2\)-boundary of the inserted tree \(\insertion S P T\) are the same.
\[\insertion{
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (-0.5,2) (x21){$\bullet$};
\node [on grid] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid, Diag1] at (0.5,3)(x32) {$\bullet$};
\node [on grid, Diag1] at (0.5,4)(x41) {$\bullet$};
\draw[dashed, thick] (-1,2.5) to (1,2.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\draw (x12.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw[very thick,Diag1] (x22.center) to (x32.center);
\draw[very thick,Diag1] (x32.center) to (x41.center);
\end{scope}
\end{tikzpicture}\quad}
{[1,0,0]}
{\quad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base),Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (0,3)(x31) {$\bullet$};
\node [on grid] at (-0.5,4)(x41) {$\bullet$};
\node [on grid] at (0.5,4)(x42) {$\bullet$};
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x31.center) to (x41.center);
\draw (x31.center) to (x42.center);
\end{tikzpicture}}
\qquad = \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag2] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (-0.5,2) (x21){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x22){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid, Diag2] at (0.5,3)(x32) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid, Diag2] at (1,4)(x42) {$\bullet$};
\draw[dashed, thick] (-1,2.5) to (1,2.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag2] (x01.center) to (x12.center);
\draw (x11.center) to (x21.center);
\draw[Diag2] (x12.center) to (x22.center);
\draw (x21.center) to (x31.center);
\draw[Diag2] (x22.center) to (x32.center);
\draw[Diag2] (x32.center) to (x41.center);
\draw[Diag2] (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\]
As well as knowing about the interaction of the boundary with the inserted tree, we also need to investigate the interaction of the inclusion maps with the exterior labelling. In this first case, we would hope to prove that:
\[ \incbd n - S \bullet \kappa_{S,P,T} \equiv \incbd n - {\insertion S P T}\]
Since \(\bound n {\insertion S P T} \equiv \bound n S\), there are two ways to include the boundary \(\bound n S\) into \(\insertion S P T\). The right-hand side of the above equation directly includes \(\bound n {\insertion S P T}\) into \(\insertion S P T\), while the left-hand side first includes \(\bound n S\) into \(S\) and then maps \(S\) onto \(\insertion S P T\) via the exterior labelling. There is a catch with proving this equality: the exterior labelling sends \(\olsi P\) to a standard coherence, and so if \(\incbd n - S\) has \(\olsi P\) in its image, the equality cannot hold syntactically. We therefore further require that \(n < \lh(P)\), which ensures this cannot happen. We now state these results in the following lemma.
\begin{lemma} \label{lem:insertion-bd-1} Let \(n \in \mathbb{N}\) and suppose \((S,P,T)\) is an insertion point such that \(n \leq \th(T)\). Then:
\[ \bound n S \equiv \bound n {\insertion S P T}\]
If we further have \(n < \lh(P)\) then:
\[ \incbd n \epsilon S \bullet \kappa_{S,P,T} \equiv^{\mathsf{max}} \incbd n \epsilon {\insertion S P T}\]
for \(\epsilon \in \{-,+\}\).
\end{lemma}
\begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-bd-1} and \func{Catt.Tree.Insertion.Properties}{bd-κ-comm-1} in the formalisation module \module{Catt.Tree.Insertion.Properties}. \end{proof}
We now move to the second case. We again suppose we have an insertion point \((S,P,T)\) and a dimension \(n \in \mathbb{N}\). To perform an insertion into the boundary \(\bound n S\), the dimension \(n\) must be high enough not to remove the branch \(P\) from \(S\). More specifically, we must have the inequality:
\[ n > \bh(P)\]
which ensures that the list \(P\) is still a branch of \(\bound n S\).
\begin{definition}
Let \(S\) be a tree with a branch \(P\), and let \(n > \bh(P)\). Then there is a branch \(\bound n P\) of \(\bound n S\) given by the same list as \(P\) with \(\bh(\bound n P) = \bh(P)\).
\end{definition}
As \(\th(\bound n T) \geq \bh(P)\) when \(\th(T) \geq \bh(P)\) and \(n > \bh(P)\), we are able to insert the tree \(\bound n T\) into \(\bound n S\) along the branch \(\bound n P\). This is depicted in the following diagram, where \(\bh(P) = 2\) and \(n = 3\). In this diagram, the insertion \(\insertion S P T\) is drawn, and dashed lines are drawn across each tree where they would be truncated by the boundary operation. Crucially, the branch is still well-formed below this line, and performing the insertion on the truncated trees yields the truncation of the inserted tree.
\[\insertion{
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid] at (0.5,1)(x12){$\bullet$};
\node [on grid] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag1] at (0,3)(x31) {$\bullet$};
\node [on grid] at (1,3) (x32) {$\bullet$};
\node [on grid, Diag1] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\draw [dashed, thick] (-0.5,3.5) to (1.5,3.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw (x01.center) to (x12.center);
\draw (x12.center) to (x21.center);
\draw[very thick,Diag1] (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw[very thick,Diag1] (x31.center) to (x41.center);
\draw (x32.center) to (x42.center);
\end{scope}
\end{tikzpicture}\quad}
{[1,0,0]}
{\quad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2]
\node [on grid] at (0,0) (x01) {$\bullet$};
\node [on grid] at (0,1) (x11) {$\bullet$};
\node [on grid] at (0,2) (x21){$\bullet$};
\node [on grid] at (-0.5,3)(x31) {$\bullet$};
\node [on grid] at (0.5,3) (x32) {$\bullet$};
\node [on grid] at (-0.5,4)(x41) {$\bullet$};
\draw [dashed, black, thick] (-1,3.5) to (1,3.5);
\draw (x01.center) to (x11.center);
\draw (x11.center) to (x21.center);
\draw (x21.center) to (x31.center);
\draw (x21.center) to (x32.center);
\draw (x31.center) to (x41.center);
\end{tikzpicture}}
\qquad = \qquad
\begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)]
\node [on grid, Diag2] at (0,0) (x01) {$\bullet$};
\node [on grid] at (-0.5,1) (x11) {$\bullet$};
\node [on grid, Diag2] at (0.5,1)(x12){$\bullet$};
\node [on grid, Diag2] at (0.5,2) (x21){$\bullet$};
\node [on grid, Diag2] at (0,3)(x31) {$\bullet$};
\node [on grid, Diag2] at (0.5,3) (x32) {$\bullet$};
\node [on grid] at (1,3) (x33) {$\bullet$};
\node [on grid, Diag2] at (0,4)(x41) {$\bullet$};
\node [on grid] at (1,4)(x42) {$\bullet$};
\draw [dashed,thick] (-0.5,3.5) to (1.5,3.5);
\begin{scope}[on background layer]
\draw (x01.center) to (x11.center);
\draw[Diag2] (x01.center) to (x12.center);
\draw[Diag2] (x12.center) to (x21.center);
\draw[Diag2] (x21.center) to (x31.center);
\draw[Diag2] (x21.center) to (x32.center);
\draw (x21.center) to (x33.center);
\draw[Diag2] (x31.center) to (x41.center);
\draw (x33.center) to (x42.center);
\end{scope}
\end{tikzpicture}
\]
As with the previous case, we explore the interaction of the boundary inclusion labellings and the exterior labelling. We aim to give conditions under which:
\[ \incbd n - S \bullet \kappa_{S,P,T} \equiv \kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n - {\insertion S P T}\]
We examine the action of each side of the equation on the path \(\olsi {\bound n P}\). On the right-hand side, this path is sent by \(\kappa\) to a standard coherence, and so on the left-hand side, \((\incbd n - S)(\olsi {\bound n P}) \) must also be sent to a standard coherence by \(\kappa\). If \((\incbd n - S)( \olsi {\bound n P})\) is a maximal path, which will always be the case when \(n \geq \lh(P)\), then it will be sent to a standard coherence. Alternatively, if \(n \leq \lh(P)\) then \(\lh(\bound n P) = n\), and if additionally \(n > \th(T)\) then the standard term returned by \(\kappa_{S,P,T}\) will be a standard coherence. These conditions lead to the following lemma.
\begin{lemma} \label{lem:insertion-bd-2} Let \(n \in \mathbb{N}\) and suppose \((S,P,T)\) is an insertion point with \(n > \bh(P)\). Then: \[ \insertion {\bound n S} {\bound n P} {\bound n T} \equiv \bound n {\insertion S P T} \] Suppose further that one of the following holds: \begin{enumerate} \item \(n > \th(T)\) and \(n \leq \lh(P)\) \item \(n \geq \lh(P)\) \end{enumerate} Then: \[ \incbd n \epsilon S \bullet \kappa_{S,P,T} \equiv^{\mathsf{max}} \kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n \epsilon {\insertion S P T} \] for \(\epsilon \in \{-,+\}\). \end{lemma} \begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-bd-2} and \funcn{Catt.Tree.Insertion.Properties}{50068}{bd-κ-comm-2} in the formalisation module \module{Catt.Tree.Insertion.Properties}. \end{proof} Both of the further conditions in \cref{lem:insertion-bd-2} imply that \(n > \bh(P)\). We have therefore seen three conditions that can be put on \(n\), \(P\), and \(T\): \begin{itemize} \item \(n \leq \th(T)\) and \(n < \lh(P)\), \item \(n > \th(T)\) and \(n \leq \lh(P)\), \item \(n \geq \lh(P)\). \end{itemize} One of these conditions must always hold for any \(n\) and insertion point \((S,P,T)\), and hence one of \cref{lem:insertion-bd-1,lem:insertion-bd-2} can always be applied. \begin{remark} The further conditions in each of \cref{lem:insertion-bd-1,lem:insertion-bd-2} could be dropped in favour of weakening the syntactic equalities to definitional equalities in a theory with disc removal, as this would remove the distinction between standard terms and standard coherences. It was, however, more convenient to work with syntactic equalities in the formalisation, and although the extra side conditions may seem arbitrary, the key result is that one of the above lemmas always applies. \end{remark} \paragraph{Insertion into standard constructions} Equipped with \cref{lem:insertion-bd-1,lem:insertion-bd-2}, we can now prove that the standard constructions are preserved by the exterior labelling, up to definitional equality in a theory with insertion and disc removal. We begin with the following lemma, whose intuition is clear from the universal property of insertion. \begin{lemma} \label{lem:kappa-iota-insert} Suppose \((S,P,T)\) is an insertion point. Then \(\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}} \equiv \id_{\insertion S P T}\). \end{lemma} \begin{proof} See \func{Catt.Tree.Insertion.Properties}{κ-ι-prop} in \module{Catt.Tree.Insertion.Properties}. \end{proof} We can then proceed to the main theorem of this section. \begin{theorem} \label{thm:std-insert-props} Let \(\mathcal{R}\) be a tame equality rule set that has disc removal and insertion.
Then for any insertion point \((S,P,T)\) and \(n \in \mathbb{N}\), we have: \[\insertion S P T \vdash_{\mathcal{R}} \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}}\] \[\insertion S P T \vdash_{\mathcal{R}} \stdty S n \sub {\kappa_{S,P,T}} = \stdty {\insertion S P T} n\] for \(\epsilon \in \{-,+\}\) and if \(n \geq \dep(S)\) then: \[\insertion S P T \vdash_{\mathcal{R}} \stdcoh S n \sub {\kappa_{S,P,T}} = \stdcoh {\insertion S P T} n \qquad \insertion S P T \vdash_{\mathcal{R}}\stdtm S n \sub {\kappa_{S,P,T}} = \stdtm {\insertion S P T} n \] \end{theorem} \begin{proof} We prove all three properties by mutual induction. We begin with the equality: \[ \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}}\] The conditions for one of \cref{lem:insertion-bd-1,lem:insertion-bd-2} must hold, and so we treat each case separately. If the conditions for \cref{lem:insertion-bd-1} hold then the required equality is immediately implied by \(\bound n {\insertion S P T} \equiv \bound n S\) and \(\incbd n \epsilon S \bullet \kappa_{S,P,T} \equiv \incbd n \epsilon {\insertion S P T}\). If instead the conditions for \cref{lem:insertion-bd-2} hold then: \begin{align*} \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} &\equiv \stdtm {\bound n S} n \sub {\kappa_{\bound n S, \bound n P, \bound n T} \bullet \incbd n \epsilon {\insertion S P T}}\\ &\equiv \stdtm {\bound n S} n \sub {\kappa_{\bound n S, \bound n P, \bound n T}} \sub {\incbd n \epsilon {\insertion S P T}}\\ &= \stdtm {\insertion {\bound n S} {\bound n P} {\bound n T}} n \sub{\incbd n \epsilon {\insertion S P T}}\\ &\equiv \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}} \end{align*} where the definitional equality is due to the inductive hypothesis on terms. We continue to the case for types. If \(n = 0\), then both sides of the equality are \(\star\). Otherwise, consider the \(n + 1\) case, where we have: \begin{alignat*}{3} \stdty S {n+1} \sub {\kappa_{S,P,T}} \equiv{} &\stdtm {\bound n S} n \sub {\incbd n - S} \sub {\kappa_{S,P,T}} &\qquad& \stdty {\insertion S P T} {n+1} \equiv{}&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n - {\insertion S P T}} \\ &\to_{\stdty S n \sub {\kappa_{S,P,T}}} &&&&\to_{\stdty {\insertion S P T} n}\\ &\stdtm {\bound n S} n \sub {\incbd n + S} \sub {\kappa_{S,P,T}}&&&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n + {\insertion S P T}} \end{alignat*} By the inductive hypothesis on \(n\), we have \(\stdty S n \sub {\kappa_{S,P,T}} = \stdty {\insertion S P T} n\), and the other necessary equalities follow from the first case we considered. We now consider the case for standard coherences, where we must prove that: \[ \SCoh S {\stdty S n} {\kappa_{S,P,T}} = \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id} \] By \cref{lem:iota-kappa-comm}, \(\olsi P \sub \kappa_{S,P,T}\) is the standard coherence \(\stdcoh T {\lh(P)} \sub {\iota_{S,P,T}}\), and so the left-hand side of the above equation admits an insertion.
Therefore: \begin{align*} \SCoh S {\stdty S n} {\kappa_{S,P,T}} &= \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}}}&\text{by insertion}\\ &\equiv \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\id}&\text{by \cref{lem:kappa-iota-insert}}\\ &= \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id}&\text{by inductive hypothesis}\\ &\equiv \stdcoh {\insertion S P T} n \end{align*} The equality for standard terms follows from the equality for standard coherences, using \cref{thm:std-dr}. \end{proof} \begin{corollary} \label{cor:standard-coh-insert} If \(\mathcal{R}\) has disc removal and insertion, then an insertion into a standard coherence is equal to the standard coherence over the inserted tree. \end{corollary} \begin{proof} Let \(s \equiv \stdcoh S n \sub L\) be a standard coherence, and suppose \((S,P,T,\U,L,M)\) is an insertion redex with \(\U \vdash s : A\) for some \(A\). Then: \begin{align*} \stdcoh S n \sub L &= \SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion L P M}\\ &= \SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\insertion L P M}\\ &= \stdcoh {\insertion S P T} n \sub {\insertion L P M} \end{align*} and so \(s\) is equal to a standard coherence over the tree \(\insertion S P T\). \end{proof} \paragraph{Chained insertion} We explore the situation where a term \(s\) has a locally maximal argument \(t\) which can be inserted, and this term \(t\) admits an insertion itself. For the argument \(t\) to be insertable, it must be a standard coherence, and by \cref{cor:standard-coh-insert}, if \(t = t'\) by insertion, then \(t'\) will be equal to a standard coherence over some tree \(T\). For the term \(t'\) to be insertable, \(T\) must have sufficient trunk height. Conditions for this are given in the following lemma. \begin{lemma} \label{lem:insert-lin-height} Let \((S,P,T)\) be an insertion point. Further, assume \(S\) is not linear. Then \(\th(\insertion S P T) \geq \th(S)\). \end{lemma} \begin{proof} See \func{Catt.Tree.Insertion.Properties}{insertion-trunk-height} in \module{Catt.Tree.Insertion.Properties}. \end{proof} If a tree \(S\) is not linear, then any branch of \(S\) has branch height at least the trunk height of \(S\), and hence any insertion into \(S\) only modifies the tree above its trunk height, and so can only increase the trunk height. Therefore, if \((S,P,T)\) and \((T, Q, U)\) are insertion points, and \(T\) is not linear, then \((S, P, \insertion T Q U)\) is also an insertion point. Conversely, it is possible to insert the argument directly into the head term first, looking to perform the inner insertion afterwards. For this to be possible, a branch of the inserted tree must be given. This can again be done under a non-linearity condition. \begin{definition} Let \((S, P, T)\) be an insertion point where \(T\) is not linear. Then from a branch \(Q\) of \(T\) we can obtain a branch \(\insertion S P Q\) of \(\insertion S P T\). We first observe that \(\bh(Q) \geq \th(T) \geq \bh(P)\). We define this branch by induction on \(P\) and \(Q\): \begin{itemize} \item Suppose \(P = [k]\) and \(Q = q :: x\). Then define: \[\insertion S P Q = (k - 1 + q) :: x\] \item Suppose \(P = k :: P_2\) with \(S = [S_0,\dots,S_n]\) and \(T = \Sigma(T_0)\). In this case we must have \(Q = 0 :: Q_2\) where \(Q_2\) is a branch of \(T_0\).
Then define: \[ \insertion S P Q = k :: \insertion {S_k} {P_2} {Q_2}\] \end{itemize} It is clear that \(\insertion S P Q\) has the same branch height and leaf height as \(Q\). \end{definition} A simple inductive proof shows that: \[\olsi {\insertion S P Q} \equiv \olsi Q \sub{\iota_{S,P,T}}\] Now given insertion points \((S,P,T)\) and \((T, Q, U)\) with \(T\) non-linear, we have that the triple \((\insertion S P T, \insertion S P Q, U)\) is another insertion point. There are therefore two ways of performing both insertions, which are depicted in \cref{fig:chained-insertion}. \begin{figure}[ht] \centering \newsavebox\chaintop \sbox\chaintop{\(\insertion{\insertion{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (0.5,1)(x12){$\bullet$}; \node [on grid, Diag1] at (0,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,3)(x31) {$\bullet$}; \node [on grid] at (1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (0,4)(x41) {$\bullet$}; \node [on grid] at (1,4)(x42) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw[very thick,Diag1] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[very thick,Diag1] (x21.center) to (x31.center); \draw (x22.center) to (x32.center); \draw[very thick,Diag1] (x31.center) to (x41.center); \draw (x32.center) to (x42.center); \end{scope} \end{tikzpicture}\quad} {[1,0]} {\quad \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag1] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (-0.5,3)(x31) {$\bullet$}; \node [on grid] at (0.5,3) (x32) {$\bullet$}; \node [on grid, Diag2] at (-0.5,4)(x41) {$\bullet$}; \node [on grid] at (0.5,4)(x42) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x21.center) to (x32.center); \begin{scope}[on background layer] \draw[very thick, Diag2] (x31.center) to (x41.center); \end{scope} \draw (x32.center) to (x42.center); \end{tikzpicture}}\quad} {[0,0,0,0]} { \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (0,3)(x31) {$\bullet$}; \node [on grid] at (-0.5,4)(x41) {$\bullet$}; \node [on grid] at (0.5,4)(x42) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x31.center) to (x41.center); \draw (x31.center) to (x42.center); \end{tikzpicture}}\)} \newsavebox\chainleft \sbox\chainleft{\(\insertion{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid, Diag1] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid, Diag1] at (0.5,1)(x12){$\bullet$}; \node [on grid, Diag1] at (0,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag1] at (-0.33,3)(x31) {$\bullet$}; \node [on grid, Diag1] at (0.33,3) (x32) {$\bullet$}; \node [on grid] at (1,3) (x33) {$\bullet$}; \node [on grid, Diag2] at (-0.33,4)(x41) {$\bullet$}; \node [on grid, Diag1] at (0.33,4)(x42) {$\bullet$}; \node [on grid]
at (1,4)(x43) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag1] (x01.center) to (x12.center); \draw[Diag1] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[Diag1] (x21.center) to (x31.center); \draw[Diag1] (x21.center) to (x32.center); \draw (x22.center) to (x33.center); \draw[very thick,Diag2] (x31.center) to (x41.center); \draw[Diag1] (x32.center) to (x42.center); \draw (x33.center) to (x43.center); \end{scope} \end{tikzpicture}\quad} {[1,0,0,0]} { \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (0,3)(x31) {$\bullet$}; \node [on grid] at (-0.5,4)(x41) {$\bullet$}; \node [on grid] at (0.5,4)(x42) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x31.center) to (x41.center); \draw (x31.center) to (x42.center); \end{tikzpicture}}\) } \newsavebox\chainright \sbox\chainright{\(\insertion{ \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (-0.5,1) (x11) {$\bullet$}; \node [on grid] at (0.5,1)(x12){$\bullet$}; \node [on grid, Diag1] at (0,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag1] at (0,3)(x31) {$\bullet$}; \node [on grid] at (1,3) (x32) {$\bullet$}; \node [on grid, Diag1] at (0,4)(x41) {$\bullet$}; \node [on grid] at (1,4)(x42) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw (x01.center) to (x12.center); \draw[very thick,Diag1] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[very thick,Diag1] (x21.center) to (x31.center); \draw (x22.center) to (x32.center); \draw[very thick,Diag1] (x31.center) to (x41.center); \draw (x32.center) to (x42.center); \end{scope} \end{tikzpicture}\quad} {[1,0]} { \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base), Diag2] \node [on grid] at (0,0) (x01) {$\bullet$}; \node [on grid] at (0,1) (x11) {$\bullet$}; \node [on grid] at (0,2) (x21){$\bullet$}; \node [on grid] at (-0.5,3)(x31) {$\bullet$}; \node [on grid, Diag1] at (0.5,3) (x32) {$\bullet$}; \node [on grid] at (-0.83,4)(x41) {$\bullet$}; \node [on grid] at (-0.16,4)(x42) {$\bullet$}; \node [on grid, Diag1] at (0.5,4)(x43) {$\bullet$}; \draw (x01.center) to (x11.center); \draw (x11.center) to (x21.center); \draw (x21.center) to (x31.center); \draw (x31.center) to (x41.center); \draw (x31.center) to (x42.center); \begin{scope}[on background layer] \draw[Diag1] (x21.center) to (x32.center); \draw[Diag1] (x32.center) to (x43.center); \end{scope} \end{tikzpicture}}\) } \newsavebox\chainbot \sbox\chainbot{\( \begin{tikzpicture}[xscale=1.4,every node/.append style={scale=0.85},baseline=(x21.base)] \node [on grid, Diag2] at (0.125,0) (x01) {$\bullet$}; \node [on grid] at (-0.3125,1) (x11) {$\bullet$}; \node [on grid, Diag2] at (0.5625,1)(x12){$\bullet$}; \node [on grid, Diag2] at (0.125,2) (x21){$\bullet$}; \node [on grid] at (1,2) (x22){$\bullet$}; \node [on grid, Diag2] at (-0.25,3)(x31) {$\bullet$}; \node [on grid, Diag1] at (0.5,3) (x32) {$\bullet$}; \node [on grid] at (1,3) (x33) {$\bullet$}; \node [on grid, Diag2] at (-0.5,4)(x41) {$\bullet$}; \node [on grid, Diag2] at (0,4)(x42) {$\bullet$}; \node [on grid, Diag1] at 
(0.5,4)(x43) {$\bullet$}; \node [on grid] at (1,4)(x44) {$\bullet$}; \begin{scope}[on background layer] \draw (x01.center) to (x11.center); \draw[Diag2] (x01.center) to (x12.center); \draw[Diag2] (x12.center) to (x21.center); \draw (x12.center) to (x22.center); \draw[Diag2] (x21.center) to (x31.center); \draw[Diag1] (x21.center) to (x32.center); \draw (x22.center) to (x33.center); \draw[Diag2] (x31.center) to (x41.center); \draw[Diag2] (x31.center) to (x42.center); \draw[Diag1] (x32.center) to (x43.center); \draw (x33.center) to (x44.center); \end{scope} \end{tikzpicture}\) } \begin{tikzpicture} \node(chaintop) at (0,0) {\fcolorbox{gray}{white}{\usebox\chaintop}}; \node(chainleft) at (-4.5,-6){\fcolorbox{gray}{white}{\usebox\chainleft}}; \node(chainright) at (4.5,-6){\fcolorbox{gray}{white}{\usebox\chainright}}; \node(chainbot) at (0,-12){\fcolorbox{gray}{white}{\usebox\chainbot}}; \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chaintop) to (chainleft); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chaintop) to (chainright); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chainleft) to (chainbot); \draw[arrows={->[scale=1.5]}, line join=round, decorate, decoration={ zigzag, segment length=4, amplitude=1.2,post=lineto, post length=2pt }] (chainright) to (chainbot); \end{tikzpicture} \caption{Chained insertion.} \label{fig:chained-insertion} \end{figure} We now explore the universal property of the insertion along the branch \(\insertion S P Q\). We assume that \(n = \lh(P)\) and \(m = \lh(Q)\) and form the following diagram: \[ \begin{tikzcd} & {D^n} & S \\ {D^m} & T & {\insertion S P T} \\ U && {\insertion {(\insertion S P T)} {\insertion S P Q} U} \arrow["{\{\olsi P \}}", from=1-2, to=1-3] \arrow["{\{ \stdcoh T n \}}"', from=1-2, to=2-2] \arrow["{\{ \olsi Q \}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\iota_{S,P,T}}"', from=2-2, to=2-3] \arrow["{\kappa_{S,P,T}}", from=1-3, to=2-3] \arrow["{\kappa_{\insertion S P T, \insertion S P Q, U}}", from=2-3, to=3-3] \arrow["{\iota_{\insertion S P T, \insertion S P Q, U}}"', from=3-1, to=3-3] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=2-3, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-3, to=2-1] \end{tikzcd} \] The top pushout square is given by the insertion of \(T\) into \(S\) along \(P\). The morphism \(\{\olsi Q\} \bullet \iota_{S,P,T}\) through the middle of the diagram is then equal to \(\{\olsi {\insertion S P Q} \}\), allowing the bottom pushout rectangle to be formed by the insertion of \(U\) into \(\insertion S P T\) along \(\insertion S P Q\). 
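The equality of this middle morphism is an instance of the characterisation \(\olsi {\insertion S P Q} \equiv \olsi Q \sub{\iota_{S,P,T}}\) established above: assuming the expected compatibility of the labelling \(\{-\}\) with composition, we have \[ \{\olsi Q\} \bullet \iota_{S,P,T} \equiv \{\olsi Q \sub {\iota_{S,P,T}}\} \equiv \{\olsi {\insertion S P Q}\}\]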
We can also consider the universal property of the tree generated by first inserting \(U\) into \(T\), and then inserting the inserted tree into \(S\), which is given by the diagram below: \[ \begin{tikzcd} & {D^n} && S \\ {D^m} & T \\ U & {\insertion T Q U} & & {\insertion S P {(\insertion T Q U)}} \arrow["{\{\olsi P \}}", from=1-2, to=1-4] \arrow["{\{ \stdcoh T n \}}"', from=1-2, to=2-2] \arrow["{\{ \olsi Q \}}"', from=2-1, to=2-2] \arrow["{\{\stdcoh U m\}}"', from=2-1, to=3-1] \arrow["{\kappa_{T,Q,U}}", from=2-2, to=3-2] \arrow["{\iota_{T,Q,U}}"', from=3-1, to=3-2] \arrow["{\kappa_{S,P,\insertion T Q U}}", from=1-4, to=3-4] \arrow["{\iota_{S,P,\insertion T Q U}}"', from=3-2, to=3-4] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-4, to=1-2] \arrow["\lrcorner"{anchor=center, pos=0.125, rotate=180, scale=1.5}, draw=none, from=3-2, to=2-1] \end{tikzcd} \] The left-hand pushout square is given by the insertion of \(U\) into \(T\) along \(Q\). The morphism \(\{\stdcoh T n\} \bullet \kappa_{T,Q,U}\), which runs vertically through the centre of the diagram, is then equal to \(\{\stdcoh {\insertion T Q U} n\}\) by \cref{cor:standard-coh-insert}, allowing for the right-hand pushout square to be formed as the insertion of \(\insertion T Q U\) into \(S\) along \(P\). By common properties of colimits, both of these constructions then arise as colimits of the same diagram, namely the shared top-left boundary of the two diagrams above. The results of this discussion are collected in the following lemma. \begin{lemma} \label{lem:inserted-insertion} Let \((S,P,T)\) and \((T,Q,U)\) be insertion points. Further assume \(T\) is not linear. Then: \begin{alignat*}{3} &\insertion S P {(\insertion T Q U)} & &= & &\insertion {(\insertion S P T)} {\insertion S P Q} U\\ &\kappa_{S,P,\insertion T Q U} &&=^{\mathsf{max}}{} &&\kappa_{S,P,T} \circ \kappa_{\insertion S P T, \insertion S P Q, U}\\ &\insertion L P {(\insertion M Q N)} &&\equiv^{\mathsf{max}}{} &&\insertion {(\insertion L P M)} {\insertion S P Q} N \end{alignat*} for any \(L : S \to \U\), \(M : T \to \U\), and \(N : U \to \U\). \end{lemma} \begin{proof} See the functions \func{Catt.Tree.Insertion.Properties}{insertion-tree-inserted-branch} and \func{Catt.Tree.Insertion.Properties}{label-from-inserted-branch} in the formalisation module \module{Catt.Tree.Insertion.Properties}, and \funcn{Catt.Typing.Insertion.Equality}{22361}{κ-inserted-branch} in module \module{Catt.Typing.Insertion.Equality}. \end{proof} \chapter{Semistrict variants of \Catt} \label{cha:cattstrict} This chapter introduces \Cattsu, a type theory for strictly unital \(\infty\)-categories, and \Cattsua, a type theory for strictly unital and associative \(\infty\)-categories. We define both theories and explore the metatheory and properties of each in detail. The results in this chapter will heavily depend on the theory developed in the previous chapters. Both type theories will be defined as instances of \Cattr, which was introduced in \cref{sec:catt-with-equality}, and much of the initial metatheory can be immediately derived by demonstrating that the equality rule sets that generate \Cattsu and \Cattsua satisfy the various conditions given in \cref{sec:ruleset}. The theory \Cattsu is primarily generated by pruning, which was introduced in \cref{sec:pruning}, and the theory \Cattsua depends on the insertion operation, which was introduced in \cref{sec:insertion}.
\cref{sec:cattsu} will introduce and define \Cattsu, and \cref{sec:cattsua} will do the same for \Cattsua. The main contribution of these sections is to give normalisation algorithms for their respective theories, giving a notion of computation to each theory. A normalisation algorithm is a function \(\N : \Term_\Gamma \to \Term_\Gamma\) with the following properties: \begin{itemize} \item For any term \(t : \Term_\Gamma\), \(\Gamma \vdash \N(t) = t\). \item For any \(s, t : \Term_\Gamma\) with \(\Gamma \vdash s = t\), \(\N(s) \equiv \N(t)\). \end{itemize} The term \(\N(t)\) is called the \emph{normal form} of \(t\). Such an algorithm allows equality of two terms \(s\) and \(t\) to be decided by taking the normal form of each term and checking if they are syntactically equal. Normalisation can be extended to types and substitutions in a natural way. In \cref{sec:cattsu,sec:cattsua}, the normalisation algorithm is defined by giving a reduction system on the syntax of the type theory, which we show to be terminating, meaning that there is no infinite reduction sequence, and confluent, meaning that any two reduction paths converge to a common reduct. The normal form of a term can then be obtained by reducing it until there are no further reductions possible. In \cref{sec:reduction}, these notions are recalled, and we demonstrate that the resulting normalisation algorithm satisfies the two properties stated above. This section also introduces a method for obtaining a reduction system from an arbitrary equality rule set \(\mathcal{R}\). Such a normalisation procedure allows a type checking algorithm to be implemented, creating an interpreter for the language. This allows us to write larger terms and have it automatically verified whether they are well-formed. In \cref{sec:towards-nbe}, we introduce our implementation of \Catt, \Cattsu, and \Cattsua, written in Rust. This implementation supports features such as implicit arguments to terms, implicit suspension, and native support for trees and tree labellings. We will explain how the tool can be used, and use it to give larger examples of \Cattsua terms, including proofs of Eckmann-Hilton (see \cref{fig:eh}) and its higher-dimensional coherence condition, the syllepsis. The implementation uses an approach closer to normalisation by evaluation for typechecking terms in the theory. \cref{sec:towards-nbe} explores this algorithm and presents some perspectives on applying normalisation by evaluation to semistrict versions of \Catt. \cref{sec:models} provides a discussion of the models of the semistrict type theories \Cattsu and \Cattsua, demonstrating how they can be viewed as semistrict \(\infty\)-categories. The section proves a partial conservativity result, which allows one to prove that semistrictness is a property of, rather than additional structure on, a weak \(\infty\)-category. A discussion is provided on some of the challenges that must be overcome to extend this partial conservativity result. The thesis ends with \cref{sec:future-work}, which provides a review of avenues for future work in this area, including a discussion of further variants of \Catt which could be defined. \section{Reduction} \label{sec:reduction} Reduction is a method for defining computation for a type theory. For each term, a number of reductions can be applied to it, representing the various computations that could be applied to the term.
Computation can then be run on a term by repeatedly searching for positions in the term that admit a reduction, known as \emph{redexes}, and applying this reduction, until no more redexes exist in the term. When a term admits no reductions, it is called a \emph{normal form}. \begin{definition} A \emph{reduction system} is given by a relation \(s \red t\) on terms. The relation \(\red^{*}\) is defined to be the reflexive transitive closure of \(\red\), and so \(s \red^* t\) exactly when there is some chain \[s \equiv u_0 \red \cdots \red u_k \equiv t\] for \(k \in \mathbb{N}\) (which could be \(0\) with \(s \equiv t\)) and terms \(u_i\) for \(i \leq k\). Further define \(\leftrightsquigarrow\) to be the reflexive symmetric transitive closure of \(\red\). When a term \(s\) admits no reductions, that is, there is no \(t\) such that \(s \red t\), we say it is in \emph{normal form}. \end{definition} If we have an equality rule set \(\mathcal{R}\) (see \cref{sec:ruleset}) that generates \Cattr, a reduction system can be defined from \(\mathcal{R}\) by modifying the rules for equality to remove the reflexivity, symmetry, and transitivity constructors and ensure that reductions do not happen ``in parallel''. \begin{definition} Let \(\mathcal{R}\) be an equality rule set. Define the reduction system \(\redr\) on well-formed terms, well-formed substitutions, and well-formed types to be generated by the rules in \cref{fig:reduction}. When it is clear which equality rule set is being used, we may simply write \(s \red t\) instead of \(s \redr t\). \end{definition} \begin{figure}[ht] \centering \begin{mathpar} \inferrule{(\Gamma, s, t) \in \mathcal{R}}{s \redr t}\textsc{rule} \and \inferrule{A \redr B}{\Coh \Delta A \sigma \redr \Coh \Delta B \sigma}\textsc{cell} \and \inferrule{\sigma \redr \tau}{\Coh \Delta A \sigma \redr \Coh \Delta A \tau}\textsc{arg} \\ \inferrule{s \redr s'}{\arr s A t \redr \arr {s'} A t}\and \inferrule{t \redr t'}{\arr s A t \redr \arr s A {t'}}\and \inferrule{A \redr A'}{\arr s A t \redr \arr s {A'} t}\\ \inferrule{\sigma \redr \tau}{\langle \sigma, s \rangle \redr \langle \tau, s \rangle}\and \inferrule{s \redr t}{\langle \sigma, s \rangle \redr \langle \sigma, t \rangle} \end{mathpar} \caption[Reduction rules]{Rules for \(\rightsquigarrow_{\mathcal{R}}\).} \label{fig:reduction} \end{figure} The rules for reduction are set up so that each reduction \(s \redr t\) corresponds to the application of exactly one rule from \(\mathcal{R}\) at a single point in the term. Given a coherence \(\Coh \Delta A \sigma\), we call reductions generated by the \textsc{cell} rule \emph{cell reductions} and reductions generated by the \textsc{arg} rule \emph{argument reductions}. Reductions generated by \textsc{rule} will be named by the rule in \(\mathcal{R}\) that was used. For example, a reduction generated by \textsc{rule} applied with an instance of pruning will be called a pruning reduction. We highlight that our reduction system \(\redr\) is only defined between well-formed pieces of syntax. As this reduction will be used with rule sets \(\mathcal{R}\) which satisfy the preservation condition, there will be no additional burden of checking that typing is preserved while applying reductions. Therefore, we can prove that the reflexive symmetric transitive closure of reduction, \(\redrts\), is the same relation as equality on well-formed terms, given the similarity between the rules for reduction and the rules for equality.
\begin{proposition} \label{prop:red-is-eq} Let \(\mathcal{R}\) be a rule set satisfying the preservation, support, and substitution conditions (such that the generated equality preserves typing). Letting \(\redrts\) be the reflexive symmetric transitive closure of \(\redr\), we get: \begin{align*} \Gamma \vdash s = t &\iff s \redrts t \\ \intertext{for \(s,t : \Term_\Gamma\) such that \(\Gamma \vdash s : A\) and \(\Gamma \vdash t : A\) for some \(A : \Type_\Gamma\)} \Gamma \vdash A = B &\iff A \redrts B \\ \intertext{for \(A,B : \Type_\Gamma\) such that \(\Gamma \vdash A\) and \(\Gamma \vdash B\)} \Gamma \vdash \sigma = \tau &\iff \sigma \redrts \tau \end{align*} for \(\sigma, \tau : \arr \Delta \star \Gamma\) such that \(\Gamma \vdash \sigma : \Delta\) and \(\Gamma \vdash \tau : \Delta\). \end{proposition} \begin{proof} Each direction can be proved separately by a mutual induction on the derivation in the premise. For the right-to-left direction, it suffices to show that the single-step reduction (\(\redr\)) is contained in the equality, as equality is an equivalence relation by construction. \end{proof} Just as the preservation condition on a rule set \(\mathcal{R}\) allows us to deduce that reduction preserves typing, the substitution condition can be used to prove that reduction is preserved by application of substitution. \begin{proposition} \label{prop:red-sub} Suppose \(\mathcal{R}\) satisfies the substitution condition and let \(\sigma : \Delta \to \Gamma\) be a well-formed substitution. Then: \begin{align*} s \redr t &\implies s \sub \sigma \redr t \sub \sigma \\ A \redr B &\implies A \sub \sigma \redr B \sub \sigma \\ \tau \redr \mu &\implies \tau \bullet \sigma \redr \mu \bullet \sigma \end{align*} for well-formed terms \(s,t\), well-formed types \(A,B\), and well-formed substitutions \(\tau\) and \(\mu\). Furthermore, if \(\sigma \redr \tau\), then: \[ s \sub \sigma \redr^* s \sub \tau \qquad A \sub \sigma \redr^* A \sub \tau \qquad \mu \bullet \sigma \redr^* \mu \bullet \tau\] for term \(s\), type \(A\), and substitution \(\mu\). \end{proposition} \begin{proof} The first part follows by a simple induction on the reduction in the premise. The second part holds by a mutual induction on the term \(s\), type \(A\), and substitution \(\mu\). \end{proof} \subsection{Termination} \label{sec:termination} In order to obtain a normal form of each term of the theory, we perform reductions on a term until no more can be applied. This can only be done if we know that this will eventually result in a normal form, a property known as \emph{strong termination}. \begin{definition} A reduction system \(\red\) is \emph{strongly terminating} if there is no infinite sequence of reductions: \[ s_0 \red s_1 \red s_2 \red \cdots \] For such a reduction system, applying reductions to a term will eventually reach a normal form. \end{definition} Demonstrating the termination of the reduction systems defined in \cref{sec:cattsu,sec:cattsua} will be non-trivial, as each reduction adds new constructions to the term, which could themselves admit reductions. Suppose we have the following reduction due to endo-coherence removal (see \cref{sec:ecr}): \[ \Coh \Delta {\arr s A s} \sigma \red \id(A \sub \sigma,s \sub \sigma) \] The identity term was not present in the premise of the reduction, and the term \(s \sub \sigma\) is newly created by the reduction, and could itself admit any number of reductions.
To prove termination, we will exploit that although each reduction creates new subterms, these subterms are all of a lower dimension than the dimension of the term that is being reduced. In the example above, the dimension of \(\Coh \Delta {\arr s A s} \sigma\) is greater than the dimension of the term \(s\), and so the reduction has still made progress towards a normal form by decreasing the complexity of the term in dimension \(\dim(A)\), even though it may introduce arbitrary complexity below \(\dim(A)\). To this end, we define the following notion of complexity for each class of syntax, assigning to each piece of syntax an ordinal number which we call its \emph{syntactic complexity}. As the ordinal numbers are well-founded, we aim to prove that our reduction is terminating by proving that each single-step reduction reduces the complexity of the term. To define syntactic complexity, we will need to use ordinal numbers up to \(\omega^\omega\). We will also need a construction known as the natural sum of ordinals, \(\alpha \+ \beta\), which is associative, commutative, and strictly monotone in both of its arguments~\cite{lipparini16}. \begin{definition} For all terms \(t\), types \(A\), and substitutions \(\sigma\), the \emph{syntactic complexity} \(\sc(t)\), \(\sc(A)\), and \(\sc(\sigma)\) are mutually defined as follows: \begin{itemize} \item For types: \[ \sc(\star) = 0 \qquad \sc(\arr s A t) = \sc(s) \+ \sc(A) \+ \sc(t)\] \item For substitutions we have: \[\sc(\langle t_0, \dots, t_n \rangle) = \bighash_{i=0}^n \sc(t_i)\] \item For terms, we have \(\sc(x) = 0\) for variables \(x\) and for coherences we have: \begin{equation*} \sc(\Coh \Delta A \sigma) = \begin{cases*} \omega^{\dim(A)} \+ \sc(\sigma)&if \(\Coh \Delta A \sigma\) is an identity\\ 2\omega^{\dim(A)} \+ \sc(\sigma)&otherwise \end{cases*} \end{equation*} \end{itemize} \end{definition} The syntactic complexity is given as an ordinal to leverage known results, though it should be noted that ordinals below \(\omega^\omega\) can be represented by a list of natural numbers ordered reverse lexicographically. Under this interpretation, the syntactic complexity effectively computes the number of coherences at each dimension. Therefore, removing a coherence of dimension \(n\) reduces the complexity, even if arbitrary complexity is added at lower dimensions. Syntactic complexity also treats identities in a special way, as these play a distinguished role in blocking reduction for the theories presented in this chapter. The syntactic complexity does not account for the type in a coherence, as this is difficult to encode. Instead of showing that all reductions reduce syntactic complexity, we instead show that all reductions which are not cell reductions (reductions that have the rule marked \textsc{cell} in their derivation) reduce syntactic complexity, and deduce that a hypothetical infinite reduction sequence must only consist of cell reductions after a finite number of steps, and then appeal to an induction on dimension. \begin{lemma} \label{lem:termination-lem} Let \(\mathcal{R}\) be an equality rule set with \( \sc(s) > \sc(t) \) for all \((\Gamma,s,t) \in \mathcal{R}\). Then \(\redr\) is strongly terminating. \end{lemma} \begin{proof} By a simple induction on reductions, we immediately have that if \(s \redr t\) then \(\sc(s) \geq \sc(t)\), with the inequality strict when the reduction is not a cell reduction. We then proceed by induction on the dimension.
Suppose there is an infinite reduction sequence, starting with a \(k\)-dimensional term: \[ s_0 \red s_1 \red s_2 \red \cdots\] Then by assumption, only finitely many of these reductions do not use the cell rule, as otherwise we would obtain an infinite chain of ordinals \[ \sc(s_0) \geq \sc(s_1) \geq \sc(s_2) \geq \cdots\] where infinitely many of these inequalities are strict, contradicting the well-foundedness of the ordinals. Therefore, there is an \(n\) such that: \[ s_n \red s_{n+1} \red \cdots\] are all cell reductions. Each of these reductions reduces one of finitely many subterms of \(s_n\), and each of these subterms has dimension less than \(k\), so by inductive hypothesis, none of these subterms can be reduced infinitely often, contradicting the existence of an infinite reduction sequence. \end{proof} We can immediately prove that disc removal reduces syntactic complexity. \begin{proposition} \label{prop:disc-rem-sc} Let \(s \red t\) be an instance of disc removal. Then \(\sc(s) > \sc(t)\). \end{proposition} \begin{proof} We must have \(s \equiv \Coh {D^n} {\wk(U^n)} {\{A,t\}}\) for some \(n\), \(A\), and \(t\). Then: \begin{align*} \sc(s) &= \sc(\Coh {D^n} {\wk(U^n)} {\{A,t\}})\\ &= 2\omega^n \+ \sc(\{A,t\})\\ &> \sc(\{A,t\})\\ &\geq \sc(t) \end{align*} where the last inequality holds by a simple induction on the dimension of \(A\). \end{proof} We note that, as stated so far, the reduction: \[ \id(A,s) \red \id(A,s)\] is a valid instance of endo-coherence removal for type \(A\) and term \(s\), which will break termination. We therefore let \(\ecr'\) be the equality rule set obtained by removing all triples \((\Gamma,s,t)\) from \(\ecr\) where \(s\) is already an identity. We justify replacing \ecr by \ecr' with the following lemma. \begin{lemma} \label{lem:always-ecr} The following reduction holds, even when the left-hand side is an identity: \[\Coh \Delta {\arr s A s} \sigma \red_{\ecr'}^* \id(A\sub \sigma,s\sub\sigma)\] \end{lemma} \begin{proof} If \(\Coh \Delta {\arr s A s} \sigma\) is not an identity then it can be reduced by endo-coherence removal. Otherwise, we have \(\Delta = D^n\) for some \(n\), \(s \equiv d_n\), \(A \equiv \wk(U^n)\), and \(\sigma \equiv \{B,t\}\) for some \(B\) and \(t\) and so: \[\id(A\sub \sigma,s \sub \sigma) \equiv \id(\wk(U^n)\sub{\{B,t\}}, d_n \sub {\{B,t\}}) \equiv \id(B,t) \] It follows that the reduction holds trivially, in zero steps. \end{proof} It can then be proven that the reductions in this set reduce syntactic complexity. \begin{proposition} \label{prop:ecr-sc} Let \(s \red t\) be an instance of endo-coherence removal. If \(s\) is not an identity then \(\sc(s) > \sc(t)\). \end{proposition} \begin{proof} As \(s \red t\) is an instance of endo-coherence removal, we must have \(s \equiv \Coh \Delta {\arr u A u} \sigma\) and \(t \equiv \id(A \sub \sigma, u \sub \sigma)\). Further, \(s\) is not an identity and so: \begin{align*} \sc(s) &= \sc(\Coh \Delta {\arr u A u} \sigma)\\ &= 2\omega^{\dim(A) + 1} \+ \sc(\sigma)\\ &\geq 2\omega^{\dim(A) + 1}\\ &> \omega^{\dim(A) + 1} \+ \sc(A \sub \sigma) \+ \sc(u \sub \sigma)\\ &= \sc(\id(A \sub \sigma, u \sub \sigma)) \\ &= \sc(t) \end{align*} where the strict inequality holds as \(\sc(A \sub \sigma) \+ \sc(u \sub \sigma) < \omega^{\dim(A) + 1}\): both \(A \sub \sigma\) and \(u \sub \sigma\) have dimension \(\dim(A)\), meaning that their syntactic complexities are strictly bounded by \(\omega^{\dim(A) + 1}\). \end{proof} \subsection{Confluence} \label{sec:confluence} Another crucial property of reduction systems is \emph{confluence}.
A term \(s\) may have any number of redexes and could reduce to distinct terms \(t\) and \(u\). Confluence states that both the terms \(t\) and \(u\) must reduce to some common term, allowing us to apply reductions to a term in any order. \begin{definition} Let \(\red\) be a reduction system. It is \emph{(globally) confluent} if for all terms \(s\), \(t\), and \(u\) with \(s \red^* t\) and \(s \red^* u\), there is a term \(v\) such that \(t \red^* v\) and \(u \red^* v\). This can be assembled into the following diagram: \[ \begin{tikzcd} & s \\ t && u \\ & v \arrow["{*}", squiggly, from=1-2, to=2-3] \arrow["{*}"', squiggly, from=1-2, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-3, to=3-2] \end{tikzcd} \] and hence is sometimes called the diamond property for \(\red^*\). \end{definition} From global confluence, it is clear that if \(s \redrts t\), where \(\redrts\) is the reflexive symmetric transitive closure of \(\redr\), then there is \(u\) with \(s \redr^* u\) and \(t \redr^* u\). It is sometimes simpler to show that the following weaker confluence property holds: \begin{definition} Let \(\red\) be a reduction system. It is \emph{locally confluent} if given \(s \red t\) and \(s \red u\) there exists a term \(v\) such that: \[ \begin{tikzcd} & s \\ t && u \\ & v \arrow["", squiggly, from=1-2, to=2-3] \arrow[""', squiggly, from=1-2, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-3, to=3-2] \end{tikzcd} \] that is, \(t \red^* v\) and \(u \red^* v\). \end{definition} Global confluence trivially implies local confluence. If we further know that the reduction system \(\red\) is strongly terminating, then local confluence is sufficient to show global confluence. \begin{lemma}[Newman's lemma \cite{newman1942theories}] \label{lem:newman} Let \(\red\) be strongly terminating and locally confluent. Then \(\red\) is globally confluent. \end{lemma} Local confluence for the reduction systems of the type theories \Cattsu and \Cattsua will be proved using \emph{critical pair analysis}. A critical pair is a pair of distinct reductions which apply to the same term. When analysing the critical pairs of our semistrict type theories, we will encounter terms that are structurally similar, but differ on lower-dimensional subterms up to equality. We define this precisely. \begin{definition} Let \(\mathcal{R}\) be an equality rule set. For \(n \in \mathbb{N}\), define the \emph{bounded equality set} \(\mathcal{R}_n\) as: \[ \mathcal{R}_n = \left\{ (\Gamma, s, t) \in \mathcal{R} \mid \dim(s) = \dim(t) < n \right\}\] Let the \emph{bounded equality relation} \(s =_n t\) be the equality generated by the set \(\mathcal{R}_n\). \end{definition} This is used to prove the following lemma, which implies that for a critical pair \(t \leftsquigarrow s \rightsquigarrow u\) it is not necessary to find a common reduct of \(t\) and \(u\), but it suffices to find reducts \(t'\) and \(u'\) of \(t\) and \(u\) such that \(t' =_{\dim(s)} u'\). \begin{lemma} \label{lem:conf-strat} Let \(\mathcal{R}\) be a tame equality rule set which satisfies the preservation and support conditions, and further assume that \(\redr\) is strongly terminating.
Suppose the following diagram can be formed:
% https://q.uiver.app/#q=WzAsNixbMiwwLCJzIl0sWzAsMSwidCJdLFs0LDEsInUiXSxbMSwyLCJ0JyJdLFszLDIsInUnIl0sWzIsMiwiPV97XFxkaW0ocyl9Il0sWzAsMiwiIiwwLHsic3R5bGUiOnsiYm9keSI6eyJuYW1lIjoic3F1aWdnbHkifX19XSxbMCwxLCIiLDIseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dLFsxLDMsIioiLDIseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dLFsyLDQsIioiLDAseyJzdHlsZSI6eyJib2R5Ijp7Im5hbWUiOiJzcXVpZ2dseSJ9fX1dXQ== % tex-fmt: skip
\[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} & {\mathclap{=_{\dim(s)}}} & {u'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \] for all critical pairs \(t \leftsquigarrow_{\mathcal{R}} s \redr u\) such that \(s \redr t\) is derived using \textsc{rule}. Then \(\redr\) is confluent. \end{lemma} \begin{proof} By \cref{lem:newman}, it suffices to show local confluence. We proceed by strong induction on \(n\) and \(s\), proving that all critical pairs \(t \leftsquigarrow_{\mathcal{R}_n} s \red_{\mathcal{R}_n} u\) have a common reduct, assuming that all critical pairs \(t \leftsquigarrow_{\mathcal{R}_m} s' \red_{\mathcal{R}_m} u\) have a common reduct, where \(s'\) is a subterm of \(s\) or \(m < n\). We justify this induction principle by noting that for all subterms \(s'\) of \(s\) we have \(\dim(s') \leq \dim(s)\). We now consider a critical pair \(t \leftsquigarrow_{\mathcal{R}_n} s \red_{\mathcal{R}_n} u\). We first suppose that \(s \red_{\mathcal{R}_n} t\) is derived from \textsc{rule}. Then, by definition of the set \(\mathcal{R}_n\), we must have that \(n > \dim(s)\). By the assumption of the lemma, there exist \(t'\) and \(u'\) with \(t' =_{\dim(s)} u'\) and \(t \redr^* t'\) and \(u \redr^* u'\). As \(n > \dim(s)\), we further have that \(t \red_{\mathcal{R}_n}^* t'\) and \(u \red_{\mathcal{R}_n}^* u'\). By \cref{prop:red-is-eq}, \(t' \leftrightsquigarrow_{\mathcal{R}_{\dim(s)}} u'\), and so as \(\red_{\mathcal{R}_{\dim(s)}}\) is confluent by inductive hypothesis on dimension we have \(v\) such that \(t' \red_{\mathcal{R}_{\dim(s)}}^* v \leftsquigarrow_{\mathcal{R}_{\dim(s)}}^* u'\). The following diagram can therefore be formed, where all the reductions are \(\mathcal{R}_n\) reductions (noting that \(\mathcal{R}_{\dim(s)} \subseteq \mathcal{R}_n\)): \[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} && {u'} \\ && v \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \arrow["{*}"', squiggly, from=3-2, to=4-3] \arrow["{*}", squiggly, from=3-4, to=4-3] \end{tikzcd} \] If \(s \red_{\mathcal{R}_n} u\) was derived from \textsc{rule}, then a common reduct can be found similarly to the first case by symmetry. We therefore consider the cases where neither \(s \red t\) nor \(s \red u\) are derived using \textsc{rule}. Both reductions must be either cell or argument reductions, and so each reduces some subterm of \(s\). If they reduce distinct subterms of \(s\), then a common reduct \(v\) can be formed by applying both reductions to \(s\). Otherwise, both reductions act on the same subterm of \(s\), and a common reduct can be found by applying the inductive hypothesis for subterms. \end{proof} Once termination and confluence have been proven, a normalisation function can be defined, which repeatedly applies reductions until no more can be applied.
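To make this procedure concrete, the following sketch shows how such a normalisation function, and the induced equality check, might be organised in an implementation. This is a minimal illustration only: the type \texttt{Term} and the function \texttt{reduce\_once} are hypothetical stand-ins, and this sketch is not the implementation described in \cref{sec:towards-nbe}.
\begin{verbatim}
// A minimal sketch, assuming a hypothetical syntax type `Term` (with
// derived syntactic equality) and a hypothetical single-step reduction
// function returning Some(t) when s ~> t, or None when s is already in
// normal form.
#[derive(Clone, PartialEq, Eq)]
pub enum Term {
    // variables, coherences, ...
}

pub fn reduce_once(s: &Term) -> Option<Term> {
    // search s for a redex and apply a single rule of the rule set
    unimplemented!()
}

// Strong termination guarantees that this loop halts; confluence makes
// the result independent of which redex reduce_once chooses.
pub fn normalise(mut s: Term) -> Term {
    while let Some(t) = reduce_once(&s) {
        s = t;
    }
    s
}

// Equality of well-formed terms is then decided by comparing normal
// forms syntactically.
pub fn equal(s: Term, t: Term) -> bool {
    normalise(s) == normalise(t)
}
\end{verbatim}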
\begin{lemma} Suppose that \(\red\) is strongly terminating and confluent. Then every term \(s\) reduces to a unique normal form \(\N(s)\). Furthermore, if \(s \redrts t\), then \(\N(s) \equiv \N(t)\). \end{lemma} \begin{proof} By termination, repeatedly reducing a term will reach a normal form. Suppose a term \(s\) has two normal forms \(t\) and \(u\) such that there are reduction sequences \(s \red^* t\) and \(s \red^* u\). Then by confluence there must be a term \(v\) with \(t \red^* v\) and \(u \red^* v\). However, \(t\) and \(u\) are normal forms and so admit no reductions, so \(t \equiv v \equiv u\) as required. Suppose \(s \redrts t\). Then there are terms \(s_i\) such that: \[ s \equiv s_0 \rightsquigarrow^* s_1 \leftsquigarrow^* s_2 \rightsquigarrow^* \cdots \leftsquigarrow^* s_k \equiv t\] Now we must have \(\N(s_i) \equiv \N(s_{i+1})\) for each \(i\): if \(s_i \rightsquigarrow^* s_{i+1}\) then both \(\N(s_i)\) and \(\N(s_{i+1})\) are normal forms of \(s_i\), and if \(s_i \leftsquigarrow^* s_{i+1}\) then both are normal forms of \(s_{i+1}\). Therefore, \(\N(s)\) and \(\N(t)\) are syntactically equal as required. \end{proof} \begin{corollary} Let \(\mathcal{R}\) be tame and satisfy the preservation and support properties. Further, suppose that \(\redr\) is strongly terminating and confluent, and it is decidable whether a term admits a reduction. Then the equality \(s = t\) is decidable. \end{corollary} \begin{proof} By \cref{prop:red-is-eq}, \(s = t\) if and only if \(s \redrts t\). By the above lemma, \(s \redrts t\) if and only if \(\N(s) \equiv \N(t)\). As syntactic equality is clearly decidable, and normal forms can be computed, equality is also decidable. \end{proof} We note that for an arbitrary rule set \(\mathcal{R}\), it may not be decidable whether a specific term \(s\) admits a reduction, but for the rule sets introduced in \cref{sec:cattsu,sec:cattsua}, it will be easy to mechanically check whether any reduction applies to a term \(s\). \section{\texorpdfstring{\Cattsu}{Cattsu}} \label{sec:cattsu} We are ready to define \Cattsu, the type theory for strictly unital \(\infty\)-categories. \Cattsu is a variant of \Cattr for which the equality is built from three classes of equalities: \begin{itemize} \item Pruning: The pruning operation was introduced in \cref{sec:pruning}. Pruning is the key operation in \Cattsu and drives the strict unitality of the theory. The operation ``prunes'' identities that appear as locally maximal arguments to other terms, simplifying the overall structure of a term by removing unnecessary units. \item Endo-coherence removal: This operation was introduced in \cref{sec:ecr}, and converts ``fake identities'', terms which are morally identities yet have the wrong syntactic form, into true identities. These converted identities can then be further removed from terms by pruning. \item Disc removal: Disc removal was the running example from \cref{sec:catt-with-equality}, and removes unary composites from the theory. Commonly, after pruning, a composite is reduced to a unary composite, for which disc removal is necessary to complete the simplification of the term. \end{itemize} In this section we will prove that \Cattsu is a type theory satisfying many standard meta-theoretic properties by combining results from previous chapters. We also give a reduction system for \Cattsu and show that this is strongly terminating and confluent.
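As a rough illustration of how these three classes of rules might be dispatched in an implementation, a single head-reduction step for \Cattsu could be organised as in the sketch below. All function names here are hypothetical, the \texttt{Term} type is the stand-in from the sketch in \cref{sec:reduction}, and the identity restrictions introduced in \cref{sec:reduction-cattsu} are elided.
\begin{verbatim}
// Sketch only: hypothetical recognisers for the three rule classes
// generating Cattsu. Each returns Some(reduct) when the side conditions
// of its rule hold for the given term, and None otherwise.
pub fn try_disc_removal(s: &Term) -> Option<Term> { unimplemented!() }
pub fn try_endo_coherence_removal(s: &Term) -> Option<Term> { unimplemented!() }
pub fn try_pruning(s: &Term) -> Option<Term> { unimplemented!() }

// A head-reduction step tries each class in turn; by the confluence
// result proved in this section, the order of these attempts does not
// affect which normal form is eventually reached.
pub fn head_reduce_su(s: &Term) -> Option<Term> {
    try_disc_removal(s)
        .or_else(|| try_endo_coherence_removal(s))
        .or_else(|| try_pruning(s))
}
\end{verbatim}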
\begin{example} Suppose we have terms \(f : \arr x \star y\), \(g : \arr y \star z\), \(h : \arr x \star z\), and \(\alpha : f * g \to h\) in some context \(\Gamma\). We can then consider the term: \[ \Coh {(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)} {f * g \to f * g} {\langle x,y,f,z,g \rangle} * \alpha\] which consists of an endo-coherence composed with the variable \(\alpha\). This then reduces as follows: \begin{align*} &\phantom{{}\red{}}\Coh {(x : \star), (y : \star), (f : \arr x \star y), (z : \star), (g : \arr y \star z)} {f * g \to f * g} {\langle x,y,f,z,g \rangle} * \alpha\\ &\red \id(\arr x \star z, f * g) * \alpha&\text{by endo-coherence removal}\\ &\red \Coh {D^2} {\wk(U^2)} {\langle x,z,f*g,h,\alpha \rangle}&\text{by pruning}\\ &\red \alpha &\text{by disc removal} \end{align*} and so uses all three reductions to fully simplify to a variable. \end{example} We define \Cattsu by the following equality rule set. \begin{definition} Define the equality rule set \su for \Cattsu by: \[ \su = \dr \cup \prune \cup \ecr\] \Cattsu is then the variant of \Cattr where \(\mathcal{R} = \su\). \end{definition} When it is not specified, we will assume that the operation set \(\mathcal{O}\) is given by the regular operation set \(\Reg\). \begin{theorem} The rule set \su is tame and satisfies the support and preservation conditions. \end{theorem} \begin{proof} By \cref{prop:dr-weak,prop:dr-susp,prop:dr-sub}, disc removal satisfies the weakening, suspension, and \(\su\)-substitution conditions. Endo-coherence removal and pruning satisfy the same conditions by \cref{prop:ecr-props,prop:prune-tame}. As these conditions are closed under unions, the set \su must also satisfy the weakening, suspension, and substitution conditions, and hence is tame. We now use the proof strategy introduced in \cref{sec:further-conditions} to prove that \su satisfies the support condition. Firstly, by \cref{lem:supp-sat-conds} we know that \(\su_{\mathsf{s}}\) is also tame. Disc removal then satisfies the \(\su_{\mathsf{s}}\)-support condition by \cref{prop:dr-supp}. The same condition is satisfied by endo-coherence removal (\cref{item:ecr-supp}) and pruning (\cref{prop:prune-supp}) and so \(\su\) satisfies the \(\su_{\mathsf{s}}\)-support condition. By \cref{lem:proof-strat-supp}, \su satisfies the support condition. Lastly, \su satisfies the \su-preservation condition as it is satisfied by disc removal (\cref{prop:dr-preserve}), endo-coherence removal (\cref{item:ecr-preserve}), and pruning (\cref{prop:prune-preserve}), and this condition is closed under unions of rule sets. \end{proof} From this theorem, it can be deduced that weakening, suspension, and application of substitution preserve well-formedness. Furthermore, equality in \Cattsu preserves the support of a term and preserves typing judgements. Such results are found in \cref{sec:ruleset}. Before giving normalisation results for \Cattsu, we recall the Eckmann-Hilton argument (\cref{fig:eh}) and define the term witnessing this equivalence.
First let \(\Delta\) be the ps-context given by: \begin{alignat*}{2} \Delta = D^2 \wedge D^2 ={} &(x : \star),\\ &(y : \star),&\ &(f : x \to y),\\ &&&(g : x \to y),(a : f \to g),\\ &(z : \star),&&(h : y \to z),\\ &&&(j : y \to z),(b : h \to j) \end{alignat*} which is given by the diagram: \[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "j", curve={height=-18pt}, from=1-2, to=1-3] \arrow["a"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["b"', shorten <=5pt, shorten >=5pt, Rightarrow, from=2, to=3] \end{tikzcd} \] The following term can be formed, which is similar to an interchange move, and changes the order in which two whiskered terms are composed: \[ \mathsf{swap} = \Coh {\Delta} {(a *_0 h) *_1 (g *_0 b) \to (f *_0 b) *_1 (a *_0 j)} {\id_\Delta}\] Then given a context \(\Gamma\) with terms \(x : \star\) and \(\alpha,\beta : \id(x) \to \id(x)\), the following term, the Eckmann-Hilton term, can be formed: \[ \mathsf{EH}_{\alpha,\beta} = \mathsf{swap} \sub{\langle x,x,\id(x),\id(x),\alpha, x,\id(x),\id(x),\beta \rangle}\] In \Cattsu, this term can be typed as follows: \[ \Gamma \vdash \mathsf{EH}_{\alpha,\beta} : \alpha *_1 \beta \to \beta *_1 \alpha\] and so witnesses the Eckmann-Hilton argument. We note that there is a clear inverse of the Eckmann-Hilton term, which immediately gives rise to two morphisms \(\alpha *_1 \beta \to \beta *_1 \alpha\): the original term \(\mathsf{EH}_{\alpha,\beta}\) and the term \(\mathsf{EH}_{\beta,\alpha}^{-1}\). These two terms manoeuvre \(\alpha\) and \(\beta\) round each other in opposite directions, and are not in general equivalent. However, we can instead apply Eckmann-Hilton to terms \(\phi\) and \(\psi\) of type \(\id^2(x) \to \id^2(x)\), which is done by suspending the Eckmann-Hilton term. By an abuse of notation we define this term to be (only giving the locally maximal arguments of the substitution): \[\mathsf{EH}_{\phi,\psi} = \Sigma(\mathsf{swap}) \sub{\langle \phi, \psi \rangle}\] In this case, the extra dimension gives enough freedom to give an equivalence between the resulting two terms \(\phi *_2 \psi \to \psi *_2 \phi\), which is called the \emph{syllepsis} and has the type: \[ \mathsf{Syl}_{\phi,\psi} : \mathsf{EH}_{\phi,\psi} \to \mathsf{EH}^{-1}_{\psi,\phi}\] To define this term, we take a similar approach to that used for Eckmann-Hilton: we give a single coherence containing a more complex type and a substitution containing multiple identity terms, and let the \Cattsu reduction simplify the type to the required one. We delay defining this term until \cref{sec:towards-nbe}, where the implementation presented in that section can be used to check that the resulting term is well-formed. \subsection{Normalisation for \texorpdfstring{\Cattsu}{Cattsu}} \label{sec:reduction-cattsu} Following \cref{sec:reduction}, we aim to give a normalisation algorithm for \Cattsu by exhibiting a strongly terminating and confluent reduction system. The reduction system \(\red_{\su}\) cannot be used directly because the reduction generated from \ecr is not terminating, as it allows identities to reduce to identities.
Even after replacing the equality rule set \ecr by \ecr', the equality set obtained by removing these trivial identity-to-identity reductions from \ecr, the generated reduction is still non-terminating. Consider the term \(\id(\arr t A t,\id(A,t))\) for some term \(t\) of type \(A\). Then the following reduction sequence can be formed: \[ \id(\arr t A t,\id(A,t)) \red \Coh {D^n} {\id(\wk(U^n), d_n) \to \id(\wk(U^n), d_n)} {\{A,t\}} \red \id(\arr t A t, \id(A,t)) \] where \(n = \dim(A)\), the first reduction is by pruning, and the second reduction is by endo-coherence removal. We therefore choose to also restrict the pruning equality rule set to not apply when the head term is an identity, obtaining the set \prune'. We can now define the reduction system for \Cattsu. \begin{definition} Define the reduction \(\red_{\su'}\) to be the reduction generated by the equality rule set \(\su'\) where \[ \su' = \dr \cup \prune' \cup \ecr'\] where \ecr' is the endo-coherence removal set without identity-to-identity equalities and \prune' is the pruning set restricted to the triples where the left-hand side term is not an identity. \end{definition} The reduction \(\red_{\su'}\) applies equality rules from \Cattsu when the redex is not an identity, effectively preventing identities themselves from being reduced further. As applying a substitution to or suspending a non-identity term cannot result in an identity, it is clear that \su' is tame. Strong termination for \(\red_{\su'}\) can now be proven using \cref{lem:termination-lem}, by showing that all rules reduce the syntactic complexity of terms. \begin{proposition} Let \(s \red t\) be an instance of pruning. If \(s\) is not an identity then \(\sc(s) > \sc(t)\). \end{proposition} \begin{proof} The reduction \(s \red t\) is an instance of pruning, and so there must be a Dyck word \(\mathcal{D} : \Dyck_0\) and a peak \(p : \Peak_{\mathcal{D}}\) such that \[s \equiv \Coh {\lfloor \mathcal{D} \rfloor} {A} {\sigma} \qquad t \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\] where \(s\) is not an identity and \(\lfloor p \rfloor \sub \sigma\) is. We then have \(\sc(s) = 2\omega^{\dim(A)} \+ \sc(\sigma)\) and \(\sc(t) \leq 2\omega^{\dim(A)} \+ \sc(\sigma \sslash p)\), but \(\sigma \sslash p\) is simply \(\sigma\) with two terms removed, one of which is known to be a coherence, and so \(\sc(\sigma) > \sc(\sigma \sslash p)\) and hence \(\sc(s) > \sc(t)\). \end{proof} \begin{corollary} The reduction \(\red_{\su'}\) is strongly terminating. \end{corollary} \begin{proof} By \cref{lem:termination-lem}, it suffices to show that each rule of \(\su'\) reduces syntactic complexity, which follows from the preceding proposition and \cref{prop:ecr-sc,prop:disc-rem-sc}. \end{proof} By \cref{prop:red-is-eq}, we know that the reflexive symmetric transitive closure of \(\red_{\su'}\) is the equality relation generated by \su'. We therefore prove that this agrees with the equality relation from \Cattsu. \begin{proposition} \label{prop:suprime-equiv} The type theories generated from \su and \su' are equivalent. Terms are equal or well-formed in one theory exactly when they are equal or well-formed in the other, and similar properties hold for types and substitutions. \end{proposition} \begin{proof} We use \cref{lem:subset-lem} for both directions. Since \(\su' \subseteq \su\), we are only required to show that if \((\Gamma, s, t) \in \su\) with \(\Gamma \vdash_{\su'} s : A\) for some \(A : \Type_\Gamma\) then \[ \Gamma \vdash_{\su'} s = t\] If \((\Gamma,s,t) \in \su'\), then the equality follows from the \textsc{rule} constructor.
Otherwise, \(s\) must be an identity and the rule is an instance of endo-coherence removal or pruning. Suppose \(s\) reduces to \(t\) by endo-coherence removal. Then \(s \equiv \id(A,u)\) and \[t \equiv \id(\wk(U^n) \sub {\{A,u\}}, d_n \sub {\{A,u\}}) \equiv \id(A,u) \equiv s\] and so the equality holds by reflexivity. Now assume \(s\) reduces by pruning to \(t\). Letting \(s \equiv \id(A,u)\) and \(n = \dim(A)\), we get:
\begin{align*} t &\equiv \Coh {\lfloor \mathcal{D}^n \sslash p^n \rfloor} {\arr {d_n} {\wk(U^n)} {d_n} \sub {\pi_{p^n}}} {\{A,u\} \sslash p} \\ &= \id(\wk(U^n) \sub {\pi_{p^n}} \sub {\{A,u\} \sslash p^n}, d_n \sub {\pi_{p^n}} \sub {\{A,u\} \sslash p^n})&\text{by endo-coherence removal}\\ &\equiv \id(\wk(U^n),d_n) \sub {\pi_{p^n} \bullet \{A,u\} \sslash p^n}\\ &= \id(\wk(U^n),d_n) \sub {\{A,u\}}&\text{by \cref{prop:prune-ty}}\\ &\equiv \id(\wk(U^n) \sub {\{A,u\}}, d_n \sub {\{A,u\}})\\ &\equiv \id(A,u) \end{align*}
and so the equality holds as required. \end{proof}
We therefore have that two terms \(s\) and \(t\) are equal in \Cattsu if and only if \(s \leftrightsquigarrow_{\su'} t\). To demonstrate normalisation, it therefore remains to show that the reduction system is confluent, for which we employ the strategy introduced in \cref{lem:conf-strat}.
\begin{theorem} \label{thm:su-conf} The reduction \(\red_{\su'}\) is confluent. \end{theorem}
\begin{proof} By \cref{lem:conf-strat} it is sufficient to show that for all \(t \leftsquigarrow s \rightsquigarrow u\) with \(s \rightsquigarrow t\) being a reduction derived from \textsc{rule}, the following diagram can be formed:
\[ \begin{tikzcd}[column sep=tiny] && s \\ t &&&& u \\ & {t'} & {\mathclap{=_{\dim(s)}}} & {u'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \]
We therefore begin by case splitting on the reduction \(s \red t\), ignoring cases where both reductions are identical and ignoring cases which follow by symmetry of other cases.
\textbf{Disc removal:} Suppose \(s \red t\) is a disc removal reduction. Then \(s \equiv \Coh {D^n} {\wk(U^n)} {\{A,t\}}\). We now split on the reduction \(s \red u\). We immediately know that \(s \red u\) cannot be an endo-coherence removal reduction, as \(s\) is not an endo-coherence. It also cannot be a cell reduction as \(\wk(U^n)\) only contains variables and so is in normal form. Let \(s \red u\) be an argument reduction. It must therefore be generated by a reduction on \(\{A,t\}\). If it is a reduction generated by \(A \red A'\) then \(u \red t\) by disc removal and so we are done. Otherwise, it is generated by \(t \red t'\), and so \(u\) reduces by disc removal to \(t'\), while \(t \red t'\) directly. The only remaining case is where \(s \red u\) is an instance of pruning, which forces \(t \equiv \id(B,a)\) for some \(B\) and \(a\). As \(s\) is well-formed, we must have \(n > 0\) and so \(A \equiv \arr b {A'} c\).
Therefore:
\begin{align*} u &\equiv \Coh {\lfloor \mathcal{D}^{n} \sslash p \rfloor} {\wk(U^n) \sub {\pi_{p}}} {\{A,\id(B,a)\} \sslash p}\\ &\equiv \Coh {D^{n-1}} {\wk(U^n) \sub {\{\arr {d_{n-1}} {\wk(U^{n-1})} {d_{n-1}}, \id(\wk(U^{n-1}), d_{n-1})\} }} {\{A',b\}}&\text{by \cref{prop:prune-disc}}\\ &\equiv \Coh {D^{n-1}} {\arr {d_{n-1}} {\wk(U^{n-1})} {d_{n-1}}} {\{A',b\}}\\ &\equiv \id(A',b) \end{align*}
Now as \(s\) is well-formed we have \(\Gamma \vdash \{A,\id(B,a)\} : D^n\) and so by \cref{lem:disc-typing}, we have \(\Gamma \vdash \id(B,a) : A\) and hence by \cref{cor:id-typing} and uniqueness of typing: \[ \arr a {B} a = A \equiv \arr b {A'} c\] and so \(a = b\) and \(B = A'\) and hence \(u \equiv \id(A', b) = \id(B,a) \equiv t\). Since \(\dim(a) = \dim(B) < \dim(s)\), we get \(t =_{\dim(s)} u\) as required.
\textbf{Endo-coherence removal:} Suppose \(s \red t\) is an endo-coherence removal reduction. Then: \[ s \equiv \Coh {\Delta} {\arr a A a} {\sigma} \red \id(A \sub \sigma, a \sub \sigma) \equiv t\] with \(s\) not being an identity. We now split on the reduction \(s \red u\). First consider when it is an argument reduction generated by \(\sigma \red \tau\). Then by \cref{prop:red-sub}, we have \(t \equiv \id(A \sub \sigma, a \sub \sigma) \red^* \id(A \sub \tau, a \sub \tau)\). By endo-coherence removal, \(u \red \id(A \sub \tau, a \sub \tau)\), completing this case. Now suppose the reduction \(s \red u\) is an instance of cell reduction. If it is generated from a reduction \(A \red B\) then by \cref{prop:red-sub}, \(t \red \id(B \sub \sigma, a \sub \sigma)\) and by endo-coherence removal: \[u \equiv \Coh \Delta {\arr a B a} \sigma \red \id(B \sub \sigma, a \sub \sigma)\] We now consider when the reduction is generated by \(\arr a A a \red \arr b A a\), with the case where it is generated by \(\arr a A a \red \arr a A b\) following symmetrically. We consider the reduction sequence from \(u\):
\begin{align*} u &\equiv \Coh \Delta {\arr b A a} {\sigma} \\ &\red \Coh \Delta {\arr b A b} \sigma &\text{by cell reduction}\\ &\red \id(A\sub \sigma,b\sub \sigma) &\text{by endo-coherence removal} \end{align*}
Again by \cref{prop:red-sub}, \(t \equiv \id(A\sub\sigma,a\sub\sigma) \red \id(A\sub\sigma,b\sub\sigma)\), completing the case. Lastly, we consider when \(s \red u\) is a pruning reduction. We suppose \(\Delta = \lfloor \mathcal{D} \rfloor\) and that the pruning is generated from a peak \(p : \Peak_{\mathcal{D}}\). Then: \[ u \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {(\arr a A a) \sub {\pi_p}} {\sigma \sslash p}\] We then have:
\begin{align*} u &\red \id(A \sub {\pi_p} \sub {\sigma\sslash p},a \sub {\pi_p} \sub {\sigma\sslash p})&\text{by \cref{lem:always-ecr}}\\ &\equiv \id(A,a) \sub {\pi_p \bullet \sigma \sslash p} \\ &=_{\dim(s)} \id(A,a) \sub \sigma \end{align*}
where the last (bounded) equality is by \cref{prop:prune-ty} and by noting that \(\dim(A) = \dim(a) < \dim(s)\).
\textbf{Pruning:} Let \(s \red t\) be a reduction by pruning with \[ s \equiv \Coh {\lfloor \mathcal{D} \rfloor} A \sigma\] for some \(\mathcal{D} : \Dyck_0\) with a peak \(p : \Peak_{\mathcal{D}}\) such that \(\lfloor p \rfloor \sub \sigma\) is an identity. Then: \[ t \equiv \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\sigma \sslash p}\] We now split on the reduction \(s \red u\). First suppose it is given by an argument reduction \(\sigma \red \tau\). Identities do not admit head reductions, meaning \(\lfloor p \rfloor \sub \tau\) is still an identity.
Therefore, pruning can be applied to \(u\) to get: \[ u \red \Coh {\lfloor \mathcal{D} \sslash p \rfloor} {A \sub {\pi_p}} {\tau \sslash p}\] Now \(\sigma \sslash p\) is simply \(\sigma\) with two terms removed, and so \(\sigma \sslash p \red^* \tau \sslash p\), meaning \(t\) reduces to the same term as \(u\). If \(s \red u\) is a cell reduction \(A \red B\), then pruning can be applied to \(u\) immediately to get the term: \[\Coh {\lfloor \mathcal{D} \sslash p \rfloor} {B \sub {\pi_p}} {\sigma \sslash p}\] but \(t\) also reduces to this term by \cref{prop:red-sub}. Let \(s \red u\) be a second pruning reduction, at a different peak \(q : \Peak_{\mathcal{D}}\). By \cref{prop:prune-conf}, there is a common reduct: \[ \Coh {\lfloor (\mathcal{D} \sslash p) \sslash q_p \rfloor} {A \sub {\pi_p} \sub{\pi_{q_p}}} {(\sigma \sslash p) \sslash q_p} \] to which both terms reduce by pruning, provided \(\lfloor q_p \rfloor\) and \(\lfloor p_q \rfloor\) are identities. However: \[\lfloor q_p \rfloor \equiv \lfloor q \rfloor \sub {\pi_p}\] and \(\lfloor q \rfloor\) must be an identity for \(s \red u\) to be a valid instance of pruning. Therefore, as identities are preserved by application of substitution, \(\lfloor q_p \rfloor\) is an identity. Similarly, \(\lfloor p_q \rfloor\) is an identity, and so both \(t\) and \(u\) reduce to the term above. Any remaining cases follow by symmetry, completing the proof. \end{proof}
\subsection{Disc trivialisation} \label{sec:properties-cattsu}
We take a brief moment to explore the theory \Cattsu in its entirety. For this section we will further assume that we take the set of operations \(\mathcal{O}\) to be the regular operations. We begin by proving a property of terms over disc contexts, which we call \emph{disc trivialisation}. This is the following structure theorem: in a disc context \(D^n\), every term is either a variable, or an iterated identity on a variable, up to definitional equality. If we restrict to those terms \(t : \Term_{D^n}\) that are full, that is with \(\Supp(t) = \Var(D^n)\), then there is exactly one term (up to definitional equality) at each dimension \(k \geq n\). Hence, the type theory \Cattsu trivialises disc contexts. This property relates the type theory \Cattsu to the definition of strictly unital \(\infty\)-categories of \citeauthor{Batanin2013}~\cite{Batanin2013}, whose \emph{reduced operads} enforce that there is a unique term of each dimension over a linear tree. We now state and prove disc trivialisation, recalling the definition of an iterated canonical identity from \cref{def:canonical-id}.
\begin{theorem}[Disc trivialisation] \label{thm:disc-triv} Suppose \(D^n \vdash t : A\) in \Cattsu. Then \(t\) is equal to an iterated canonical identity on a variable, that is \(t = \id^k(x)\) for some variable \(x \in \Var(D^n)\) and \(k \in \mathbb{N}\). \end{theorem}
\begin{proof} Without loss of generality, we may assume that \(t\) is in \Cattsu normal form, and proceed to prove that \(t\) is an iterated canonical identity. We proceed by induction on subterms of the term \(t\). If \(t\) is a variable then we are done. Otherwise, we assume \(t\) is a coherence term \(\Coh \Delta U \sigma\). We now show that \(\Delta\) must be a disc context by contradiction. We therefore assume that \(\Delta\) is not a disc, and hence \(t\) is not an identity. By induction on subterms, we must have that each term in \(\sigma\) is an iterated canonical identity on a variable.
No locally maximal argument can be an identity, as otherwise pruning could be performed and \(t\) would not be in normal form, and so every locally maximal argument is a variable. Suppose there is some variable \(x\) such that \(x \sub\sigma\) is an identity, and let \(x\) be a variable of maximal dimension with this property. As \(x\) cannot be locally maximal, it must be either the source or the target of some variable \(y\), but this variable \(y\) must be sent to a variable of \(D^n\), which cannot have an identity as its source or target. Therefore, the substitution \(\sigma\) is variable to variable. We now let \(\Gamma\) be the smallest ps-context prefix of \(\Delta\) such that \(\Gamma\) is not a disc. We must have: \[ \Gamma \equiv D^k, (y : A), (f : \arr x A y)\] where \(D^k \vdash_{\mathsf{ps}} x : A\). Furthermore, the last rule used in this derivation must be \textsc{psd}, as if it were \textsc{pse} or \textsc{pss} then \(k = \dim(A)\) and \(\Gamma \equiv D^{k+1}\), breaking the assumption that \(\Gamma\) is not a disc. Therefore, \(D^k \vdash_{\mathsf{ps}} g : \arr w A x\) for some variables \(g\) and \(w\). However, now \(g \sub \sigma\), \(x \sub \sigma\), and \(f \sub \sigma\) are variables of \(D^n\) such that \(\tgt(g \sub \sigma) \equiv x \sub \sigma \equiv \src(f \sub \sigma)\). No such variables exist in \(D^n\) and so we reach a contradiction. We therefore deduce that \(\Delta\) is a disc \(D^n\) for some \(n\). Now \(t \equiv \Coh {D^n} {\arr u A v} \sigma\) and so by induction on subterms, \(u\) and \(v\) are equal to iterated canonical identities. We now split on whether \(t\) is a composition or equivalence. If it is a composition then \(\Supp(u) = \bdry {n-1} - {D^n}\) and \(\Supp(v) = \bdry {n-1} + {D^n}\) and therefore neither \(u\) nor \(v\) is an identity (as then \(A\) would have the same support as \(u\) or \(v\) respectively) and so \(u = d_{n-1}^-\) and \(v = d_{n-1}^+\), but this makes \(t\) a disc removal redex, and so \(t\) is not in normal form. We therefore assume that \(t\) is an equivalence and \(u\) and \(v\) are full. Then \(u\) and \(v\) must be iterated identities on \(d_n\), and must have the same dimension and so are syntactically equal. To avoid \(t\) being an endo-coherence removal redex, it must be an identity \(\id(B,s)\). Now, \(s \equiv \id^k(x)\) for some variable \(x\) (as \(s\) is a subterm of \(t\)), and so if \(k = 0\) then \(\ty(s) \equiv d_{n-1}^- \to d_{n-1}^+\) and if \(k > 0\) then \(\ty(s) \equiv \id^{k-1}(x) \to \id^{k-1}(x)\). In either case, \(\ty(s)\) is in normal form, and so since \(B\) is also a normal form and \(\Gamma \vdash s : B\) (by the well-typing of \(t\) and \cref{cor:id-typing}), we have \(B \equiv \ty(s)\) and so \(t \equiv \id(s) \equiv \id^{k+1}(x)\) as required. \end{proof}
Disc trivialisation allows us to prove the following results concerning terms and substitutions in pasting diagrams.
\begin{theorem} Let \(\mathcal{D}\) be a Dyck word. Let \(t\) be a well-formed \Cattsu term of \(\lfloor \mathcal{D} \rfloor\). Then \(\Supp(t)\) is a ps-context. \end{theorem}
\begin{proof} Suppose, for contradiction, that we have a Dyck word \(\mathcal{D}\) and a term \(t\) where \(\Supp(t)\) is not a ps-context. Assume further that \(\mathcal{D}\) is minimal (in terms of length) where such a term exists. Immediately, \(\mathcal{D} \not\equiv \circleddash\), as all terms have non-empty support. We now examine the locally maximal variables of \(\mathcal{D}\).
There must exist some locally maximal variable \(f : x \to y\) such that \(f \not\in \Supp(t)\), as otherwise \(\Supp(t) = \Var(\lfloor \mathcal{D} \rfloor)\). Now suppose that \(y \not\in\Supp(t)\). Then we let \(p\) be the peak corresponding to \(f\) and consider the term: \[t \sub {\pi_p} : \Term_{\lfloor \mathcal{D}\sslash p \rfloor}\] Then \(\Supp(t \sub {\pi_p}) = \Supp(t)\), which contradicts the minimality of \(\mathcal{D}\), and so \(y \in \Supp(t)\). By a similar argument, \(x\) must also be in \(\Supp(t)\). It is also the case that if such a variable \(f : x \to y\) with \(f \not\in \Supp(t)\) and \(\{x,y\} \subseteq \Supp(t)\) exists, then \(\Supp(t)\) cannot be a ps-context, by an argument involving the linear order on ps-contexts introduced by \citeauthor{finster2017type}~\cite{finster2017type}. Now suppose \(\mathcal{D}\) has a peak \(p\) other than the peak corresponding to \(f\). Then \(f\sub{\pi_p} : x \sub{\pi_p} \to y \sub{\pi_p}\) is a locally maximal variable of \(\lfloor \mathcal{D} \sslash p \rfloor\) with \(f\sub{\pi_p} \not\in \Supp(t \sub {\pi_p})\) and \(\{x \sub {\pi_p}, y \sub {\pi_p}\} \subseteq \Supp(t \sub {\pi_p})\). Hence, \(\Supp(t \sub {\pi_p})\) is not a ps-context, again breaking the minimality of \(\mathcal{D}\). Therefore, \(\mathcal{D}\) has exactly one peak, and so \(\lfloor \mathcal{D} \rfloor \equiv D^n\) for some \(n\). Now by \cref{thm:disc-triv}, \(t\) is \Cattsu equal to a variable \(z\) or an iterated identity on a variable \(z\). Since \Cattsu equality preserves support, we must have \(\Supp(t) = \Supp(z)\), but \(\Supp(z)\) is a disc and so is a ps-context. Hence, no such term \(t\) existed. \end{proof}
Since any \Catt term is also a \Cattsu term, we get the following corollary.
\begin{corollary} \label{cor:supp-ps} If \(\Gamma \vdash t : A\) in \Catt, and \(\Gamma\) is a ps-context, then \(\Supp(t)\) is a ps-context. \end{corollary}
\section{\texorpdfstring{\Cattsua}{Cattsua}} \label{sec:cattsua}
We now move on to defining \Cattsua, the type theory for strictly unital and associative \(\infty\)-categories. \Cattsua extends \Cattsu by replacing the pruning equality with the more general insertion equality, which was introduced in \cref{sec:insertion}. Under certain conditions, insertion can merge more complex terms into a single coherence. As an example, the term \((f * g) * h\), a composite which itself has a composite as one of its arguments, is reduced by insertion to the ternary composite \(f*g*h\), reducing the depth of the term. As we did for \Cattsu, we will prove in this section that \Cattsua satisfies standard meta-theoretic properties, and provide a reduction system for it which is strongly terminating and confluent.
\begin{example} We consider the associator term, and its reductions in \Cattsua. The associator witnesses the associativity law in a weak \(\infty\)-category.
Letting \(\Delta\) be the following ps-context:
\begin{alignat*}{2} \Delta = \lfloor [\emp,\emp,\emp] \rfloor ={}& (w : *),\\ &(x : *),&&(f : w \to x),\\ &(y : *),&&(g : x \to y),\\ &(z : *),&&(h : y \to z) \end{alignat*}
we can define the associator as: \[ \alpha = \Coh \Delta {(f * g) * h \to f * (g * h)} {\id_\Delta}\] This then admits the following reduction sequence in \Cattsua:
\begin{align*} \alpha &\rightsquigarrow \Coh \Delta {f*g*h \to f * (g * h)} {\id_\Delta}&\text{by insertion}\\ &\rightsquigarrow \Coh \Delta {f * g * h \to f * g * h} {\id_\Delta}&\text{by insertion}\\ &\rightsquigarrow \id(f*g*h) &\text{by endo-coherence removal} \end{align*}
\end{example}
We formally define \Cattsua as the version of \Cattr generated by the rule set \sua, which we define below:
\begin{definition} We define the equality rule set \sua for \Cattsua by: \[ \sua = \dr \cup \ecr \cup \insert \] \Cattsua is then the variant of \Cattr where \(\mathcal{R} = \sua\). \end{definition}
As before, when we do not specify an operation set, it should be assumed that the regular operation set is used. When we use the groupoidal operation set, we refer to the resulting theory as \emph{groupoidal \Cattsua}.
\begin{theorem} \label{thm:sua-conds} The rule set \sua is tame and satisfies the support condition. If \(\mathcal{O}\) supports insertion, then \sua also satisfies the preservation condition. \end{theorem}
\begin{proof} By \cref{prop:dr-weak,prop:dr-susp,prop:dr-sub,prop:ecr-props,prop:insert-tame}, each of the disc removal, endo-coherence removal, and insertion sets satisfies the weakening, suspension, and \(\sua\)-substitution conditions. It follows that \(\sua\) satisfies the weakening, suspension, and substitution conditions. Hence, \sua is tame. To prove that the support condition holds for \sua, we use the strategy introduced in \cref{sec:further-conditions} and instead show that \sua satisfies the \(\sua_{\mathsf{S}}\)-support condition. By \cref{lem:supp-sat-conds}, the equality rule set \(\sua_{\mathsf{S}}\), the restriction of \sua to support preserving equalities, is also tame. As it trivially satisfies the support condition, we have by \cref{prop:dr-supp,item:ecr-supp,prop:insert-supp} that disc removal, endo-coherence removal, and insertion satisfy the \(\sua_{\mathsf{S}}\)-support condition. Therefore, \sua satisfies the \(\sua_{\mathsf{S}}\)-support condition and so by \cref{lem:proof-strat-supp} \sua satisfies the support condition. The \sua-preservation condition is satisfied by disc removal (by \cref{prop:dr-preserve}) and endo-coherence removal (by \cref{item:ecr-preserve}). If \(\mathcal{O}\) supports insertion, then insertion also satisfies the \sua-preservation condition by \cref{prop:insert-preserve}. Therefore, \sua satisfies the preservation condition, completing the proof. \end{proof}
While the groupoidal operation set trivially supports insertion, we have not yet proven that the regular operation set, \Reg, supports insertion. This is done now using \cref{thm:sua-conds}.
\begin{proposition} The regular operation set, \Reg, supports insertion. \end{proposition}
\begin{proof} Using that the regular operation set is equal to the standard operation set, we instead prove that the standard operation set supports insertion.
For this, it will be sufficient to prove that for an insertion point \((S, P, T)\), dimension \(n \in \mathbb{N}\), and \(\epsilon \in \{-,+\}\): \[ \bdry n \epsilon S \sub {\kappa_{S,P,T}} = \bdry n \epsilon {\insertion S P T}\] Then:
\begin{align*} \bdry n \epsilon S \sub {\kappa_{S,P,T}} &= \Supp(\stdtm {\bound n S} n \sub {\incbd n \epsilon S}) \sub {\kappa_{S,P,T}}&\text{by \cref{lem:std-supp}}\\ &= \Supp(\stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}})\\ &= \Supp(\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}})&\text{by (*)}\\ &= \bdry n \epsilon {\insertion S P T} &\text{by \cref{lem:std-supp}} \end{align*}
where the equality \((*)\) holds as \sua satisfies the support condition by \cref{thm:sua-conds} and: \[ \insertion S P T \vdash_\sua \stdtm {\bound n S} n \sub {\incbd n \epsilon S \bullet \kappa_{S,P,T}} = \stdtm {\bound n {\insertion S P T}} n \sub {\incbd n \epsilon {\insertion S P T}} \] by \cref{thm:std-insert-props}. \end{proof}
\subsection{Reduction for \texorpdfstring{\Cattsua}{Cattsua}} \label{sec:norm-cattsua}
Using the results of \cref{sec:reduction}, we give a normalisation algorithm for \Cattsua by defining a reduction system which generates the equality relation and proving that this reduction system is strongly terminating and confluent. As with \Cattsu, we cannot use the reduction \(\red_\sua\) directly, as we have seen already that the reduction \(\red_\ecr\) alone is non-terminating. Similarly to pruning, allowing insertions into identity terms also creates non-terminating loops of reductions when combined with endo-coherence removal, as was explained in \cref{sec:reduction-cattsu}. We therefore restrict our reduction so that no head-reductions can be applied to identity terms. Although these restrictions are sufficient to ensure termination, we choose to further restrict the set of insertion reductions, in order to streamline the proof of confluence. Firstly, we only allow insertions of a locally maximal argument when that argument is either an identity or a standard composition. The motivation for this restriction is that identities and standard compositions are the only standard coherences that are in normal form. Moreover, not allowing the insertion of endo-coherences avoids a difficult insertion/argument endo-coherence removal confluence case. We also disallow insertions into a unary composite and insertions of a unary composite, as we have already seen in \cref{sec:further-properties} that discs act as a left and right unit for insertion, and so these two insertion reductions are subsumed by disc removal. Further, disallowing the insertion of discs removes another case where an insertable standard coherence is not in normal form. We now define the resulting reduction system.
\begin{definition} Define the reduction \(\red_{\sua'}\) to be the reduction generated by the equality rule set \(\sua'\) where: \[\sua' = \dr \cup \ecr' \cup \insert'\] where \(\ecr'\) is the endo-coherence removal set without the identity-to-identity reductions, and \(\insert'\) is the insertion rule set restricted to insertion redexes \((S,P,T,\Gamma,L,M)\) and types \(A\) such that \(\SCoh S A L\) is not an identity or a unary composite, and \(L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\) is an identity or a standard composite which is not a unary composite.
\end{definition}
A simple observation shows that \(\sua'\) is tame, as suspension and the application of substitution cannot transform a term which is not an identity or unary composite into one. We further justify the restrictions made to insertion by showing that many insertion reductions can still be performed, starting with the following technical lemma.
\begin{lemma} \label{lem:insertion-change-max} If \(P\) is a branch of \(S\), and \(L, L' : S \to \Gamma\) are labellings differing only on \(\olsi P\), then the following holds for the insertion redex \((S,P,T,\Gamma,L,M)\): \[\insertion L P M \equiv \insertion {L'} P M\] \end{lemma}
\begin{proof} By inspection of the definition, \(\insertion L P M\) does not use the term \(L(\olsi P)\). \end{proof}
We now show that many insertion reductions can still be simulated up to bounded equality.
\begin{lemma} \label{lem:insertable} Let \((S,P,T,\Gamma, L, M)\) be an insertion redex. Further suppose that \(a \equiv \SCoh S A L\) is not an identity or disc. Then there exists a term \(s\) with: \[a \red_{\sua'}^* s =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\] even when \(L(\olsi P)\) is a unary composite or is not a standard composite or identity. \end{lemma}
\begin{proof} We proceed by induction on \(\lh(P) - \dep(T)\). If \(\lh(P) - \dep(T) = 0\) then \(\stdcoh T {\lh(P)}\) is a composite. The only case for which insertion cannot be performed is when \(\stdcoh T {\lh(P)}\) is a unary composite, that is, when \(T = D^{\lh(P)}\). Now by \cref{lem:disc-insertion-2}, \(\insertion S P T \equiv S\), \(\insertion L P M \equiv^{\max} L\) and \(\kappa_{S,P,T} = \id_S\) and so \[a =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\] We now assume that \(\lh(P) > \dim(T)\). We may also assume without loss of generality that \(\stdcoh T {\lh(P)}\) is not an identity, as otherwise it would be immediately insertable. This allows us to perform endo-coherence removal to get: \[\stdcoh T {\lh(P)} \sub M \red \id(\stdty T {\lh(P) - 1}, \stdtm T {\lh(P)- 1}) \sub M\] Now suppose \(b \equiv \Coh S A {L'}\) where \(L'\) is the result of applying the above reduction to the term of \(L\) corresponding to \(\olsi P\). Since \(L'(\olsi P)\) is now an identity it can be inserted to get \(b \red c\) where:
\begin{align*} c &\equiv \SCoh {S \sslash P} {A \sub {\pi_P}} {\insertion {L'} P {(\{\stdtm T {\lh(P) - 1}\} \bullet M)}}\\ &\equiv \SCoh {S \sslash P} {A \sub {\pi_P}} {\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)}} \end{align*}
where \(\stdtm T {\lh(P) - 1} \equiv \stdcoh T {\lh(P) - 1}\) as if \(\stdtm T {\lh(P)-1}\) were a variable then \(\stdcoh T {\lh(P)}\) would be an identity. We now wish to show that \(2 + \bh(P) \leq \lh(P)\) so that \(P'\) exists as a branch of \(S \sslash P\). Since we always have \(1 + \bh(P) \leq \lh(P)\), we consider the case where \(1 + \bh(P) = \lh(P)\). We know that \(\bh(P) \leq \dep(T) \leq \lh(P)\) and so one of these inequalities must be an equality. If \(\dep(T) = \lh(P)\) then \(\stdcoh T {\lh(P)}\) is a standard composite. If \(\dep(T) = \bh(P)\) then \(\th(T) = \dep(T)\) and so \(T\) is linear. However, this makes \(\stdcoh T {\lh(P)}\) an identity. Either case is a contradiction and so \(2 + \bh(P) \leq \lh(P)\) and so \(P'\) is a branch of \(S \sslash P\).
By \cref{lem:pruned-bp,lem:iota-kappa-comm}, we now have:
\begin{align*} &\phantom{{}\equiv{}}\olsi {P'} \sub {\insertion{L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)}} \\ &\equiv d_{\lh(P) - 1} \sub {\iota_{S,P,D^{\lh(P) - 1}} \bullet (\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} \\ &\equiv d_{\lh(P) - 1} \sub {\{\stdcoh T {\lh(P) - 1}\} \bullet M} \\ &\equiv \stdcoh T {\lh(P) - 1}\sub M \end{align*}
As \(\lh(P') - \dim(T) = \lh(P) - \dim(T) - 1\) we can use the induction hypothesis to get that \(c \leadsto d\) and:
\begin{align*} d =_{\dim(a)}{} &\SCoh {\insertion {(S \sslash P)} {P'} T} {A \sub {\pi_P \bullet \kappa_{S\sslash P,P',T}}} {\\&\insertion {(\insertion {L'} P {(\{\stdcoh T {\lh(P) - 1}\} \bullet M)})} {P'} {M}} \end{align*}
By \cref{lem:pruned-bp,lem:insertion-change-max}, \begin{equation*} d =_{\dim(a)} \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M} \end{equation*} which completes the proof as \(a \leadsto^* d\). \end{proof}
We further show that insertions into discs can be simulated by disc removal.
\begin{lemma} \label{lem:disc-insertion-red} Let \((D^n,P,T,\Gamma,L,M)\) be an insertion redex and let \(a \equiv \stdcoh {D^n} n \sub L\). Then: \[ a \red_{\sua'} s =_{n} \SCoh {\insertion {D^n} P T} {\stdty {D^n} n \sub \kappa} {\insertion L P M}\] \end{lemma}
\begin{proof} We have the equality:
\begin{align*} \SCoh {\insertion {D^n} P T} {\stdty {D^n} n \sub \kappa} {\insertion L P M} &\equiv \SCoh T {\stdty {D^n} n \sub {\kappa_{D^n,P,T}}} M&\text{by \cref{lem:disc-insertion-1}}\\ &=_n \SCoh T {\stdty T n} M&\text{by \cref{thm:std-insert-props}}\\ &\equiv \stdcoh T n \sub M\\ &\equiv L(\olsi P) \end{align*}
Therefore, the reduction \(a \red s \equiv L(\olsi P)\) is given by disc removal. \end{proof}
Using these lemmas, we now show that the type theories \Cattsua and \(\Catt_{\sua'}\) are equivalent.
\begin{proposition} The type theories generated by \sua and \sua' are equivalent. Terms, types, and substitutions are equal or well-formed in one theory exactly when they are equal or well-formed in the other. \end{proposition}
\begin{proof} Both directions proceed by \cref{lem:subset-lem}. Since \(\sua' \subseteq \sua\), it suffices to show that if \((\Gamma,s,t) \in \sua\) with \(\Gamma \vdash_{\sua'} s : A\) for some type \(A\) then: \[ \Gamma \vdash_{\sua'} s = t\] If \((\Gamma,s,t) \in \sua'\), then there is nothing to do. If it is in \(\ecr\), then the argument is the same as in the proof of \cref{prop:suprime-equiv}. We therefore assume \((\Gamma,s,t) \in \insert\), and so there must be some insertion redex \((S,P,T,\Gamma,L,M)\) such that \(s \equiv \lfloor \SCoh S B L \rfloor\) and \[ t \equiv \lfloor \SCoh {\insertion S P T} {B \sub {\kappa_{S,P,T}}} {\insertion L P M} \rfloor \] By an induction on dimension, we assume that the theories generated by \sua and \(\sua'\) are already equivalent for terms of dimension less than \(\dim(s)\). We begin a case analysis of such reductions that are not in \(\insert'\). If \(s\) is an identity, then \(B \equiv b \to b\) for some term \(b\) and so \(t\) is an endo-coherence. If \(t\) is already an identity, then \(s \equiv t\). Otherwise:
\begin{align*} \Gamma \vdash_{\sua'} t &= \id(b \sub {\kappa_{S,P,T}}) \sub {\insertion L P M}\\ &\equiv \id(b) \sub {\kappa_{S,P,T} \bullet (\insertion L P M)}\\ &= \id(b) \sub L\\ &\equiv s \end{align*}
where the first equality is by endo-coherence removal, and the second equality is by \cref{lem:ins-comm-max}, appealing to the induction on dimension.
If \(s\) is a unary composite we apply \cref{lem:disc-insertion-red} and use the inductive hypothesis on dimension. Otherwise, we are done by \cref{lem:insertable} and the inductive hypothesis on dimension. \end{proof}
Having shown that the reflexive symmetric transitive closure of the reduction \(\red_{\sua'}\) agrees with the equality of \Cattsua, we move on to showing that this reduction is strongly terminating. To do this we appeal to \cref{lem:termination-lem}, and show that all reductions reduce the syntactic complexity of the terms involved.
\begin{lemma} \label{lem:insert-sc-prop} The following inequality holds for any insertion redex \((S,P,T,\Gamma,L,M)\): \[\sc(\insertion L P M) < \sc(L)\] \end{lemma}
\begin{proof} We extend the notion of syntactic complexity to labellings in the obvious way. We begin by noting that:
\begin{align*} \sc(L) &= \left(\bighash_{p \neq \olsi P} \sc(L(p))\right) \+ \sc(L(\olsi P))\\ &= \left(\bighash_{p \neq \olsi P} \sc(L(p))\right) \+ \sc(\stdcoh T {\lh(P)} \sub M)\\ &> \left(\bighash_{p\neq \olsi P} \sc(L(p))\right) \+ \sc(M) \end{align*}
Further, we show that for all labels \(L\) and \(M\) satisfying the appropriate conditions: \[\sc(\insertion L P M) \leq \bighash_{p\neq \olsi P} \sc(L(p)) \+ \sc(M) \] which we do by induction on \(P\). If \(P = [k]\) then it is clear that \(\insertion L P M\) contains all the terms of \(M\) and some terms of \(L\), and crucially not \(L(\olsi P)\). If instead \(P = k :: P_2\) then by the induction hypothesis we get that: \[\sc(\insertion {L_k} {P_2} {M_0}) \leq \bighash_{p\neq \olsi {P_2}} \sc(L_k(p)) \+ \sc(M_1)\] It is then clear again that \(\insertion L P M\) contains terms from \(M\) and terms of \(L\) which are not \(L(\olsi P)\), and so the inequality holds. \end{proof}
We can now show that insertion reductions reduce syntactic complexity.
\begin{proposition} \label{prop:insert-sc} Let \(s \red t\) be an instance of insertion. If \(s\) is not an identity then \(\sc(s) > \sc(t)\). \end{proposition}
\begin{proof} Let \((S,P,T,\Gamma,L,M)\) be an insertion redex so that: \[\SCoh S A L \red \SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M}\] by insertion. By assumption \(\SCoh S A L\) is not an identity. Then:
\begin{align*} \sc(t) &= \sc(\SCoh {\insertion S P T} {A \sub \kappa} {\insertion L P M})\\ &\leq 2\omega^{\dim(A)} \+ \sc(\insertion L P M)\\ &< 2\omega^{\dim(A)} \+ \sc(L) &\text{by \cref{lem:insert-sc-prop}}\\ &\leq \sc(\SCoh S A L)\\ &= \sc(s) \end{align*}
and so \(\sc(s) > \sc(t)\), completing the proof. \end{proof}
\begin{corollary} The reduction system \(\red_{\sua'}\) is strongly terminating. \end{corollary}
\begin{proof} By \cref{lem:termination-lem}, it suffices to show that each rule of \(\sua'\) reduces syntactic complexity, which follows from \cref{prop:disc-rem-sc,prop:ecr-sc,prop:insert-sc}. \end{proof}
\subsection{Confluence of \texorpdfstring{\Cattsua}{Cattsua}} \label{sec:confluence-cattsua}
In this section, we prove the following theorem:
\begin{theorem} \label{thm:sua-conf} The reduction \(\red_{\sua'}\) is confluent. \end{theorem}
The confluence proof for \Cattsua is significantly more complex than the corresponding proof for \Cattsu. The primary difficulty with \Cattsua is that a term can have an insertion redex where the term to be inserted admits a head reduction. In particular, consider the case where \(a \equiv \SCoh S A L \red b\) is an instance of insertion along some branch \(P\), and \(a \red c\) is an insertion on the argument \(L(\olsi P)\).
The difficulty of this critical pair is that \(L(\olsi P)\) need not be in head normal form, and furthermore, the reduction \(a \leadsto c\) can make the original insertion invalid. This does not occur in the predecessor theory \Cattsu, where only identities can be pruned, and all reducts of identities are again identities. We will prove this theorem using \cref{lem:conf-strat}. It is therefore sufficient to show that whenever \(b \leftsquigarrow a \red c\), with \(a \red b\) being a reduction derived from \textsc{rule}, the following diagram can be formed:
\[ \begin{tikzcd}[column sep=tiny] && a \\ b &&&& c \\ & {b'} & {\mathclap{=_{\dim(a)}}} & {c'} \arrow[squiggly, from=1-3, to=2-5] \arrow[squiggly, from=1-3, to=2-1] \arrow["{*}"', squiggly, from=2-1, to=3-2] \arrow["{*}", squiggly, from=2-5, to=3-4] \end{tikzcd} \]
We split by cases on the reduction \(a \red b\), ignoring cases where both reductions are identical and ignoring cases which follow by symmetry of other cases. Any cases which do not mention insertion will follow from an identical argument to the one given in the proof of \cref{thm:su-conf}, and so we omit these here. We can therefore assume without loss of generality that \(a \leadsto b\) is an insertion along redex \((S,P,T,\Gamma,L,M)\) such that \(a\) is not an identity or unary composite and \(\stdcoh T {\lh(P)}\) is an identity or a standard composite which is not unary. We now split on the reduction \(a \red c\).
\paragraph{Insertion on the inserted argument \(\bm{L(\olsi P)}\)} Suppose \(\stdcoh T {\lh(P)} \sub M\) admits an insertion along redex \((T, Q, U, \Gamma, M, N)\). Then: \[\stdcoh T {\lh(P)} \sub M \red \SCoh {\insertion T Q U} {\stdty T {\lh(P)}\sub{\kappa_{T,Q,U}}} {\insertion M Q N}\] We then have \(c \equiv \SCoh S A {L'}\) where \(L'\) is \(L\) with the reduction above applied. We can conclude that \(\stdcoh T {\lh(P)}\) must be a composite (i.e.\ not an identity) as otherwise the second insertion would not be possible. Similarly, \(T\) cannot be linear as otherwise \(\stdcoh T {\lh(P)}\) would be a unary composite. We now need the following lemmas, the second of which is a directed version of \cref{thm:std-insert-props} with more conditions.
\begin{lemma} \label{lem:comp-to-tm} For all \(n\) and \(S\), \(\stdcoh S n \red^* \stdtm S n\). \end{lemma}
\begin{proof} The only case in which \(\stdcoh S n \neq \stdtm S n\) is when \(S = D^n\), in which case a single disc removal gives the required reduction. \end{proof}
\begin{lemma} \label{lem:standard-type-exterior-reduct} Let \((S,P,T)\) be an insertion point. Then, if \(S\) is not linear or \(n \leq \dep(S)\), we have \(\stdty S n \sub{\kappa_{S,P,T}} \red^* \stdty {\insertion S P T} n\); and if \(\dep(S) \leq n\), and \(S\) is not linear or \(\dep(S) = n\), then \(\stdtm S n \sub{\kappa_{S,P,T}} \red^* \stdtm {\insertion S P T} n\). \end{lemma}
\begin{proof} We proceed by induction on \(n\), starting with the statement for types. If \(n = 0\) then both standard types are \(\star\), so we are done.
Otherwise, we have:
\begin{alignat*}{3} \stdty S {n+1} \sub {\kappa_{S,P,T}} \equiv{} &\stdtm {\bound n S} n \sub {\incbd n - S} \sub {\kappa_{S,P,T}} &\qquad& \stdty {\insertion S P T} {n+1} \equiv{}&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n - {\insertion S P T}} \\ &\to_{\stdty S n \sub {\kappa_{S,P,T}}} &&&&\to_{\stdty {\insertion S P T} n}\\ &\stdtm {\bound n S} n \sub {\incbd n + S} \sub {\kappa_{S,P,T}}&&&&\stdtm {\bound n {\insertion S P T}} n \sub {\incbd n + {\insertion S P T}} \end{alignat*}
By the inductive hypothesis, \(\stdty S n \sub{\kappa_{S,P,T}} \red^* \stdty {\insertion S P T} n\), and so we need to show that: \[\stdtm {\bound n S} n\sub{\incbd n \epsilon S \bullet \kappa_{S,P,T}} \red^* \stdtm {\bound n {\insertion S P T}} n \sub{\incbd n \epsilon {\insertion S P T}}\] We now note that either the conditions for \cref{lem:insertion-bd-1} or \cref{lem:insertion-bd-2} must hold. If the conditions for \cref{lem:insertion-bd-1} hold then (as everything is well-formed in \Catt) we get that the required reduction is trivial. Therefore, we focus on the second case. Here we get from \cref{lem:insertion-bd-2} that: \[\stdtm {\bound n S} n\sub{\incbd n \epsilon S \bullet \kappa_{S,P,T}} \equiv \stdtm {\bound n S} n\sub{\kappa_{\bound n S,\bound n P,\bound n T} \bullet \incbd n \epsilon {\insertion S P T}}\] We can then apply the inductive hypothesis for terms, as if \(n \leq \dim(S)\) then \(\dep(\bound n S) = n\), and otherwise \(\bound n S = S\) is not linear; in either case we get the required reduction. Now we move on to the case for terms. If \(\stdtm S n\) is a variable, then we must have that \(S\) is linear and so \(S = D^n\). We must also have in this case that \(\stdtm S n = \olsi P\). Then by \cref{lem:iota-kappa-comm}, \(\stdtm S n \sub {\kappa_{S,P,T}} \equiv \stdcoh T n \sub {\iota_{S,P,T}}\) and then by \cref{lem:disc-insertion-1,lem:comp-to-tm} this reduces to \(\stdtm {\insertion S P T} n\) as required. If \(\stdtm S n\) is not a variable, then \(\stdtm S n \equiv \stdcoh S n\), and \(\stdcoh S n\) cannot be an identity (as either \(S\) is non-linear or \(n = \dim(S)\)). By \cref{lem:iota-kappa-comm} and other assumptions we get that \(\stdcoh S n \sub {\kappa_{S,P,T}}\) admits an insertion along branching point \(P\) and so:
\begin{alignat*}{2} \stdtm S n\sub{\kappa_{S,P,T}} &\equiv{} &&\stdcoh S n\sub {\kappa_{S,P,T}}\\ &\red{} &&\SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\insertion {\kappa_{S,P,T}} P {\iota_{S,P,T}}}\\ &\equiv{} &&\SCoh {\insertion S P T} {\stdty S n \sub {\kappa_{S,P,T}}} {\id}\\ &\red^*{} &&\SCoh {\insertion S P T} {\stdty {\insertion S P T} n} {\id}\\ &\equiv{} &&\stdcoh {\insertion S P T} n\\ &\red^*{} &&\stdtm {\insertion S P T} n \end{alignat*}
where the second equivalence comes from \cref{lem:kappa-iota-insert}, the second reduction from the inductive hypothesis (which is well-founded as the proof for types only uses the proof for terms on strictly lower values of \(n\)), and the last reduction from \cref{lem:comp-to-tm}. \end{proof}
By this lemma (as \(T\) is not linear), we have \[\stdty T {\lh(P)}\sub{\kappa_{T,Q,U}} \red^* \stdty {\insertion T Q U} {\lh(P)}\] and so \(\stdcoh T {\lh(P)} \sub M \red^* \stdcoh {\insertion T Q U} {\lh(P)} \sub {\insertion M Q N}\). Let \(c'\) be the term obtained by applying this further reduction to the appropriate argument.
Now by \cref{lem:insert-lin-height}, we have that \(\th(\insertion T Q U) \geq \th(T)\) and so by \cref{lem:insertable}, there is \(c' \leadsto^* c''\) with: \begin{equation*} c'' =_{\dim(a)} \SCoh {\insertion S P {(\insertion T Q U)}} {A \sub {\kappa_{S,P,\insertion T Q U}}} {\insertion L P {(\insertion M Q N)}} \end{equation*} We now examine how \(b\) reduces. As \(T\) is not linear, there is a branch \(\insertion S P Q\) of \(\insertion S P T\) and we get the following by \cref{lem:ins-comm-max}: \begin{equation*} \olsi {\insertion S P Q} \sub {\insertion L P M} \equiv \olsi Q \sub {\iota_{S,P,T} \bullet (\insertion L P M)} \equiv \olsi Q \sub M \equiv \stdcoh U {\lh(Q)}\sub N \end{equation*} Since \(\th(U) \geq \bh(Q) = \bh(\insertion S P Q)\) we can reduce \(b\) to \(b'\) by insertion as follows: \begin{equation*} b' \equiv{} \SCoh {\insertion {(\insertion S P T)} {\insertion S P Q} U} {A \sub {\kappa_{S,P,T} \bullet \kappa_{\insertion S P T, \insertion S P Q, U}}} {\insertion {(\insertion L P M)} {\insertion S P Q} N} \end{equation*} and then by \cref{lem:inserted-insertion} we get \(b' =_{\dim(a)} c''\) as required.
\paragraph{Argument reduction on the inserted argument \(\bm{L(\olsi P)}\)} Suppose \(M \leadsto M'\), and \(L'\) is \(L\) but with the argument for \(\olsi P\) replaced by \(\stdcoh T {\lh(P)} \sub {M'}\), such that \(L \red L'\) and \(a \red c \equiv \Coh S A {L'}\). Then \(c\) admits an insertion and reduces as follows: \[c \leadsto c' \equiv \Coh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion {L'} P {M'}}\] Since each term in \(\insertion {L} P {M}\) is a term of \(L\) or a term of \(M\), we can simply apply the same reductions from \(L \red L'\) and \(M \red M'\) to get \(\insertion L P M \red^* \insertion {L'} P {M'}\). Therefore, \(b \red^* c'\).
\paragraph{Other reduction on the inserted argument \(\bm{L(\olsi P)}\)} The argument \(L(\olsi P)\) is either an identity or a standard composite which is not unary. Therefore, the type contained in the coherence is in normal form and hence a cell reduction cannot be applied. Further, disc removal cannot be applied, as \(L(\olsi P)\) is not a unary composite, and endo-coherence removal cannot be applied as if \(L(\olsi P)\) is an endo-coherence then it is an identity. Hence, there are no other reductions that can be applied to the inserted argument and so this case is vacuous.
\paragraph{Reduction of non-inserted argument} Suppose \(L \leadsto L'\) along an argument which is not \(\olsi P\) and \(c \equiv \Coh S A {L'}\). Then as \(L'(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\), an insertion can still be performed on \(c\) to get: \[ c \leadsto c' \equiv \SCoh {\insertion S P T} {A \sub {\kappa_{S,P,T}}} {\insertion {L'} P M}\] Since the terms of \(\insertion L P M\) are a subset of the terms of \(L\) and \(M\), we get \(\insertion L P M \red^* \insertion {L'} P M\) and so \(b \red^* c'\).
\paragraph{Disc removal} By assumption, insertion cannot be applied to unary composites, and so this case is vacuous.
\paragraph{Endo-coherence removal} Suppose \(A \equiv \arr s B s\) and \(a \red c\) by endo-coherence removal.
In this case \(c \equiv \id(B,s) \sub L\) and \[ b \equiv \Coh {\insertion S P T} {(\arr s B s) \sub {\kappa_{S,P,T}}} {\insertion L P M}\] which reduces by endo-coherence removal to: \[b' \equiv \id(B, s) \sub {\kappa_{S,P,T} \bullet (\insertion L P M)}\] By \cref{lem:ins-comm-max}, we have that \(\kappa_{S,P,T} \bullet (\insertion L P M) =_{\dim(S)} L\) and so \(b' =_{\dim(S)} c\) and since \(\dim(S) \leq \dim(a)\), we get \(b' =_{\dim(a)} c\) as required.
\paragraph{Cell reduction} If \(A \red B\) and \(c \equiv \SCoh S B L\) is obtained from cell reduction, then if \(c\) is not an identity or disc it admits an insertion to reduce to: \[c' \equiv \SCoh {\insertion S P T} {B \sub {\kappa_{S,P,T}}} {\insertion L P M}\] As reduction is compatible with substitution, \(b\) also reduces to~\(c'\). If instead \(c\) is an identity then
\begin{align*} b &\equiv \SCoh {\insertion {D^n} P T} {A \sub {\kappa_{S,P,T}}} {\insertion L P M}\\ &\red \SCoh {\insertion {D^n} P T} {\stdty {D^n} {n+1}\sub {\kappa_{S,P,T}}} {(\insertion L P M)}\\ &\red^* \id(d_n) \sub {\kappa_{S,P,T} \bullet \insertion L P M}\\ &=_{n+1} \id(d_n) \sub L\\ &\equiv c \end{align*}
where the second reduction is due to \cref{lem:always-ecr} and the equality is due to \cref{lem:ins-comm-max}. If \(c\) is a disc then \cref{lem:disc-insertion-red} can be applied to get that \(c\) reduces to a term \(c''\) with \(c'' =_{n+1} c'\) and \(b \red c'\), completing this case.
\paragraph{Insertion} Suppose \(a \leadsto c\) is also an insertion, along a branch \(Q\) of \(S\). We now split on whether \(\olsi P = \olsi Q\). First suppose \(\olsi P = \olsi Q\); then by \cref{lem:insertion-irrel}, we have \(b =_{\dim(a)} c\). Suppose now that \(\olsi P \neq \olsi Q\), and that \(L(\olsi Q) \equiv \stdcoh U {\lh(Q)} \sub N\), such that: \[c \equiv \SCoh {\insertion S Q U} {A \sub {\kappa_{S,Q,U}}} {\insertion L Q N}\] We now consider the case where \(b\) is an identity. As \(P\) and \(Q\) are distinct branches of \(S\), we must have that \(S\) itself is not linear. Therefore, the insertion along \(P\) must be an insertion of an identity. Further, for \(b\) to have the correct type for an identity, we must have that \(A \sub {\pi_P} \equiv \SPath(\olsi Q) \to \SPath(\olsi Q)\). The only path sent to \(\olsi Q\) by \(\pi_P\) is \(\olsi Q\) itself, and so \(A \equiv \SPath(\olsi Q) \to \SPath(\olsi Q)\). Now, by \cref{lem:iota-kappa-comm}:
\begin{align*} c &\equiv \SCoh {\insertion S Q U} {\stdcoh U {\lh(Q)} \sub {\iota} \to \stdcoh U {\lh(Q)} \sub {\iota}} {\insertion L Q N}\\ &\red \id(\stdcoh U {\lh(Q)} \sub \iota) \sub {\insertion L Q N}&\text{by endo-coherence removal}\\ &\equiv \id(\stdcoh U {\lh(Q)}) \sub N & \text{by \cref{lem:ins-comm-max}} \end{align*}
Then, \(\insertion L P M\) sends \(\olsi Q\) to \(L(\olsi Q) \equiv \stdcoh U {\lh(Q)} \sub N\), and so \(b \equiv \id(\stdcoh U {\lh(Q)}) \sub N\). The case where \(c\) is an identity is symmetric, so we now consider when neither \(b\) nor \(c\) is an identity.
We now observe that \(b\) and \(c\) further reduce as follows:
\begin{align*} b &\red^* b' =_{\dim(a)} \SCoh {\insertion {(\insertion S P T)} {\insertion Q P T} U} {A\sub{\kappa_{S,P,T} \bullet \kappa_{\insertion S P T, \insertion Q P T, U}}} {\insertion {(\insertion L P M)} {\insertion Q P T} N}\\ c &\red^* c' =_{\dim(a)} \SCoh {\insertion {(\insertion S Q U)} {\insertion P Q U} T} {A\sub{\kappa_{S,Q,U} \bullet \kappa_{\insertion S Q U, \insertion P Q U, T}}} {\insertion {(\insertion L Q N)} {\insertion P Q U} M} \end{align*}
We show that the first reduction is valid, with the validity of the second holding by symmetry. If \(b\) is a unary composite then we apply \cref{lem:disc-insertion-red} to obtain a suitable \(b'\). Otherwise, we obtain the reduction via insertion, noting that:
\begin{align*} \olsi{\insertion Q P T} \sub {\insertion L P M} &\equiv \olsi{Q} \sub{\kappa} \sub{\insertion L P M}\\ &\equiv L(\olsi Q)\\ &\equiv \stdcoh U {\lh(Q)}\sub N\\ &\equiv \stdcoh U {\lh(\insertion Q P T)}\sub N \end{align*}
as required for the insertion, with the third equality coming from \cref{lem:ins-comm-max}. Lastly, the trunk height condition is satisfied as \(\bh(Q) = \bh(\insertion Q P T)\). Therefore, both reductions are valid. We now need the following lemma to complete the proof:
\begin{lemma} Let \((S, P, T, \Gamma, L, M)\) be an insertion redex. Then: \[ \insertion L P M =_{\bh(P)+1} \insertionprime L P M\] \end{lemma}
\begin{proof} By \cref{prop:insertion-prime-eq}, the two labellings are equal. By inspection of the definition, the maximum dimension of terms that differ is \(\bh(P)\). \end{proof}
By the above and \cref{lem:insertion-different}, \(b' =_{\dim(a)} c'\). This completes all cases of \cref{thm:sua-conf}.
\section{Towards normalisation by evaluation} \label{sec:towards-nbe}
In this section, the Rust implementation of \Catt, \Cattsu, and \Cattsua, which can be found at \cite{alex_rice_2024_10964705}, is introduced. This implementation takes the form of an interpreter, allowing terms of \Catt to be written in a convenient syntax which can be mechanically checked. The implementation aids the user in writing \Catt terms by automatically constructing standard composites, allowing terms to be bound to top-level names, implicitly suspending terms, automatically filling arguments which are not locally maximal, and providing informative error messages to the user when typechecking fails. We highlight three points of our implementation:
\begin{itemize} \item The typechecker uses \emph{bidirectional typing}~\cite{10.1145/3450952} to mix ``inference'' and ``checking'' rules. Although types for \Catt can always be inferred, we find ourselves in the unusual situation where in some cases the context a term lives in can be inferred, and in some cases it must be provided. We expand on this type system in \cref{sec:typechecking}. \item Tree contexts (see \cref{sec:trees}) are given an explicit representation in the tool. The syntax in the theory is then split into syntax over a tree context and syntax over an arbitrary context. Syntax over a tree context can then use paths instead of de Bruijn levels to reference positions in the context, and substitutions from tree contexts can be given by labellings. We explore this syntax in \cref{sec:nbe-syntax}. \item During typechecking, the equality between types must be checked, which is done by syntactically comparing the normal form of each type.
In this implementation, an approach inspired by \emph{normalisation by evaluation} is taken, as opposed to the reduction-based approaches used in the previous sections. \end{itemize}
Normalisation by evaluation (NbE) (see \cite{abel2013normalization} for an introduction) can be viewed as a method of evaluating terms with ``unknowns''. Equivalently, NbE defines a semantic model of the theory, and interprets each constructor of the type theory in these semantics. When equipped with a method for transforming elements of this model back to terms of the type theory (referred to as \emph{quoting}), the normal form of a term can be calculated directly by recursion on its structure. Compared to the reduction-based approach taken in the previous sections, which simplifies the term via a series of locally applied reduction rules, NbE takes a more global approach, deconstructing the original term and using it to synthesise a normal form. The form of NbE implemented in the tool is largely inspired by the paper \citetitle{gratzer2019implementing}~\cite{gratzer2019implementing}, although we note that the form of the theory \Catt is vastly different to the modal type theory they present; \Catt does not have lambda abstraction or application in the usual sense, which makes adapting NbE techniques from the literature difficult. Nevertheless, the overall form of the evaluation is similar.
\begin{figure}[ht]
\centering
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXHRleHR7UmF3IHN5bnRheH0iXSxbMCwyLCJcXHRleHR7Q2hlY2tlZCBzeW50YXh9Il0sWzAsNCwiXFx0ZXh0e05vcm1hbC1mb3JtIHN5bnRheH0iXSxbMCwxLCJcXG1hdGhzZntjaGVja30iLDAseyJjdXJ2ZSI6LTR9XSxbMCwxLCJcXG1hdGhzZntpbmZlcn0iLDIseyJjdXJ2ZSI6NH1dLFsxLDAsIlxcbWF0aHNme3RvXFxfcmF3fSIsMV0sWzEsMiwiXFxtYXRoc2Z7ZXZhbH0iLDAseyJjdXJ2ZSI6LTR9XSxbMiwxLCJcXG1hdGhzZntxdW90ZX0iLDAseyJjdXJ2ZSI6LTR9XV0= % tex-fmt: skip
\begin{tikzcd} {\text{Raw syntax}} \\ \\ {\text{Core syntax}} \\ \\ {\text{Normal form syntax}} \arrow["{\mathsf{check}}", curve={height=-24pt}, from=1-1, to=3-1] \arrow["{\mathsf{infer}}"', curve={height=24pt}, from=1-1, to=3-1] \arrow["{\mathsf{to\_raw}}"{description}, from=3-1, to=1-1] \arrow["{\eval}", curve={height=-24pt}, from=3-1, to=5-1] \arrow["{\quote}", curve={height=-24pt}, from=5-1, to=3-1] \end{tikzcd}
\caption{Implementation overview.}
\label{fig:overview}
\end{figure}
A high-level overview of the implementation is given in \cref{fig:overview}. We pause to explain the purpose of each component:
\begin{itemize} \item The \emph{raw syntax} is the syntax that the user of the tool interacts with. We maintain no invariants over the well-formedness of the raw syntax, and it allows the user to omit arbitrary arguments. The primary purpose of the raw syntax is to be the target of parsing, and conversely to facilitate the pretty-printing of terms. We also specify a command language around this raw syntax which is used to interact with the tool. \item The \emph{core syntax} is the result of the typechecking procedure. Syntax of this form is known to be well-formed, and all implicit arguments have been filled in at this point. The terms of this syntax resemble the structured terms of \cref{sec:structured-terms}, with various common operations of \Catt being defined as constructors. In contrast to previous representations of \Catt in this thesis, the application of substitution is treated as a term former, instead of an operation. \item The \emph{normal form syntax} represents the normal forms of each of the type theories \Cattsua, \Cattsu, and \Catt itself.
This syntax is also always assumed to be well-formed, and is the closest to the original syntax of \Catt. \item The \textsf{eval} and \textsf{quote} functions convert syntax between core syntax and normal form syntax. For each constructor in the core syntax, evaluation computes the result of the corresponding operation, quotienting by the rules of \Cattsu or \Cattsua when applicable. We note that despite \Catt itself having no computation, evaluation must still process operations such as suspension and substitution application. Quotation converts normal form syntax back to core syntax, and in our case is a trivial inclusion. \item The \textsf{infer} and \textsf{check} functions perform typechecking while converting raw syntax into core syntax. The two functions are mutually dependent, and may also need to convert types to normal form syntax to check equality. The \textsf{to\_raw} functions ``forget'' that a piece of core syntax is well-formed, returning a piece of raw syntax, and can optionally remove all non-locally maximal arguments from terms. \end{itemize}
In the following subsections, we expand on these points, fully defining each class of syntax, and describing the typechecking and evaluation procedures.
\subsection{Syntax} \label{sec:nbe-syntax}
Before defining each of the syntactic classes in the tool, we introduce some common notation that will be used in the definitions below:
\begin{itemize} \item The letter \(v\) will be used to represent \emph{names} in the syntax: strings that represent valid identifiers. \item A \(\mathsf{Maybe}(x)\) is either of the form \(\mathsf{Some}(x)\) or \(\mathsf{None}\). \item The notation \(\mathsf{Tree}(x)\) represents a tree structure which is given by a list of \(x\)'s, which we call the \emph{elements}, and a list of trees, which we call the \emph{branches}, whose length is one less than that of the list of elements. These resemble labellings from \cref{sec:tree-contexts}, but will allow trees to be labelled with arbitrary objects. \end{itemize}
We begin our study of the syntax with the raw syntax, which is defined by the following grammar:
\begin{alignat*}{4} &(\text{Terms})&\quad&s,t &\ \ &{}::={}&\ \ &{v} \mid \mathsf{coh}[T:A] \mid \_ \mid \mathsf{id} \mid \mathsf{comp} \mid \mathsf{inc}_n^m(s) \mid s \sub \sigma \mid \Sigma(s)\\ &(\text{Types})&&A&&{}::={}&& \star \mid \arr s {\mathsf{Maybe}(A)} t \mid \_ \mid A \sub \sigma \mid \Sigma(A)\\ &(\text{Arguments})&&\sigma&&{}::={}&& (\mathsf{Tree}(\mathsf{Maybe}(s)), \mathsf{Maybe}(A)) \mid (\mathsf{Maybe}(A), s_0,\dots,s_n)\\ &(\text{Contexts})&&\Gamma&&{}::={}&& T \mid (v_0 : A_0), \dots, (v_n : A_n)\\ &(\text{Tree Contexts})&&T&&{}::={}&& \mathsf{Tree}(\mathsf{Maybe}(v)) \end{alignat*}
The primary purpose of the raw syntax is to accurately represent the written plaintext syntax. Each constructor is written in plaintext exactly as it appears above, apart from a few cases:
\begin{itemize} \item The application of substitution \(s \sub \sigma\) and \(A \sub \sigma\) is simply written \(s\ \sigma\) and \(A\ \sigma\) respectively. \item The constructor \(\mathsf{inc}_n^m\) is not parsed and is used as an internal operation for defining the external substitution (see \cref{sec:insertion}). It is displayed as \(\mathsf{inc}\). \item The suspension can be given by the characters \(\Sigma\) or \(S\), to avoid the user being forced to type Unicode characters.
\item The type \(\arr s {\mathsf{None}} t\) is written simply as \(s \to t\), and the type \(\arr s {\mathsf{Some}(A)} t\) is written as \(A \mid s \to t\), where the symbol \(\to\) can be replaced by \verb|->| in either case. \item For the construction \(\mathsf{Maybe}\), \(\mathsf{Some}(s)\) is printed the same as \(s\), and \(\mathsf{None}\) is printed as the empty string. \item We provide two ways to write trees: \begin{itemize} \item The curly bracket notation from \cref{sec:trees} can be used. The string: \[ s_0\{T_0\}s_1\cdots\{T_n\}s_{n+1}\] is parsed as a tree with elements given by (the parse of) \(s_0\) to \(s_{n+1}\) and branches given by the parse of \(T_0\) to \(T_n\). \item We provide a notation for specifying the locally maximal arguments of a tree. We parse the string: \[ [a_1,a_2,\dots,a_n]\] as a tree that has \(\mathsf{None}\) for each of its elements, and branches given by each of the \(a_i\), where if some \(a_i\) does not recursively parse as a tree, it is parsed as an element and wrapped in a singleton tree. \end{itemize} To compare these two notations, the two trees below are equal: \[ \{f\}\{\{a\}\{b\}\} = [f,[a,b]]\] When using the full (curly bracket) notation to specify a labelling, it must be wrapped in angle brackets to avoid parse ambiguity. \end{itemize}
We highlight the use of the extended substitution introduced in \cref{sec:extend-subst} in the raw syntax. This allows the tool to perform ``implicit suspension'', the automatic suspension of a term, by reducing it to a problem of type inference. These extended substitutions are converted to regular substitutions by the evaluation function introduced in \cref{sec:evaluation}, which applies the appropriate number of suspensions to the head term. An example of this is given in \cref{sec:examples}. We also provide a command language on top of the raw syntax for \Catt, which allows the user to perform various operations on terms, such as binding them to a top-level name, or normalising them. These commands are given by the following syntax:
\begin{alignat*}{3} &\mathsf{def}\ v = s &&{}\mathrel{\big\vert} \mathsf{def}\ v\ \Gamma = s &&{}\mathrel{\big\vert} \mathsf{def}\ v\ \Gamma : A = s \\ \mathrel{\big\vert}{}&\mathsf{normalise}\ s\ \mathsf{in}\ \Gamma &&{}\mathrel{\big\vert} \mathsf{assert}\ s = t\ \mathsf{in}\ \Gamma &&{}\mathrel{\big\vert} \mathsf{size}\ s\ \mathsf{in}\ \Gamma\\ \mathrel{\big\vert}{}&\mathsf{import}\ \mathtt{filename} \end{alignat*}
The first three commands define the name \(v\) to be given by the term \(s\), where the context \(\Gamma\) and type \(A\) can optionally be given, determining whether the term \(s\) will be inferred or checked. The next three commands take a context \(\Gamma\) and respectively calculate the normal form of \(s\) in \(\Gamma\), assert that \(s\) and \(t\) are equal in \(\Gamma\), or count the number of coherence constructors in \(s\). The last command parses the file \texttt{filename} and runs the commands it contains. In the implementation, each piece of syntax is paired with a piece of span information, which specifies where in the source file it originated. This is done by making the raw syntax generic over a type \(S\) of spans. When obtaining the raw syntax from parsing, this \(S\) is given by a range \(n \lh(T)\), we return the insertion redex \((S,P,T,\_,{\color{Diag2}L}, {\color{Diag2}M})\).
\end{itemize} If an insertion redex \((S,P,T,\_,{\color{Diag2}L}, {\color{Diag2}M})\) is found, then \(S\) is replaced by \(\insertion S P T\), \({\color{Diag2}L}\) is replaced by \(\insertion {\color{Diag2}L} P {\color{Diag2}M}\), and \(\color{Diag1}A\) is replaced by \(\color{Diag1}A \sub {\kappa_{S,P,T}}\). This step is then repeated until no insertion redexes are found. \begin{remark} At this critical step, the evaluation proceeds in a fashion closer to reduction than NbE, with insertions repeatedly applied by searching for redexes and applying reductions to the head term. This seems unavoidable; even if one could define a parallel insertion which inserted all insertable arguments at once, it is not clear how to deal with locally maximal arguments that are iterated identities. Despite this, we still claim that the overall structure of the evaluation follows an NbE style, especially regarding the treatment of suspension and application of substitutions and labellings. \end{remark} We next obtain the type \({\color{Diag2}B} = \eval_{\color{Diag2}\id_S}({\color{Diag1}A})\), and split into cases: \begin{itemize} \item If endo-coherence removal is enabled, and \(\color{Diag2}B\) is of the form \(\color{Diag2}(s,s) :: B'\), then we let \({\color{Diag1}\arr t C t} = \quote({\color{Diag2}B})\), interpret \(\color{Diag2}L\) as an environment by letting \({\color{Diag2}\ty(L)} = {\color{Diag2}\star}\) and let: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{id}_{\dim(B')}} \sub {\{\eval_{\color{Diag2}L}({\color{Diag1}C}), \eval_{\color{Diag2}L}({\color{Diag1}t})\}}\] where the labelling \(\{\_,\_\}\) from a disc can be trivially constructed by deconstructing the type. \item If endo-coherence removal is disabled, \(S\) is a disc \(D^n\), and \(\color{Diag2}B\) is of the form \(\color{Diag2}(\mathsf{var}_{p^n}, \mathsf{var}_{p^n}) :: B'\), where we recall that the path \(p^n\) is the unique locally maximal variable of \(D^n\), then we let: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{id}_n \sub {L}}\] \item If disc-removal is enabled, \(S = D^n\), and \(\color{Diag2}B\) is equal to the standard type of dimension \(n\), then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}L(p^n)}\] \item If none of the above cases hold, and \(\color{Diag2}B\) is equal to the standard type of dimension \(\dim(S)\), then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{comp}_S \sub L}\] \item If none of the above cases hold, then: \[ \eval_{\color{Diag2}\rho}({\color{Diag1}\mathsf{coh}[S : A]}) = {\color{Diag2}\mathsf{coh}[S : B] \sub L}\] \end{itemize} The \(\color{Diag1}\mathsf{comp}_T\) case is treated in much the same way, removing any step involving \(\color{Diag1}A\) and instead setting \({\color{Diag2}B} = \eval_{\color{Diag2}\id_T}({\color{Diag1}\stdty T n})\), where \(n\) is given by the dimension of \(T\) before any insertion was performed. This completes all cases for the evaluation function. In contrast, the \(\quote\) function is defined by a trivial recursion, converting head terms and normal form terms to core terms, normal form labellings to core labellings, and normal form types to an iterated arrow type in the obvious way. We note that this is unusual for NbE, where the \(\quote\) function is often mutually defined with evaluation, and performs a significant portion of the work of converting terms to normal form.
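To make the relationship between the two functions concrete, the following is a minimal Rust sketch of quotation over drastically simplified stand-in types; the names and constructors here are hypothetical and much coarser than the tool's actual syntax trees.
\begin{lstlisting}[language=Rust]
// Core syntax: well-formed terms, as produced by the typechecker.
enum Core {
    Var(usize),
    Coh(String),                   // stands in for coh[T : A]
    App(Box<Core>, Vec<Core>),     // a term with arguments applied
}

// Normal form syntax: a head term applied to a labelling.
enum Normal {
    Var(usize),
    Coh(String),
    App(Box<Normal>, Vec<Normal>),
}

// Quotation re-tags each constructor, performing no computation.
fn quote(n: &Normal) -> Core {
    match n {
        Normal::Var(i) => Core::Var(*i),
        Normal::Coh(name) => Core::Coh(name.clone()),
        Normal::App(head, args) => Core::App(
            Box::new(quote(head)),
            args.iter().map(quote).collect(),
        ),
    }
}

fn main() {
    let n = Normal::App(
        Box::new(Normal::Coh("assoc".to_string())),
        vec![Normal::Var(0), Normal::Var(1)],
    );
    let _core = quote(&n); // the trivial inclusion back into core syntax
}
\end{lstlisting}
Since quotation merely re-tags constructors, all of the computational content of normalisation lives in the evaluation function, which is the unusual division of labour noted above.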
\subsection{Typechecking} \label{sec:typechecking} Now that the three classes of syntax and the evaluation function have been introduced, the bidirectional typechecking algorithm in the tool can be described. Bidirectional typing allows us to mix typing rules which ``check'' a term, and typing rules which ``infer'' the type for a term. In the implementation, this will determine which pieces of data are inputs to a procedure, and which pieces of data are outputs. By \cref{lem:ty-unique}, all \Catt terms \(s\) have a unique type, which is given by the canonical type \(\ty(s)\). However, for certain terms, such as the coherence term \(\mathsf{coh}[T : A]\), we will be able to further infer the context that a term lives in, which in this case is the tree context \(T\). In this case the pair of the inferred context and type is known as a \emph{principal typing}~\cite{10.1145/237721.237728}, which is not to be confused with a \emph{principal type} of a term in a fixed context. Due to this unusual situation, where all types are inferable but the context in a judgement may or may not be, we refer to judgements where the context is an input as \emph{checking} judgements and judgements where the context is an output as \emph{inferring} judgements. \begin{remark} We justify this choice of terminology by noting the similarity of the judgements \(\Gamma \vdash s : A\) and \(\cdot \vdash \Pi_{\Gamma}\,s : \Gamma \to A\) in a type theory with (dependent) function types, where inferring the type of the second judgement would infer the context of the first. Of course, \Catt does not have function types, yet the intuition can still apply. \end{remark} The typing system will be defined with respect to a \emph{Signature} \(\Psi\), which contains a mapping from names to triples \(({\color{Diag1}\U}, {\color{Diag1}s}, {\color{Diag1}A})\) where \(\color{Diag1}s\) is a term of type \(\color{Diag1}A\) in (tree) context \(\color{Diag1}\U\). In the implementation, the signature also stores all relevant settings for the tool: which reductions are active, the operation set \(\mathcal{O}\) (which can only be configured to the groupoidal or regular operation sets), and whether implicit variables should be kept in the \(\mathsf{to\_raw}\) functions. We write: \[ \Psi(v) = ({\color{Diag1}\U}, {\color{Diag1}s}, {\color{Diag1}A})\] if the signature \(\Psi\) maps \(v\) to the triple above. We further define the notation \({\color{Diag1}\U}(i) = (v : {\color{Diag1}A})\) to mean that the \(i^{\text{th}}\) index of \(\color{Diag1}\U\) (with \(\color{Diag1}\U\) being a tree or a context) contains a variable name \(v\), which is given type \(\color{Diag1}A\) by \(\color{Diag1}\U\). Lastly, we define two conversion functions: \(\mathsf{from\_sub}\) and \(\mathsf{flatten}\). The first is a (partial) function which takes a tree \(T\) and a substitution \(\sigma\) and creates a labelling \(\mathsf{from\_sub}_T(\sigma)\) by letting the locally maximal arguments be given by the terms of \(\sigma\), if \(\sigma\) contains the correct number of terms. The function \(\mathsf{flatten}\) acts on the \(\mathsf{Maybe}\) construction applied to a term or type. It takes \(\mathsf{Some}(s)\) and \(\mathsf{Some}(A)\) to \(s\) and \(A\) respectively, and \(\mathsf{None}\) to \(\_\), the hole constructor for terms and types.
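As an illustration, the following Rust sketch implements \(\mathsf{flatten}\) and the arity check behind \(\mathsf{from\_sub}\) over simplified stand-in types; the real tool operates on its full tree and labelling structures, so all names here are hypothetical.
\begin{lstlisting}[language=Rust]
// Stand-in raw syntax: a hole or a named term.
#[derive(Clone, Debug, PartialEq)]
enum Raw {
    Hole,
    Term(String),
}

// A tree stores one more element than it has branches.
struct Tree {
    elements: Vec<Option<String>>,
    branches: Vec<Tree>,
}

// flatten: Some(s) becomes s, None becomes the hole constructor.
fn flatten(x: Option<Raw>) -> Raw {
    x.unwrap_or(Raw::Hole)
}

// Locally maximal positions of a tree: leaves contribute one each.
fn locally_maximal_count(tree: &Tree) -> usize {
    if tree.branches.is_empty() {
        1
    } else {
        tree.branches.iter().map(locally_maximal_count).sum()
    }
}

// from_sub is partial: it succeeds only when the substitution supplies
// exactly one term per locally maximal position of the tree.
fn from_sub(tree: &Tree, terms: &[Raw]) -> Option<Vec<Raw>> {
    if terms.len() == locally_maximal_count(tree) {
        Some(terms.to_vec())
    } else {
        None
    }
}

fn main() {
    let leaf = |v: &str| Tree {
        elements: vec![Some(v.to_string())],
        branches: vec![],
    };
    // The tree [f,g] has two singleton branches, hence two locally
    // maximal positions, matching a two-term substitution.
    let t = Tree {
        elements: vec![None, None, None],
        branches: vec![leaf("f"), leaf("g")],
    };
    assert!(from_sub(&t, &[Raw::Term("f".into()), Raw::Term("g".into())]).is_some());
    assert_eq!(flatten(None), Raw::Hole);
}
\end{lstlisting}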
Our bidirectional typing system will be based on the following judgements, letting \(\color{Diag1}\U\) refer to either a context or tree context: \begin{alignat*}{2} &s \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}t} : {\color{Diag1}A}&\qquad&\text{Convert \(s\) to \(\color{Diag1}t\) inferring its type \(\color{Diag1}A\) in inferred (tree) context \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A} &&\text{Given \(\color{Diag1}\U\), convert \(s\) to \(\color{Diag1}t\) checking it has some type \(\color{Diag1}A\) in \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash s = {\color{Diag2}t} \rightsquigarrow () &&\text{In \(\color{Diag1}\U\), check \(s\) has normal form \(\color{Diag2}t\)}\\ &{\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C} &&\text{In \(\color{Diag1}\U\), convert \(A\) to \(\color{Diag1}B\), inferring its normal form \(\color{Diag2}C\)}\\ &{\color{Diag1}\U} \vdash A = {\color{Diag2}C} \rightsquigarrow () &&\text{In \(\color{Diag1}\U\), check \(A\) has normal form \(\color{Diag2}C\)}\\ &\Gamma \vdash{} \rightsquigarrow {\color{Diag1}\U}&&\text{Check \(\Gamma\), producing (tree) context \(\color{Diag1}\U\)}\\ &{\color{Diag1}\U} \vdash \sigma : {\color{Diag1}\Gamma} \rightsquigarrow {\color{Diag1}\tau} &&\text{Check \(\sigma\) is a substitution from \(\color{Diag1}\Gamma\) to \(\color{Diag1}\U\), producing \(\color{Diag1}\tau\)}\\ &{\color{Diag1}\U} \vdash L : T \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}A}&&\text{Check labelling \(L\) in \(\color{Diag1}\U\), producing \(\color{Diag1}M\) with type \(\color{Diag2}A\)} \end{alignat*} For each judgement, the syntax to the left of \(\rightsquigarrow\) gives the inputs to the judgement, and the syntax to the right gives the outputs. \newcommand{\ruleone}{\alpha} \newcommand{\ruletwo}{\beta} \newcommand{\rulethree}{\gamma} \newcommand{\rulefour}{\delta} \newcommand{\rulefive}{\varepsilon} The typing rules for all judgements of this system are given in \cref{fig:bidirectional}. In this figure, \(D^n\) always refers to the linear tree of depth \(n\), rather than the disc context, \(\emptyset\) refers to the empty context, and \(\emp\) refers to the singleton tree. In the final rules, \(i\) should be treated as if it were universally quantified. We pause to highlight some of these rules: \begin{itemize} \item In the rule for coherences, marked \(\ruleone\), the support conditions are checked. This is done using the normal form syntax for the type, due to the simplicity of this syntax. The variable sets of a term can easily be collected by recursion, and in the implementation are stored in a hash set, using Rust's \textsf{HashSet} type. \item The rule for composites, marked \(\ruletwo\), is crucially a checking rule as there is no way to infer the tree \(T\) for the term \(\color{Diag1}\mathsf{comp}_T\). \item In the rule for the application of labellings, marked \(\rulethree\), the premise for the typing of the term is given by a checking judgement instead of an inferring judgement, as the tree \(T\) can be inferred from the labelling. This is in contrast to the corresponding rule for application of substitutions, where the context must be inferred from the inner term before the substitution can be checked. Combined with the point above, this allows a labelling applied to a \(\mathsf{comp}\) term to be checked.
\item The rule marked \(\rulefour\) allows a substitution to be applied to a term over a tree context, by converting the substitution to a labelling. This is mainly a convenience feature, as, given a term \(s\) whose context can be inferred to be a tree \(T\), it can be easier to give the locally maximal arguments for \(s\) as a list rather than describing the labelling. \item Lastly, we explain each component of the rule for the typing of a substitution, marked \(\rulefive\). We note that the first type in any \Catt context, which in the rule is given by the type \(\color{Diag1}A_0\), is always \(\star\). Therefore, the type of the first term in a substitution \(\sigma\) should be equal to \(\star \sub \sigma \equiv \ty(\sigma)\). In the rule, the type of the first term is given by \(\color{Diag1}B_0\), explaining its presence as the type of the substitution that gets evaluated to \(\color{Diag2}\rho\). We further note that \(\color{Diag2}\ty(\rho)\) is simply the evaluation of \(\color{Diag1}B_0\), which is why \(X\) is checked against it. Due to the choice to use de Bruijn levels instead of indices, weakening a term is the identity, and so \(s \sub \sigma \equiv s \sub {\langle \sigma,t \rangle}\) for any \(t\). Therefore, by inspecting the typing rules for substitutions in \Catt, it can be proven that to type \(\Gamma \vdash \sigma : \Delta\), it is sufficient to show that \(\Gamma \vdash x \sub \sigma : A \sub \sigma\) for all \((x : A) \in \Delta\). Observing the rule \(\rulefive\), this translates to proving that \(A_i \sub {(B_0,t_0,\dots,t_n)} = B_i\), recalling that \(B_0\) is the core syntax version of the type of the substitution. These equations can be shown by proving that the evaluation of each side is the same, but the evaluation of the left-hand side is given by \(\eval_{\color{Diag2}\rho}({\color{Diag1}A_i})\) for each \(i\), and so for efficiency we factor out the calculation of \(\color{Diag2}\rho\).
\end{itemize} \begin{figure}[p] \centering \begin{mathpar} \inferrule{\Psi(v) = ({\color{Diag1}\U}, {\color{Diag1}t}, {\color{Diag1}A})}{v \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}\mathsf{top\_lvl}(v,t)} : {\color{Diag1}A}}\and \inferrule{{\color{Diag1}T} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C} \\ (T, \src({\color{Diag2}C}), \tgt({\color{Diag2}C})) \in \mathcal{O}}{\mathsf{coh}[T : A] \rightsquigarrow {\color{Diag1}T} \vdash {\color{Diag1}\mathsf{coh}[T : B]} : {\color{Diag1}B}}\ \ruleone\and \inferrule{ }{\mathsf{id} \rightsquigarrow {\color{Diag1}D^1} \vdash {\color{Diag1}\mathsf{id}_0} : {\color{Diag1}\arr {\mathsf{var}_{[0]}} {\star} {\mathsf{var}_{[0]}}}}\and \inferrule{s \rightsquigarrow {\color{Diag1}\U} \vdash {\color{Diag1}t} : {\color{Diag1}A}}{\Sigma(s) \rightsquigarrow {\color{Diag1}\Sigma(\U)} \vdash {\color{Diag1}\Sigma(t)}: {\color{Diag1}\Sigma(A)}}\\ \inferrule{s \rightsquigarrow {\color{Diag1}T} \vdash {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}T} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}\and \inferrule{{\color{Diag1}\U}(i) = (v : {\color{Diag1}A})}{{\color{Diag1}\U} \vdash v \rightsquigarrow {\color{Diag1}\mathsf{var}_i} : {\color{Diag1}A}}\and \inferrule{ }{{\color{Diag1}D^n} \vdash \mathsf{id} \rightsquigarrow {\color{Diag1}\mathsf{id}_n} : {\color{Diag1}\stdty {D^n} {n + 1}}}\and \inferrule{ }{{\color{Diag1}T} \vdash \mathsf{comp} \rightsquigarrow {\color{Diag1}\mathsf{comp}_T} : {\color{Diag1}\stdty T n}}\ \ruletwo\and \inferrule{s \rightsquigarrow {\color{Diag1}\Gamma} \vdash {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash \sigma : {\color{Diag1}\Gamma} \rightsquigarrow {\color{Diag1}\tau}\\ {\color{Diag1}\U} \vdash \ty(\sigma) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub \sigma \rightsquigarrow {\color{Diag1}t \sub \tau} : {\color{Diag1}A \sub \tau}}\and \inferrule{{\color{Diag1}T} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash \mathsf{from\_sub}_T(\sigma) : {\color{Diag1}T} \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}B}\\ {\color{Diag1}\U} \vdash \ty(\sigma) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub \sigma \rightsquigarrow {\color{Diag1}t \sub M} : {\color{Diag1}A \sub M}}\ \rulethree\and \inferrule{{\color{Diag1}T} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash L : T \rightsquigarrow {\color{Diag1}M} : {\color{Diag2}B}\\ {\color{Diag1}\U} \vdash \ty(L) = {\color{Diag2}B} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash s \sub L \rightsquigarrow {\color{Diag1}t \sub M} : {\color{Diag1}A \sub M}}\ \rulefour\\ \inferrule{ }{{\color{Diag1}\U} \vdash \_ = {\color{Diag2}t} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}\U} \vdash s = {\eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}t})} \rightsquigarrow ()}\\ \inferrule{ }{{\color{Diag1}\U} \vdash \star \rightsquigarrow {\color{Diag1}\star} = {\color{Diag2}\emp}}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}s'} : {\color{Diag1}A} \\ {\color{Diag1}\U} \vdash t \rightsquigarrow {\color{Diag1}t'} : {\color{Diag1}B} \\ \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}A} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}B}}% {{\color{Diag1}\U} \vdash \arr s {} t \rightsquigarrow {\color{Diag1}\arr {s'} {A} {t'}} = {\color{Diag2}(\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}s'},
\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}t'}) :: \eval_{\id_{\color{Diag1}\U}}{\color{Diag1}A}}}\and \inferrule{{\color{Diag1}\U} \vdash s \rightsquigarrow {\color{Diag1}s'} : {\color{Diag1}B} \\ {\color{Diag1}\U} \vdash t \rightsquigarrow {\color{Diag1}t'} : {\color{Diag1}C} \\ {\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}A'} = {\color{Diag2}A''}\\ {\color{Diag2}A''} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}B} = \eval_{\color{Diag2}\id_{\color{Diag1}\U}}{\color{Diag1}C}}% {{\color{Diag1}\U} \vdash \arr s {A} t \rightsquigarrow {\color{Diag1}\arr {s'} {A'} {t'}} = {\color{Diag2}(\eval_{\id_{\color{Diag1}\U}}{\color{Diag1}s'}, \eval_{\id_{\color{Diag1}\U}}{\color{Diag1}t'}) :: A''}}\\ \inferrule{{\color{Diag1}\U} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C}}{{\color{Diag1}\U} \vdash A = {\color{Diag2}C} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s = {\color{Diag2}s'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash t = {\color{Diag2}t'} \rightsquigarrow () }{{\color{Diag1}\U} \vdash \arr s {} t = {\color{Diag2}\arr {s'} {A} {t'}} \rightsquigarrow ()}\and \inferrule{{\color{Diag1}\U} \vdash s = {\color{Diag2}s'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash t = {\color{Diag2}t'} \rightsquigarrow ()\\ {\color{Diag1}\U} \vdash A = {\color{Diag2}A'} \rightsquigarrow ()\\ }{{\color{Diag1}\U} \vdash \arr s {A} t = {\color{Diag2}\arr {s'} {A'} {t'}} \rightsquigarrow ()}\and \inferrule{ }{{\color{Diag1}\U} \vdash \_ = {\color{Diag2}C} \rightsquigarrow ()}\\ \inferrule{ }{T \vdash {} \rightsquigarrow {\color{Diag1}T}}\and \inferrule{ }{\emptyset \vdash {} \rightsquigarrow {\color{Diag1}\emptyset}}\and \inferrule{\Gamma \vdash {} \rightsquigarrow {\color{Diag1}\Delta} \\ {\color{Diag1}\Delta} \vdash A \rightsquigarrow {\color{Diag1}B} = {\color{Diag2}C}}{\Gamma, (v : A) \vdash {} \rightsquigarrow {\color{Diag1}\Delta, (v : B)}}\\ \inferrule{{\color{Diag1}\U} \vdash s_i \rightsquigarrow {\color{Diag1}t_i} : {\color{Diag1}B_i}\\ {\color{Diag2}\rho} := \eval_{\color{Diag2}\id_{\color{Diag1}\U}}% ({\color{Diag1}(B_0,t_0,\dots,t_n)})\\\\ \eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}B_i}) = \eval_{\color{Diag2}\rho}({\color{Diag1}A_i})\\ {\color{Diag1}\U} \vdash \mathsf{flatten}(X) = {\color{Diag2}\ty(\rho)} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash (X,s_0,\dots,s_n) : {\color{Diag1}(v_0:A_0),\dots,(v_n:A_n)} \rightsquigarrow {\color{Diag1}(B_0,t_0,\dots,t_n)} }\ \rulefive\\ \inferrule{{\color{Diag1}\U} \vdash \mathsf{flatten}(x) \rightsquigarrow {\color{Diag1}t} : {\color{Diag1}A}}{{\color{Diag1}\U} \vdash x : \emp \rightsquigarrow {\color{Diag1}t} : \eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}A})}\and \inferrule{{\color{Diag1}\U} \vdash L_i \rightsquigarrow {\color{Diag1}M_i} : {\color{Diag2}(s_i,s_{i+1})::A} \\ {\color{Diag1}\U} \vdash \mathsf{flatten}(x_i) = {\color{Diag2}s_i} \rightsquigarrow ()}{{\color{Diag1}\U} \vdash x_0\{L_0\}\cdots\{L_n\}x_{n+1} \rightsquigarrow {\color{Diag1}s_0\{M_0\}\cdots\{M_n\}s_{n+1}} : {\color{Diag2}A}} \end{mathpar} \caption{Bidirectional typing rules.} \label{fig:bidirectional} \end{figure} The typing rules in \cref{fig:bidirectional} can easily be translated into an algorithm for mechanically checking each of these typing judgements. Some equalities of normal forms are left implicit, such as in the final rule concerning the typing of a non-singleton labelling; these must be made explicit in the final algorithm.
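To indicate how these rules translate into code, the following is a heavily simplified, hypothetical Rust sketch of the mutually recursive pair of functions, showing only the variable rules; the real implementation covers every rule of \cref{fig:bidirectional}, and the types here are stand-ins.
\begin{lstlisting}[language=Rust]
use std::collections::HashMap;

#[derive(Clone)] struct Ty(String);   // stand-in for core types
#[derive(Clone)] struct Tm(String);   // stand-in for core terms
#[derive(Clone)] struct Ctx(Vec<(String, Ty)>);

struct Signature {
    top_level: HashMap<String, (Ctx, Tm, Ty)>,
}

enum Raw {
    Var(String),
    // Coh(..), Sub(..), ...: remaining raw constructors elided
}

// Inferring judgement: the (tree) context is an output.
fn infer(sig: &Signature, raw: &Raw) -> Result<(Ctx, Tm, Ty), String> {
    match raw {
        // Top-level names infer both their context and their type.
        Raw::Var(v) => sig
            .top_level
            .get(v)
            .cloned()
            .ok_or(format!("unknown name {v}")),
    }
}

// Checking judgement: the context is an input. Errors propagate with `?`.
fn check(sig: &Signature, ctx: &Ctx, raw: &Raw) -> Result<(Tm, Ty), String> {
    match raw {
        Raw::Var(v) => {
            // Local variables are looked up in the supplied context.
            if let Some(i) = ctx.0.iter().position(|(n, _)| n == v) {
                return Ok((Tm(format!("var_{i}")), ctx.0[i].1.clone()));
            }
            // Otherwise fall back to inference: the mutual dependency.
            let (_, tm, ty) = infer(sig, raw)?;
            Ok((tm, ty))
        }
    }
}

fn main() {
    let sig = Signature { top_level: HashMap::new() };
    let ctx = Ctx(vec![("x".to_string(), Ty("*".to_string()))]);
    assert!(check(&sig, &ctx, &Raw::Var("x".to_string())).is_ok());
}
\end{lstlisting}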
Many of the choices for the form of these rules were made to improve the quality of error messages. Each of these rules can fail for a variety of reasons, at which point an error is created by converting the relevant syntax back to raw syntax using the \(\mathsf{to\_raw}\) functions so that it can be displayed to the user. The use of Rust's \textsf{Result} type, which allows each of these functions to return either the well-formed core syntax or an appropriate error message, is essential, and benefits greatly from the question mark syntax in Rust, which allows errors to easily be propagated through the code. We end this section by describing the function of each of the commands introduced in \cref{sec:nbe-syntax}. Each of these commands is run with a mutable reference to a signature \(\Psi\). The commands use this signature for typechecking, and may modify it. The three \(\mathsf{def}\) commands are used to add a new binding to the signature \(\Psi\). For the first command, which omits the context, the term \(s\) must be inferred, producing a core syntax context, term, and type, which is inserted into the signature with key \(v\) and printed to the user. The second command is given a raw context and so first checks this raw context to produce a core (possibly tree) context \(\color{Diag1}\U\), before checking the term \(s\) in this context. Checking the term then produces a core syntax term and type, which are inserted into the signature along with the context \(\color{Diag1}\U\). The last \(\mathsf{def}\) command proceeds as before, checking the context to get a context \(\color{Diag1}\U\) and then checking the term in \(\color{Diag1}\U\), producing a core term \(\color{Diag1}t\) and type \(\color{Diag1}B\). The supplied type \(A\) is then checked against \(\eval_{\color{Diag2}\id_{\color{Diag1}\U}}({\color{Diag1}B})\). If this check succeeds, the key-value pair \((v, ({\color{Diag1}\U},{\color{Diag1}t},{\color{Diag1}B}))\) is added to the signature \(\Psi\), identically to the previous case. The \(\mathsf{normalise}\) command is used to print the normal form of a term \(s\). As with the final two \(\mathsf{def}\) cases, we begin by checking the context, and checking the term \(s\) in the resulting core context to get term \(\color{Diag1}t\) of type \(\color{Diag1}A\). Both \(\color{Diag1}t\) and \(\color{Diag1}A\) are then evaluated to normal form, quoted, and converted back to raw syntax, before being pretty-printed to the user. The \(\mathsf{size}\) command calculates a primitive estimate of the complexity of a term (which we note is not the same as the syntactic complexity given in \cref{sec:termination}) by counting the number of constructors in the normal form. To run this command, the term \(s\) is checked as before, and converted to a normal form term \(\color{Diag2}t\). The value \(\mathsf{size}({\color{Diag2}t})\) is then calculated by induction using the rules given in \cref{fig:size}, and this size is printed to the user. The \(\mathsf{assert}\) command checks both input terms \(s\) and \(t\), and evaluates the resulting core syntax terms to normal form to check that they are equal. None of the \(\mathsf{normalise}\), \(\mathsf{size}\), or \(\mathsf{assert}\) commands modify the signature \(\Psi\).
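The size computation itself is a short structural recursion. The following Rust sketch mirrors the rules of \cref{fig:size} over simplified stand-in types for normal form syntax; the type and constructor names are hypothetical.
\begin{lstlisting}[language=Rust]
// Simplified normal form syntax: a head term applied to a labelling,
// with a labelling flattened here to the list of terms it contains.
enum Nf {
    Var,
    Id,
    Comp,
    Coh(NfTy),              // coh[T : A] carries the normal form of A
    App(Box<Nf>, Vec<Nf>),
}

// A normal form type is a list of (source, target) pairs.
struct NfTy(Vec<(Nf, Nf)>);

fn size(t: &Nf) -> usize {
    match t {
        Nf::Var => 0,                   // variables contribute nothing
        Nf::Id | Nf::Comp => 1,
        Nf::Coh(a) => 1 + size_ty(a),
        Nf::App(h, l) => size(h) + l.iter().map(size).sum::<usize>(),
    }
}

fn size_ty(a: &NfTy) -> usize {
    a.0.iter().map(|(s, t)| size(s) + size(t)).sum()
}

fn main() {
    // size of comp applied to [id, var]: 1 + (1 + 0) = 2
    let t = Nf::App(Box::new(Nf::Comp), vec![Nf::Id, Nf::Var]);
    assert_eq!(size(&t), 2);
}
\end{lstlisting}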
\begin{figure}[ht] \centering \begin{mathpar} \mathsf{size}({\color{Diag2}\mathsf{coh}[T : A]}) = 1 + \mathsf{size}({\color{Diag2}A})\and \mathsf{size}({\color{Diag2}\mathsf{id}_n}) = \mathsf{size}({\color{Diag2}\mathsf{comp}_T}) = 1 \and \mathsf{size}({\color{Diag2}\mathsf{var}_p}) = 0 \and \mathsf{size}({\color{Diag2}H \sub L}) = \mathsf{size}({\color{Diag2}H}) + \mathsf{size}({\color{Diag2}L})\and \mathsf{size}({\color{Diag2}L}) = \sum_{p : \Path_T} \mathsf{size}({\color{Diag2}L(p)})\and \mathsf{size}({\color{Diag2}[(s_0,t_0), \dots, (s_n,t_n)]}) = \sum_{i= 0}^n \left(\mathsf{size}({\color{Diag2}s_i}) + \mathsf{size}({\color{Diag2}t_i})\right) \end{mathpar} \caption{Size of normal form syntax.} \label{fig:size} \end{figure} Finally, the \(\mathsf{import}\) command reads the contents of the supplied file, parses it as a list of commands, and runs each of these commands with the same signature. The tool has a command line interface, which allows files to be loaded at startup, as well as providing a REPL (read-eval-print loop) which parses one command at a time. \subsection{Examples} \label{sec:examples} We now demonstrate the use of the tool with some examples. All the examples below can be found in the \texttt{/examples} directory of the implementation code base~\cite{alex_rice_2024_10964705}. We begin by defining some standard operations that can be found in a monoidal category or bicategory, which can be found in the file \texttt{/examples/monoidal.catt}. We start by defining \(1\)-composition as a coherence: \begin{lstlisting}[language=Catt] def comp1coh [f,g] = coh [ x{}{}z : x -> z ] (f,g) \end{lstlisting} This example demonstrates the two ways of giving a tree context: in the \(\mathsf{def}\) command we give the context using the square bracket notation, which only labels the maximal elements, and in the coherence it is given by the full labelling, as we require access to the variables \(x\) and \(z\) (we note that all other variables of the context have been omitted). This example further demonstrates that a substitution can be applied to a term over a tree context, where we have only specified the locally maximal arguments. This composite can of course also be given using the \(\mathsf{comp}\) construction. \begin{lstlisting}[language=Catt] def comp1 [f,g] = comp assert comp1coh(f,g) = comp1(f,g) in [f,g] \end{lstlisting} The tree for \(\mathsf{comp}\) is inferred from the labelling \texttt{[f,g]}. The assert statement ensures that these two ways of giving the \(1\)-composition are equal in the theory. The assertion passes even with no reductions enabled, demonstrating the value of evaluation in the fully weak case. The horizontal and vertical composites of \(2\)-cells can be given similarly: \begin{lstlisting}[language=Catt] def horiz [[a],[b]] = comp def vert [[a,b]] = comp \end{lstlisting} As the vertical composite is the suspension of \(1\)-composition, it can also be given using implicit suspension: \begin{lstlisting}[language=Catt] def vertsusp [[a,b]] = comp1[a,b] assert vert(a,b) = vertsusp(a,b) in [[a,b]] \end{lstlisting} In this case, the labelling applied to \texttt{comp1} is a tree of depth \(1\) where the locally maximal arguments are given by \(2\)-dimensional terms. Type inference then deduces that the type component of this labelling should be \(1\)-dimensional, and hence evaluation causes the head term \texttt{comp1} to be suspended, making it equal to the composite \texttt{vert}, as demonstrated by the assertion.
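A hedged sketch of this implicit suspension step, under the same caveats as the previous sketches (hypothetical names, drastically simplified types), is the following:
\begin{lstlisting}[language=Rust]
// Stand-in term syntax with an explicit suspension constructor.
#[derive(Debug)]
enum Term {
    Head(String),
    Susp(Box<Term>),              // represents the suspension of a term
    App(Box<Term>, Vec<String>),  // head applied to a labelling
}

// Evaluation of an extended substitution (sketch): the head term is
// suspended once per dimension of the labelling's inferred type
// component before the labelling is applied.
fn apply_extended(head: Term, ty_dim: usize, label: Vec<String>) -> Term {
    let mut h = head;
    for _ in 0..ty_dim {
        h = Term::Susp(Box::new(h));
    }
    Term::App(Box::new(h), label)
}

fn main() {
    // vertsusp: comp1 applied to 2-cells a, b with a 1-dimensional type
    // component, so comp1 is suspended once, agreeing with vert.
    let t = apply_extended(Term::Head("comp1".into()), 1,
                           vec!["a".into(), "b".into()]);
    println!("{t:?}");
}
\end{lstlisting}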
The unitors and associator are then given by the following coherences, using the \(\mathsf{id}\) builtin for the unitors: \begin{lstlisting}[language=Catt] def unitor_l = coh [ x{f}y : comp1(id(x),f) -> f ] def unitor_r = coh [ x{f}y : comp1(f, id(y)) -> f ] def assoc = coh [ {f}{g}{h} : comp1(comp1(f,g),h) -> comp1(f,comp1(g,h)) ] \end{lstlisting} which allows definitions to be given for terms which witness the triangle and pentagon equations of monoidal categories: \begin{lstlisting}[language=Catt] def triangle = coh [ x{f}y{g}z : vert(assoc(f,id(y),g), horiz(id(f),unitor_l(g))) -> horiz(unitor_r(f),id(g)) ] def pentagon = coh [ v{f}w{g}x{h}y{i}z : vert(assoc(comp1(f,g),h,i),assoc(f,g,comp1(h,i))) -> comp [ horiz(assoc(f,g,h),id(i)), assoc(f,comp1(g,h),i), horiz(id(f),assoc(g,h,i)) ] ] \end{lstlisting} We note the direct use of the \(\mathsf{comp}\) constructor to easily provide a ternary composite without needing to give a new top-level definition. Using the \(\mathsf{normalise}\) command, it can be shown that the triangle reduces to the identity with \Cattsu normalisation enabled, and the pentagon reduces to the identity with \Cattsua normalisation enabled. In the files \texttt{/examples/eh.catt} and \texttt{/examples/eh-cyll.catt}, we give two \Catt proofs of the Eckmann-Hilton argument (see \cref{prop:eh}). In \Cattsu, these both normalise to the following vastly smaller term: \begin{lstlisting}[language=Catt] def swap = coh [ x{f{a}g}y{h{b}k}z : comp[comp [[a],h], comp[g,[b]]] -> comp[comp [f,[b]], comp[[a],k]] ] \end{lstlisting} The \(\mathsf{size}\) command demonstrates that the \Catt Eckmann-Hilton proof in \texttt{/examples/eh.catt} has size 1807 whereas its \Cattsu normalisation has a size of only 19. Due to the simplicity of Eckmann-Hilton in \Cattsu, we are able to give \Cattsu and \Cattsua proofs of the syllepsis (see \cref{sec:cattsu}) in \texttt{/examples/syllepsis-su.catt} and \texttt{/examples/syllepsis.catt} respectively. It can be verified that in \Cattsua, the \Cattsu proof of syllepsis, which has size 2745, reduces to the \Cattsua proof, which has size 1785. \subsection{Further work} \label{sec:further-work} We end the discussion of this implementation with some options for improving the tool. Each of these suggestions could make the tool easier to use and interact with, which in turn extends what can be achieved with it. Currently, the tool completely relies on the bidirectional typing rules to perform all of its type inference. While this is effective in some scenarios, for example labellings and implicit suspension, it is lacking in others, such as the lack of implicit arguments in substitutions. One could try to implement such features by adding metavariables and a unification procedure to the typechecker. Contrary to the situation for the fully weak \Catt, unification for \Cattsu and \Cattsua is non-trivial. Suppose we wished to unify the following two terms: \[ f *_0 g = h *_0 i\] where \(f\),\(g\),\(h\), and \(i\) may contain metavariables. In \Catt, this problem could be reduced to the unification problems \(f = h\) and \(g = i\). In \Cattsu however, this cannot be done, as a potential solution is \(f = h *_0 i\) and \(g = \id\). It is likely that any unification that can be implemented for \Cattsu (and \Cattsua) is quite limited, but an investigation into the limits of unification in these settings could be valuable. Even without a powerful unification algorithm, there are still instances where an argument could be inferred by the tool. 
One such example is the Eckmann-Hilton term presented in the previous section. This term is defined in the context: \[ (x : \star)\ (\alpha : \id(x) \to \id(x))\ (\beta : \id(x) \to \id(x)) \] Here, the \(x\) should be inferable as it is the \(0\)-source of \(\alpha\). The tool currently has no way to deduce this. Separately, improvements could be made to the treatment of unfolding of top-level definitions in the tool. Whenever a term is evaluated by the tool, any top-level definition is unfolded to its normal form. This is not always desirable, as it means that error messages frequently contain fully expanded terms, increasing the length and reducing the readability of terms, in addition to losing the information associated with the name given to the definition. Conversely, because evaluation performs this full unfolding, we often avoid evaluating terms before displaying them to the user, even when a (partial) evaluation would simplify the term. A notable example is that when giving a new definition, its type is not simplified before being displayed, often resulting in terms such as \verb|p0{x{f}y}|. A better approach would likely add top-level definitions to the normal form syntax as a head term, allowing their unfolding to be optional. One potential approach for efficient unfolding is given by \citeauthor{andrastalk}~\cite{andrastalk}. Finally, the accessibility of the tool could be improved with proper editor integration, for example by implementing the language server protocol (see \url{https://microsoft.github.io/language-server-protocol/}), which would allow errors to be displayed directly in the editor, and would enable further features such as code refactoring. \section{Models} \label{sec:models} Despite claiming that the type theories \Cattsu and \Cattsua model semistrict \(\infty\)-categories, we have yet to discuss their models. In this section we recall the definition of a model for these theories, and discuss some properties of these models. The definitions of \emph{globular category} and \emph{globular sum} were given in \cref{sec:background}. Any variant of \Cattr can be equipped with the structure of a globular category by choosing the disc objects to be the disc contexts and letting the source and target maps be given by the inclusions \(\lfloor \incbd {n} \epsilon {D^{n+1}} \rfloor\) for \(\epsilon \in \{-,+\}\). We then define the category of models of \Cattsu and \Cattsua. \begin{definition} Recall that for any tame variant of \Cattr, the category \(\mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}}\) is defined to be the restriction of the syntactic category \(\mathsf{Catt}_{\mathcal{R}}\) to the ps-contexts. We define the category of models to be the full subcategory of the presheaf category on \(\mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}}\) consisting of functors: \[F : \left( \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \right)^{\text{op}} \to \mathbf{Set}\] such that \(F^{\text{op}}\) preserves globular sums. \end{definition} Each element of the category of models has the structure of a weak \(\infty\)-category. For a model \(F : \left( \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \right)^{\text{op}} \to \mathbf{Set}\), the set of \(n\)-cells is given by \(F(D^n)\), with source and target maps given by the functions: \[ F(\lfloor \incbd {n-1} - {D^n} \rfloor), F(\lfloor \incbd {n-1} + {D^n} \rfloor) : F(D^n) \to F(D^{n-1})\] for which the globularity equations follow from the globularity of the inclusion maps. For each term over a ps-context in \Cattr, an operation on each of the models can be derived.
We consider the action of the \(1\)-composition term, given by \(\stdcoh {[\emp,\emp]} 1\). For the model \(F\), this induces an operation: \[ F(\{\lfloor \stdcoh {[\emp, \emp]} 1 \rfloor\}) : F(D^1\vee D^1) \to F(D^1)\] Due to the preservation of globular sums, we have \(F(D^1 \vee D^1) = F(D^1) \amalg_{F(D^0)} F(D^1)\), which is exactly the set of composable pairs of \(1\)-cells, which the function above sends to their composite. Similarly, the identity \(\id(d_0)\) induces a map \(F(D^0) \to F(D^1)\), giving the identity on each \(0\)-cell. These operations can be combined to get a compound operation of the following form: \[ \begin{tikzcd}[column sep = large] {F(D^1)} = {F(D^1) \amalg_{F(D^0)} F(D^0)} & {F(D^1) \amalg_{F(D^0)} F(D^1)} & {F(D^1)} \arrow["{\id \amalg F(\id(d_0))}", from=1-1, to=1-2] \arrow["{F(\{\stdcoh {[\emp,\emp]} 1\})}", from=1-2, to=1-3] \end{tikzcd} \] By the functoriality of \(F\) (and preservation of globular sums), this composite should be equal to: \[ F(\{\stdcoh {[\emp,\emp]} 1\} \bullet \langle d_1 , \id_{d_0^+} \rangle) : F(D^1) \to F(D^1)\] Therefore, if \(F\) is further a \Cattsu model, then this operation must equal \(F(\id) = \id\), enforcing the semistrict properties of \Cattsu onto the model. Throughout the thesis, contexts in \Cattr have been viewed as semistrict \(\infty\)-categories themselves. This viewpoint can be made precise by the Yoneda embedding, as for each context \(\Gamma\) of \Cattr, we obtain the presheaf: \[Y(\Gamma) : \mathsf{Catt}_{\mathcal{R}}^{\mathsf{op}} \to \mathbf{Set}\] which sends \(\Delta\) to \(\mathrm{Hom}(\Delta, \Gamma)\), the substitutions from \(\Delta\) to \(\Gamma\). This map preserves all limits, so in particular its opposite preserves the globular sums, meaning it can be restricted to a model of \Cattr. Furthermore, the \(n\)-cells are given by substitutions \(D^n \to \Gamma\), which are precisely the \(n\)-dimensional terms of \(\Gamma\) up to definitional equality. Since every \Catt term is also a \Cattr term, there is an evident functor: \[ K_{\mathcal{R}} : \mathsf{Catt} \to \mathsf{Catt}_{\mathcal{R}}\] which sends each context and substitution to its equivalence class in \Cattr. This functor can be restricted to the functor: \[ K_{\mathcal{R}}^{\mathsf{ps}} : \mathsf{Catt}^{\mathsf{ps}} \to \mathsf{Catt}_{\mathcal{R}}^{\mathsf{ps}} \] which is the identity on objects. We now prove that this functor preserves globular sums. By \cite[Lemma 64]{benjamin2021globular}, the functor \(\mathbf{FinGlob} \to \mathsf{Catt}\) from the category of finite globular sets preserves globular sums, and so it suffices to show that the functor \(\mathbf{FinGlob} \to \mathsf{Catt}_{\mathcal{R}}\) preserves globular sums. By \cite[Lemmas 25 and 29]{benjamin2021globular}, it suffices to show that this functor preserves the initial object and preserves pushouts along the inclusion maps \(S^n \to D^n\). The empty context is clearly the initial object, and this is preserved by the above functor. For the second property it suffices to show that: \[ \begin{tikzcd} {S^n} & \Gamma \\ {D^n} & {\Gamma, (x: A)} \arrow["\{\wk(U^n)\}"', from=1-1, to=2-1] \arrow["{\{A\}}", from=1-1, to=1-2] \arrow["{\{A,x\}}"', from=2-1, to=2-2] \arrow[from=1-2, to=2-2] \end{tikzcd}\] is a pushout for each \(\Gamma \vdash A\) in \Cattr.
Suppose there is a context \(\Delta\) with substitutions \(\sigma : \Gamma \to \Delta\) and \(\{B,t\} : D^n \to \Delta\) such that: \[\{B\} \equiv \{\wk(U^n)\} \bullet \{B, t\} = \{A\} \bullet \sigma \equiv \{A \sub \sigma\}\] Then the universal map is given by \(\langle \sigma, t \rangle\), with this map being well-formed as \(\Delta \vdash t : B\) and \(B = A \sub \sigma\). The uniqueness of this universal map is clear. Hence, the square above is cocartesian. From this we get the following proposition. \begin{proposition} The functors \(K_{\mathcal{R}}\) and \(K_{\mathcal{R}}^{\mathsf{ps}}\) preserve globular sums. \end{proposition} \begin{proof} As the maps \(\mathbf{FinGlob} \to \mathsf{Catt}\) and \(\mathbf{FinGlob} \to \mathsf{Catt}_{\mathcal{R}}\) preserve globular sums, the globular sums in both \(\mathsf{Catt}\) and \(\mathsf{Catt}_{\mathcal{R}}\) are given exactly by the ps-contexts. The two functors \(K_{\mathcal{R}}\) and \(K_{\mathcal{R}}^{\mathsf{ps}}\) are the identity on ps-contexts, and hence preserve globular sums. \end{proof} Due to this proposition, any model of \Cattr can also be seen as a model of \Catt, by precomposing with the functor \(K_{\mathcal{R}}^{\mathsf{ps}}\). This is to be expected, as intuitively every semistrict \(\infty\)-category should also be a weak \(\infty\)-category, where certain operations are given by identities. \subsection{Rehydration for pasting diagrams} \label{sec:rehydration} We have shown a way in which every model of \Cattr can be viewed as a model of \Catt. In this section we prove that this mapping from \Cattr models to \Catt models is injective. This implies that being semistrict is a \emph{property} of the model: a particular \Catt model can only arise from a unique \Cattr model, if such a \Cattr model exists. We prove this result by demonstrating a partial conservativity result for \Cattr, which we call \emph{rehydration for pasting contexts}. Rehydration refers to the process of taking a term in the semistrict theory and inserting the necessary coherence morphisms into the term such that it can be typed in \Catt. We discuss the difficulties involved with rehydrating an arbitrary term in \cref{sec:towards-gener-rehydr}, but for now we are only concerned with the simpler case of rehydrating a term \(t : \Term_\Gamma\) where \(\Gamma\) is a ps-context. We work towards the following theorem: \begin{theorem} \label{thm:rehydration} Let \(\mathcal{R}\) be a tame equality rule set that satisfies the support condition and has pruning, disc removal, and endo-coherence removal. Then for any ps-context \(\Delta\) and term \(t : \Term_\Delta\), there is a \Catt term \(s : \Term_\Delta\) such that \(\Delta \vdash s = t\) in \Cattr. \end{theorem} We begin with an example for \Cattsu. Take the pasting context given by the following diagram: \[ \Delta = \begin{tikzcd} w & x & y & z \arrow["f", from=1-1, to=1-2] \arrow["g", from=1-2, to=1-3] \arrow["h", from=1-3, to=1-4] \end{tikzcd} \] The associator \(\alpha\) is a \Cattsu normal form term over \(\Delta\), and we can further define the term: \[ \eta : \id((f*g)*h) \to \alpha_{f,g,h} * \alpha_{f,g,h}^{-1}\] as a single coherence over \(\Delta\). This term is also a \Cattsu normal form.
Finally the term: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCJcXGlkIiwyXSxbMSwyLCJcXGlkIiwyXSxbMCwxLCJcXGFscGhhICogXFxhbHBoYV57LTF9IiwwLHsiY3VydmUiOi00fV0sWzEsMiwiXFxhbHBoYSAqIFxcYWxwaGFeey0xfSIsMCx7ImN1cnZlIjotNH1dLFswLDIsIlxcYWxwaGEgKiBcXGFscGhhXnstMX0iLDIseyJjdXJ2ZSI6NX1dLFszLDUsIlxcZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs0LDYsIlxcZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDEsIlxcZXRhXnstMX0iLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "\id"', from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "\id"', from=1-3, to=1-5] \arrow[""{name=2, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}", curve={height=-24pt}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}", curve={height=-24pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "{\alpha * \alpha^{-1}}"', curve={height=30pt}, from=1-1, to=1-5] \arrow["\eta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=2] \arrow["\eta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=3] \arrow["{\eta^{-1}}"', shorten <=3pt, Rightarrow, from=4, to=1-3] \end{tikzcd} \] is a \Cattsu normal form term over a pasting context, which is not well-formed in \Catt. Such a term can be rehydrated by inserting the equivalence \(\id \cong \id * \id\) into the centre of the term. Performing a similar construction with the interchanger instead of the associator creates a \Cattsua normal form term over a pasting context which is not a \Catt term. We now proceed with the proof of \cref{thm:rehydration}. We introduce three operations, which are mutually defined on terms of \Cattr over pasting contexts. \begin{itemize} \item The \emph{rehydration} \(R(t)\) of a term \(t\) recursively rehydrates all subterms of \(t\), and then pads the resulting term. For any \Cattr term \(t\), the rehydration is a \Catt term over the same context. For any term \(t\), we call \(R(N(t))\) its \emph{rehydrated normal form}, where \(N\) is the function taking any term to its normal form. We similarly define the rehydration \(R(A)\) of a type \(A\) over a pasting context and \(R(\sigma)\) of a substitution \(\sigma\) whose domain and codomain are pasting contexts. \item The \emph{padding} \(P(t)\) of a \Catt term \(t\), which composes the term with coherences to ensure that its boundaries are in rehydrated normal form. \item The \emph{normaliser} \(\phi(t)\), a coherence term from \(t\) to its rehydrated normal form \(R(N(t))\) for any \Catt term \(t\). \end{itemize} We give formal definitions for each of these, mutually with proofs of the following statements, where we assume \(\Delta\) and \(\Gamma\) are pasting contexts: \begin{enumerate} \item Suppose \(\Delta \vdash_{\mathcal{R}} t : A\). Then \(\Delta \vdash R(t) : R(N(A))\). Similarly, if \(\Delta \vdash_{\mathcal{R}} A\) or \(\Delta \vdash_{\mathcal{R}} \sigma : \Gamma\), then \(\Delta \vdash R(A)\) and \(\Delta \vdash R(\sigma) : \Gamma\). \item For a \Cattr well-formed term \(t\), type \(A\), and substitution \(\sigma\), we have \(\Delta \vdash t = R(t)\), \(\Delta \vdash A = R(A)\), and \(\Delta \vdash \sigma = R(\sigma)\) in \Cattr. \item Suppose \(\Delta \vdash t : A\) for a \Catt term \(t\). Then \(P_k(t)\) is well-formed for \(k \leq \dim(t)\), and \(\Delta \vdash P(t) : R(N(A))\).
\item Suppose \(t\) is a well-formed \Catt term. Then for each \(k \leq \dim(t)\), \(P_k(t) = t\). \item If \(\Delta \vdash t : R(N(A))\) in \Catt, then \(\Delta \vdash \phi(t) : \arr t A {R(N(t))}\). \item Let \(t\) be a well-formed \Catt term over a pasting context. Then \(\phi(t) = \id(t)\). \end{enumerate} Each of these definitions and proofs is given by an induction on dimension and subterms, ensuring that they are well-founded. We begin with the definition of the rehydrated term, type, and substitution. \begin{definition} Let \(\Delta\) and \(\Gamma\) be pasting contexts. For a term \(t\) or type \(A\) over \(\Delta\), or a substitution \(\sigma : \Gamma \to \Delta\), we define the rehydrations: \[ R(t) : \Term_\Delta \qquad R(A) : \Type_\Delta \qquad R(\sigma) : \Gamma \to \Delta\] by mutual recursion. For a variable \(x\), we let \(R(x) = x\), and for a coherence term we define: \[ R(\Coh \Gamma A \sigma) = P(\Coh {\Gamma} {R(A)} {R(\sigma)})\] For types and substitutions, we recursively apply the rehydration to all subterms. \end{definition} To define the padding, we need the composites over certain trees \(T_k^n\) for \(k < n\) which are defined by: \[ T_0^n = [\emp, D^{n-1}, \emp] \qquad T_{k+1}^{n+1} = \Sigma(T_k^n)\] As an example, \(T_1^3\) produces the following context: % https://q.uiver.app/#q=WzAsMixbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzAsMSwiIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiIiwyLHsiY3VydmUiOjN9XSxbMCwxLCIiLDEseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDEseyJjdXJ2ZSI6NX1dLFs1LDMsIiIsMSx7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMiw0LCIiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzMsMiwiIiwyLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFszLDIsIiIsMix7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs4LDksIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-40pt}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=40pt}, from=1-1, to=1-3] \arrow[shorten <=2pt, shorten >=2pt, Rightarrow, from=3, to=1] \arrow[shorten <=2pt, shorten >=2pt, Rightarrow, from=0, to=2] \arrow[""{name=4, anchor=center, inner sep=0}, shift left=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[""{name=5, anchor=center, inner sep=0}, shift right=5, shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow[shorten <=4pt, shorten >=4pt, Rightarrow, scaling nfold=3, from=4, to=5] \end{tikzcd} \] The composite over this context allows us to ``fix'' the \(1\)-dimensional boundary of a \(3\)-dimensional term. \begin{definition} Let \(t\) be an \(n\)-dimensional term of a pasting diagram \(\Delta\). Define its padding \(P(t)\) to be equal to \(P_n(t)\) where: \[ P_0(t) = t \qquad P_{k+1}(t) = \stdcoh {T_{k}^{n}} n \sub {\langle \phi(\src_{k}(P_{k}(t)))^{-1}, P_{k}(t), \phi(\tgt_k(P_{k}(t))) \rangle}\] where \(\src_k\) and \(\tgt_k\) give the \(k\)-dimensional source and target of a term. \end{definition} Consider the term \(\alpha : \arr f {\arr x \star y} g\).
As an example, we build the following sequence of paddings: \begin{center} \begin{tabular}{P{3cm} P{9cm}} \(P_0(\alpha)\)&{ \begin{tikzcd}[ampersand replacement=\&] x \& y \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-1, to=1-2] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=1] \end{tikzcd}}\\ \(P_1(\alpha)\)&{ \begin{tikzcd}[ampersand replacement=\&] {R(N(x))} \& x \& y \& {R(N(y))} \arrow[""{name=0, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-2, to=1-3] \arrow["{\phi(x)}"', from=1-2, to=1-1] \arrow["{\phi(y)}", from=1-3, to=1-4] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=0, to=1] \end{tikzcd}}\\ \(P_2(\alpha)\)&{ \begin{tikzcd}[ampersand replacement=\&] {R(N(x))} \& x \& y \& {R(N(y))} \arrow[""{name=0, anchor=center, inner sep=0}, "f"{description}, curve={height=12pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "g"{description}, curve={height=-12pt}, from=1-2, to=1-3] \arrow["{\phi(x)}"', from=1-2, to=1-1] \arrow["{\phi(y)}", from=1-3, to=1-4] \arrow[""{name=2, anchor=center, inner sep=0}, "R(N(g))", curve={height=-60pt}, from=1-1, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "R(N(f))"', curve={height=60pt}, from=1-1, to=1-4] \arrow["\alpha"', shorten <=4pt, shorten >=3pt, Rightarrow, from=0, to=1] \arrow["{\phi(\phi(x)^{-1} * g * \phi(y))}"{description}, shorten <=5pt, shorten >=3pt, Rightarrow, from=1, to=2] \arrow["{\phi(\phi(x)^{-1}*f*\phi(y))}"{description}, shorten <=5pt, shorten >=3pt, Rightarrow, from=0, to=3] \end{tikzcd}} \end{tabular} \end{center} We lastly define the normaliser coherences. As these are each built from a coherence constructor with the rule for equivalences, they can all be inverted. \begin{definition} Let \(t\) be a term of a pasting diagram \(\Delta\). By \cref{cor:supp-ps}, \(\Supp(t)\) is a pasting diagram, and we let \(i_t\) be the inclusion \(\Supp(t) \to \Delta\). Then we define the normaliser \(\phi(t)\): \[ \phi(t) = \Coh {\Supp(t)} {t \to R(N(t))} {i_t}\] By assumption, \(R(N(t)) = N(t) = t\) and so \(\Supp(R(N(t))) = \Supp(t)\), making the above term well-formed. \end{definition} We now prove the required properties, starting with statement 1. The statements for types and substitutions follow by a simple induction using the case for terms, as if \(A = B\) then \(R(N(A)) = R(N(B))\) (as \(N(A) = N(B)\)). The case for a variable is also trivial, so assume that: \[\Delta \vdash_{\mathcal{R}} \Coh \Gamma B \sigma : A\] Then it follows from induction on subterms that \(\Gamma \vdash R(B)\) and \(\Delta \vdash R(\sigma) : \Gamma\), and so: \[ \Delta \vdash \Coh \Gamma {R(B)} {R(\sigma)} : R(B) \sub {R(\sigma)}\] Then by induction on statement (3), we get: \[ \Delta \vdash P(\Coh \Gamma {R(B)} {R(\sigma)}) : R(N(R(B) \sub {R(\sigma)})) \] By induction on statement (2), we have \(R(B) \sub {R(\sigma)} = B \sub \sigma\). By inspection of the original typing derivation, we have \(B \sub \sigma = A\), and so \(R(N(R(B) \sub {R(\sigma)})) \equiv R(N(A))\), as required. Now consider statement 2. The cases for types and substitutions follow by an easy induction from the result for terms.
Since the case for variables is trivial, we restrict to the cases for the coherence terms, where we must prove that: \[ \Gamma \vdash_{\mathcal{R}} \Coh \Delta A \sigma = P(\Coh \Delta {R(A)} {R(\sigma)})\] By (1), \(\Coh \Delta {R(A)} {R(\sigma)}\) is a well-formed \Catt term, and so by (4) and induction on subterms we have: \[ P(\Coh \Delta {R(A)} {R(\sigma)}) = \Coh \Delta {R(A)} {R(\sigma)} = \Coh \Delta A \sigma\] For statement 3, we let \(\Delta \vdash t : A\) and prove for each \(k\) that \(P_k(t)\) is well-formed and that \(\src_m(P_k(t)) \equiv R(N(\src_m(t)))\) and \(\tgt_m(P_k(t)) \equiv R(N(\tgt_m(t)))\) for \(m \leq k\). We proceed by induction on \(k\). The case for \(k = 0\) is trivial, so we must prove that \(P_{k+1}(t)\) is well-formed, which is the term: \[\stdcoh {T_{k}^{n}} n \sub {\langle \phi(\src_{k}(P_{k}(t)))^{-1}, P_{k}(t), \phi(\tgt_k(P_{k}(t))) \rangle} \] By (5), noting that the inductive hypothesis on \(k\) implies that the types of \(\src_k(P_k(t))\) and \(\tgt_k(P_k(t))\) are in rehydrated normal form, we have that the normalisers are well-typed. Therefore, \(P_{k+1}(t)\) is well-formed by the previous fact and the inductive hypothesis on \(k\). By simple calculation it follows that: \[\src_m(P_k(t)) \equiv \src_m(P_m(t)) \equiv \src(\phi(\src_m(t))^{-1}) \equiv R(N(\src_m(t)))\] with a similar equation holding for the target. It then follows that \(\Delta \vdash P(t) : R(N(A))\). Statement 4 holds by a simple induction on \(k\), using statement (6) to reduce each normaliser to an identity, and then using pruning and disc removal to get the equality: \[ \stdcoh {T_{k}^{n}} n \sub {\langle \id(\src_{k}(P_{k}(t))), P_{k}(t), \id(\tgt_k(P_{k}(t))) \rangle} = P_k(t) \] which along with the inductive hypothesis on \(k\) is sufficient. For statement 5, we assume \(\Delta \vdash t : R(N(A))\). Then, by (1) and the preservation rule, we have \(\Delta \vdash R(N(t)) : R(N(R(N(A)))) \equiv R(N(A))\), where the equality follows from (2) and the idempotency of the normal form functor. The typing for the normaliser then trivially follows, as \(t\) and \(R(N(t))\) are full in \(\Supp(t)\). For statement 6, we apply statement (1) to get that \(t = N(t) = R(N(t))\). Therefore: \begin{align*} \phi(t) &\equiv \Coh {\Supp(t)} {t \to R(N(t))} {i_t}\\ &= \Coh {\Supp(t)} {t \to t} {i_t}\\ &= \id(t) \sub {i_t}&\text{by endo-coherence removal}\\ &\equiv \id(t) \end{align*} This completes all parts of the definitions and proofs. Then for any well-formed \Cattr term \(t\), \(R(N(t))\) is a well-formed \Catt term with \(R(N(t)) = t\) in \Cattr, completing the proof of \cref{thm:rehydration}. Moreover, if \(t = t'\) then \(R(N(t)) \equiv R(N(t'))\), and so the rehydration of \Cattr terms over pasting contexts can be chosen to respect \Cattr equality. From this we get the following corollary. \begin{corollary} Semistrictness is a property. Let \(\mathcal{R}\) be a tame equality rule set satisfying the support and preservation conditions in addition to having pruning, disc removal, and endo-coherence removal. If \(F\) and \(G\) are \Cattr models such that: \[F \circ K_{\mathcal{R}}^{\mathsf{ps}} = G \circ K_{\mathcal{R}}^{\mathsf{ps}}\] then \(F = G\). \end{corollary} \begin{proof} Since \(K_{\mathcal{R}}^{\mathsf{ps}}\) is the identity on objects, it follows that \(F\) and \(G\) must be equal on objects. Now let \(\Gamma\) and \(\Delta\) be pasting diagrams, and let \(\Gamma \vdash_{\mathcal{R}} \sigma : \Delta\).
Then by \cref{thm:rehydration} we have \(\Gamma \vdash R(\sigma) : \Delta\), and so: \[ F(K_{\mathcal{R}}^{\mathsf{ps}}(R(\sigma))) = G(K_{\mathcal{R}}^{\mathsf{ps}}(R(\sigma))) \] but \(K_{\mathcal{R}}^{\mathsf{ps}}\) is simply an inclusion, so \(F(R(\sigma)) = G(R(\sigma))\) and since \(R(\sigma) = \sigma\) in \Cattr, we have \(F(\sigma) = G(\sigma)\). The substitution \(\sigma\) was arbitrary, so \(F = G\) as required. \end{proof} The above result holds in particular for the equality rule sets \su and \sua, meaning that a model of \Catt can be a model of \Cattsu or \Cattsua in at most one way. \subsection{Towards generalised rehydration} \label{sec:towards-gener-rehydr} The rehydration result of the previous section can be viewed as a partial conservativity result, stating that in a pasting context, \Cattsu and \Cattsua have the same expressive power as \Catt. The original motivation of semistrictness was to strictify parts of the theory without losing the expressiveness of the fully weak setting. We would therefore hope that the rehydration results of \cref{sec:rehydration} extend to arbitrary contexts. Such a result would be a powerful tool for constructing terms in a weak setting; a term could be constructed in the semistrict setting, before applying rehydration to the result to obtain a term in the fully weak setting. Such a technique would allow a \Catt proof of Eckmann-Hilton to be constructed mechanically from the vastly simpler \Cattsu Eckmann-Hilton proof, or even give a proof of the syllepsis in \Catt, for which no proof has been given at the time of writing. By observing the proof of \cref{thm:rehydration}, we see that the main part that would need replacing for a general rehydration result is the construction of the normalisers, as we can no longer rely on the source and target term of our normaliser living over a pasting diagram that allows the construction of a single coherence. A natural way to proceed is to attempt to build a normaliser \(\phi(t) : t \to R(N(t))\) by recursion on the reduction sequence \(t \red^* N(t)\). We consider a context with \(x : \star\) and a scalar \(\alpha : \id(x) \to \id(x)\), and consider the reduction by pruning: \[ \alpha *_0 \id(x) \red (\alpha)\] where \((\alpha)\) is the unary composite on \(\alpha\). We immediately encounter two problems: \begin{itemize} \item For each individual reduction, the source and target of the reduction may not have the same type. In the example above, the source has type \(\id(x) * \id(x) \to \id(x) * \id(x)\), but the target has type \(\id(x) \to \id(x)\). A normaliser between these two terms can therefore not be directly constructed. \item If the source term is padded such that it has the same type as the target term, we can run into a separate problem. Consider the reduction given above again.
The following normaliser can be formed: \[ \Coh {D^2} {\rho_{d_1^-}^{-1} *_1 (d_2 *_0 \id(d_0^+)) *_1 \rho_{d_1^+} \to (d_2)} {\langle \{\alpha\} \rangle}\] which has source given by the padded term: \[ \begin{tikzcd} x & x & x \arrow[""{name=0, anchor=center, inner sep=0}, "{\id(x)}", curve={height=-12pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "{\id(x)}"', curve={height=12pt}, from=1-1, to=1-2] \arrow["{\id(x)}"', from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "{\id(x)}", controls=+(90:1.5) and +(90:1.5), from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "{\id(x)}"', controls=+(270:1.5) and +(270:1.5), from=1-1, to=1-3] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=0] \arrow["{\rho_{\id(x)}}", shorten >=3pt, Rightarrow, from=1-2, to=3] \arrow["{\rho_{\id(x)}}"', shorten >=3pt, Rightarrow, from=1-2, to=2] \end{tikzcd} \] However, this term is padded by the right unitor on each side, which is not the canonical normaliser from \(\id(x) * \id(x)\) to \(\id(x)\), the unbiased unitor. \end{itemize} The reduction above was not only chosen to demonstrate both of these problems; it is also the problematic reduction that is encountered if one tries to rehydrate the Eckmann-Hilton term from \Cattsu. To give a proof of Eckmann-Hilton, one reaches a critical point where a left unitor and right unitor on the identity must be cancelled out, highlighting the second of the two problems. To solve the second problem, one could attempt to prove that for any two reduction paths from \(t\) to \(N(t)\), there is a higher cell between the normalisers generated from each reduction path, critically relying on the confluence proof for the theory to modularise the problem into finding fillers for each confluence diamond. Such an approach seems infeasible for the following reasons: to find fillers for a confluence diamond, we presumably must already know the form of all rehydrations in the dimension below, which themselves could depend on filling confluence diamonds of the dimension below. This seems to necessitate rehydrating on a dimension-by-dimension basis, making the full rehydration problem infeasible. It is also likely that at some point it would be necessary to show that two different fillers of a confluence diamond have a higher cell between them, leading to some form of \(\infty\)-groupoid flavoured confluence problem. Such a problem also seems infeasible with the tools currently available to us. An alternative approach could be to show that the ``space'' of all rehydrations is contractible. This can be made precise in the following way. Let \(t\) be a \Cattr term. Then consider the globular set whose \(0\)-cells are \Catt terms \(s\) which are equal to \(t\) in \Cattr, whose \(1\)-cells are \Catt terms \(f : s \to s'\) which are equal to \(\id(t)\) in \Cattr, and whose \(n\)-cells in general are \Catt terms that are equal to \(\id^n(t)\). The contractibility of such a globular set is exactly the property needed for rehydration, as it gives the existence of a \(0\)-cell \(s\) which gives the rehydration, and witnesses the essential uniqueness of this rehydration. Such a contractibility proof can be given when the term \(t\) is a term of a pasting diagram, as any higher cells can be given by a simple coherence. This allows us to fix the padding in the example above, observing that the right unitor is equivalent to the unbiased unitor.
It is, however, unclear how such a contractibility proof could be extended to arbitrary contexts. We now turn our attention to the first problem presented above. One method for tackling this problem is to give normalisers as \emph{cylindrical equivalences} instead of regular equivalences. A cylindrical equivalence can be viewed as the canonical notion of equivalence between two objects of different types. We introduce the first few dimensions of cylinder terms. A \(0\)-cylinder is simply a \(1\)-dimensional term. A \(1\)-cylinder from a cylinder \(f : w \to x\) to a cylinder \(g : y \to z\) can be defined by the square: \[ \begin{tikzcd} x & z \\ w & y \arrow["f", from=2-1, to=1-1] \arrow["g"', from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["a'", from=1-1, to=1-2] \arrow[Rightarrow, from=2-2, to=1-1] \end{tikzcd} \] where the central arrow has type \(a * g \to f * a'\). If such a cylinder were invertible, which is the case when \(a\), \(a'\), and the two-dimensional cell are invertible, then it would be a cylindrical equivalence and would witness the equivalence of \(f\) and \(g\). Suppose we have two \(1\)-cylinders \(\alpha : f \to g\) and \(\beta : g \to h\) as below: \[ \begin{tikzcd} x & z & v \\ w & y & u \arrow["f", from=2-1, to=1-1] \arrow["g"{description}, from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["{a'}", from=1-1, to=1-2] \arrow[Rightarrow,from=2-2, to=1-1] \arrow["h"', from=2-3, to=1-3] \arrow["b"', from=2-2, to=2-3] \arrow["{b'}", from=1-2, to=1-3] \arrow[Rightarrow,from=2-3, to=1-2] \end{tikzcd} \] Then a composite cylinder \(f \to h\) could be formed by letting the front ``face'' be given by \(a * b\), the back ``face'' be given by \(a' * b'\), and the filler by a combination of associators and whiskerings of the two fillers in the diagram. A \(2\)-cylinder could be given by the following diagram: % https://q.uiver.app/#q=WzAsNCxbMCwxLCJcXGJ1bGxldCJdLFsyLDEsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFswLDEsIiIsMCx7ImN1cnZlIjotNX1dLFswLDEsIiIsMix7ImN1cnZlIjo1fV0sWzIsMywiIiwyLHsiY3VydmUiOi01fV0sWzIsMywiIiwwLHsiY3VydmUiOjV9XSxbMCwyXSxbMSwzXSxbNCw2LCIiLDAseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNywiIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} && \bullet && \bullet \\ \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-40pt}, from=2-1, to=2-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=40pt}, from=2-1, to=2-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-40pt}, from=1-3, to=1-5] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=40pt}, from=1-3, to=1-5] \arrow[from=2-1, to=1-3] \arrow[from=2-3, to=1-5] \arrow[shorten <=13pt, shorten >=13pt, Rightarrow, from=0, to=2] \arrow[shorten <=13pt, shorten >=13pt, Rightarrow, from=1, to=3] \arrow[shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[shorten <=8pt, shorten >=8pt, Rightarrow, from=3, to=2] \end{tikzcd}\] where the top and bottom faces of this diagram are \(1\)-cylinders, and the whole diagram should be filled by a \(3\)-dimensional term with appropriate source and target. The shape of this diagram gives this construction its name. When using cylinders to represent the normalisers in a rehydration process, the inductive step for coherences would require a cylinder to be generated from a cylindrical version of the substitution attached to the coherence.
We have seen that this can be done when the coherence is given by \(1\)-composition, but achieving full rehydration would involve giving cylindrical versions of every operation in \Catt. No such proof has been given for any variety of globular weak \(\infty\)-categories. We offer an alternative solution, which we call \emph{rehydration by dimension}, that avoids defining cylinder composition. From an equality rule set \(\mathcal{R}\), we can form the rule sets \(\mathcal{R}_n\), each consisting of the rules \((\Gamma,s,t) \in \mathcal{R}\) such that \(\dim(s) = \dim(t) \leq n\). Rehydration by dimension attempts to rehydrate an \(n\)-dimensional term \(t\) by constructing terms \(t_n,\dots,t_0\) such that \(t_i\) is a term which is well-formed in \(\Catt_{\mathcal{R}_i}\), creating a rehydration sequence: \[ \Catt_{\mathcal{R}_n} \to \Catt_{\mathcal{R}_{n-1}} \to \cdots \to \Catt_{\mathcal{R}_1} \to \Catt_{\mathcal{R}_0}\] The term \(t_n\) is given immediately by \(t\), and \(t_0\) is then a term of \(\Catt_{\mathcal{R}_0} = \Catt\), giving the rehydration of \(t\). The key insight of this method is that when generating the normaliser for a particular \(k\)-dimensional generating rule \(s \red t\), we know by the preservation property that the types of \(s\) and \(t\) are equal, and so they are already equal in \(\Catt_{\mathcal{R}_{k-1}}\). By factoring through these partial rehydrations, the normaliser of a dimension \(k\) generating rule only has to be valid in \(\Catt_{\mathcal{R}_{k-1}}\), meaning that the normalisers can again be given by regular equivalences. Unfortunately, this method does not avoid the need to define new classes of operations in \Catt, as we could be required to prove that arbitrary \Catt operations are natural in their lower-dimensional arguments. Consider terms \(f : x \to y\) and \(g : y \to z\) and suppose the \(\Catt_{\mathcal{R}_1}\) normal form of \(y\) is \(y'\) with normaliser \(\phi(y)\). Then, during a rehydration proof to \(\Catt_{\mathcal{R}_0}\), it may be required to give a normaliser from \(f * g\) to \((f * \phi(y)) * (\phi(y)^{-1} * g)\), effectively requiring us to prove that \(1\)-composition is natural in its central \(0\)-cell. Similarly to the case with cylinders, such a normaliser can easily be given for \(1\)-composition, but we possess no way of creating such naturality arguments for arbitrary coherences. The proofs of Eckmann-Hilton given in \cref{sec:examples} illustrate the result of each of these methods, with the proof in \texttt{/examples/eh.catt} proceeding by ``rehydration by dimension'', and the proof in \texttt{/examples/eh-cyll.catt} using cylinders. In both proofs, the only example of the second problem we encounter is proving that the left and right unitors on the identity are equivalent to the unbiased unitor. For the cylinder proof, the composition of \(1\)-cylinders is used and is given by the term \texttt{cyl\_comp}, which is then implicitly suspended by the tool. The rehydration by dimension proof needs a naturality move like the one described above, which is given by the term \texttt{compat\_move}. \section{Future ideas} \label{sec:future-work} In this final section, we collect some ideas for the continuation of this work, including ideas for different semistrict theories based on \Cattr, and modifications to the existing theories.
Some ideas for future avenues of research have already been discussed, such as the potential improvements to the implementation described in \cref{sec:further-work} and the discussion of full rehydration in \cref{sec:towards-gener-rehydr}; we will not repeat these here. \paragraph{Further results for \Cattsua} The metatheory of \Cattsua is more complicated than the corresponding metatheory of \Cattsu, though at first glance the relative increase in power does not match this complexity. The jump from \Catt to \Cattsu vastly simplified the proof of Eckmann-Hilton, allowed the syllepsis to be proven, and led to results such as disc trivialisation. In contrast, \Cattsua provides no further simplification to Eckmann-Hilton and only slightly simplifies the syllepsis, removing some associators from the proof. One potential utility of \Cattsua could be simplifying the composites of cylinders, as briefly introduced in \cref{sec:towards-gener-rehydr}. Consider the following diagram from that section, which contains two composable \(1\)-cylinders. \[ \begin{tikzcd} x & z & v \\ w & y & u \arrow["f", from=2-1, to=1-1] \arrow["g"{description}, from=2-2, to=1-2] \arrow["a"', from=2-1, to=2-2] \arrow["{a'}", from=1-1, to=1-2] \arrow["X", Rightarrow,from=2-2, to=1-1] \arrow["h"', from=2-3, to=1-3] \arrow["b"', from=2-2, to=2-3] \arrow["{b'}", from=1-2, to=1-3] \arrow["Y", Rightarrow,from=2-3, to=1-2] \end{tikzcd} \] In \Catt, the \(1\)-composite of these cylinders is a term \((a*b)*h \to f*(a'*b')\) given by: \[ \alpha_{a,b,h} *_1 (a *_0 Y) *_1 \alpha_{a,g,b'}^{-1} *_1 (X *_0 b') *_1 \alpha_{f,a',b'}\] where each \(\alpha\) term is an associator. This would of course simplify in \Cattsua to \((a *_0 Y) *_1 (X *_0 b')\). Such a simplification could make it easier to define higher cylinder coherences, such as an associator for \(1\)-cylinders, which would be trivial in \Cattsua, but far more involved in \Catt. Further future work could involve the search for an analogue of disc trivialisation for \Cattsua. We would expect there to be a more general class of contexts that are trivialised by \Cattsua but not by \Cattsu. The cylinder contexts presented above could form a starting point for such a study. A separate avenue for further study is to explore the links between \Cattsua and more graphical presentations of semistrict \(\infty\)-categories. String diagrams are a common graphical method for working with monoidal categories and bicategories~\cite{selinger2011survey}, and their higher-dimensional counterparts, such as those implemented in the tool \textsf{homotopy.io}, can be viewed as strictly associative and unital finitely presented \(\infty\)-categories, much like contexts of \Cattsua. Translation results in either direction between these two settings, while highly non-trivial due to the contrast in the way each system approaches composition, would be valuable. \paragraph{Generalised insertion} The conditions given for insertion in \cref{sec:insertion} were not the most general conditions possible.
In that section, we stated that to perform insertion we required an insertion redex \((S,P,T,\U,L,M)\), and one of the conditions of this insertion redex was that: \[ L(\olsi P) \equiv \stdcoh T {\lh(P)} \sub M\] It turns out that it is sufficient to give the weaker condition that the locally maximal argument is a coherence whose contained type is sufficiently suspended: \[ L(\olsi P) \equiv \Coh T {\Sigma^{\bh(P)}(A)} M\] As \(\stdcoh {\Sigma(T)} {n+1} \equiv \Sigma (\stdcoh T n)\), and the original condition required that \(\th(T) \geq \bh(P)\), this alternative condition is a strict generalisation of the previous condition. Under the new condition, the exterior labelling must be modified. Firstly, it must take the type \(A\) as an argument. The case for \(P = [k]\) is then modified such that \(\kappa_{S,[k],T,A}\) (noting the extra type subscript) is given by: \[ \begin{tikzcd}[column sep=smaller,row sep = 20pt] {[S_0,\dots,S_{k-1}]} & \doubleplus & {T} & \doubleplus & {[S_{k+1},\dots,S_n]} \\ \\ {[S_0,\dots,S_{k-1}]} & \vee & {\Sigma S_k} & \vee & {[S_{k+1},\dots,S_n]} \arrow["{\{A, \Coh T A {\id_T}\}}"{description, font = \normalsize}, from=3-3, to=1-3] \arrow["\id"{font = \normalsize}, from=3-1, to=1-1] \arrow["\id"{font = \normalsize}, from=3-5, to=1-5] \end{tikzcd} \] when \(S = [S_0,\dots,S_n]\). The inductive step of the exterior labelling then relies on the type \(A\) being sufficiently suspended to proceed, just as the original version depended on the trunk height of \(T\) being sufficient (we note that the trunk height condition is still needed in this generalisation). For the necessary typing judgements to be satisfied, we must have \(\src_0(A) \equiv \fst(\lfloor T \rfloor)\) and \(\tgt_0(A) \equiv \snd(\lfloor T \rfloor)\), but no other extra condition is necessary. In some ways, this definition of insertion is more natural than the definition given earlier. We no longer rely on the syntactic condition of the locally maximal argument being a standard coherence, relying only on the far weaker suspendability property. In the confluence proof for \Cattsua, a large focus was on cases where a reduction modified a standard coherence into a term which was no longer a standard coherence. Cases like these do not happen with generalised insertion, as reductions do not break the suspendability property. More generally, a confluence proof for generalised insertion does not require any proof about the interaction of insertion with boundary inclusion maps and standard coherences (given in \cref{sec:further-properties} for the original definition). Unfortunately, this generalised form of insertion cannot be directly used in \Cattsua without breaking confluence.
Let \(\Gamma\) be the context given by the following diagram: % https://q.uiver.app/#q=WzAsNCxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbNCwwLCJcXGJ1bGxldCJdLFswLDEsIiIsMCx7ImN1cnZlIjotNX1dLFswLDEsIiIsMix7ImN1cnZlIjo1fV0sWzEsMiwiZyIsMCx7ImN1cnZlIjotMn1dLFsxLDIsImYiLDIseyJjdXJ2ZSI6Mn1dLFsyLDMsImkiLDAseyJjdXJ2ZSI6LTJ9XSxbMiwzLCJoIiwyLHsiY3VydmUiOjJ9XSxbNSw0LCIiLDIseyJvZmZzZXQiOi01LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzUsNCwiIiwwLHsib2Zmc2V0Ijo1LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsNiwiXFxhbHBoYSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOSw4LCJcXGJldGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzEwLDExLCJcXHBoaSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g", curve={height=-12pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, "f"', curve={height=12pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "i", curve={height=-12pt}, from=1-4, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, "h"', curve={height=12pt}, from=1-4, to=1-5] \arrow[""{name=6, anchor=center, inner sep=0}, "\gamma", shift left=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[""{name=7, anchor=center, inner sep=0}, "\delta"', shift right=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=2] \arrow["\beta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=5, to=4] \arrow["\phi"', shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=6, to=7] \end{tikzcd} \] and consider the terms: \begin{align*} I &= (\alpha *_0 h) *_1 (g *_0 \beta)\\ E &= \Coh {\Supp(I)} {I \to I} {\id}\\ X &= \phi *_0 E \end{align*} We now have the following critical pair: \(X\) can reduce by inserting the locally maximal argument \(E\), as the branch has branching height \(0\), making the suspendability condition vacuous, but \(E\) also reduces by endo-coherence removal.
By performing the generalised insertion we obtain the coherence: \[ \Coh \Gamma {\gamma *_0 I \to \delta *_0 I} \id\] Let \(W(x,y,z)\) refer to the standard composite over the diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsxLDAsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTN9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6M31dLFsxLDIsIiIsMix7ImN1cnZlIjotNX1dLFsxLDIsIiIsMix7ImN1cnZlIjo1fV0sWzEsMl0sWzQsMywiIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDcsIiIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNyw1LCIiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV1d % tex-fmt: skip \[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=30pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["x"',shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["y"',shorten <=4pt, shorten >=4pt, Rightarrow, from=3, to=4] \arrow["z"',shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=2] \end{tikzcd} \] Then the coherence term above admits further cell reductions which convert the composites \(\gamma *_0 I\) and \(\delta *_0 I\) to \(W(\gamma, (\alpha *_0 h), (g *_0 \beta))\) and \(W(\delta, (\alpha *_0 h), (g *_0 \beta))\). The resulting term reduces no further. If the endo-coherence removal is performed, then \(E\) reduces to \(\id(I)\), which can be pruned from the original composite. After further reductions, we obtain a coherence over the context \(\Delta\) given by the following diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzMsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDAseyJjdXJ2ZSI6NX1dLFsxLDIsIiIsMCx7ImN1cnZlIjotMn1dLFsxLDIsIiIsMix7ImN1cnZlIjoyfV0sWzQsMywiIiwwLHsib2Zmc2V0IjotNSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs0LDMsIiIsMix7Im9mZnNldCI6NSwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDUsIkIiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsOCwiQSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet & \bullet \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=30pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, curve={height=-30pt}, from=1-3, to=1-4] \arrow[""{name=6, anchor=center, inner sep=0}, curve={height=0pt}, from=1-3, to=1-4] \arrow[""{name=3, anchor=center, inner sep=0}, curve={height=30pt}, from=1-3, to=1-4] \arrow[""{name=4, anchor=center, inner sep=0}, "x", shift left=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow[""{name=5, anchor=center, inner sep=0}, "y"', shift right=5, shorten <=8pt, shorten >=8pt, Rightarrow, from=1, to=0] \arrow["B"', shorten <=3pt, shorten >=3pt, Rightarrow, from=3, to=6] \arrow["C"', shorten <=3pt, shorten >=3pt, Rightarrow, from=6, to=2] \arrow["A"', shorten <=4pt, shorten >=4pt, Rightarrow, nfold=3, from=4, to=5] \end{tikzcd} \] In particular, the result of these reductions is the following coherence: \[ \Coh \Delta {W(x,B,C) \to W(y,B,C)} {\langle \phi, (\alpha *_0 h), (g *_0 \beta) \rangle}\] which admits no further reductions, hence breaking confluence. 
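For ease of comparison, we juxtapose the two irreducible terms produced by this critical pair, both of which were derived above: \[ \Coh \Gamma {W(\gamma, (\alpha *_0 h), (g *_0 \beta)) \to W(\delta, (\alpha *_0 h), (g *_0 \beta))} \id\] from the insertion path, and \[ \Coh \Delta {W(x,B,C) \to W(y,B,C)} {\langle \phi, (\alpha *_0 h), (g *_0 \beta) \rangle}\] from the endo-coherence removal path.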
It is even unclear which of these reduction paths is the more canonical for such a system: the first moves the complexity of \(I\) to the type in the coherence, whereas the second keeps the complexity of \(I\) in the arguments of the coherence. Conjecturally, one could consider generalisations of endo-coherence removal which could factor out the common structure of \(W(\gamma, (\alpha *_0 h), (g *_0 \beta))\) and \(W(\delta, (\alpha *_0 h), (g *_0 \beta))\), reducing the result of the first reduction path to the result of the second reduction path, though we have not explored any such definition. \paragraph{A further strictification to \Cattsua} \citeauthor{douglas2016internal} give an explicit representation of a Gray category~\cite[Definition~2.8]{douglas2016internal}, which can be used as a direct point of comparison to \Cattsua, as Gray categories are semistrict \(3\)-categories with strict unitors and associators. The weak structure in their presentation of Gray categories is given by an invertible \(3\)-cell they call \emph{switch}, which has the same form as the \Catt term that we called \(\mathsf{swap}\) in \cref{sec:cattsu}. In their paper, all of the equalities between \(2\)-cells are generated by a set of axioms [S2-4] to [S2-15]. Each of these equalities is contained in the definitional equality of \Cattsua, with the exception of [S2-9] and [S2-10], which witness a compatibility between whiskering and vertical composition. We consider the axiom [S2-9], as [S2-10] can be treated symmetrically. Let \(\Delta\) be the context given by the diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsxLDAsIlxcYnVsbGV0Il0sWzIsMCwiXFxidWxsZXQiXSxbMCwxLCJmIl0sWzEsMiwiIiwwLHsiY3VydmUiOi00fV0sWzEsMiwiIiwwLHsiY3VydmUiOjR9XSxbMSwyXSxbNSw2LCJcXGFscGhhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs2LDQsIlxcYmV0YSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet & \bullet & \bullet \arrow["f", from=1-1, to=1-2] \arrow[""{name=0, anchor=center, inner sep=0}, curve={height=-24pt}, from=1-2, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, curve={height=24pt}, from=1-2, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, from=1-2, to=1-3] \arrow["\alpha"', shorten <=3pt, shorten >=3pt, Rightarrow, from=1, to=2] \arrow["\beta"', shorten <=3pt, shorten >=3pt, Rightarrow, from=2, to=0] \end{tikzcd} \] and consider the following terms of \(\Delta\): \[ (f *_0 \alpha) *_1 (f *_0 \beta) \qquad f *_0 (\alpha *_1 \beta)\] While the second term reduces to the standard composite over \(\Delta\), the first does not reduce, as no insertion can be performed due to the condition on trunk height, and hence these two terms are not equal in \Cattsua, unlike in Gray categories. Although it could be argued that these axioms reside in the interchange family of laws for \(\infty\)-categories, one could attempt to define a stricter version of \Cattsua which incorporates these equalities, with the aim of proving that \(3\)-truncated models of this stricter type theory are equivalent to Gray categories. \paragraph{Strict interchange} In contrast to the reductions in this thesis which strictify units, one could instead consider reductions that strictify all composition, making the associativity and interchange laws strict, and leaving only units weak.
Such a form of semistrictness is often called \emph{Simpson semistrictness}, due to a conjecture of \citeauthor{simpson1998homotopy}~\cite{simpson1998homotopy} that leaving units weak is sufficient to retain the full expressiveness of weak \(\infty\)-categories. To achieve this, one could try an approach similar to insertion, merging arguments of a term into the head coherence when all the involved terms are standard coherences. To be able to strictify terms such as the \(\mathsf{swap}\) term given in \cref{sec:cattsu}, the trunk height condition of insertion must be dropped. This immediately leads to composites over contexts which are not pasting diagrams. Consider the context generated by the diagram: % https://q.uiver.app/#q=WzAsMyxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMCwxLCJnIiwwLHsiY3VydmUiOi0zfV0sWzAsMSwiZiIsMix7ImN1cnZlIjozfV0sWzEsMiwiaSIsMCx7ImN1cnZlIjotM31dLFsxLDIsImgiLDIseyJjdXJ2ZSI6M31dLFs0LDMsIlxcYWxwaGEiLDIseyJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzYsNSwiXFxiZXRhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} x & y & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", curve={height=-18pt}, from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=18pt}, from=1-2, to=1-3] \arrow["\alpha"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta"', shorten <=5pt, shorten >=5pt, Rightarrow, from=3, to=2] \end{tikzcd} \] and then consider the following composite in this context: \[ \alpha *_0 ((\beta *_0 \id(\id(z))) *_1 \rho_i)\] where \(\rho_i\) is the right unitor on \(i\).
Allowing a more general form of merging would lead to this term becoming a composite of the following form: % https://q.uiver.app/#q=WzAsNCxbMCwwLCJ4Il0sWzEsMCwieSJdLFsyLDAsInoiXSxbMywwLCJ6Il0sWzAsMSwiZyIsMCx7ImN1cnZlIjotM31dLFswLDEsImYiLDIseyJjdXJ2ZSI6M31dLFsxLDIsImkiXSxbMSwyLCJoIiwyLHsiY3VydmUiOjN9XSxbMiwzLCJcXGlkKHopIiwyXSxbMSwzLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMiwzLCJcXGlkKHopIiwyLHsiY3VydmUiOjN9XSxbNSw0LCJcXGFscGhhIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDYsIlxcYmV0YSIsMix7InNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbMiw5LCJcXHJob19pIiwyLHsic2hvcnRlbiI6eyJ0YXJnZXQiOjIwfX1dLFsxMCw4LCJcXGlkKFxcaWQoeikpIiwyLHsic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dXQ== % tex-fmt: skip \[ \begin{tikzcd} x & y & z & z \arrow[""{name=0, anchor=center, inner sep=0}, "g", curve={height=-18pt}, from=1-1, to=1-2] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=18pt}, from=1-1, to=1-2] \arrow[""{name=2, anchor=center, inner sep=0}, "i", from=1-2, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "h"', curve={height=25pt}, from=1-2, to=1-3] \arrow[""{name=4, anchor=center, inner sep=0}, "{\id}", from=1-3, to=1-4] \arrow[""{name=5, anchor=center, inner sep=0}, "i", curve={height=-40pt}, from=1-2, to=1-4] \arrow[""{name=6, anchor=center, inner sep=0}, "{\id}"', curve={height=25pt}, from=1-3, to=1-4] \arrow["\alpha"', shorten <=5pt, shorten >=5pt, Rightarrow, from=1, to=0] \arrow["\beta"', shorten <=2pt, shorten >=2pt, Rightarrow, from=3, to=2] \arrow["{\rho_i}"'{pos=0.4}, shorten >=3pt, Rightarrow, from=1-3, to=5] \arrow["{\id^2}"', shorten <=2pt, shorten >=2pt, Rightarrow, from=6, to=4] \end{tikzcd} \] Although this diagram is not a pasting diagram, as it is not a globular set, we would still expect it to fulfil a contractibility property similar to the one pasting diagrams satisfy. One may therefore be led to believe that strict interchange could be achieved in a type theory similar to \Catt by allowing a more general class of pasting diagrams. This, however, does not work. We consider the following counterexample due to \citeauthor{forest2022unifying}~\cite{forest2022unifying}: let \(\Gamma\) be the context generated by the following diagram.
% https://q.uiver.app/#q=WzAsMyxbMCwwLCJcXGJ1bGxldCJdLFsyLDAsIlxcYnVsbGV0Il0sWzQsMCwiXFxidWxsZXQiXSxbMCwxLCIiLDAseyJjdXJ2ZSI6LTV9XSxbMCwxLCIiLDIseyJjdXJ2ZSI6NX1dLFswLDFdLFsxLDIsIiIsMSx7ImN1cnZlIjotNX1dLFsxLDIsIiIsMSx7ImN1cnZlIjo1fV0sWzEsMl0sWzQsNSwiXFxhbHBoYSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNCw1LCJcXGFscGhhJyIsMix7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs1LDMsIlxcYmV0YSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbNSwzLCJcXGJldGEnIiwyLHsib2Zmc2V0Ijo0LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzgsNiwiXFxkZWx0YSIsMix7Im9mZnNldCI6LTQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XSxbOCw2LCJcXGRlbHRhJyIsMix7Im9mZnNldCI6NCwic2hvcnRlbiI6eyJzb3VyY2UiOjIwLCJ0YXJnZXQiOjIwfX1dLFs3LDgsIlxcZ2FtbWEiLDIseyJvZmZzZXQiOi00LCJzaG9ydGVuIjp7InNvdXJjZSI6MjAsInRhcmdldCI6MjB9fV0sWzcsOCwiXFxnYW1tYSciLDIseyJvZmZzZXQiOjQsInNob3J0ZW4iOnsic291cmNlIjoyMCwidGFyZ2V0IjoyMH19XV0= % tex-fmt: skip \[ \begin{tikzcd} \bullet && \bullet && \bullet \arrow[""{name=0, anchor=center, inner sep=0}, "h", curve={height=-40pt}, from=1-1, to=1-3] \arrow[""{name=1, anchor=center, inner sep=0}, "f"', curve={height=40pt}, from=1-1, to=1-3] \arrow[""{name=2, anchor=center, inner sep=0}, "g"{description}, from=1-1, to=1-3] \arrow[""{name=3, anchor=center, inner sep=0}, "k", curve={height=-40pt}, from=1-3, to=1-5] \arrow[""{name=4, anchor=center, inner sep=0}, "i"', curve={height=40pt}, from=1-3, to=1-5] \arrow[""{name=5, anchor=center, inner sep=0}, "j"{description}, from=1-3, to=1-5] \arrow["\alpha\vphantom{\alpha'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["{\alpha'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=1, to=2] \arrow["\beta\vphantom{\beta'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["{\beta'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=2, to=0] \arrow["\delta\vphantom{\delta'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \arrow["{\delta'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=5, to=3] \arrow["\gamma\vphantom{\gamma'}"', shift left=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \arrow["{\gamma'}"', shift right=3, shorten <=4pt, shorten >=4pt, Rightarrow, from=4, to=5] \end{tikzcd} \] and let \(\Delta = \Gamma, (X : \alpha *_0 \delta \to \alpha' *_0 \delta'), (Y : \beta *_0 \gamma \to \beta' *_0 \gamma')\). We then have the following distinct composites: \[ \left( \begin{matrix} f *_0 \gamma\\ *_1\\ X\\ *_1\\ \beta *_0 k \end{matrix} \right) *_2 \left( \begin{matrix} \alpha' *_0 i\\ *_1\\ Y\\ *_1\\ h *_0 \delta' \end{matrix} \right) \not\cong \left( \begin{matrix} \alpha *_0 i\\ *_1\\ Y\\ *_1\\ h *_0 \delta \end{matrix} \right) *_2 \left( \begin{matrix} f *_0 \gamma'\\ *_1\\ X\\ *_1\\ \beta' *_0 k \end{matrix} \right) \] which are intuitively the composite of \(X\) and \(Y\) in either order, where \(X\) and \(Y\) have been whiskered with the appropriate terms. We note that the matrix notation above is only used to aid comprehension, and does not represent the application of any matrix operations. The approach described above of merging together composites would lead to both of the above composites of \(X\) and \(Y\) being reduced to the same composite over \(\Delta\), contradicting the viability of such an approach. 
An alternative, non-rewriting-based approach could be defined by the following equality rule: \begin{equation*} \left\{ (\Gamma, s \sub \sigma, t \sub \sigma) \mathrel{\bigg\vert}{} \begin{matrix*}[l] \text{\(s\) and \(t\) are pure composite terms,}\\ s = t \text{ in a strict \(\infty\)-category} \end{matrix*} \right\} \end{equation*} where a \emph{pure composite} is a term constructed only using standard composites. Such an approach avoids the counterexample above, as the two composites of \(X\) and \(Y\) are not equal in a strict \(\infty\)-category, and so would not be equated in the type theory generated by this equality rule set. We note that, due to an algorithm of \citeauthor{makkai2005word}~\cite{makkai2005word}, which is also described and implemented by \citeauthor{forest2021computational}~\cite{forest2021computational}, it can be decided whether terms \(s\) and \(t\) are equal in a strict \(\infty\)-category. Therefore, to decide equality in the above system, we need a method of finding the correct decomposition of a term into a substitution applied to a purely compositional term. We conjecture that there exists a factorisation system on \(\mathsf{Catt}\) with the left class of morphisms given by the purely compositional substitutions (substitutions whose contained terms are all pure composites), which could be used for this purpose. We leave all details of such a construction for future work. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% References: %% \printbibliography \end{document} % Local Variables: % jinx-local-words: "CollegeShields ps" % TeX-engine: xetex % End: tex-fmt-0.5.2/tests/target/ignore.tex000066400000000000000000000007031473573253500175550ustar00rootroot00000000000000\documentclass{article} \begin{document} Lines which end with the ignore keyword are not indented or wrapped even if they are long % tex-fmt: skip % tex-fmt: off It is also possible to ignore blocks of lines together and not indent them even like this % tex-fmt: on Not ignored % tex-fmt: on Not ignored % tex-fmt: off Ignored % tex-fmt: off Ignored % tex-fmt: on Not ignored % tex-fmt: off Ignored \end{document} tex-fmt-0.5.2/tests/target/lists.tex000066400000000000000000000007101473573253500174260ustar00rootroot00000000000000\documentclass{article} \begin{document} \begin{itemize} \item Lists with items on one line \item Lists with items on multiple lines % comments before a list item \item Another item \item Another item % comments inside a list item Or even just % trailing comments \item Every \item should start \item a new line \end{itemize} Commands such as itemsep should not be affected.
\setlength{\itemsep}{0pt} \end{document} tex-fmt-0.5.2/tests/target/masters_dissertation.tex000066400000000000000000003016011473573253500225410ustar00rootroot00000000000000\documentclass[12pt,draft]{ociamthesis} %TC:ignore % PDF Version %\pdfminorversion=7 % general typesetting \usepackage[utf8]{inputenc} \usepackage[british]{babel} \usepackage{microtype} \usepackage[table]{xcolor} % lengths \usepackage{etoolbox} \usepackage{setspace} % mathematics typesetting \usepackage{amsmath} \usepackage{amssymb} \usepackage{amsthm} \usepackage{mathtools} \usepackage{dsfont} % headers \usepackage{fancyhdr} \fancyhead{} \renewcommand{\headrulewidth}{0pt} % Lof, LoT \usepackage{tocloft} \setlength{\cftfigindent}{0pt} \setlength{\cfttabindent}{0pt} % algorithms \usepackage[boxruled, linesnumbered, commentsnumbered, algochapter, ]{algorithm2e} % graphics \usepackage{graphicx} \usepackage{float} \usepackage{subcaption} % draft options %\usepackage{ifdraft} %\ifoptiondraft{ %\usepackage{draftwatermark} %\SetWatermarkText{DRAFT} %\SetWatermarkScale{6} %\SetWatermarkColor[rgb]{1,0.9,0.9} %\usepackage{showframe} %\usepackage{layout} %}{} % hyperlinks \usepackage[plainpages=false,draft=false ,hidelinks ]{hyperref} \usepackage{cite} % glossary \usepackage[nopostdot,nonumberlist]{glossaries} %TC:endignore % suppress pdf warnings %\pdfsuppresswarningpagegroup=1 \title{Motif\hspace*{0.05cm}-Based Spectral Clustering\\[1ex] of Weighted Directed Networks} \author{William George Underwood} \college{Department of Statistics} \renewcommand{\submittedtext}{} \degree{Part C Dissertation in Mathematics \& Statistics} \degreedate{Trinity 2019} %TC:ignore \theoremstyle{plain} \newtheorem{theorem}{Theorem}[chapter] \newtheorem{proposition}{Proposition}[chapter] \theoremstyle{definition} \newtheorem{definition}{Definition}[chapter] \newtheorem{example}{Example}[chapter] \newtheorem{prf}{Proof}[chapter] \theoremstyle{remark} \newtheorem*{remark}{Remark} \newtheorem*{notation}{Notation} % algorithms \DontPrintSemicolon % input output definitions \makeatletter \renewcommand{\SetKwInOut}[2]{% \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% \newcommand\InOutSizeDefined{}\setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~}% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \else% else keep the larger dimension \ifdim\wd\algocf@inoutbox>\inoutsize% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{\parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~}% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \fi% \fi% the dimension of the box is now defined. 
\algocf@newcommand{#1}[1]{% \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}% {\let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% {\inoutsize}{\KwSty{#2}\algocf@typo:\hfill}~##1\par}% \algocf@linesnumbered% reset the numbering of the lines }}% \makeatother % keywords \SetKwInOut{Input}{Input} \SetKwInOut{Output}{Output} \SetKw{In}{in} \SetKwProg{Function}{function}{:}{} % algorithm comment styles \newcommand\commfont[1]{\rmfamily{#1}} \SetCommentSty{commfont} \SetKwComment{Comm}{$\rhd\ $}{} % line spacing \AtBeginEnvironment{algorithm}{\setstretch{1.15}} % glossaries \setlength{\glsdescwidth}{0.92\hsize} \newglossarystyle{mystyle}{% \setglossarystyle{long}% \renewenvironment{theglossary}% \begin{longtable}{@{}p{2cm}p{\glsdescwidth}}% \end{longtable}% } \makeglossaries % macros \newcommand\bb[1]{\mathbb{#1}} \newcommand\ca[1]{\mathcal{#1}} \newcommand\Aut{\mathrm{Aut}} % for inputting tables \makeatletter\let\expandableinput\@@input\makeatother %TC:endignore % Glossary \newglossaryentry{MAM}{name=MAM, description={Motif adjacency matrix}} \newglossaryentry{DSBM}{name=DSBM, description={Directed stochastic block model}} \newglossaryentry{BSBM}{name=BSBM, description={Bipartite stochastic block model}} \newglossaryentry{Ncut}{name=Ncut, description={Normalised cut}} \newglossaryentry{ARI}{name=ARI, description={Adjusted Rand Index}} \glsaddall \begin{document} %TC:ignore % give sufficient line spacing for comment markup \baselineskip=18pt plus1pt % set how many section levels get numbers and appear in the contents \setcounter{secnumdepth}{3} \setcounter{tocdepth}{2} % do not hyphenate short words \lefthyphenmin4 \righthyphenmin4 \pagenumbering{Alph} %TC:endignore \maketitle \clearpage{} \begin{abstract} Clustering is an essential technique for network analysis, with applications in a diverse range of fields. Although spectral clustering is a popular and effective method, it fails to consider higher-order structure and can perform poorly on directed networks. We aim to address these shortcomings by exploring motif-based spectral clustering methods. We present new matrix formulae for motif adjacency matrices, and a motif-based approach for clustering bipartite networks. Comprehensive experimental results from both synthetic and real data demonstrate the effectiveness of our techniques on a variety of networks. We conclude that motif-based spectral clustering is a valuable tool for analysis of directed and bipartite weighted networks, which is also scalable and easy to implement. \end{abstract} \clearpage{} %TC:ignore \pagenumbering{arabic} \begin{romanpages} \tableofcontents \newpage \listoffigures \newpage \listoftables \begingroup \let\cleardoublepage\relax \let\clearpage\relax %\printglossary[title=Abbreviations, style=mystyle] \endgroup \end{romanpages} % fancy headers \pagestyle{fancy} \renewcommand{\chaptermark}[1]{\markboth{#1}{}} \fancyhead[RO]{\itshape{\nouppercase{Chapter \thechapter : \leftmark}}} %TC:endignore \clearpage{} \chapter{Introduction} % Importance of network analysis in the modern world Networks are ubiquitous in modern society; from the internet and online blogs to protein interactions and human migration, we are surrounded by inherently connected structures~\cite{kolaczyk2014statistical}. 
The mathematical and statistical analysis of networks is therefore a very important area of modern research, with applications in a diverse range of fields including biology~\cite{albert2005scale}, chemistry~\cite{jacob2018statistics}, physics~\cite{newman2008physics} and sociology~\cite{adamic2005political}. % Clustering is a core technique A common problem in network analysis is that of \emph{clustering}~\cite{schaeffer2007graph}. Network clustering refers to the division of a network into several parts so that objects in the same part are similar, while those in different parts are dissimilar. %Spectral methods are good Spectral methods for network clustering have a long and successful history~\cite{cheeger1969lower,donath1972algorithms,guattery1995performance}, and have become increasingly popular in recent years. These techniques exhibit many attractive properties, including generality, ease of implementation and scalability~\cite{von2007tutorial}. % Shortcomings of spectral methods However, traditional spectral methods have shortcomings, particularly involving their inability to consider higher-order network structures~\cite{benson2016higher}, and their insensitivity to edge direction~\cite{DirectedClustImbCuts}. These weaknesses can lead to unsatisfactory results, especially when working with directed networks. Motif-based spectral methods have proven more effective for clustering directed networks on the basis of higher-order structures~\cite{tsourakakis2017scalable}, with the introduction of the \emph{motif adjacency matrix} (MAM). % Problems we want to solve In this dissertation we will explore motif-based spectral clustering methods with a focus on addressing these shortcomings for weighted directed networks. Our main contributions include a collection of new matrix-based formulae for MAMs on weighted directed networks, and a motif-based approach for clustering bipartite networks. We also provide comprehensive experimental results both from synthetic data (stochastic block models) and from real-world network data. \section*{Dissertation layout} In Chapter~\ref{chap:graphs} we describe our graph-theoretic framework, which provides a natural model for real-world weighted directed networks. We define motifs and instances, and then state and prove new matrix-based formulae for MAMs. % In Chapter~\ref{chap:spectral} we provide a summary of random-walk spectral clustering and discuss techniques for cluster extraction and evaluation. We state the algorithms for both traditional and motif-based spectral clustering. % In Chapter~\ref{chap:motif} we introduce directed stochastic block models (DSBMs), a family of generative models for directed networks, and evaluate the performance of motif-based clustering both on synthetic data and on real data (US Political Blogs network, US Migration network). % In Chapter~\ref{chap:bipartite} we propose a motif-based approach for clustering bipartite graphs and introduce bipartite stochastic block models (BSBMs), a family of generative models for bipartite networks. We again provide experimental results both on synthetic data and on real data (American Revolution network, Unicode Languages network). % Finally, in Chapter~\ref{chap:conclusions} we present our conclusions, along with a discussion about limitations and potential extensions of our work. \clearpage{} \clearpage{} \chapter{Graphs and Motifs} \label{chap:graphs} We describe our graph-theoretic framework for network analysis and give matrix-based formulae for motif adjacency matrices (MAMs).
In Section~\ref{sec:graphs_graph_definitions} we outline basic concepts relating to graphs and motifs. In Section~\ref{sec:graphs_adj_and_ind_matrices} we define the adjacency and indicator matrices of a graph. In Section~\ref{sec:graphs_motif_adj_matrices} we introduce MAMs and present the main results of this chapter, Proposition~\ref{prop:motif_adj_matrix_formula} and Proposition~\ref{prop:motif_adj_matrix_computation}. \section{Graph definitions} \label{sec:graphs_graph_definitions} Graph notation is notoriously inconsistent in the literature \cite{intro_to_graph_theory}, so we begin by giving all of the relevant notation and definitions. \begin{definition}[Graphs] A \emph{graph} is a triple $\ca{G} = (\ca{V,E},W)$ where $\ca{V}$ is the \emph{vertex set}, $\ca{E} \subseteq \left\{ (i,j) : i,j \in \ca{V}, i \neq j \right\}$ is the \emph{edge set} and $W\colon \ca{E} \to (0,\infty)$ is the \emph{weight map}. \end{definition} \begin{remark} We consider weighted directed graphs without self-loops or multiple edges. We can extend to undirected graphs by replacing undirected edges with bidirectional edges. Where it is not relevant, we may sometimes omit the weight map $W$. \end{remark} \begin{definition}[Underlying edges] Let $\ca{G} = (\ca{V,E})$ be a graph. Its \emph{underlying edges} are $\bar{\ca{E}} \vcentcolon = \big\{ \{i,j\} : (i,j) \in \ca{E} \big\}$. \end{definition} \begin{definition}[Subgraphs] A graph $\ca{G'} = (\ca{V',E'})$ is a \emph{subgraph} of a graph $\ca{G} = (\ca{V,E})$ (write $\ca{G'} \leq \ca{G}$) if $\ca{V'} \subseteq \ca{V}$ and $\ca{E'} \subseteq \ca{E}$. It is an \emph{induced subgraph} (write $\ca{G'} < \ca{G}$) if further $\ca{E'} = \ca{E} \cap ( \ca{V'} \times \ca{V'} )$. \end{definition} \begin{definition}[Connected components] Let $\ca{G} = (\ca{V,E})$ be a graph. The \emph{connected components} of $\ca{G}$ are the partition $\ca{C}$ generated by the transitive closure of the relation $\sim$ on $\ca{V}$ defined by $i \sim j \iff \{i,j\} \in \bar{\ca{E}}$. We say $\ca{G}$ is (weakly) \emph{connected} if $|\ca{C}| = 1$. \end{definition} \begin{definition}[Graph isomorphisms] A graph $\ca{G'} = (\ca{V',E'})$ is \emph{isomorphic} to a graph $\ca{G} = (\ca{V,E})$ (write $\ca{G'} \cong \ca{G}$) if there is a bijection $\phi\colon \ca{V'} \rightarrow \ca{V}$ with $(u,v) \in \ca{E'} \iff \big(\phi(u), \phi(v) \big) \in \ca{E}$. An isomorphism from a graph to itself is called an \emph{automorphism}. \end{definition} \begin{definition}[Motifs and anchor sets] A \emph{motif} is a pair $(\ca{M,A})$ where $\ca{M} = (\ca{V_M,E_M})$ is a connected graph with $\ca{V_M} = \{ 1, \ldots, m \}$ for some small $m \geq 2$, and $\ca{A} \subseteq \ca{V_M}$ with $|\ca{A}| \geq 2$ is an \emph{anchor set}. If $\ca{A} \neq \ca{V_M}$ we say the motif is \emph{anchored}, and if $\ca{A=V_M}$ we say it is \emph{simple}. \end{definition} \begin{remark} Anchor sets~\cite{benson2016higher} specify which r\^oles vertices play in the motif, and are crucial for defining the collider and expander motifs given in Section~\ref{sec:coll_expa}. When an anchor set is not given, it is assumed that the motif is simple. Figure~\ref{fig:motif_definitions_directed} shows all simple motifs (up to isomorphism) on at most three vertices. \end{remark} \begin{definition}[Instances] Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. We say that $\ca{H}$ is a \emph{functional instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} \leq \ca{G}$. 
We say that $\ca{H}$ is a \emph{structural instance} of $\ca{M}$ in $\ca{G}$ if $\ca{M} \cong \ca{H} < \ca{G}$. \end{definition} \begin{definition}[Anchored pairs] Let $\ca{G}$ be a graph and $(\ca{M,A})$ a motif. Suppose $\ca{H}$ is an instance of $\ca{M}$ in $\ca{G}$. Define the \emph{anchored pairs of the instance} $\ca{H}$ as $$ \ca{A(H)} \vcentcolon = \big\{ \{\phi(i),\phi(j)\} : i,j \in \ca{A}, \ i \neq j, \ \phi \textrm{ is an isomorphism from } \ca{M} \textrm{ to } \ca{H} \big\}\,.$$ \end{definition} \begin{remark} Example~\ref{ex:instances} demonstrates functional and structural instances. Note that $\{i,j\} \in \ca{A(H)}$ if and only if $\ca{H}$ appears in $\ca{G}$ as an instance of $\ca{M}$ with $i \neq j$ co-appearing in the image of $\ca{A}$ under isomorphism. The motivation for this is that clustering methods should avoid separating vertices which appear as an anchored pair. \end{remark} % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/motif_definitions_directed/motif_definitions_directed.pdf} \caption{All simple motifs on at most three vertices} \label{fig:motif_definitions_directed} \end{figure} \section{Adjacency and indicator matrices} \label{sec:graphs_adj_and_ind_matrices} Adjacency matrices provide a useful data structure for representing graphs and have many uses in calculating graph properties \cite{bapat2010graphs}. We define several variants of the adjacency matrix, which appear in Proposition~\ref{prop:motif_adj_matrix_formula} and Table~\ref{tab:motif_adj_mat_table}. \begin{definition}[Adjacency matrices] Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, n \}$. The \emph{adjacency matrix, single-edge adjacency matrix} and \emph{double-edge adjacency matrix} of $\ca{G}$ are respectively the $n \times n$ matrices \begin{align*} G_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ (G_\mathrm{s})_{i j} &\vcentcolon= W((i,j)) \ \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \notin \ca{E} \}\,, \\ (G_\mathrm{d})_{i j} &\vcentcolon= \big( W((i,j)) + W((j,i)) \big) \ \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,. \end{align*} \end{definition} \begin{definition}[Indicator matrices] Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set $\ca{V} = \{1, \ldots, n \}$. The \emph{indicator matrix, single-edge indicator matrix, double-edge indicator matrix, missing-edge indicator matrix} and \emph{vertex-distinct indicator matrix} of $\ca{G}$ are respectively the $n \times n$ matrices \begin{align*} J_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \}\,, \\ (J_\mathrm{s})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \notin \ca{E} \}\,, \\ (J_\mathrm{d})_{i j} &\vcentcolon= \bb{I} \{ (i,j) \in \ca{E} \textrm{ and } (j,i) \in \ca{E} \}\,, \\ (J_0)_{i j} &\vcentcolon= \bb{I} \{ (i,j) \notin \ca{E} \textrm{ and } (j,i) \notin \ca{E} \textrm{ and } i \neq j \}\,, \\ (J_\mathrm{n})_{i j} &\vcentcolon= \bb{I} \{ i \neq j \}\,. \end{align*} \end{definition} \section{Motif adjacency matrices} \label{sec:graphs_motif_adj_matrices} The central object in motif-based spectral clustering is the \emph{motif adjacency matrix} (MAM) \cite{benson2016higher}, which serves as a similarity matrix for spectral clustering (Chapter~\ref{chap:spectral}). We provide here our main results: Proposition~\ref{prop:motif_adj_matrix_formula} gives a computationally useful formula for MAMs, and Proposition~\ref{prop:motif_adj_matrix_computation} gives a complexity analysis of this formula. 
\pagebreak \subsection{Definitions} \begin{definition}[Motif adjacency matrices] \label{def:motif_adj_matrices} % Let $\ca{G} = (\ca{V,E},W)$ be a graph with $n$ vertices and let $\ca{(M,A)}$ be a motif. The \emph{functional} and \emph{structural motif adjacency matrices} (MAMs) of $\ca{(M,A)}$ in $\ca{G}$ are respectively the $n \times n$ matrices % \begin{align*} M^\mathrm{func}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e)\,, \\ M^\mathrm{struc}_{i j} &\vcentcolon= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} < \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e)\,. \end{align*} \end{definition} \begin{remark} Example~\ref{ex:motif_adj_matrices} gives a simple illustration of calculating an MAM. When $W \equiv 1$ and $\ca{M}$ is simple, the (functional or structural) MAM entry $M_{i j} \ (i \neq j)$ simply counts the (functional or structural) instances of $\ca{M}$ in $\ca{G}$ containing $i$ and $j$. When $\ca{M}$ is not simple, $M_{i j}$ counts only those instances with anchor sets containing both $i$ and $j$. MAMs are always symmetric, since the only dependency on $(i,j)$ is via the unordered set $\{i,j\}$. \end{remark} \subsection{Computation} \label{sec:graphs_computation} In order to state Propositions \ref{prop:motif_adj_matrix_formula} and~\ref{prop:motif_adj_matrix_computation}, we need one more definition. \begin{definition}[Anchored automorphism classes] Let $(\ca{M,A})$ be a motif. Let $S_\ca{M}$ be the set of permutations on $ \ca{V_M} = \{ 1, \ldots, m \}$ and define the \emph{anchor-preserving permutations} $S_\ca{M,A} = \{ \sigma \in S_\ca{M} : \{1,m\} \subseteq \sigma(\ca{A}) \}$. Let $\sim$ be the equivalence relation defined on $S_\ca{M,A}$ by: $\sigma \sim \tau \iff \tau^{-1} \sigma$ is an automorphism of $\ca{M}$. Finally the \emph{anchored automorphism classes} are the quotient set $S_\ca{M,A}^\sim \vcentcolon= S_\ca{M,A} \ \big/ \sim$\,. \end{definition} \begin{proposition}[MAM formula] \label{prop:motif_adj_matrix_formula} Let $\ca{G} = (\ca{V,E},W)$ be a graph with vertex set ${\ca{V}=\{1,\ldots,n\}}$ and let $(\ca{M,A})$ be a motif on $m$ vertices. 
Then for any $i,j \in \ca{V}$ and with $k_1 = i$, $k_m = j$, the functional and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are given by % % \begin{align*} M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma}\,, &(1) \\ M^\mathrm{struc}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}} \ J^\mathrm{struc}_{\mathbf{k},\sigma} \ G^\mathrm{struc}_{\mathbf{k},\sigma}\,, &(2) \end{align*} % where % \begin{align*} \ca{E}_\ca{M}^0 &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \notin \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ \ca{E}_\ca{M}^\mathrm{s} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \in \ca{E_M}, (v,u) \notin \ca{E_M} \}\,, \\ \ca{E}_\ca{M}^\mathrm{d} &\vcentcolon= \{ (u,v) : 1 \leq u < v \leq m : (u,v) \in \ca{E_M}, (v,u) \in \ca{E_M} \}\,, \end{align*} % are respectively the missing edges, single edges and double edges of $\ca{E_M}$, and % %TC:ignore \begin{alignat*}{3} % J^\mathrm{func}_{\mathbf{k},\sigma} & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{func}_{\mathbf{k},\sigma} & \vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % J^\mathrm{struc}_{\mathbf{k},\sigma} & \vcentcolon= \prod_{\ca{E}_\ca{M}^0} (J_0)_{k_{\sigma u},k_{\sigma v}} && && \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{struc}_{\mathbf{k},\sigma} &\vcentcolon= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} && + && \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,. % \end{alignat*} %TC:endignore \end{proposition} % \begin{proof} See Proof~\ref{proof:motif_adj_matrix_formula}. \end{proof} \begin{proposition}[Complexity of MAM formula] \label{prop:motif_adj_matrix_computation} Suppose that ${m \leq 3}$, and the adjacency matrix $G$ of $\ca{G}$ is known. Then computing adjacency and indicator matrices and calculating an MAM using Equations $(1)$ and $(2)$ in Proposition~\ref{prop:motif_adj_matrix_formula} involves at most 18 matrix multiplications, 22 entry-wise multiplications and 21 additions of (typically sparse) $n \times n$ matrices. \end{proposition} \begin{proof} See Proof~\ref{proof:motif_adj_matrix_computation}. \end{proof} Hence for motifs on at most three vertices and with sparse adjacency matrices, Proposition~\ref{prop:motif_adj_matrix_formula} gives a fast and parallelisable matrix-based procedure for computing MAMs. In practice, additional symmetries of the motif often allow computation with even fewer matrix operations, demonstrated in Example~\ref{ex:motif_adj_calc}. A list of such MAM formulae for all simple motifs on at most three vertices (up to isomorphism), as well as for the \emph{collider} and \emph{expander} motifs (Section~\ref{sec:coll_expa}), is given in Table~\ref{tab:motif_adj_mat_table}. 
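As a small sanity check of Equation $(1)$ (a verification of our own, included for illustration), consider the two-vertex motif $\ca{M}_\mathrm{s}$ with $\ca{E_M} = \{(1,2)\}$. Here $\ca{E}_\ca{M}^\mathrm{s} = \{(1,2)\}$ and $\ca{E}_\ca{M}^0 = \ca{E}_\ca{M}^\mathrm{d} = \emptyset$, the anchored automorphism classes are represented by the identity and the transposition, and there are no internal vertices to sum over, so Equation $(1)$ reduces to $$ M^\mathrm{func}_{i j} = J_{i j} G_{i j} + J_{j i} G_{j i} = G_{i j} + G_{j i} = (G + G^\top)_{i j}\,. $$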
These formulae are generalisations of those stated in Table S6 in the supplementary materials for \cite{benson2016higher}, which is an incomplete list containing only \emph{structural} MAMs of \emph{unweighted} graphs. Note that the functional MAM formula for the two-vertex motif $\ca{M}_\mathrm{s}$ yields the symmetrised adjacency matrix $M = G + G^\top$, which is used for traditional spectral clustering (Section~\ref{sec:spectral_overview}). The question of whether to use functional or structural MAMs for motif-based spectral clustering will be addressed in Section~\ref{sec:spectral_motifrwspectclust}. \clearpage{} \clearpage{} \chapter{Spectral Clustering} \label{chap:spectral} We provide a summary of traditional random-walk spectral clustering and show how it applies to motif-based clustering. This chapter mostly follows the relevant sections in the tutorial by U.~Von~Luxburg~\cite{von2007tutorial}, which provides further explanations and proofs. In Section~\ref{sec:spectral_overview} we give an overview of the spectral clustering procedure. In Section~\ref{sec:spectral_laplacians} we define the random-walk Laplacian and state some of its useful properties (Proposition~\ref{prop:laplacian}). In Section~\ref{sec:spectral_graph_cut} we introduce normalised cut (Ncut) as an objective function for graph partitioning. In Section~\ref{sec:spectral_cluster_extraction} we explore methods of extracting clusters from $\bb{R}^l$-valued embeddings, and in Section~\ref{sec:spectral_algs} we present the algorithms for both traditional and motif-based random-walk spectral clustering. \section{Overview of spectral clustering} \label{sec:spectral_overview} Suppose $x_1, \ldots, x_n$ are data points with some associated symmetric similarity matrix $M$ with ${M_{i j} = \mathrm{similarity}(x_i,x_j)}$. The intuitive aim of clustering is to find a partition $\ca{P}_1, \ldots, \ca{P}_k$ of $\{ x_1, \ldots, x_n \}$ which places similar points in the same group and dissimilar points in different groups. Where other methods such as $k$-means++ \cite{arthur2007k} and GMM clustering \cite{duda1973pattern} demand some further structure on $x_i$ (such as taking values in $\bb{R}^l$), spectral clustering has no such requirements. In the context of \emph{undirected} graph clustering, the data points are the vertices of the graph, and a similarity matrix is provided by the graph's adjacency matrix $G$. To cluster directed graphs, the adjacency matrix must first be symmetrised, traditionally by the transformation $M = G + G^\top$ \cite{Meila2007ClusteringBW}. This symmetrisation ignores information about edge direction and higher-order structures, and can lead to poor performance, as will be seen in Section~\ref{sec:motif_asymm_dsbms}. Spectral clustering consists of two steps. Firstly, eigendecomposition of a Laplacian matrix embeds the vertices into $\bb{R}^{l}$. The $k$ clusters are then extracted from this space. \section{Graph Laplacians} \label{sec:spectral_laplacians} The Laplacians of an undirected graph are a family of matrices which play a central r\^ole in spectral clustering. While many different graph Laplacians are available, we focus in this dissertation on just the \emph{random-walk Laplacian}, for reasons concerning objective functions, consistency and computation \cite{von2007tutorial, luxburg2004convergence}. \begin{definition} Let $\ca{G}$ be an undirected graph with (symmetric) adjacency matrix $G$.
The \emph{random-walk Laplacian matrix} of $\ca{G}$ is $$ L_\mathrm{rw} \vcentcolon= I - D^{-1} G $$ where $I$ is the identity matrix and $D$ is the diagonal matrix of weighted degrees, with $D_{ii} \vcentcolon= \sum_j G_{i j}$.
\end{definition}
\begin{remark}
$D^{-1} G$ is the transition matrix of a random walk on the vertex set $\ca{V}$ where the probability of the transition $v_i \to v_j$ is proportional to $G_{i j}$.
\end{remark}
\begin{proposition}[Properties of the random-walk Laplacian]
\label{prop:laplacian}
$L_\mathrm{rw}$ is positive semi-definite with eigenvalues $0 = \lambda_1 \leq \cdots \leq \lambda_n$. The multiplicity $k$ of the eigenvalue $0$ is equal to the number of connected components $\ca{P}_1, \ldots, \ca{P}_k$ of $\ca{G}$. The eigenspace of the eigenvalue $0$ is spanned by the indicator vectors on these components: $ \bb{I}_{\ca{P}_1}, \ldots, \bb{I}_{\ca{P}_k} $.
\end{proposition}
\begin{proof}
See \cite{von2007tutorial}.
\end{proof}
\section{Graph cuts}
\label{sec:spectral_graph_cut}
Graph cuts provide objective functions which we seek to minimise while clustering the vertices of a graph. We look at the normalised cut and its relationship with the random-walk Laplacian.
\begin{definition}
Let $\ca{G}$ be a graph. Let $ \ca{P}_1, \ldots, \ca{P}_k $ be a partition of $\ca{V}$. Then the \emph{normalised cut} \cite{shi2000normalized} of $\ca{G}$ with respect to $ \ca{P}_1, \ldots, \ca{P}_k $ is
%
$$ \mathrm{Ncut}_\ca{G}(\ca{P}_1, \ldots, \ca{P}_k) \vcentcolon= \frac{1}{2} \sum_{i=1}^k \frac{ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) }{ \mathrm{vol}(\ca{P}_i) } $$
%
where $ \mathrm{cut}(\ca{P}_i,\bar{\ca{P}_i}) \vcentcolon= \sum_{u \in \ca{P}_i, \, v \in \ca{V} \setminus \ca{P}_i} G_{u v}$ and $\mathrm{vol}(\ca{P}_i) \vcentcolon= \sum_{u \in \ca{P}_i} D_{u u}$.
\end{definition}
\begin{remark}
More desirable partitions have a lower Ncut value; the numerators penalise partitions which cut a large number of heavily weighted edges, and the denominators penalise partitions which have highly imbalanced cluster sizes.
\end{remark}
It can be shown \cite{von2007tutorial} that minimising Ncut over partitions $ \ca{P}_1, \ldots, \ca{P}_k $ is equivalent to finding the cluster indicator matrix $H \in \bb{R}^{n \times k}$ minimising $$ \mathrm{Tr} \big( H^\top (D-G) H \big) $$ subject to $$ H_{i j} = \mathrm{vol}(\ca{P}_j)^{-\frac{1}{2}} \ \bb{I} \{ v_i \in \ca{P}_j \}\,, \qquad (\dagger) $$ $$ H^\top D H = I\,. $$ Solving this problem is in general \textsf{NP}-hard \cite{wagner1993between}. However, by dropping the constraint~$(\dagger)$ and applying the Rayleigh Principle \cite{lutkepohl1996handbook}, we find that the solution to this relaxed problem is that $H$ contains the first $k$ eigenvectors of $L_\mathrm{rw}$ as columns \cite{von2007tutorial}. In practice, to find $k$ clusters it is often sufficient to use only the first $l < k$ eigenvectors of $L_\mathrm{rw}$.
\section{Cluster extraction}
\label{sec:spectral_cluster_extraction}
Once Laplacian eigendecomposition has been used to embed the data into $\bb{R}^l$, the clusters may be extracted using a variety of methods. We propose $k$-means++ and eigenvector sweep as two appropriate techniques.
\subsection{\texorpdfstring{$k$}{k}-means++}
$k$-means++ \cite{arthur2007k} is a popular clustering algorithm for data in $\bb{R}^l$. It aims to minimise the within-cluster sum of squares, based on the standard Euclidean metric on $\bb{R}^l$.
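In practice this extraction step is a single library call. The sketch below is illustrative only: it assumes an embedding matrix \texttt{H} whose rows are the embedded vertices (here a random stand-in), and uses scikit-learn, whose \texttt{KMeans} applies the $k$-means++ initialisation.
\begin{verbatim}
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
H = rng.normal(size=(100, 2))  # stand-in for a spectral embedding
k = 2
# init="k-means++" is the scikit-learn default; shown for emphasis.
labels = KMeans(n_clusters=k, init="k-means++").fit_predict(H)
\end{verbatim}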
This within-cluster objective makes $k$-means++ a reasonable candidate for clustering spectral data, since the Euclidean metric corresponds to notions of `diffusion distance' in the original graph \cite{nadler2006diffusion}.
\subsection{Eigenvector sweep}
\label{sec:spectral_sweep}
Eigenvector sweep (Algorithm~\ref{alg:eigenvector_sweep})~\cite{shi2000normalized} offers a more principled technique for cluster extraction when $k=2$ clusters are required, and a single eigenvector (usually the second eigenvector of $L_\mathrm{rw}$) is available. It works by sorting the eigenvector and selecting a splitting point to minimise the Ncut score of the partition generated.
\pagebreak
\begin{algorithm}[H]
\caption{Eigenvector sweep}
\label{alg:eigenvector_sweep}
\SetKwFunction{Main}{EigenvectorSweep}
\newcommand{\MainArgs}{$\ca{G}, x$}
\BlankLine
\Input{Graph $\ca{G}$, eigenvector $x$}
\Output{Partition $\ca{P}_1, \ca{P}_2$}
\BlankLine
\Function{\Main{\MainArgs}}{
$\hat{x} \leftarrow \mathtt{sort}(x)$ \;
$\mathrm{Score_{best}} \leftarrow \infty$ \;
\For{$i$ \In $1, \ldots, n-1$}{
$\ca{P} \leftarrow \{ \hat{x}_1, \ldots, \hat{x}_i \}$ \;
$\mathrm{Score} \leftarrow \mathrm{Ncut}_\ca{G} (\ca{P}, \ca{V} \setminus \ca{P})$ \;
\If{$\mathrm{Score} < \mathrm{Score_{best}}$}{
$\ca{P}_\mathrm{best} \leftarrow \ca{P}$ \;
$\mathrm{Score_{best}} \leftarrow \mathrm{Score}$ \;
}
}
$\ca{P}_1 \leftarrow \ca{P}_\mathrm{best}$ \;
$\ca{P}_2 \leftarrow \ca{V} \setminus \ca{P}_\mathrm{best}$ \;
\Return $\ca{P}_1, \ca{P}_2$
}
\end{algorithm}
\vspace*{0.5cm}
Figure~\ref{fig:eigenvector_sweep_network} shows a small network with vertices labelled by position in the sorted second eigenvector $\hat{x}$ of $L_\mathrm{rw}$. Figure~\ref{fig:eigenvector_sweep_profile} shows the `sweep profile' of Ncut scores, which is minimised at the splitting point $i=5$. Hence eigenvector sweep chooses the final partition $\ca{P}_1 = \{1, \ldots, 5\}, \ \ca{P}_2 = \{6, \ldots, 10\}$, as indicated by the vertex colours and dashed line in Figure~\ref{fig:eigenvector_sweep_network}.
%
%
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../tikz/eigenvector_sweep_network/eigenvector_sweep_network.pdf}
\caption{A small network}
\label{fig:eigenvector_sweep_network}
\end{subfigure}
%
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/eigenvector_sweep/eigenvector_sweep_scores.pdf}
\caption{Sweep profile of the network}
\label{fig:eigenvector_sweep_profile}
\end{subfigure}
\caption{Eigenvector sweep selects a partition by minimising Ncut}
\label{fig:eigenvector_sweep}
\end{figure}
%
\subsection{Cluster evaluation}
When a graph has been clustered, we assign a score to the partition. If the ground-truth clustering is available, we can compare it to our clustering using the \emph{adjusted Rand index} (ARI) \cite{hubert1985comparing}. The ARI between two clusterings has expected value $0$ under random cluster assignment, and maximum value $1$ denoting perfect agreement between the clusterings. A larger ARI indicates greater similarity between the clusterings. If the ground-truth clustering is not available, we can use the objective function Ncut. Clusterings with lower Ncut values partition the graph more favourably.
\section{Spectral clustering algorithms}
\label{sec:spectral_algs}
We present the full random-walk spectral clustering algorithm and show how it can be applied to motif-based random-walk spectral clustering.
\subsection{Random-walk spectral clustering}
Algorithm~\ref{alg:rwspectclust} gives random-walk spectral clustering \cite{von2007tutorial}, which takes a symmetric connected adjacency matrix as input. We use $k$-means++ rather than eigenvector sweep as the cluster extraction method, due to its superior flexibility and computational speed. We drop the first column of $H$ (the first eigenvector of $L_\mathrm{rw}$) since, although it should be constant and uninformative (Proposition~\ref{prop:laplacian}), numerical imprecision may introduce unwanted artefacts. It is worth noting that although the relaxation used in Section~\ref{sec:spectral_graph_cut} is reasonable and often leads to good approximate solutions of the Ncut problem, there are cases where it performs poorly~\cite{guattery1998quality}. The Cheeger inequality~\cite{chung2005laplacians} gives a bound on the error introduced by this relaxation.
\vspace*{0.5cm}
\begin{algorithm}[H]
\caption{Random-walk spectral clustering}
\label{alg:rwspectclust}
\SetKwFunction{Main}{RWSpectClust}
\newcommand{\MainArgs}{$G,k,l$}
\BlankLine
\Input{Symmetric adjacency matrix $G$, number of clusters $k$, dimension $l$}
\Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$}
\BlankLine
\Function{\Main{\MainArgs}}{
Construct the weighted degree matrix $D_{ii} \leftarrow \sum_j G_{i j}$ \\
Construct the random-walk Laplacian matrix $L_\mathrm{rw} \leftarrow I-D^{-1}G$ \\
Let $H$ have the first $l$ eigenvectors of $L_\mathrm{rw}$ as columns \\
Drop the first column of $H$ \\
Run $k$-means++ on the rows of $H$ with $k$ clusters to produce $\ca{P}_1, \ldots, \ca{P}_k$ \\
\Return $\ca{P}_1, \ldots, \ca{P}_k$
}
\end{algorithm}
\subsection{Motif-based random-walk spectral clustering}
\label{sec:spectral_motifrwspectclust}
Algorithm~\ref{alg:motifrwspectclust} gives motif-based random-walk spectral clustering. Note that although $\ca{G}$ may be a connected graph, there is no guarantee that the MAM is also connected. Hence $M$ is restricted to its largest connected component $C$ before spectral clustering is applied. While this may initially seem to be a flaw with motif-based spectral clustering (since not all vertices are assigned to a cluster), in fact it can be very useful; restriction of $M$ can remove vertices which are in some sense not `well connected' to the rest of the graph, which means that only a `core' set of vertices is clustered. This can result in Algorithm~\ref{alg:motifrwspectclust} making fewer misclassifications than traditional spectral clustering, as seen in Section~\ref{sec:motif_polblogs}. There is ambiguity in whether to use functional or structural MAMs. While the authors in~\cite{benson2016higher} opt for structural MAMs, we propose to use functional MAMs, for two reasons. Firstly, note that $ 0 \leq M^\mathrm{struc}_{i j} \leq M^\mathrm{func}_{i j}$ for all $i,j \in \ca{V}$. This implies that the largest connected component of $M^\mathrm{func}$ is always at least as large as that of $M^\mathrm{struc}$, meaning that more vertices can often be assigned to a cluster. Secondly, we argue that functional instances are of more interest than structural instances, since they specify only the `existence' rather than the `non-existence' of edges. For consistency we will therefore use functional MAMs throughout our experiments. The most computationally expensive part of Algorithm~\ref{alg:motifrwspectclust} is the calculation of the MAM using a formula from Table~\ref{tab:motif_adj_mat_table}.
We found this to be feasible for graphs with up to $n \approx 10 \, 000$ vertices. General notes on hardware and software are given in Section~\ref{sec:notes_hardware}, and timings for MAM computation across a range of graph sizes and sparsities are available in Section~\ref{sec:notes_timing}.
\vspace*{0.5cm}
\begin{algorithm}[H]
\caption{Motif-based random-walk spectral clustering}
\label{alg:motifrwspectclust}
\SetKwFunction{Main}{MotifRWSpectClust}
\newcommand{\MainArgs}{$\ca{G},\mathcal{M},k,l$}
\BlankLine
\Input{Graph $\ca{G}$, motif $\ca{M}$, number of clusters $k$, dimension $l$}
\Output{Partition $\ca{P}_1, \ldots, \ca{P}_k$}
\BlankLine
\Function{\Main{\MainArgs}}{
Construct the motif adjacency matrix $M$ of the graph $\ca{G}$ with motif $\ca{M}$ \\
Let $\tilde{M}$ be $M$ restricted to its largest connected component, $C$ \\
$\ca{P}_1, \ldots, \ca{P}_k \leftarrow$ \texttt{RWSpectClust($\tilde{M},k,l$)} \\
\Return $\ca{P}_1, \ldots, \ca{P}_k$
}
\end{algorithm}
\clearpage{}
\clearpage{}
\chapter{Motif-Based Clustering}
\label{chap:motif}
We analyse the performance of motif-based random-walk spectral clustering on both synthetic and real data. In Section~\ref{sec:motif_dsbms} we propose a family of stochastic block models and perform experiments with a variety of motifs and parameters. In Section~\ref{sec:motif_polblogs} we analyse the US Political Blogs network and in Section~\ref{sec:motif_migration} we present results from the US Migration network.
\section{Directed stochastic block models}
\label{sec:motif_dsbms}
We begin by describing \emph{directed stochastic block models} (DSBMs), a broad class of generative models for directed graphs. A DSBM is characterised by a block count $k$, a list of block sizes $(n_i)_{i=1}^k$ and a sparsity matrix $F \in [0,1]^{k \times k}$. We define the cumulative block sizes $N_i = \sum_{j=1}^i n_j$ with $N_0=0$, and the total graph size $N=N_k$. These are used to construct the expected adjacency matrix $A \in [0,1]^{N \times N}$ given by $A_{i j} = F_{rs} \ \bb{I}\{i \neq j\}$ where $N_{r-1} < i \leq N_r$ and $N_{s-1} < j \leq N_s$. Finally, a graph $\ca{G}$ is generated with adjacency matrix entries $G_{i j} \sim \textrm{Ber}(A_{i j})$ sampled independently. We say that a DSBM is \emph{symmetric} if $F$ is a symmetric matrix. This DSBM definition is similar to that given by \cite{DirectedClustImbCuts}, although we impose independence between all entries of the adjacency matrix, allowing for bidirectional edges.
\subsection{Symmetric two-block DSBMs}
We define the \emph{symmetric two-block DSBM} as the DSBM with $k=2$, $n_1=n_2=n$ and $F = \begin{psmallmatrix} p & q \\ q & p \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:sym_two_block_dsbm} illustrates the block structure and sparsity matrix of this model. Thicker lines indicate edges which exist with higher probability.
\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{%
%../tikz/sym_two_block_dsbm/sym_two_block_dsbm.pdf}
\caption{Symmetric two-block DSBM block structure and sparsity matrix}
\label{fig:sym_two_block_dsbm}
\end{figure}
We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this model. Figure~\ref{fig:motifsym} shows violin plots over 20 trials of ARI against motif, for different sets of parameters $n,p,q$. Also shown is $|C|$, the average size of the largest connected component of each MAM.
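Sampling from this DSBM family is straightforward; the sketch below is our own illustration (the helper name \texttt{sample\_dsbm} is hypothetical), drawing the symmetric two-block model with $n=50$, $p=0.3$, $q=0.2$ used in Figure~\ref{fig:motifsym}.
\begin{verbatim}
import numpy as np

def sample_dsbm(block_sizes, F, seed=0):
    # G_ij ~ Ber(F[r, s]) independently, with no self-loops.
    rng = np.random.default_rng(seed)
    labels = np.repeat(np.arange(len(block_sizes)), block_sizes)
    A = F[labels][:, labels]  # expected adjacency matrix A_ij = F_rs
    np.fill_diagonal(A, 0.0)  # enforce I{i != j}
    return (rng.random(A.shape) < A).astype(int)

G = sample_dsbm([50, 50], np.array([[0.3, 0.2], [0.2, 0.3]]))
\end{verbatim}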
It can be seen that several motifs (such as $\ca{M}_5$ and $\ca{M}_9$) achieve a similar ARI to the traditional spectral clustering technique given by the symmetrised adjacency matrix $M=G+G^\top$ generated by the motif $\ca{M}_\mathrm{s}$ (Table~\ref{tab:motif_adj_mat_table}). However, the strongly connected motifs (particularly $\ca{M}_4$) generate MAMs with small connected components, especially when $\ca{G}$ is sparse, and hence only cluster a subset of the vertices of $\ca{G}$.
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifsym/motifsym_1.pdf}
\caption{$n=50$, $p=0.3$, $q=0.2$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifsym/motifsym_2.pdf}
\caption{$n=100$, $p=0.15$, $q=0.1$}
\end{subfigure}
\caption{ARI violin plots for the symmetric two-block DSBM}
\label{fig:motifsym}
\end{figure}
\subsection{Asymmetric two-block DSBMs}
\label{sec:motif_asymm_dsbms}
We define the \emph{asymmetric two-block DSBM} as the DSBM with $k=2$, $n_1=n_2=n$ and $F = \begin{psmallmatrix} p & q_1 \\ q_2 & p \end{psmallmatrix}$ where $q_1 > q_2$ and $p = \frac{1}{2}(q_1+q_2)$. Figure~\ref{fig:asym_two_block_dsbm} shows this model.
\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{%
%../tikz/asym_two_block_dsbm/asym_two_block_dsbm.pdf}
\caption{Asymmetric two-block DSBM block structure and sparsity matrix}
\label{fig:asym_two_block_dsbm}
\end{figure}
We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this model. Figure~\ref{fig:motifasym} shows violin plots over 20 trials of ARI against motif, for different sets of parameters $n,p,q_1,q_2$; the average largest component size $|C|$ is also shown. It is apparent that motif-based clustering with $\ca{M}_1$ is the best method, consistently achieving the highest ARI and keeping $|C|$ at its maximum value of $2n$. It is unsurprising that $\ca{M}_1$ (feed-back loop) performs well on this model; large $p$ makes feed-back loops within clusters likely, and small $q_2$ makes feed-back loops spanning the clusters unlikely. Motif $\ca{M}_2$ also performs reasonably well since it contains $\ca{M}_1$ as a submotif. Furthermore, the constraint $p = \frac{1}{2}(q_1+q_2)$ ensures that the na\"ive symmetrisation $M=G+G^\top$ produces indistinguishable clusters, and hence the traditional method performs extremely poorly.
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifasym/motifasym_1.pdf}
\caption{$n=100$, $p=0.2$, $q_1=0.35$, $q_2=0.05$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/motifasym/motifasym_2.pdf}
\caption{$n=200$, $p=0.15$, $q_1=0.25$, $q_2=0.05$}
\end{subfigure}
\caption{ARI violin plots for the asymmetric two-block DSBM}
\label{fig:motifasym}
\end{figure}
\section{US Political Blogs network}
\label{sec:motif_polblogs}
Our first real data set is the US Political Blogs network \cite{adamic2005political}, consisting of data collected two months before the 2004 US election. Vertices represent blogs, and are labelled by their political leaning (`liberal' or `conservative'). Weighted directed edges represent the number of citations from one blog to another. After preprocessing (Section~\ref{sec:notes_preprocessing}) there are $586$ liberal blogs, $636$ conservative blogs (total $1222$) and $19 \, 024$ edges.
The network is plotted in Figure~\ref{fig:polblogs_network}. We test the performance of Algorithm~\ref{alg:motifrwspectclust} across various motifs with parameters $k=l=2$ on this network. Figure~\ref{fig:polblogs_ariplot} plots ARI against component size $|C|$. There is an apparent trade-off between ARI and connected component size. Motif $\ca{M}_9$ clusters many vertices with $|C|=1197$ and an ARI of 0.82, while the more strongly connected $\ca{M}_4$ clusters only $378$ vertices, with an improved ARI of 0.92. Finally, the poor performance of traditional spectral clustering is due to a small number of very weakly connected vertices being partitioned off, indicated by the dashed line and circled vertices in Figure~\ref{fig:polblogs_network}.
\vspace*{0.5cm}
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/polblogs/polblogs_network.pdf}
\caption{The US Political Blogs network}
\label{fig:polblogs_network}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/polblogs/polblogs_ari_conn.pdf}
\caption{ARI against $|C|$ across motifs}
\label{fig:polblogs_ariplot}
\end{subfigure}
\caption{Plots relating to the US Political Blogs network}
\end{figure}
Figure~\ref{fig:polblogs_embedding} shows the embedding given by eigenvectors 2 and 3 of the random-walk Laplacian of the MAM generated by motif $\ca{M}_{12}$. An instance of this motif in the network indicates the presence of a pair of mutually citing blogs with an incoming citation from a third (see Figure~\ref{fig:motif_definitions_directed}). Colourings are provided for Figure~\ref{fig:polblogs_embedding_truth} by the truth labels and for Figure~\ref{fig:polblogs_embedding_kmeans} by the $k$-means++ clustering of eigenvector 2. The clusterings are very similar, giving an ARI of $0.82$.
\vspace*{0.5cm}
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/polblogs/polblogs_M12_truth.pdf}
\caption{Colouring by truth label}
\label{fig:polblogs_embedding_truth}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/polblogs/polblogs_M12_clusts.pdf}
\caption{Colouring by $k$-means++ cluster}
\label{fig:polblogs_embedding_kmeans}
\end{subfigure}
\caption{Eigendecomposition embedding of the US Political Blogs network}
\label{fig:polblogs_embedding}
\end{figure}
\pagebreak
\section{US Migration network}
\label{sec:motif_migration}
The next data set is the US Migration network \cite{census2000}, consisting of data collected during the US Census in 2000. Vertices represent the 3075 counties in the 48 contiguous states and the District of Columbia (excluding Alaska and Hawaii). The $721\,432$ weighted directed edges represent the number of people migrating from county to county, capped at $10 \, 000$ (the 99.9th percentile) to control large entries, as in \cite{DirectedClustImbCuts}. We test the performance of Algorithm~\ref{alg:motifrwspectclust} with three selected motifs: $\ca{M}_\mathrm{s}$, $\ca{M}_6$ and $\ca{M}_9$ (see Figure~\ref{fig:motif_definitions_directed}). $\ca{M}_\mathrm{s}$ gives the traditional spectral clustering method with na\"ive symmetrisation. $\ca{M}_6$ represents a pair of counties exchanging migrants, with both also receiving migrants from a third.
$\ca{M}_9$ is a path of length two, allowing counties to be deemed similar if there is migration between them via another. Firstly, we plot sweep profiles of the graph using the second eigenvector of the random-walk Laplacian of the MAM associated with each motif, in Figure~\ref{fig:migration_sweep}. Note that all three display clear minima, indicating that these motifs produce well-defined clusters. The two-part clusterings produced by eigenvector sweep are somewhat similar across the three motifs, with pairwise ARIs equal to $\textrm{ARI}(\ca{M}_\mathrm{s}, \ca{M}_6) = 0.67$, $\textrm{ARI}(\ca{M}_\mathrm{s}, \ca{M}_9) = 0.92$ and $\textrm{ARI}(\ca{M}_6, \ca{M}_9) = 0.73$. \begin{figure}[H] \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_Ms.pdf} \caption{$\ca{M}_\mathrm{s}$} \end{subfigure} \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_M6.pdf} \caption{$\ca{M}_6$} \end{subfigure} \begin{subfigure}{.325\textwidth} \centering %\includegraphics[scale=0.4,draft=false]{% %../../results/us_migration/us_migration_sweep_profile_M9.pdf} \caption{$\ca{M}_9$} \end{subfigure} \caption{Sweep profiles of the US Migration network} \label{fig:migration_sweep} \end{figure} Next, Figure~\ref{fig:us_migration} plots maps of the US, with counties coloured initially by the first six non-trivial eigenvectors $x_2, \ldots, x_7$ of the random-walk Laplacian of the associated MAM, and then by the clustering $C$ obtained by Algorithm~\ref{alg:motifrwspectclust} with $k=l=7$. For the eigenvector colourings, note how the coloured regions often line up with state boundaries, indicating that many migrants stay within the same state. It is also apparent that the motifs $\ca{M}_6$ and $\ca{M}_9$ produce `noisier' embeddings than traditional spectral clustering, due to their reliance on three-vertex motifs. Eigenvector~2 approximately differentiates counties by longitude, although $\ca{M}_9$ achieves a clearer division between east and west, while $\ca{M}_\mathrm{s}$ and $\ca{M}_6$ colour California (CA, see Figure~\ref{fig:notes_us_map}) more similarly to the East Coast. Eigenvector 3 tends to differentiate by latitude, though $\ca{M}_\mathrm{s}$ and $\ca{M}_6$ particularly isolate the states of North Dakota (ND), South Dakota (SD), Minnesota (MN), Wisconsin (WI) and Michigan (MI). Further structure is visible across all three motifs for eigenvectors 4--7. The clusterings $C$ partition the counties into $k=7$ regions, and there are some interesting differences between the motifs. Since there is no ground-truth clustering, we record the Ncut score associated with each clustering. It is apparent that motifs $\ca{M}_6$ and $\ca{M}_9$ give a similar partition, although with some differences: $\ca{M}_6$ clusters the East Coast together with western Florida (FL) and the counties containing Los Angeles (CA), San Diego (CA), Las Vegas (NV), Phoenix (AZ), Tucson (AZ), Denver (CO), Chicago (IL) and Nashville (TN). $\ca{M}_6$ favours a larger `central' region, which includes significant parts of Colorado (CO), Oklahoma (OK), Arkansas (AR) and Illinois (IL). $\ca{M}_\mathrm{s}$ gives a somewhat different partition, with one of the clusters allocated to Michigan (MI) and Wisconsin (WI) rather than Mississippi (MS), Alabama (AL), Georgia (GA) and Tennessee (TN). 
As with the eigenvectors, the clustering is smoother for $\ca{M}_\mathrm{s}$ than for $\ca{M}_6$ and $\ca{M}_9$.
\pagebreak
\vspace*{-1cm}
\begin{figure}[H]
\begin{table}[H]
\centering
\setlength{\tabcolsep}{0em}
\begin{tabular}{ |c|c|c|c| }
%\expandableinput ../../results/us_migration/us_migration_table.txt
\end{tabular}
\end{table}
\vspace*{-0.5cm}
\caption{Motif-based colourings of the US Migration network}
\label{fig:us_migration}
\end{figure}
\clearpage{}
\clearpage{}
\addtocontents{toc}{\protect\newpage}
\chapter{Bipartite Clustering}
\label{chap:bipartite}
We propose a technique for spectral clustering of bipartite graphs and test its performance on both real and synthetic data. In Section~\ref{sec:bipartite_graphs} we define bipartite graphs and present our clustering technique. In Section~\ref{sec:bipartite_sbms} we propose a bipartite stochastic block model (BSBM) and perform experiments with varying parameters. In Section~\ref{sec:bipartite_american_revolution} we demonstrate our method using the American Revolution network. In Section~\ref{sec:bipartite_languages} we analyse the Unicode Languages network.
\section{Bipartite graphs}
\label{sec:bipartite_graphs}
\begin{definition}
A \emph{bipartite graph} is a graph $\ca{G}=(\ca{V,E})$ where $\ca{V}$ can be partitioned into $\ca{V} = \ca{S} \sqcup \ca{D}$ such that $\ca{E} \subseteq \ca{S} \times \ca{D}$. That is, every edge starts in $\ca{S}$ and ends in $\ca{D}$. We refer to $\ca{S}$ as the \emph{source vertices} and to $\ca{D}$ as the \emph{destination vertices}.
\end{definition}
\subsection{Collider and expander motifs}
\label{sec:coll_expa}
Our method for clustering bipartite graphs revolves around two \emph{anchored} motifs: the \emph{collider} and the \emph{expander} (Figure~\ref{fig:expa_coll}). For each motif the anchor set is $\ca{A}=\{ 1,3 \}$.
\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{../tikz/expa_coll/expa_coll.pdf}
\caption{The collider and expander motifs}
\label{fig:expa_coll}
\end{figure}
These motifs are useful for bipartite clustering because of Proposition~\ref{prop:coll_expa_formulae}, which states that their restricted MAMs are the adjacency matrices of the projections~\cite{kolaczyk2014statistical} of the graph $\ca{G}$. In particular, they can be used as similarity matrices for the source and destination vertices respectively. The similarity of two distinct source (resp. destination) vertices is the sum, over their mutual neighbours, of the average weight of their edges to (resp. from) each such neighbour.
\begin{proposition}[Colliders and expanders in bipartite graphs]
\label{prop:coll_expa_formulae}
Let $\ca{G} = (\ca{V,E},W)$ be a directed bipartite graph. Let $M_\mathrm{coll}$ and $M_\mathrm{expa}$ be the structural or functional MAMs of $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$ respectively in $\ca{G}$. Then
%
\begin{align*}
(M_\mathrm{coll})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,, &(1)\\
(M_\mathrm{expa})_{i j} &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm}\frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. &(2)
\end{align*}
%
\end{proposition}
%
\begin{proof}
See Proof~\ref{proof:coll_expa_formulae}.
\end{proof}
\subsection{Bipartite spectral clustering algorithm}
Algorithm~\ref{alg:bipartite_clustering} gives our procedure for clustering a bipartite graph.
The algorithm uses the collider and expander motifs to create similarity matrices for the source and destination vertices respectively (as in Section~\ref{sec:coll_expa}), and then applies random-walk spectral clustering (Algorithm~\ref{alg:rwspectclust}) to produce the partitions.
\vspace*{0.5cm}
\begin{algorithm}[H]
\SetKwFunction{Main}{BipartiteRWSpectClust}
\newcommand{\MainArgs}{$\ca{G},k_\ca{S},k_\ca{D},l_\ca{S},l_\ca{D}$}
\BlankLine
\Input{Bipartite graph $\ca{G}$, source clusters $k_\ca{S}$, destination clusters $k_\ca{D}$, source dimension $l_\ca{S}$, destination dimension $l_\ca{D}$}
\Output{Source partition $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$, destination partition $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$}
\BlankLine
\Function{\Main{\MainArgs}}{
Construct the collider motif adjacency matrix $M_\mathrm{coll}$ of the graph $\ca{G}$ \\
Construct the expander motif adjacency matrix $M_\mathrm{expa}$ of the graph $\ca{G}$ \\
$M_\mathrm{coll} \leftarrow M_\mathrm{coll}[\ca{S,S}]$ \Comm*{restrict rows and columns of $M_\mathrm{coll}$ to $\ca{S}$ \hspace*{0.07cm}}
$M_\mathrm{expa} \leftarrow M_\mathrm{expa}[\ca{D,D}]$ \Comm*{restrict rows and columns of $M_\mathrm{expa}$ to $\ca{D}$}
$\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}} \leftarrow$ \texttt{RWSpectClust($M_\mathrm{coll},k_\ca{S},l_\ca{S}$)} \\
$\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}} \leftarrow$ \texttt{RWSpectClust($M_\mathrm{expa},k_\ca{D},l_\ca{D}$)} \\
\Return $\ca{S}_1, \ldots, \ca{S}_{k_\ca{S}}$ and $\ca{D}_1, \ldots, \ca{D}_{k_\ca{D}}$
}
\caption{Bipartite random-walk spectral clustering}
\label{alg:bipartite_clustering}
\end{algorithm}
\section{Bipartite stochastic block models}
\label{sec:bipartite_sbms}
We define the \emph{bipartite stochastic block model} (BSBM) \cite{florescu2016spectral} as the DSBM with $k=4$, $n_1 = \dots = n_4=n$ and $F = \begin{psmallmatrix} 0 & 0 & p & q \\ 0 & 0 & q & p \\ 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 \end{psmallmatrix}$ where $p > q$. Figure~\ref{fig:bipartite_bsbm} illustrates the block structure and sparsity matrix of this model. This model partitions the source vertices as $\ca{S} = \ca{S}_1 \sqcup \ca{S}_2$ and the destination vertices as $\ca{D}=\ca{D}_1 \sqcup \ca{D}_2$. Edges exist with high probability from $\ca{S}_1$ to $\ca{D}_1$ and from $\ca{S}_2$ to $\ca{D}_2$.
\begin{figure}[H]
\centering
%\includegraphics[scale=0.8,draft=false]{%
%../tikz/bipartite_dsbm/bipartite_dsbm.pdf}
\caption{BSBM block structure and sparsity matrix}
\label{fig:bipartite_bsbm}
\end{figure}
We test the performance of Algorithm~\ref{alg:bipartite_clustering} with parameters $k_\ca{S} = k_\ca{D} = l_\ca{S} = l_\ca{D} = 2$ on this model. For comparison we implement the co-clustering method from \cite{dhillon2001co}, which is based on random-walk spectral clustering of the symmetrised adjacency matrix $G+G^\top$. Figure~\ref{fig:bipartite} shows violin plots over 20 trials of ARI against method, for different sets of parameters $n,p,q$. Note that if a bipartite graph is connected, then the graphs represented by $M_\mathrm{coll}$ and $M_\mathrm{expa}$ are also connected, so we need not consider the largest connected component size $|C|$. Performance of the two methods is very similar, for both source and destination vertices.
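For completeness, the collider and expander MAMs used by Algorithm~\ref{alg:bipartite_clustering} each reduce to two matrix products, via the formulae used in Proof~\ref{proof:coll_expa_formulae}. A minimal dense NumPy sketch (ours; the function name is hypothetical):
\begin{verbatim}
import numpy as np

def collider_expander(G):
    # M_coll = (1/2) J_n o (J G^T + G J^T)
    # M_expa = (1/2) J_n o (J^T G + G^T J)
    n = G.shape[0]
    J = (G > 0).astype(float)
    Jn = 1.0 - np.eye(n)  # indicator of i != j
    M_coll = 0.5 * Jn * (J @ G.T + G @ J.T)  # source similarities
    M_expa = 0.5 * Jn * (J.T @ G + G.T @ J)  # destination similarities
    return M_coll, M_expa
\end{verbatim}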
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/bipartite/bipartite1.pdf}
\caption{$n=100$, $p=0.2$, $q=0.1$}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/bipartite/bipartite2.pdf}
\caption{$n=200$, $p=0.1$, $q=0.06$}
\end{subfigure}
\caption{ARI violin plots for the BSBM}
\label{fig:bipartite}
\end{figure}
\section{American Revolution network}
\label{sec:bipartite_american_revolution}
As an example of applying our bipartite clustering method to real data, we consider the American Revolution network \cite{konect:brunson_revolution}. It consists of data collected from the period before the American Revolution. Source vertices are people, and destination vertices are organisations. Edges represent the membership of a person in an organisation. There are 136 people, 5 organisations and 160 edges. Algorithm~\ref{alg:bipartite_clustering} is run on the American Revolution network, with parameters $k_\ca{S} = l_\ca{S} = 5$ and $k_\ca{D} = l_\ca{D} = 2$. Figure~\ref{fig:bipartite_revolution_source} plots the network with people coloured by source cluster, and Figure~\ref{fig:bipartite_revolution_dest} plots the network with organisations coloured by destination cluster. The algorithm succeeds in clustering people based on their common memberships, and in clustering organisations based on their common members.
\begin{figure}[H]
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/american_revolution/american_revolution_source.pdf}
\caption{Grouping people into 5 clusters}
\label{fig:bipartite_revolution_source}
\end{subfigure}
\begin{subfigure}{.49\textwidth}
\centering
%\includegraphics[scale=0.4,draft=false]{%
%../../results/american_revolution/american_revolution_dest.pdf}
\caption{Grouping organisations into 2 clusters}
\label{fig:bipartite_revolution_dest}
\end{subfigure}
\caption{Bipartite clustering of the American Revolution network}
\label{fig:bipartite_revolution}
\end{figure}
\section{Unicode Languages network}
\label{sec:bipartite_languages}
The final data set is the Unicode Languages network \cite{konect:unicodelang}, consisting of data collected in 2014 on languages spoken around the world. Source vertices are territories, and destination vertices are languages. Weighted directed edges from territory to language indicate the number of inhabitants in that territory who speak the specified language (territory population data taken from \cite{geonames}). After preprocessing (Section~\ref{sec:notes_preprocessing}) there are $155$ territories, $270$ languages and $705$ edges. We test Algorithm~\ref{alg:bipartite_clustering} with parameters $k_\ca{S} = l_\ca{S} = k_\ca{D} = l_\ca{D} = 6$ on this network. For the source vertices, Figure~\ref{fig:bipartite_languages_map} plots maps of the world with territories coloured by the clustering obtained. The top 20 territories (by population) in each cluster are given in Table~\ref{tab:bipartite_languages_source_clusters}. Cluster~1 is by far the largest cluster, and includes a wide variety of territories, many (but not all) of which speak some English. Cluster~2 contains the Persian-speaking territories of Iran and Afghanistan, the Arabic-speaking territories of Saudi Arabia and Syria, and the French-speaking African territories of DR Congo, C\^ote d'Ivoire, Burkina Faso, Niger and others. It also includes Haiti, another French-speaking territory.
Cluster~3 mostly captures Spanish-speaking territories in the Americas and also contains Equatorial Guinea, a Spanish-speaking territory in Africa. Cluster~4 includes the Slavic territories of Russia and some of its neighbours. The absence of Kazakhstan may be due to the $981 \, 760$ Kazakhs who speak German, which is neither a Slavic nor a Turkic language. Cluster~5 covers China, Hong Kong, Mongolia and some of South-East Asia. The inclusion of Panama might be due to the $6821$ Panamanians who speak Chinese. Cluster~6 is the smallest cluster and contains only Japan and the Koreas, which are connected by the $636 \, 440$ Japanese who speak Korean. There are a few territories and languages which are not contained in the largest connected component of the network due to their linguistic isolation. These territories are Laos, Norway and Timor-Leste, and the languages are Lao, Norwegian Bokm{\aa}l and Norwegian Nynorsk.
\begin{figure}[H]
\centering
%\includegraphics[scale=0.6, draft=false]{%
%../../results/languages/languages_source_map_clusts.pdf}
\caption{Clustering the territories from the Unicode Languages network}
\label{fig:bipartite_languages_map}
\end{figure}
\begin{table}[H]
\centering
\scriptsize
\begin{tabular}{ |c|c|c|c|c|c| }
\hline
\rule{0pt}{1.2em} \cellcolor[HTML]{8DD3C7} Cluster 1 & \cellcolor[HTML]{FFFFB3} Cluster 2 & \cellcolor[HTML]{BEBADA} Cluster 3 & \cellcolor[HTML]{FB8072} Cluster 4 & \cellcolor[HTML]{80B1D3} Cluster 5 & \cellcolor[HTML]{FDB462} Cluster 6 \\[0.1cm]
\hline
\rule{0pt}{1.2em} India & Iran & Mexico & Russia & China & Japan \\
United States & DR Congo & Colombia & Ukraine & Indonesia & S.\ Korea \\
Brazil & Afghanistan & Argentina & Uzbekistan & Vietnam & N.\ Korea \\
Pakistan & Saudi Arabia & Peru & Belarus & Malaysia & \\
Bangladesh & Syria & Venezuela & Tajikistan & Taiwan & \\
Nigeria & C\^ote d'Ivoire & Ecuador & Kyrgyzstan & Cambodia & \\
Philippines & Burkina Faso & Guatemala & Turkmenistan & Hong Kong & \\
Ethiopia & Niger & Cuba & Georgia & Singapore & \\
Germany & Mali & Bolivia & Moldova & Panama & \\
Egypt & Senegal & Paraguay & Latvia & Mongolia & \\
Turkey & Tunisia & El Salvador & Estonia & & \\
Thailand & Chad & Nicaragua & & & \\
France & Guinea & Costa Rica & & & \\
United Kingdom & Somalia & Uruguay & & & \\
Italy & Burundi & Eq.\ Guinea & & & \\
Myanmar & Haiti & & & & \\
South Africa & Benin & & & & \\
Spain & Azerbaijan & & & & \\
Tanzania & Togo & & & & \\
Kenya & Libya & & & & \\
$\cdots$ & $\cdots$ & & & & \\
$|\textrm{Cluster\ } 1 |$ = 87 & $|\textrm{Cluster\ } 2 |$ = 29 & $|\textrm{Cluster\ } 3 |$ = 15 & $|\textrm{Cluster\ } 4 |$ = 11 & $|\textrm{Cluster\ } 5 |$ = 10 & $|\textrm{Cluster\ } 6 |$ = 3 \\[0.1cm]
\hline
\end{tabular}
\caption{Clustering the territories from the Unicode Languages network}
\label{tab:bipartite_languages_source_clusters}
\end{table}
For the destination vertices, we present the six clusters obtained by Algorithm~\ref{alg:bipartite_clustering}. Table~\ref{tab:bipartite_languages_dest_clusters} contains the top 20 languages (by number of speakers) in each cluster. Cluster~1 is the largest cluster and contains the European languages of Spanish, Portuguese and French, as well as dialects of Arabic. Cluster~2 is also large and includes English as well as several South Asian languages such as Hindi, Bengali, Urdu and Punjabi. Cluster~3 consists of many indigenous African languages such as Swahili, Kinyarwanda and Somali. Cluster~4 captures languages from South-East Asia, mostly spoken in Indonesia and Malaysia.
Cluster~5 identifies several varieties of Chinese and a few other Central and East Asian languages such as Kazakh and Uighur. Interestingly, Korean is also placed in this group and not with Japanese, even though the Koreas are clustered together with Japan in Table~\ref{tab:bipartite_languages_source_clusters}. Cluster~6 captures more South-East Asian languages, this time from Thailand, Myanmar and Cambodia. Pattani Malay is in this cluster because, despite its name, it is spoken more in Thailand than in Malaysia.
\vspace*{0.5cm}
\begin{table}[H]
\centering
\scriptsize
\begin{tabular}{ |c|c|c|c|c|c| }
\hline
\rule{0pt}{1.2em} Cluster 1 & Cluster 2 & Cluster 3 & Cluster 4 & Cluster 5 & Cluster 6 \\[0.1cm]
\hline
\rule{0pt}{1.2em} Spanish & English & Swahili & Indonesian & Chinese & Thai \\
Arabic & Hindi & Kinyarwanda & Javanese & Wu Chinese & N.E.\ Thai \\
Portuguese & Bengali & Somali & Malay & Korean & Khmer \\
French & Urdu & Luba-Lulua & Sundanese & Xiang Chinese & N.\ Thai \\
Russian & Punjabi & Kikuyu & Madurese & Hakka Chinese & S.\ Thai \\
Japanese & Telugu & Congo Swahili & Minangkabau & Minnan Chinese & Shan \\
German & Marathi & Luyia & Betawi & Gan Chinese & Pattani Malay \\
Turkish & Vietnamese & Ganda & Balinese & Kazakh & \\
Persian & Tamil & Luo & Buginese & Uighur & \\
Italian & Lahnda & Sukuma & Banjar & Sichuan Yi & \\
Egyptian Arabic & Filipino & Kalenjin & Achinese & Mongolian & \\
Polish & Gujarati & Lingala & Sasak & Zhuang & \\
Nigerian Pidgin & Kannada & Nyankole & Makasar & Tibetan & \\
Ukrainian & Pushto & Gusii & Lampung Api & & \\
Dutch & Malayalam & Kiga & Rejang & & \\
Algerian Arabic & Oriya & Soga & & & \\
Moroccan Arabic & Burmese & Luba-Katanga & & & \\
Hausa & Bhojpuri & Meru & & & \\
Azerbaijani & Amharic & Teso & & & \\
Uzbek & Oromo & Nyamwezi & & & \\
$\cdots$ & $\cdots$ & $\cdots$ & & & \\
$|\textrm{Cluster\ } 1 |$ = 120 & $|\textrm{Cluster\ } 2 |$ = 90 & $|\textrm{Cluster\ } 3 |$ = 25 & $|\textrm{Cluster\ } 4 |$ = 15 & $|\textrm{Cluster\ } 5 |$ = 13 & $|\textrm{Cluster\ } 6 |$ = 7 \\[0.1cm]
\hline
\end{tabular}
\caption{Clustering the languages from the Unicode Languages network}
\label{tab:bipartite_languages_dest_clusters}
\end{table}
\clearpage{}
\clearpage{}
\chapter{Conclusion}
\label{chap:conclusions}
With this dissertation we have introduced a graph-theoretic framework for analysis of weighted directed networks, and presented new matrix-based formulae for MAMs (Chapter~\ref{chap:graphs}). We have summarised the method of random-walk spectral clustering and shown how it can be used with motif-based techniques (Chapter~\ref{chap:spectral}). We have presented results from the application of a motif-based method both to synthetic data (DSBMs) and to real data (US Political Blogs network, US Migration network). We have demonstrated that this technique outperforms traditional spectral clustering methods in several settings (Chapter~\ref{chap:motif}). We have introduced a motif-based spectral method for clustering bipartite graphs and presented results both from synthetic data (BSBMs) and from real data (American Revolution network, Unicode Languages network). In particular, we have shown that motif-based spectral clustering is a valuable tool for clustering weighted directed networks, one which is scalable and easy to implement. Superior performance has been demonstrated especially with asymmetric DSBMs in Section~\ref{sec:motif_asymm_dsbms}, and with the US Political Blogs network in Section~\ref{sec:motif_polblogs}.
\section*{Limitations}
There are limitations to our work. While our matrix-based formulae for MAMs are simple to implement and moderately scalable, they are computationally unwieldy for large networks (see Section~\ref{sec:notes_computation} for details). As mentioned in~\cite{benson2016higher}, fast triangle enumeration algorithms~\cite{demeyer2013ISMA,wernicke2006efficient,wernicke2006fanmod} offer increased performance, at the expense of methodological simplicity. Another shortcoming of the matrix-based formulae is that, unlike motif detection algorithms such as~\cite{wernicke2006fanmod}, they do not extend to motifs on four or more vertices.
\section*{Future work}
There is plenty of scope for methodological investigation related to our work. Simple extensions could involve an analysis of the differences between clustering methods based on functional and structural MAMs. One could also experiment with the effects of replacing the random-walk Laplacian with the unnormalised Laplacian or symmetric normalised Laplacian \cite{von2007tutorial}. Similarly one might try replacing Ncut with RatioCut \cite{hagen1992new}. We note that although our methods apply to weighted graphs, we have only discussed unweighted DSBMs. Therefore, it would be interesting to investigate weighted DSBMs (perhaps following the exponential family method detailed in \cite{aicher2013adapting}) and to use them for evaluation of motif-based spectral clustering procedures. Further experimental work is also desirable. We would like to conduct experiments on more real data, and suggest that collaboration networks such as~\cite{snap:astro} and bipartite preference networks such as~\cite{icon:movie} could be interesting. Comparison with other clustering methods could also be insightful; the Hermitian matrices method in~\cite{DirectedClustImbCuts}, the PageRank method in~\cite{yin2017local} and \textsc{Tectonic} from~\cite{tsourakakis2017scalable} may give suitable benchmarks for performance.
\clearpage{}
%TC:ignore
% enable appendix numbering format and include appendices
\appendix
\fancyhead[RO]{\itshape{\nouppercase{Appendix \thechapter : \leftmark}}}
%TC:endignore
\clearpage{}
\chapter{Proofs and Examples}\label{chap:appendix_proofs}
\section{Proofs}
\begin{prf}[Proposition~\ref{prop:motif_adj_matrix_formula}, MAM formula]
\label{proof:motif_adj_matrix_formula}
%
Consider $(1)$. We sum over functional instances $\ca{M} \cong \ca{H} \leq \ca{G}$ such that $\{i,j\} \in \ca{A(H)}$. This is equivalent to summing over $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ and $\sigma \in S_\ca{M,A}^\sim$, such that $k_u$ are all distinct and
%
$$ (u,v) \in \ca{E_M} \implies (k_{\sigma u}, k_{\sigma v}) \in \ca{E}\,. \qquad (\dagger) $$
%
This is because the vertex set $\{k_2, \ldots, k_{m-1}\} \subseteq \ca{V}$ indicates which vertices are present in the instance $\ca{H}$, and $\sigma$ describes the mapping from $\ca{V_M}$ onto those vertices: $u \mapsto k_{\sigma u}$. We take $\sigma \in S_\ca{M,A}^\sim$ to ensure that $\{i,j\} \in \ca{A(H)}$ (since $i=k_1, \ j=k_m$), and that instances are counted exactly once. The condition $(\dagger)$ checks that $\ca{H}$ is a functional instance of $\ca{M}$ in $\ca{G}$.
Hence % \begin{align*} M^\mathrm{func}_{i j} &= \frac{1}{|\ca{E_M}|} \sum_{\ca{M} \cong \ca{H} \leq \ca{G}} \bb{I} \big\{ \{i,j\} \in \ca{A}(\ca{H}) \big\} \sum_{e \in \ca{E_H}} W(e) \\ % &= \frac{1}{|\ca{E_M}|} \sum_{\{ k_2, \ldots, k_{m-1} \}} \sum_{\sigma \in S_\ca{M,A}^\sim} \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} \sum_{e \in \ca{E_H}} W(e)\,. \end{align*} % For the first term, by conditioning on the types of edge in $\ca{E_M}$: \begin{align*} % \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\} &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{ k_{\sigma u} \neq k_{\sigma v} \} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{ (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in \ca{E}\} \\ % &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{n})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= J^\mathrm{func}_{\mathbf{k},\sigma}\,. % \end{align*} % Assuming $\big\{ k_u \textrm{ all distinct}, \, (\dagger) \big\}$, the second term is % \begin{align*} % \sum_{e \in \ca{E_H}} W(e) &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + W((k_{\sigma v},k_{\sigma u})) \big) \\ % &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= G^\mathrm{func}_{\mathbf{k},\sigma} \end{align*} % as required. For $(2)$, we simply change $(\dagger)$ to $(\ddagger)$ to check that an instance is a \emph{structural} instance: % $$ (u,v) \in \ca{E_M} \iff (k_{\sigma u}, k_{\sigma v}) \in \ca{E} \qquad (\ddagger) $$ % Now for the first term: % \begin{align*} % \bb{I} \big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\} &= \prod_{\ca{E}_\ca{M}^0} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \notin \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{s}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \notin \ca{E}\} \\ & \qquad \times \prod_{\ca{E}_\ca{M}^\mathrm{d}} \bb{I} \{(k_{\sigma u}, k_{\sigma v}) \in \ca{E} \textrm{ and } (k_{\sigma v}, k_{\sigma u}) \in \ca{E}\} \\ % &= \prod_{\ca{E}_\ca{M}^0} (J_\mathrm{0})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{s}} (J_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= J^\mathrm{struc}_{\mathbf{k},\sigma}\,. % \end{align*} % Assuming $\big\{ k_u \textrm{ all distinct}, \, (\ddagger) \big\}$, the second term is % \begin{align*} % \sum_{e \in \ca{E_H}} W(e) &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} W((k_{\sigma u},k_{\sigma v})) + \sum_{\ca{E}_\ca{M}^\mathrm{d}} \big( W((k_{\sigma u},k_{\sigma v})) + W((k_{\sigma v},k_{\sigma u})) \big) \\ % &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} (G_\mathrm{s})_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}} \\ % &= G^\mathrm{struc}_{\mathbf{k},\sigma}\,. \end{align*} \hfill $\square$ \end{prf} \pagebreak \begin{prf}[Proposition~\ref{prop:motif_adj_matrix_computation}, Complexity of MAM formula] \label{proof:motif_adj_matrix_computation} Suppose ${m \leq 3}$ and consider $M^\mathrm{func}$. 
The adjacency and indicator matrices of $\ca{G}$ are % \begin{equation*} \begin{aligned}[c] &(1) \quad J = \bb{I} \{ G>0 \}\,, \\ &(2) \quad J_0 = \bb{I} \{ G + G^\top = 0 \} \circ J_\mathrm{n}\,, \\ &(3) \quad J_\mathrm{s} = J - J_\mathrm{d}\,, \\ &(4) \quad G_\mathrm{d} = (G + G^\top) \circ J_\mathrm{d} \,, \end{aligned} \hspace*{2cm} \begin{aligned}[c] &(5) \quad J_\mathrm{n} = \bb{I} \{I_{n \times n} = 0 \}\,, \\ &(6) \quad J_\mathrm{d} = J \circ J^\top\,, \\ &(7) \quad G_\mathrm{s} = G \circ J_\mathrm{s}\,, \\ & \end{aligned} \end{equation*} % and are computed using four additions and four element-wise multiplications. $J^\mathrm{func}_{\mathbf{k},\sigma}$ is a product of at most three factors, and $G^\mathrm{func}_{\mathbf{k},\sigma}$ contains at most three summands, so % $$ \sum_{k_2 \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma} $$ % is expressible as a sum of at most three matrices, each of which is constructed with at most one matrix multiplication (where $\{k_{\sigma r},k_{\sigma s}\} \neq \{i,j\}$) and one entry-wise multiplication (where $\{k_{\sigma r},k_{\sigma s}\} = \{i,j\}$). This is repeated for each $\sigma \in S_\ca{M,A}^\sim$ (at most six times) and the results are summed. Calculations are identical for $M^\mathrm{struc}$. \hfill $\square$ \end{prf} \begin{prf}[Proposition~\ref{prop:coll_expa_formulae}, Colliders and expanders in bipartite graphs] \label{proof:coll_expa_formulae} % Consider (1) and the collider motif $\ca{M}_\mathrm{coll}$. Since $\ca{G}$ is bipartite, $M_\mathrm{coll}^\mathrm{func} = M_\mathrm{coll}^\mathrm{struc} = \vcentcolon M_\mathrm{coll}$, and by Table~\ref{tab:motif_adj_mat_table}, $M_\mathrm{coll} = \frac{1}{2} J_\mathrm{n} \circ (J G^\top + G J^\top)$. Hence % \begin{align*} (M_\mathrm{coll})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J G^\top + G J^\top)_{i j} \\ &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \Big(J_{i k} G_{j k} + G_{i k} J_{j k} \Big) \\ &= \bb{I}\{i \neq j\} \sum_{k \in \ca{V}} \ \frac{1}{2} \,\bb{I} \, \Big\{ (i,k),(j,k) \in \ca{E} \Big\} \Big[W((i,k)) + W((j,k))\Big] \\ &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{D} \\ (i,k), (j,k) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((i,k)) + W((j,k)) \Big]\,. \end{align*} % Similarly for the expander motif, $M_\mathrm{expa} = \frac{1}{2} J_\mathrm{n} \circ (J^\top G + G^\top J)$ so % \begin{align*} (M_\mathrm{expa})_{i j} &= \frac{1}{2} (J_\mathrm{n})_{i j} \ (J^\top G + G^\top J)_{i j} \\ &= \bb{I} \{i \neq j\} \hspace*{-0.4cm} \sum_{\substack{k \in \ca{S} \\ (k,i), (k,j) \in \ca{E}}} \hspace*{-0.2cm} \frac{1}{2} \Big[ W((k,i)) + W((k,j)) \Big]\,. \end{align*} % \hfill $\square$ \end{prf} \section{Examples} \begin{example}[Functional and structural instances] \label{ex:instances} Let $\ca{G}=(\ca{V,E})$ be the graph with $\ca{V} = \{ 1,2,3,4 \}$ and $\ca{E} = \{ (1,2),(1,3),(1,4),(2,3),(3,4),(4,3) \}$. Let $(\ca{M,A})$ be the anchored motif with $\ca{V_M} = \{1,2,3\}$, $\ca{E_M} = \{(1,2),(1,3),(2,3)\}$ and $\ca{A} = \{1,3\}$ as defined in Figure \ref{fig:instance_example_1}. % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/instance_example_1/instance_example_1.pdf} \caption{The specified graph $\ca{G}$ and anchored motif $\ca{M}$} \label{fig:instance_example_1} \end{figure} % There are three functional instances of $\ca{M}$ in $\ca{G}$, shown in Figure~\ref{fig:instance_example_2}. 
However there is just one structural instance of $\ca{M}$ in $\ca{G}$, given by $\ca{H}_1$. This is because the double edge $3 \leftrightarrow 4$ in $\ca{G}$ prevents the subgraphs on $\{1,3,4\}$ from being induced subgraphs. % \begin{align*} \ca{H}_1 &: \quad \ca{V}_1 = \{ 1,2,3 \} ; \quad \ca{E}_1 = \{ (1,2) , (2,3) , (1,3) \} ; \quad \ca{A(H}_1) = \big\{\{1,3\}\big\}\,, \\ \ca{H}_2 &: \quad \ca{V}_2 = \{ 1,3,4 \} ; \quad \ca{E}_2 = \{ (1,3) , (1,4) , (3,4) \} ; \quad \ca{A(H}_2) = \big\{\{1,4\}\big\}\,, \\ \ca{H}_3 &: \quad \ca{V}_3 = \{ 1,3,4 \} ; \quad \ca{E}_3 = \{ (1,3) , (1,4) , (4,3) \} ; \quad \ca{A(H}_3) = \big\{\{1,3\}\big\}\,. \end{align*} % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{% %../tikz/instance_example_2/instance_example_2.pdf} \caption{Functional instances $\ca{H}_1,\ca{H}_2$ and $\ca{H}_3$} \label{fig:instance_example_2} \end{figure} \end{example} \begin{example}[Motif adjacency matrices] \label{ex:motif_adj_matrices} Let $\ca{G}$ and $\ca{(M,A)}$ be as in Example~\ref{ex:instances}, and suppose $\ca{G}$ has weight map $W((i,j)) \vcentcolon = i + j$. Then using Definition~\ref{def:motif_adj_matrices} directly, the functional and structural MAMs of $\ca{(M,A)}$ in $\ca{G}$ are respectively \vspace*{0.2cm} $$ % M^\mathrm{func} = \begin{pmatrix} 0 & 0 & 28 & 16 \\ 0 & 0 & 0 & 0 \\ 28 & 0 & 0 & 0 \\ 16 & 0 & 0 & 0 \end{pmatrix} \,, \qquad M^\mathrm{struc} = \begin{pmatrix} 0 & 0 & 12 & 0 \\ 0 & 0 & 0 & 0 \\ 12 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 \end{pmatrix}\,. $$ \end{example} \pagebreak \begin{example}[Calculating an explicit formula for an MAM] \label{ex:motif_adj_calc} Consider the functional MAM of the simple motif $\ca{M}_6$ (Figure~\ref{fig:M6}). % \begin{figure}[H] \centering %\includegraphics[scale=0.7,draft=false]{../tikz/M6/M6.pdf} \caption{The motif $\ca{M}_6$} \label{fig:M6} \end{figure} % We use Equation (1) in Proposition~\ref{prop:motif_adj_matrix_formula}. Firstly, $m = |\ca{V_M}| = 3$ and $|\ca{E_M}| = 4$. The automorphism group of $\ca{M}_6$ has order 2, corresponding to swapping vertices 1 and 3. Hence $|S_\ca{M,A}^\sim| = |S_m| / 2 = 6/2 = 3$, and suitable representatives from $S_\ca{M,A}^\sim$ are $$ S_\ca{M,A}^\sim = \left\{ % \sigma_1 = \begin{pmatrix} 1 & 2 & 3 \\ 1 & 2 & 3 \end{pmatrix}, % \sigma_2 = \begin{pmatrix} 1 & 2 & 3 \\ 2 & 1 & 3 \end{pmatrix}, % \sigma_3 = \begin{pmatrix} 1 & 2 & 3 \\ 1 & 3 & 2 \end{pmatrix} \right\}\,. \vspace*{0.2cm}$$ % So by Proposition~\ref{prop:motif_adj_matrix_formula}, with $i=k_1$ and $j=k_3$, and writing $k$ for $k_2$: $$ M^\mathrm{func}_{i j} = \frac{1}{4} \sum_{\sigma \in S_\ca{M,A}^\sim} \ \sum_{k \in \ca{V}} J^\mathrm{func}_{\mathbf{k},\sigma} \ G^\mathrm{func}_{\mathbf{k},\sigma} $$ % where since there are no missing edges in $\ca{M}_6$: % \begin{align*} % J^\mathrm{func}_{\mathbf{k},\sigma} &= \prod_{\ca{E}_\ca{M}^\mathrm{s}} J_{k_{\sigma u},k_{\sigma v}} \prod_{\ca{E}_\ca{M}^\mathrm{d}} (J_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,, \\ % G^\mathrm{func}_{\mathbf{k},\sigma} &= \sum_{\ca{E}_\ca{M}^\mathrm{s}} G_{k_{\sigma u},k_{\sigma v}} + \sum_{\ca{E}_\ca{M}^\mathrm{d}} (G_\mathrm{d})_{k_{\sigma u},k_{\sigma v}}\,. 
%
\end{align*}
%
Writing out the sum over $\sigma$:
%
\begingroup
\allowdisplaybreaks
\begin{align*}
M^\mathrm{func}_{i j} &= \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_1} \ G^\mathrm{func}_{\mathbf{k},\sigma_1} + \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_2} \ G^\mathrm{func}_{\mathbf{k},\sigma_2} + \frac{1}{4} \sum_{k=1}^n J^\mathrm{func}_{\mathbf{k},\sigma_3} \ G^\mathrm{func}_{\mathbf{k},\sigma_3} \\
%
&= \frac{1}{4} \sum_{k=1}^n J_{j i} J_{j k} (J_\mathrm{d})_{i k} \big(G_{j i} + G_{j k} + (G_\mathrm{d})_{i k}\big) \\
& \qquad + \frac{1}{4} \sum_{k=1}^n J_{i j} J_{i k} (J_\mathrm{d})_{j k} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{j k}\big) \\
& \qquad + \frac{1}{4} \sum_{k=1}^n J_{k i} J_{k j} (J_\mathrm{d})_{i j} \big(G_{k i} + G_{k j} + (G_\mathrm{d})_{i j}\big) \\
%
&= \frac{1}{4} J^\top_{i j} \sum_{k=1}^n (J_\mathrm{d})_{i k} J^\top_{k j} \big(G^\top_{i j} + (G_\mathrm{d})_{i k} + G^\top_{k j}\big) \\
& \qquad + \frac{1}{4} J_{i j} \sum_{k=1}^n J_{i k} (J_\mathrm{d})_{k j} \big(G_{i j} + G_{i k} + (G_\mathrm{d})_{k j}\big) \\
& \qquad + \frac{1}{4} (J_\mathrm{d})_{i j} \sum_{k=1}^n J^\top_{i k} J_{k j} \big((G_\mathrm{d})_{i j} + G^\top_{i k} + G_{k j}\big) \,,
\end{align*}
\endgroup
%
and writing this as a sum of entry-wise and matrix products:
%
\begin{align*}
M^\textrm{func} &= \frac{1}{4} \Big[ J^\top \circ (J_\mathrm{d} G^\top) + J^\top \circ (G_\mathrm{d} J^\top) + G^\top \circ (J_\mathrm{d} J^\top) \Big] \\
& \qquad + \frac{1}{4} \Big[ J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) \Big] \\
& \qquad + \frac{1}{4} \Big[ J_\mathrm{d} \circ (J^\top G) + J_\mathrm{d} \circ (G^\top J) + G_\mathrm{d} \circ (J^\top J) \Big]
\end{align*}
%
where $A \circ B$ is an entry-wise product and $AB$ is a matrix product. Finally, setting
$$C = J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)\,, $$
and
$$ C' = G_\mathrm{d} \circ (J^\top J)\,, $$
we have
$$ M^\mathrm{func} = \frac{1}{4} \big(C + C^\top + C' \big)\,, $$
as in Table~\ref{tab:motif_adj_mat_table}, achieved with just five matrix multiplications, nine entry-wise multiplications and nine matrix additions (including the four entry-wise multiplications and four additions needed to construct the adjacency and indicator matrices).
\end{example}
\clearpage{}
\clearpage{}
\chapter{Motif Adjacency Matrix Formulae}
\label{chap:appendix_matrices}
We give explicit matrix-based formulae for functional motif adjacency matrices $M^\mathrm{func}$ for all simple motifs $\ca{M}$ on at most three vertices, along with the anchored motifs $\ca{M}_\mathrm{coll}$ and $\ca{M}_\mathrm{expa}$. For structural motif adjacency matrices, simply replace $J_\mathrm{n}$, $J$ and $G$ with $J_0$, $J_\mathrm{s}$ and $G_\mathrm{s}$ respectively. Entry-wise products are denoted by $\circ$.
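The adjacency and indicator matrices appearing in these formulae are constructed from $G$ as in Proof~\ref{proof:motif_adj_matrix_computation}. For reference, a dense NumPy sketch of that construction (our illustration; the helper name \texttt{indicator\_matrices} is hypothetical, and a practical implementation would use sparse matrices):
\begin{verbatim}
import numpy as np

def indicator_matrices(G):
    n = G.shape[0]
    J  = (G > 0).astype(float)              # J   = I{G > 0}
    Jn = 1.0 - np.eye(n)                    # J_n: off-diagonal indicator
    Jd = J * J.T                            # J_d = J o J^T (double edges)
    Js = J - Jd                             # J_s = J - J_d (single edges)
    J0 = (G + G.T == 0).astype(float) * Jn  # J_0: missing edges, i != j
    Gd = (G + G.T) * Jd                     # G_d: double-edge weights
    Gs = G * Js                             # G_s: single-edge weights
    return J, Jn, Jd, Js, J0, Gd, Gs
\end{verbatim}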
\vspace*{0.2cm} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.8} \tiny \begin{tabular}{ |c|c|c|c| } \hline Motif & $C$ & $C'$ & $M^\mathrm{func}$ \\ \hline $\ca{M}_\mathrm{s}$ & & & $G + G^\top$ \\ \hline $\ca{M}_\mathrm{d}$ & & & $\frac{1}{2} G_\mathrm{d}$ \\ \hline $\ca{M}_1$ & $J^\top \circ (J G) + J^\top \circ (G J) + G^\top \circ (J J)$ & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_2$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J^\top \circ (J_\mathrm{d} G) + J^\top \circ (G_\mathrm{d} J) + G^\top \circ (J_\mathrm{d} J) \\ & + J^\top \circ (J G_\mathrm{d}) + J^\top \circ (G J_\mathrm{d}) + G^\top \circ (J J_\mathrm{d}) \\ & + J_\mathrm{d} \circ (J G) + J_\mathrm{d} \circ (G J) + G_\mathrm{d} \circ (J J) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{4} \big(C + C^\top\big)$ \\ \hline $\ca{M}_3$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J \circ (J_\mathrm{d} G_\mathrm{d}) + J \circ (G_\mathrm{d} J_\mathrm{d}) + G \circ (J_\mathrm{d} J_\mathrm{d}) \\ & + J_\mathrm{d} \circ (J_\mathrm{d} G) + J_\mathrm{d} \circ (G_\mathrm{d} J) + G_\mathrm{d} \circ (J_\mathrm{d} J) \\ & + J_\mathrm{d} \circ (J G_\mathrm{d}) + J_\mathrm{d} \circ (G J_\mathrm{d}) + G_\mathrm{d} \circ (J J_\mathrm{d}) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{5} \big(C + C^\top\big)$ \\ \hline $\ca{M}_4$ & $ J_\mathrm{d} \circ (J_\mathrm{d} G_\mathrm{d}) + J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{d}) + G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{d}) $ & & $ \frac{1}{6} C$ \\ \hline $\ca{M}_5$ & \rule{0pt}{2.7em}$\displaystyle \begin{aligned} & J \circ (J G) + J \circ (G J) + G \circ (J J) \\ & + J \circ (J G^\top) + J \circ (G J^\top) + G \circ (J J^\top) \\ & + J \circ (J^\top G) + J \circ (G^\top J) + G \circ (J^\top J) \end{aligned} $\rule[-2em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_6$ & $J \circ (J G_\mathrm{d}) + J \circ (G J_\mathrm{d}) + G \circ (J J_\mathrm{d}) + J_\mathrm{d} \circ (J^\top G)$ & $G_\mathrm{d} \circ (J^\top J)$ & $\frac{1}{4} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_7$ & $J \circ (J_\mathrm{d} G) + J \circ (G_\mathrm{d} J) + G \circ (J_\mathrm{d} J)$ & $J_\mathrm{d} \circ (J G^\top) + J_\mathrm{d} \circ (G J^\top) + G_\mathrm{d} \circ (J J^\top)$ & $ \frac{1}{4} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_8$ & $J \circ (G J_\mathrm{n}) + G \circ (J J_\mathrm{n})$ & $J_\mathrm{n} \circ (J^\top G) + J_\mathrm{n} \circ (G^\top J)$ & $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_9$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J \circ (J_\mathrm{n} G^\top) + G \circ (J_\mathrm{n} J^\top) + J_\mathrm{n} \circ (J G) \\ & + J_\mathrm{n} \circ (G J) + J \circ (G^\top J_\mathrm{n}) + G \circ (J^\top J_\mathrm{n}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{2} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{10}$ & $J \circ (J_\mathrm{n} G) + G \circ (J_\mathrm{n} J)$ & $J_\mathrm{n} \circ (J G^\top) + J_\mathrm{n} \circ (G J^\top)$ & $\frac{1}{2} \big(C + C^\top + C' \big)$ \\ \hline $\ca{M}_{11}$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J_\mathrm{d} \circ (G J_\mathrm{n}) + G_\mathrm{d} \circ (J J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G) \\ & + J_\mathrm{n} \circ (G_\mathrm{d} J) + J \circ (G_\mathrm{d} J_\mathrm{n}) + G \circ (J_\mathrm{d} J_\mathrm{n}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $\frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{12}$ & \rule{0pt}{1.9em}$\displaystyle \begin{aligned} & J_\mathrm{d} \circ (J_\mathrm{n} G) + G_\mathrm{d} \circ (J_\mathrm{n} J) 
+ J_\mathrm{n} \circ (J G_\mathrm{d}) \\ & + J_\mathrm{n} \circ (G J_\mathrm{d}) + J \circ (J_\mathrm{n} G_\mathrm{d}) + G \circ (J_\mathrm{n} J_\mathrm{d}) \end{aligned} $\rule[-1.3em]{0pt}{1em} & & $ \frac{1}{3} \big(C + C^\top\big)$ \\ \hline $\ca{M}_{13}$ & $J_\mathrm{d} \circ (G_\mathrm{d} J_\mathrm{n}) + G_\mathrm{d} \circ (J_\mathrm{d} J_\mathrm{n}) + J_\mathrm{n} \circ (J_\mathrm{d} G_\mathrm{d})$ & & $\frac{1}{4} \big(C + C^\top \big)$ \\ \hline $\ca{M}_\mathrm{coll}$ & $J_\mathrm{n} \circ (J G^\top)$ & & $\frac{1}{2} \big( C + C^\top \big)$ \\ \hline $\ca{M}_\mathrm{expa}$ & $J_\mathrm{n} \circ (J^\top G)$ & & $\frac{1}{2} \big( C + C^\top \big)$ \\ \hline \end{tabular} \caption{Functional motif adjacency matrix formulae} \label{tab:motif_adj_mat_table} \end{table} \clearpage{} \clearpage{} \chapter{Further Notes} \section{Computation} \label{sec:notes_computation} \subsection{Hardware and software} \label{sec:notes_hardware} The hardware used for computation was an \emph{Intel Core i7-4790} CPU at 3.60\,GHz, with 32\,GB of RAM. The software used was R 3.5.1 \cite{r_rsoftware}, along with several R packages: % % \begin{itemize} \item \textbf{igraph} \cite{r_igraph} for plotting networks \item \textbf{LICORS} \cite{r_LICORS} for an implementation of $k$-means++ \item \textbf{mclust} \cite{r_mclust} for an implementation of ARI \item \textbf{rnaturalearth} \cite{r_rnaturalearth} for world territory boundary data \item \textbf{RSpectra} \cite{r_RSpectra} for eigendecomposition of sparse matrices \item \textbf{USAboundaries} \cite{r_USAboundaries} for US county and state boundary data \end{itemize} \subsection{Timings for MAM computations} \label{sec:notes_timing} We record timings (in seconds) for the MAM formulae given in Table~\ref{tab:motif_adj_mat_table}. We test on DSBMs (Section~\ref{sec:motif_dsbms}) with $k=1$, and vary the graph size $n$ and sparsity parameter $p$. 
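These benchmarks were run in R as described above; purely as an illustration of the handful of matrix operations each timing measures, the following Python/NumPy sketch (not the benchmarked implementation; the function name and dense-array interface are ours) evaluates the $\ca{M}_6$ formula from Table~\ref{tab:motif_adj_mat_table}:
\begin{verbatim}
import numpy as np

def mam_m6_func(J, Jd, G, Gd):
    # C  = J o (J Gd) + J o (G Jd) + G o (J Jd) + Jd o (J^T G)
    C = J * (J @ Gd) + J * (G @ Jd) + G * (J @ Jd) + Jd * (J.T @ G)
    # C' = Gd o (J^T J)
    Cp = Gd * (J.T @ J)
    # M_func = (C + C^T + C') / 4
    return (C + C.T + Cp) / 4.0
\end{verbatim}
The growth of the timings with $p$ visible below reflects the increasing density of these matrix and entry-wise products.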
\vspace*{0.3cm} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 0.013 & 0.012 & 0.017 & 0.029 & 0.034 & 0.015 & 0.028 & 0.022 & 0.022 & 0.019 & 0.030 & 0.019 & 0.042 & 0.021 & 0.016 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 0.013 & 0.011 & 0.016 & 0.035 & 0.027 & 0.017 & 0.028 & 0.024 & 0.023 & 0.026 & 0.027 & 0.018 & 0.021 & 0.022 & 0.016 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 0.013 & 0.012 & 0.024 & 0.028 & 0.028 & 0.016 & 0.028 & 0.022 & 0.032 & 0.021 & 0.026 & 0.020 & 0.023 & 0.023 & 0.017 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.014 & 0.019 & 0.019 & 0.031 & 0.029 & 0.019 & 0.033 & 0.025 & 0.032 & 0.023 & 0.028 & 0.023 & 0.026 & 0.025 & 0.019 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=100$}% \label{tab:timing_n_100}% \end{table} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 0.13 & 0.14 & 0.14 & 0.32 & 0.14 & 0.13 & 0.14 & 0.31 & 0.13 & 0.21 & 0.22 & 0.21 & 0.20 & 0.34 & 0.16 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 0.30 & 0.13 & 0.15 & 0.16 & 0.16 & 0.14 & 0.16 & 0.32 & 0.14 & 0.48 & 0.37 & 0.29 & 0.31 & 0.29 & 0.17 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 0.11 & 0.14 & 0.17 & 0.19 & 0.14 & 0.13 & 0.21 & 0.18 & 0.18 & 0.64 & 0.73 & 0.89 & 0.46 & 0.56 & 0.18 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 0.23 & 0.22 & 0.60 & 1.1 & 0.57 & 0.24 & 1.4 & 0.86 & 0.69 & 1.5 & 2.3 & 1.6 & 1.6 & 1.6 & 0.67 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=1000$} \label{tab:timing_n_1000} \end{table} \begin{table}[H] \centering \renewcommand{\arraystretch}{1.5} \setlength\tabcolsep{0.2em} \scriptsize \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|c|} \hline \cellcolor[HTML]{E9E9E9} \smash{\raisebox{0.7pt}{$p$}} & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{s}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_\mathrm{d}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_1$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_2$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_3$ & \cellcolor[HTML]{E9E9E9}
$\ca{M}_4$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_5$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_6$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_7$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_8$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_9$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{10}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{11}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{12}$ & \cellcolor[HTML]{E9E9E9} $\ca{M}_{13}$ \\ \hline \cellcolor[HTML]{E9E9E9} 0.0001 & 11 & 12 & 12 & 12 & 12 & 12 & 12 & 12 & 12 & 41 & 55 & 37 & 38 & 34 & 15 \\ \hline \cellcolor[HTML]{E9E9E9} 0.001 & 13 & 12 & 13 & 13 & 12 & 12 & 13 & 12 & 12 & 61 & 89 & 54 & 56 & 48 & 15 \\ \hline \cellcolor[HTML]{E9E9E9} 0.01 & 13 & 13 & 36 & 36 & 14 & 13 & 82 & 36 & 36 & 150 & 230 & 130 & 130 & 99 & 36 \\ \hline \cellcolor[HTML]{E9E9E9} 0.1 & 33 & 31 & 170 & 260 & 160 & 53 & 410 & 210 & 210 & 700 & 1100 & 520 & 760 & 580 & 150 \\ \hline \end{tabular} \caption{Timings for MAM computation with $n=10 \, 000$} \label{tab:timing_n_10000} \end{table} \section{Data preprocessing} \label{sec:notes_preprocessing} All real networks were preprocessed by restriction to their largest connected component. The Unicode Languages network in Section~\ref{sec:bipartite_languages} was also preprocessed to remove territories with under one million inhabitants and languages with under one million speakers. Vertex and edge counts of all networks are stated \emph{after} this preprocessing. \section{US map} \label{sec:notes_us_map} % \vspace*{-0.8cm} \begin{figure}[H] \centering %\includegraphics[scale=0.6,draft=false]{% %../../results/us_migration/us_migration_map_state_names.pdf} \vspace*{-0.5cm} \caption{US map with state boundaries and state abbreviations} \label{fig:notes_us_map} \end{figure} \section{Word count} The word count of this dissertation is 6230 \unskip, obtained using \TeX \hspace*{-0.15cm} count by running % \begin{center} \texttt{texcount -relaxed -inc -0 -sum=1,1,1,0,0,0,0\,}. \end{center} % %The final dissertation should be no longer than 7,500 words, this usually %equates to 25--30 pages. The word count may exclude any table of contents, %all mathematical equations and symbols, diagrams, tables, bibliography and %the texts of computer programs. However any preface, footnotes, and %appendices must be included \clearpage{} %TC:ignore % add the bibliography to the contents page \pagestyle{empty} \cleardoublepage \phantomsection \addcontentsline{toc}{chapter}{References} % change bibliography name to References \renewcommand{\bibname}{References} \pagestyle{fancy} \fancyhead[RO]{\itshape{\nouppercase{References}}} \bibliography{refs} \bibliographystyle{abbrv} %TC:endignore \end{document} tex-fmt-0.5.2/tests/target/ociamthesis.cls000066400000000000000000000172551473573253500205750ustar00rootroot00000000000000% ociamthesis v2.2 % By Keith A. 
Gillow % Version 1.0 released 26/11/1997 %-------------------------- identification --------------------- \NeedsTeXFormat{LaTeX2e} \ProvidesClass{ociamthesis}[2010/11/22 v2.2 OCIAM thesis class] %-------------------------- initial code ----------------------- \def\logoversion{squarelogo} \DeclareOption{beltcrest}{\def\logoversion{beltcrest}} \DeclareOption{shieldcrest}{\def\logoversion{shieldcrest}} \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions\relax \LoadClass[a4paper]{report} % As an alternative to the above, could use the next line for two-sided output %\LoadClass[a4paper,twoside,openright]{report} \RequirePackage{graphicx} % needed for latest frontpage logo \RequirePackage{ifthen} % needed for option parsing for logo \raggedbottom %define the default submitted text \newcommand{\submittedtext}{{A thesis submitted for the degree of}} % % DECLARATIONS % % These macros are used to declare arguments needed for the % construction of the title page and other preamble. % The year and term the thesis is submitted \def\degreedate#1{\gdef\@degreedate{#1}} % The full (unabbreviated) name of the degree \def\degree#1{\gdef\@degree{#1}} % The name of your Oxford college (e.g. Christ Church, Pembroke) \def\college#1{\gdef\@college{#1}} % % Setup chosen crest/logo % \ifthenelse{\equal{\logoversion}{shieldcrest}}% { % Traditional Oxford shield crest %Using latex metafont (Mathematical Institute system) \font\crestfont=oxcrest40 scaled\magstep3 \def\logo{{\crestfont \char1}} %For comlab system replace 1st line above with %\font\crestfont=crest scaled\magstep3 }{} \ifthenelse{\equal{\logoversion}{beltcrest}}% { % Newer Oxford Belt crest %Using latex metafont (Mathematical Institute system) \font\beltcrestfont=oxbeltcrest \def\logo{{\beltcrestfont \char0}} %For comlab system replace 1st line above with %\font\beltcrestfont=newcrest }{} \ifthenelse{\equal{\logoversion}{squarelogo}}% { % Latest Logo, Square version (the default!) % you need an oxlogo.eps or oxlogo.pdf file as appropriate \def\logo{{ %\includegraphics[width=32mm,draft=false]{../graphics/branding/oxlogo} }} }{} % % Define text area of page and margin offsets % \setlength{\topmargin}{0.0in} %0.0in \setlength{\oddsidemargin}{0.167in} % 0.33in \setlength{\evensidemargin}{-0.08in} %-0.08in \setlength{\textheight}{9.2in} %9.0in \setlength{\textwidth}{6.0in} %6.0in \setlength{\headheight}{15pt} % not set \setlength{\voffset}{-0.2in} % not set % % Environments % % This macro defines an environment for front matter that is always % single column even in a double-column document. \newenvironment{alwayssingle}{% \@restonecolfalse \if@twocolumn\@restonecoltrue\onecolumn \else\if@openright\cleardoublepage\else\clearpage\fi \fi}% {\if@restonecol\twocolumn \else\newpage\thispagestyle{empty}\fi} %define title page layout \renewcommand{\maketitle}{% \begin{alwayssingle} \renewcommand{\footnotesize}{\small} \renewcommand{\footnoterule}{\relax} \thispagestyle{empty} \null\vfill \begin{center} { \Huge {\bfseries {\@title}} \par} {\large \vspace*{40mm} {\logo \par} \vspace*{25mm}} {{\Large \@author} \par} {\large \vspace*{1.5ex} % 1ex {{\@college} \par} \vspace*{1ex} {University of Oxford \par} \vspace*{25mm} {{\submittedtext} \par} \vspace*{1ex} {\it {\@degree} \par} \vspace*{2ex} {\@degreedate}} \end{center} \null\vfill \end{alwayssingle}} % DEDICATION % % The dedication environment makes sure the dedication gets its % own page and is set out in verse format.
\newenvironment{dedication} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\LARGE } \end{center} \vspace{0.5cm} \begin{verse} \begin{center}} { \end{center} \end{verse} \end{alwayssingle}} % ACKNOWLEDGEMENTS % % The acknowledgements environment puts a large, bold, centered % "Acknowledgements" label at the top of the page. The acknowledgements % themselves appear in a quote environment, i.e. tabbed in at both sides, and % on their own page. \newenvironment{acknowledgements} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Acknowledgements} \end{center} \vspace{0.5cm} \begin{quote}} { \end{quote} \end{alwayssingle}} % The acknowledgementslong environment puts a large, bold, centered % "Acknowledgements" label at the top of the page. The acknowledgement itself % does not appear in a quote environment so you can get more in. \newenvironment{acknowledgementslong} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Acknowledgements} \end{center} \vspace{0.5cm}} { \end{alwayssingle}} % STATEMENT OF ORIGINALITY (AS SUGGESTED BY GSW) % % The originality environment puts a large, bold, centered % "Statement of originality" label at the top of the page. The statement % of originality itself appears in a quote environment, i.e. tabbed in at % both sides, and on its own page. \newenvironment{originality} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Statement of Originality} \end{center} \vspace{0.5cm} \begin{quote}} { \end{quote} \end{alwayssingle}} % The originalitylong environment puts a large, bold, centered % "Statement of originality" label at the top of the page. The statement % of originality itself does not appear in a quote environment so you can % get more in. \newenvironment{originalitylong} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Statement of Originality} \end{center} \vspace{0.5cm}} { \end{alwayssingle}} %ABSTRACT % %The abstract environment puts a large, bold, centered "Abstract" label at %the top of the page. The abstract itself appears in a quote environment, %i.e. tabbed in at both sides, and on its own page. \renewenvironment{abstract} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Abstract} \end{center} \vspace{0.5cm} \begin{quote}} { \end{quote} \end{alwayssingle}} %The abstractlong environment puts a large, bold, centered "Abstract" label at %the top of the page. The abstract itself does not appear in a quote %environment so you can get more in. \newenvironment{abstractlong} { \begin{alwayssingle} \thispagestyle{empty} \begin{center} \vspace*{1.5cm} {\Large \bfseries Abstract} \end{center} \vspace{0.5cm}} { \end{alwayssingle}} %The abstractseparate environment is for producing a separate page with the %abstract, including the title and author etc., as required to be handed in %separately \newenvironment{abstractseparate} { \begin{alwayssingle} \thispagestyle{empty} \vspace*{-1in} \begin{center} { \Large {\bfseries {\@title}} \par} {{\large \vspace*{1ex} \@author} \par} {\large \vspace*{1ex} {{\@college} \par} {University of Oxford \par} \vspace*{1ex} {{\it \submittedtext} \par} {\it {\@degree} \par} \vspace*{2ex} {\@degreedate}} \end{center}} { \end{alwayssingle}} %ROMANPAGES % % The romanpages environment sets the page numbering to lowercase roman % numerals for the contents and figures lists.
It also resets % page-numbering for the remainder of the dissertation (arabic, starting at 1). \newenvironment{romanpages} {\cleardoublepage\setcounter{page}{1}\renewcommand{\thepage}{\roman{page}}} {\cleardoublepage\renewcommand{\thepage}{\arabic{page}}\setcounter{page}{1}} tex-fmt-0.5.2/tests/target/phd_dissertation.tex000066400000000000000000030563351473573253500216540ustar00rootroot00000000000000% !TeX program = lualatex %! TeX root = phd_dissertation.tex %\pdfvariable suppressoptionalinfo 512\relax \documentclass[11pt,lof]{puthesis} % packages \usepackage{amsmath} \usepackage{amssymb} \usepackage[amsmath,thmmarks,noconfig]{ntheorem} \usepackage{mathtools} \usepackage{multirow} \usepackage{pgfplots} \usepackage{graphicx} \usepackage{enumitem} \usepackage{subcaption} \usepackage{titlesec} \usepackage{stackengine} \usepackage{scalerel} \usepackage{microtype} \usepackage[boxruled,linesnumbered,commentsnumbered,procnumbered]{algorithm2e} \usepackage[longnamesfirst]{natbib} \usepackage[hypertexnames=false,hidelinks]{hyperref} \usepackage[norefs,nocites]{refcheck} \usepackage[defaultlines=3,all]{nowidow} \usepackage{float} % settings \pgfplotsset{compat=1.9} \setcitestyle{round} \captionsetup[subfigure]{justification=centering} \def\arraystretch{1.3} \renewcommand{\descriptionlabel}[1]{\hspace{\labelsep}\textit{#1}} % tables numbered as figures \def\table{\def\figurename{Table}\figure} \let\endtable\endfigure \renewcommand\listfigurename{List of Figures and Tables} % arxiv \newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{\texttt{arXiv:#1}}} % github \newcommand{\github}[1]{\href{https://github.com/#1}{\texttt{github.com/#1}}} % blackboard \renewcommand{\P}{\ensuremath{\mathbb{P}}} \newcommand{\N}{\ensuremath{\mathbb{N}}} \newcommand{\R}{\ensuremath{\mathbb{R}}} \newcommand{\E}{\ensuremath{\mathbb{E}}} \newcommand{\Q}{\ensuremath{\mathbb{Q}}} \newcommand{\I}{\ensuremath{\mathbb{I}}} \newcommand{\Z}{\ensuremath{\mathbb{Z}}} % roman \newcommand{\rF}{\ensuremath{\mathrm{F}}} \newcommand{\rH}{\ensuremath{\mathrm{H}}} \newcommand{\rL}{\ensuremath{\mathrm{L}}} \newcommand{\rk}{\ensuremath{\mathrm{k}}} \newcommand{\rd}{\ensuremath{\mathrm{d}}} \newcommand{\comp}{\ensuremath{\mathrm{c}}} \newcommand{\TV}{\mathrm{TV}} % bold \newcommand{\bW}{\ensuremath{\mathbf{W}}} \newcommand{\bY}{\ensuremath{\mathbf{Y}}} \newcommand{\bX}{\ensuremath{\mathbf{X}}} \newcommand{\bT}{\ensuremath{\mathbf{T}}} \newcommand{\bA}{\ensuremath{\mathbf{A}}} \newcommand{\bV}{\ensuremath{\mathbf{V}}} % calligraphic \newcommand{\cH}{\ensuremath{\mathcal{H}}} \newcommand{\cF}{\ensuremath{\mathcal{F}}} \newcommand{\cN}{\ensuremath{\mathcal{N}}} \newcommand{\cX}{\ensuremath{\mathcal{X}}} \newcommand{\cG}{\ensuremath{\mathcal{G}}} \newcommand{\cW}{\ensuremath{\mathcal{W}}} \newcommand{\cB}{\ensuremath{\mathcal{B}}} \newcommand{\cS}{\ensuremath{\mathcal{S}}} \newcommand{\cT}{\ensuremath{\mathcal{T}}} \newcommand{\cV}{\ensuremath{\mathcal{V}}} \newcommand{\cE}{\ensuremath{\mathcal{E}}} \newcommand{\cU}{\ensuremath{\mathcal{U}}} \newcommand{\cR}{\ensuremath{\mathcal{R}}} \newcommand{\cA}{\ensuremath{\mathcal{A}}} \newcommand{\cC}{\ensuremath{\mathcal{C}}} \newcommand{\cM}{\ensuremath{\mathcal{M}}} \newcommand{\cD}{\ensuremath{\mathcal{D}}} \newcommand{\cP}{\ensuremath{\mathcal{P}}} \newcommand{\cI}{\ensuremath{\mathcal{I}}} \newcommand{\cY}{\ensuremath{\mathcal{Y}}} % sans serif \newcommand{\T}{\ensuremath{\mathsf{T}}} % symbols \newcommand{\vvvert}{{\vert\kern-0.25ex\vert\kern-0.25ex\vert}} 
\newcommand{\bigvvvert}{{\big\vert\kern-0.35ex\big\vert\kern-0.35ex\big\vert}} \newcommand{\Bigvvvert}{{\Big\vert\kern-0.3ex\Big\vert\kern-0.3ex\Big\vert}} \newcommand{\bigsetminus}{\mathbin{\big\backslash}} \newcommand{\Bigsetminus}{\mathbin{\Big\backslash}} \newcommand{\dprime}{\ensuremath{\prime\prime}} \newcommand{\tprime}{\ensuremath{\prime\prime\prime}} \newcommand{\objective}{\ensuremath{\mathrm{obj}}} \newcommand{\Dl}{\ensuremath{D_{\textup{lo}}}} \newcommand{\Du}{\ensuremath{D_{\textup{up}}}} % floor of beta \newcommand{\flbeta}{{\ThisStyle{% \ensurestackMath{\stackengine{-0.5\LMpt}{\SavedStyle \beta}% {\SavedStyle {\rule{3.7\LMpt}{0.3\LMpt}}} {U}{c}{F}{F}{S}}\vphantom{\beta}}}} % operators \DeclareMathOperator{\Var}{Var} \DeclareMathOperator{\Cov}{Cov} \DeclareMathOperator{\AIMSE}{AIMSE} \DeclareMathOperator{\LOOCV}{LOOCV} \DeclareMathOperator{\symconv}{symconv} \DeclareMathOperator{\GCV}{GCV} \DeclareMathOperator{\Unif}{Unif} \DeclareMathOperator*{\logistic}{logistic} \DeclareMathOperator{\Bias}{Bias} \DeclareMathOperator{\Env}{Env} \DeclareMathOperator*{\esssup}{ess\,sup} \DeclareMathOperator{\Ber}{Ber} \DeclareMathOperator{\KL}{KL} \DeclareMathOperator{\Gam}{Gam} \DeclareMathOperator{\Yule}{Yule} \DeclareMathOperator{\rank}{rank} \DeclareMathOperator{\Exp}{Exp} \DeclareMathOperator{\Bin}{Bin} \DeclareMathOperator{\Tr}{Tr} \DeclareMathOperator{\Leb}{Leb} \DeclareMathOperator*{\argmin}{arg\,min} \DeclareMathOperator*{\minimize}{minimize:} \DeclareMathOperator*{\subjectto}{subject\ to:} \DeclareMathOperator{\ROT}{ROT} \newcommand{\diff}[1]{\,\mathrm{d}#1} % theorem environments \renewtheoremstyle{break}{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % ##2}\hbox{\strut}}}]% }{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % ##2\ \normalfont (##3)}\hbox{\strut}}}]% } \theoremstyle{break} \theorempreskip{7mm} \newtheorem{theorem}{Theorem}[section] \newtheorem{lemma}{Lemma}[section] \newtheorem{assumption}{Assumption}[section] \newtheorem{corollary}{Corollary}[section] \newtheorem{proposition}{Proposition}[section] \newtheorem{definition}{Definition}[section] \newtheorem{remark}{Remark}[section] % proof environments \let\proof\relax \newtheoremstyle{proof}{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % }\hbox{\strut}}}]% }{% \item[\rlap{\vbox{\hbox{\hskip\labelsep \bfseries\upshape ##1\ % \normalfont (##3)}\hbox{\strut}}}]% } \theoremstyle{proof} \theorembodyfont{\upshape} \theorempreskip{7mm} \theoremsymbol{\ensuremath{\square}} \newtheorem{proof}{Proof} \AtBeginEnvironment{proof}{\setcounter{proofparagraphcounter}{0}}% % proof paragraphs \titleformat{\paragraph}[hang]{\bfseries\upshape}{}{0pt}{}[] \titlespacing*{\paragraph}{0pt}{6pt}{0pt} \newcounter{proofparagraphcounter} \newcommand{\proofparagraph}[1]{ \refstepcounter{proofparagraphcounter}% \paragraph{Part \theproofparagraphcounter : #1}}% % inline roman lists \newlist{inlineroman}{enumerate*}{1} \setlist[inlineroman]{afterlabel=~,label=(\roman*)} % algorithms \DontPrintSemicolon% \makeatletter% \renewcommand{\SetKwInOut}[2]{% \sbox\algocf@inoutbox{\KwSty{#2}\algocf@typo:}% \expandafter\ifx\csname InOutSizeDefined\endcsname\relax% \newcommand\InOutSizeDefined{}% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{% \parbox[t]{\inoutsize}% {\KwSty{#2}\algocf@typo:\hfill}~% }% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \else% \ifdim\wd\algocf@inoutbox>\inoutsize% \setlength{\inoutsize}{\wd\algocf@inoutbox}% \sbox\algocf@inoutbox{% \parbox[t]{\inoutsize}% 
{\KwSty{#2}\algocf@typo:\hfill}~% }% \setlength{\inoutindent}{\wd\algocf@inoutbox}% \fi% \fi% \algocf@newcommand{#1}[1]{% \ifthenelse{\boolean{algocf@inoutnumbered}}{\relax}{\everypar={\relax}}{% \let\\\algocf@newinout\hangindent=\inoutindent\hangafter=1\parbox[t]% {\inoutsize}{\KwSty{#2}% \algocf@typo:\hfill}~##1\par% }% \algocf@linesnumbered% }% }% \makeatother% \SetKwInOut{Input}{Input}% \SetKwInOut{Output}{Output}% \setlength{\algomargin}{2em}% \author{William George Underwood} \adviser{Matias Damian Cattaneo} \title{Estimation and Inference in \\ Modern Nonparametric Statistics} \abstract{ % 350 words max Nonparametric methods are central to modern statistics, enabling data analysis with minimal assumptions in a wide range of scenarios. While contemporary procedures such as random forests and kernel methods are popular due to their performance and flexibility, their statistical properties are often less well understood. The availability of sound inferential techniques is vital in the sciences, allowing researchers to quantify uncertainty in their models. We develop methodology for robust and practical statistical estimation and inference in some modern nonparametric settings involving complex estimators and nontraditional data. We begin in the regression setting by studying the Mondrian random forest, a variant in which the partitions are drawn from a Mondrian process. We present a comprehensive analysis of the statistical properties of Mondrian random forests, including a central limit theorem for the estimated regression function and a characterization of the bias. We show how to conduct feasible and valid nonparametric inference by constructing confidence intervals, and further provide a debiasing procedure that enables minimax-optimal estimation rates for smooth function classes in arbitrary dimension. Next, we turn our attention to nonparametric kernel density estimation with dependent dyadic network data. We present results for minimax-optimal estimation, including a novel lower bound for the dyadic uniform convergence rate, and develop methodology for uniform inference via confidence bands and counterfactual analysis. Our methods are based on strong approximations and are designed to be adaptive to potential dyadic degeneracy. We give empirical results with simulated and real-world economic trade data. Finally, we develop some new probabilistic results with applications to nonparametric statistics. Coupling has become a popular approach for distributional analysis in recent years, and Yurinskii's method stands out for its wide applicability and explicit formulation. We present a generalization of Yurinskii's coupling, treating approximate martingale data under weaker conditions than previously imposed. We allow for Gaussian mixture coupling distributions, and a third-order method permits faster rates in certain situations. We showcase our results with applications to factor models and martingale empirical processes, as well as nonparametric partitioning-based and local polynomial regression procedures. } \acknowledgments{ I am extremely fortunate to have been surrounded by many truly wonderful people over the course of my career, and without their support this dissertation would not have been possible. While it is impossible for me to identify every one of them individually, I would like to mention a few names in particular to recognize those who have been especially important to me during the last few years. 
Firstly, I would like to express my utmost gratitude to my Ph.D.\ adviser, Matias Cattaneo. Working with Matias has been genuinely inspirational for me, and I could not have asked for a more rewarding start to my journey as a researcher. From the very beginning, he has guided me expertly through my studies, providing hands-on assistance when required while also allowing me the independence necessary to develop as an academic. I hope that, during the four years we have worked together, I have acquired just a fraction of his formidable mathematical intuition, keen attention to detail, boundless creativity, and inimitable pedagogical skill. Alongside his role as my adviser, Matias has been above all a friend, who has been in equal measure inspiring, insightful, dedicated, understanding, and kind. Secondly, I would like to thank all of the faculty members at Princeton and beyond who have acted as my collaborators and mentors, without whom none of my work could have been realized. In particular, I express my gratitude to my tireless Ph.D.\ committee members and letter writers Jianqing Fan and Jason Klusowski, my coauthors Yingjie Feng and Ricardo Masini, my dissertation reader Boris Hanin, my teachers Amir Ali Ahmadi, Ramon van Handel, Mikl{\'o}s R{\'a}cz, and Mykhaylo Shkolnikov, my colleagues Sanjeev Kulkarni and Roc{\'i}o Titiunik, and my former supervisor Mihai Cucuringu. I am also thankful for the staff members at Princeton who have been perpetually helpful, and I would like to identify Kim Lupinacci in particular; her assistance in all things administrative has been invaluable. I am grateful to my fellow graduate students in the ORFE department for their technical expertise and generosity with their time, and for making Sherrerd Hall such a vibrant and exciting space, especially Jose Avilez, Pier Beneventano, Ben Budway, Rajita Chandak, Abraar Chaudhry, Stefan Clarke, Giulia Crippa, G{\"o}k{\c{c}}e Dayan{\i}kl{\i}, Nicolas Garcia, Felix Hoefer, Erica Lai, Jackie Lok, Maya Mutic, Dan Rigobon, Till Saenger, Rajiv Sambharya, Boris Shigida, Igor Silin, Giang Truong, and Rae Yu. Our regular social events made a contribution to my well-being which is difficult to overstate. My thanks extend also to the students I taught, as well as to my group of senior thesis undergraduates, for their commitment, patience, and responsiveness. More broadly, I would like to thank all of my friends, near and far, for their unfailing support and reliability, and for helping to create so many of my treasured memories. In particular, Ole Agersnap, James Ashford, Christian Baehr, Chris Bambic, Kevin Beeson, James Broadhead, Alex Cox, Reece Edmends, Robin Franklin, Greg Henderson, Bonnie Ko, Grace Matthews, Dan Mead, Ben Musachio, Jacob Neis, Monika Papayova, Will Pedrick, Oliver Philcox, Nandita Rao, Alex Rice, Edward Rowe, David Snyder, Titi Sodimu, Nikitas Tampakis, and Anita Zhang. Thank you to the Princeton Chapel Choir for being such a wonderful community of musicians and a source of close friends, and to our directors, Nicole Aldrich and Penna Rose, and organist Eric Plutz. Lastly, yet most importantly, I want to thank my family for their unwavering support throughout my studies. My visits back home have been a source of joy throughout my long and often challenging Ph.D., and I cherish every moment I have spent with my parents, sister, grandparents, and extended family. 
} \begin{document} \chapter{Introduction} % nonparametric estimation is common Nonparametric estimation procedures are at the heart of many contemporary theoretical and methodological topics within the fields of statistics, data science, and machine learning. Where classical parametric techniques impose specific distributional and structural assumptions when modeling statistical problems, nonparametric methods instead take a more flexible approach, typically positing only high-level restrictions such as moment conditions, independence criteria, and smoothness assumptions. Examples of such procedures abound in modern data science and machine learning, encompassing histograms, kernel estimators, smoothing splines, decision trees, nearest neighbor methods, random forests, neural networks, and many more. % nonparametric estimation is good The benefits of the nonparametric framework are clear: statistical procedures can be formulated in cases where the stringent assumptions of parametric models are untestable, demonstrably violated, or simply unreasonable. As a consequence, the resulting methods often inherit desirable robustness properties against various forms of misspecification or misuse. The class of problems that can be formulated is correspondingly larger: arbitrary distributions and relationships can be characterized and estimated in a principled manner. % nonparametric estimation is hard Nonetheless, these attractive properties do come at a price. In particular, as its name suggests, the nonparametric approach forgoes the ability to reduce a complex statistical problem to that of estimating a fixed, finite number of parameters. Rather, nonparametric procedures typically involve making inferences about a growing number of parameters simultaneously, as witnessed in high-dimensional regimes, or even directly handling infinite-dimensional objects such as entire regression or density functions. As a consequence, nonparametric estimators are usually less efficient than their correctly specified parametric counterparts, when they are available; rates of convergence tend to be slower, and confidence sets more conservative. Another challenge is that theoretical mathematical analyses of nonparametric estimators are often significantly more demanding than those required for low-dimensional parametric settings, necessitating tools from contemporary developments in high-dimensional concentration phenomena, coupling and strong approximation theory, empirical processes, mathematical optimization, and stochastic calculus. % nonparametric inference In addition to providing accurate point estimates of unknown (possibly high-dimensional or infinite-dimensional) quantities of interest, modern nonparametric procedures are also expected to come equipped with methodologies for conducting statistical inference. The availability of such inferential techniques is paramount, with contemporary nonparametric methods forming a ubiquitous component of modern data science tool kits. Valid uncertainty quantification is essential for hypothesis testing, error bar construction, assessing statistical significance, and performing power analyses. Inference is a central concept in classical statistics, and despite the rapid recent development of theory for modern nonparametric estimators, their applicability to statistical inference is in certain cases rather less well studied; theoretically sound and practically implementable inference procedures are sometimes absent in the literature. 
% complex data In any statistical modeling problem, the selection and application of an estimator must naturally be tailored to the available data. Today, much of the data produced and analyzed does not necessarily fit neatly into the classical framework of independent and identically distributed samples, and instead might consist of time series, stochastic processes, networks, or high-dimensional or functional data, to name just a few. Therefore, it is important to understand how nonparametric methods might be adapted to correctly handle these data types, maintaining fast estimation rates and valid techniques for statistical inference. The technical challenges associated with such an endeavor are non-trivial; many standard techniques are ineffective in the presence of dependent or infinite-dimensional data, for example. As such, the development of new mathematical results in probability theory plays an important role in the comprehensive treatment of nonparametric statistics with complex data. \section*{Overview of the dissertation} % what we do This dissertation presents a selection of topics relating to nonparametric estimation and inference, and the associated technical mathematical tools. % mondrian Chapter~\ref{ch:mondrian}, titled ``Inference with Mondrian Random Forests,'' is based on the work of \citet{cattaneo2023inference}. % what are random forests Random forests are popular ensembling-based methods for classification and regression, which are well known for their good performance, flexibility, robustness, and efficiency. The majority of random forest models share the following common framework for producing estimates of a classification or regression function using covariates and a response variable. Firstly, the covariate space is partitioned in some algorithmic manner, possibly using a source of external randomness. Secondly, a local estimator of the classification or regression function is fitted to the responses in each cell separately, yielding a tree estimator. Finally, this process is repeated with many different partitions, and the resulting tree estimators are averaged to produce a random forest. % why are there variants Many different variants of random forests have been proposed in recent years, typically with the aim of improving their statistical or computational properties, or simplifying their construction in order to permit a more detailed theoretical analysis. % mondrian random forests One interesting such example is that of the Mondrian random forest, in which the underlying partitions (or trees) are constructed independently of the data. Naturally, this restriction rules out many classical random forest models, which exhibit a complex and data-dependent partitioning scheme. Instead, trees are sampled from a canonical stochastic process known as the Mondrian process, which endows the resulting tree and forest estimators with various agreeable features. % what we do We study the estimation and inference properties of Mondrian random forests in the nonparametric regression setting. In particular, we establish a novel central limit theorem for the estimates made by a Mondrian random forest which, when combined with a characterization of the bias and a consistent variance estimator, allows one to perform asymptotically valid statistical inference, such as constructing confidence intervals, on the unknown regression function. 
We also provide a debiasing procedure for Mondrian random forests, which allows them to achieve minimax-optimal estimation rates with H{\"o}lder smooth regression functions, for any smoothness parameter and in arbitrary dimension. % kernel Chapter~\ref{ch:kernel}, titled ``Dyadic Kernel Density Estimators,'' is based on the work of \citet{cattaneo2024uniform}. Network data plays an important role in statistics, econometrics, and many other data science disciplines, providing a natural framework for modeling relationships between units, be they people, financial institutions, proteins, or economic entities. Of prominent interest is the task of performing statistical estimation and inference with data sampled from the edges of such networks, known as dyadic data. The archetypal lack of independence between edges in a network renders many classical statistical tools unsuited for direct application. As such, researchers must appeal to techniques tailored to dyadic data in order to accurately capture the complex structure present in the network. % broad scope We focus on nonparametric estimation and inference with dyadic data, and in particular we seek methods that are robust in the sense that our results should hold uniformly across the support of the data. Such uniformity guarantees allow for statistical inference in a broader range of settings, including specification testing and distributional counterfactual analysis. We specifically consider the problem of uniformly estimating a dyadic density function, focusing on kernel estimators taking the form of dyadic empirical processes. % main contributions Our main contributions include the minimax-optimal uniform convergence rate of the dyadic kernel density estimator, along with strong approximation results for the associated standardized and Studentized $t$-processes. A consistent variance estimator enables the construction of feasible uniform confidence bands for the unknown density function. We showcase the broad applicability of our results by developing novel counterfactual density estimation and inference methodology for dyadic data, which can be used for causal inference and program evaluation. % why it is difficult A crucial feature of dyadic distributions is that they may be ``degenerate'' at certain points in the support of the data, a property that makes our analysis somewhat delicate. Nonetheless, our methods for uniform inference remain robust to the potential presence of such points. % applications For implementation purposes, we discuss inference procedures based on positive semi-definite covariance estimators, mean squared error optimal bandwidth selectors, and robust bias correction. We illustrate the empirical performance of our methods in simulations and with real-world trade data, for which we make comparisons between observed and counterfactual trade distributions in different years. Our technical results on strong approximations and maximal inequalities are of potential independent interest. % yurinskii Finally, Chapter~\ref{ch:yurinskii}, titled ``Yurinskii's Coupling for Martingales,'' is based on the work of \citet{cattaneo2022yurinskii}. Yurinskii's coupling is a popular theoretical tool for non-asymptotic distributional analysis in mathematical statistics and applied probability. Coupling theory, also known as strong approximation, provides an alternative framework to the more classical weak convergence approach to statistical analysis. 
Rather than merely approximating the distribution of a random variable, strong approximation techniques construct a sequence of random variables which are close almost surely or in probability, often with finite-sample guarantees. % what is it used for Coupling allows distributional analysis in settings where weak convergence fails, including many applications to nonparametric or high-dimensional statistics; it is a key technical component in the main strong approximation results of our Chapter~\ref{ch:kernel}. The Yurinskii method specifically offers a Gaussian coupling with an explicit error bound under easily verified conditions; originally stated in $\ell^2$-norm for sums of independent random vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some strong conditions. % what we do We present as our main result a Yurinskii coupling for approximate martingales in $\ell^p$-norm, under substantially weaker conditions than previously imposed. Our formulation allows the coupling variable to follow a general Gaussian mixture distribution, and we provide a novel third-order coupling method which gives tighter approximations in certain situations. We specialize our main result to mixingales, martingales, and independent data, and derive uniform Gaussian mixture strong approximations for martingale empirical processes. Applications to nonparametric partitioning-based and local polynomial regression procedures are provided. % appendices Supplementary materials for Chapters~\ref{ch:mondrian}, \ref{ch:kernel}, and \ref{ch:yurinskii} are provided in Appendices~\ref{app:mondrian}, \ref{app:kernel}, and \ref{app:yurinskii} respectively. These contain detailed proofs of the main results, additional technical contributions, and further discussion. \chapter[Inference with Mondrian Random Forests]% {Inference with \\ Mondrian Random Forests} \label{ch:mondrian} % abstract Random forests are popular methods for classification and regression, and many different variants have been proposed in recent years. One interesting example is the Mondrian random forest, in which the underlying trees are constructed according to a Mondrian process. In this chapter we give a central limit theorem for the estimates made by a Mondrian random forest in the regression setting. When combined with a bias characterization and a consistent variance estimator, this allows one to perform asymptotically valid statistical inference, such as constructing confidence intervals, on the unknown regression function. We also provide a debiasing procedure for Mondrian random forests which allows them to achieve minimax-optimal estimation rates with $\beta$-H{\"o}lder regression functions, for all $\beta$ and in arbitrary dimension, assuming appropriate parameter tuning. \section{Introduction} Random forests, first introduced by \citet{breiman2001random}, are a workhorse in modern machine learning for classification and regression tasks. Their desirable traits include computational efficiency (via parallelization and greedy heuristics) in big data settings, simplicity of configuration and amenability to tuning parameter selection, ability to adapt to latent structure in high-dimensional data sets, and flexibility in handling mixed data types. Random forests have achieved great empirical successes in many fields of study, including healthcare, finance, online commerce, text analysis, bioinformatics, image classification, and ecology. 
Since Breiman introduced random forests over twenty years ago, the study of their statistical properties remains an active area of research: see \citet{scornet2015consistency}, \citet{chi2022asymptotic}, \citet{klusowski2024large}, and references therein, for a sample of recent developments. Many fundamental questions about Breiman's random forests remain unanswered, owing in part to the subtle ingredients present in the estimation procedure which make standard analytical tools ineffective. These technical difficulties stem from the way the constituent trees greedily partition the covariate space, utilizing both the covariate and response data. This creates complicated dependencies on the data which are often exceedingly hard to untangle without overly stringent assumptions, thereby hampering theoretical progress. To address the aforementioned technical challenges while retaining the phenomenology of Breiman's random forests, a variety of stylized versions of random forest procedures have been proposed and studied in the literature. These include centered random forests \citep{biau2012analysis,arnould2023interpolation} and median random forests \citep{duroux2018impact,arnould2023interpolation}. Each tree in a centered random forest is constructed by first choosing a covariate uniformly at random and then splitting the cell at the midpoint along the direction of the chosen covariate. Median random forests operate in a similar way, but involve the covariate data by splitting at the empirical median along the direction of the randomly chosen covariate. Known as purely random forests, these procedures simplify Breiman's original---albeit more data-adaptive---version by growing trees that partition the covariate space in a way that is statistically independent of the response data. Yet another variant of random forests, Mondrian random forests \citep{lakshminarayanan2014mondrian}, have received significant attention in the statistics and machine learning communities in recent years \citep{ma2020isolation, mourtada2020minimax, scillitoe2021uncertainty, mourtada2021amf, vicuna2021reducing, gao2022towards, oreilly2022stochastic}. Like other purely random forest variants, Mondrian random forests offer a simplified modification of Breiman's original proposal in which the partition is generated independently of the data and according to a canonical stochastic process known as the Mondrian process \citep{roy2008mondrian}. The Mondrian process takes a single parameter $\lambda > 0$ known as the ``lifetime'' and enjoys various mathematical properties. These probabilistic features allow Mondrian random forests to be fitted in an online manner as well as being subject to a rigorous statistical analysis, while also retaining some of the appealing features of other more traditional random forest methods. This chapter studies the statistical properties of Mondrian random forests. We focus on this purely random forest variant not only because of its importance in the development of random forest theory in general, but also because the Mondrian process is, to date, the only known recursive tree mechanism involving randomization, pure or data-dependent, for which the resulting random forest is minimax-optimal for point estimation over a class of smooth regression functions in arbitrary dimension \citep{mourtada2020minimax}. 
In fact, when the covariate dimension exceeds one, the aforementioned centered and median random forests are both minimax-\emph{suboptimal}, due to their large biases, over the class of Lipschitz smooth regression functions \citep{klusowski2021sharp}. It is therefore natural to focus our study of inference for random forests on versions that at the very least exhibit competitive bias and variance, as this will have important implications for the trade-off between precision and confidence. Despite their recent popularity, relatively little is known about the formal statistical properties of Mondrian random forests. Focusing on nonparametric regression, \citet{mourtada2020minimax} recently showed that Mondrian forests containing just a single tree (called a Mondrian tree) can be minimax-optimal in integrated mean squared error whenever the regression function is $\beta$-H{\"o}lder continuous for some $\beta \in (0, 1]$. The authors also showed that, when appropriately tuned, large Mondrian random forests can be similarly minimax-optimal for $\beta \in (0, 2]$, while the constituent trees cannot. See also \citet{oreilly2022stochastic} for analogous results for more general Mondrian tree and forest constructions. These results formally demonstrate the value of ensembling with random forests from a point estimation perspective. No results are currently available in the literature for statistical inference using Mondrian random forests. This chapter contributes to the literature on the foundational statistical properties of Mondrian random forest regression estimation with two main results. Firstly, we give a central limit theorem for the classical Mondrian random forest point estimator, and propose valid large-sample inference procedures employing a consistent standard error estimator. We establish this result by deploying a martingale central limit theorem \citep[Theorem~3.2]{hall1980martingale} because we need to handle delicate probabilistic features of the Mondrian random forest estimator. In particular, we deal with the existence of Mondrian cells which are ``too small'' and lead to a reduced effective (local) sample size for some trees in the forest. Such pathological cells are in fact typical in Mondrian random forests and complicate the probability limits of certain sample averages; in fact, small Mondrian random forests (or indeed single Mondrian trees) remain random even in the limit due to the lack of ensembling. The presence of small cells renders inapplicable prior distributional approximation results for partitioning-based estimators in the literature \citep{huang2003local,cattaneo2020large}, since the commonly required quasi-uniformity assumption on the underlying partitioning scheme is violated by cells generated using the Mondrian process. We circumvent this technical challenge by establishing new theoretical results for Mondrian partitions and their associated Mondrian trees and forests, which may be of independent interest. The second main contribution of the chapter is to propose a debiasing approach for the Mondrian random forest point estimator. We accomplish this by first precisely characterizing the probability limit of the large sample conditional bias, and then applying a debiasing procedure based on the generalized jackknife \citep{schucany1977improvement}. We thus exhibit a Mondrian random forest variant which is minimax-optimal in pointwise mean squared error when the regression function is $\beta$-H{\"o}lder for any $\beta > 0$. 
Our method works by generating an ensemble of Mondrian random forests carefully chosen to have smaller misspecification bias when extra smoothness is available, resulting in minimax optimality even for $\beta > 2$. This result complements \citet{mourtada2020minimax} by demonstrating the existence of a class of Mondrian random forests that can efficiently exploit the additional smoothness of the unknown regression function for minimax-optimal point estimation. Our proposed debiasing procedure is also useful when conducting statistical inference because it provides a principled method for ensuring that the bias is negligible relative to the standard deviation of the estimator. More specifically, we use our debiasing approach to construct valid inference procedures based on robust bias correction \citep{calonico2018effect,calonico2022coverage}. This chapter is structured as follows. In Section~\ref{sec:mondrian_setup} we introduce the Mondrian process and give our assumptions on the data generating process, using a H{\"o}lder smoothness condition on the regression function to control the bias of various estimators. We define the Mondrian random forest estimator and present our assumptions on its lifetime parameter and the number of trees. We give our notation for the following sections in this chapter. Section~\ref{sec:mondrian_inference} presents our first set of main results, beginning with a central limit theorem for the centered Mondrian random forest estimator (Theorem~\ref{thm:mondrian_clt}), in which we characterize the limiting variance. Theorem~\ref{thm:mondrian_bias} complements this result by precisely calculating the limiting bias of the estimator, with the aim of subsequently applying a debiasing procedure. To enable valid feasible statistical inference, we provide a consistent variance estimator in Theorem~\ref{thm:mondrian_variance_estimation} and briefly discuss implications for lifetime parameter selection. In Section~\ref{sec:mondrian_overview_proofs} we provide a brief overview of the proofs of these first main results. We focus on the technical innovations and general strategic approach, giving some insight into the challenges involved, and refer the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs. In Section~\ref{sec:mondrian_debiased} we define debiased Mondrian random forests, a collection of estimators based on linear combinations of Mondrian random forests with varying lifetime parameters. These parameters are carefully chosen to annihilate leading terms in our bias characterization, yielding an estimator with provably superior bias properties (Theorem~\ref{thm:mondrian_bias_debiased}). In Theorem~\ref{thm:mondrian_clt_debiased} we verify that a central limit theorem continues to hold for the debiased Mondrian random forest. We again state the limiting variance, discuss the implications for the lifetime parameter, and provide a consistent variance estimator (Theorem~\ref{thm:mondrian_variance_estimation_debiased}) for constructing confidence intervals (Theorem~\ref{thm:mondrian_confidence_debiased}). As a final corollary of the improved bias properties, we demonstrate in Theorem~\ref{thm:mondrian_minimax} that the debiased Mondrian random forest estimator is minimax-optimal in pointwise mean squared error for all $\beta > 0$, provided that $\beta$ is known a priori. 
Section~\ref{sec:mondrian_parameter_selection} discusses tuning parameter selection, beginning with a data-driven approach to selecting the crucial lifetime parameter using polynomial estimation, alongside other practical suggestions including generalized cross-validation. We also give advice on choosing the number of trees, and other parameters associated with the debiasing procedure. In Section~\ref{sec:mondrian_weather} we present an illustrative example application of our proposed methodology for estimation and inference in the setting of weather forecasting in Australia. We demonstrate the use of our debiased Mondrian random forest estimator and our generalized cross-validation procedure for lifetime parameter selection, as well as the construction of point estimates and confidence intervals. Concluding remarks are given in Section~\ref{sec:mondrian_conclusion}, while Appendix~\ref{app:mondrian} contains all the mathematical proofs of our theoretical contributions, along with some other technical probabilistic results on the Mondrian process which may be of interest. \subsection{Notation} We write $\|\cdot\|_2$ for the usual Euclidean $\ell^2$-norm on $\R^d$. The natural numbers are $\N = \{0, 1, 2, \ldots \}$. We use $a \wedge b$ for the minimum and $a \vee b$ for the maximum of two real numbers. For a set $A$, we use $A^{\comp}$ for the complement whenever the background space is clear from context. We use $C$ to denote a positive constant whose value may change from line to line. For non-negative sequences $a_n$ and $b_n$, write $a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. For random non-negative sequences $A_n$ and $B_n$, similarly write $A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability, and $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. Convergence of random variables $X_n$ in distribution to a law $\P$ is denoted by $X_n \rightsquigarrow \P$. \section{Setup} \label{sec:mondrian_setup} When using a Mondrian random forest, there are two sources of randomness. The first is of course the data, and here we consider the nonparametric regression setting with $d$-dimensional covariates. The second source is a collection of independent trees drawn from a Mondrian process, which we define in the subsequent section, using a specified lifetime parameter. \subsection{The Mondrian process} \label{sec:mondrian_process} The Mondrian process was introduced by \citet{roy2008mondrian} and offers a canonical method for generating random rectangular partitions, which can be used as the trees for a random forest \citep{lakshminarayanan2014mondrian,lakshminarayanan2016mondrian}. For the reader's convenience, we give a brief description of this process here; see \citet[Section~3]{mourtada2020minimax} for a more complete definition. For a fixed dimension $d$ and lifetime parameter $\lambda > 0$, the Mondrian process is a stochastic process taking values in the set of finite rectangular partitions of $[0,1]^d$. For a rectangle $D = \prod_{j=1}^d [a_j, b_j] \subseteq [0,1]^d$, we denote the side aligned with dimension $j$ by $D_j = [a_j, b_j]$, write $D_j^- = a_j$ and $D_j^+ = b_j$ for its left and right endpoints respectively, and use $|D_j| = D_j^+ - D_j^-$ for its length. 
The volume of $D$ is $|D| = \prod_{j=1}^{d} |D_j|$ and its linear dimension (or half-perimeter) is $|D|_1 = \sum_{j=1}^{d} |D_j|$. To sample a partition $T$ from the Mondrian process $\cM \big( [0,1]^d, \lambda \big)$ we start at time $t=0$ with the trivial partition of $[0,1]^d$ which has no splits. We then repeatedly apply the following procedure to each cell $D$ in the partition. Let $t_D$ be the time at which the cell was formed, and sample $E_D \sim \Exp \left( |D|_1 \right)$. If $t_D + E_D \leq \lambda$, then we split $D$. This is done by first selecting a split dimension $J$ with $\P(J=j) = |D_j| / |D|_1$, and then sampling a split location $S_J \sim \Unif\big[D_J^-, D_J^+\big]$. The cell $D$ splits into the two new cells $\{x \in D : x_J \leq S_J\}$ and $\{x \in D : x_J > S_J\}$, each with formation time $t_D + E_D$. The final outcome is the partition $T$ consisting of the cells $D$ which were not split because $t_D + E_D > \lambda$. The cell in $T$ containing a point $x \in [0,1]^d$ is written $T(x)$. Figure~\ref{fig:mondrian_process} shows typical realizations of $T \sim \cM\big( [0,1]^d, \lambda \big)$ for $d=2$ and with different lifetime parameters $\lambda$. % \begin{figure}[t] \centering % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/plot_mondrian_process_1.pdf} \caption{$\lambda = 3$} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/plot_mondrian_process_2.pdf} \caption{$\lambda = 10$} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/plot_mondrian_process_3.pdf} \caption{$\lambda = 30$} \end{subfigure} % \caption[The Mondrian process]{ The Mondrian process $T \sim \cM \big( [0,1]^d, \lambda \big)$ with $d=2$ and lifetime parameters $\lambda$.} \label{fig:mondrian_process} \end{figure} \subsection{Data generation} Throughout this chapter, we assume that the data satisfies Assumption~\ref{ass:mondrian_data}. We begin with a definition of H{\"o}lder continuity which will be used for controlling the bias of various estimators. \begin{definition}[H{\"o}lder continuity]% Take $\beta > 0$ and define $\flbeta$ to be the largest integer which is strictly less than $\beta$. We say a function $g: [0,1]^d \to \R$ is $\beta$-H{\"o}lder continuous and write $g \in \cH^\beta$ if $g$ is $\flbeta$ times differentiable and $\max_{|\nu| = \flbeta} \left| \partial^\nu g(x) - \partial^{\nu} g(x') \right| \leq C \|x-x'\|_2^{\beta - \flbeta}$ for some constant $C > 0$ and all $x, x' \in [0,1]^d$. Here, $\nu \in \N^d$ is a multi-index with $|\nu| = \sum_{j=1}^d \nu_j$ and $\partial^{\nu} g(x) = \partial^{|\nu|} g(x) \big/ \prod_{j=1}^d \partial x_j^{\nu_j}$. We say $g$ is Lipschitz if $g \in \cH^1$. \end{definition} \begin{assumption}[Data generation]% \label{ass:mondrian_data} Fix $d \geq 1$ and let $(X_i, Y_i)$ be i.i.d.\ samples from a distribution on $\R^d \times \R$, writing $\bX = (X_1, \ldots, X_n)$ and $\bY = (Y_1, \ldots, Y_n)$. Suppose $X_i$ has a Lebesgue density function $f(x)$ on $[0,1]^d$ which is bounded away from zero and satisfies $f \in \cH^\beta$ for some $\beta \geq 1$. Suppose $\E[Y_i^2 \mid X_i]$ is bounded, let $\mu(X_i) = \E[Y_i \mid X_i]$, and assume $\mu \in \cH^\beta$. Write $\varepsilon_i = Y_i - \mu(X_i)$ and assume $\sigma^2(X_i) = \E[\varepsilon_i^2 \mid X_i]$ is Lipschitz and bounded away from zero. \end{assumption} Some comments are in order surrounding Assumption~\ref{ass:mondrian_data}. 
The requirement that the covariate density $f(x)$ be strictly positive on all of $[0,1]^d$ may seem strong, particularly when $d$ is moderately large. However, since our theory is presented pointwise in $x$, it is sufficient for this to hold only on some neighborhood of $x$. To see this, note that continuity implies the density is positive on some hypercube containing $x$. Upon rescaling the covariates, we can map this hypercube onto $[0,1]^d$. The same argument of course holds for the H{\"o}lder smoothness assumptions and the upper and lower bounds on the conditional variance function. \subsection{Mondrian random forests} \label{sec:mondrian_forests} We define the basic Mondrian random forest estimator \eqref{eq:mondrian_estimator} as in \citet{lakshminarayanan2014mondrian} and \citet{mourtada2020minimax}, and will later extend it to a debiased version in Section~\ref{sec:mondrian_debiased}. For a lifetime parameter $\lambda > 0$ and forest size $B \geq 1$, let $\bT = (T_1, \ldots, T_B)$ be a Mondrian forest where $T_b \sim \cM\big([0,1]^d, \lambda\big)$ are i.i.d.\ Mondrian trees which are independent of the data. For $x \in [0,1]^d$, write $N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ for the number of samples in $T_b(x)$, with $\I$ denoting an indicator function. Then the Mondrian random forest estimator of $\mu(x)$ is % \begin{equation} \label{eq:mondrian_estimator} \hat\mu(x) = \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. \end{equation} % If there are no samples $X_i$ in $T_b(x)$ then $N_b(x) = 0$, so we define $0/0 = 0$ (see Section~\ref{sec:mondrian_app_proofs} for details). To ensure the bias and variance of the Mondrian random forest estimator converge to zero (see Section~\ref{sec:mondrian_inference}), and to avoid boundary issues, we impose some basic conditions on $x$, $\lambda$, and $B$ in Assumption~\ref{ass:mondrian_estimator}. \begin{assumption}[Mondrian random forest estimator]% \label{ass:mondrian_estimator} % Suppose $x \in (0,1)^d$ is an interior point of the support of $X_i$, $\frac{\lambda^d}{n} \to 0$, $\log \lambda \asymp \log n$, and $B \asymp n^{\xi}$ for some $\xi \in (0, 1)$, which may depend on the dimension $d$ and smoothness $\beta$. % \end{assumption} Assumption~\ref{ass:mondrian_estimator} implies that the size of the forest $B$ grows with $n$. For the purpose of mitigating the computational burden, we suggest the sub-linear polynomial growth $B \asymp n^{\xi}$, satisfying the conditions imposed in our main results. Large forests usually do not present computational challenges in practice as the ensemble estimator is easily parallelizable over the trees. We emphasize places where this ``large forest'' condition is important to our theory as they arise throughout the chapter. \section{Inference with Mondrian random forests}% \label{sec:mondrian_inference} We begin with a bias--variance decomposition for the Mondrian random forest estimator: % \begin{align} \nonumber \hat\mu(x) - \mu(x) &= \Big( \hat\mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big]\Big) + \Big( \E \big[ \hat \mu(x) \mid \bX, \bT \big] - \mu(x)\Big) \\ &= \nonumber \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n \varepsilon_i \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)} \\ \label{eq:mondrian_bias_variance} &\quad+ \frac{1}{B} \sum_{b=1}^B \frac{\sum_{i=1}^n \big(\mu(X_i) - \mu(x)\big) \, \I\big\{ X_i \in T_b(x) \big\}} {N_b(x)}. \end{align} % Our approach to inference is summarized as follows. 
Firstly, we provide a central limit theorem (weak convergence to a Gaussian) for the first ``variance'' term in \eqref{eq:mondrian_bias_variance}. Secondly, we precisely compute the probability limit of the second ``bias'' term. By ensuring that the standard deviation dominates the bias, a corresponding central limit theorem holds for the Mondrian random forest. With an appropriate estimator for the limiting variance, we establish procedures for valid and feasible statistical inference on the unknown regression function $\mu(x)$.

We begin with the aforementioned central limit theorem, which forms the core of our methodology for performing statistical inference. Before stating our main result, we highlight some of the challenges involved. At first glance, the summands in the first term in \eqref{eq:mondrian_bias_variance} seem to be independent over $1 \leq i \leq n$, conditional on the forest $\bT$, depending only on $X_i$ and $\varepsilon_i$. However, the $N_b(x)$ appearing in the denominator depends on all $X_i$ simultaneously, violating this independence assumption and rendering classical central limit theorems inapplicable. A natural preliminary attempt to resolve this issue is to observe that
%
\begin{equation*}
N_b(x)
= \sum_{i=1}^{n} \I\big\{X_i \in T_b(x)\big\}
\approx n \, \P \big( X_i \in T_b(x) \mid T_b \big)
\approx n f(x) |T_b(x)|
\end{equation*}
%
with high probability. One could attempt to use this by approximating the estimator with an average of i.i.d.\ random variables, or by employing a central limit theorem conditional on $\bX$ and $\bT$. However, such an approach fails because $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$; the possible existence of small cells causes the law of the inverse cell volume to have heavy tails. For similar reasons, attempts to directly establish a central limit theorem based on $2 + \delta$ moments, such as the Lyapunov central limit theorem, are ineffective.

We circumvent these problems by directly analyzing $\frac{\I\{N_b(x) \geq 1\}}{N_b(x)}$. We establish concentration properties for this non-linear function of $X_i$ via the Efron--Stein inequality \citep[Section 3.1]{boucheron2013concentration} along with a sequence of somewhat delicate preliminary lemmas regarding inverse moments of truncated (conditional) binomial random variables. In particular, we show that $\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)} \right] \lesssim \frac{\lambda^d}{n}$ and $\E \left[ \frac{\I \{N_b(x) \geq 1\}}{N_b(x)^2} \right] \lesssim \frac{\lambda^{2d} \log n}{n^2}$. Asymptotic normality is then established using a central limit theorem for martingale difference sequences \citep[Theorem~3.2]{hall1980martingale} with respect to an appropriate filtration. Section~\ref{sec:mondrian_overview_proofs} gives an overview of our proof strategy, in which we further discuss the underlying challenges, while Section~\ref{sec:mondrian_app_proofs} gives all the technical details.

\subsection{Central limit theorem}
\label{sec:mondrian_clt}

Theorem~\ref{thm:mondrian_clt} gives our first main result.

\begin{theorem}[Central limit theorem for the centered Mondrian random forest estimator]%
\label{thm:mondrian_clt}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, and further assume that $\E[Y_i^4 \mid X_i ]$ is bounded almost surely and $\frac{\lambda^d \log n}{n} \to 0$.
Then
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}}
\Big( \hat \mu(x) - \E \big[ \hat \mu(x) \mid \bX, \bT \big] \Big)
&\rightsquigarrow \cN\big(0, \Sigma(x)\big)
& &\text{where}
&\Sigma(x)
&= \frac{\sigma^2(x)}{f(x)}
\left( \frac{4 - 4 \log 2}{3} \right)^d.
\end{align*}
\end{theorem}

The condition $B \to \infty$ is crucial, ensuring sufficient ``mixing'' of different Mondrian cells to escape the heavy-tailed phenomenon detailed in the preceding discussion. For concreteness, the large forest condition allows us to deal with expressions such as $\E \left[ \frac{1}{|T_b(x)| |T_{b'}(x)|} \right] = \E \left[ \frac{1}{|T_b(x)|} \right] \E \left[ \frac{1}{|T_{b'}(x)|} \right] \approx \lambda^{2d} < \infty$ where $b \neq b'$, by independence of the trees, rather than the ``no ensembling'' single tree analog $\E \left[ \frac{1}{|T_b(x)|^2} \right] = \infty$.

We take this opportunity to contrast Mondrian random forests with more classical kernel-based smoothing methods. The lifetime $\lambda$ plays a similar role to the inverse bandwidth in determining the effective sample size $n / \lambda^d$, and thus the associated rate of convergence. However, due to the Mondrian process construction, some cells are typically ``too small'' (equivalent to an insufficiently large bandwidth) to give an appropriate effective sample size. Similarly, classical methods based on non-random partitioning such as spline estimators \citep{huang2003local,cattaneo2020large} typically impose a quasi-uniformity assumption to ensure all the cells are of comparable size, a property which does not hold for the Mondrian process (not even with probability approaching one).

\subsection*{Bias characterization}

We turn to the second term in \eqref{eq:mondrian_bias_variance}, which captures the bias of the Mondrian random forest estimator conditional on the covariates $\bX$ and the forest $\bT$. As such, it is a random quantity which, as we will demonstrate, converges in probability. We precisely characterize the limiting non-random bias, including high-degree polynomials in $\lambda$ which for now may seem ignorable. Indeed, the magnitude of the bias is determined by its leading term, typically of order $1/\lambda^2$ whenever $\beta \geq 2$, and this suffices for ensuring a negligible contribution from the bias with an appropriate choice of lifetime parameter. However, the advantage of specifying higher-order bias terms is made apparent in Section~\ref{sec:mondrian_debiased} when we construct a debiased Mondrian random forest estimator. There, we target and annihilate the higher-order terms in order to furnish superior estimation and inference properties. Theorem~\ref{thm:mondrian_bias} gives our main result on the bias of the Mondrian random forest estimator.

\begin{theorem}[Bias of the Mondrian random forest estimator]%
\label{thm:mondrian_bias}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Then for each $1 \leq r \leq \lfloor \flbeta / 2 \rfloor$ there exists $B_r(x) \in \R$, which is a function only of the derivatives of $f$ and $\mu$ at $x$ up to order $2r$, with
%
\begin{equation*}
\E \left[ \hat \mu(x) \mid \bX, \bT \right]
= \mu(x)
+ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{\lambda^{2r}}
+ O_\P \left( \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B}
+ \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right).
\end{equation*} % Whenever $\beta > 2$ the leading bias is the quadratic term % \begin{equation*} \frac{B_1(x)}{\lambda^2} = \frac{1}{2 \lambda^2} \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} + \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}. \end{equation*} % If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$, and using multi-index notation we have % \begin{equation*} \frac{B_r(x)}{\lambda^{2r}} = \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) \prod_{j=1}^d \frac{1}{\nu_j + 1}. \end{equation*} % \end{theorem} In Theorem~\ref{thm:mondrian_bias} we give some explicit examples of calculating the limiting bias if $\beta > 2$ or when $X_i$ are uniformly distributed. The general form of $B_r(x)$ is provided in Section~\ref{sec:mondrian_app_proofs} but is somewhat unwieldy except in specific situations. Nonetheless the most important properties are that $B_r(x)$ are non-random and do not depend on the lifetime $\lambda$, crucial facts for our debiasing procedure given in Section~\ref{sec:mondrian_debiased}. If the forest size $B$ does not diverge to infinity then we suffer the first-order bias term $\frac{1}{\lambda \sqrt B}$. This phenomenon was explained by \citet{mourtada2020minimax}, who noted that it allows single Mondrian trees to achieve minimax optimality only when $\beta \in (0, 1]$. Large forests remove this first-order bias and are optimal for all $\beta \in (0, 2]$. Using Theorem~\ref{thm:mondrian_clt} and Theorem~\ref{thm:mondrian_bias} together, along with an appropriate choice of lifetime parameter $\lambda$, gives a central limit theorem for the Mondrian random forest estimator which can be used, for example, to build confidence intervals for the unknown regression function $\mu(x)$ whenever the bias shrinks faster than the standard deviation. In general this will require $\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \ll \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by imposing the restrictions $\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gg n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$ on the lifetime $\lambda$ and forest size $B$. If instead we aim for optimal point estimation, then balancing the bias and standard deviation requires $\frac{1}{\lambda^2} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \asymp \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by $\lambda \asymp n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gtrsim n^{\frac{2(2 \wedge \beta) - 2}{d + 2(2 \wedge \beta)}}$. Such a choice of $\lambda$ gives the convergence rate $n^{\frac{-(2 \wedge \beta)}{d + 2(2 \wedge \beta)}}$ which is the minimax-optimal rate of convergence \citep{stone1982optimal} for $\beta$-H{\"o}lder functions with $\beta \in (0,2]$ as shown by \citet[Theorem~2]{mourtada2020minimax}. In Section~\ref{sec:mondrian_debiased} we will show how the Mondrian random forest estimator can be debiased, giving both weaker lifetime conditions for inference and also improved rates of convergence, under additional smoothness assumptions. \subsection*{Variance estimation} The limiting variance $\Sigma(x)$ from the resulting central limit theorem depends on the unknown quantities $\sigma^2(x)$ and $f(x)$. To conduct feasible inference, we must therefore first estimate $\Sigma(x)$. 
To this end, define % \begin{align} \label{eq:mondrian_sigma2_hat} \hat\sigma^2(x) &= \frac{1}{B} \sum_{b=1}^{B} \sum_{i=1}^n \frac{\big(Y_i - \hat \mu(x)\big)^2 \, \I\{X_i \in T_b(x)\}} {N_b(x)}, \\ \nonumber \hat\Sigma(x) &= \hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n \left( \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_b(x)\}}{N_b(x)} \right)^2. \end{align} % In Theorem~\ref{thm:mondrian_variance_estimation} we show that this estimator is consistent, and establish its rate of convergence. % \begin{theorem}[Variance estimation]% \label{thm:mondrian_variance_estimation} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then % \begin{align*} \hat\Sigma(x) = \Sigma(x) + O_\P \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right). \end{align*} \end{theorem} \subsection{Confidence intervals} Theorem~\ref{thm:mondrian_confidence} shows how to construct valid confidence intervals for the regression function $\mu(x)$ under the lifetime and forest size assumptions previously discussed. For details on feasible and practical selection of the lifetime parameter $\lambda$, see Section~\ref{sec:mondrian_parameter_selection}. % \begin{theorem}[Feasible confidence intervals using a Mondrian random forest]% \label{thm:mondrian_confidence} % Suppose that Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded almost surely, and $\frac{\lambda^d \log n}{n} \to 0$. Assume that $\lambda \gg n^{\frac{1}{d + 2(2 \wedge \beta)}}$ and $B \gg n^{\frac{2 (2 \wedge \beta) - 2}{d + 2 (2 \wedge \beta)}}$. For a confidence level $\alpha \in (0, 1)$, let $q_{1 - \alpha / 2}$ be the normal quantile satisfying $\P \left( \cN(0, 1) \leq q_{1 - \alpha / 2} \right) = 1 - \alpha / 2$. Then % \begin{align*} \P \left( \mu(x) \in \left[ \hat \mu(x) - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} q_{1 - \alpha / 2}, \ \hat \mu(x) + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma(x)^{1/2} q_{1 - \alpha / 2} \right] \right) \to 1 - \alpha. \end{align*} \end{theorem} When coupled with an appropriate lifetime selection method, Theorem~\ref{thm:mondrian_confidence} gives a fully feasible procedure for uncertainty quantification in Mondrian random forests. Our procedure requires no adjustment of the original Mondrian random forest estimator beyond ensuring that the bias is negligible, and in particular does not rely on sample splitting. The construction of confidence intervals is just one corollary of the weak convergence result given in Theorem~\ref{thm:mondrian_clt}, and follows immediately from Slutsky's theorem \citep[Chapter~7]{pollard2002user} with a consistent variance estimator. Other applications include hypothesis testing on the value of $\mu(x)$ at a design point $x$ by inversion of the confidence interval, as well as parametric specification testing by comparison with a $\sqrt{n}$-consistent parametric regression estimator. The construction of simultaneous confidence intervals for finitely many points $x_1, \ldots, x_D$ can be accomplished either using standard multiple testing corrections or by first establishing a multivariate central limit theorem using the Cram{\'e}r--Wold device \citep[Chapter~8]{pollard2002user} and formulating a consistent multivariate variance estimator. 
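To make this fully concrete, we close the section with a minimal sketch, in Python using only \texttt{numpy} and the standard library, of the entire inference procedure: the point estimator \eqref{eq:mondrian_estimator}, the variance estimator $\hat\Sigma(x)$ from \eqref{eq:mondrian_sigma2_hat}, and the confidence interval of Theorem~\ref{thm:mondrian_confidence}. All function names and the synthetic data below are ours and purely illustrative. Since every quantity involved depends on each tree $T_b$ only through the cell $T_b(x)$, the sketch samples cells directly from the exact cell-shape distribution recalled in Section~\ref{sec:mondrian_overview_proofs}, rather than growing full partitions.
%
\begin{verbatim}
import numpy as np
from statistics import NormalDist

def mondrian_cell(x, lam, rng):
    # Exact distribution of the cell T(x) containing x: in each dimension
    # the cell extends E1/lam to the left of x and E2/lam to the right,
    # truncated at the boundary, with E1, E2 independent Exp(1) variables.
    e1 = rng.exponential(size=x.size)
    e2 = rng.exponential(size=x.size)
    return np.maximum(x - e1 / lam, 0.0), np.minimum(x + e2 / lam, 1.0)

def mondrian_forest_ci(X, Y, x, lam, B, alpha=0.05, seed=0):
    # Point estimate, variance estimate and (1 - alpha) confidence interval.
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w = np.zeros(n)  # w[i] = (1/B) sum_b 1{X_i in T_b(x)} / N_b(x)
    for _ in range(B):
        lo, hi = mondrian_cell(x, lam, rng)
        in_cell = np.all((X >= lo) & (X <= hi), axis=1)
        N = in_cell.sum()
        if N > 0:  # empty cells contribute zero by the 0/0 = 0 convention
            w[in_cell] += 1.0 / (N * B)
    mu_hat = w @ Y                      # forest estimate at x
    sigma2_hat = w @ (Y - mu_hat) ** 2  # conditional variance estimate
    Sigma_hat = sigma2_hat * (n / lam ** d) * np.sum(w ** 2)
    half = np.sqrt(lam ** d / n * Sigma_hat) * NormalDist().inv_cdf(1 - alpha / 2)
    return mu_hat, (mu_hat - half, mu_hat + half)

# Synthetic example: mu(x) = sin(2 pi x_1) + x_2 with Gaussian noise.
rng = np.random.default_rng(1)
X = rng.uniform(size=(5000, 2))
Y = np.sin(2 * np.pi * X[:, 0]) + X[:, 1] + rng.normal(scale=0.5, size=5000)
print(mondrian_forest_ci(X, Y, np.array([0.5, 0.5]), lam=10.0, B=500))
\end{verbatim}
%
The identities $\hat\mu(x) = \sum_{i=1}^n w_i Y_i$ and $\hat\sigma^2(x) = \sum_{i=1}^n w_i \big( Y_i - \hat\mu(x) \big)^2$ with $w_i = \frac{1}{B} \sum_{b=1}^B \I\{X_i \in T_b(x)\} / N_b(x)$, used in the sketch, follow from \eqref{eq:mondrian_estimator} and \eqref{eq:mondrian_sigma2_hat} by exchanging the sums over $i$ and $b$.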
\section{Overview of proof strategy}%
\label{sec:mondrian_overview_proofs}

This section provides some insight into the general approach we use to establish the main results in the preceding sections. We focus on the technical innovations forming the core of our arguments, and refer the reader to Section~\ref{sec:mondrian_app_proofs} for detailed proofs, including those for the debiased estimator discussed in the upcoming Section~\ref{sec:mondrian_debiased}.

\subsection*{Preliminary results}

The starting point for our proofs is a characterization of the exact distribution of the shape of a Mondrian cell $T(x)$. This property is a direct consequence of the fact that the restriction of a Mondrian process to a subcell remains Mondrian \citep[Fact~2]{mourtada2020minimax}. We have
%
\begin{align*}
|T(x)_j|
&= \left( \frac{E_{j1}}{\lambda} \wedge x_j \right)
+ \left( \frac{E_{j2}}{\lambda} \wedge (1-x_j) \right)
\end{align*}
%
for all $1 \leq j \leq d$, recalling that $T(x)_j$ is the side of the cell $T(x)$ aligned with axis $j$, and where $E_{j1}$ and $E_{j2}$ are mutually independent $\Exp(1)$ random variables. Our assumptions that $x \in (0,1)^d$ and $\lambda \to \infty$ make the boundary terms $x_j$ and $1-x_j$ eventually ignorable so
%
\begin{align*}
|T(x)_j|
&= \frac{E_{j1} + E_{j2}}{\lambda}
\end{align*}
%
with high probability. Controlling the size of the largest cell in the forest containing $x$ is now straightforward with a union bound, exploiting the sharp tail decay of the exponential distribution, and thus
%
\begin{align*}
\max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j|
\lesssim_\P \frac{\log B}{\lambda}.
\end{align*}
%
This shows that up to logarithmic terms, none of the cells in the forest at $x$ are significantly larger than average, ensuring that the Mondrian random forest estimator is localized around $x$ on the scale of $1/\lambda$, an important property for the upcoming bias characterization.

Having provided upper bounds for the sizes of Mondrian cells, we must also establish some lower bounds in order to quantify the ``small cell'' phenomenon mentioned previously. The first step towards this is to bound the first two moments of the truncated inverse Mondrian cell volume; we show that
%
\begin{align*}
\E\left[ 1 \wedge \frac{1}{n |T(x)|} \right]
&\asymp \frac{\lambda^d}{n}
&&\text{and}
&\frac{\lambda^{2d}}{n^2}
&\lesssim \E\left[ 1 \wedge \frac{1}{n^2 |T(x)|^2} \right]
\lesssim \frac{\lambda^{2d} \log n}{n^2}.
\end{align*}
%
These bounds are computed directly using the exact distribution of $|T(x)|$. Note that $\E\left[ \frac{1}{|T(x)|^2} \right] = \infty$ because $\frac{1}{E_{j1} + E_{j2}}$ has only $2 - \delta$ finite moments, so the truncation is crucial here. Since we nearly have two moments, this truncation is at the expense of only a logarithmic term. Nonetheless, third and higher truncated moments will not enjoy such tight bounds, demonstrating both the fragility of this result and the inadequacy of tools such as the Lyapunov central limit theorem which require $2 + \delta$ moments.
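These truncated moment bounds are easy to corroborate numerically. The short sketch below (Python with \texttt{numpy}, as in the earlier sketch; the helper name is ours) samples the limiting cell volume $|T(x)| = \prod_{j=1}^{d} (E_{j1} + E_{j2}) / \lambda$ at an interior point, ignoring the boundary truncation, and compares a Monte Carlo estimate of $\E\left[ 1 \wedge \frac{1}{n |T(x)|} \right]$ with the predicted order $\lambda^d / n$; the ratio of the two printed quantities should remain roughly stable as $\lambda$ varies.
%
\begin{verbatim}
import numpy as np

def truncated_inverse_moment(n, lam, d, reps=200000, seed=0):
    # |T(x)| = prod_j (E_j1 + E_j2) / lambda at an interior point, ignoring
    # boundary truncation; estimate E[min(1, 1/(n |T(x)|))] by Monte Carlo.
    rng = np.random.default_rng(seed)
    sides = rng.exponential(size=(reps, d, 2)).sum(axis=2) / lam
    vol = np.prod(sides, axis=1)
    return np.mean(np.minimum(1.0, 1.0 / (n * vol)))

n, d = 100000, 2
for lam in (5.0, 10.0, 20.0):
    est = truncated_inverse_moment(n, lam, d)
    print(f"lam = {lam:4.0f}: estimate {est:.2e}, lam^d / n = {lam**d / n:.2e}")
\end{verbatim}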
To conclude this investigation into the small cell phenomenon, we apply the previous bounds to ensure that the empirical effective sample sizes $N_b(x) = \sum_{i=1}^{n} \I \left\{ X_i \in T_b(x) \right\}$ are approximately of the order $n / \lambda^d$ in an appropriate sense; we demonstrate that % \begin{align*} \E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \right] &\lesssim \frac{\lambda^d}{n} &&\text{and} &\E\left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)^2} \right] &\lesssim \frac{\lambda^{2d} \log n}{n^2}, \end{align*} % as well as similar bounds for mixed terms such as % $\E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \frac{\I\{N_{b'}(x) \geq 1\}}{N_{b'}(x)} \right] \lesssim \frac{\lambda^{2d}}{n^2}$ % when $b \neq b'$, which arise from covariance terms across multiple trees. The proof of this result is involved and technical, and proceeds by induction. The idea is to construct a class of subcells by taking all possible intersections of the cells in $T_b$ and $T_{b'}$ (we show two trees here for clarity; there may be more) and noting that each $N_b(x)$ is the sum of the number of points in each such refined cell intersected with $T_b(x)$. We then swap out each refined cell one at a time and replace the number of data points it contains with its volume multiplied by $n f(x)$, showing that the expectation on the left hand side does not increase too much using a moment bound for inverse binomial random variables based on Bernstein's inequality. By induction and independence of the trees, eventually the problem is reduced to computing moments of truncated inverse Mondrian cell volumes, as above. \subsection*{Central limit theorem} To prove our main central limit theorem result (Theorem~\ref{thm:mondrian_clt}), we use the martingale central limit theorem given by \citet[Theorem~3.2]{hall1980martingale}. For each $1 \leq i \leq n$ define $\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and $(\varepsilon_j : 1 \leq j \leq i)$, noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases as $n$ increases. Define the $\cH_{n i}$-measurable and square integrable variables % \begin{align*} S_i(x) &= \sqrt{\frac{n}{\lambda^d}} \frac{1}{B} \sum_{b=1}^B \frac{\I \{X_i \in T_b(x)\} \varepsilon_i} {N_{b}(x)}, \end{align*} % which satisfy the martingale difference property $\E [ S_i(x) \mid \cH_{n i} ] = 0$. Further, % \begin{align*} \sqrt{\frac{n}{\lambda^d}} \big( \hat\mu(x) - \E\left[ \hat\mu(x) \mid \bX, \bT \right] \big) = \sum_{i=1}^n S_i(x). \end{align*} % To establish weak convergence to $\cN\big(0, \Sigma(x)\big)$, it suffices to check that $\max_i |S_i(x)| \to 0$ in probability, $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and $\sum_i S_i(x)^2 \to \Sigma(x)$ in probability. Checking the first two of these is straightforward given the denominator moment bounds derived above. For the third condition, we demonstrate that $\sum_i S_i(x)^2$ concentrates by checking its variance is vanishing. To do this, first observe that $S_i(x)^2$ is the square of a sum over the $B$ trees. Expanding this square, we see that the diagonal terms (where $b = b'$) provide a negligible contribution due to the large forest assumption. For the other terms, we apply the law of total variance and the moment bounds detailed earlier. Here, it is crucial that $b \neq b'$ in order to exploit the independence of the trees and avoid having to control any higher moments. 
The law of total variance requires that we bound % \begin{align*} \Var \left[ \E \left[ \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} {N_{b}(x) N_{b'}(x)} \Bigm| \bX, \bY \right] \right], \end{align*} % which is the variance of a non-linear function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, and so we apply the Efron--Stein inequality. The important insight here is that replacing a sample $(X_i, \varepsilon_i)$ with an independent copy $(\tilde X_i, \tilde \varepsilon_i)$ can change the value of $N_b(x)$ by at most one. Further, this can happen only on the event $\{ X_i \in T_{b}(x) \} \cup \{ \tilde X_i \in T_{b}(x) \}$, which occurs with probability on the order $1/\lambda^d$ (the expected cell volume). The final part of the central limit theorem proof is to calculate the limiting variance $\Sigma(x)$. The penultimate step showed that we must have % \begin{align*} \Sigma(x) &= \lim_{n \to \infty} \sum_{i=1}^n \E \left[S_i(x)^2 \right] = \lim_{n \to \infty} \frac{n^2}{\lambda^d} \, \E \left[ \frac{\I\{X_i \in T_b(x) \cap T_{b'}(x)\} \varepsilon_i^2} {N_{b}(x) N_{b'}(x)} \right], \end{align*} % assuming the limit exists, so it remains to check this and calculate the limit. It is a straightforward but tedious exercise to verify that each term can be replaced with its conditional expectation given $T_b$ and $T_{b'}$, using some further properties of the binomial and exponential distributions. This yields % \begin{align*} \Sigma(x) &= \frac{\sigma^2(x)}{f(x)} \lim_{\lambda \to \infty} \frac{1}{\lambda^d} \E \left[ \frac{|T_{b}(x) \cap T_{b'}(x)|} {|T_{b}(x)| \, |T_{b'}(x)|} \right] = \frac{\sigma^2(x)}{f(x)} \E \left[ \frac{(E_{1} \wedge E'_{1}) + (E_{2} \wedge E'_{2})} {(E_{1} + E_{2}) (E'_{1} + E'_{2})} \right]^d \end{align*} % where $E_1$, $E_2$, $E'_1$, and $E'_2$ are independent $\Exp(1)$, by the cell shape distribution and independence of the trees. This final expectation is calculated by integration, using various incomplete gamma function identities. \subsection*{Bias characterization} Our second substantial technical result is the bias characterization given as Theorem~\ref{thm:mondrian_bias}, in which we precisely characterize the probability limit of the conditional bias % \begin{align*} \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x) &= \frac{1}{B} \sum_{b=1}^B \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I\{X_i \in T_b(x)\}}{N_b(x)}. \end{align*} % The first step is to pass to the ``infinite forest'' limit by taking an expectation conditional on $\bX$, or equivalently marginalizing over $\bT$, applying the conditional Markov inequality to see % \begin{align*} \big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \mid \bX \right] \big| &\lesssim_\P \frac{1}{\lambda \sqrt B}. \end{align*} % While this may seem a crude approximation, it is already known that fixed-size Mondrian forests have suboptimal bias properties when compared to forests with a diverging number of trees. In fact, the error $\frac{1}{\lambda \sqrt B}$ exactly accounts for the first-order bias of individual Mondrian trees noted by \citet{mourtada2020minimax}. Next we show that $\E \left[ \hat \mu(x) \mid \bX \right]$ converges in probability to its expectation, again using the Efron--Stein theorem for this non-linear function of the i.i.d.\ variables $X_i$. 
The Lipschitz property of $\mu$ and the upper bound on the maximum cell size give $|\mu(X_i) - \mu(x)| \lesssim \max_{1 \leq j \leq d} |T_b(x)_j| \lesssim_\P \frac{\log B}{\lambda}$ whenever $X_i \in T_b(x)$, so we combine this with moment bounds for the denominator $N_b(x)$ to see
%
\begin{align*}
\left| \E \left[ \hat \mu(x) \mid \bX \right] - \E \left[ \hat \mu(x) \right] \right|
\lesssim_\P \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}}.
\end{align*}

The next step is to approximate the resulting non-random bias $\E \left[ \hat \mu(x) \right] - \mu(x)$ as a polynomial in $1/\lambda$. To this end, we first apply a concentration-type result for the binomial distribution to deduce that
%
\begin{align*}
\E \left[ \frac{\I\{N_b(x) \geq 1\}}{N_b(x)} \Bigm| \bT \right]
\approx \frac{1}{n \int_{T_b(x)} f(s) \diff s}
\end{align*}
%
in an appropriate sense, and hence, by conditioning on $\bT$ and $\bX$ without $X_i$, we write
%
\begin{align}
\label{eq:mondrian_bias_ratio}
\E \left[ \hat \mu(x) \right] - \mu(x)
&\approx \E \left[ \frac{\int_{T_b(x)} (\mu(s) - \mu(x)) f(s) \diff s}
{\int_{T_b(x)} f(s) \diff s} \right].
\end{align}
%
Next we apply the multivariate version of Taylor's theorem to the integrands in both the numerator and the denominator in \eqref{eq:mondrian_bias_ratio}, and then apply the Maclaurin series of $\frac{1}{1+x}$ and the multinomial theorem to recover a single polynomial in $1/\lambda$. The error term is on the order of $1/\lambda^\beta$ and depends on the smoothness of $\mu$ and $f$, and the polynomial coefficients are given by various expectations involving exponential random variables. The final step is to verify using symmetry of Mondrian cells that all the odd monomial coefficients are zero, and to calculate some explicit examples of the form of the limiting bias.

\section{Debiased Mondrian random forests}%
\label{sec:mondrian_debiased}

In this section we give our next main contribution, proposing a variant of the Mondrian random forest estimator which corrects for higher-order bias with an approach based on generalized jackknifing \citep{schucany1977improvement}. This estimator retains the basic form of a Mondrian random forest estimator in the sense that it is a linear combination of Mondrian tree estimators, but in this section we allow for non-identical linear coefficients, some of which may be negative, and for differing lifetime parameters across the trees. Since the basic Mondrian random forest estimator is a special case of this more general debiased version, we will discuss only the latter throughout the rest of the chapter.

We use the explicit form of the bias given in Theorem~\ref{thm:mondrian_bias} to construct a debiased version of the Mondrian forest estimator. Let $J \geq 0$ be the bias correction order: with $J=0$ we retain the original Mondrian forest estimator, with $J=1$ we remove second-order bias, and with $J = \lfloor\flbeta / 2 \rfloor$ we remove bias terms up to and including order $2 \lfloor\flbeta / 2 \rfloor$, giving the maximum possible bias reduction achievable in the H{\"o}lder class $\cH^\beta$. Consequently, only bias terms of order $1/\lambda^\beta$ remain. For $0 \leq r \leq J$ let $\hat \mu_r(x)$ be a Mondrian forest estimator based on the trees $T_{r b} \sim \cM\big([0,1]^d, \lambda_r \big)$ for $1 \leq b \leq B$, where $\lambda_r = a_r \lambda$ for some $a_r > 0$ and $\lambda > 0$. Write $\bT$ to denote the collection of all the trees, and suppose they are mutually independent.
We find values of $a_r$ along with coefficients $\omega_r$ in order to annihilate the leading $J$ bias terms of the debiased Mondrian random forest estimator
%
\begin{align}
\label{eq:mondrian_debiased}
\hat \mu_\rd(x)
&= \sum_{r=0}^J \omega_r \hat \mu_r(x)
= \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B
\frac{\sum_{i=1}^n Y_i \, \I\big\{ X_i \in T_{r b}(x) \big\}}
{N_{r b}(x)}.
\end{align}
%
This ensemble estimator retains the ``forest'' structure of the original estimators, but with varying lifetime parameters $\lambda_r$ and coefficients $\omega_r$. Thus by Theorem~\ref{thm:mondrian_bias} we want to solve
%
\begin{align*}
\sum_{r=0}^{J} \omega_r
\left( \mu(x) + \sum_{s=1}^{J} \frac{B_{s}(x)}{a_r^{2s} \lambda^{2s}} \right)
&= \mu(x)
\end{align*}
%
for all $\lambda$, or equivalently the system of linear equations $\sum_{r=0}^{J} \omega_r = 1$ and $\sum_{r=0}^{J} \omega_r a_r^{-2s} = 0$ for each $1 \leq s \leq J$. We solve these as follows. Define the $(J+1) \times (J+1)$ Vandermonde matrix $A_{r s} = a_{s-1}^{2-2r}$, so that its $r$th row encodes the equation with exponent $2r - 2$, and let $\omega = (\omega_0, \ldots, \omega_J)^\T \in \R^{J+1}$ and $e_0 = (1, 0, \ldots, 0)^\T \in \R^{J+1}$. Then a solution for the debiasing coefficients is given by $\omega = A^{-1} e_0$ whenever $A$ is non-singular. In practice we can take $a_r$ to be a fixed geometric or arithmetic sequence to ensure this is the case, appealing to the Vandermonde determinant formula: up to sign, $\det A = \prod_{0 \leq r < s \leq J} (a_r^{-2} - a_s^{-2}) \neq 0$ whenever the $a_r$ are distinct. For example, we could set $a_r = (1 + \gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. Because we assume $\beta$, and therefore the choice of $J$, do not depend on $n$, there is no need to quantify the invertibility of $A$ by, for example, bounding its eigenvalues away from zero as a function of $J$.

\subsection{Central limit theorem}

In Theorem~\ref{thm:mondrian_clt_debiased}, we verify that a central limit theorem holds for the debiased random forest estimator $\hat\mu_\rd(x)$ and give its limiting variance. The strategy and challenges associated with proving Theorem~\ref{thm:mondrian_clt_debiased} are identical to those discussed earlier surrounding Theorem~\ref{thm:mondrian_clt}. In fact in Section~\ref{sec:mondrian_app_proofs} we provide a direct proof only for Theorem~\ref{thm:mondrian_clt_debiased} and deduce Theorem~\ref{thm:mondrian_clt} as a special case.

\begin{theorem}[Central limit theorem for the debiased Mondrian random forest estimator]%
\label{thm:mondrian_clt_debiased}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded, and $\frac{\lambda^d \log n}{n} \to 0$. Then
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}}
\Big( \hat \mu_\rd(x) - \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] \Big)
&\rightsquigarrow \cN\big(0, \Sigma_\rd(x)\big)
\end{align*}
%
where, with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, the limiting variance is
%
\begin{align*}
\Sigma_\rd(x)
&= \frac{\sigma^2(x)}{f(x)}
\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'}
\left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*}
%
\end{theorem}

It is easy to verify that in the case of no debiasing we have $J=0$ and $a_0 = \omega_0 = 1$, yielding $\Sigma_\rd(x) = \Sigma(x)$, and recovering Theorem~\ref{thm:mondrian_clt}.
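For illustration, the following sketch (Python with \texttt{numpy}, continuing the conventions of the earlier sketches; the geometric scales $a_r = 1.05^r$ anticipate the suggestion made in Section~\ref{sec:mondrian_parameter_selection}) solves this linear system for the coefficients $\omega$ and evaluates the dimensionless variance constant $\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} ( \ell_{r r'} + \ell_{r' r} )^d$ appearing in Theorem~\ref{thm:mondrian_clt_debiased}, making the variance inflation incurred by debiasing explicit.
%
\begin{verbatim}
import numpy as np

def debias_weights(a):
    # Solve sum_r w_r = 1 and sum_r w_r * a_r^(-2s) = 0 for s = 1, ..., J:
    # row s of the Vandermonde matrix fixes the exponent, column r the scale.
    J = len(a) - 1
    A = np.array([[ar ** (-2 * s) for ar in a] for s in range(J + 1)])
    e0 = np.zeros(J + 1)
    e0[0] = 1.0
    return np.linalg.solve(A, e0)

def variance_constant(a, w, d):
    # Sigma_d(x) equals sigma^2(x) / f(x) multiplied by this constant.
    def ell(r, s):
        return 2 * a[r] / 3 * (1 - a[r] / a[s] * np.log(a[s] / a[r] + 1))
    J = len(a) - 1
    return sum(w[r] * w[s] * (ell(r, s) + ell(s, r)) ** d
               for r in range(J + 1) for s in range(J + 1))

d = 2
for J in range(3):
    a = 1.05 ** np.arange(J + 1)  # geometric scale sequence a_r = 1.05^r
    w = debias_weights(a)
    print(f"J = {J}: omega = {np.round(w, 2)},",
          f"variance constant = {variance_constant(a, w, d):.3f}")
\end{verbatim}
%
With $J = 0$ this recovers the constant $\left( \frac{4 - 4 \log 2}{3} \right)^d$ from Theorem~\ref{thm:mondrian_clt}, while for $J \geq 1$ the growth of the coefficients $\omega_r$ when the scales $a_r$ are close together quantifies the variance inflation discussed after Theorem~\ref{thm:mondrian_confidence_debiased} below.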
\subsection*{Bias characterization} In Theorem~\ref{thm:mondrian_bias_debiased} we verify that this debiasing procedure does indeed annihilate the desired bias terms, and its proof is a consequence of Theorem~\ref{thm:mondrian_bias} and the construction of the debiased Mondrian random forest estimator $\hat\mu_\rd(x)$. \begin{theorem}[Bias of the debiased Mondrian random forest estimator]% \label{thm:mondrian_bias_debiased} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}. In the notation of Theorem~\ref{thm:mondrian_bias} with $\bar\omega = \sum_{r=0}^J \omega_r a_r^{-2J - 2}$, % \begin{align*} \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] &= \mu(x) + \I\{2J+2 < \beta \} \frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}} \\ &\quad+ O_\P \left( \frac{1}{\lambda^{2J + 4}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} % \end{theorem} Theorem~\ref{thm:mondrian_bias_debiased} has the following consequence: the leading bias term is characterized in terms of $B_{J+1}(x)$ whenever $J < \beta/2 - 1$, or equivalently $J < \lfloor \flbeta/2 \rfloor$, that is, the debiasing order $J$ does not exhaust the H{\"o}lder smoothness $\beta$. If this condition does not hold, then the estimator is fully debiased, and the resulting leading bias term is bounded above by $1/\lambda^\beta$ up to constants, but its form is left unspecified. \subsection*{Variance estimation} As before, we propose a variance estimator in order to conduct feasible inference and show that it is consistent. With $\hat\sigma^2(x)$ as in \eqref{eq:mondrian_sigma2_hat} in Section~\ref{sec:mondrian_inference}, define the estimator % \begin{align} \label{eq:mondrian_debiased_variance_estimator} \hat\Sigma_\rd(x) &= \hat\sigma^2(x) \frac{n}{\lambda^d} \sum_{i=1}^n \left( \sum_{r=0}^J \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I\{X_i \in T_{r b}(x)\}} {N_{r b}(x)} \right)^2. \end{align} % \begin{theorem}[Variance estimation]% \label{thm:mondrian_variance_estimation_debiased} Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and suppose $\E[Y_i^4 \mid X_i ]$ is bounded almost surely. Then % \begin{align*} \hat\Sigma_\rd(x) = \Sigma_\rd(x) + O_\P \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right). \end{align*} % \end{theorem} \subsection{Confidence intervals} In analogy to Section~\ref{sec:mondrian_inference}, we now demonstrate the construction of feasible valid confidence intervals using the debiased Mondrian random forest estimator in Theorem~\ref{thm:mondrian_confidence_debiased}. Once again we must ensure that the bias (now significantly reduced due to our debiasing procedure) is negligible when compared to the standard deviation (which is of the same order as before). We assume for simplicity that the estimator has been fully debiased by setting $J \geq \lfloor \flbeta / 2\rfloor$ to yield a leading bias of order $1/\lambda^\beta$, but intermediate ``partially debiased'' versions can easily be provided, with leading bias terms of order $1/\lambda^{\beta \wedge (2J+2)}$ in general. We thus require $\frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \ll \sqrt{\frac{\lambda^d}{n}}$, which can be satisfied by imposing the restrictions $\lambda \gg n^{\frac{1}{d + 2 \beta}}$ and $B \gg n^{\frac{2\beta - 2}{d + 2\beta}}$ on the lifetime parameter $\lambda$ and forest size $B$. 
\begin{theorem}[Feasible confidence intervals using a debiased Mondrian random forest]%
\label{thm:mondrian_confidence_debiased}
%
Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, $\E[Y_i^4 \mid X_i ]$ is bounded, and $\frac{\lambda^d \log n}{n} \to 0$. Fix $J \geq \lfloor \flbeta / 2 \rfloor$ and assume that $\lambda \gg n^{\frac{1}{d + 2 \beta}}$ and $B \gg n^{\frac{2 \beta - 2}{d + 2 \beta}}$. For a confidence level $\alpha \in (0, 1)$, let $q_{1 - \alpha / 2}$ be as in Theorem~\ref{thm:mondrian_confidence}. Then
%
\begin{align*}
\P \left( \mu(x) \in \left[
\hat \mu_\rd(x) - \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} q_{1 - \alpha / 2},
\ \hat \mu_\rd(x) + \sqrt{\frac{\lambda^d}{n}} \hat \Sigma_\rd(x)^{1/2} q_{1 - \alpha / 2}
\right] \right)
\to 1 - \alpha.
\end{align*}
\end{theorem}

One important benefit of our debiasing technique is made clear in Theorem~\ref{thm:mondrian_confidence_debiased}: the restrictions imposed on the lifetime parameter $\lambda$ are substantially relaxed, especially in smooth classes with large $\beta$. As well as being of theoretical interest, these relaxed conditions are useful for the practical selection of appropriate lifetimes for estimation and inference respectively; see Section~\ref{sec:mondrian_parameter_selection} for more details. Nonetheless, such improvements do not come without concession. The limiting variance $\Sigma_\rd(x)$ of the debiased estimator is larger than that of the original (non-debiased) version, with the extent of this increase depending on the choice of the debiasing parameters $a_r$, leading to wider confidence intervals and larger estimation error in small samples despite the theoretical asymptotic improvements.

\subsection{Minimax optimality}

Our final result, Theorem~\ref{thm:mondrian_minimax}, shows that, when using an appropriate sequence of lifetime parameters $\lambda$, the debiased Mondrian random forest estimator achieves, up to constants, the minimax-optimal rate of convergence for estimating a regression function $\mu \in \cH^\beta$ in $d$ dimensions \citep{stone1982optimal}. This result holds for all $d \geq 1$ and all $\beta > 0$, complementing a previous result established only for $\beta \in (0, 2]$ by \citet{mourtada2020minimax}.
%
\begin{theorem}[Minimax optimality of the debiased Mondrian random forest estimator]%
\label{thm:mondrian_minimax}
Grant Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator}, and let $J \geq \lfloor \flbeta / 2 \rfloor$, $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$, and $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$. Then
%
\begin{align*}
\E \left[ \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \right]^{1/2}
&\lesssim \sqrt{\frac{\lambda^d}{n}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B}
\lesssim n^{-\frac{\beta}{d + 2 \beta}}.
\end{align*}
%
\end{theorem}

The sequence of lifetime parameters $\lambda$ required in Theorem~\ref{thm:mondrian_minimax} is chosen to balance the bias and standard deviation bounds implied by Theorem~\ref{thm:mondrian_bias_debiased} and Theorem~\ref{thm:mondrian_clt_debiased} respectively, in order to minimize the pointwise mean squared error. While selecting an optimal debiasing order $J$ requires only an upper bound on the smoothness $\beta$, choosing an optimal sequence of $\lambda$ values does assume that $\beta$ is known a priori.
The problem of adapting to $\beta$ from data is challenging and beyond the scope of this chapter; we provide some practical advice for tuning parameter selection in Section~\ref{sec:mondrian_parameter_selection}. Theorem~\ref{thm:mondrian_minimax} complements the minimaxity results proven by \citet{mourtada2020minimax} for Mondrian trees (with $\beta \leq 1$) and for Mondrian random forests (with $\beta \leq 2$), with one modification: our version is stated in pointwise rather than integrated mean squared error. This is because our debiasing procedure is designed to handle interior smoothing bias and so does not provide any correction for boundary bias. We leave the development of such boundary corrections to future work, but constructions similar to higher-order boundary-correcting kernels should be possible. If the region of integration is a compact set in the interior of $[0,1]^d$, then we do obtain an optimal integrated mean squared error bound: if $\delta \in (0, 1/2)$ is fixed then under the same conditions as Theorem~\ref{thm:mondrian_minimax}, % \begin{align*} \E \left[ \int_{[\delta, 1-\delta]^d} \big( \hat \mu_\rd(x) - \mu(x) \big)^2 \diff x \right]^{1/2} &\lesssim \sqrt{\frac{\lambda^d}{n}} + \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} \lesssim n^{-\frac{\beta}{d + 2 \beta}}. \end{align*} The debiased Mondrian random forest estimator defined in \eqref{eq:mondrian_debiased} is a linear combination of Mondrian random forests, and as such contains both a sum over $0 \leq r \leq J$, representing the debiasing procedure, and a sum over $1 \leq b \leq B$, representing the forest averaging. We have thus far been interpreting this estimator as a debiased version of the standard Mondrian random forest given in \eqref{eq:mondrian_estimator}, but it is equally valid to swap the order of these sums. This gives rise to an alternative point of view: we replace each Mondrian random tree with a ``debiased'' version, and then take a forest of such modified trees. This perspective is more in line with existing techniques for constructing randomized ensembles, where the outermost operation represents a $B$-fold average of randomized base learners, not necessarily locally constant decision trees, each of which has a small bias component \citep{caruana2004ensemble, zhou2019deep, friedberg2020local}. \section{Tuning parameter selection}% \label{sec:mondrian_parameter_selection} We discuss various procedures for selecting the parameters involved in fitting a debiased Mondrian random forest; namely the base lifetime parameter $\lambda$, the number of trees in each forest $B$, the bias correction order $J$, and the debiasing scale parameters $a_r$ for $0 \leq r \leq J$. \subsection{Selecting the base lifetime parameter \texorpdfstring{$\lambda$}{lambda}}% \label{sec:mondrian_lifetime_selection} The most important parameter is the base Mondrian lifetime parameter $\lambda$, which plays the role of a complexity parameter and thus governs the overall bias--variance trade-off of the estimator. Correct tuning of $\lambda$ is especially important in two main respects: % firstly, in order to use the central limit theorem established in Theorem~\ref{thm:mondrian_clt_debiased}, we must have that the bias converges to zero, requiring $\lambda \gg n^{\frac{1}{d + 2\beta}}$. % Secondly, the minimax optimality result of Theorem~\ref{thm:mondrian_minimax} is valid only in the regime $\lambda \asymp n^{\frac{1}{d + 2\beta}}$, and thus requires careful determination in the more realistic finite-sample setting. 
For clarity, in this section we use the notation $\hat\mu_\rd(x; \lambda, J)$ for the debiased Mondrian random forest with lifetime $\lambda$ and debiasing order $J$ as in \eqref{eq:mondrian_debiased}. Similarly, write $\hat\Sigma_\rd(x; \lambda, J)$ for the associated variance estimator given in \eqref{eq:mondrian_debiased_variance_estimator}.

For minimax-optimal point estimation when $\beta$ is known, choose any sequence $\lambda \asymp n^{\frac{1}{d + 2\beta}}$ and use $\hat\mu_\rd(x; \lambda, J)$ with $J = \lfloor \flbeta / 2 \rfloor$, following the theory given in Theorem~\ref{thm:mondrian_minimax}. For an explicit example of how to choose the lifetime, one can instead use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J-1\big)$ so that the leading bias is explicitly characterized by Theorem~\ref{thm:mondrian_bias_debiased}, and with $\hat\lambda_{\AIMSE}(J-1)$ as defined below. This is no longer minimax-optimal as $J-1 < J$ does not satisfy the conditions of Theorem~\ref{thm:mondrian_minimax}.

For performing inference, a more careful procedure is required; we suggest the following method assuming $\beta > 2$. Set $J = \lfloor \flbeta / 2 \rfloor$ as before, and use $\hat\mu_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ and $\hat\Sigma_\rd\big(x; \hat\lambda_{\AIMSE}(J-1), J\big)$ to construct a confidence interval. The reasoning for this is that we select a lifetime tailored to a more biased estimator than the one we actually use. This results in an inflated lifetime estimate, guaranteeing that the resulting bias is negligible when it is plugged into the fully debiased estimator. This approach to tuning parameter selection and debiasing for valid nonparametric inference corresponds to an application of robust bias correction \citep{calonico2018effect,calonico2022coverage}, where the point estimator is bias-corrected and the robust standard error estimator incorporates the additional sampling variability introduced by the bias correction. This leads to a more refined distributional approximation but does not necessarily exhaust the underlying smoothness of the regression function. An alternative inference approach based on Lepskii's method \citep{lepskii1992asymptotically,birge2001alternative} could be developed with the latter goal in mind.

It remains to propose a concrete method for computing $\hat\lambda_{\AIMSE}(J)$ in the finite-sample setting; we suggest two such procedures, based on plug-in selection with polynomial estimation and on cross-validation respectively, building on classical ideas from the nonparametric smoothing literature \citep{fan2020statistical}.

\subsubsection*{Lifetime selection with polynomial estimation}

Firstly, suppose $X_i \sim \Unif\big([0,1]^d\big)$ and that the leading bias of $\hat\mu_\rd(x)$ is well approximated by an additively separable function so that, writing $\partial^{2 J + 2}_j \mu(x)$ for $\partial^{2 J + 2} \mu(x) / \partial x_j^{2 J + 2}$,
%
\begin{align*}
\frac{\bar \omega B_{J+1}(x)}{\lambda^{2 J + 2}}
&\approx \frac{1}{\lambda^{2 J + 2}} \frac{\bar \omega}{J + 2}
\sum_{j=1}^d \partial^{2 J + 2}_j \mu(x).
\end{align*}
%
Now suppose the model is homoscedastic so $\sigma^2(x) = \sigma^2$ and the limiting variance of $\hat\mu_\rd$ is
%
\begin{align*}
\frac{\lambda^d}{n} \Sigma_\rd(x)
&= \frac{\lambda^d \sigma^2}{n}
\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'}
\left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*}
%
The asymptotic integrated mean squared error (AIMSE) is
%
\begin{align*}
\AIMSE(\lambda, J)
&= \frac{1}{\lambda^{4 J + 4}} \frac{\bar \omega^2}{(J + 2)^2}
\int_{[0,1]^d} \left( \sum_{j=1}^d \partial^{2 J + 2}_j \mu(x) \right)^2 \diff x \\
&\quad+ \frac{\lambda^d \sigma^2}{n}
\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'}
\left( \ell_{r r'} + \ell_{r' r} \right)^d.
\end{align*}
%
Minimizing over $\lambda > 0$ yields the AIMSE-optimal lifetime parameter
%
\begin{align*}
\lambda_{\AIMSE}(J)
&= \left( \frac{
\frac{(4 J + 4) \bar \omega^2}{(J + 2)^2} n
\int_{[0,1]^d} \left( \sum_{j=1}^d \partial^{2 J + 2}_j \mu(x) \right)^2 \diff x
}{
d \sigma^2 \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'}
\left( \ell_{r r'} + \ell_{r' r} \right)^d
} \right)^{\frac{1}{4 J + 4 + d}}.
\end{align*}
%
An estimator of $\lambda_{\AIMSE}(J)$ is therefore given by
%
\begin{align*}
\hat\lambda_{\AIMSE}(J)
&= \left( \frac{
\frac{(4 J + 4) \bar \omega^2}{(J + 2)^2}
\sum_{i=1}^n \left( \sum_{j=1}^d \partial^{2 J + 2}_j \hat\mu(X_i) \right)^2
}{
d \hat\sigma^2 \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'}
\left( \ell_{r r'} + \ell_{r' r} \right)^d
} \right)^{\frac{1}{4 J + 4 + d}}
\end{align*}
%
for some preliminary estimators $\partial^{2 J + 2}_j \hat\mu(x)$ and $\hat\sigma^2$. These can be obtained by fitting a global polynomial regression to the data of order $2 J + 4$ without interaction terms. To do this, define the $n \times ((2 J + 4)d + 1)$ design matrix $P$ with rows
%
\begin{align*}
P_i = \big( 1, X_{i1}, X_{i1}^2, \ldots, X_{i1}^{2 J + 4},
X_{i2}, X_{i2}^2, \ldots, X_{i2}^{2 J + 4}, \ldots,
X_{id}, X_{id}^2, \ldots, X_{id}^{2 J + 4} \big),
\end{align*}
%
and let
%
$P_x = \big( 1, x_{1}, x_{1}^2, \ldots, x_{1}^{2 J + 4},
x_{2}, x_{2}^2, \ldots, x_{2}^{2 J + 4}, \ldots,
x_{d}, x_{d}^2, \ldots, x_{d}^{2 J + 4} \big).$
%
Then we define the derivative estimator as
%
\begin{align*}
\partial^{2 J + 2}_j \hat\mu(x)
&= \partial^{2 J + 2}_j P_x \big( P^\T P \big)^{-1} P^\T \bY \\
&= (2J + 2)! \left( 0_{1 + (j-1)(2 J + 4) + (2J + 1)},
1, (2J + 3) x_j, \tfrac{(2J + 3)(2J + 4)}{2} x_j^2,
0_{(d-j)(2 J + 4)} \right) \big( P^\T P \big)^{-1} P^\T \bY,
\end{align*}
%
and the variance estimator $\hat\sigma^2$ is based on the residual sum of squared errors of this model:
%
\begin{align*}
\hat\sigma^2
&= \frac{1}{n - (2J + 4)d - 1}
\big( \bY^\T \bY - \bY^\T P \big( P^\T P \big)^{-1} P^\T \bY \big).
\end{align*}

\subsubsection*{Lifetime selection with cross-validation}

As an alternative to the analytic plug-in methods described above, one can use a cross-validation approach. While leave-one-out cross-validation (LOOCV) can be applied directly \citep{fan2020statistical}, the linear smoother structure of the (debiased) Mondrian random forest estimator allows a computationally simpler formulation. Writing $\hat\mu_\rd^{-i}(x)$ for a debiased Mondrian random forest estimator fitted without the $i$th data sample, it is easy to show that
%
\begin{align*}
\LOOCV(\lambda, J)
&= \frac{1}{n} \sum_{i=1}^{n}
\left( Y_i - \hat\mu_\rd^{-i}(X_i) \right)^2 \\
&= \frac{1}{n} \sum_{i=1}^{n}
\left( \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^{B}
\frac{1}{1 - 1/N_{r b}(X_i)}
\left( Y_i - \sum_{j=1}^{n}
\frac{ Y_j \I \left\{ X_j \in T_{r b}(X_i) \right\}}
{N_{r b}(X_i)} \right) \right)^{2},
\end{align*}
%
avoiding the need to refit the model with each sample left out in turn; a sketch implementing this shortcut is given below.
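As an illustration of this shortcut, the following sketch (Python with \texttt{numpy}, continuing the conventions of the earlier sketches; all names are ours) computes the closed-form LOOCV criterion for a plain Mondrian random forest, the case $J = 0$ with $\omega_0 = 1$; the debiased case follows by averaging the per-tree leave-one-out residuals with weights $\omega_r$ over trees with lifetimes $a_r \lambda$. The partition sampler follows the recursive construction of Section~\ref{sec:mondrian_process}, and cells containing a single sample are skipped since their leave-one-out prediction is undefined.
%
\begin{verbatim}
import numpy as np

def mondrian_partition(lower, upper, lam, t, rng):
    # Recursively sample a Mondrian partition of [lower, upper]: a cell D
    # formed at time t splits after an Exp(|D|_1) waiting time, choosing the
    # split dimension proportionally to side length and the split location
    # uniformly, exactly as in the construction of the Mondrian process.
    size = upper - lower
    t_split = t + rng.exponential(1.0 / size.sum())
    if t_split > lam:
        return [(lower, upper)]
    j = rng.choice(size.size, p=size / size.sum())
    s = rng.uniform(lower[j], upper[j])
    upper_left, lower_right = upper.copy(), lower.copy()
    upper_left[j], lower_right[j] = s, s
    return (mondrian_partition(lower, upper_left, lam, t_split, rng)
            + mondrian_partition(lower_right, upper, lam, t_split, rng))

def loocv(X, Y, lam, B, seed=0):
    # Closed-form leave-one-out criterion for a plain Mondrian forest (J = 0):
    # per tree, the LOO residual at X_i is (Y_i - cell mean) / (1 - 1/N).
    rng = np.random.default_rng(seed)
    n, d = X.shape
    resid = np.zeros(n)
    for _ in range(B):
        for lo, hi in mondrian_partition(np.zeros(d), np.ones(d), lam, 0.0, rng):
            idx = np.all((X >= lo) & (X <= hi), axis=1)
            N = idx.sum()
            if N >= 2:  # singleton cells have no leave-one-out prediction
                resid[idx] += (Y[idx] - Y[idx].mean()) / (1 - 1 / N) / B
    return np.mean(resid ** 2)

# Select the lifetime over a grid, e.g. with the synthetic data from before:
# lam_grid = np.linspace(2.0, 30.0, 15)
# lam_hat = lam_grid[np.argmin([loocv(X, Y, lam, B=50) for lam in lam_grid])]
\end{verbatim}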
Supposing $X_i \sim \Unif\big([0,1]^d\big)$ and replacing the terms $1/N_{r b}(X_i)$ appearing in the $\LOOCV$ formula with their average expectation $ \frac{1}{J+1} \sum_{r=0}^{J} \E \left[ 1/N_{r b}(X_i) \right] \approx \bar a^d \lambda^d / n$, where $\bar a^d = \frac{1}{J+1} \sum_{r=0}^{J} a_r^d$, gives the generalized cross-validation (GCV) formula
%
\begin{align}
\label{eq:mondrian_gcv}
\GCV(\lambda, J) &= \frac{1}{n} \sum_{i=1}^{n} \left( \frac{Y_i - \hat\mu_\rd(X_i)} {1 - \bar a^d \lambda^d / n} \right)^2.
\end{align}
%
The lifetime can then be selected by computing either $\hat\lambda_{\LOOCV} \in \argmin_\lambda \LOOCV(\lambda, J)$ or $\hat\lambda_{\GCV} \in \argmin_\lambda \GCV(\lambda, J)$. See Section~\ref{sec:mondrian_weather} for a practical illustration.
\subsection{Choosing the other parameters}
\subsubsection*{The number \texorpdfstring{$B$}{B} of trees in each forest}%
If no debiasing is applied, we suggest $B = \sqrt{n}$ to satisfy Theorem~\ref{thm:mondrian_confidence}. If debiasing is used, then we recommend $B = n^{\frac{2J-1}{2J}}$, consistent with Theorem~\ref{thm:mondrian_confidence_debiased} and Theorem~\ref{thm:mondrian_minimax}.
\subsubsection*{The debiasing order \texorpdfstring{$J$}{J}}%
When debiasing a Mondrian random forest, one must decide how many orders of bias to remove. This requires some oracle knowledge of the H{\"o}lder smoothness of $\mu$ and $f$, which is difficult to estimate statistically. As such, we recommend removing at most the first one or two bias terms, taking $J \in \{0,1,2\}$, to avoid overly inflating the variance of the estimator.
\subsubsection*{The debiasing coefficients \texorpdfstring{$a_r$}{ar}}%
As in Section~\ref{sec:mondrian_debiased}, we take $a_r$ to be a fixed geometric or arithmetic sequence. For example, one could set $a_r = (1+\gamma)^r$ or $a_r = 1 + \gamma r$ for some $\gamma > 0$. We suggest taking $a_r = 1.05^r$.
\section{Illustrative example: weather forecasting}%
\label{sec:mondrian_weather}
To demonstrate our methodology for estimation and inference with Mondrian random forests, we consider a simple application to a weather forecasting problem. We emphasize that the main aim of this section is to provide intuition and understanding for how a Mondrian random forest may be used in practice, and we refrain from an in-depth analysis of the specific results obtained. Indeed, our assumption of i.i.d.\ data is certainly violated with weather data, due to the time-series structure of sequential observations. Nonetheless, we use data from the \citet{bureau2017daily}, containing daily weather information from 2007--2017, at 49 different locations across Australia, with $n = 125\,927$ samples.
\begin{figure}[b!]
\centering
\begin{subfigure}{0.49\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/weather_data.png}%
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/weather_data_filled_partition.png}%
\end{subfigure}
\caption[Australian weather forecasting data]{
Australian weather forecasting data. Left: colors indicate the response variable of dry (orange) or wet (blue) on the following day. Right: the data is overlaid with a Mondrian random tree, fitted with a lifetime of $\lambda = 5$ selected by generalized cross-validation.
Cell colors represent the response proportions.} \label{fig:mondrian_weather_data} \end{figure} We consider the classification problem of predicting whether or not it will rain on the following day using two covariates: the percentage relative humidity, and the pressure in mbar, both at 3pm on the current day. For the purpose of framing this as a nonparametric regression problem, we consider estimating the probability of rain as the regression function by setting $Y_i = 1$ if there is rain on the following day and $Y_i = 0$ otherwise. Outliers with pressure less than 985\,mbar or more than 1040\,mbar are removed to justify the assertion in Assumption~\ref{ass:mondrian_data} that the density of the covariates should be bounded away from zero, and the features are linearly scaled to provide normalized samples $(X_i, Y_i) \in [0, 1]^2 \times \{0, 1\}$. We fit a Mondrian random forest to the data as defined in Section~\ref{sec:mondrian_forests}, selecting the lifetime parameter with the generalized cross-validation (GCV) method detailed in Section~\ref{sec:mondrian_lifetime_selection}. Figure~\ref{fig:mondrian_weather_data} plots the data, using colors to indicate the response values, and illustrates how a single Mondrian tree is fitted by sampling from an independent Mondrian process and then computing local averages (equivalent to response proportions in this special setting with binary outcomes) within each cell. The general pattern of rain being predicted by high humidity and low pressure is apparent, with the preliminary tree estimator taking the form of a step function on axis-aligned rectangles. This illustrates the first-order bias of Mondrian random trees discussed in Section~\ref{sec:mondrian_clt}, with the piecewise constant estimator providing a poor approximation for the smooth true regression function. \begin{figure}[b!] \centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_forest_2.png}% \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_forest_design.png}% \end{subfigure} \caption[Fitting Mondrian random forests to the Australian weather data]{ Fitting Mondrian random forests to the Australian weather data. Left: with $B=2$ trees, individual cells are clearly visible and the step function persists. Right: with $B=40$ trees, the estimate is much smoother as the individual tree estimates average out. Three design points are identified for further analysis.} \label{fig:mondrian_weather_forest} \end{figure} Figure~\ref{fig:mondrian_weather_forest} adds more trees to the estimator, demonstrating the effect of increasing the forest size first to $B=2$ and then to $B=40$. As more trees are included in the Mondrian random forest, the regression estimate $\hat \mu(x)$ becomes smoother and therefore also enjoys improved bias properties as shown in Theorem~\ref{thm:mondrian_bias}, assuming a correct model specification. We also choose three specific design points in the (humidity, pressure) covariate space, namely (20\%, 1020\,mbar), (70\%, 1000\,mbar), and (80\%, 990\,mbar), at which to conduct inference by constructing confidence intervals. See Table~\ref{tab:mondrian_weather_ci} for the results. \begin{figure}[b!] 
\centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_gcv.png}% \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/weather_debiased_forest_design.png}% \end{subfigure} \caption[Cross-validation and debiasing for the Australian weather data]{ Left: mean squared error and generalized cross-validation scores for Mondrian random forests with the Australian weather data. Right: a debiased Mondrian random forest with $B=20$, giving $40$ trees in total. Three design points are identified for further analysis.} \label{fig:mondrian_weather_gcv} \end{figure} In Figure~\ref{fig:mondrian_weather_gcv} we show the mean squared error and GCV scores computed using \eqref{eq:mondrian_gcv} with $B=400$ trees for several candidate lifetime parameters $\lambda$. As expected, the mean squared error decreases monotonically as $\lambda$ increases and the model overfits, but the GCV score is minimized at a value which appropriately balances the bias and variance; we take $\lambda = 5$. We then fit a debiased Mondrian forest with bias correction order $J = 1$ as described in Section~\ref{sec:mondrian_debiased}, using $B=20$ trees at each debiasing level $r \in \{0, 1\}$ for a total of $40$ trees. We continue to use the same lifetime parameter $\lambda = 5$ selected through GCV without debiasing, following the approach recommended in Section~\ref{sec:mondrian_lifetime_selection} to ensure valid inference through negligible bias. The resulting debiased Mondrian random forest estimate is noticeably less smooth than the version without bias correction. This is expected due to both the inflated variance resulting from the debiasing procedure, and the undersmoothing enacted by selecting a lifetime parameter using GCV on the original estimator without debiasing. \begin{table}[t] \centering \begin{tabular}{|c|c|c|c|c|c|c|} \hline \multirow{2}{*}{Point} & \multirow{2}{*}{Humidity} & \multirow{2}{*}{Pressure} & \multicolumn{2}{|c|}{No debiasing, $J=0$} & \multicolumn{2}{|c|}{Debiasing, $J=1$} \\ \cline{4-7} & & & $\hat\mu(x)$ & 95\% CI & $\hat\mu(x)$ & 95\% CI \\ \hline $1$ & $20\%$ & $1020\,\textrm{mbar}$ & $\phantom{0}4.2\%$ & $3.9\%$ -- $4.5\%$ & $\phantom{0}2.0\%$ & $1.6\%$ -- $2.4\%$ \\ $2$ & $70\%$ & $1000\,\textrm{mbar}$ & $52.6\%$ & $51.7\%$ -- $53.6\%$ & $59.8\%$ & $57.8\%$ -- $61.9\%$ \\ $3$ & $80\%$ & $\phantom{1}990\,\textrm{mbar}$ & $78.1\%$ & $75.0\%$ -- $81.2\%$ & $93.2\%$ & $86.7\%$ -- $99.6\%$ \\ \hline \end{tabular} \caption[Results for the Australian weather data]{ Results for the Australian weather data at three specified design points.} \label{tab:mondrian_weather_ci} \end{table} Table~\ref{tab:mondrian_weather_ci} presents numerical results for estimation and inference at the three specified design points. We first give the outcomes without debiasing, using a Mondrian random forest with $B = 400$ trees and $\lambda = 5$ selected by GCV. We then show the results with a first-order ($J=1$) debiased Mondrian random forest using $B = 200$ (again a total of $400$ trees) and the same value of $\lambda = 5$. The predicted chance of rain $\hat\mu(x)$ is found to vary substantially across different covariate values, and the resulting confidence intervals (CI) are generally narrow due to the large sample size and moderate lifetime parameter. 
The forest with debiasing exhibits more extreme predictions away from $50\%$ and wider confidence intervals in general, in line with the illustration in Figure~\ref{fig:mondrian_weather_gcv}. Interestingly, the confidence intervals for the non-debiased and debiased estimators do not intersect, indicating that the original estimator is severely biased, and providing further justification for our modified debiased random forest estimator. \section{Conclusion}% \label{sec:mondrian_conclusion} We gave a central limit theorem for the Mondrian random forest estimator and showed how to perform statistical inference on an unknown nonparametric regression function. We introduced debiased versions of the Mondrian random forest, and demonstrated their advantages for statistical inference and minimax-optimal estimation. We discussed tuning parameter selection, enabling a fully feasible and practical methodology. An application to weather forecasting was presented as an illustrative example. Implementations of this chapter's methodology and empirical results are provided by a Julia package at \github{wgunderwood/MondrianForests.jl}. This work is based on \citet{cattaneo2023inference}, and has been presented by Underwood at the University of Illinois Statistics Seminar (2024), the University of Michigan Statistics Seminar (2024), and the University of Pittsburgh Statistics Seminar (2024). \chapter{Dyadic Kernel Density Estimators} \label{ch:kernel} % abstract Dyadic data is often encountered when quantities of interest are associated with the edges of a network. As such, it plays an important role in statistics, econometrics, and many other data science disciplines. We consider the problem of uniformly estimating a dyadic Lebesgue density function, focusing on nonparametric kernel-based estimators taking the form of dyadic empirical processes. The main contributions of this chapter include the minimax-optimal uniform convergence rate of the dyadic kernel density estimator, along with strong approximation results for the associated standardized and Studentized $t$-processes. A consistent variance estimator enables the construction of valid and feasible uniform confidence bands for the unknown density function. We showcase the broad applicability of our results by developing novel counterfactual density estimation and inference methodology for dyadic data, which can be used for causal inference and program evaluation. A crucial feature of dyadic distributions is that they may be ``degenerate'' at certain points in the support of the data, a property making our analysis somewhat delicate. Nonetheless our methods for uniform inference remain robust to the potential presence of such points. For implementation purposes, we discuss inference procedures based on positive semi-definite covariance estimators, mean squared error optimal bandwidth selectors, and robust bias correction techniques. We illustrate the empirical finite-sample performance of our methods both in simulations and with real-world trade data, for which we make comparisons between observed and counterfactual trade distributions in different years. Our technical results concerning strong approximations and maximal inequalities are of potential independent interest. \section{Introduction} \label{sec:kernel_introduction} Dyadic data, also known as graphon data, plays an important role in the statistical, social, behavioral, and biomedical sciences. 
In network settings, this type of dependent data captures interactions between the units of study, and its analysis is of interest in statistics \citep{kolaczyk2009statistical}, economics \citep{graham2020network}, psychology \citep{kenny2020dyadic}, public health \citep{luke2007network}, and many other data science disciplines. For $n \geq 2$, a dyadic data set contains $\frac{1}{2}n(n-1)$ observed real-valued random variables
%
\begin{align*}
\bW_n = (W_{i j} : 1 \leq i < j \leq n),
\end{align*}
%
where $W_{i j}$ records a pairwise interaction between units $i$ and $j$. Given a kernel function $k_h$ with bandwidth $h > 0$, formalized in Assumption~\ref{ass:kernel_bandwidth} below, we study the dyadic kernel density estimator of the Lebesgue density $f_W$,
%
\begin{align*}
\hat f_W(w) = \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} k_h(W_{i j}, w).
\end{align*}
%
We use the following notation throughout. For $\cX \subseteq \R$ and $C>0$, define the H\"{o}lder class with smoothness parameter $\beta > 0$ to be $\cH^\beta_C(\cX) = \big\{ g \in \cC^{\flbeta}(\cX) \! : \! \max_{1 \leq r \leq \flbeta} \big| g^{(r)}(x) \big| \leq C, \big| g^{(\flbeta)}(x) - g^{(\flbeta)}(x') \big| \leq C |x-x'|^{\beta - \flbeta}, \forall x, x' \in \cX \big\}$, where $\flbeta$ denotes the largest integer which is strictly less than $\beta$. Note that $\cH^1_C(\cX)$ is the class of $C$-Lipschitz functions on $\cX$. For $a \in \R$ and $b \geq 0$, we write $[a \pm b]$ for the interval $[a-b, a+b]$. For non-negative sequences $a_n$ and $b_n$, write $a_n \lesssim b_n$ or $a_n = O(b_n)$ to indicate that $a_n / b_n$ is bounded for $n\geq 1$. Write $a_n \ll b_n$ or $a_n = o(b_n)$ if $a_n / b_n \to 0$. If $a_n \lesssim b_n \lesssim a_n$, write $a_n \asymp b_n$. For random non-negative sequences $A_n$ and $B_n$, write $A_n \lesssim_\P B_n$ or $A_n = O_\P(B_n)$ if $A_n / B_n$ is bounded in probability. Write $A_n = o_\P(B_n)$ if $A_n / B_n \to 0$ in probability. For $a,b \in \R$, define $a\wedge b=\min\{a,b\}$ and $a \vee b = \max\{a,b\}$.
\section{Setup}\label{sec:kernel_setup}
We impose the following two assumptions throughout this chapter, which concern firstly the dyadic data generating process, and secondly the choice of kernel and bandwidth sequence.
%
\begin{assumption}[Data generation]
\label{ass:kernel_data}
%
% A and V variables
Let $\bA_n = (A_i: 1 \leq i \leq n)$ be i.i.d.\ random variables supported on $\cA \subseteq \R$ and let $\bV_n = (V_{i j}: 1 \leq i < j \leq n)$ be i.i.d.\ random variables with a Lebesgue density $f_V$ on $\R$, with $\bA_n$ independent of $\bV_n$.
%
% W variables
Let $W_{i j} = W(A_i, A_j, V_{i j})$ and $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$, where $W$ is an unknown real-valued function which is symmetric in its first two arguments.
%
Let $\cW \subseteq \R$ be a compact interval with positive Lebesgue measure $\Leb(\cW)$. The conditional distribution of $W_{i j}$ given $A_i$ and $A_j$ admits a Lebesgue density $f_{W \mid AA}(w \mid A_i, A_j)$. For $C_\rH > 0$ and $\beta \geq 1$, take $f_W \in \cH^\beta_{C_\rH}(\cW)$ where $f_{W}(w) = \E\left[f_{W \mid AA}(w \mid A_i,A_j)\right]$ and $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$ for all $a,a' \in \cA$. Suppose $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV <\infty$ where $f_{W \mid A}(w \mid a) = \E\left[f_{W \mid AA}(w \mid A_i,a)\right]$.
%
\end{assumption}
In Assumption~\ref{ass:kernel_data} we require that the density $f_W$ be in a $\beta$-smooth H\"older class of functions on the compact interval $\cW$. H\"older classes are well established in the minimax estimation literature \citep{stone1982optimal,gine2021mathematical}, with the smoothness parameter $\beta$ appearing in the minimax-optimal rate of convergence. If the H\"older condition is satisfied only piecewise, then our results remain valid provided that the boundaries between the pieces are known and treated as boundary points.
If $W(a_1, a_2, v)$ is strictly monotonic and continuously differentiable in its third argument, we can give the conditional density of $W_{i j}$ explicitly using the usual change-of-variables formula: with $w=W(a_1,a_2,v)$, we have $f_{W \mid AA}(w \mid a_1,a_2) = f_V(v) \big|\partial W(a_1,a_2,v)/\partial v\big|^{-1}$.
\begin{assumption}[Kernels and bandwidth]
\label{ass:kernel_bandwidth}%
%
Let $h = h(n) > 0$ be a sequence of bandwidths satisfying $h \log n \to 0$ and $\frac{\log n}{n^2h} \to 0$. For each $w \in \cW$, let $k_h(\cdot, w)$ be a real-valued function supported on $[w \pm h] \cap \cW$. For an integer $p \geq 1$, let $k_h$ belong to a family of boundary bias-corrected kernels of order $p$, i.e.,
%
\begin{align*}
\int_{\cW} (s-w)^r k_h(s,w) \diff{s} \quad
\begin{cases}
\begin{alignedat}{2}
&= 1 &\qquad &\text{for all } w \in \cW \text{ if }\, r = 0, \\
&= 0 & &\text{for all } w \in \cW \text{ if }\, 1 \leq r \leq p-1, \\
&\neq 0 & &\text{for some } w \in \cW \text{ if }\, r = p.
\end{alignedat}
\end{cases}
\end{align*}
%
Also, for $C_\rL > 0$, suppose $k_h(s, \cdot) \in \cH^1_{C_\rL h^{-2}}(\cW)$ for all $s \in \cW$.
%
\end{assumption}
This assumption allows for all standard compactly supported and possibly boundary-corrected kernel functions \citep{wand1994kernel,simonoff1996smoothing}, constructed for example by taking polynomials on a compact interval and solving a linear system for the coefficients. Assumption~\ref{ass:kernel_bandwidth} implies (see Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} in Appendix~\ref{app:kernel}) that if $h \leq 1$ then $k_h$ is uniformly bounded by $C_\rk h^{-1}$ where $C_\rk \vcentcolon= 2 C_\rL + 1 + 1/\Leb(\cW)$.
\subsection{Bias characterization}
\label{sec:kernel_bias}
We begin by characterizing and bounding the bias $B_n(w) = \E \big[ \hat f_W(w) \big] - f_W(w)$. Theorem~\ref{thm:kernel_bias} is a standard result for the non-random smoothing bias in kernel density estimation with higher-order kernels and boundary bias correction, and does not rely on the dyadic structure.
\begin{theorem}[Bias bound]
\label{thm:kernel_bias}
Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For $w \in \cW$ define the leading bias term as
%
\begin{align*}
b_p(w) &= \frac{f_W^{(p)}(w)}{p!} \int_{\cW} k_h(s,w) \left( \frac{s-w}{h} \right)^p \diff{s}
\end{align*}
%
for $1 \leq p \leq \flbeta$. Then we have the following bias bounds.
%
\begin{enumerate}[label=(\roman*)]
\item If $p \leq \flbeta - 1$, then $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | \leq \frac{2 C_\rk C_\rH}{(p+1)!} h^{p+1}$.
\item If $p = \flbeta$, then $\sup_{w \in \cW} | B_n(w) - h^p b_p(w) | \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$.
\item If $p \geq \flbeta+1$, then $\sup_{w \in \cW} | B_n(w) | \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta$.
\end{enumerate}
%
Noting that $\sup_{w \in \cW} |b_p(w)| \leq 2 C_\rk C_\rH / p!$, we deduce that for $h \leq 1$,
%
\begin{align*}
\sup_{w \in \cW} | B_n(w) | \leq \frac{4 C_\rk C_\rH}{(p \wedge \flbeta)!} h^{p \wedge \beta} \lesssim h^{p \wedge \beta}.
\end{align*}
\end{theorem}
\subsection{Hoeffding-type decomposition and degeneracy}
\label{sec:kernel_degeneracy}
Our next step is to consider the stochastic part $\hat f_W(w) - \E \big[ \hat f_W(w) \big]$ of the classical bias--variance decomposition. This term is akin to a U-statistic and thus admits a Hoeffding-type decomposition, presented in Lemma~\ref{lem:kernel_hoeffding}, which is a key element in our analysis.
\begin{lemma}[Hoeffding-type decomposition for $\hat f_W$]
\label{lem:kernel_hoeffding}
Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold. Define the linear, quadratic, and error terms
%
\begin{align*}
L_n(w) &= \frac{2}{n} \sum_{i=1}^n l_i(w),
&Q_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} q_{i j}(w), \\
E_n(w) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w)
\end{align*}
%
respectively, where
%
\begin{align*}
l_i(w) &= \E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w)\right], \\
q_{i j}(w) &= \E\left[k_h(W_{i j},w) \mid A_i, A_j\right] - \E\left[k_h(W_{i j},w) \mid A_i\right] - \E\left[k_h(W_{i j},w) \mid A_j\right] + \E\left[k_h(W_{i j},w)\right], \\
e_{i j}(w) &= k_h(W_{i j},w) - \E\left[k_h(W_{i j},w) \mid A_i, A_j\right].
\end{align*}
%
Then, recalling the bias term $B_n$ from Section~\ref{sec:kernel_bias}, we have the Hoeffding-type decomposition
%
\begin{align}
\label{eq:kernel_hoeffding}
\hat f_W(w) - f_W(w) = L_n(w) + Q_n(w) + E_n(w) + B_n(w).
\end{align}
%
The processes $L_n$, $Q_n$, and $E_n$ are mean-zero with $\E\big[L_n(w)\big] = \E\big[Q_n(w)\big] = \E\big[E_n(w)\big] = 0$ for all $w \in \cW$. They are also orthogonal, satisfying $\E\big[ L_n(w) Q_n(w') \big] = \E\big[ L_n(w) E_n(w') \big] = \E\big[ Q_n(w) E_n(w') \big] = 0$ for all $w, w' \in \cW$.
%
\end{lemma}
The process $L_n$ is the H{\'a}jek projection of a U-process, which can exhibit degeneracy if $\Var[L_n(w)] = 0$ at some or all points $w \in \cW$. To characterize the different possible degeneracy types in Lemma~\ref{lem:kernel_trichotomy}, we first introduce the following lower and upper degeneracy constants:
%
\begin{align*}
\Dl^2 := \inf_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right]
\qquad \text{ and } \qquad
\Du^2 := \sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right].
\end{align*}
%
\begin{lemma}[Trichotomy of degeneracy]%
\label{lem:kernel_trichotomy}%
%
Grant Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth}. Then the type of degeneracy exhibited by $\hat f_W(w)$ is precisely one of the following three possibilities.
%
\begin{enumerate}[label=(\roman*)]
\item Total degeneracy: $\Du = \Dl = 0$. Then $L_n(w) = 0$ for all $w \in \cW$ almost surely.
\item No degeneracy: $\Dl > 0$. Then $\inf_{w \in \cW} \Var[L_n(w)] \geq \frac{2 \Dl^2}{n}$ for all large enough $n$.
\item Partial degeneracy: $\Du > \Dl = 0$. There exists $w \in \cW$ with $\Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$; such a point is labeled \emph{degenerate} and satisfies $\Var[L_n(w)] \leq 64 C_\rk C_\rH C_\rd \frac{h}{n}$. There is also a point $w' \in \cW$ with $\Var\left[f_{W \mid A}(w' \mid A_i)\right] > 0$; such a point is labeled \emph{non-degenerate} and satisfies $\Var[L_n(w')] \geq \frac{2}{n} \Var\left[f_{W \mid A}(w' \mid A_i)\right]$ for all large enough $n$.
\end{enumerate}
\end{lemma}
The following lemma describes the uniform stochastic order of the different terms in the Hoeffding-type decomposition, explicitly accounting for potential degeneracy.
\begin{lemma}[Uniform concentration]
\label{lem:kernel_uniform_concentration}
Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Then
%
\begin{align*}
\E\left[ \sup_{w \in \cW} |L_n(w)| \right] &\lesssim \frac{\Du}{\sqrt n},
&\E\left[ \sup_{w \in \cW} |Q_n(w)| \right] &\lesssim \frac{1}{n},
&\E\left[ \sup_{w \in \cW} |E_n(w)| \right] &\lesssim \sqrt{\frac{\log n}{n^2h}}.
\end{align*}
\end{lemma}
Lemma~\ref{lem:kernel_uniform_concentration} captures the potential total degeneracy of $L_n$ by showing that if $\Du=0$ then $L_n=0$ everywhere on $\cW$ almost surely. The following lemma captures the potential partial degeneracy of $L_n$, where $\Du > \Dl = 0$. For $w,w' \in \cW$, define the covariance function
%
\begin{align*}
\Sigma_n(w,w') = \E\Big[ \Big( \hat f_W(w) - \E\big[\hat f_W(w)\big] \Big) \Big( \hat f_W(w') - \E\big[\hat f_W(w')\big] \Big) \Big].
\end{align*}
%
\begin{lemma}[Variance bounds]
\label{lem:kernel_variance_bounds}
Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold. Then for sufficiently large $n$,
%
\begin{align*}
\frac{\Dl^2}{n} + \frac{1}{n^2h} \inf_{w \in \cW} f_W(w) &\lesssim \inf_{w \in \cW} \Sigma_n(w,w) \leq \sup_{w \in \cW} \Sigma_n(w,w) \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}.
\end{align*}
%
\end{lemma}
As a simple example of the different types of degeneracy, consider the family of dyadic distributions $\P_{\pi}$ indexed by $\pi = (\pi_1, \pi_2, \pi_3)$ with $\sum_{i=1}^3 \pi_i = 1$ and $\pi_i \geq 0$, generated by $W_{i j} = A_i A_j + V_{i j}$, where $A_i$ equals $-1$ with probability $\pi_1$, equals $0$ with probability $\pi_2$, and equals $+1$ with probability $\pi_3$, and $V_{i j}$ is standard Gaussian. This model induces a latent ``community structure'' where community membership is determined by the value of $A_i$ for each node $i$, and the interaction outcome $W_{i j}$ is a function only of the communities to which $i$ and $j$ belong and some idiosyncratic noise. Unlike the stochastic block model \citep{kolaczyk2009statistical}, our setup assumes that community membership has no impact on edge existence, as we work with fully connected networks; see Section~\ref{sec:kernel_trade_data} for a discussion of how to handle missing edges in practice. Also note that the parameter of interest in this chapter is the Lebesgue density of a continuous random variable $W_{i j}$ rather than the probability of network edge existence, which is the focus of the graphon estimation literature \citep{gao2021minimax}. In line with Assumption~\ref{ass:kernel_data}, $\bA_n$ and $\bV_n$ are i.i.d.\ sequences independent of each other. Then $f_{W \mid AA}(w \mid A_i, A_j) = \phi(w - A_i A_j)$,\, $f_{W \mid A}(w \mid A_i) = \pi_1 \phi(w + A_i) + \pi_2 \phi(w) + \pi_3 \phi(w - A_i)$, and $f_W(w) = (\pi_1^2 + \pi_3^2) \phi(w-1) + \pi_2 (2 - \pi_2) \phi(w) + 2 \pi_1 \pi_3 \phi(w+1),$ where $\phi$ denotes the probability density function of the standard normal distribution. Note that $f_W(w)$ is strictly positive for all $w \in \R$. Consider the parameter choices:
%
\begin{enumerate}[label=(\roman*)]
\item $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$:\quad $\P_\pi$ is degenerate at all $w \in \R$,
\item $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$:\quad $\P_\pi$ is degenerate only at $w=0$,
\item $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$:\quad $\P_\pi$ is non-degenerate for all $w \in \R$.
\end{enumerate}
%
Figure~\ref{fig:kernel_distribution} demonstrates these phenomena, plotting the density $f_W$ and the standard deviation of the conditional density $f_{W|A}$ over $\cW = [-2,2]$ for each choice of the parameter $\pi$. The trichotomy of total/partial/no degeneracy is useful for understanding the distributional properties of the dyadic kernel density estimator $\hat{f}_W(w)$.
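The family $\P_\pi$ is also easy to simulate. The following minimal Python sketch draws a dyadic sample and evaluates the density estimator on a grid; for brevity it uses a plain second-order Epanechnikov kernel without the boundary correction required by Assumption~\ref{ass:kernel_bandwidth}, so it should be read only as an interior-point illustration, not as the implementation behind Section~\ref{sec:kernel_simulations}.
\begin{verbatim}
import numpy as np

def simulate_dyadic(n, pi, rng):
    # A_i in {-1, 0, +1} with probabilities pi; W_ij = A_i A_j + V_ij.
    A = rng.choice([-1.0, 0.0, 1.0], size=n, p=pi)
    i, j = np.triu_indices(n, k=1)          # all pairs with i < j
    return A[i] * A[j] + rng.standard_normal(i.size)

def dyadic_kde(W, grid, h):
    # Interior Epanechnikov estimate; no boundary correction.
    u = (W[None, :] - grid[:, None]) / h
    return (0.75 * np.maximum(1.0 - u ** 2, 0.0)).mean(axis=1) / h

rng = np.random.default_rng(1)
W = simulate_dyadic(100, [0.25, 0.0, 0.75], rng)   # partial degeneracy
fhat = dyadic_kde(W, np.linspace(-2.0, 2.0, 101), h=0.3)
\end{verbatim}
Averaging the kernel over all $\frac{1}{2}n(n-1)$ pairs reproduces the factor $\frac{2}{n(n-1)}$ in the definition of $\hat f_W$.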
Crucially, our need for uniformity in $w$ complicates the simpler degeneracy/no degeneracy dichotomy observed previously in the literature \citep{graham2024kernel}. From a pointwise-in-$w$ perspective, partial degeneracy causes no issues, while it is a fundamental problem when conducting inference uniformly over $w \in \cW$. We develop methods that are valid regardless of the presence of partial or total degeneracy. \begin{figure}[t] \centering % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_total.pdf} \caption{Total degeneracy, \\ $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_partial.pdf} \caption{Partial degeneracy, \\ $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.} \end{subfigure} % \begin{subfigure}{0.32\textwidth} \centering %\includegraphics[scale=0.64]{graphics/distribution_plot_none.pdf} \caption{No degeneracy, \\ $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.} \end{subfigure} % \caption[The family of distributions $\P_\pi$]{ Density $f_W$ and standard deviation of $f_{W|A}$ for the family of distributions $\P_\pi$.} % \label{fig:kernel_distribution} \end{figure} \section{Point estimation results} \label{sec:kernel_point_estimation} Using the bias bound from Theorem~\ref{thm:kernel_bias} and the concentration results from Lemma~\ref{lem:kernel_uniform_concentration}, the next theorem establishes an upper bound on the uniform convergence rate of $\hat f_W$. % \begin{theorem}[Uniform convergence rate]% \label{thm:kernel_uniform_consistency}% Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Then % \begin{align*} \E\left[ \sup_{w \in \cW} \big|\hat{f}_W(w) - f_W(w)\big| \right] \lesssim h^{p\wedge\beta} + \frac{\Du}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}. \end{align*} \end{theorem} % The implicit constant in Theorem~\ref{thm:kernel_uniform_consistency} depends only on $\cW$, $\beta$, $C_\rH$, and the choice of kernel. We interpret this result in light of the degeneracy trichotomy from Lemma~\ref{lem:kernel_trichotomy}. These results generalize \citet*[Theorem~1]{chiang2020empirical} by allowing for compactly supported data and more general kernels $k_h(\cdot,w)$, enabling boundary-adaptive estimation. % \begin{enumerate}[label=(\roman*)] \item Partial or no degeneracy: $\Du > 0$. Any bandwidths satisfying $n^{-1} \log n \lesssim h \lesssim n^{-\frac{1}{2(p\wedge\beta)}}$ yield $\E\big[\sup_{w \in \cW}\big|\hat f_W(w) - f_W(w)\big| \big] \lesssim \frac{1}{\sqrt n}$, the ``parametric'' bandwidth-independent rate noted by \citet{graham2024kernel}. \item Total degeneracy: $\Du = 0$. Minimizing the bound in Theorem~\ref{thm:kernel_uniform_consistency} with $h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2(p\wedge\beta)+1}}$ yields $\E\big[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \big] \lesssim \big(\frac{\log n}{n^2} \big)^{\frac{p\wedge\beta}{2(p\wedge\beta)+1}}$. \end{enumerate} \subsection{Minimax optimality} We establish the minimax rate under the supremum norm for density estimation with dyadic data. This implies minimax optimality of the kernel density estimator $\hat f_W$, regardless of the degeneracy type of the dyadic distribution. \begin{theorem}[Uniform minimax optimality] \label{thm:kernel_minimax} Fix $\beta \geq 1$ and $C_\rH > 0$, and take $\cW$ a compact interval with positive Lebesgue measure. 
Define $\cP = \cP(\cW, \beta, C_\rH)$ as the class of dyadic distributions satisfying Assumption~\ref{ass:kernel_data}. Define $\cP_\rd$ as the subclass of $\cP$ containing only those distributions which are totally degenerate on $\cW$ in the sense that $\sup_{w \in \cW} \Var\left[f_{W \mid A}(w \mid A_i)\right] = 0$. Then
%
\begin{align*}
\inf_{\tilde f_W} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] &\asymp \frac{1}{\sqrt n}, \\
\inf_{\tilde f_W} \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] &\asymp \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}},
\end{align*}
%
where $\tilde f_W$ is any estimator depending only on the data $\bW_n = (W_{i j}: 1 \leq i < j \leq n)$ distributed according to the dyadic law $\P$. The constants in $\asymp$ depend only on $\cW$, $\beta$, and $C_\rH$.
\end{theorem}
Theorem~\ref{thm:kernel_minimax} shows that the uniform convergence rate of $n^{-1/2}$ obtained in Theorem~\ref{thm:kernel_uniform_consistency} (coming from the $L_n$ term) is minimax-optimal in general. When attention is restricted to totally degenerate dyadic distributions, $\hat f_W$ also achieves the minimax rate of uniform convergence (assuming a kernel of sufficiently high order $p \geq \beta$), which is on the order of $\left(\frac{\log n}{n^2}\right)^{\frac{\beta}{2\beta+1}}$ and is determined by the bias $B_n$ and the leading variance term $E_n$ in \eqref{eq:kernel_hoeffding}. Combining Theorems \ref{thm:kernel_uniform_consistency}~and~\ref{thm:kernel_minimax}, we conclude that $\hat{f}_W(w)$ achieves the minimax-optimal rate for uniformly estimating $f_W(w)$ if $h \asymp \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ and a kernel of sufficiently high order ($p \geq \beta$) is used, whether or not there are any degenerate points in the underlying data generating process. This result appears to be new to the literature on nonparametric estimation with dyadic data. See \citet{gao2021minimax} for a contemporaneous review.
\section{Distributional results}
\label{sec:kernel_inference}
We investigate the distributional properties of the standardized $t$-statistic process
%
\begin{align*}
T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\Sigma_n(w,w)}},
\end{align*}
%
which is not necessarily asymptotically tight. Therefore, to approximate the distribution of the entire $t$-statistic process, as well as specific functionals thereof, we rely on a novel strong approximation approach outlined in this section. Our results can be used to perform valid uniform inference irrespective of the degeneracy type. This section is largely concerned with distributional properties and thus frequently requires copies of stochastic processes. For succinctness of notation, we will not differentiate between a process and its copy, but details are available in Section~\ref{sec:kernel_app_technical}.
\subsection{Strong approximation}
By the Hoeffding-type decomposition \eqref{eq:kernel_hoeffding} and Lemma~\ref{lem:kernel_uniform_concentration}, it suffices to consider the distributional properties of the stochastic process $L_n + E_n$. Our approach combines the Koml{\'o}s--Major--Tusn{\'a}dy (KMT) approximation \citep{komlos1975approximation} to obtain a strong approximation of $L_n$ with a Yurinskii approximation \citep{yurinskii1978error} to obtain a \emph{conditional} (on $\bA_n$) strong approximation of $E_n$.
The latter is necessary because $E_n$ is akin to a local empirical process of i.n.i.d.\ random variables, conditional on $\bA_n$, and therefore the KMT approximation is not applicable. These approximations are then combined to give a final (unconditional) strong approximation for $L_n+E_n$, and thus for the $t$-statistic process $T_n$. The following lemma is an application of our generic KMT approximation result for empirical processes, given in Section~\ref{sec:kernel_app_technical}, which builds on earlier work by \citet{gine2004kernel} and \citet{gine2010confidence} and may be of independent interest. \begin{lemma}[Strong approximation of $L_n$] \label{lem:kernel_strong_approx_Ln} % Suppose that Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold. For each $n$ there exists a mean-zero Gaussian process $Z^L_n$ indexed on $\cW$ satisfying $\E\big[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w) \big| \big] \lesssim \frac{\Du \log n}{\sqrt{n}}$, where $\E[Z_n^L(w)Z_n^L(w')] = n\E[L_n(w)L_n(w')]$ for all $w, w' \in \cW$. The process $Z_n^L$ is a function only of $\bA_n$ and some random noise independent of $(\bA_n, \bV_n)$. \end{lemma} % donsker case The strong approximation result in Lemma~\ref{lem:kernel_strong_approx_Ln} would be sufficient to develop valid and even optimal uniform inference procedures whenever both $\Dl > 0$ (no degeneracy in $L_n$) and $n h \gg \log n$ ($L_n$ is leading). In this special case, the recent Donsker-type results of \citet{davezies2021exchangeable} can be applied to analyze the limiting distribution of the stochastic process $\hat{f}_W$. Alternatively, again only when $L_n$ is non-degenerate and leading, standard empirical process methods could also be used. However, even in the special case when $\hat{f}_W(w)$ is asymptotically Donsker, our result in Lemma~\ref{lem:kernel_strong_approx_Ln} improves upon the literature by providing a rate-optimal strong approximation for $\hat{f}_W$ as opposed to only a weak convergence result. See Theorem \ref{thm:kernel_infeasible_ucb} and the subsequent discussion below. % however often non-donsker More importantly, as illustrated above, it is common in the literature to find dyadic distributions which exhibit partial or total degeneracy, making the process $\hat{f}_W$ non-Donsker. Thus approximating only $L_n$ is in general insufficient for valid uniform inference, and it is necessary to capture the distributional properties of $E_n$ as well. % we do better The following lemma is an application of our strong approximation result for empirical processes based on the Yurinskii approximation, which builds on a refinement by \citet{belloni2019conditional}. \begin{lemma}[Conditional strong approximation of $E_n$] \label{lem:kernel_conditional_strong_approx_En} % Suppose Assumptions \ref{ass:kernel_data}~and~\ref{ass:kernel_bandwidth} hold and take any $R_n \to \infty$. For each $n$ there exists $\tilde Z^E_n$ a mean-zero Gaussian process conditional on $\bA_n$ satisfying $\sup_{w \in \cW} \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^E(w) \big| \lesssim_\P \frac{(\log n)^{3/8} R_n}{n^{1/4}h^{3/8}}$, where $\E[\tilde Z_n^E(w)\tilde Z_n^E(w')\bigm\vert \bA_n] =n^2h\E[E_n(w)E_n(w')\bigm\vert \bA_n]$ for all $w, w' \in \cW$. % \end{lemma} The process $\tilde Z_n^E$ is a Gaussian process conditional on $\bA_n$ but is not in general a Gaussian process unconditionally. The following lemma constructs an unconditional Gaussian process $Z_n^E$ that approximates $\tilde Z_n^E$. 
\begin{lemma}[Unconditional strong approximation of $E_n$] \label{lem:kernel_unconditional_strong_approx_En} Suppose that Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For each $n$ there exists a mean-zero Gaussian process $Z^E_n$ satisfying $\E\big[ \sup_{w \in \cW} \big|\tilde Z_n^E(w) - Z_n^E(w)\big| \big] \lesssim \frac{(\log n)^{2/3}}{n^{1/6}}$, where $Z_n^E$ is independent of $\bA_n$ and $\E[Z_n^E(w)Z_n^E(w')]=\E[\tilde Z_n^E(w)\tilde Z_n^E(w')] = n^2h \, \E[E_n(w)E_n(w')]$ for all $w, w' \in \cW$. % \end{lemma} Combining Lemmas \ref{lem:kernel_conditional_strong_approx_En} and~\ref{lem:kernel_unconditional_strong_approx_En}, we obtain an unconditional strong approximation for $E_n$. The resulting rate of approximation may not be optimal, due to the Yurinskii coupling, but to the best of our knowledge it is the first in the literature for the process $E_n$, and hence for $\hat{f}_W$ and its associated $t$-process in the context of dyadic data. The approximation rate is sufficiently fast to allow for optimal bandwidth choices; see Section \ref{sec:kernel_implementation} for more details. Strong approximation results for local empirical processes (e.g.\ \citealp{gine2010confidence}) are not applicable here because the summands in the non-negligible $E_n$ are not (conditionally) i.i.d. Likewise, neither standard empirical process and U-process theory \citep{van1996weak,gine2021mathematical} nor the recent results in \citet{davezies2021exchangeable} are applicable to the non-Donsker process $E_n$. The previous lemmas showed that $L_n$ is $\sqrt{n}$-consistent while $E_n$ is $\sqrt{n^2h}$-consistent (pointwise in $w$), showcasing the importance of careful standardization (cf.\ Studentization in Section~\ref{sec:kernel_implementation}) for the purpose of rate adaptivity to the unknown degeneracy type. In other words, a challenge in conducting uniform inference is that the finite-dimensional distributions of the stochastic process $L_n+E_n$, and hence those of $\hat{f}_W$ and its associated $t$-process $T_n$, may converge at different rates at different points $w\in\cW$. The following theorem provides an (infeasible) inference procedure which is fully adaptive to such potential unknown degeneracy. \begin{theorem}[Strong approximation of $T_n$] \label{thm:kernel_strong_approx_Tn} Suppose that Assumptions~\ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold and $f_W(w) > 0$ on $\cW$, and take any $R_n \to \infty$. Then for each $n$ there exists a centered Gaussian process $Z_n^{T}$ such that % \begin{align*} &\sup_{w \in \cW} \left| T_n(w) - Z_n^{T}(w) \right| \lesssim_\P \! \frac{ n^{-1} \! \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3} + h^{p\wedge\beta}} {\Dl/\sqrt{n} + 1/\sqrt{n^2h}}, \end{align*} % where $\E[Z_n^T(w)Z_n^T(w')] = \E[T_n(w)T_n(w')]$ for all $w,w' \in \cW$. % \end{theorem} The first term in the numerator corresponds to the strong approximation for $L_n$ in Lemma~\ref{lem:kernel_strong_approx_Ln} and the error introduced by $Q_n$. The second and third terms correspond to the conditional and unconditional strong approximation errors for $E_n$ in Lemmas \ref{lem:kernel_conditional_strong_approx_En} and \ref{lem:kernel_unconditional_strong_approx_En}. The fourth term is from the smoothing bias result in Theorem~\ref{thm:kernel_bias}. The denominator is the lower bound on the standard deviation $\Sigma_n(w,w)^{1/2}$ formulated in Lemma~\ref{lem:kernel_variance_bounds}. 
In the absence of degenerate points ($\Dl > 0$) and if $n h^{7/2}\gtrsim 1$, Theorem~\ref{thm:kernel_strong_approx_Tn} offers a strong approximation of the $t$-process at the rate $(\log n)/\sqrt{n}+\sqrt{n}h^{p\wedge\beta}$, which matches the celebrated KMT approximation rate for i.i.d.\ data plus the smoothing bias. Therefore, our novel $t$-process strong approximation can achieve the optimal KMT rate for non-degenerate dyadic distributions provided that $p\wedge\beta \geq 3.5$. This is achievable if a fourth-order (boundary-adaptive) kernel is used and $f_W$ is sufficiently smooth. In the presence of partial or total degeneracy ($\Dl =0$), Theorem~\ref{thm:kernel_strong_approx_Tn} provides a strong approximation for the $t$-process at the rate $\sqrt{h}\log n + n^{-1/4}h^{-3/8}(\log n)^{3/8} R_n + n^{-1/6}(\log n)^{2/3} + n h^{1/2+p\wedge\beta}$. If, for example, $n h^{p\wedge\beta}\lesssim 1$, then our result can achieve a strong approximation rate of $n^{-1/7}$ up to $\log n $ terms. Theorem~\ref{thm:kernel_strong_approx_Tn} appears to be the first in the dyadic literature which is also robust to the presence of degenerate points in the underlying dyadic distribution. \subsection{Application: confidence bands} Theorem~\ref{thm:kernel_infeasible_ucb} constructs standardized confidence bands for $f_W$ which are infeasible as they depend on the unknown population variance $\Sigma_n$. In Section~\ref{sec:kernel_implementation} we will make this inference procedure feasible by proposing a valid estimator of the covariance function $\Sigma_n$ for Studentization, as well as developing bandwidth selection and robust bias correction methods. Before presenting our result on valid infeasible uniform confidence bands, we first impose in Assumption~\ref{ass:kernel_rates} some extra restrictions on the bandwidth sequence, which depend on the degeneracy type of the dyadic distribution, to ensure the coverage rate converges. \begin{assumption}[Rate restriction for uniform confidence bands] \label{ass:kernel_rates} Assume that one of the following holds: % \begin{enumerate}[label=(\roman*)] \item \label{it:kernel_rate_non} No degeneracy ($\Dl > 0$): $n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$, \item \label{it:kernel_rate_degen} Partial or total degeneracy ($\Dl = 0$): $n^{-2/3} (\log n)^{7/3} \ll h \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. \end{enumerate} \end{assumption} We now construct the infeasible uniform confidence bands. For $\alpha \in (0,1)$, let $q_{1-\alpha}$ be the quantile satisfying $ \P\left(\sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) = 1 - \alpha$. The following result employs the anti-concentration idea due to \citet{chernozhukov2014anti} to deduce valid standardized confidence bands, where we approximate the quantile of the unknown finite sample distribution of $\sup_{w\in\cW} |T_n(w)|$ by the quantile $q_{1-\alpha}$ of $\sup_{w\in\cW}|Z_n^T(w)|$. This approach offers a better rate of convergence than relying on extreme value theory for the distributional approximation, hence improving the finite sample performance of the proposed confidence bands. \begin{theorem}[Infeasible uniform confidence bands] \label{thm:kernel_infeasible_ucb} Suppose that Assumptions~\ref{ass:kernel_data},~\ref{ass:kernel_bandwidth}, and~\ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. 
Then
%
\begin{align*}
\P\left( f_W(w) \in \left[ \hat f_W(w) \pm q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \, \right] \, \textup{for all } w \in \cW \right) \to 1 - \alpha.
\end{align*}
%
\end{theorem}
By Theorem~\ref{thm:kernel_uniform_consistency}, the asymptotically optimal choice of bandwidth for uniform convergence is $h \asymp ((\log n)/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. As discussed in the next section, the approximate IMSE-optimal bandwidth is $h \asymp (1/n^2)^{\frac{1}{2(p \wedge \beta)+1}}$. Both bandwidth choices satisfy Assumption~\ref{ass:kernel_rates} only in the case of no degeneracy. The degenerate cases in Assumption~\ref{ass:kernel_rates}\ref{it:kernel_rate_degen}, which require $p \wedge \beta > 1$, exhibit behavior more similar to that of standard nonparametric kernel-based estimation and so the aforementioned optimal bandwidth choices will lead to a non-negligible smoothing bias in the distributional approximation for $T_n$. Different approaches are available in the literature to address this issue, including undersmoothing or ignoring the bias \citep{hall2001bootstrapping}, bias correction \citep{hall1992effect}, robust bias correction \citep{calonico2018effect, calonico2022coverage}, and Lepskii's method \citep{lepskii1992asymptotically,birge2001alternative}, among others. In the next section we develop a feasible uniform inference procedure, based on robust bias correction methods, which amounts to first selecting an optimal bandwidth for the point estimator $\hat{f}_W$ using a $p$th-order kernel, and then correcting the bias of the point estimator while also adjusting the standardization (Studentization) when forming the $t$-statistic $T_n$. Importantly, regardless of the specific implementation details, Theorem~\ref{thm:kernel_infeasible_ucb} shows that any bandwidth sequence $h$ satisfying both \ref{it:kernel_rate_non} and \ref{it:kernel_rate_degen} in Assumption~\ref{ass:kernel_rates} leads to valid uniform inference which is robust and adaptive to the (unknown) degeneracy type.
\section{Implementation}
\label{sec:kernel_implementation}
We address outstanding implementation details to make our main uniform inference results feasible. In Section~\ref{sec:kernel_covariance_estimation} we propose a covariance estimator along with a modified version which is guaranteed to be positive semi-definite. This allows for the construction of fully feasible confidence bands in Section~\ref{sec:kernel_feasible_confidence_bands}. In Section~\ref{sec:kernel_bandwidth_selection} we discuss bandwidth selection and formalize our procedure for robust bias-corrected inference.
\subsection{Covariance function estimation}
\label{sec:kernel_covariance_estimation}
Define the following plug-in covariance function estimator of $\Sigma_n$. For $w, w' \in \cW$, let $S_i(w) = \frac{1}{n-1} \big( \sum_{j = 1}^{i-1} k_h(W_{j i}, w) + \sum_{j = i+1}^n k_h(W_{i j}, w) \big)$ estimate $\E[k_h(W_{i j},w) \mid A_i]$ and take
%
\begin{align*}
\hat \Sigma_n(w,w')
&= \frac{4}{n^2} \sum_{i=1}^n S_i(w) S_i(w')
- \frac{4}{n^2(n-1)^2} \sum_{i<j} k_h(W_{i j}, w) \, k_h(W_{i j}, w')
- \frac{4}{n} \hat f_W(w) \hat f_W(w').
\end{align*}
%
Since $\hat\Sigma_n$ need not be positive semi-definite in finite samples, we also consider the constrained optimization problem
%
\begin{align}
\label{eq:kernel_sdp}
\min_{M \succeq 0} \ \sup_{w, w' \in \cW} \big| M(w,w') - \hat \Sigma_n(w,w') \big|,
\end{align}
%
where the minimization is over symmetric positive semi-definite functions $M$ on $\cW \times \cW$.
\begin{lemma}[Covariance estimation]
\label{lem:kernel_sdp}
Suppose that Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} hold and $f_W(w) > 0$ on $\cW$. Then
%
\begin{align*}
\sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}.
\end{align*} % The optimization problem \eqref{eq:kernel_sdp} is a semi-definite program \citep[SDP,][]{laurent2005semidefinite} and has an approximately optimal solution $\hat\Sigma_n^+$ satisfying % \begin{align*} \sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}. \end{align*} % \end{lemma} In practice we take $w, w' \in \cW_d$ where $\cW_d$ is a finite subset of $\cW$, typically taken to be an equally-spaced grid. This yields finite-dimensional covariance matrices, for which \eqref{eq:kernel_sdp} can be solved in polynomial time in $|\cW_d|$ using a general-purpose SDP solver \citep[e.g.\ by interior point methods,][]{laurent2005semidefinite}. The number of points in $\cW_d$ should be taken as large as is computationally practical in order to generate confidence bands rather than merely simultaneous confidence intervals. It is worth noting that the complexity of solving \eqref{eq:kernel_sdp} does not depend on the number of vertices $n$, and so does not influence the ability of our methodology to handle large and possibly sparse networks. The bias-corrected variance estimator in \citet[Section~3.2]{matsushita2021jackknife} takes a similar form to our estimator $\hat\Sigma_n$ but in the parametric setting, and is therefore also not guaranteed to be positive semi-definite in finite samples. Our approach addresses this issue, ensuring a positive semi-definite estimator $\hat\Sigma_n^+$ is always available. \subsection{Feasible confidence bands} \label{sec:kernel_feasible_confidence_bands} Given a choice of the kernel order $p$ and a bandwidth $h$, we construct a valid confidence band that is implementable in practice. Define the Studentized $t$-statistic process % \begin{align*} \hat T_n(w) = \frac{\hat{f}_W(w) - f_W(w)}{\sqrt{\hat \Sigma_n^+(w,w)}}. \end{align*} % Let $\hat Z_n^T(w)$ be a process which, conditional on the data $\bW_n$, is mean-zero and Gaussian, whose conditional covariance structure is $\E\big[ \hat Z_n^T(w) \hat Z_n^T(w') \bigm\vert \bW_n \big] = \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$. For $\alpha \in (0,1)$, let $\hat q_{1-\alpha}$ be the conditional quantile satisfying $\P\big(\sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq \hat q_{1-\alpha} \bigm\vert \bW_n \big) = 1 - \alpha$, which is shown to be well defined in Section~\ref{sec:kernel_app_proofs}. \begin{theorem}[Feasible uniform confidence bands] \label{thm:kernel_ucb} Suppose that Assumptions \ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_rates} hold and $f_W(w) > 0$ on $\cW$. Then % \begin{align*} \P\left( f_W(w) \in \left[ \hat f_W(w) \pm \hat q_{1-\alpha} \sqrt{\hat\Sigma_n^+(w,w)} \,\right] \,\textup{for all } w \in \cW \right) \to 1 - \alpha. \end{align*} % \end{theorem} Recently, \citet{chiang2022inference} derived high-dimensional central limit theorems over rectangles for exchangeable arrays and applied them to construct simultaneous confidence intervals for a sequence of design points. Their inference procedure relies on the multiplier bootstrap, and their conditions for valid inference depend on the number of design points considered. In contrast, Theorem~\ref{thm:kernel_ucb} constructs a feasible uniform confidence band over the entire domain of inference $\cW$ based on our strong approximation results for the whole $t$-statistic process and the covariance estimator $\hat\Sigma_n^+$. 
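To fix ideas, the following minimal Python sketch assembles such a band on a finite grid $\cW_d$ from a vector of density estimates and a covariance estimate. The positive semi-definite correction shown is plain eigenvalue truncation, a cheap heuristic stand-in for the sup-norm semi-definite program \eqref{eq:kernel_sdp} analyzed above rather than the procedure covered by our theory; the Monte Carlo step resamples Gaussian vectors in the same spirit as the construction of $\hat q_{1-\alpha}$.
\begin{verbatim}
import numpy as np

def feasible_band(fhat, Sigma_hat, alpha=0.05, B=10000, seed=0):
    # fhat: (d,) density estimates on the grid W_d.
    # Sigma_hat: (d, d) covariance estimate on the same grid.
    rng = np.random.default_rng(seed)

    # PSD correction by eigenvalue truncation (a heuristic
    # stand-in for the sup-norm semi-definite program).
    vals, vecs = np.linalg.eigh((Sigma_hat + Sigma_hat.T) / 2.0)
    Sigma_plus = (vecs * np.maximum(vals, 0.0)) @ vecs.T

    # Studentize: correlation matrix of the approximating process,
    # guarding against exactly zero variances at degenerate points.
    sd = np.sqrt(np.clip(np.diag(Sigma_plus), 1e-12, None))
    Corr = Sigma_plus / np.outer(sd, sd)

    # Monte Carlo quantile of sup |Z| for Z ~ N(0, Corr).
    Z = rng.multivariate_normal(np.zeros(sd.size), Corr, size=B,
                                method="eigh")
    q = np.quantile(np.abs(Z).max(axis=1), 1.0 - alpha)
    return fhat - q * sd, fhat + q * sd
\end{verbatim}
The returned lower and upper curves correspond to $\hat f_W(w) \pm \hat q_{1-\alpha} \hat\Sigma_n^+(w,w)^{1/2}$ evaluated at the grid points.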
The required rate condition specified in Assumption~\ref{ass:kernel_rates} does not depend on the number of design points. Furthermore, our proposed inference methods are robust to potential unknown degenerate points in the underlying dyadic data generating process. In practice, suprema over $\cW$ can be replaced by maxima over sufficiently many design points in $\cW$. The conditional quantile $\hat q_{1-\alpha}$ can be estimated by Monte Carlo simulation, resampling from the Gaussian process defined by the law of $\hat Z_n^T \mid \bW_n$. The bandwidth restrictions in Theorem~\ref{thm:kernel_ucb} are the same as those for the infeasible version given in Theorem~\ref{thm:kernel_infeasible_ucb}, namely those imposed in Assumption \ref{ass:kernel_rates}. This follows from the rates of convergence obtained in Lemma~\ref{lem:kernel_sdp}, coupled with some careful technical work given in Section~\ref{sec:kernel_app_proofs} to handle the potential presence of degenerate points in $\Sigma_n$. \subsection{Bandwidth selection and robust bias-corrected inference} \label{sec:kernel_bandwidth_selection} We give practical suggestions for selecting the bandwidth parameter $h$. Let $\nu(w)$ be a non-negative real-valued function on $\cW$ and suppose we use a kernel of order $p < \beta$ of the form $k_h(s,w) = K\big((s-w) / h\big)/h$. The $\nu$-weighted asymptotic IMSE (AIMSE) is minimized by % \begin{align*} h^*_{\AIMSE} &= \left( \frac{p!(p-1)! \Big(\int_\cW f_W(w) \nu(w) \diff{w}\Big) \Big(\int_\R K(w)^2 \diff{w}\Big)} {2 \Big( \int_{\cW} f_W^{(p)}(w)^2 \nu(w) \diff{w} \Big) \Big( \int_\R w^p K(w) \diff{w} \Big)^2 } \right)^{\frac{1}{2p+1}} \left( \frac{n(n-1)}{2} \right)^{-\frac{1}{2p+1}}. \end{align*} % This is akin to the AIMSE-optimal bandwidth choice for traditional monadic kernel density estimation with a sample size of $\frac{1}{2}n(n-1)$. The choice $h^*_{\AIMSE}$ is slightly undersmoothed (up to a polynomial $\log n$ factor) relative to the uniform minimax-optimal bandwidth choice discussed in Section~\ref{sec:kernel_point_estimation}, but it is easier to implement in practice. To implement the AIMSE-optimal bandwidth choice, we propose a simple rule-of-thumb (ROT) approach based on Silverman's rule. Suppose $p\wedge\beta=2$ and let $\hat\sigma^2$ and $\hat I$ be the sample variance and sample interquartile range respectively of the data $\bW_n$. Then $\hat{h}_{\ROT} = C(K) \big( \hat\sigma \wedge \frac{\hat I}{1.349} \big) \big(\frac{n(n-1)}{2} \big)^{-1/5}$, where we have $C(K)=2.576$ for the triangular kernel $K(w) = (1 - |w|) \vee 0$, and $C(K)=2.435$ for the Epanechnikov kernel $K(w) = \frac{3}{4}(1 - w^2) \vee 0$. The AIMSE-optimal bandwidth selector $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ and any of its feasible estimators only satisfy Assumption~\ref{ass:kernel_rates} in the case of no degeneracy ($\Dl>0$). Under partial or total degeneracy, such bandwidths are not valid due to the usual leading smoothing (or misspecification) bias of the distributional approximation. To circumvent this problem and construct feasible uniform confidence bands for $f_W$, we employ the following robust bias correction approach. \begin{algorithm}[b!] \caption{Feasible uniform confidence bands} \label{alg:kernel_method} \setstretch{1.5} Choose a kernel $k_h$ of order $p \geq 2$ satisfying Assumption~\ref{ass:kernel_bandwidth}. \\ Select a bandwidth $h \approx h^*_{\AIMSE}$ for $k_h$ as in Section~\ref{sec:kernel_bandwidth_selection}, perhaps using $h = \hat{h}_{\ROT}$. 
\\ Choose another kernel $k_h'$ of order $p'>p$ satisfying Assumption~\ref{ass:kernel_bandwidth}. For $d \geq 1$, choose a set of $d$ distinct evaluation points $\cW_d$.
\\ For each $w \in \cW_d$, construct the density estimate $\hat f_W(w)$ using $k'_{h}$ as in Section~\ref{sec:kernel_introduction}.
\\ For $w, w' \in \cW_d$, estimate the covariance $\hat \Sigma_n(w,w')$ using $k'_{h}$ as in Section~\ref{sec:kernel_covariance_estimation}.
\\ Construct the positive semi-definite covariance estimate $\hat \Sigma_n^+$ as in Section~\ref{sec:kernel_covariance_estimation}.
\\ For $B \geq 1$, draw $(\hat Z_{n,r}^T: 1\leq r\leq B)$ i.i.d.\ from the conditional law of $\hat{Z}_n^T$ given the data, as in Section~\ref{sec:kernel_feasible_confidence_bands}.
\\ For $\alpha \in (0,1)$, set $\hat q_{1-\alpha} = \inf \big\{ q \in \R : \# \{r: \max_{w\in\cW_d}|\hat Z_{n,r}^T(w)| \leq q \} \geq B(1-\alpha) \big\}$.
\\ Construct $ \big[\hat f_W(w) \pm \hat q_{1-\alpha} \hat\Sigma_n^+(w,w)^{1/2} \big]$ for each $w \in \cW_d$.
%
\end{algorithm}
Firstly, estimate the bandwidth $h^*_{\AIMSE}\asymp n^{-\frac{2}{2p+1}}$ using a kernel of order $p$, which leads to an AIMSE-optimal point estimator $\hat{f}_W$ in an $L^2(\nu)$ sense. Then use this bandwidth and a kernel of order $p' > p$ to construct the statistic $\hat T_n$ and the confidence band as detailed in Section~\ref{sec:kernel_feasible_confidence_bands}. Importantly, both $\hat{f}_W$ and $\hat{\Sigma}^+_n$ are recomputed with the new higher-order kernel. The change in centering is equivalent to a bias correction of the original AIMSE-optimal point estimator, while the change in scale captures the additional variability introduced by the bias correction itself. As shown formally in \citet{calonico2018effect, calonico2022coverage} for the case of kernel-based density estimation with i.i.d.\ data, this approach leads to higher-order refinements in the distributional approximation whenever additional smoothness is available ($p'\leq\beta$). In the present dyadic setting, this procedure is valid so long as $n^{-2/3} (\log n)^{7/3} \ll n^{-\frac{2}{2p+1}} \ll (n^2 \log n)^{-\frac{1}{2p' + 1}}$, which is equivalent to $2 \leq p < p'$. For concreteness, we recommend taking $p = 2$ and $p' = 4$, and using the rule-of-thumb bandwidth choice $\hat{h}_{\ROT}$ defined above. In particular, this approach automatically delivers a KMT-optimal strong approximation whenever there are no degeneracies in the underlying dyadic data generating process. Our feasible robust bias correction method based on AIMSE-optimal dyadic kernel density estimation for constructing uniform confidence bands for $f_W$ is summarized in Algorithm~\ref{alg:kernel_method}.
\section{Simulations}
\label{sec:kernel_simulations}
We investigate the empirical finite-sample performance of the kernel density estimator with dyadic data using simulations. The family of dyadic distributions defined in Section~\ref{sec:kernel_degeneracy}, with its three parameterizations, is used to generate data sets with different degeneracy types. We use two different boundary bias-corrected Epanechnikov kernels of orders $p=2$ and $p=4$ respectively, on the inference domain $\cW = [-2,2]$. We select an optimal bandwidth for $p=2$ as recommended in Section~\ref{sec:kernel_bandwidth_selection}, using the rule-of-thumb with $C(K) = 2.435$. The semi-definite program in Section~\ref{sec:kernel_covariance_estimation} is solved with the MOSEK interior point optimizer \citep{mosek}, ensuring positive semi-definite covariance estimates.
\begin{figure}[b!]
\centering
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/outcome_plot_total.pdf}
\caption{Total degeneracy, \\ $\pi = \left( \frac{1}{2}, 0, \frac{1}{2} \right)$.}
\end{subfigure}
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/outcome_plot_partial.pdf}
\caption{Partial degeneracy, \\ $\pi = \left( \frac{1}{4}, 0, \frac{3}{4} \right)$.}
\end{subfigure}
%
\begin{subfigure}{0.32\textwidth}
\centering
%\includegraphics[scale=0.64]{graphics/outcome_plot_none.pdf}
\caption{No degeneracy, \\ $\pi = \left( \frac{1}{5}, \frac{1}{5}, \frac{3}{5} \right)$.}
\end{subfigure}
%
\caption[Typical outcomes for different values of the parameter $\pi$]
{Typical outcomes for three different values of the parameter $\pi$.}
%
\label{fig:kernel_results}
%
\end{figure}

In Figure~\ref{fig:kernel_results} we plot a typical outcome for each of the three degeneracy types (total, partial, none), using the Epanechnikov kernel of order $p=2$, with sample size $n=100$ (so $N=4950$ pairs of nodes) and with $d=100$ equally-spaced evaluation points. Each plot contains the true density function $f_W$, the dyadic kernel density estimate $\hat f_W$ and two different approximate $95\%$ confidence bands for $f_W$. The first is the uniform confidence band (UCB) constructed using one of our main results, Theorem~\ref{thm:kernel_ucb}. The second is a sequence of pointwise confidence intervals (PCI) constructed by finding a confidence interval for each evaluation point separately. We show only $10$ pointwise confidence intervals for clarity. In general, the PCIs are too narrow as they fail to provide simultaneous (uniform) coverage over the evaluation points. Note that under partial degeneracy the confidence band narrows near the degenerate point $w = 0$.

\begin{table}[b!]
\centering
\begin{tabular}{|c|c|c|c|c|cc|cc|}
\hline
\multirow{2}{*}{$ \pi $} & \multirow{2}{*}{Degeneracy type} & \multirow{2}{*}{$ \hat h_{\ROT} $} & \multirow{2}{*}{$ p $} & \multirow{2}{*}{RIMSE} & \multicolumn{2}{|c|}{UCB} & \multicolumn{2}{|c|}{PCI} \\
\cline{6-9}
& & & & & CR & AW & CR & AW \\
\hline
\multirow{2}{*}{$ \left(\frac{1}{2}, 0, \frac{1}{2}\right) $} & \multirow{2}{*}{Total} & \multirow{2}{*}{0.161} & 2 & 0.00048 & 87.1\% & 0.0028 & 6.5\% & 0.0017 \\
& & & 4 & 0.00068 & 95.2\% & 0.0042 & 9.7\% & 0.0025 \\
\hline
\multirow{2}{*}{$ \left(\frac{1}{4}, 0, \frac{3}{4}\right) $} & \multirow{2}{*}{Partial} & \multirow{2}{*}{0.158} & 2 & 0.00228 & 94.5\% & 0.0112 & 75.6\% & 0.0083 \\
& & & 4 & 0.00234 & 94.7\% & 0.0124 & 65.3\% & 0.0087 \\
\hline
\multirow{2}{*}{$ \left(\frac{1}{5}, \frac{1}{5}, \frac{3}{5}\right) $} & \multirow{2}{*}{None} & \multirow{2}{*}{0.145} & 2 & 0.00201 & 94.2\% & 0.0106 & 73.4\% & 0.0077 \\
& & & 4 & 0.00202 & 95.6\% & 0.0117 & 64.3\% & 0.0080 \\
\hline
\end{tabular}
\caption[Numerical results for three values of the parameter $\pi$]{Numerical results for three values of the parameter $\pi$.}
\label{tab:kernel_results}
\end{table}

Next, Table~\ref{tab:kernel_results} presents numerical results. For each degeneracy type (total, partial, none) and each kernel order ($p=2$, $p=4$), we run $2000$ repeats with sample size $n=3000$ (giving $N=4\,498\,500$ pairs of nodes) and with $d=50$ equally-spaced evaluation points. We record the average rule-of-thumb bandwidth $\hat{h}_{\ROT}$ and the average root integrated mean squared error (RIMSE).
For both the uniform confidence bands (UCB) and the pointwise confidence intervals (PCI), we report the coverage rate (CR) and the average width (AW).
%
The lower-order kernel ($p=2$) does not correct for the leading smoothing bias, leading to good RIMSE performance and acceptable UCB coverage under partial or no degeneracy, but gives invalid inference under total degeneracy. In contrast, the higher-order kernel ($p=4$) provides robust bias correction and hence improves the coverage of the UCB in every regime, particularly under total degeneracy, at the cost of increasing both the RIMSE and the average widths of the confidence bands.
%
As expected, the pointwise (in $w\in\cW$) confidence intervals (PCIs) severely undercover in every regime. Thus our simulation results show that the proposed feasible inference methods based on robust bias correction and proper Studentization deliver valid uniform inference which is robust to unknown degenerate points in the underlying dyadic distribution.

\section{Counterfactual dyadic density estimation}
\label{sec:kernel_counterfactual}

To further showcase the applicability of our main results, we develop a kernel density estimator for dyadic counterfactual distributions. The aim of such counterfactual analysis is to estimate the distribution of an outcome variable had some covariates followed a distribution different from the actual one, and it is important in causal inference and program evaluation settings \citep{dinardo1996distribution,chernozhukov2013inference}. For each $r \in \{0,1\}$, let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$ be random variables as defined in Assumption~\ref{ass:kernel_data} and $\bX_n^r = (X_1^r, \ldots, X_n^r)$ be some covariates. We assume that $(A_i^r, X_i^r)$ are independent over $1 \leq i \leq n$ and that $\bX_n^r$ is independent of $\bV_n^r$, that $W_{i j}^r \mid X_i^r, X_j^r$ has a conditional Lebesgue density $f_{W \mid XX}^r(\,\cdot \mid x_1, x_2) \in \cH^\beta_{C_\rH}(\cW)$, that $X_i^r$ has distribution function $F_X^r$ on a common support $\cX$, and that $(\bA_n^0, \bV_n^0, \bX_n^0)$ is independent of $(\bA_n^1, \bV_n^1, \bX_n^1)$. We interpret $r$ as an index for two populations, labeled $0$ and $1$. The counterfactual density of population $1$ had it followed the same covariate distribution as population $0$ is
%
\begin{align*}
f_W^{1 \triangleright 0}(w)
&= \E\left[ f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big) \right] \\
&= \int_{\cX} \int_{\cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) \diff F_X^{1}(x_1) \diff F_X^{1}(x_2),
\end{align*}
%
where $\psi(x) = \mathrm{d} F_X^0(x) / \mathrm{d} F_X^1(x)$ for $x \in \cX$ is a Radon--Nikodym derivative. If $X^0_i$ and $X^1_i$ have Lebesgue densities, it is natural to consider a parametric model of the form $\mathrm{d} F_X^{r}(x)=f_X^r(x;\theta)\diff x$ for some finite-dimensional parameter $\theta$. Alternatively, if the covariates $\bX_n^r$ are discrete and have a positive probability mass function $p_X^r(x)$ on a finite support $\cX$, the object of interest becomes $f_W^{1 \triangleright 0}(w) = \sum_{x_1 \in \cX} \sum_{x_2 \in \cX} f_{W \mid XX}^{1}(w \mid x_1, x_2) \psi(x_1) \psi(x_2) p_X^{1}(x_1) p_X^{1}(x_2)$, where $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$.
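The reweighting in this discrete case is simple to estimate. The following Python sketch computes the sample-frequency plug-in $\hat\psi(x) = \hat p_X^{\,0}(x) / \hat p_X^{\,1}(x)$ that appears in the estimator defined next; the function name is ours, and the snippet assumes, as in the text, that $p_X^1$ is positive on the finite support $\cX$.
%
\begin{verbatim}
import numpy as np

def psi_hat(X0, X1):
    """Plug-in reweighting psi_hat(x) = p0_hat(x) / p1_hat(x) for
    discrete covariates; X0 and X1 are length-n samples from the
    two populations. Returns a dict over the observed support."""
    support = np.union1d(np.unique(X0), np.unique(X1))
    p0 = {x: np.mean(X0 == x) for x in support}
    p1 = {x: np.mean(X1 == x) for x in support}
    return {x: p0[x] / p1[x] for x in support if p1[x] > 0}
\end{verbatim}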
We consider discrete covariates for simplicity, and hence the counterfactual dyadic kernel density estimator is
%
\begin{align*}
\hat f_W^{\,1 \triangleright 0}(w)
&= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n \hat \psi(X_i^1) \hat \psi(X_j^1) k_h(W_{i j}^1, w),
\end{align*}
%
where $\hat\psi(x) = \hat p_X^{\,0}(x) / \hat p_X^{\,1}(x)$ and $\hat p_X^{\,r}(x) = \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$, with $\I$ the indicator function. Section~\ref{sec:kernel_app_main} provides technical details: we show how an asymptotic linear representation for $\hat\psi(x)$ leads to a Hoeffding-type decomposition of $\hat f_W^{\,1 \triangleright 0}(w)$, which is then used to establish that $\hat f_W^{\,1 \triangleright 0}$ is uniformly consistent for $f_W^{1 \triangleright 0}$ and also admits a Gaussian strong approximation, with the same rates of convergence as for the standard density estimator. Furthermore, define the covariance function of $\hat f_W^{\,1 \triangleright 0}(w)$ as $\Sigma_n^{1 \triangleright 0}(w,w') = \Cov\big[ \hat f_W^{\,1 \triangleright 0}(w), \hat f_W^{\,1 \triangleright 0}(w') \big]$, which can be estimated as follows. First let $\hat\kappa(X_i^0, X_i^1, x) = \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)} - \frac{\hat p_X^0(x)}{\hat p_X^1(x)} \frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)}$ be a plug-in estimate of the influence function for $\hat\psi(x)$ and define the leave-one-out conditional expectation estimators $S_i^{1 \triangleright 0}(w) = \frac{1}{n-1} \big( \sum_{j=1}^{i-1} k_h(W_{j i}^1,w) \hat\psi(X_j^1) + \sum_{j=i+1}^n k_h(W_{i j}^1,w) \hat\psi(X_j^1) \big)$ and $\tilde S_i^{1 \triangleright 0}(w) = \frac{1}{n-1} \sum_{j=1}^n \I\{j \neq i\} \hat\kappa(X_i^0, X_i^1, X_j^1) S_j^{1 \triangleright 0}(w)$. Define the covariance estimator
%
\begin{align*}
\hat\Sigma_n^{1 \triangleright 0}(w,w')
&= \frac{4}{n^2} \sum_{i=1}^n
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) + \tilde S_i^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') + \tilde S_i^{1 \triangleright 0}(w') \big) \\
&\quad- \frac{4}{n^3(n-1)} \sum_{i<j}
\Big(
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w) + \tilde S_i^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_j^1) S_j^{1 \triangleright 0}(w') + \tilde S_j^{1 \triangleright 0}(w') \big) \\
&\qquad+
\big( \hat\psi(X_j^1) S_j^{1 \triangleright 0}(w) + \tilde S_j^{1 \triangleright 0}(w) \big)
\big( \hat\psi(X_i^1) S_i^{1 \triangleright 0}(w') + \tilde S_i^{1 \triangleright 0}(w') \big)
\Big).
\end{align*}
%
Uniform confidence bands for $f_W^{1 \triangleright 0}$ can then be constructed as in Algorithm~\ref{alg:kernel_method}, replacing $\hat f_W$ and $\hat\Sigma_n$ with $\hat f_W^{\,1 \triangleright 0}$ and $\hat\Sigma_n^{1 \triangleright 0}$.

\section{Other statistical applications}
\label{sec:kernel_other_applications}

As a second example, consider the problem of nonparametric dyadic regression: an outcome variable $Y_{i j}$ is observed for each pair of nodes alongside covariates $X_i$ taking values in $\R^m$, and the estimand is the regression function $\mu(x_1, x_2) = \E[Y_{i j} \mid X_i = x_1, X_j = x_2]$. For an integer $\gamma \geq 0$, let $r(x_1, x_2)$ collect the monomials of degree at most $\gamma$ in the $2m$ variables $(x_1, x_2)$. For a bandwidth $h > 0$ and a kernel function $k_h$ on $\R^m \times \R^m$, the local polynomial regression estimator of $\mu(x_1, x_2)$ is $\hat\mu(x_1, x_2) = e_1^\T \hat\beta(x_1, x_2)$ where $e_1$ is the first standard unit vector in $\R^q$ for $q=\binom{2m+\gamma}{\gamma}$ and
%
\begin{align}
\nonumber
\hat{\beta}(x_1, x_2)
&= \argmin_{\beta \in \R^q} \sum_{i=1}^{n-1} \sum_{j=i+1}^n \left( Y_{i j} - r(X_i-x_1, X_j-x_2)^\T \beta \right)^2 k_h(X_i-x_1, X_j-x_2) \\
\label{eq:kernel_locpol}
&= \left( \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} r_{i j}^\T \right)^{-1} \left( \sum_{i=1}^{n-1} \sum_{j=i+1}^n k_{i j} r_{i j} Y_{i j} \right),
\end{align}
%
with $k_{i j} = k_h(X_i-x_1, X_j-x_2)$ and $r_{i j} = r(X_i-x_1, X_j-x_2)$. \citet{graham2021minimax} established pointwise distribution theory for the special case of the dyadic Nadaraya--Watson kernel regression estimator ($\gamma=0$), but no uniform analogues have yet been given. It can be shown that the ``denominator'' matrix in \eqref{eq:kernel_locpol} converges uniformly to its expectation, while the U-process-like ``numerator'' matrix can be handled in the same way as we analyzed $\hat f_W(w)$ in this chapter, through a Hoeffding-type decomposition and strong approximation methods, along with standard bias calculations. Such distributional approximation results can be used to construct valid uniform confidence bands for the regression function $\mu(x_1, x_2)$, as well as to conduct hypothesis testing for parametric specifications or shape constraints.
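As a minimal numerical illustration of \eqref{eq:kernel_locpol}, consider the Nadaraya--Watson case $\gamma = 0$ with scalar covariates ($m = 1$), in which $r \equiv 1$ and the estimator reduces to a kernel-weighted average over pairs. The following Python sketch computes $\hat\mu(x_1, x_2)$ in this case; the Gaussian product kernel and the function name are our own illustrative choices, not those of the chapter.
%
\begin{verbatim}
import numpy as np

def nw_dyadic(Y, X, x1, x2, h):
    """Dyadic Nadaraya-Watson estimator (gamma = 0): Y is an n x n
    array with Y[i, j] the outcome for the pair (i, j), X a length-n
    array of scalar covariates, (x1, x2) the evaluation point."""
    iu, ju = np.triu_indices(len(X), k=1)  # pairs with i < j
    k = np.exp(-((X[iu] - x1) ** 2 + (X[ju] - x2) ** 2) / (2 * h ** 2))
    return np.sum(k * Y[iu, ju]) / np.sum(k)
\end{verbatim}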
As a third example, we consider applying our results to semiparametric semi-linear regression problems. The dyadic semi-linear regression model is $\E[Y_{i j} \mid W_{i j}, X_i, X_j] = \theta^\T W_{i j} + g(X_i, X_j)$, where $\theta$ is the finite-dimensional parameter of interest and $g(X_i, X_j)$ is an unknown function of the covariates $(X_i, X_j)$. Local polynomial (or other) methods can be used to estimate $\theta$ and $g$, where the estimator of the nonparametric component $g$ takes a similar form to \eqref{eq:kernel_locpol}, that is, a ratio of two kernel-based estimators as in \eqref{eq:kernel_estimator}. Consequently, the strong approximation techniques presented in this chapter can be appropriately modified to develop valid uniform inference procedures for $g$ and $\E[Y_{i j} \mid W_{i j}=w, X_i=x_1, X_j=x_2]$, as well as functionals thereof.

\section{Conclusion}
\label{sec:kernel_conclusion}

We studied the uniform estimation and inference properties of the dyadic kernel density estimator $\hat{f}_W$ given in \eqref{eq:kernel_estimator}, which forms a class of U-process-like estimators indexed by the $n$-varying kernel function $k_h$ on $\cW$. We established uniform minimax-optimal point estimation results and uniform distributional approximations for this estimator based on novel strong approximation strategies. We then applied these results to derive valid and feasible uniform confidence bands for the dyadic density estimand $f_W$, and also developed a substantive application of our theory to counterfactual dyadic density analysis. We gave some other statistical applications of our methodology as well as potential avenues for future research. From a technical perspective, Appendix~\ref{app:kernel} contains several generic results concerning strong approximation methods and maximal inequalities for empirical processes that may be of independent interest. Implementations of this chapter's methodology, along with replication files for the empirical results, are provided by a Julia package available at \github{wgunderwood/DyadicKDE.jl}.

This work is based on \citet{cattaneo2024uniform}, and has been presented by Cattaneo at the Columbia University Biostatistics Colloquium Seminar (2022) and the Georgia Institute of Technology Statistics Seminar (2022), by Feng at the Renmin University Econometrics Seminar (2022), the Xiamen University Symposium on Modern Statistics (2022), the Peking University Econometrics Seminar (2023), and the Asian Meeting of the Econometric Society in East and Southeast Asia, Singapore (2023), and by Underwood at the University of Illinois Statistics Seminar (2024), the University of Michigan Statistics Seminar (2024), and the University of Pittsburgh Statistics Seminar (2024).

\chapter[Yurinskii's Coupling for Martingales]%
{Yurinskii's Coupling \\ for Martingales}
\label{ch:yurinskii}

% abstract
Yurinskii's coupling is a popular theoretical tool for non-asymptotic distributional analysis in mathematical statistics and applied probability, offering a Gaussian strong approximation with an explicit error bound under easily verified conditions.
Originally stated in $\ell^2$-norm for sums of independent random vectors, it has recently been extended both to the $\ell^p$-norm, for $1 \leq p \leq \infty$, and to vector-valued martingales in $\ell^2$-norm, under some strong conditions. We present as our main result a Yurinskii coupling for approximate martingales in $\ell^p$-norm, under substantially weaker conditions than those previously imposed. Our formulation further allows for the coupling variable to follow a more general Gaussian mixture distribution, and we provide a novel third-order coupling method which gives tighter approximations in certain settings. We specialize our main result to mixingales, martingales, and independent data, and derive uniform Gaussian mixture strong approximations for martingale empirical processes. Substantive applications of our theory to nonparametric partitioning-based and local polynomial regression procedures are provided. \section{Introduction} Yurinskii's coupling \citep{yurinskii1978error} has proven to be an important theoretical tool for developing non-asymptotic distributional approximations in mathematical statistics and applied probability. For a sum $S$ of $n$ independent zero-mean $d$-dimensional random vectors, this coupling technique constructs (on a suitably enlarged probability space) a zero-mean $d$-dimensional Gaussian vector $T$ with the same covariance matrix as $S$ and which is close to $S$ in probability, bounding the discrepancy $\|S-T\|$ as a function of $n$, $d$, the choice of the norm, and some features of the underlying distribution. See, for example, \citet[Chapter 10]{pollard2002user} for a textbook introduction. When compared to other coupling approaches, such as the celebrated Hungarian construction \citep{komlos1975approximation} or Zaitsev's coupling \citep{zaitsev1987estimates,zaitsev1987gaussian}, Yurinskii's approach stands out for its simplicity, robustness, and wider applicability, while also offering tighter couplings in some applications (see below for more discussion and examples). These features have led many scholars to use Yurinskii's coupling to study the distributional features of high-dimensional statistical procedures in a variety of settings, often with the end goal of developing uncertainty quantification or hypothesis testing methods. For example, in recent years, Yurinskii's coupling has been used to construct Gaussian approximations for the suprema of empirical processes \citep{chernozhukov2014gaussian}; to establish distribution theory for non-Donsker stochastic $t$-processes generated in nonparametric series regression \citep{belloni2015some}; to prove distributional approximations for high-dimensional $\ell^p$-norms \citep{biau2015high}; to develop distribution theory for vector-valued martingales \citep{belloni2018high,li2020uniform}; to derive a law of the iterated logarithm for stochastic gradient descent optimization methods \citep{anastasiou2019normal}; to establish uniform distributional results for nonparametric high-dimensional quantile processes \citep{belloni2019conditional}; to develop distribution theory for non-Donsker stochastic $t$-processes generated in partitioning-based series regression \citep{cattaneo2020large}; to deduce Bernstein--von Mises theorems in high-dimensional settings \citep{ray2021bernstein}; and to develop distribution theory for non-Donsker U-processes based on dyadic network data \citep{cattaneo2024uniform}. 
There are also many other early applications of Yurinskii's coupling: \citet{dudley1983invariance} and \citet{dehling1983limit} establish invariance principles for Banach space-valued random variables, and \citet{lecam1988} and \citet{sheehy1992uniform} obtain uniform Donsker results for empirical processes, to name just a few. This chapter presents a new Yurinskii coupling which encompasses and improves upon all of the results previously available in the literature, offering four new features: % \begin{enumerate}[label=(\roman*),leftmargin=*] \item \label{it:yurinskii_contribution_approximate_martingale} It applies to vector-valued \textit{approximate martingale} data. \item \label{it:yurinskii_contribution_gaussian_mixture} It allows for a \textit{Gaussian mixture} coupling distribution. \item \label{it:yurinskii_contribution_degeneracy} It imposes \textit{no restrictions on degeneracy} of the data covariance matrix. \item \label{it:yurinskii_contribution_third_order} It establishes a \textit{third-order} coupling to improve the approximation in certain situations. \end{enumerate} % Closest to our work are the unpublished manuscript by \citet{belloni2018high} and the recent paper by \citet{li2020uniform}, which both investigated distribution theory for martingale data using Yurinskii's coupling and related methods. Specifically, \citet{li2020uniform} established a Gaussian $\ell^2$-norm Yurinskii coupling for mixingales and martingales under the assumption that the covariance structure has a minimum eigenvalue bounded away from zero. As formally demonstrated in this chapter (Section~\ref{sec:yurinskii_kde}), such eigenvalue assumptions can be prohibitively strong in practically relevant applications. In contrast, our Yurinskii coupling does not impose any restrictions on covariance degeneracy \ref{it:yurinskii_contribution_degeneracy}, in addition to offering several other new features not present in \citet{li2020uniform}, including \ref{it:yurinskii_contribution_approximate_martingale}, \ref{it:yurinskii_contribution_gaussian_mixture}, \ref{it:yurinskii_contribution_third_order}, and applicability to general $\ell^p$-norms. In addition, we correct a slight technical inaccuracy in their proof relating to the derivation of bounds in probability (Remark \ref{rem:yurinskii_coupling_bounds_probability}). \citet{belloni2018high} did not establish a Yurinskii coupling for martingales, but rather a central limit theorem for smooth functions of high-dimensional martingales using the celebrated second-order Lindeberg method \citep[see][and references therein]{chatterjee2006generalization}, explicitly accounting for covariance degeneracy. As a consequence, their result could be leveraged to deduce a Yurinskii coupling for martingales with additional, non-trivial technical work (see Section~\ref{sec:yurinskii_app_proofs} in Appendix~\ref{app:yurinskii} for details). Nevertheless, a Yurinskii coupling derived from \citet{belloni2018high} would not feature \ref{it:yurinskii_contribution_approximate_martingale}, \ref{it:yurinskii_contribution_gaussian_mixture}, \ref{it:yurinskii_contribution_third_order}, or general $\ell^p$-norms, as our results do. We discuss further the connections between our work and the related literature in the upcoming sections, both when introducing our main theoretical results and when presenting the examples and statistical applications. 
The most general coupling result of this chapter (Theorem~\ref{thm:yurinskii_sa_dependent}) is presented in Section~\ref{sec:yurinskii_main_results}, where we also specialize it to a slightly weaker yet more user-friendly formulation (Proposition~\ref{pro:yurinskii_sa_simplified}). Our Yurinskii coupling for approximate martingales is a strict generalization of all previous Yurinskii couplings available in the literature, offering a Gaussian mixture strong approximation for approximate martingale vectors in $\ell^p$-norm, with an improved rate of approximation when the third moments of the data are negligible, and with no assumptions on the spectrum of the data covariance matrix. A key technical innovation underlying the proof of Theorem~\ref{thm:yurinskii_sa_dependent} is that we explicitly account for the possibility that the minimum eigenvalue of the variance may be zero, or its lower bound may be unknown, with the argument proceeding using a carefully tailored regularization. Establishing a coupling to a Gaussian mixture distribution is achieved by an appropriate conditioning argument, leveraging a conditional version of Strassen's theorem established by \citet{chen2020jackknife}, along with some related technical work detailed in Section~\ref{sec:yurinskii_app_proofs}. A third-order coupling is obtained via a modification of a standard smoothing technique for Borel sets from classical versions of Yurinskii's coupling, enabling improved approximation errors whenever third moments are negligible. In Proposition~\ref{pro:yurinskii_sa_simplified}, we explicitly tune the parameters of the aforementioned regularization to obtain a simpler, parameter-free version of Yurinskii's coupling for approximate martingales, again offering Gaussian mixture coupling distributions and an improved third-order approximation error. This specialization of our main result takes an agnostic approach to potential singularities in the data covariance matrix and, as such, may be improved in specific applications where additional knowledge of the covariance structure is available. Section~\ref{sec:yurinskii_main_results} also presents some further refinements when additional structure is imposed, deriving Yurinskii couplings for mixingales, martingales, and independent data as Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep}, respectively. We take the opportunity to discuss and correct in Remark~\ref{rem:yurinskii_coupling_bounds_probability} a technical issue which is often neglected \citep{pollard2002user, li2020uniform} when using Yurinskii's coupling to derive bounds in probability. Section~\ref{sec:yurinskii_factor} presents a stylized example portraying the relevance of our main technical results in the context of canonical factor models, illustrating the importance of each of our new Yurinskii coupling features \ref{it:yurinskii_contribution_approximate_martingale}--% \ref{it:yurinskii_contribution_third_order}. Section~\ref{sec:yurinskii_emp_proc} considers a substantive application of our main results: strong approximation of martingale empirical processes. We begin with the motivating example of canonical kernel density estimation, demonstrating how Yurinskii's coupling can be applied, and showing in Lemma~\ref{lem:yurinskii_kde_eigenvalue} why it is essential that we do not place any conditions on the minimum eigenvalue of the variance matrix \ref{it:yurinskii_contribution_degeneracy}. 
We then present a general-purpose strong approximation for martingale empirical processes in Proposition~\ref{pro:yurinskii_emp_proc}, combining classical results in the empirical process literature \citep{van1996weak} with our Corollary~\ref{cor:yurinskii_sa_martingale}. This statement appears to be the first of its kind for martingale data, and when specialized to independent (and not necessarily identically distributed) data, it is shown to be superior to the best known comparable strong approximation result available in the literature \citep{berthet2006revisiting}. Our improvement comes from using Yurinskii's coupling for the $\ell^\infty$-norm, where \citet{berthet2006revisiting} apply Zaitsev's coupling \citep{zaitsev1987estimates, zaitsev1987gaussian} with the larger $\ell^2$-norm. Section~\ref{sec:yurinskii_nonparametric} further illustrates the applicability of our results through two examples in nonparametric regression estimation. Firstly, we deduce a strong approximation for partitioning-based least squares series estimators with time series data, applying Corollary~\ref{cor:yurinskii_sa_martingale} directly and additionally imposing only a mild mixing condition on the regressors. We show that our Yurinskii coupling for martingale vectors delivers the same distributional approximation rate as the best known result for independent data, and discuss how this can be leveraged to yield a feasible statistical inference procedure. We also show that if the residuals have vanishing conditional third moment, an improved rate of Gaussian approximation can be established. Secondly, we deduce a strong approximation for local polynomial estimators with time series data, using our result on martingale empirical processes (Proposition~\ref{pro:yurinskii_emp_proc}) and again imposing a mixing assumption. Appealing to empirical process theory is essential here as, in contrast with series estimators, local polynomials do not possess certain additive separability properties. The bandwidth restrictions we require are relatively mild, and, as far as we know, they have not been improved upon even with independent data. Section \ref{sec:yurinskii_conclusion} concludes the chapter. All proofs are collected in Appendix~\ref{app:yurinskii}, which also includes other technical lemmas of potential independent interest, alongside some further results on applications of our theory to deriving high-dimensional central limit theorems for martingales in Section~\ref{sec:yurinskii_app_high_dim_clt}. \subsection{Notation} We write $\|x\|_p$ for $p\in[1,\infty]$ to denote the $\ell^p$-norm if $x$ is a (possibly random) vector or the induced operator $\ell^p$--$\ell^p$-norm if $x$ is a matrix. For $X$ a real-valued random variable and an Orlicz function $\psi$, we use $\vvvert X \vvvert_\psi$ to denote the Orlicz $\psi$-norm \citep[Section~2.2]{van1996weak} and $\vvvert X \vvvert_p$ for the $L^p(\P)$-norm where $p\in [1,\infty]$. For a matrix $M$, we write $\|M\|_{\max}$ for the maximum absolute entry and $\|M\|_\rF$ for the Frobenius norm. We denote positive semi-definiteness by $M \succeq 0$ and write $I_d$ for the $d \times d$ identity matrix. For scalar sequences $x_n$ and $y_n$, we write $x_n \lesssim y_n$ if there exists a positive constant $C$ such that $|x_n| \leq C |y_n|$ for sufficiently large $n$. We write $x_n \asymp y_n$ to indicate both $x_n \lesssim y_n$ and $y_n \lesssim x_n$. 
Similarly, for random variables $X_n$ and $Y_n$, we write $X_n \lesssim_\P Y_n$ if for every $\varepsilon > 0$ there exists a positive constant $C$ such that $\P(|X_n| > C |Y_n|) \leq \varepsilon$ for all sufficiently large $n$, and write $X_n \to_\P X$ for limits in probability. For real numbers $a$ and $b$ we use $a \vee b = \max\{a,b\}$. We write $\kappa \in \N^d$ for a multi-index, where $\N = \{0, 1, 2, \ldots\}$, and define $|\kappa| = \sum_{j=1}^d \kappa_j$ and $x^\kappa = \prod_{j=1}^d x_j^{\kappa_j}$ for $x \in \R^d$, and $\kappa! = \prod_{j=1}^{d} \kappa_j !$. Since our results concern couplings, some statements must be made on a new or enlarged probability space. We omit the details of this for clarity of notation, but technicalities are handled by the Vorob'ev--Berkes--Philipp Theorem~\citep[Theorem~1.1.10]{dudley1999uniform}.

\section{Main results}
\label{sec:yurinskii_main_results}

We begin with our most general result: an $\ell^p$-norm Yurinskii coupling of a sum of vector-valued approximate martingale differences to a Gaussian mixture-distributed random vector. The general result is presented in Theorem~\ref{thm:yurinskii_sa_dependent}, while Proposition~\ref{pro:yurinskii_sa_simplified} gives a simplified and slightly weaker version which is easier to use in applications. We then further specialize Proposition~\ref{pro:yurinskii_sa_simplified} to three scenarios with successively stronger assumptions, namely mixingales, martingales, and independent data in Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} respectively. In each case we allow for possibly random quadratic variations (cf.\ mixing convergence), thereby establishing a Gaussian mixture coupling in the general setting. In Remark~\ref{rem:yurinskii_coupling_bounds_probability} we comment on and correct an often overlooked technicality relating to the derivation of bounds in probability from Yurinskii's coupling. As a first illustration of the power of our generalized $\ell^p$-norm Yurinskii coupling, we present in Section~\ref{sec:yurinskii_factor} a simple factor model example relating to all three of the aforementioned scenarios.

\begin{theorem}[Strong approximation for vector-valued approximate martingales]
\label{thm:yurinskii_sa_dependent}

Take a complete probability space with a countably generated filtration $\cH_0, \ldots, \cH_n$ for $n \geq 1$, supporting the $\R^d$-valued square-integrable variables $X_1, \ldots, X_n$. Let $S = \sum_{i=1}^n X_i$ and define
%
\begin{align*}
\tilde X_i &= \sum_{r=1}^n \big(\E[X_{r} \mid \cH_{i}] - \E[X_{r} \mid \cH_{i-1}]\big) & &\text{and} &U &= \sum_{i=1}^{n} \big( X_i - \E[ X_i \mid \cH_n] + \E[ X_i \mid \cH_0 ] \big).
\end{align*}
%
Let $V_i = \Var[\tilde X_i \mid \cH_{i-1}]$ and define $\Omega = \sum_{i=1}^n V_i - \Sigma$ where $\Sigma$ is an almost surely positive semi-definite $\cH_0$-measurable $d \times d$ matrix.
Then, for each $\eta > 0$ and $p \in [1,\infty]$, there exists, on an enlarged probability space, an $\R^d$-valued random vector $T$ with $T \mid \cH_0 \sim \cN(0, \Sigma)$ and % \begin{align} \label{eq:yurinskii_sa_dependent} \P\big(\|S-T\|_p > 6\eta\big) &\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big) + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \nonumber \\ &\quad+ \inf_{M \succeq 0} \Big\{ 2 \P\big(\Omega \npreceq M\big) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\Big\} +\P\big(\|U\|_p>\eta\big), \end{align} % where $Z, Z_1,\dots ,Z_n$ are i.i.d.\ standard Gaussian random variables on $\R^d$ independent of $\cH_n$, the second infimum is taken over all positive semi-definite $d \times d$ non-random matrices $M$, % \begin{align*} \beta_{p,k} &= \sum_{i=1}^n \E\left[\| \tilde X_i \|^k_2 \| \tilde X_i \|_p + \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], &\pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| \E [ \tilde X_i^\kappa \mid \cH_{i-1} ] \big| \Big] \end{align*} % for $k \in \{2, 3\}$, with $\pi_3 = \infty$ if the associated conditional expectation does not exist, and with % \begin{align*} \delta_p(M,\eta) &= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right), \\ \varepsilon_p(M, \eta) &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right). \end{align*} \end{theorem} This theorem offers four novel contributions to the literature on coupling theory and strong approximation, as discussed in the introduction. % approximate martingales Firstly \ref{it:yurinskii_contribution_approximate_martingale}, it allows for approximate vector-valued martingales, with the variables $\tilde X_i$ forming martingale differences with respect to $\cH_i$ by construction, and $U$ quantifying the associated martingale approximation error. Such martingale approximation techniques for sequences of dependent random vectors are well established and have been used in a range of scenarios: see, for example, \citet{wu2004martingale}, \citet{dedecker2007weak}, \citet{zhao2008martingale}, \citet{peligrad2010conditional}, \citet{atchade2014martingale}, \citet{cuny2014martingale}, \citet{magda2018martingale}, and references therein. In Section~\ref{sec:yurinskii_mixingales} we demonstrate how this approximation can be established in practice by restricting our general theorem to the special case of mixingales, while the upcoming example in Section~\ref{sec:yurinskii_factor} provides an illustration in the context of auto-regressive factor models. % Gaussian mixture Secondly \ref{it:yurinskii_contribution_gaussian_mixture}, Theorem~\ref{thm:yurinskii_sa_dependent} allows for the resulting coupling variable $T$ to follow a multivariate Gaussian distribution only conditionally, and thus we offer a useful analog of mixing convergence in the context of strong approximation. To be more precise, the random matrix $\sum_{i=1}^{n} V_i$ is the quadratic variation of the constructed martingale $\sum_{i=1}^n \tilde X_i$, and we approximate it using the $\cH_0$-measurable random matrix $\Sigma$. This yields the coupling variable $T \mid \cH_0 \sim \cN(0, \Sigma)$, which can alternatively be written as $T=\Sigma^{1/2} Z$ with $Z \sim \cN(0,I_d)$ independent of $\cH_0$. 
The errors in this quadratic variation approximation are accounted for by the terms $\P(\Omega \npreceq M)$, $\delta_p(M, \eta)$, and $\varepsilon_p(M, \eta)$, utilizing a regularization argument through the free matrix parameter $M$. If a non-random $\Sigma$ is used, then $T$ is unconditionally Gaussian, and one can take $\cH_0$ to be the trivial $\sigma$-algebra. As demonstrated in our proof, our approach to establishing a mixing approximation is different from naively taking an unconditional version of Yurinskii's coupling and applying it conditionally on $\cH_0$, which will not deliver the same coupling as in Theorem~\ref{thm:yurinskii_sa_dependent} for a few reasons. To begin with, we explicitly indicate in the conditions of Theorem~\ref{thm:yurinskii_sa_dependent} where conditioning is required. Next, our error of approximation is given unconditionally, involving only marginal expectations and probabilities. Finally, we provide a rigorous account of the construction of the conditionally Gaussian coupling variable $T$ via a conditional version of Strassen's theorem \citep{chen2020jackknife}. Section~\ref{sec:yurinskii_martingales} illustrates how a strong approximation akin to mixing convergence can arise when the data forms an exact martingale, and Section~\ref{sec:yurinskii_factor} gives a simple example relating to factor modeling in statistics and data science. % remove lower bound on minimum eigenvalue As a third contribution to the literature \ref{it:yurinskii_contribution_degeneracy}, and of particular importance for applications, Theorem~\ref{thm:yurinskii_sa_dependent} makes no requirements on the minimum eigenvalue of the quadratic variation of the approximating martingale sequence. Instead, our proof technique employs a careful regularization scheme designed to account for any such exact or approximate rank degeneracy in $\Sigma$. This capability is fundamental in some applications, a fact which we illustrate in Section \ref{sec:yurinskii_kde} by demonstrating the significant improvements in strong approximation errors delivered by Theorem~\ref{thm:yurinskii_sa_dependent} relative to those obtained using prior results in the literature. % matching third moments Finally \ref{it:yurinskii_contribution_third_order}, Theorem~\ref{thm:yurinskii_sa_dependent} gives a third-order strong approximation alongside the usual second-order version considered in all prior literature. More precisely, we observe that an analog of the term $\beta_{p,2}$ is present in the classical Yurinskii coupling and comes from a Lindeberg telescoping sum argument, replacing random variables by Gaussians with the same mean and variance to match the first and second moments. Whenever the third moments of $\tilde X_i$ are negligible (quantified by $\pi_3$), this moment-matching argument can be extended to third-order terms, giving a new term $\beta_{p,3}$. In certain settings, such as when the data is symmetrically distributed around zero, using $\beta_{p,3}$ rather than $\beta_{p,2}$ can give smaller approximation errors in the coupling given in \eqref{eq:yurinskii_sa_dependent}. Such a refinement can be viewed as a strong approximation counterpart to classical Edgeworth expansion methods. We illustrate this phenomenon in our upcoming applications to nonparametric inference (Section~\ref{sec:yurinskii_nonparametric}). 
\subsection{User-friendly formulation of the main result}%

The result in Theorem~\ref{thm:yurinskii_sa_dependent} is given in a somewhat implicit manner, involving infima over the free parameters $t > 0$ and $M \succeq 0$, and it is not clear how to compute these in general. In the upcoming Proposition~\ref{pro:yurinskii_sa_simplified}, we set $M = \nu^2 I_d$ and approximately optimize over $t > 0$ and $\nu > 0$, resulting in a simplified and slightly weaker version of our main general result. In specific applications, where there is additional knowledge of the quadratic variation structure, other choices of regularization schemes may be more appropriate. Nonetheless, the choice $M = \nu^2 I_d$ leads to arguably the principal result of our work, due to its simplicity and utility in statistical applications. For convenience, define the functions $\phi_p : \N \to \R$ for $p \in [1, \infty]$,
%
\begin{align*}
\phi_p(d) =
\begin{cases}
\sqrt{pd^{2/p} } & \text{ if } p \in [1,\infty), \\
\sqrt{2\log 2d} & \text{ if } p =\infty,
\end{cases}
\end{align*}
%
which are related to tail probabilities of the $\ell^p$-norm of a standard Gaussian vector.

\begin{proposition}[Simplified strong approximation for approximate martingales]%
\label{pro:yurinskii_sa_simplified}

Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}. For each $\eta > 0$ and $p \in [1,\infty]$, there exists a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} +\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
If further $\pi_3 = 0$ then
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} +\P\left(\|U\|_p>\frac{\eta}{6}\right).
\end{align*}
%
\end{proposition}

Proposition~\ref{pro:yurinskii_sa_simplified} makes clear the potential benefit of a third-order coupling when $\pi_3 = 0$, as in this case the bound features $\beta_{p,3}^{1/4}$ rather than $\beta_{p,2}^{1/3}$. If $\pi_3$ is small but non-zero, an analogous result can easily be derived by adjusting the optimal choices of $t$ and $\nu$, but we omit this for clarity of notation. In applications (see Section~\ref{sec:yurinskii_series}), this reduction of the exponent can provide a significant improvement in terms of the dependence of the bound on the sample size $n$, the dimension $d$, and other problem-specific quantities. When using our results for strong approximation, it is usual to set $p = \infty$ to bound the maximum discrepancy over the entries of a vector (to construct uniform confidence sets, for example). In this setting, $\phi_\infty(d) = \sqrt{2 \log 2d}$ exhibits only a slow, logarithmic growth in the dimension. The remaining term depends on $\E[\|\Omega\|_2]$ and requires that the matrix $\Sigma$ be a good approximation of $\sum_{i=1}^{n} V_i$, while remaining $\cH_0$-measurable. In some applications (such as factor modeling; see Section~\ref{sec:yurinskii_factor}), it can be shown that the quadratic variation $\sum_{i=1}^n V_i$ remains random and $\cH_0$-measurable even in large samples, giving a natural choice for $\Sigma$.
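To fix ideas, the following Python sketch evaluates $\phi_p(d)$ and the two bounds of Proposition~\ref{pro:yurinskii_sa_simplified}. It is a schematic illustration under our own naming: the inputs standing for $\beta_{p,2}$, $\beta_{p,3}$, $\E[\|\Omega\|_2]$, and the martingale error probability are hypothetical placeholders to be supplied by the application at hand.
%
\begin{verbatim}
import numpy as np

def phi(p, d):
    """phi_p(d): sqrt(p * d^(2/p)) for finite p, sqrt(2 log 2d) at infinity."""
    if np.isinf(p):
        return np.sqrt(2 * np.log(2 * d))
    return np.sqrt(p * d ** (2 / p))

def coupling_bound(eta, d, beta2, E_Omega, p=np.inf, beta3=None, P_U=0.0):
    """Bound on P(||S - T||_p > eta) from the proposition; uses the
    third-order version when beta3 is supplied (i.e. when pi_3 = 0)."""
    reg = 17 * (E_Omega * phi(p, d) ** 2 / eta ** 2) ** (1 / 3)
    if beta3 is None:
        return 24 * (beta2 * phi(p, d) ** 2 / eta ** 3) ** (1 / 3) + reg + P_U
    return 24 * (beta3 * phi(p, d) ** 3 / eta ** 4) ** (1 / 4) + reg + P_U

# phi_inf(d) grows only logarithmically in the dimension:
print([round(phi(np.inf, d), 2) for d in (10, 10**3, 10**5)])
\end{verbatim}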
In the next few sections, we continue to refine Proposition~\ref{pro:yurinskii_sa_simplified}, presenting a sequence of results with increasingly strict assumptions on the dependence structure of the data $X_i$. These allow us to demonstrate the broad applicability of our main results, providing more explicit bounds in settings which are likely to be of special interest. In particular, we consider mixingales, martingales, and independent data, comparing our derived results with those in the existing literature.

\subsection{Mixingales}
\label{sec:yurinskii_mixingales}

In our first refinement, we provide a natural method for bounding the martingale approximation error term $U$. Suppose that $X_i$ form an $\ell^p$-mixingale in $L^1(\P)$ in the sense that there exist non-negative $c_1, \ldots, c_n$ and $\zeta_0, \ldots, \zeta_n$ such that for all $1 \leq i \leq n$ and $0 \leq r \leq i$,
%
\begin{align}
\label{eq:yurinskii_mixingale_1}
\E \left[ \left\| \E \left[ X_i \mid \cH_{i-r} \right] \right\|_p \right] &\leq c_i \zeta_r,
\end{align}
%
and for all $1 \leq i \leq n$ and $0 \leq r \leq n-i$,
%
\begin{align}
\label{eq:yurinskii_mixingale_2}
\E \left[ \big\| X_i - \E \big[ X_i \mid \cH_{i+r} \big] \big\|_p \right] &\leq c_i \zeta_{r+1}.
\end{align}
%
These conditions are satisfied, for example, if $X_i$ are integrable strongly $\alpha$-mixing random variables \citep{mcleish1975invariance}, or if $X_i$ are generated by an auto-regressive or auto-regressive moving average process (see Section~\ref{sec:yurinskii_factor}), among many other possibilities \citep{bradley2005basic}. Then, in the notation of Theorem~\ref{thm:yurinskii_sa_dependent}, we have by Markov's inequality that
%
\begin{align*}
\P \left( \|U\|_p > \frac{\eta}{6} \right) &\leq \frac{6}{\eta} \sum_{i=1}^{n} \E \left[ \big\| X_i - \E \left[ X_i \mid \cH_n \right] \big\|_p + \big\| \E \left[ X_i \mid \cH_0 \right] \big\|_p \right] \leq \frac{\zeta}{\eta},
\end{align*}
%
with $\zeta = 6 \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. Combining Proposition~\ref{pro:yurinskii_sa_simplified} with this martingale error bound yields the following result for mixingales.
%
\begin{corollary}[Strong approximation for vector-valued mixingales]%
\label{cor:yurinskii_sa_mixingale}

Assume the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, and suppose the mixingale conditions \eqref{eq:yurinskii_mixingale_1} and \eqref{eq:yurinskii_mixingale_2} hold. For each $\eta > 0$ and $p \in [1,\infty]$ there is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} + \frac{\zeta}{\eta}.
\end{align*}
%
If further $\pi_3 = 0$ then
%
\begin{align*}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} + \frac{\zeta}{\eta}.
\end{align*}
%
\end{corollary}
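For a concrete check of conditions \eqref{eq:yurinskii_mixingale_1} and \eqref{eq:yurinskii_mixingale_2}, consider the scalar auto-regressive process $X_i = \rho X_{i-1} + u_i$ with $|\rho| < 1$ and $X_0 = 0$, where $u_1, \ldots, u_n$ are independent, integrable, and zero-mean, and $\cH_i$ is generated by $u_1, \ldots, u_i$. Then $\E[X_i \mid \cH_{i-r}] = \rho^r X_{i-r}$, so \eqref{eq:yurinskii_mixingale_1} holds with $c_i = \max_{1 \leq j \leq n} \E[|X_j|]$ and $\zeta_r = |\rho|^r$, while the left-hand side of \eqref{eq:yurinskii_mixingale_2} vanishes because $X_i$ is $\cH_i$-measurable. Hence $\zeta = 6 \sum_{i=1}^{n} c_i (\zeta_i + \zeta_{n-i+1}) \leq 12 \max_{1 \leq j \leq n} c_j \, |\rho| / (1 - |\rho|)$, which remains bounded as $n$ grows.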
The closest antecedent to Corollary~\ref{cor:yurinskii_sa_mixingale} is found in \citet[Theorem~4]{li2020uniform}, who also considered Yurinskii's coupling for mixingales. Our result improves on this work in the following manner: it removes any requirements on the minimum eigenvalue of the quadratic variation of the mixingale sequence; it allows for general $\ell^p$-norms with $p\in[1,\infty]$; it establishes a coupling to a multivariate Gaussian mixture distribution in general; and it permits third-order couplings (when $\pi_3=0$). These improvements have important practical implications as demonstrated in Sections \ref{sec:yurinskii_factor} and \ref{sec:yurinskii_nonparametric}, where significantly better coupling approximation errors are demonstrated for a variety of statistical applications. On the technical side, our result is rigorously established using a conditional version of Strassen's theorem \citep{chen2020jackknife}, a carefully crafted regularization argument, and a third-order Lindeberg method \citep[see][and references therein, for more discussion on the standard second-order Lindeberg method]{chatterjee2006generalization}. Furthermore, as explained in Remark~\ref{rem:yurinskii_coupling_bounds_probability}, we clarify a technical issue in \citet{li2020uniform} surrounding the derivation of valid probability bounds for $\|S-T\|_p$. Corollary~\ref{cor:yurinskii_sa_mixingale} focused on mixingales for simplicity, but, as previously discussed, any method for constructing a martingale approximation $\tilde X_i$ and bounding the resulting error $U$ could be used instead in Proposition~\ref{pro:yurinskii_sa_simplified} to derive a similar result.

\subsection{Martingales}
\label{sec:yurinskii_martingales}

For our second refinement, suppose that $X_i$ form martingale differences with respect to $\cH_i$. In this case, $\E[X_i \mid \cH_n] = X_i$ and $\E[X_i \mid \cH_0] = 0$, so $U = 0$, and the martingale approximation error term vanishes. Applying Proposition~\ref{pro:yurinskii_sa_simplified} in this setting directly yields the following result.
%
\begin{corollary}[Strong approximation for vector-valued martingales]%
\label{cor:yurinskii_sa_martingale}

With the setup and notation of Theorem~\ref{thm:yurinskii_sa_dependent}, suppose that $X_i$ is $\cH_i$-measurable satisfying $\E[X_i \mid \cH_{i-1}] = 0$ for $1 \leq i \leq n$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, there is a random vector $T \mid \cH_0 \sim \cN(0, \Sigma)$ with
%
\begin{align}
\label{eq:yurinskii_sa_martingale_order_2}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}.
\end{align}
%
If further $\pi_3 = 0$ then
%
\begin{align}
\label{eq:yurinskii_sa_martingale_order_3}
\P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}.
\end{align}
%
\end{corollary}

The closest antecedents to Corollary~\ref{cor:yurinskii_sa_martingale} are \citet{belloni2018high} and \citet{li2020uniform}, who also implicitly or explicitly considered Yurinskii's coupling for martingales.
More specifically, \citet[Theorem~1]{li2020uniform} established an explicit $\ell^2$-norm Yurinskii coupling for martingales under a strong assumption on the minimum eigenvalue of the martingale quadratic variation, while \citet[Theorem~2.1]{belloni2018high} established a central limit theorem for vector-valued martingale sequences employing the standard second-order Lindeberg method, implying that their proof could be adapted to deduce a Yurinskii coupling for martingales with the help of a conditional version of Strassen's theorem \citep{chen2020jackknife} and some additional nontrivial technical work. Corollary~\ref{cor:yurinskii_sa_martingale} improves over this prior work as follows. With respect to \citet{li2020uniform}, our result establishes an $\ell^p$-norm Gaussian mixture Yurinskii coupling for martingales without any requirements on the minimum eigenvalue of the martingale quadratic variation, and permits a third-order coupling if $\pi_3=0$. The first probability bound \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} gives the same rate of strong approximation as that in Theorem~1 of \citet{li2020uniform} when $p=2$, with non-random $\Sigma$, and when the eigenvalues of a normalized version of $\Sigma$ are bounded away from zero. In Section~\ref{sec:yurinskii_kde} we demonstrate the crucial importance of removing this eigenvalue lower bound restriction in applications involving nonparametric kernel estimators, while in Section~\ref{sec:yurinskii_series} we demonstrate how the availability of a third-order coupling \eqref{eq:yurinskii_sa_martingale_order_3} can give improved approximation rates in applications involving nonparametric series estimators with conditionally symmetrically distributed residual errors. Finally, our technical work improves on \citet{li2020uniform} in two respects: % \begin{inlineroman} \item we employ a conditional version of Strassen's theorem (see Lemma~\ref{lem:yurinskii_app_strassen} in the appendix) to appropriately handle the conditioning arguments; and \item we deduce valid probability bounds for $\|S-T\|_p$, as the following Remark~\ref{rem:yurinskii_coupling_bounds_probability} makes clear. \end{inlineroman} \begin{remark}[Yurinskii's coupling and bounds in probability] \label{rem:yurinskii_coupling_bounds_probability} Given a sequence of random vectors $S_n$, Yurinskii's method provides a coupling in the following form: for each $n$ and any $\eta > 0$, there exists a random vector $T_n$ with $\P\big(\|S_n - T_n\| > \eta\big) < r_n(\eta)$, where $r_n(\eta)$ is the approximation error. Crucially, each coupling variable $T_n$ is a function of the desired approximation level $\eta$ and, as such, deducing bounds in probability on $\|S_n - T_n\|$ requires some extra care. One option is to select a sequence $R_n \to \infty$ and note that $\P\big(\|S_n - T_n\| > r_n^{-1}(1 / R_n)\big) < 1 / R_n \to 0$ and hence $\|S_n - T_n\| \lesssim_\P r_n^{-1}(1 / R_n)$. In this case, $T_n$ depends on the choice of $R_n$, which can in turn typically be chosen to diverge slowly enough to cause no issues in applications. \end{remark} Technicalities akin to those outlined in Remark~\ref{rem:yurinskii_coupling_bounds_probability} have been both addressed and neglected alike in the prior literature. \citet[Chapter 10.4, Example 16]{pollard2002user} apparently misses this subtlety, providing an inaccurate bound in probability based on the Yurinskii coupling. 
\citet{li2020uniform} seem to make the same mistake in the proof of their Lemma~A2, which invalidates the conclusion of their Theorem~1. In contrast, \citet{belloni2015some} and \citet{belloni2019conditional} directly provide bounds in $o_\P$ instead of $O_\P$, circumventing these issues in a manner similar to our approach involving a diverging sequence $R_n$. To see how this phenomenon applies to our main results, observe that the second-order martingale coupling given as \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} implies that for any $R_n \to \infty$, % \begin{align*} \|S - T\|_p \lesssim_\P \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n + \E[\|\Omega\|_2]^{1/2} \phi_p(d) R_n. \end{align*} % This bound is comparable to that obtained by \citet[Theorem~1]{li2020uniform} with $p=2$, albeit with their formulation missing the $R_n$ correction terms. In Section~\ref{sec:yurinskii_series} we discuss further their (amended) result, in the setting of nonparametric series estimation. Our approach using $p = \infty$ obtains superior distributional approximation rates, alongside exhibiting various other improvements such as the aforementioned third-order coupling. Turning to the comparison with \citet{belloni2018high}, our Corollary~\ref{cor:yurinskii_sa_martingale} again offers the same improvements, with the only exception being that the authors did account for the implications of a possibly vanishing minimum eigenvalue. However, their results exclusively concern high-dimensional central limit theorems for vector-valued martingales, and therefore while their findings could in principle enable the derivation of a result similar to our Corollary~\ref{cor:yurinskii_sa_martingale}, this would require additional technical work on their behalf in multiple ways (see Appendix~\ref{app:yurinskii}): % \begin{inlineroman} \item a correct application of a conditional version of Strassen's theorem (Lemma~\ref{lem:yurinskii_app_strassen}); \item the development of a third-order Borel set smoothing technique and associated $\ell^p$-norm moment control (Lemmas \ref{lem:yurinskii_app_smooth_approximation}, \ref{lem:yurinskii_app_gaussian_useful}, and \ref{lem:yurinskii_app_gaussian_pnorm}); \item a careful truncation scheme to account for $\Omega\npreceq0$; and \item a valid third-order Lindeberg argument (Lemma \ref{lem:yurinskii_app_sa_martingale}), among others. \end{inlineroman} \subsection{Independence} As a final refinement, suppose that $X_i$ are independent and zero-mean conditionally on $\cH_0$, and take $\cH_i$ to be the filtration generated by $X_1, \ldots, X_i$ and $\cH_0$ for $1 \leq i \leq n$. Then, taking $\Sigma = \sum_{i=1}^n V_i$ gives $\Omega = 0$, and hence Corollary~\ref{cor:yurinskii_sa_martingale} immediately yields the following result. % \begin{corollary}[Strong approximation for sums of independent vectors]% \label{cor:yurinskii_sa_indep} Take the setup of Theorem~\ref{thm:yurinskii_sa_dependent}, and let $X_i$ be independent given $\cH_0$, with $\E[X_i \mid \cH_0] = 0$. Then, for each $\eta > 0$ and $p \in [1,\infty]$, with $\Sigma = \sum_{i=1}^n V_i$, there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with % \begin{align} \label{eq:yurinskii_sa_indep_order_2} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}. \end{align} % If further $\pi_3 = 0$ then % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4}. 
\end{align*}
%
\end{corollary}

Taking $\cH_0$ to be trivial, \eqref{eq:yurinskii_sa_indep_order_2} provides an $\ell^p$-norm approximation analogous to that presented in \citet{belloni2019conditional}. By further restricting to $p=2$, we recover the original Yurinskii coupling as presented in \citet[Theorem~1]{lecam1988} and \citet[Theorem~10]{pollard2002user}. Thus, in the independent data setting, our result improves on prior work as follows:
\begin{inlineroman}
\item it establishes a coupling to a multivariate Gaussian mixture distribution; and
\item it permits a third-order coupling if $\pi_3=0$.
\end{inlineroman}

\subsection{Stylized example: factor modeling}
\label{sec:yurinskii_factor}

In this section, we present a simple statistical example of how our improvements over prior coupling results can have important theoretical and practical implications. Consider the stylized factor model
%
\begin{align*}
X_i = L f_i + \varepsilon_i, \qquad 1 \leq i \leq n,
\end{align*}
%
with random variables $L$ taking values in $\R^{d \times m}$, $f_i$ in $\R^m$, and $\varepsilon_i$ in $\R^d$. We interpret $f_i$ as a latent factor variable and $L$ as a random factor loading, with idiosyncratic disturbances $\varepsilon_i$. See \citet{fan2020statistical}, and references therein, for a textbook review of factor analysis in statistics and econometrics. We employ the above factor model to give a first illustration of the applicability of our main result Theorem~\ref{thm:yurinskii_sa_dependent}, the user-friendly Proposition~\ref{pro:yurinskii_sa_simplified}, and their specialized Corollaries~\ref{cor:yurinskii_sa_mixingale}--\ref{cor:yurinskii_sa_indep}. We consider three different sets of conditions to demonstrate the applicability of each of our corollaries for mixingales, martingales, and independent data, respectively. We assume throughout that the disturbances $(\varepsilon_1, \ldots, \varepsilon_n)$ are zero-mean with finite variance and independent of $L$ and $(f_1, \ldots, f_n)$. Let $\cH_i$ be the $\sigma$-algebra generated by $L$, $(f_1, \ldots, f_i)$, and $(\varepsilon_1, \ldots, \varepsilon_i)$, with $\cH_0$ the $\sigma$-algebra generated by $L$ alone.

\begin{itemize}

\item \emph{Independent data}. Suppose that the factors $(f_1, \ldots, f_n)$ are independent conditional on $L$ and satisfy $\E [ f_i \mid L ] = 0$. Then, since the $X_i$ are independent conditional on $\cH_0$ with $\E [ X_i \mid \cH_0 ] = \E [ L f_i + \varepsilon_i \mid L ] = 0$, we can apply Corollary~\ref{cor:yurinskii_sa_indep} to $\sum_{i=1}^n X_i$. In general, we will obtain a coupling variable which has the Gaussian mixture distribution $T \mid \cH_0 \sim \cN(0, \Sigma)$ where $\Sigma= \sum_{i=1}^n (L\Var[f_i \mid L]L^\T +\Var[\varepsilon_i])$. In the special case where $L$ is non-random and $\cH_0$ is trivial, the coupling is Gaussian. Further, if $f_i\mid L$ and $\varepsilon_i$ are symmetric about zero and bounded, then $\pi_3=0$, and the coupling is improved.

\item \emph{Martingales}. Suppose instead that we assume only a martingale condition on the latent factor variables, so that $\E \left[ f_i \mid L, f_1, \ldots, f_{i-1} \right] = 0$. Then $\E [ X_i \mid \cH_{i-1} ] = L\, \E \left[ f_i \mid \cH_{i-1} \right] = 0$ and Corollary~\ref{cor:yurinskii_sa_martingale} is applicable to $\sum_{i=1}^n X_i$. The preceding comments on Gaussian mixture distributions and third-order couplings continue to apply.

\item \emph{Mixingales}.
Finally, assume that the factors follow the auto-regressive model $f_i = A f_{i-1} + u_i$ where $A \in \R^{m \times m}$ is non-random and $(u_1, \ldots, u_n)$ are zero-mean, independent, and independent of $(\varepsilon_1, \ldots, \varepsilon_n)$. Then $\E \left[ f_i \mid f_0 \right] = A^i f_0$, so taking $p \in [1, \infty]$ we see that $\E \big[ \| \E [ f_i \mid f_0 ] \|_p \big] = \E \big[ \| A^i f_0 \|_p \big] \leq \|A\|_p^i\,\E [ \|f_0\|_p ]$, and that clearly $f_i - \E [ f_i \mid \cH_n ] = 0$. Thus, whenever $\|A\|_p < 1$, the geometric sum formula implies that we can apply the mixingale result from Corollary~\ref{cor:yurinskii_sa_mixingale} to $\sum_{i=1}^n X_i$. The conclusions on Gaussian mixture distributions and third-order couplings parallel the previous cases. % \end{itemize} This simple application to factor modeling gives a preliminary illustration of the power of our main results, encompassing settings which could not be handled by employing Yurinskii couplings available in the existing literature. Even with independent data, we offer new Yurinskii couplings to Gaussian mixture distributions (due to the presence of the common random factor loading $L$), which could be further improved whenever the factors and residuals possess symmetric (conditional) distributions. Furthermore, our results do not impose any restrictions on the minimum eigenvalue of $\Sigma$, thereby allowing for more general factor structures. These improvements are maintained in the martingale, mixingale, and weakly dependent stationary data settings. \section{Strong approximation for martingale empirical processes}% \label{sec:yurinskii_emp_proc} In this section, we demonstrate how our main results can be applied to some more substantive problems in statistics. Having until this point studied only finite-dimensional (albeit potentially high-dimensional) random vectors, we now turn our attention to infinite-dimensional stochastic processes. Specifically, we consider empirical processes of the form $S(f) = \sum_{i=1}^{n} f(X_i)$ for $f \in \cF$ a problem-specific class of real-valued functions, where each $f(X_i)$ forms a martingale difference sequence with respect to an appropriate filtration. We construct (conditionally) Gaussian processes $T(f)$ for which an upper bound on the uniform coupling error $\sup_{f \in \cF} |S(f) - T(f)|$ is precisely quantified. We control the complexity of $\cF$ using metric entropy under Orlicz norms. The novel strong approximation results which we present concern the entire martingale empirical process $(S(f):f \in \cF)$, as opposed to just the scalar supremum of the empirical process, $\sup_{f \in \cF} |S(f)|$. This distinction has been carefully noted by \citet{chernozhukov2014gaussian}, who studied Gaussian approximation of empirical process suprema in the independent data setting and wrote (p.\ $1565$): ``A related but different problem is that of approximating \textit{whole} empirical processes by a sequence of Gaussian processes in the sup-norm. This problem is more difficult than [approximating the supremum of the empirical process].'' Indeed, the results we establish in this section are for a strong approximation for the entire empirical process by a sequence of Gaussian mixture processes in the supremum norm, when the data has a martingale difference structure (cf.\ Corollary \ref{cor:yurinskii_sa_martingale}). 
Our results can be further generalized to approximate martingale empirical processes (cf.\ Corollary \ref{cor:yurinskii_sa_mixingale}), but we do not consider this extension to reduce notation and the technical burden. \subsection{Motivating example: kernel density estimation} \label{sec:yurinskii_kde} We begin with a brief study of a canonical example of an empirical process which is non-Donsker (thus precluding the use of uniform central limit theorems) due to the presence of a function class whose complexity increases with the sample size: the kernel density estimator with i.i.d.\ scalar data. We give an overview of our general strategy for strong approximation of stochastic processes via discretization, and show explicitly in Lemma~\ref{lem:yurinskii_kde_eigenvalue} how it is crucial that we do not impose lower bounds on the eigenvalues of the discretized covariance matrix. Detailed calculations for this section are relegated to Appendix~\ref{app:yurinskii} for conciseness. Let $X_1, \ldots, X_n$ be i.i.d.\ $\Unif[0,1]$, take $K(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$ the Gaussian kernel and let $h \in (0,1]$ be a bandwidth. Then, for $a \in (0,1/4]$ and $x \in \cX = [a, 1-a]$ to avoid boundary issues, the kernel density estimator of the true density function $g(x) = 1$ is % \begin{align*} \hat g(x) &= \frac{1}{n} \sum_{i=1}^{n} K_h( X_i - x), \qquad K_h(u) = \frac{1}{h} K\left( \frac{u}{h} \right). \end{align*} % Consider establishing a strong approximation for the stochastic process $(\hat g(x)-\E [ \hat g(x) ] : x\in\cX)$ which is, upon rescaling, non-Donsker whenever the bandwidth decreases to zero in large samples. To match notation with the upcoming general result for empirical processes, set $f_x(u) = \frac{1}{n} (K_h( u - x) - \E[K_h( X_i - x)])$ so $S(x) \vcentcolon= S(f_x) = \hat g(x)-\E [ \hat g(x) ]$. The next step is standard: a mesh separates the local oscillations of the processes from the finite-dimensional coupling. For $\delta \in (0,1/2)$, set $N = \left\lfloor 1 + \frac{1 - 2a}{\delta} \right\rfloor$ and $\cX_\delta = (a + (j-1)\delta : 1 \leq j \leq N)$. Letting $T(x)$ be the approximating stochastic process to be constructed, consider the decomposition % \begin{align*} \sup_{x \in \cX} \big|S(x) - T(x)\big| &\leq \sup_{|x-x'| \leq \delta} \big|S(x) - S(x') \big| + \max_{x \in \cX_\delta} |S(x) - T(x)| + \sup_{|x-x'| \leq \delta} \big|T(x) - T(x')\big|. \end{align*} % Writing $S(\cX_\delta)$ for $\big(S(x) : x \in \cX_\delta\big)\in \mathbb{R}^N$, noting that this is a sum of i.i.d.\ random vectors, we apply Corollary~\ref{cor:yurinskii_sa_indep} as $\max_{x \in \cX_\delta} |S(x) - T(x)| = \| S(\cX_\delta) - T(\cX_\delta) \|_\infty$. We obtain that for each $\eta > 0$ there is a Gaussian vector $T(\cX_\delta)$ with the same covariance matrix as $S(\cX_\delta)$ satisfying % \begin{align*} \P\left( \|S(\cX_\delta) - T(\cX_\delta)\|_\infty > \eta \right) &\leq 31 \left( \frac{N \log 2 N}{\eta^3 n^2 h^2} \right)^{1/3} \end{align*} % assuming that $1/h \geq \log 2 N$. By the Vorob'ev--Berkes--Philipp theorem \citep[Theorem~1.1.10]{dudley1999uniform}, $T(\cX_\delta)$ extends to a Gaussian process $T(x)$ defined for all $x \in \cX$ and with the same covariance structure as $S(x)$. 
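To make the discretization concrete, the following minimal simulation sketch (in Python; purely illustrative and not part of the formal development) mirrors the resampling procedure described in the caption of Figure~\ref{fig:yurinskii_min_eig} below: it re-draws the data to estimate the covariance of $S(\cX_\delta)$ and reports the minimum eigenvalue of $\Sigma = n h \Var[S(\cX_\delta)]$, whose exponentially fast decay is quantified in Lemma~\ref{lem:yurinskii_kde_eigenvalue}. The values of $h$ and $\delta$ here are assumptions chosen for illustration only.
\begin{verbatim}
# Illustrative sketch: minimum eigenvalue of the discretized KDE covariance.
import numpy as np

rng = np.random.default_rng(0)
n, a, h, delta, reps = 100, 0.2, 0.03, 0.01, 100
grid = np.arange(a, 1 - a + 1e-12, delta)       # the mesh X_delta

def g_hat(X):
    # kernel density estimator on the mesh; S(x) = g_hat(x) - E[g_hat(x)]
    # is centered implicitly when taking the covariance across replications
    K = np.exp(-0.5 * ((X[:, None] - grid[None, :]) / h) ** 2)
    return K.mean(axis=0) / (h * np.sqrt(2 * np.pi))

sims = np.array([g_hat(rng.uniform(0, 1, n)) for _ in range(reps)])
Sigma = n * h * np.cov(sims, rowvar=False)      # Sigma = n h Var[S(X_delta)]
print(np.linalg.eigvalsh(Sigma).min())          # exponentially small
\end{verbatim}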
Next, chaining with the Bernstein--Orlicz and sub-Gaussian norms \citep[Section~2.2]{van1996weak} shows that if $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$, % \begin{align*} \sup_{|x-x'| \leq \delta} \big|S(x) - S(x') \big| &\lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}} \quad\text{and}\quad \sup_{|x-x'| \leq \delta} \big|T(x) - T(x')\big| \lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}}. \end{align*} % Finally, for any $R_n\to\infty$ (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), the resulting bound on the coupling error is % \begin{align*} \sup_{x \in \cX} \big| S(x) - T(x) \big| &\lesssim_\P \left( \frac{N \log 2N}{n^2 h^2} \right)^{1/3} R_n + \delta \sqrt{\frac{\log n}{n h^3}}, \end{align*} % where the mesh size $\delta$ can then be approximately optimized to obtain the tightest possible strong approximation. The discretization strategy outlined above is at the core of the proof of our upcoming Proposition~\ref{pro:yurinskii_emp_proc}. Since we will consider martingale empirical processes, our proof will rely on Corollary~\ref{cor:yurinskii_sa_martingale}, which, unlike the martingale Yurinskii coupling established by \citet{li2020uniform}, does not require a lower bound on the minimum eigenvalue of $\Sigma$. Using the simple kernel density example just discussed, we now demonstrate precisely the crucial importance of removing such eigenvalue conditions. The following Lemma~\ref{lem:yurinskii_kde_eigenvalue} shows that the discretized covariance matrix $\Sigma = n h\Var[S(\cX_\delta)]$ has exponentially small eigenvalues, which would in turn negatively affect the strong approximation bound if the \citet{li2020uniform} coupling were used instead of the results in this dissertation. \begin{lemma}[Minimum eigenvalue of a kernel density estimator covariance matrix]% \label{lem:yurinskii_kde_eigenvalue} % The minimum eigenvalue of $\Sigma=n h\Var[S(\cX_\delta)] \in \R^{N \times N}$ satisfies the upper bound % \begin{align*} \lambda_{\min}(\Sigma) &\leq 2 e^{-h^2/\delta^2} + \frac{h}{\pi a \delta} e^{-a^2 / h^2}. \end{align*} \end{lemma} % Figure~\ref{fig:yurinskii_min_eig} shows how the upper bound in Lemma \ref{lem:yurinskii_kde_eigenvalue} captures the behavior of the simulated minimum eigenvalue of $\Sigma$. In particular, the smallest eigenvalue decays exponentially fast in the discretization level $\delta$ and the bandwidth $h$. As seen in the calculations above, the coupling rate depends on $\delta / h$, while the bias will generally depend on $h$, implying that both $\delta$ and $h$ must converge to zero to ensure valid statistical inference. In general, this will lead to $\Sigma$ possessing extremely small eigenvalues, rendering strong approximation approaches such as that of \citet{li2020uniform} ineffective in such scenarios. % \begin{figure}[t] \centering \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/sim_2.pdf} \caption{$h = 0.03$} \end{subfigure} \begin{subfigure}{0.49\textwidth} \centering %\includegraphics[scale=0.64]{graphics/sim_1.pdf} \caption{$h = 0.01$} \end{subfigure} \caption[Minimum eigenvalue of the kernel density covariance matrix]{ Upper bounds on the minimum eigenvalue of the discretized covariance matrix in kernel density estimation, with $n=100$ and $a = 0.2$. Simulated: the kernel density estimator is simulated, resampling the data $100$ times to estimate its covariance.
Computed: the minimum eigenvalue of the limiting covariance matrix $\Sigma$ is computed explicitly. Upper bound: the bound derived in Lemma~\ref{lem:yurinskii_kde_eigenvalue} is shown. } \label{fig:yurinskii_min_eig} \end{figure} The discussion in this section focuses on the strong approximation of the centered process $\hat g(x)-\E [ \hat g(x) ]$. In practice, however, the goal is often to approximate the feasible process $\hat g(x)- g(x)$. The difference between these is captured by the smoothing bias $\E [ \hat g(x) ] - g(x)$, which is straightforward to control in this case with $\sup_{x \in \cX} \big| \E [ \hat g(x) ] - g(x) \big| \lesssim \frac{h}{a} e^{-a^2 / (2 h^2)}$. See Section~\ref{sec:yurinskii_nonparametric} for further comments. \subsection{General result for martingale empirical processes} We now give our general result on a strong approximation for martingale empirical processes, obtained by applying the first result \eqref{eq:yurinskii_sa_martingale_order_2} in Corollary~\ref{cor:yurinskii_sa_martingale} with $p=\infty$ to a discretization of the empirical process, as in Section~\ref{sec:yurinskii_kde}. We then control the increments in the stochastic processes using chaining with Orlicz norms, but note that other tools are available, including generalized entropy with bracketing \citep{geer2000empirical} and sequential symmetrization \citep{rakhlin2015sequential}. A class of functions is said to be \emph{pointwise measurable} if it contains a countable subclass which is dense under the pointwise convergence topology. For a finite class $\cF$, write $\cF(x) = \big(f(x) : f \in \cF\big)$. Define the set of Orlicz functions % \begin{align*} \Psi &= \left\{ \psi: [0, \infty) \to [0, \infty) \text{ convex increasing, } \psi(0) = 0,\ \limsup_{x,y \to \infty} \tfrac{\psi(x) \psi(y)}{\psi(C x y)} < \infty \text{ for some } C > 0 \right\} \end{align*} % and, for real-valued $Y$, the Orlicz norm $\vvvert Y \vvvert_\psi = \inf \left\{ C > 0: \E \left[ \psi(|Y|/C) \right] \leq 1 \right\}$ as in \citet[Section~2.2]{van1996weak}. \begin{proposition}[Strong approximation for martingale empirical processes]% \label{pro:yurinskii_emp_proc} Let $X_i$ be random variables for $1 \leq i \leq n$ taking values in a measurable space $\cX$, and $\cF$ be a pointwise measurable class of functions from $\cX$ to $\R$. Let $\cH_0, \ldots, \cH_n$ be a filtration such that each $X_i$ is $\cH_i$-measurable, with $\cH_0$ the trivial $\sigma$-algebra, and suppose that $\E[f(X_i) \mid \cH_{i-1}] = 0$ for all $f \in \cF$. Define $S(f) = \sum_{i=1}^n f(X_i)$ for $f\in\cF$ and let $\Sigma: \cF \times \cF \to \R$ be an almost surely positive semi-definite $\cH_0$-measurable random function.
Suppose that for a non-random metric $d$ on $\cF$, constant $L$, and $\psi \in \Psi$, % \begin{align}% \label{eq:yurinskii_emp_proc_var} \Sigma(f,f) - 2\Sigma(f,f') + \Sigma(f',f') + \bigvvvert S(f) - S(f') \bigvvvert_\psi^2 &\leq L^2 d(f,f')^2 \quad \text{a.s.} \end{align} % Then for each $\eta > 0$ there is a process $T(f)$ which, conditional on $\cH_0$, is zero-mean and Gaussian, satisfying $\E\big[ T(f) T(f') \mid \cH_0 \big] = \Sigma(f,f')$ for all $f, f' \in \cF$, and for all $t > 0$ has % \begin{align*} &\P\left( \sup_{f \in \cF} \big| S(f) - T(f) \big| \geq C_\psi(t + \eta) \right) \leq C_\psi \inf_{\delta > 0} \inf_{\cF_\delta} \Bigg\{ \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } \\ &\qquad\quad+ \left(\frac{\sqrt{\log 2 |\cF_\delta|} \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} + \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) \Bigg\} \end{align*} % where $\cF_\delta$ is any finite $\delta$-cover of $(\cF,d)$ and $C_\psi$ is a constant depending only on $\psi$, with % \begin{align*} \beta_\delta &= \sum_{i=1}^n \E\left[ \|\cF_\delta(X_i)\|^2_2\|\cF_\delta(X_i)\|_\infty + \|V_i(\cF_\delta)^{1/2}Z_i\|^2_2 \|V_i(\cF_\delta)^{1/2}Z_i\|_\infty \right], \\ V_i(\cF_\delta) &= \E\big[\cF_\delta(X_i) \cF_\delta(X_i)^\T \mid \cH_{i-1} \big], \hspace*{27.7mm} \Omega_\delta = \sum_{i=1}^n V_i(\cF_\delta) - \Sigma(\cF_\delta), \\ J_\psi(\delta) &= \int_0^\delta \psi^{-1}\big( N_\varepsilon \big) \diff{\varepsilon} + \delta \psi^{-1} \big( N_\delta^2 \big), \hspace*{19mm} J_2(\delta) = \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon}, \end{align*} % where $N_\delta = N(\delta, \cF, d)$ is the $\delta$-covering number of $(\cF, d)$ and $Z_i$ are i.i.d.\ $\cN\big(0, I_{|\cF_\delta|}\big)$ independent of $\cH_n$. If $\cF_\delta$ is a minimal $\delta$-cover of $(\cF, d)$, then $|\cF_\delta| = N_\delta$. \end{proposition} Proposition~\ref{pro:yurinskii_emp_proc} is given in a rather general form to accommodate a range of different settings and applications. In particular, consider the following well-known Orlicz functions. % \begin{description} \item[Polynomial:] $\psi(x) = x^a$ for $a \geq 2$ has $\vvvert X \vvvert_2 \leq \vvvert X \vvvert_\psi$ and $\sqrt{\log x} \leq \sqrt{a} \psi^{-1}(x)$. \item[Exponential:] $\psi(x) = \exp(x^a) - 1$ for $a \in [1,2]$ has $\vvvert X \vvvert_2 \leq 2\vvvert X \vvvert_\psi$ and $\sqrt{\log x} \leq \psi^{-1}(x)$. \item[Bernstein:] $\psi(x) = \exp \Big( \Big(\frac{\sqrt{1+2ax}-1}{a}\Big)^{2} \Big)-1$ for $a > 0$ has $\vvvert X \vvvert_2 \leq (1+a)\vvvert X \vvvert_\psi$ \\ and $\sqrt{\log x}~\leq~\psi^{-1}(x)$. \end{description} % For these Orlicz functions and when $\Sigma(f, f') = \Cov[S(f), S(f')]$ is non-random, the terms involving $\Sigma$ in \eqref{eq:yurinskii_emp_proc_var} can be controlled by the Orlicz $\psi$-norm term; similarly, $J_2$ is bounded by $J_\psi$. Further, $C_\psi$ can be replaced by a universal constant $C$ which does not depend on the parameter $a$. See Section~2.2 in \citet{van1996weak} for details. If the conditional third moments of $f(X_i)$ given $\cH_{i-1}$ are all zero (if $f$ and $X_i$ are appropriately symmetric, for example), then the second inequality in Corollary~\ref{cor:yurinskii_sa_martingale} can be applied to obtain a tighter coupling inequality; the details of this are omitted for brevity, and the proof would proceed in exactly the same manner. 
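To make the role of the entropy integrals concrete, consider for illustration a VC-type class with covering numbers $N_\varepsilon \leq c_0 \varepsilon^{-\nu_0}$ (an assumption used purely for this example) together with the sub-Gaussian Orlicz function $\psi(x) = e^{x^2} - 1$, so that $\psi^{-1}(x) = \sqrt{\log(1 + x)}$. Then, for small $\delta$, % \begin{align*} J_2(\delta) = \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon} \lesssim \int_0^\delta \sqrt{\nu_0 \log(c_0/\varepsilon)} \diff{\varepsilon} \lesssim \delta \sqrt{\log(1/\delta)}, \end{align*} % and similarly $J_\psi(\delta) \leq \int_0^\delta \sqrt{\log(1 + N_\varepsilon)} \diff{\varepsilon} + \delta \sqrt{\log(1 + N_\delta^2)} \lesssim \delta \sqrt{\log(1/\delta)}$, so both fluctuation terms scale like $\delta \sqrt{\log(1/\delta)}$; this calculation reappears in the comparison with \citet{berthet2006revisiting} below.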
In general, however, Proposition~\ref{pro:yurinskii_emp_proc} allows for a random covariance function, yielding a coupling to a stochastic process that is Gaussian only conditionally. Such a process can equivalently be viewed as a mixture of Gaussian processes, writing $T=\Sigma^{1/2} Z$ with an operator square root, where $Z$ is a Gaussian white noise on $\cF$ independent of $\cH_0$. This extension is in contrast with much of the existing strong approximation and empirical process literature, which tends to focus on couplings and weak convergence results with marginally Gaussian processes \citep{settati2009gaussian,chernozhukov2016empirical}. A similar approach was taken by \citet{berthet2006revisiting}, who used a Gaussian coupling due to \citet{zaitsev1987estimates,zaitsev1987gaussian} along with a discretization method to obtain strong approximations for empirical processes with independent data. They handled fluctuations in the stochastic processes with uniform $L^2$ covering numbers and bracketing numbers, whereas we opt instead for chaining with Orlicz norms. Our version using the martingale Yurinskii coupling can improve upon theirs in approximation rate even for independent data in certain circumstances. Consider the setup of Proposition~1 in \citet{berthet2006revisiting}; that is, $X_1, \ldots, X_n$ are i.i.d.\ and $\sup_{\cF} \|f\|_\infty \leq M$, with the VC-type assumption $\sup_\Q N(\varepsilon, \cF, d_\Q) \leq c_0 \varepsilon^{-\nu_0}$ where $d_\Q(f,f')^2 = \E_\Q\big[(f-f')^2\big]$ for a measure $\Q$ on $\cX$ and $M, c_0, \nu_0$ are constants. Using uniform $L^2$ covering numbers rather than Orlicz chaining in our Proposition~\ref{pro:yurinskii_emp_proc} gives the following. Firstly, as $X_i$ are i.i.d., take $\Sigma(f, f') = \Cov[S(f), S(f')]$ so $\Omega_\delta = 0$. Let $\cF_\delta$ be a minimal $\delta$-cover of $(\cF, d_\P)$ with cardinality $N_\delta \lesssim \delta^{-\nu_0}$ where $\delta \to 0$. It is easy to show that $\beta_\delta \lesssim n \delta^{-\nu_0} \sqrt{\log(1/\delta)}$. Theorem~2.2.8 and Theorem~2.14.1 in \citet{van1996weak} then give % \begin{align*} \E\left[ \sup_{d_\P(f,f') \leq \delta} \Big( |S(f) - S(f')| + |T(f) - T(f')| \Big) \right] &\lesssim \sup_\Q \int_0^\delta \sqrt{n \log N(\varepsilon, \cF, d_\Q)} \diff{\varepsilon} \\ &\lesssim \delta \sqrt{n\log(1/\delta)}, \end{align*} % where we used the VC-type property to bound the entropy integral. So by our Proposition~\ref{pro:yurinskii_emp_proc}, for any sequence $R_n \to \infty$ (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}), % \begin{align*} \sup_{f \in \cF} \big| S(f) - T(f) \big| &\lesssim_\P n^{1/3} \delta^{-\nu_0/3} \sqrt{\log(1/\delta)} R_n + \delta \sqrt{n\log(1/\delta)} \lesssim_\P n^{\frac{2+\nu_0}{6+2\nu_0}} \sqrt{\log n} R_n, \end{align*} % where we minimized over $\delta$ in the last step. \citet[Proposition~1]{berthet2006revisiting} achieved % \begin{align*} \sup_{f \in \cF} \big| S(f) - T(f) \big| &\lesssim_\P n^{\frac{5\nu_0}{4+10\nu_0}} (\log n)^{\frac{4+5\nu_0}{4+10\nu_0}}, \end{align*} % showing that our approach achieves a better approximation rate whenever $\nu_0 > 4/3$. In particular, our method is superior in richer function classes with larger VC-type dimension. For example, if $\cF$ is smoothly parameterized by $\theta \in \Theta \subseteq \R^d$ where $\Theta$ contains an open set, then $\nu_0 > 4/3$ corresponds to $d \geq 2$ and our rate is better as soon as the parameter space is more than one-dimensional.
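The crossover at $\nu_0 = 4/3$ can be checked directly: equating the two exponents gives $(2+\nu_0)(4+10\nu_0) = 5\nu_0(6+2\nu_0)$, that is, $8 + 24\nu_0 = 30\nu_0$, so $\nu_0 = 4/3$. A short numerical check (illustrative Python, not part of the formal development):
\begin{verbatim}
# Compare the two rate exponents as functions of the VC-type parameter nu0.
ours = lambda v: (2 + v) / (6 + 2 * v)      # our rate exponent
theirs = lambda v: 5 * v / (4 + 10 * v)     # Berthet and Mason (2006) exponent
for v in [1.0, 4 / 3, 2.0, 5.0]:
    print(v, ours(v), theirs(v), ours(v) < theirs(v))  # True iff ours is strictly better
\end{verbatim}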
The difference in approximation rate is due to Zaitsev's coupling having better dependence on the sample size but worse dependence on the dimension. In particular, Zaitsev's coupling is stated only in $\ell^2$-norm and hence \citet[Equation~5.3]{berthet2006revisiting} are compelled to use the inequality $\|\cdot\|_\infty \leq \|\cdot\|_2$ in the coupling step, a bound which is loose when the dimension of the vectors (here on the order of $\delta^{-\nu_0}$) is even moderately large. By contrast, we use the fact that our version of Yurinskii's coupling applies directly to the supremum norm, giving sharper dependence on the dimension. In Section~\ref{sec:yurinskii_local_poly} we apply Proposition~\ref{pro:yurinskii_emp_proc} to obtain strong approximations for local polynomial estimators in the nonparametric regression setting. In contrast with the series estimators of the upcoming Section~\ref{sec:yurinskii_series}, local polynomial estimators are not linearly separable and hence cannot be analyzed directly using the finite-dimensional Corollary~\ref{cor:yurinskii_sa_martingale}. \section{Applications to nonparametric regression} \label{sec:yurinskii_nonparametric} We illustrate the applicability of our previous strong approximation results with two substantial and classical examples in nonparametric regression estimation. Firstly, we present an analysis of partitioning-based series estimators, where we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly due to an intrinsic linear separability property. Secondly, we consider local polynomial estimators, this time using Proposition~\ref{pro:yurinskii_emp_proc} due to a non-linearly separable martingale empirical process. \subsection{Partitioning-based series estimators} \label{sec:yurinskii_series} Partitioning-based least squares methods are essential tools for estimation and inference in nonparametric regression, encompassing splines, piecewise polynomials, compactly supported wavelets, and decision trees as special cases. See \citet{cattaneo2020large} for further details and references throughout this section. We illustrate the usefulness of Corollary~\ref{cor:yurinskii_sa_martingale} by deriving a Gaussian strong approximation for partitioning series estimators based on multivariate martingale data. Proposition~\ref{pro:yurinskii_series} shows that we match the best known rate of strong approximation for independent data, imposing only an additional mild $\alpha$-mixing condition to control the time series dependence of the regressors. Consider the nonparametric regression setup with martingale difference residuals defined by $Y_i = \mu(W_i) + \varepsilon_i$ for $ 1 \leq i \leq n$ where the regressors $W_i$ have compact connected support $\cW \subseteq \R^m$, $\cH_i$ is the $\sigma$-algebra generated by $(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, $\E[\varepsilon_i \mid \cH_{i-1}] = 0$ and $\mu: \cW \to \R$ is the estimand. Let $p(w)$ be a $k$-dimensional vector of bounded basis functions on $\cW$ which are locally supported on a quasi-uniform partition \citep[Assumption~2]{cattaneo2020large}. Under minimal regularity conditions, the least-squares partitioning-based series estimator is $\hat\mu(w) = p(w)^{\T} \hat H^{-1} \sum_{i=1}^n p(W_i) Y_i$ with $\hat H = \sum_{i=1}^n p(W_i) p(W_i)^\T$. The approximation power of the estimator $\hat\mu(w)$ derives from letting $k\to\infty$ as $n\to\infty$.
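To fix ideas, the following minimal sketch (in Python; illustrative only, using scalar i.i.d.\ data and a Haar basis of $k$ equal-width cells, all choices being assumptions of the example rather than part of our formal setup) implements $\hat\mu(w) = p(w)^\T \hat H^{-1} \sum_{i=1}^n p(W_i) Y_i$.
\begin{verbatim}
# Illustrative sketch: partitioning-based series estimator, Haar basis.
import numpy as np

def haar_basis(w, k):
    # p(w): one-hot indicator of which of k equal-width cells contains w
    idx = np.minimum((np.asarray(w) * k).astype(int), k - 1)
    return np.eye(k)[idx]                        # shape (len(w), k)

def fit_series(W, Y, k):
    P = haar_basis(W, k)                         # n x k design matrix
    H_hat = P.T @ P                              # sum_i p(W_i) p(W_i)'
    coef = np.linalg.solve(H_hat, P.T @ Y)       # least-squares coefficients
    return lambda w: haar_basis(w, k) @ coef     # w -> mu_hat(w)

rng = np.random.default_rng(1)
n, k = 1000, 20
W = rng.uniform(size=n)
Y = np.sin(2 * np.pi * W) + rng.normal(scale=0.3, size=n)
mu_hat = fit_series(W, Y, k)
print(mu_hat(np.array([0.25, 0.50, 0.75])))      # roughly (1, 0, -1)
\end{verbatim}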
The assumptions made on $p(w)$ are mild enough to accommodate splines, wavelets, piecewise polynomials, and certain types of decision trees. For such a tree, $p(w)$ comprises indicator functions over $k$ axis-aligned rectangles forming a partition of $\cW$ (a Haar basis), provided that the partitions are constructed using independent data (e.g., with sample splitting). Our goal is to approximate the law of the stochastic process $(\hat\mu(w)-\mu(w):w\in\cW)$, which upon rescaling is typically not asymptotically tight as $k \to \infty$ and thus does not converge weakly. Nevertheless, exploiting the intrinsic linearity of the estimator $\hat\mu(w)$, we can apply Corollary~\ref{cor:yurinskii_sa_martingale} directly to construct a Gaussian strong approximation. Specifically, we write % \begin{equation*} \hat\mu(w) - \mu(w) = p(w)^\T H^{-1} S + p(w)^\T \big(\hat H^{-1} - H^{-1}\big) S + \Bias(w), \end{equation*} % where $H= \sum_{i=1}^n \E\left[p(W_i) p(W_i)^\T\right]$ is the expected outer product matrix, $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ is the score vector, and $\Bias(w) = p(w)^{\T} \hat H^{-1}\sum_{i=1}^n p(W_i) \mu(W_i) - \mu(w)$. Under mild time series restrictions and stationarity, it is not difficult to show (see Section~\ref{sec:yurinskii_app_proofs}) that $\|\hat H - H\|_1 \lesssim_\P \sqrt{n k}$ and $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$ for some $\gamma>0$, depending on the specific structure of the basis functions, the dimension $m$ of the regressors, and the smoothness of the regression function $\mu$. It remains to study the $k$-dimensional mean-zero martingale $S$ by applying Corollary~\ref{cor:yurinskii_sa_martingale} with $X_i=p(W_i) \varepsilon_i$. Controlling the convergence of the quadratic variation term $\E[\|\Omega\|_2]$ requires some time series dependence assumptions; we impose an $\alpha$-mixing condition on $(W_1, \ldots, W_n)$ for illustration \citep{bradley2005basic}. \begin{proposition}[Strong approximation for partitioning series estimators]% \label{pro:yurinskii_series} % Consider the nonparametric regression setup described above and further assume the following: % \begin{enumerate}[label=(\roman*)] \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is strictly stationary. \item $W_1, \ldots, W_n$ is $\alpha$-mixing with mixing coefficients satisfying $\sum_{j=1}^\infty \alpha(j) < \infty$. \item $W_i$ has a Lebesgue density on $\cW$ which is bounded above and away from zero. \item $\E\big[|\varepsilon_i|^3 \big] < \infty$ and $\E\big[\varepsilon_i^2 \mid \cH_{i-1}\big]=\sigma^2(W_i)$ is bounded away from zero. \item $p(w)$ is a basis with $k$ features satisfying Assumptions~2 and~3 in \citet{cattaneo2020large}. \end{enumerate} % Then, for any sequence $R_n \to \infty$, there is a zero-mean Gaussian process $G(w)$ indexed on $\cW$ with $\Var[G(w)] \asymp\frac{k}{n}$ satisfying $\Cov[G(w), G(w')] = \Cov[p(w)^\T H^{-1} S,\, p(w')^\T H^{-1} S]$ and % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} R_n + \sup_{w \in \cW} |\Bias(w)| \end{align*} % assuming the number of basis functions satisfies $k^3 / n \to 0$. If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^3 (\log k)^2}{n} \right)^{1/4} R_n + \sup_{w \in \cW} |\Bias(w)|.
\end{align*} % \end{proposition} The core concept in the proof of Proposition~\ref{pro:yurinskii_series} is to apply Corollary~\ref{cor:yurinskii_sa_martingale} with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ and $p=\infty$ to construct $T \sim \cN\big(0, \Var[S]\big)$ such that $\|S - T \|_\infty$ is small, and then setting $G(w) = p(w)^\T H^{-1} T$. So long as the bias can be appropriately controlled, this result allows for uniform inference procedures such as uniform confidence bands or shape specification testing. The condition $k^3 / n \to 0$ is the same (up to logs) as that imposed by \citet{cattaneo2020large} for i.i.d. data, which gives the best known strong approximation rate for this problem. Thus, Proposition~\ref{pro:yurinskii_series} gives the same best approximation rate without requiring any extra restrictions for $\alpha$-mixing time series data. Our results improve substantially on \citet[Theorem~1]{li2020uniform}: using the notation of our Corollary~\ref{cor:yurinskii_sa_martingale}, and with any sequence $R_n \to \infty$, a valid (see Remark~\ref{rem:yurinskii_coupling_bounds_probability}) version of their martingale Yurinskii coupling is % \begin{align*} \|S-T\|_2 \lesssim_\P d^{1/2} r^{1/2}_n + (B_n d)^{1/3} R_n, \end{align*} % where $B_n = \sum_{i=1}^n \E[\|X_i\|_2^3]$ and $r_n$ is a term controlling the convergence of the quadratic variation, playing a similar role to our term $\E[\|\Omega\|_2]$. Under the assumptions of our Proposition~\ref{pro:yurinskii_series}, applying this result with $S = \sum_{i=1}^n p(W_i) \varepsilon_i$ yields a rate no better than $\|S-T\|_2 \lesssim_\P (n k)^{1/3} R_n$. As such, they attain a rate of strong approximation no faster than % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - G(w) \right| &\lesssim_\P \sqrt{\frac{k}{n}} \left( \frac{k^5}{n} \right)^{1/6} R_n + \sup_{w \in \cW} |\Bias(w)|. \end{align*} % Hence, for this approach to yield a valid strong approximation, the number of basis functions must satisfy $k^5/n \to 0$, a more restrictive assumption than our $k^3 / n \to 0$ (up to logs). This difference is due to \citet{li2020uniform} using the $\ell^2$-norm version of Yurinskii's coupling rather than the recently established $\ell^\infty$ version. Further, our approach allows for an improved rate of distributional approximation whenever the residuals have zero conditional third moment. To illustrate the statistical applicability of Proposition~\ref{pro:yurinskii_series}, consider constructing a feasible uniform confidence band for the regression function $\mu$, using standardization and Studentization for statistical power improvements. We assume throughout that the bias is negligible. Proposition~\ref{pro:yurinskii_series} and anti-concentration for Gaussian suprema \citep[Corollary~2.1]{chernozhukov2014anti} yield a distributional approximation for the supremum statistic whenever $k^3(\log n)^6 / n \to 0$, giving % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| &\to 0, \end{align*} % where $\rho(w,w') = \E[G(w)G(w')]$. 
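Operationally, the quantile of such a Gaussian supremum can be approximated by simulation on a discretization of $\cW$, anticipating the feasible Studentized construction developed next. The following minimal sketch (in Python) illustrates this for a generic basis; the function and argument names are hypothetical, and the covariance estimate $V$ is assumed positive definite so that a Cholesky factorization exists.
\begin{verbatim}
# Illustrative sketch: simulate the quantile of sup_w |G(w)/sqrt(rho(w,w))|
# with G(w) = p(w)' Hinv V^{1/2} Z, Z ~ N(0, I_k), on a grid of w values.
import numpy as np

def sup_quantile(p_grid, Hinv, V, tau, reps=10000, seed=0):
    # p_grid: (n_grid, k) basis on the grid; Hinv: (k, k); V: (k, k)
    rng = np.random.default_rng(seed)
    A = p_grid @ Hinv @ np.linalg.cholesky(V)    # row w: p(w)' Hinv V^{1/2}
    sd = np.sqrt(np.sum(A * A, axis=1))          # sqrt(rho(w, w)) on the grid
    Z = rng.standard_normal((V.shape[0], reps))  # k x reps Gaussian draws
    sups = np.max(np.abs(A @ Z) / sd[:, None], axis=0)
    return np.quantile(sups, 1 - tau)            # (1 - tau)-quantile of the sup
\end{verbatim}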
Further, by a Gaussian--Gaussian comparison result \citep[Lemma~3.1]{chernozhukov2013gaussian} and anti-concentration, we show (see the proof of Proposition~\ref{pro:yurinskii_series}) that with $\bW = (W_1, \ldots, W_n)$ and $\bY = (Y_1, \ldots, Y_n)$, % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \biggm| \bW, \bY \right) \right| &\to_\P 0, \end{align*} % where $\hat G(w)$ is a zero-mean Gaussian process conditional on $\bW$ and $\bY$ with conditional covariance function $\hat\rho(w,w') =\E\big[\hat G(w) \hat G(w') \mid \bW, \bY \big] = p(w)^\T \hat H^{-1} \hat V \hat H^{-1}p(w')$ for some estimator $\hat V$ satisfying $\frac{k (\log n)^2}{n} \big\|\hat V-\Var[S]\big\|_2 \to_\P 0$. For example, one could use the plug-in estimator $\hat V=\sum_{i=1}^n p(W_i) p(W_i)^\T \hat{\sigma}^2(W_i)$ where $\hat{\sigma}^2(w)$ satisfies $(\log n)^2 \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$. This leads to the following feasible and asymptotically valid $100(1-\tau)\%$ uniform confidence band for partitioning-based series estimators based on martingale data. \begin{proposition}[Feasible uniform confidence bands for partitioning series estimators]% \label{pro:yurinskii_series_feasible} % Assume the setup of the preceding section. Then % \begin{align*} \P\Big( \mu(w) \in \Big[ \hat\mu(w) \pm \hat q(\tau) \sqrt{\hat\rho(w,w)} \Big] \ \text{for all } w \in \cW \Big) \to 1-\tau, \end{align*} % where % \begin{align*} \hat{q}(\tau) &= \inf \left\{ t \in \R: \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \Bigm| \bW, \bY \right) \geq 1 - \tau \right\} \end{align*} % is the conditional $(1-\tau)$-quantile of the supremum of the Studentized Gaussian process. This can be estimated by resampling the conditional law of $\hat G(w) \mid \bW, \bY$ with a discretization of $w \in \cW$. \end{proposition} \subsection{Local polynomial estimators} \label{sec:yurinskii_local_poly} As a second example application, we consider nonparametric regression estimation with martingale data employing local polynomial methods \citep{fan1996local}. In contrast with the partitioning-based series methods of Section~\ref{sec:yurinskii_series}, local polynomials induce stochastic processes which are not linearly separable, allowing us to showcase the empirical process result given in Proposition \ref{pro:yurinskii_emp_proc}. As before, suppose that $Y_i = \mu(W_i) + \varepsilon_i$ for $ 1 \leq i \leq n$ where $W_i$ has compact connected support $\cW \subseteq \R^m$, $\cH_i$ is the $\sigma$-algebra generated by $(W_1, \ldots, W_{i+1}, \varepsilon_1, \ldots, \varepsilon_i)$, $\E[\varepsilon_i \mid \cH_{i-1}] = 0$, and $\mu: \cW \to \R$ is the estimand. Let $K$ be a kernel function on $\R^m$ and $K_h(w) = h^{-m} K(w/h)$ for some bandwidth $h > 0$. Fix a polynomial order $\gamma \geq 0$ and let $k = (m+\gamma)!/(m!\gamma!)$ be the number of monomials up to order $\gamma$. Using multi-index notation, let $p(w)$ be the $k$-dimensional vector collecting the monomials $w^{\kappa}/\kappa!$ for $0 \leq |\kappa| \leq \gamma$, and set $p_h(w) = p(w/h)$.
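To fix ideas with the multi-index notation, a minimal sketch (in Python; illustrative only, with hypothetical helper names) enumerating the $k = (m+\gamma)!/(m!\,\gamma!)$ monomials and evaluating $p_h(w)$ is as follows; note that the first entry corresponds to $\kappa = 0$, matching the role of $e_1$ below.
\begin{verbatim}
# Illustrative sketch: multi-indices kappa with |kappa| <= gamma, and
# p_h(w) = p(w/h) where p collects the monomials w^kappa / kappa!.
from itertools import product
from math import factorial, prod
import numpy as np

def multi_indices(m, gamma):
    # all kappa in {0,...,gamma}^m with |kappa| <= gamma; first is kappa = 0
    return [kap for kap in product(range(gamma + 1), repeat=m)
            if sum(kap) <= gamma]

def p_h(w, h, gamma):
    u = np.asarray(w, dtype=float) / h
    return np.array([
        prod(ui ** ki for ui, ki in zip(u, kap))
        / prod(factorial(ki) for ki in kap)
        for kap in multi_indices(len(u), gamma)
    ])

print(len(multi_indices(2, 2)))         # k = (2+2)!/(2! 2!) = 6
print(p_h([0.1, 0.2], h=0.5, gamma=2))  # first entry is the constant, 1
\end{verbatim}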
The local polynomial regression estimator of $\mu(w)$ is, with $e_1 = (1, 0, \ldots, 0)^\T \in \R^k$ the first standard unit vector, % \begin{align*} \hat{\mu}(w) &= e_1^\T\hat{\beta}(w) &\text{where} & &\hat{\beta}(w) &= \argmin_{\beta \in \R^{k}} \sum_{i=1}^n \left(Y_i - p_h(W_i-w)^\T \beta \right)^2 K_h(W_i-w). \end{align*} Our goal is again to approximate the distribution of the entire stochastic process, $(\hat{\mu}(w)-\mu(w):w\in\cW)$, which upon rescaling is non-Donsker if $h \to 0$, and decomposes as follows: % \begin{align*} \hat{\mu}(w)-\mu(w) &= e_1^\T H(w)^{-1} S(w) + e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) + \Bias(w) \end{align*} % where $\hat H(w) = \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T$, $H(w) = \E \big[ \hat H(w) \big]$, $S(w)= \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i$, and $\Bias(w) = e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w)$. A key distinctive feature of local polynomial regression is that both $\hat H(w)$ and $S(w)$ are functions of the evaluation point $w\in\cW$; contrast this with the partitioning-based series estimator discussed in Section~\ref{sec:yurinskii_series}, for which neither $\hat H$ nor $S$ depends on $w$. Therefore we use Proposition~\ref{pro:yurinskii_emp_proc} to obtain a Gaussian strong approximation for the martingale empirical process directly. Under mild regularity conditions, including stationarity for simplicity and an $\alpha$-mixing assumption on the time-dependence of the data, we show $\sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 \lesssim_\P \sqrt{n h^{-2m}\log n}$. Further, $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P h^\gamma$ provided that the regression function is sufficiently smooth. It remains to analyze the martingale empirical process given by $\big(e_1^\T H(w)^{-1} S(w) : w\in\cW\big)$ via Proposition~\ref{pro:yurinskii_emp_proc} by setting % \begin{align*} \cF = \left\{ (W_i, \varepsilon_i) \mapsto e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \varepsilon_i : w \in \cW \right\}. \end{align*} % With this approach, we obtain the following result. \begin{proposition}[Strong approximation for local polynomial estimators]% \label{pro:yurinskii_local_poly} Under the nonparametric regression setup described above, assume further that % \begin{enumerate}[label=(\roman*)] \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is strictly stationary. \item $(W_i, \varepsilon_i)_{1 \leq i \leq n}$ is $\alpha$-mixing with mixing coefficients $\alpha(j) \leq e^{-2 j / C_\alpha}$ for some $C_\alpha > 0$. \item $W_i$ has a Lebesgue density on $\cW$ which is bounded above and away from zero. \item $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ for some $C_\varepsilon > 0$ and $\E\left[\varepsilon^2_i \mid \cH_{i-1}\right]=\sigma^2(W_i)$ is bounded away from zero. \item $K$ is a non-negative Lipschitz compactly supported kernel with $\int K(w) \diff{w} = 1$. \end{enumerate} % Then for any $R_n \to \infty$, there is a zero-mean Gaussian process $T(w)$ on $\cW$ with $\Var[T(w)] \asymp\frac{1}{n h^m}$ satisfying $\Cov[T(w), T(w')] = \Cov[e_1^\T H(w)^{-1} S(w),\, e_1^\T H(w')^{-1} S(w')]$ and % \begin{align*} \sup_{w \in \cW} \left|\hat \mu(w) - \mu(w) - T(w) \right| &\lesssim_\P \frac{R_n}{\sqrt{n h^m}} \left( \frac{(\log n)^{m+4}}{n h^{3m}} \right)^{\frac{1}{2m+6}} + \sup_{w \in \cW} |\Bias(w)|, \end{align*} % provided that the bandwidth sequence satisfies $n h^{3m} \to \infty$.
% \end{proposition} If the residuals further satisfy $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$, then a third-order Yurinskii coupling delivers an improved rate of strong approximation for Proposition~\ref{pro:yurinskii_local_poly}; this is omitted here for brevity. For completeness, the proof of Proposition~\ref{pro:yurinskii_local_poly} verifies that if the regression function $\mu(w)$ is $\gamma$ times continuously differentiable on $\cW$ then $\sup_w |\Bias(w)| \lesssim_\P h^\gamma$. Further, the assumption that $p(w)$ is a vector of monomials is unnecessary in general; any collection of bounded linearly independent functions which exhibit appropriate approximation power will suffice \citep{eggermont2009maximum}. As such, we can encompass local splines and wavelets, as well as polynomials, and also choose whether or not to include interactions between the regressor variables. The bandwidth restriction of $n h^{3m} \to \infty$ is analogous to that imposed in Proposition~\ref{pro:yurinskii_series} for partitioning-based series estimators, and as far as we know, has not been improved upon for non-i.i.d.\ data. Applying an anti-concentration result for Gaussian process suprema, such as Corollary~2.1 in \citet{chernozhukov2014anti}, allows one to write a Kolmogorov--Smirnov bound comparing the law of $\sup_{w \in \cW}|\hat\mu(w) - \mu(w)|$ to that of $\sup_{w \in \cW}|T(w)|$. With an appropriate covariance estimator, we can further replace $T(w)$ by a feasible version $\hat T(w)$ or its Studentized counterpart, enabling procedures for uniform inference analogous to the confidence bands constructed in Section~\ref{sec:yurinskii_series}. We omit the details of this to conserve space but note that our assumptions on $W_i$ and $\varepsilon_i$ ensure that Studentization is possible even when the discretized covariance matrix has small eigenvalues (Section~\ref{sec:yurinskii_kde}), as we normalize only by the diagonal entries. \citet[Remark~3.1]{chernozhukov2014gaussian} achieve better rates for approximating the supremum of the $t$-process based on i.i.d.\ data in Kolmogorov--Smirnov distance by bypassing the step where we first approximate the entire stochastic process (see Section~\ref{sec:yurinskii_emp_proc} for a discussion). Nonetheless, our approach targeting the entire process allows for a potential future treatment of other functionals as well as the supremum. We finally remark that in this setting of kernel-based local empirical processes, it is essential that our initial strong approximation result (Corollary~\ref{cor:yurinskii_sa_martingale}) does not impose a lower bound on the eigenvalues of the variance matrix $\Sigma$. This effect was demonstrated by Lemma \ref{lem:yurinskii_kde_eigenvalue}, Figure~\ref{fig:yurinskii_min_eig}, and their surrounding discussion in Section~\ref{sec:yurinskii_kde}. As such, the result of \citet{li2020uniform} is unsuited for this application, even in its simplest formulation, due to the strong minimum eigenvalue assumption. \section{Conclusion} \label{sec:yurinskii_conclusion} In this chapter we introduced as our main result a new version of Yurinskii's coupling which strictly generalizes all previously known forms of the result. Our formulation gave a Gaussian mixture coupling for approximate martingale vectors in $\ell^p$-norm where $1 \leq p \leq \infty$, with no restrictions on the minimum eigenvalues of the associated covariance matrices. 
We further showed how to obtain an improved approximation whenever third moments of the data are negligible. We demonstrated the applicability of this main result by first deriving a user-friendly version, and then specializing it to mixingales, martingales, and independent data, illustrating the benefits with a collection of simple factor models. We then considered the problem of constructing uniform strong approximations for martingale empirical processes, demonstrating how our new Yurinskii coupling can be employed in a stochastic process setting. As substantive illustrative applications of our theory to some well-established problems in statistical methodology, we showed how to use our coupling results for both vector-valued and empirical process-valued martingales in developing uniform inference procedures for partitioning-based series estimators and local polynomial models in nonparametric regression. At each stage we addressed issues of feasibility, compared our work with the existing literature, and provided implementable statistical inference procedures. The work in this chapter is based on \citet{cattaneo2022yurinskii}. \appendix \chapter{Supplement to Inference with Mondrian Random Forests} \label{app:mondrian} In this section we present the full proofs of all our results, and also state some useful technical preliminary and intermediate lemmas, along with some further properties of the Mondrian process not required for our primary analysis. See Section~\ref{sec:mondrian_overview_proofs} in the main text for an overview of the main proof strategies and a discussion of the challenges involved. We use the following simplified notation for convenience, whenever it is appropriate. We write $\I_{i b}(x) = \I \left\{ X_i \in T_b(x) \right\}$ and $N_b(x) = \sum_{i=1}^{n} \I_{i b}(x)$, as well as $\I_b(x) = \I \left\{ N_b(x) \geq 1 \right\}$. \section{Preliminary lemmas} We begin by bounding the maximum size of any cell in a Mondrian forest containing $x$. This result is used regularly throughout many of our other proofs, and captures the ``localizing'' behavior of the Mondrian random forest estimator, showing that Mondrian cells have side lengths at most on the order of $1/\lambda$. \begin{lemma}[Upper bound on the largest cell in a Mondrian forest]% \label{lem:mondrian_app_largest_cell} % Let $T_1, \ldots, T_b \sim \cM\big([0,1]^d, \lambda\big)$ and take $x \in (0,1)^d$. Then for all $t > 0$ % \begin{align*} \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \geq \frac{t}{\lambda} \right) &\leq 2dB e^{-t/2}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell}] % We use the distribution of the Mondrian cell shape \citep[Proposition~1]{mourtada2020minimax}. We have $|T_b(x)_j| = \left( \frac{E_{bj1}}{\lambda} \wedge x_j \right) + \left( \frac{E_{bj2}}{\lambda} \wedge (1-x_j) \right)$ where $E_{bj1}$ and $E_{bj2}$ are i.i.d.\ $\Exp(1)$ variables for $1 \leq b \leq B$ and $1 \leq j \leq d$. Thus $|T_b(x)_j| \leq \frac{E_{bj1} + E_{bj2}}{\lambda}$ so by a union bound % \begin{align*} \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \geq \frac{t}{\lambda} \right) &\leq \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} (E_{bj1} \vee E_{bj2}) \geq \frac{t}{2} \right) \\ &\leq 2dB\, \P \left( E_{bj1} \geq \frac{t}{2} \right) \leq 2dB e^{-t/2}. \end{align*} % \end{proof} Next is another localization result, showing that the union of the cells $T_b(x)$ containing $x$ does not contain ``too many'' samples $X_i$. 
Thus the Mondrian random forest estimator fitted at $x$ depends only on the order of $n/\lambda^d$ data points (the effective sample size), up to logarithmic terms. \begin{lemma}[Upper bound on the number of active data points]% \label{lem:mondrian_app_active_data} Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold, and define $N_{\cup}(x) = \sum_{i=1}^{n} \I \left\{ X_i \in \bigcup_{b=1}^{B} T_b(x) \right\}$. Then for $t > 0$ and sufficiently large $n$, with $\|f\|_\infty = \sup_{x \in [0,1]^d} f(x)$, % \begin{align*} \P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) &\leq 4 d B e^{-t/4}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_active_data}] Note $N_\cup(x) \sim \Bin\left(n, \int_{\bigcup_{b=1}^{B} T_b(x)} f(s) \diff s \right) \leq \Bin\left(n, 2^d \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j|^d \|f\|_\infty \right)$ conditionally on $\bT$. If $N \sim \Bin(n,p)$ then, by Bernstein's inequality, $\P\left( N \geq (1 + t) n p\right) \leq \exp\left(-\frac{t^2 n^2 p^2 / 2}{n p(1-p) + t n p / 3}\right) \leq \exp\left(-\frac{3t^2 n p}{6 + 2t}\right)$. Thus for $t \geq 2$, % \begin{align*} \P \left( N_{\cup}(x) > (1+t) n \frac{2^d t^d}{\lambda^d} \|f\|_\infty \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \leq \frac{t}{\lambda} \right) &\leq \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right). \end{align*} % By Lemma~\ref{lem:mondrian_app_largest_cell}, $\P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| > \frac{t}{\lambda} \right) \leq 2 d B e^{-t/2}$. Hence % \begin{align*} &\P \left( N_{\cup}(x) > 2^{d+1} t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) \\ &\quad\leq \P \left( N_{\cup}(x) > 2 t n \frac{2^d t^d}{\lambda^d} \|f\|_\infty \Bigm| \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| \leq \frac{t}{\lambda} \right) + \P \left( \max_{1 \leq b \leq B} \max_{1 \leq j \leq d} |T_b(x)_j| > \frac{t}{\lambda} \right) \\ &\quad\leq \exp\left(- \frac{2^d t^{d} n}{\lambda^d}\right) + 2 d B e^{-t/2}. \end{align*} % Replacing $t$ by $t/2$ gives that for sufficiently large $n$ such that $n / \lambda^d \geq 1$, % \begin{align*} \P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) &\leq 4 d B e^{-t/4}. \end{align*} % \end{proof} Next we give a series of results culminating in a generalized moment bound for the denominator appearing in the Mondrian random forest estimator. We begin by providing a moment bound for the truncated inverse binomial distribution, which will be useful for controlling $\frac{\I_b(x)}{N_b(x)} \leq 1 \wedge \frac{1}{N_b(x)}$ because conditional on $T_b$ we have $N_b(x) \sim \Bin \left( n, \int_{T_b(x)} f(s) \diff s \right)$. Our constants could be significantly suboptimal but they are sufficient for our applications. \begin{lemma}[An inverse moment bound for the binomial distribution]% \label{lem:mondrian_app_binomial_bound} For $n \geq 1$ and $p \in [0,1]$, let $N \sim \Bin(n, p)$ and $a_1, \ldots, a_k \geq 0$. Then % \begin{align*} \E\left[ \prod_{j=1}^k \left( 1 \wedge \frac{1}{N + a_j} \right) \right] &\leq (9k)^k \prod_{j=1}^k \left( 1 \wedge \frac{1}{n p + a_j} \right). \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_bound}] By Bernstein's inequality, $\P\left( N \leq n p - t \right) \leq \exp\left(-\frac{t^2/2}{n p(1-p) + t/3}\right) \leq \exp\left(-\frac{3t^2}{6n p + 2t}\right)$. Therefore we have $\P\left( N \leq n p/4 \right) \leq \exp\left(-\frac{27 n^2 p^2 / 16}{6n p + 3 n p / 2}\right) = e^{-9 n p / 40}$.
Partitioning by this event gives % \begin{align*} \E\left[ \prod_{j=1}^k \left( 1 \wedge \frac{1}{N + a_j} \right) \right] &\leq e^{-9 n p / 40} \prod_{j=1}^k \frac{1}{1 \vee a_j} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ &\leq \prod_{j=1}^k \frac{1}{\frac{9 n p}{40k} + (1 \vee a_j)} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ &\leq \prod_{j=1}^k \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} + \prod_{j=1}^k \frac{1}{1 \vee (\frac{n p}{4} + a_j)} \\ &\leq 2 \prod_{j=1}^k \frac{1}{1 \vee \left(\frac{9 n p}{40k} + a_j\right)} \leq 2 \prod_{j=1}^k \frac{40k/9}{1 \vee \left(n p + a_j\right)} \\ &\leq (9k)^k \prod_{j=1}^k \left( 1 \wedge \frac{1}{n p + a_j} \right). \end{align*} \end{proof} Our next result is probably the most technically involved, allowing one to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ by the corresponding moments of (products of) $\frac{1}{n |T_b(x)|}$, again based on the heuristic that $N_b(x)$ is conditionally binomial so concentrates around its conditional expectation $n \int_{T_b(x)} f(x) \diff s \asymp n |T_b(x)|$. By independence of the trees, the latter expected products then factorize since the dependence on the data $X_i$ has been eliminated. The proof is complicated, and relies on the following induction procedure. First we consider the common refinement consisting of the subcells $\cR$ generated by all possible intersections of $T_b(x)$ over the selected trees (say $T_{b}(x), T_{b'}(x), T_{b''}(x)$ though there could be arbitrarily many). Note that $N_b(x)$ is the sum of the number of samples $X_i$ in each such subcell in $\cR$. We then apply Lemma~\ref{lem:mondrian_app_binomial_bound} repeatedly to each subcell in $\cR$ in turn, replacing the number of samples $X_i$ in that subcell with its volume multiplied by $n$, and controlling the error incurred at each step. We record the subcells which have been ``checked'' in this manner using the class $\cD \subseteq \cR$ and proceed by finite induction, beginning with $\cD = \emptyset$ and ending at $\cD = \cR$. \begin{lemma}[Generalized moment bound for Mondrian random forest denominators]% \label{lem:mondrian_app_moment_denominator} Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ be independent and $k_b \geq 1$ for $1 \leq b \leq B_0$. Then with $k = \sum_{b=1}^{B_0} k_b$, for sufficiently large $n$, % \begin{align*} \E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right] &\leq \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} \prod_{b=1}^{B_0} \E \left[ 1 \wedge \frac{1}{(n |T_b(x)|)^{k_b}} \right]. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_moment_denominator}] Define the common refinement of $\left\{ T_b(x) : 1 \leq b \leq {B_0} \right\}$ as the class of sets % \begin{align*} \cR &= \left\{ \bigcap_{b=1}^{B_0} D_b : D_b \in \big\{ T_b(x), T_b(x)^{\comp} \big\} \right\} \bigsetminus \left\{ \emptyset,\, \bigcap_{b=1}^{B_0} T_b(x)^\comp \right\} \end{align*} % and let $\cD \subset \cR$. We will proceed by induction on the elements of $\cD$, which represents the subcells we have checked, starting from $\cD = \emptyset$ and finishing at $\cD = \cR$. 
For $D \in \cR$ let $\cA(D) = \left\{ 1 \leq b \leq {B_0} : D \subseteq T_b(x) \right\}$ be the indices of the trees which are active on subcell $D$, and for $1 \leq b \leq {B_0}$ let $\cA(b) = \left\{ D \in \cR : D \subseteq T_b(x) \right\}$ be the subcells which are contained in $T_b(x)$, so that $b \in \cA(D) \iff D \in \cA(b)$. For a subcell $D \in \cR$, write $N_b(D) = \sum_{i=1}^{n} \I \left\{ X_i \in D \right\}$ so that $N_b(x) = \sum_{D \in \cA(b)} N_b(D)$. Note that for any $D \in \cR \setminus \cD$, % \begin{align*} &\E \left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right] \\ &\quad= \E \left[ \prod_{b \notin \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right. \\ &\left. \qquad \times\,\E\left[ \prod_{b \in \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right.\right. \\ &\left.\left. \quad\qquad\qquad\biggm| \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right] \right]. \end{align*} % Now the inner conditional expectation is over $N_b(D)$ only. Since $f$ is bounded away from zero, % \begin{align*} N_b(D) &\sim \Bin\left( n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ \frac{\int_{D} f(s) \diff s} {1 - \int_{\bigcup \left( \cR \setminus \cD \right) \setminus D} f(s) \diff s} \right) \\ &\geq \Bin\left( n - \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D'), \ |D| \inf_{x \in [0,1]^d} f(x) \right) \end{align*} % conditional on $\bT$ and $N_b(D') : D' \in \cR \setminus (\cD \cup \{D\})$. For sufficiently large $t$ by Lemma~\ref{lem:mondrian_app_active_data} % \begin{align*} \P \left( \sum_{D' \in \cR \setminus (\cD \cup \{D\})} N_b(D') > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) &\leq \P \left( N_{\cup}(x) > t^{d+1} \frac{n}{\lambda^d} \|f\|_\infty \right) \leq 4 d B_0 e^{-t/4}. \end{align*} % Thus $N_b(D) \geq \Bin(n/2, |D| \inf_x f(x))$ conditional on $\left\{ \bT, N_b(D') : D' \in \cR \setminus (\cD \cup \{D\}) \right\}$ with probability at least $1 - 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}$. So by Lemma~\ref{lem:mondrian_app_binomial_bound}, % \begin{align*} &\E \Bigg[ \prod_{b \in \cA(D)} \! \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \biggm| \! \bT, N_b(D')\! : D' \in \cR \setminus \! (\cD \cup \{D\}) \Bigg] \\ &\quad\leq \E \! \left[ \prod_{b \in \cA(D)} \frac{(9k)^{k_b}}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n |D| \inf_x f(x) / 2 + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b}} \right] \\ &\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \\ &\quad\leq \left( \frac{18k}{\inf_x f(x)} \right)^k \! \E \! \left[ \prod_{b \in \cA(D)} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} |D'| \right)^{k_b}} \right] \\ &\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. 
\end{align*} % Therefore plugging this back into the marginal expectation yields % \begin{align*} &\E\left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus \cD} N_b(D') + n \sum_{D' \in \cA(b) \cap \cD} |D'| \right)^{k_b} } \right] \\ &\quad\leq \left( \frac{18k}{\inf_x f(x)} \right)^k \E \left[ \prod_{b=1}^{B_0} \frac{1}{ 1 \vee \left( \sum_{D' \in \cA(b) \setminus (\cD \cup \{D\})} N_b(D') + n \sum_{D' \in \cA(b) \cap (\cD \cup \{D\})} |D'| \right)^{k_b}} \right] \\ &\qquad+ 4 d B_0 e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}}. \end{align*} % Now we apply induction, starting with $\cD = \emptyset$ and adding $D \in \cR \setminus \cD$ to $\cD$ until $\cD = \cR$. This takes at most $|\cR| \leq 2^{B_0}$ steps and yields % \begin{align*} \E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right] &\leq \E\left[ \prod_{b=1}^{B_0} \frac{1}{1 \vee N_b(x)^{k_b}} \right] = \E\left[ \prod_{b=1}^{B_0} \frac{1}{1 \vee \left( \sum_{D \in \cA(b)} N_b(D) \right)^{k_b}} \right] \leq \cdots \\ &\leq \left( \frac{18k}{\inf_x f(x)} \right)^{2^{B_0} k} \left( \prod_{b=1}^{B_0} \,\E \left[ \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} \right] + 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \right), \end{align*} % where the expectation factorizes due to independence of the $T_b(x)$. The last step is to remove the trailing exponential term. To do this, note that by Jensen's inequality, % \begin{align*} \prod_{b=1}^{B_0} \,\E \left[ \frac{1}{1 \vee (n |T_b(x)|)^{k_b}} \right] &\geq \prod_{b=1}^{B_0} \frac{1} {\E \left[ 1 \vee (n |T_b(x)|)^{k_b} \right]} \geq \prod_{b=1}^{B_0} \frac{1}{n^{k_b}} = n^{-k} \geq 4 d B_0 2^{B_0} e^{\frac{-\sqrt \lambda}{8 \|f\|_\infty}} \end{align*} % for sufficiently large $n$ because $B_0$, $d$, and $k$ are fixed while $\log \lambda \gtrsim \log n$. \end{proof} Now that moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ have been bounded by moments of (products of) $\frac{1}{n |T_b(x)|}$, we establish further explicit bounds for these in the next result. Note that the problem has been reduced to determining properties of Mondrian cells, so once again we return to the exact cell shape distribution given by \citet{mourtada2020minimax}, and evaluate the appropriate expectations by integration. Note that the truncation by taking the minimum with one inside the expectation is essential here, as otherwise the second moment of the inverse Mondrian cell volume is not even finite. As such, there is a ``penalty'' of $\log n$ when bounding truncated second moments, and the upper bound for the $k$th moment is significantly larger than the naive guess of $(\lambda^d / n)^k$ whenever $k \geq 3$. This ``small cell'' phenomenon in which the inverse volumes of Mondrian cells have heavy tails is a recurring challenge. \begin{lemma}[Inverse moments of the volume of a Mondrian cell]% \label{lem:mondrian_app_moment_cell} Suppose Assumption~\ref{ass:mondrian_estimator} holds and let $T \sim \cM\big([0,1]^d, \lambda\big)$. Then for sufficiently large $n$, % \begin{align*} \E\left[ 1 \wedge \frac{1}{(n |T(x)|)^k} \right] &\leq \left( \frac{\lambda^d}{n} \right)^{\I \left\{ k = 1 \right\}} \left( \frac{3 \lambda^{2d} \log n}{n^2} \right)^{\I \left\{ k \geq 2 \right\}} \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)}.
\end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_moment_cell}] By \citet[Proposition~1]{mourtada2020minimax}, $|T(x)| = \prod_{j=1}^{d} \left( \left(\frac{1}{\lambda} E_{j1} \right) \wedge x_j + \left( \frac{1}{\lambda} E_{j2} \right) \wedge (1-x_j) \right)$ where $E_{j1}$ and $E_{j2}$ are mutually independent $\Exp(1)$ random variables. Thus for $0 < t \leq 1$, integrating against this cell-shape distribution gives % \begin{align*} \E \left[ \frac{1}{1 \vee (n |T(x)|)^k} \right] &\leq d t + \left( \frac{\lambda^d}{n} \right)^{k} \left( \log \frac{1}{t} \right)^{k-1} \prod_{j=1}^d \frac{1}{x_j(1-x_j)}, \end{align*} % and for $k > 2$ we use $\frac{1}{1 \vee (n |T(x)|)^k} \leq \frac{1}{1 \vee (n |T(x)|)^{k-1}}$ to reduce $k$. Now if $k = 1$ we let $t \to 0$, giving % \begin{align*} \E \left[ \frac{1}{1 \vee (n |T(x)|)} \right] &\leq \frac{\lambda^d}{n} \prod_{j=1}^d \frac{1}{x_j(1-x_j)}, \end{align*} % and if $k = 2$ then we set $t = 1/n^2$ so that for sufficiently large $n$, % \begin{align*} \E \left[ \frac{1}{1 \vee (n |T(x)|)^2} \right] &\leq \frac{d}{n^2} + \frac{2 \lambda^{2d} \log n}{n^2} \prod_{j=1}^d \frac{1}{x_j(1-x_j)} \leq \frac{3 \lambda^{2d} \log n}{n^2} \prod_{j=1}^d \frac{1}{x_j(1-x_j)}. \end{align*} % Lower bounds which match up to constants for the first moment and up to logarithmic terms for the second moment are obtained as $\E \left[ 1 \wedge \frac{1}{(n|T(x)|)^2} \right] \geq \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right]^2$ by Jensen's inequality, and % \begin{align*} \E \left[ 1 \wedge \frac{1}{n|T(x)|} \right] &\geq \frac{1}{1 + n \E \left[ |T(x)| \right]} \geq \frac{1}{1 + 2^d n / \lambda^d} \gtrsim \frac{\lambda^d}{n}. \end{align*} \end{proof} The endeavor to bound moments of (products of) $\frac{\I_b(x)}{N_b(x)}$ is concluded with the next result, combining the previous two lemmas to give a bound without expectations on the right-hand side. \begin{lemma}[Simplified generalized moment bound for Mondrian forest denominators]% \label{lem:mondrian_app_simple_moment_denominator} % Suppose Assumptions~\ref{ass:mondrian_data} and \ref{ass:mondrian_estimator} hold. Let $T_b \sim \cM\big([0,1]^d, \lambda\big)$ and $k_b \geq 1$ for $1 \leq b \leq B_0$. Then with $k = \sum_{b=1}^{B_0} k_b$, % \begin{align*} &\E\left[ \prod_{b=1}^{B_0} \frac{\I_b(x)}{N_b(x)^{k_b}} \right] \\ &\quad\leq \left( \frac{36k}{\inf_{x \in [0,1]^d} f(x)} \right)^{2^{B_0} k} \left( \prod_{j=1}^{d} \frac{1}{x_j (1-x_j)} \right)^{B_0} \prod_{b=1}^{B_0} \left( \frac{\lambda^d}{n} \right)^{\I \left\{ k_b = 1 \right\}} \left( \frac{\lambda^{2d} \log n}{n^2} \right)^{\I \left\{ k_b \geq 2 \right\}} \end{align*} % for sufficiently large $n$. % \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_simple_moment_denominator}] This follows directly from Lemmas~\ref{lem:mondrian_app_moment_denominator} and \ref{lem:mondrian_app_moment_cell}. \end{proof} Our final preliminary lemma is concerned with further properties of the inverse truncated binomial distribution, again with the aim of analyzing $\frac{\I_b(x)}{N_b(x)}$. This time, instead of merely upper bounding the moments, we aim to give convergence results for those moments, again in terms of moments of $\frac{1}{n |T_b(x)|}$. Here we only need to handle the first and second moments, so this result does not strictly generalize Lemma~\ref{lem:mondrian_app_binomial_bound} except in simple cases. The proof is by Taylor's theorem and the Cauchy--Schwarz inequality, using explicit expressions for moments of the binomial distribution and bounds from Lemma~\ref{lem:mondrian_app_binomial_bound}. \begin{lemma}[Expectation inequalities for the binomial distribution]% \label{lem:mondrian_app_binomial_expectation} Let $N \sim \Bin(n, p)$ and take $a, b \geq 1$.
Then
%
\begin{align*}
0 &\leq \E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a} \leq \frac{2^{19}}{(n p+a)^2}, \\
0 &\leq \E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)} \leq \frac{2^{27}}{(n p +a)(n p +b)} \left( \frac{1}{n p + a} + \frac{1}{n p + b} \right).
\end{align*}
\end{lemma}
\begin{proof}[Lemma~\ref{lem:mondrian_app_binomial_expectation}]
For the first result, Taylor's theorem with Lagrange remainder for $N \mapsto \frac{1}{N+a}$ around $n p$ gives
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right] &= \E \left[ \frac{1}{n p+a} - \frac{N - n p}{(n p+a)^2} + \frac{(N - n p)^2}{(\xi+a)^3} \right]
\end{align*}
%
for some $\xi$ between $n p$ and $N$. The term linear in $N - n p$ is zero-mean, showing the non-negativity part, and the Cauchy--Schwarz inequality for the remaining term gives
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a} &\leq \E \left[ \frac{(N - n p)^2}{(n p+a)^3} + \frac{(N - n p)^2}{(N+a)^3} \right] \\
&\leq \frac{\E\big[(N - n p)^2\big]}{(n p+a)^3} + \sqrt{ \E\big[(N - n p)^4\big] \E \left[ \frac{1}{(N+a)^6} \right]}.
\end{align*}
%
Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
%
\begin{align*}
\E \left[ \frac{1}{N+a} \right] - \frac{1}{n p+a} &\leq \frac{n p}{(n p+a)^3} + \sqrt{\frac{54^6 n p(1+3 n p)}{(n p + a)^6}} \leq \frac{2^{19}}{(n p+a)^2}.
\end{align*}
%
For the second result, Taylor's theorem applied to $N \mapsto \frac{1}{(N+a)(N+b)}$ around $n p$ gives
%
\begin{align*}
\E \left[ \frac{1}{(N+a)(N+b)} \right] &= \E \left[ \frac{1}{(n p+a)(n p + b)} - \frac{(N - n p)(2 n p + a + b)}{(n p + a)^2 (n p + b)^2} \right] \\
&\quad+ \E \left[ \frac{(N - n p)^2}{(\xi+a)(\xi+b)} \left( \frac{1}{(\xi + a)^2} + \frac{1}{(\xi + a)(\xi + b)} + \frac{1}{(\xi + b)^2} \right) \right]
\end{align*}
%
for some $\xi$ between $n p$ and $N$. The term linear in $N - n p$ is zero-mean, showing non-negativity, and applying the Cauchy--Schwarz inequality to the remaining term gives
%
\begin{align*}
&\E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)} \\
&\quad\leq \E \left[ \frac{2 (N - n p)^2}{(N+a)(N+b)} \left( \frac{1}{(N + a)^2} + \frac{1}{(N + b)^2} \right) \right] \\
&\qquad+ \E \left[ \frac{2 (N - n p)^2}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right) \right] \\
&\quad\leq \sqrt{ 4 \E \left[ (N - n p)^4 \right] \E \left[ \frac{1}{(N + a)^6 (N+b)^2} + \frac{1}{(N + b)^6 (N+a)^2} \right]} \\
&\qquad+ \frac{2 \E\big[(N - n p)^2\big]}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right).
\end{align*}
%
Now we use $\E\big[(N - n p)^4\big] \leq n p(1+3n p)$ and apply Lemma~\ref{lem:mondrian_app_binomial_bound} to see that
%
\begin{align*}
\E \left[ \frac{1}{(N+a)(N+b)} \right] - \frac{1}{(n p+a)(n p+b)} &\leq \sqrt{ \frac{4n p (1 + 3n p) \cdot 72^8}{(n p + a)^2 (n p + b)^2} \left( \frac{1}{(n p + a)^4} + \frac{1}{(n p + b)^4} \right)} \\
&\quad+ \frac{2 n p}{(n p +a)(n p +b)} \left( \frac{1}{(n p + a)^2} + \frac{1}{(n p + b)^2} \right) \\
&\leq \frac{2^{27}}{(n p + a) (n p + b)} \left( \frac{1}{n p + a} + \frac{1}{n p + b} \right).
\end{align*}
%
\end{proof}
\section{Proofs of main results}
\label{sec:mondrian_app_proofs}
\subsection{Mondrian random forests}
We give rigorous proofs of the central limit theorem, bias characterization, and variance estimation results for the Mondrian random forest estimator without debiasing.
See Section~\ref{sec:mondrian_overview_proofs} in the main text for details on our approaches to these proofs.
\begin{proof}[Theorem~\ref{thm:mondrian_clt}]
This follows from the debiased version (Theorem~\ref{thm:mondrian_clt_debiased}) with $J=0$, $a_0 = 1$, and $\omega_0 = 1$.
\end{proof}
\begin{proof}[Theorem~\ref{thm:mondrian_bias}]
\proofparagraph{removing the dependence on the trees}
By measurability and with $\mu(X_i) = \E[Y_i \mid X_i]$ almost surely,
%
\begin{align*}
\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x) &= \frac{1}{B} \sum_{b=1}^B \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)}.
\end{align*}
%
Conditional on $\bX$, the terms in the outer sum depend only on $T_b$ so are i.i.d. As $\mu$ is Lipschitz,
%
\begin{align*}
&\Var \big[ \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x) \mid \bX \big] \leq \frac{1}{B} \E \left[ \left( \sum_{i=1}^n \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)} \right)^2 \Bigm| \bX \right] \\
&\quad\lesssim \frac{1}{B} \E \left[ \max_{1 \leq i \leq n} \big\| X_i - x \big\|_2^2 \left( \sum_{i=1}^n \frac{\I_{i b}(x)}{N_b(x)} \right)^2 \Bigm| \bX \right] \lesssim \frac{1}{B} \sum_{j=1}^{d} \E \left[ |T(x)_j|^2 \right] \lesssim \frac{1}{\lambda^2 B},
\end{align*}
%
using the law of $T(x)_j$ from \citet[Proposition~1]{mourtada2020minimax}. By Chebyshev's inequality,
%
\begin{align*}
\big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \mid \bX \right] \big| &\lesssim_\P \frac{1}{\lambda \sqrt B}.
\end{align*}
\proofparagraph{showing the conditional bias converges in probability}
Now $\E \left[ \hat\mu(x) \mid \bX \right]$ is a non-linear function of the i.i.d.\ random variables $X_i$, so we use the Efron--Stein inequality \citep{efron1981jackknife} to bound its variance. Let $\tilde X_{i j} = X_i$ if $i \neq j$, and for $i = j$ let $\tilde X_{j j}$ be an independent copy of $X_j$, denoted $\tilde X_j$. Write $\tilde \bX_j = (\tilde X_{1j}, \ldots, \tilde X_{n j})$ and similarly $\tilde \I_{i j b}(x) = \I \big\{ \tilde X_{i j} \in T_b(x) \big\}$ and $\tilde N_{j b}(x) = \sum_{i=1}^{n} \tilde \I_{i j b}(x)$.
%
\begin{align}
\nonumber
&\Var \left[ \sum_{i=1}^{n} \big( \mu(X_i) - \mu(x) \big) \E \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] \right] \\
\nonumber
&\quad\leq \frac{1}{2} \sum_{j=1}^{n} \E \! \left[ \! \left( \sum_{i=1}^{n} \big( \mu(X_i) - \mu(x) \big) \E \! \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] - \sum_{i=1}^{n} \left( \mu(\tilde X_{i j}) - \mu(x) \right) \E \! \left[ \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} \Bigm| \tilde \bX_j \right] \right)^{\! \! 2} \right] \\
\nonumber
&\quad\leq \frac{1}{2} \sum_{j=1}^{n} \E \left[ \left( \sum_{i=1}^{n} \left( \big( \mu(X_i) - \mu(x) \big) \frac{\I_{i b}(x)}{N_b(x)} - \left( \mu(\tilde X_{i j}) - \mu(x) \right) \frac{\tilde \I_{i j b}(x)}{\tilde N_{j b}(x)} \right) \right)^2 \right] \\
\nonumber
&\quad\leq \sum_{j=1}^{n} \E \left[ \left( \sum_{i \neq j} \big( \mu(X_i) - \mu(x) \big) \left( \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} \right) \right)^{\!\!2} \, \right] \\
\label{eq:mondrian_app_bias_efron_stein}
&\qquad+ 2 \sum_{j=1}^{n} \E \left[ \left( \mu(X_j) - \mu(x) \right)^2 \frac{\I_{j b}(x)}{N_b(x)^2} \right].
\end{align}
%
For the first term in \eqref{eq:mondrian_app_bias_efron_stein} to be non-zero, we must have $|N_b(x) - \tilde N_{j b}(x)| = 1$. Writing $N_{-j b}(x) = \sum_{i \neq j} \I_{i b}(x)$, assume by symmetry that $\tilde N_{j b}(x) = N_{-j b}(x)$, $N_b(x) = N_{-j b}(x) + 1$, and $\I_{j b}(x) = 1$.
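In this case the two ratios differ only through their denominators: for each $i \neq j$ with $\I_{i b}(x) = 1$,
%
\begin{align*}
\frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)}
&= \frac{1}{N_{-j b}(x) + 1} - \frac{1}{N_{-j b}(x)}
= \frac{-1}{N_{-j b}(x) \big( N_{-j b}(x) + 1 \big)},
\end{align*}
%
which gives the ratio appearing inside the square in the following bound.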
As $f$ is bounded and $\mu$ is Lipschitz, writing $\I_{-j b}(x) = \I \left\{ N_{-j b}(x) \geq 1 \right\}$, % \begin{align*} &\sum_{j=1}^{n} \E \left[ \left( \sum_{i \neq j} \left( \mu(X_i) - \mu(x) \right) \left( \frac{\I_{i b}(x)}{N_b(x)} - \frac{\I_{i b}(x)}{\tilde N_{j b}(x)} \right) \right)^{\! 2} \, \right] \\ &\quad\lesssim \sum_{j=1}^{n} \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \left( \frac{\sum_{i \neq j}\I_{i b}(x) \I_{j b}(x)} {N_{-j b}(x)(N_{-j b}(x) + 1)} \right)^2 \right] \lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \frac{\I_{b}(x)}{N_{b}(x)} \right]. \end{align*} % For $t > 0$, partition by $\left\{ \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right\}$ and apply Lemma~\ref{lem:mondrian_app_largest_cell} and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}: % \begin{align*} \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^2 \frac{\I_{b}(x)}{N_{b}(x)} \right] &\leq \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right) + (t / \lambda)^2\, \E \left[ \frac{\I_{b}(x)}{N_{b}(x)} \right] \\ &\lesssim e^{-t/2} + \left( \frac{t}{\lambda} \right)^2 \frac{\lambda^d}{n} \lesssim \frac{1}{n^2} + \frac{(\log n)^2}{\lambda^2} \frac{\lambda^d}{n} \lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n}, \end{align*} % where we set $t = 4 \log n$. For the second term in \eqref{eq:mondrian_app_bias_efron_stein} we have % \begin{align*} \sum_{j=1}^{n} \E \left[ \left( \mu(X_j) - \mu(x) \right)^2 \frac{\I_{j b}(x)}{N_b(x)^2} \right] &\lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^{2} \frac{\I_{b}(x)}{N_b(x)} \right] \lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n} \end{align*} % in the same manner. Hence % \begin{align*} \Var \left[ \sum_{i=1}^{n} \left( \mu(X_i) - \mu(x) \right) \E \left[ \frac{\I_{i b}(x)}{N_b(x)} \Bigm| \bX \right] \right] &\lesssim \frac{(\log n)^2}{\lambda^2} \frac{\lambda^{d}}{n}, \end{align*} % and so by Chebyshev's inequality, % \begin{align*} \big| \E \left[ \hat \mu(x) \mid \bX, \bT \right] - \E \left[ \hat \mu(x) \right] \big| &\lesssim_\P \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{ \frac{\lambda^{d}}{n} }. \end{align*} \proofparagraph{computing the limiting bias} It remains to compute the limit of $\E \left[ \hat \mu(x) \right] - \mu(x)$. Let $\bX_{-i} = (X_1, \ldots, X_{i-1}, X_{i+1}, \ldots, X_n)$ and $N_{-i b}(x) = \sum_{j=1}^n \I\{j \neq i\} \I\{X_j \in T_b(x)\}$. Then % \begin{align*} &\E \left[ \hat \mu(x) \right] - \mu(x) = \E \left[ \sum_{i=1}^{n} \left( \mu(X_i) - \mu(x) \right) \frac{\I_{i b}(x)}{N_b(x)} \right] \\ &\quad= \sum_{i=1}^{n} \E \left[ \E \left[ \frac{\left( \mu(X_i) - \mu(x) \right)\I_{i b}(x)} {N_{-i b}(x) + 1} \bigm| \bT, \bX_{-i} \right] \right] = n \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {N_{-i b}(x) + 1} \right]. \end{align*} % By Lemma~\ref{lem:mondrian_app_binomial_expectation}, as $N_{-i b}(x) \sim \Bin\left(n-1, \int_{T_b(x)} f(s) \diff s \right)$ given $\bT$ and $f$ is bounded below, % \begin{align*} \left| \E \! \left[ \frac{1}{N_{-i b}(x) + 1} \Bigm| \bT \right] - \frac{1}{(n-1) \! \int_{T_b(x)} \! f(s) \diff s + 1} \right| &\lesssim \frac{1}{n^2 \! \left( \int_{T_b(x)} f(s) \diff s \right)^2} \wedge 1 \lesssim \frac{1}{n^2 |T_b(x)|^2} \wedge 1, \end{align*} % and also % \begin{align*} \left| \frac{1}{(n-1) \int_{T_b(x)} f(s) \diff s + 1} - \frac{1}{n \int_{T_b(x)} f(s) \diff s} \right| &\lesssim \frac{1}{n^2 \left( \int_{T_b(x)} f(s) \diff s\right)^2} \wedge 1 \lesssim \frac{1}{n^2 |T_b(x)|^2} \wedge 1. 
\end{align*}
%
So by Lemmas~\ref{lem:mondrian_app_largest_cell} and \ref{lem:mondrian_app_moment_cell}, since $f$ is Lipschitz and bounded, using Cauchy--Schwarz,
%
\begin{align*}
&\left| \E \left[ \hat \mu(x) \right] - \mu(x) - \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {\int_{T_b(x)} f(s) \diff s} \right] \right| \lesssim \E \left[ \frac{n \int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {n^2 |T_b(x)|^2 \vee 1} \right] \\
&\qquad\lesssim \E \left[ \frac{\max_{1 \leq l \leq d} |T_b(x)_l| } {n |T_b(x)| \vee 1} \right] \\
&\qquad\lesssim \frac{4 \log n}{\lambda} \, \E \left[ \frac{1}{n |T_b(x)| \vee 1} \right] + \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| > \frac{4 \log n}{\lambda} \right)^{1/2} \E \left[ \frac{1} {n^2 |T_b(x)|^2 \vee 1} \right]^{1/2} \\
&\qquad\lesssim \frac{\log n}{\lambda} \, \frac{\lambda^d}{n} + \frac{d}{n} \frac{\lambda^d \sqrt{\log n}}{n} \lesssim \frac{\log n}{\lambda} \, \frac{\lambda^d}{n}.
\end{align*}
%
Next set $A = \frac{1}{f(x) |T_b(x)|} \int_{T_b(x)} (f(s) - f(x)) \diff s \geq \inf_{s \in [0,1]^d} \frac{f(s)}{f(x)} - 1$. Use the Maclaurin series of $\frac{1}{1+x}$ up to order $\flbeta$ to see $\frac{1}{1 + A} = \sum_{k=0}^{\flbeta} (-1)^k A^k + O \left( |A|^{\flbeta + 1} \right)$. Hence
%
\begin{align*}
&\E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {\int_{T_b(x)} f(s) \diff s} \right] = \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x) |{T_b(x)}|} \frac{1}{1 + A} \right] \\
&\quad= \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x) |{T_b(x)}|} \left( \sum_{k=0}^{\flbeta} (-1)^k A^k + O \left( |A|^{\flbeta + 1} \right) \right) \right].
\end{align*}
%
Note that since $f$ and $\mu$ are Lipschitz, and by integrating the tail probability given in Lemma~\ref{lem:mondrian_app_largest_cell}, the Maclaurin remainder term is bounded by
%
\begin{align*}
&\E \left[ \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {f(x) |{T_b(x)}|} |A|^{\flbeta + 1} \right] \\
&\qquad= \E \left[ \frac{\int_{T_b(x)} \left| \mu(s) - \mu(x) \right| f(s) \diff s} {f(x) |{T_b(x)}|} \left( \frac{1}{f(x) |{T_b(x)}|} \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^{\flbeta + 1} \right] \\
&\qquad\lesssim \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^{\flbeta+2} \right] = \int_{0}^{\infty} \P \left( \max_{1 \leq l \leq d} |T_b(x)_l| \geq t^{\frac{1}{\flbeta+2}} \right) \diff t \leq \int_{0}^{\infty} 2 d e^{- \lambda t^{\frac{1}{\flbeta+2}} / 2} \diff t \\
&\qquad= \frac{2^{\flbeta + 3} d (\flbeta + 2)! } {\lambda^{\flbeta + 2}} \lesssim \frac{1}{\lambda^{\beta}},
\end{align*}
%
since $\int_0^\infty e^{-a x^{1/k}} \diff x = a^{-k} k!$. To summarize the progress so far, we have
%
\begin{align*}
&\left| \E \left[ \hat \mu(x) \right] - \mu(x) - \sum_{k=0}^{\flbeta} (-1)^k \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x)^{k+1} |T_b(x)|^{k+1}} \left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k \right] \right| \\
&\qquad\lesssim \frac{\log n}{\lambda} \frac{\lambda^d}{n} + \frac{1}{\lambda^\beta}.
\end{align*}
%
We evaluate the expectation. By Taylor's theorem, with $\nu$ a multi-index, as $f \in \cH^\beta$,
%
\begin{align*}
\left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k &= \left( \sum_{|\nu| = 1}^\flbeta \frac{\partial^\nu f(x)}{\nu !} \! \int_{T_b(x)} \!\! (s - x)^\nu \diff s \right)^k + O \! \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right).
\end{align*} % Next, by the multinomial theorem with a multi-index $u$ indexed by $\nu$ with $|\nu| \geq 1$, % \begin{align*} \left( \sum_{|\nu| = 1}^\flbeta \frac{\partial^\nu f(x)}{\nu !} \int_{T_b(x)} (s - x)^\nu \diff s \right)^k &= \sum_{|u| = k} \binom{k}{u} \left( \frac{\partial^\nu f(x)}{\nu !} \int_{T_b(x)} (s-x)^\nu \diff s \right)^u \end{align*} % where $\binom{k}{u}$ is a multinomial coefficient. By Taylor's theorem with $f, \mu \in \cH^\beta$, % \begin{align*} &\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s \\ &\quad= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \int_{T_b(x)} (s-x)^{\nu' + \nu''} \diff s + O \left( |T_b(x)| \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right). \end{align*} % Now by integrating the tail probabilities in Lemma~\ref{lem:mondrian_app_largest_cell}, $ \E \left[ \max_{1 \leq l \leq d} |T_b(x)_l|^\beta \right] \lesssim \frac{1}{\lambda^\beta}$. Therefore, by Lemma~\ref{lem:mondrian_app_moment_cell}, writing $T_b(x)^\nu$ for $\int_{T_b(x)} (s-x)^\nu \diff s$, % \begin{align*} &\sum_{k=0}^{\flbeta} (-1)^k \, \E \left[ \frac{\int_{T_b(x)} \left( \mu(s) - \mu(x) \right) f(s) \diff s} {f(x)^{k+1} |T_b(x)|^{k+1}} \left( \int_{T_b(x)} (f(s) - f(x)) \diff s \right)^k \right] \\ &\,= \! \sum_{k=0}^{\flbeta} (-1)^k \, \E \! \left[ \! \frac{ \sum_{|\nu'|=1}^{\flbeta} \! \sum_{|\nu''|=0}^{\flbeta} \! \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} T_b(x)^{\nu' + \nu''\!\!\!} }{f(x)^{k+1} |T_b(x)|^{k+1}} \!\! \sum_{|u| = k} \! \binom{k}{u} \!\! \left( \frac{\partial^\nu f(x)}{\nu !} T_b(x)^\nu \right)^{\!\! u} \right] \! + O \! \left( \frac{1}{\lambda^\beta} \right) \\ &\,= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \sum_{|u|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u \binom{|u|}{u} \frac{(-1)^{|u|}}{f(x)^{|u|+1}} \E \left[ \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} \right] \\ &\quad+ O \left( \frac{1}{\lambda^\beta} \right) . \end{align*} % We show this is a polynomial in $1/\lambda$. For $1 \leq j \leq d$, define $E_{1j*} \sim \Exp(1) \wedge (\lambda x_j)$ and $E_{2j*} \sim \Exp(1) \wedge (\lambda (1-x_j))$ independent so $T_b(x) = \prod_{j=1}^{d} [x_j - E_{1j*} / \lambda, x_j + E_{2j*} / \lambda]$. Then % \begin{align*} T_b(x)^\nu &= \int_{T_b(x)} (s-x)^\nu \diff s = \prod_{j=1}^d \int_{x_j - E_{1j*}/\lambda}^{x_j+E_{2j*}/\lambda} (s - x_j)^{\nu_j} \diff s = \prod_{j=1}^d \int_{-E_{1j*}}^{E_{2j*}} (s / \lambda)^{\nu_j} 1/\lambda \diff s \\ &= \lambda^{-d - |\nu|} \prod_{j=1}^d \int_{-E_{1j*}}^{E_{2j*}} s^{\nu_j} \diff s = \lambda^{-d - |\nu|} \prod_{j=1}^d \frac{E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}} {\nu_j + 1}. \end{align*} % So by independence over $j$, % \begin{align} \label{eq:mondrian_app_bias_calc} &\E \left[ \frac{ T_b(x)^{\nu' + \nu''} (T_b(x)^\nu)^u}{|T_b(x)|^{|u|+1}} \right] \\ \nonumber &\quad= \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \prod_{j=1}^d \E \left[ \frac{E_{2j*}^{\nu'_j + \nu''_j + 1} + (-1)^{\nu'_j + \nu''_j} E_{1j*}^{\nu'_j + \nu''_j + 1}} {(\nu'_j + \nu''_j + 1) (E_{2j*} + E_{1j*})} \frac{\left(E_{2j*}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j*}^{\nu_j + 1}\right)^u} {(\nu_j + 1)^u (E_{2j*} + E_{1j*})^{|u|}} \right]. \end{align} % The final step is to replace $E_{1j*}$ by $E_{1j} \sim \Exp(1)$ and similarly for $E_{2j*}$. For some $C > 0$, % \begin{align*} \P \! 
\left( \bigcup_{j=1}^{d} \left( \left\{ E_{1j*} \neq E_{1j} \right\} \cup \left\{ E_{2j*} \neq E_{2j} \right\} \right) \! \right) &\leq 2d\, \P \! \left( \Exp(1) \geq \lambda \min_{1 \leq j \leq d} (x_j \wedge (1-x_j)) \! \right) \leq 2d e^{-C \lambda}.
\end{align*}
%
Further, the quantity inside the expectation in \eqref{eq:mondrian_app_bias_calc} is bounded almost surely by one and so the error incurred by replacing $E_{1j*}$ and $E_{2j*}$ by $E_{1j}$ and $E_{2j}$ in \eqref{eq:mondrian_app_bias_calc} is at most $2 d e^{-C \lambda} \lesssim \lambda^{-\beta}$. Thus the limiting bias is
%
\begin{align}
\nonumber
&\E \left[ \hat \mu(x) \right] - \mu(x) \\
\nonumber
&\quad= \sum_{|\nu'|=1}^{\flbeta} \sum_{|\nu''|=0}^{\flbeta} \sum_{|u|=0}^{\flbeta} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \frac{\partial^{\nu''} f(x)}{\nu'' !} \left( \frac{\partial^\nu f(x)}{\nu !} \right)^u \binom{|u|}{u} \frac{(-1)^{|u|}}{f(x)^{|u|+1}} \, \lambda^{- |\nu'| - |\nu''| - |\nu| \cdot u} \\
\nonumber
&\qquad\quad\times \prod_{j=1}^d \E \left[ \frac{E_{2j}^{\nu'_j + \nu''_j + 1} + (-1)^{\nu'_j + \nu''_j} E_{1j}^{\nu'_j + \nu''_j + 1}} {(\nu'_j + \nu''_j + 1) (E_{2j} + E_{1j})} \frac{\left(E_{2j}^{\nu_j + 1} + (-1)^{\nu_j} E_{1j}^{\nu_j + 1}\right)^u} {(\nu_j + 1)^u (E_{2j} + E_{1j})^{|u|}} \right] \\
\label{eq:mondrian_app_bias}
&\qquad+ O \left( \frac{\log n}{\lambda} \frac{\lambda^d}{n} \right) + O \left( \frac{1}{\lambda^\beta} \right),
\end{align}
%
recalling that $u$ is a multi-index which is indexed by the multi-index $\nu$. This is a polynomial in $1/\lambda$ of degree at most $\flbeta$, since higher-order terms can be absorbed into $O(1 / \lambda^\beta)$, with finite coefficients depending only on the derivatives up to order $\flbeta$ of $f$ and $\mu$ at $x$. Now we show that the odd-degree terms in this polynomial are all zero. Note that a term is of odd degree if and only if $|\nu'| + |\nu''| + |\nu| \cdot u$ is odd. This implies that there exists $1 \leq j \leq d$ such that exactly one of $\nu'_j + \nu''_j$ and $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd. If $\nu'_j + \nu''_j$ is odd, then $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is even, so $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is even. Consider the effect of swapping $E_{1j}$ and $E_{2j}$, an operation which preserves their joint law, in each of
%
\begin{align}
\label{eq:mondrian_app_bias_odd_1}
\frac{E_{2j}^{\nu'_j + \nu''_j + 1} - (-E_{1j})^{\nu'_j + \nu''_j + 1}} {E_{2j} + E_{1j}}
\end{align}
%
and
%
\begin{align}
\label{eq:mondrian_app_bias_odd_2}
&\frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^u} {(E_{2j} + E_{1j})^{|u|}} = \!\!\! \prod_{\substack{|\nu| = 1 \\ \nu_j u_\nu \text{ even}}}^\flbeta \!\!\! \frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} {(E_{2j} + E_{1j})^{u_\nu}} \!\!\! \prod_{\substack{|\nu| = 1 \\ \nu_j u_\nu \text{ odd}}}^\flbeta \!\!\! \frac{\left(E_{2j}^{\nu_j + 1} - (-E_{1j})^{\nu_j + 1}\right)^{u_\nu}} {(E_{2j} + E_{1j})^{u_\nu}}.
\end{align}
%
Clearly, $\nu'_j + \nu''_j$ being odd inverts the sign of \eqref{eq:mondrian_app_bias_odd_1}. For \eqref{eq:mondrian_app_bias_odd_2}, each term in the first product has either $\nu_j$ even or $u_\nu$ even, so its sign is preserved. Every term in the second product of \eqref{eq:mondrian_app_bias_odd_2} has its sign inverted due to both $\nu_j$ and $u_\nu$ being odd, but there are an even number of terms, preserving the overall sign.
Therefore the expected product of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} is zero by symmetry. If however $\nu'_j + \nu''_j$ is even, then $\sum_{|\nu|=1}^{\flbeta} \nu_j u_\nu$ is odd so $|\{\nu : \nu_j u_\nu \text{ is odd}\}|$ is odd. Clearly, the sign of \eqref{eq:mondrian_app_bias_odd_1} is preserved. Again the sign of the first product in \eqref{eq:mondrian_app_bias_odd_2} is preserved, and the sign of every term in \eqref{eq:mondrian_app_bias_odd_2} is inverted. However there are now an odd number of terms in the second product, so its overall sign is inverted. Therefore the expected product of \eqref{eq:mondrian_app_bias_odd_1} and \eqref{eq:mondrian_app_bias_odd_2} is again zero. \proofparagraph{calculating the second-order bias} Next we calculate some special cases, beginning with the form of the leading second-order bias, where the exponent in $\lambda$ is $|\nu'| + |\nu''| + u \cdot |\nu| = 2$, proceeding by cases on the values of $|\nu'|$, $|\nu''|$, and $|u|$. Firstly, if $|\nu'| = 2$ then $|\nu''| = |u| = 0$. Note that if any $\nu'_j = 1$ then the expectation in \eqref{eq:mondrian_app_bias} is zero. Hence we can assume $\nu'_j \in \{0, 2\}$, yielding % \begin{align*} \frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} \frac{1}{3} \E \! \left[ \frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} \right] &\!= \frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} \frac{1}{3} \E \! \left[ E_{1j}^{2} + E_{2j}^{2} - E_{1j} E_{2j} \right] = \frac{1}{2 \lambda^2} \! \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2}, \end{align*} % where we used that $E_{1j}$ and $E_{2j}$ are independent $\Exp(1)$. Next we consider $|\nu'| = 1$ and $|\nu''| = 1$, so $|u| = 0$. Note that if $\nu'_j = \nu''_{j'} = 1$ with $j \neq j'$ then the expectation in \eqref{eq:mondrian_app_bias} is zero. So we need only consider $\nu'_j = \nu''_j = 1$, giving % \begin{align*} \frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \frac{1}{3} \E \left[ \frac{E_{2j}^{3} + E_{1j}^{3}} {E_{2j} + E_{1j}} \right] &= \frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}. \end{align*} % Finally, we have the case where $|\nu'| = 1$, $|\nu''| = 0$ and $|u|=1$. Then $u_\nu = 1$ for some $|\nu| = 1$ and zero otherwise. Note that if $\nu'_j = \nu_{j'} = 1$ with $j \neq j'$ then the expectation is zero. So we need only consider $\nu'_j = \nu_j = 1$, giving % \begin{align*} &- \frac{1}{\lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \frac{1}{4} \E \left[ \frac{(E_{2j}^2 - E_{1j}^2)^2} {(E_{2j} + E_{1j})^2} \right] \\ &\quad= - \frac{1}{4 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j} \E \left[ E_{1j}^2 + E_{2j}^2 - 2 E_{1j} E_{2j} \right] = - \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}. \end{align*} % Hence the second-order bias term is % \begin{align*} \frac{1}{2 \lambda^2} \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2} + \frac{1}{2 \lambda^2} \frac{1}{f(x)} \sum_{j=1}^{d} \frac{\partial \mu(x)}{\partial x_j} \frac{\partial f(x)}{\partial x_j}. 
\end{align*}
\proofparagraph{calculating the bias if the data is uniformly distributed}
If $X_i \sim \Unif\big([0,1]^d\big)$ then $f(x) = 1$ and the bias expansion from \eqref{eq:mondrian_app_bias} becomes
%
\begin{align*}
\sum_{|\nu'|=1}^{\flbeta} \lambda^{- |\nu'|} \frac{\partial^{\nu'} \mu(x)}{\nu' !} \prod_{j=1}^d \E \left[ \frac{E_{2j}^{\nu'_j + 1} + (-1)^{\nu'_j} E_{1j}^{\nu'_j + 1}} {(\nu'_j + 1) (E_{2j} + E_{1j})} \right].
\end{align*}
%
This is zero if any $\nu_j'$ is odd, so we group these terms based on the exponent of $\lambda$ to see
%
\begin{align*}
\frac{B_r(x)}{\lambda^{2r}} &= \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} \prod_{j=1}^d \frac{1}{2\nu_j + 1} \E \left[ \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} {E_{2j} + E_{1j}} \right].
\end{align*}
%
Since $\int_0^\infty \frac{e^{-t}}{a+t} \diff t = e^a \Gamma(0,a)$ and $\int_0^\infty s^a \Gamma(0, s) \diff s = \frac{a!}{a+1}$, with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$ the upper incomplete gamma function, the expectation is easily calculated as
%
\begin{align*}
\E \left[ \frac{E_{2j}^{2\nu_j + 1} + E_{1j}^{2\nu_j + 1}} {E_{2j} + E_{1j}} \right] &= 2 \int_{0}^{\infty} s^{2\nu_j + 1} e^{-s} \int_{0}^{\infty} \frac{e^{-t}} {s + t} \diff t \diff s \\
&= 2 \int_{0}^{\infty} s^{2\nu_j + 1} \Gamma(0, s) \diff s = \frac{(2 \nu_j + 1)!}{\nu_j + 1},
\end{align*}
%
so finally
%
\begin{align*}
\frac{B_r(x)}{\lambda^{2r}} &= \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \frac{\partial^{2 \nu} \mu(x)}{(2 \nu) !} \prod_{j=1}^d \frac{1}{2\nu_j + 1} \frac{(2 \nu_j + 1)!}{\nu_j + 1} = \frac{1}{\lambda^{2r}} \sum_{|\nu|=r} \partial^{2 \nu} \mu(x) \prod_{j=1}^d \frac{1}{\nu_j + 1}.
\end{align*}
%
For $r = 1$ this recovers $\frac{B_1(x)}{\lambda^2} = \frac{1}{2 \lambda^2} \sum_{j=1}^d \frac{\partial^2 \mu(x)}{\partial x_j^2}$, agreeing with the second-order bias computed above, since the terms involving derivatives of $f$ vanish when $f$ is constant.
\end{proof}
\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation}]
This follows from the debiased version in Theorem~\ref{thm:mondrian_variance_estimation_debiased} with $J=0$, $a_0 = 1$, and $\omega_0 = 1$.
\end{proof}
\begin{proof}[Theorem~\ref{thm:mondrian_confidence}]
%
By Theorem~\ref{thm:mondrian_bias} and Theorem~\ref{thm:mondrian_variance_estimation},
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \mu(x)}{\hat \Sigma(x)^{1/2}} &= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} {\hat \Sigma(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \frac{\E \left[ \hat \mu(x) \mid \bX, \bT \right] - \mu(x)} {\hat \Sigma(x)^{1/2}} \\
&= \sqrt{\frac{n}{\lambda^d}} \frac{\hat \mu(x) - \E \left[ \hat \mu(x) \mid \bX, \bT \right]} {\hat \Sigma(x)^{1/2}} + \sqrt{\frac{n}{\lambda^d}} \, O_\P \left( \frac{1}{\lambda^{\beta \wedge 2}} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right).
\end{align*}
%
The first term now converges weakly to $\cN(0,1)$ by Slutsky's theorem, Theorem~\ref{thm:mondrian_clt}, and Theorem~\ref{thm:mondrian_variance_estimation}, while the second term is $o_\P(1)$ by assumption. Validity of the confidence interval follows immediately.
%
\end{proof}
\subsection{Debiased Mondrian random forests}
We give rigorous proofs of the central limit theorem, bias characterization, variance estimation, confidence interval validity, and minimax optimality results for the debiased Mondrian random forest estimator.
\begin{proof}[Theorem~\ref{thm:mondrian_clt_debiased}]
We use the martingale central limit theorem given by \citet[Theorem~3.2]{hall1980martingale}.
For each $1 \leq i \leq n$ define $\cH_{n i}$ to be the filtration generated by $\bT$, $\bX$, and $(\varepsilon_j : 1 \leq j \leq i)$, noting that $\cH_{n i} \subseteq \cH_{(n+1)i}$ because $B$ increases weakly as $n$ increases. Let $\I_{i b r}(x) = \I\{X_i \in T_{b r}(x)\}$ where $T_{b r}(x)$ is the cell containing $x$ in tree $b$ used to construct $\hat \mu_r(x)$, and similarly let $N_{b r}(x) = \sum_{i=1}^n \I_{i b r}(x)$ and $\I_{b r}(x) = \I\{N_{b r}(x) \geq 1\}$. Define the $\cH_{n i}$-measurable and square integrable variables % \begin{align*} S_i(x) &= \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)}, \end{align*} % which satisfy the martingale difference property $\E [ S_i(x) \mid \cH_{n i} ] = 0$. Further, % \begin{align*} \sqrt{\frac{n}{\lambda^d}} \big( \hat\mu_\rd(x) - \E\left[ \hat\mu_\rd(x) \mid \bX, \bT \right] \big) = \sum_{i=1}^n S_i(x). \end{align*} % By \citet[Theorem~3.2]{hall1980martingale} it suffices to check that % \begin{inlineroman} \item $\max_i |S_i(x)| \to 0$ in probability,% \label{it:mondrian_app_hall_prob} \item $\E\left[\max_i S_i(x)^2\right] \lesssim 1$, and% \label{it:mondrian_app_hall_exp} \item $\sum_i S_i(x)^2 \to \Sigma_\rd(x)$ in probability. \label{it:mondrian_app_hall_var} \end{inlineroman} \proofparagraph{checking condition \ref{it:mondrian_app_hall_prob}} % Since $J$ is fixed and $\E[|\varepsilon_i|^3 \mid X_i]$ is bounded, by Jensen's inequality and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, % \begin{align*} \E\left[\max_{1 \leq i \leq n} |S_i(x)| \right] &= \E\left[\max_{1 \leq i \leq n} \left| \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right| \right] \\ &\leq \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[\max_{1 \leq i \leq n} \left| \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right| \right] \\ &\leq \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[ \sum_{i=1}^{n} \left( \sum_{b=1}^B \frac{\I_{i b r}(x) |\varepsilon_i|} {N_{b r}(x)} \right)^3 \right]^{1/3} \\ &= \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B} \E\left[ \sum_{i=1}^{n} |\varepsilon_i|^3 \sum_{b=1}^B \sum_{b'=1}^B \sum_{b''=1}^B \frac{\I_{i b r}(x) } {N_{b r}(x)} \frac{\I_{i b' r}(x) } {N_{b' r}(x)} \frac{\I_{i b'' r}(x) } {N_{b'' r}(x)} \right]^{1/3} \\ &\lesssim \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B^{2/3}} \E\left[ \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{b r}(x)} {N_{b r}(x)} \frac{\I_{b' r}(x)} {N_{b' r}(x)} \right]^{1/3} \\ &\lesssim \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} |\omega_r| \frac{1}{B^{2/3}} \left( B^2 \frac{a_r^{2d} \lambda^{2d}}{n^2} + B \frac{a_r^{2d} \lambda^{2d} \log n}{n^2} \right)^{1/3} \\ &\lesssim \left( \frac{\lambda^d}{n} \right)^{1/6} + \left( \frac{\lambda^d}{n} \right)^{1/6} \left( \frac{\log n}{B} \right)^{1/3} \to 0. 
\end{align*} \proofparagraph{checking condition \ref{it:mondrian_app_hall_exp}} % Since $\E[\varepsilon_i^2 \mid X_i]$ is bounded and by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, % \begin{align*} \E\left[\max_{1 \leq i \leq n} S_i(x)^2 \right] &= \E\left[ \max_{1 \leq i \leq n} \left( \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right)^2 \right] \\ &\leq \frac{n}{\lambda^d} \frac{1}{B^2} (J+1)^2 \max_{0 \leq r \leq J} \omega_r^2 \,\E\left[ \sum_{i=1}^{n} \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{i b r}(x) \I_{i b' r}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r}(x)} \right] \\ &\lesssim \frac{n}{\lambda^d} \max_{0 \leq r \leq J} \E\left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \right] \lesssim \frac{n}{\lambda^d} \max_{0 \leq r \leq J} \frac{a_r^d \lambda^d}{n} \lesssim 1. \end{align*} \proofparagraph{checking condition \ref{it:mondrian_app_hall_var}} Next, we have % \begin{align} \label{eq:mondrian_app_clt_condition_sum} \sum_{i=1}^n S_i(x)^2 &= \sum_{i=1}^n \left( \sqrt{\frac{n}{\lambda^d}} \sum_{r=0}^{J} \omega_r \frac{1}{B} \sum_{b=1}^B \frac{\I_{i b r}(x) \varepsilon_i} {N_{b r}(x)} \right)^2 \\ &= \nonumber \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \sum_{b'=1}^B \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \\ \nonumber &= \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \left( \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b r'}(x)} + \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right). \end{align} % By boundedness of $\E[\varepsilon_i^2 \mid X_i]$ and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, the first term in \eqref{eq:mondrian_app_clt_condition_sum} vanishes as % \begin{align*} \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \E \left[ \frac{\I_{i b r}(x) \I_{i b r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b r'}(x)} \right] &\lesssim \frac{n}{\lambda^d} \frac{1}{B^2} \max_{0 \leq r \leq J} \sum_{b=1}^B \E \left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \right] \lesssim \frac{1}{B} \to 0. 
\end{align*}
%
For the second term in \eqref{eq:mondrian_app_clt_condition_sum}, the law of total variance gives
%
\begin{align}
\nonumber
&\Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\
\nonumber
&\quad\leq (J+1)^4 \max_{0 \leq r, r' \leq J} \omega_r^2 \omega_{r'}^2 \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\
\nonumber
&\quad\lesssim \max_{0 \leq r, r' \leq J} \E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
\label{eq:mondrian_app_total_variance}
&\qquad+ \max_{0 \leq r, r' \leq J} \Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right].
\end{align}
%
For the first term in \eqref{eq:mondrian_app_total_variance},
%
\begin{align*}
&\E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad= \frac{n^2}{\lambda^{2d}} \frac{1}{B^4} \sum_{i=1}^n \sum_{j=1}^n \sum_{b=1}^B \sum_{b' \neq b} \sum_{\tilde b=1}^B \sum_{\tilde b' \neq \tilde b} \E \Bigg[ \varepsilon_i^2 \varepsilon_j^2 \left( \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} - \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX \right] \right) \\
&\qquad\quad \times \left( \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } {N_{\tilde b r}(x) N_{ \tilde b' r'}(x)} - \E \left[ \frac{\I_{j \tilde b r}(x) \I_{j \tilde b' r'}(x) } {N_{\tilde b r}(x) N_{\tilde b' r'}(x)} \Bigm| \bX \right] \right) \Bigg].
\end{align*}
%
Since $T_{b r}$ is independent of $T_{b' r'}$ given $\bX, \bY$, the summands are zero whenever $\big|\{b, b', \tilde b, \tilde b'\}\big| = 4$. Since $\E[ \varepsilon_i^2 \mid X_i]$ is bounded and by the Cauchy--Schwarz inequality and Lemma~\ref{lem:mondrian_app_simple_moment_denominator},
%
\begin{align*}
&\E \left[ \Var \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad\lesssim \frac{n^2}{\lambda^{2d}} \frac{1}{B^3} \sum_{b=1}^B \sum_{b' \neq b} \E \left[ \left( \sum_{i=1}^n \frac{\I_{i b r}(x) \I_{i b' r'}(x) } {N_{b r}(x) N_{b' r'}(x)} \right)^2 \right] \lesssim \frac{n^2}{\lambda^{2d}} \frac{1}{B} \E \left[ \frac{\I_{b r}(x)}{N_{b r}(x)} \frac{\I_{b' r'}(x)}{N_{b' r'}(x)} \right] \lesssim \frac{1}{B} \to 0.
\end{align*}
%
For the second term in \eqref{eq:mondrian_app_total_variance}, the random variable inside the variance is a nonlinear function of the i.i.d.\ variables $(X_i, \varepsilon_i)$, so we apply the Efron--Stein inequality \citep{efron1981jackknife}. Let $(\tilde X_{i j}, \tilde Y_{i j}) = (X_i, Y_i)$ if $i \neq j$, and for $i = j$ let $(\tilde X_{j j}, \tilde Y_{j j})$ be an independent copy of $(X_j, Y_j)$, denoted $(\tilde X_j, \tilde Y_j)$; define $\tilde \varepsilon_{i j} = \tilde Y_{i j} - \mu(\tilde X_{i j})$.
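Recall that if $Z_1, \ldots, Z_n$ are independent, $\tilde Z_j$ is an independent copy of $Z_j$, and $g(Z_1, \ldots, Z_n)$ is square-integrable, then the Efron--Stein inequality states that
%
\begin{align*}
\Var \big[ g(Z_1, \ldots, Z_n) \big] &\leq \frac{1}{2} \sum_{j=1}^{n} \E \left[ \big( g(Z_1, \ldots, Z_n) - g(Z_1, \ldots, Z_{j-1}, \tilde Z_j, Z_{j+1}, \ldots, Z_n) \big)^2 \right],
\end{align*}
%
which is the source of the factor $\frac{1}{2} \sum_{j=1}^{n}$ appearing below.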
Write $\tilde \I_{i j b r}(x) = \I \big\{ \tilde X_{i j} \in T_{b r}(x) \big\}$ and $\tilde \I_{j b r}(x) = \I \big\{ \tilde X_{j} \in T_{b r}(x) \big\}$, and also $\tilde N_{j b r}(x) = \sum_{i=1}^{n} \tilde \I_{i j b r}(x)$. We use the leave-one-out notation $N_{-j b r}(x) = \sum_{i \neq j} \I_{i b r}(x)$ and also write $N_{-j b r \cap b' r'}(x) = \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x)$. Since $\E[ \varepsilon_i^4 \mid X_i]$ is bounded,
%
\begin{align*}
&\Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad\leq \Var \left[ \E \left[ \frac{n}{\lambda^d} \sum_{i=1}^n \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\quad\leq \frac{1}{2} \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \sum_{i=1}^n \left( \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} - \frac{\tilde \I_{i j b r}(x) \tilde \I_{i j b' r'}(x) \tilde \varepsilon_{i j}^2} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right) \right)^2 \right] \\
&\quad\leq \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \left| \frac{1} {N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right| \sum_{i \neq j} \I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2 \right)^2 \right] \\
&\qquad+ \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \left( \frac{\I_{j b r}(x) \I_{j b' r'}(x) \varepsilon_j^2} {N_{b r}(x) N_{b' r'}(x)} - \frac{\tilde \I_{j b r}(x) \tilde \I_{j b' r'}(x) \tilde \varepsilon_j^2} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right)^2 \right] \\
&\quad\lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ N_{-j b r \cap b' r'}(x)^2 \left| \frac{1} {N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right|^2 + \frac{\I_{j b r}(x) \I_{j b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right].
\end{align*}
%
For the first term in the above display, note that
%
\begin{align*}
&\left| \frac{1}{N_{b r}(x) N_{b' r'}(x)} - \frac{1} {\tilde N_{j b r}(x) \tilde N_{j b' r'}(x)} \right| \\
&\quad\leq \frac{1}{N_{b r}(x)} \left| \frac{1} {N_{b' r'}(x)} - \frac{1} {\tilde N_{j b' r'}(x)} \right| + \frac{1}{\tilde N_{j b' r'}(x)} \left| \frac{1} {N_{b r}(x)} - \frac{1} {\tilde N_{j b r}(x)} \right| \\
&\quad\leq \frac{1}{N_{-j b r}(x)} \frac{1} {N_{-j b' r'}(x)^2} + \frac{1}{N_{-j b' r'}(x)} \frac{1} {N_{-j b r}(x)^2}
\end{align*}
%
since $|N_{b r}(x) - \tilde N_{j b r}(x)| \leq 1$ and $|N_{b' r'}(x) - \tilde N_{j b' r'}(x)| \leq 1$. Further, these terms are non-zero only on the events $\{ X_j \in T_{b r}(x) \} \cup \{ \tilde X_j \in T_{b r}(x) \}$ and $\{ X_j \in T_{b' r'}(x) \} \cup \{ \tilde X_j \in T_{b' r'}(x) \}$ respectively, so
%
\begin{align*}
&\Var \left[ \E \left[ \frac{n}{\lambda^d} \frac{1}{B^2} \sum_{i=1}^n \sum_{b=1}^B \sum_{b' \neq b} \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \Bigm| \bX, \bY \right] \right] \\
&\, \lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \frac{\I_{j b' r'}(x) + \tilde \I_{j b' r'}(x)}{N_{-j b r}(x)^2} \frac{N_{-j b r \cap b' r'}(x)^2} {N_{-j b' r'}(x)^4} \right. \\
&\left.
\qquad+ \frac{\I_{j b r}(x) + \tilde \I_{j b r}(x)}{N_{-j b' r'}(x)^2} \frac{N_{-j b r \cap b' r'}(x)^2} {N_{-j b r}(x)^4} + \frac{\I_{j b r}(x) \I_{j b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right] \\
&\, \lesssim \frac{n^2}{\lambda^{2d}} \sum_{j=1}^{n} \E \left[ \frac{\I_{j b r}(x) \I_{b r}(x) \I_{b' r'}(x)} {N_{b r}(x)^2 N_{b' r'}(x)^2} \right] \lesssim \frac{n^2}{\lambda^{2d}} \E \left[ \frac{\I_{b r}(x) \I_{b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)^2} \right] \\
&\lesssim \frac{n^2}{\lambda^{2d}} \frac{\lambda^d}{n} \frac{\lambda^{2d} \log n}{n^2} \lesssim \frac{\lambda^d \log n}{n} \to 0,
\end{align*}
%
where we used Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So $\sum_{i=1}^{n} S_i(x)^2 - n \,\E \left[ S_i(x)^2 \right] = O_\P \left( \frac{1}{\sqrt B} + \sqrt{\frac{\lambda^d \log n}{n}} \right) = o_\P(1)$.
\proofparagraph{calculating the limiting variance}
%
Thus by \citet[Theorem~3.2]{hall1980martingale} we conclude that
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}} \big( \hat\mu_\rd(x) - \E\left[ \hat\mu_\rd(x) \mid \bX, \bT \right] \big) &\rightsquigarrow \cN\big(0, \Sigma_\rd(x)\big)
\end{align*}
%
as $n \to \infty$, assuming that the limit
%
\begin{align*}
\Sigma_\rd(x) &= \lim_{n \to \infty} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
\end{align*}
%
exists. Now we verify this and calculate the limit. Since $J$ is fixed, it suffices to find
%
\begin{align*}
\lim_{n \to \infty} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right]
\end{align*}
%
for each $0 \leq r, r' \leq J$. Firstly, note that
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \sigma^2(X_i)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&= \frac{n^2}{\lambda^d} \sigma^2(x) \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&\quad+ \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \big(\sigma^2(X_i) - \sigma^2(x) \big)} {N_{b r}(x) N_{b' r'}(x)} \right].
\end{align*}
%
Since $\sigma^2$ is Lipschitz and $\P \left(\max_{1 \leq l \leq d} |T_b(x)_l| \geq t/\lambda \right) \leq 2d e^{-t/2}$ by Lemma~\ref{lem:mondrian_app_largest_cell},
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \big|\sigma^2(X_i) - \sigma^2(x) \big|} {N_{b r}(x) N_{b' r'}(x)} \right] &\leq 2de^{-t/2} \frac{n^2}{\lambda^d} + \frac{n^2}{\lambda^d} \frac{t}{\lambda} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] \\
&\lesssim \frac{n^2}{\lambda^d} \frac{\log n}{\lambda} \frac{\lambda^d}{n^2} \lesssim \frac{\log n}{\lambda},
\end{align*}
%
by Lemma~\ref{lem:mondrian_app_simple_moment_denominator}, where we set $t = 4 \log n$. Therefore
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] + O \left( \frac{\log n}{\lambda} \right).
\end{align*} % Next, by conditioning on $T_{b r}$, $T_{b' r'}$, $N_{-i b r}(x)$, and $N_{-i b' r'}(x)$, % \begin{align*} &\E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x)} {N_{b r}(x) N_{b' r'}(x)} \right] = \E \left[ \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} f(\xi) \diff \xi} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \\ &\quad= f(x) \, \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] + \E \left[ \frac{\int_{T_{b r}(x) \cap T_{b' r'}(x)} (f(\xi) - f(x)) \diff \xi} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \\ &\quad= f(x) \, \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] + O \left( \frac{\lambda^d}{n^2} \frac{(\log n)^{d+1}}{\lambda} \right) \end{align*} % arguing using Lemma~\ref{lem:mondrian_app_largest_cell}, the Lipschitz property of $f(x)$, and Lemma~\ref{lem:mondrian_app_simple_moment_denominator}. So % \begin{align*} \frac{n^2}{\lambda^d} \E \! \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \! \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] \! + O \! \left( \frac{(\log n)^{d+1}}{\lambda} \right). \end{align*} % Now we apply the binomial result in Lemma~\ref{lem:mondrian_app_binomial_expectation} to approximate the expectation. With $N_{-i b' r' \setminus b r}(x) = \sum_{j \neq i} \I\{X_j \in T_{b' r'}(x) \setminus T_{b r}(x)\}$, % \begin{align*} &\E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right] = \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r}(x)+1} \right. \\ &\qquad\left. \times \, \E \left[ \frac{1} {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) \right] \right]. \end{align*} % Now conditional on $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, % \begin{align*} N_{-i b' r' \setminus b r}(x) &\sim \Bin\left( n - 1 - N_{-i b r}(x), \ \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} \right). \end{align*} % We bound these parameters above and below. Firstly, by Lemma~\ref{lem:mondrian_app_active_data} with $B=1$, % \begin{align*} \P \left( N_{-i b r}(x) > t^{d+1} \frac{n}{\lambda^d} \right) &\leq 4 d e^{- t / (4 \|f\|_\infty(1 + 1/a_r))} \leq e^{- t / C} \end{align*} % for some $C > 0$ and sufficiently large $t$. 
Next, if $f$ is $L$-Lipschitz in $\ell^2$, by Lemma~\ref{lem:mondrian_app_largest_cell},
%
\begin{align*}
&\P \left( \left| \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} - f(x) |T_{b' r'}(x) \setminus T_{b r}(x)| \right| > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{\lambda} \right) \\
&\quad\leq \P \left( \int_{T_{b' r'}(x) \setminus T_{b r}(x)} \left| f(\xi) - f(x) \right| \diff \xi > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2 \lambda} \right) \\
&\qquad+ \P \left( \frac{\int_{T_{b' r'}(x) \setminus T_{b r}(x)} f(\xi) \diff \xi \cdot \int_{T_{b r}(x)} f(\xi) \diff \xi} {1 - \int_{T_{b r}(x)} f(\xi) \diff \xi} > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\
&\quad\leq \P \left( L d\, |T_{b' r'}(x) \setminus T_{b r}(x)| \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\
&\qquad+ \P \left( \|f\|_\infty \,|T_{b' r'}(x) \setminus T_{b r}(x)| \frac{\|f\|_\infty |T_{b r}(x)|} {1 - \|f\|_\infty |T_{b r}(x)|} > t \, \frac{|T_{b' r'}(x) \setminus T_{b r}(x)|}{2\lambda} \right) \\
&\quad\leq \P \left( \max_{1 \leq j \leq d} |T_{b' r'}(x)_j| > \frac{t}{2\lambda L d} \right) +\P \left( |T_{b r}(x)| > \frac{t}{4\lambda \|f\|_\infty^2} \right) \\
&\quad\leq 2 d e^{-t a_{r'} /(4L d)} + 2 d e^{-t a_r / (8 \|f\|_\infty^2)} \leq e^{-t/C},
\end{align*}
%
for large $t$, increasing $C$ as necessary. Thus with probability at least $1 - e^{-t/C}$, increasing $C$,
%
\begin{align*}
N_{-i b' r' \setminus b r}(x) &\leq \Bin\left( n, \, |T_{b' r'}(x) \setminus T_{b r}(x)| \left( f(x) + \frac{t}{\lambda} \right) \right), \\
N_{-i b' r' \setminus b r}(x) &\geq \Bin\left( n \left( 1 - \frac{t^{d+1}}{\lambda^d} - \frac{1}{n} \right), \, |T_{b' r'}(x) \setminus T_{b r}(x)| \left( f(x) - \frac{t}{\lambda} \right) \right).
\end{align*}
%
So by Lemma~\ref{lem:mondrian_app_binomial_expectation} conditionally on $\bT$, $N_{-i b' r' \cap b r}(x)$, and $N_{-i b r \setminus b' r'}(x)$, we have with probability at least $1 - e^{-t/C}$ that
%
\begin{align*}
&\left| \E \left[ \frac{1} {N_{-i b' r' \cap b r}(x)+N_{-i b' r' \setminus b r}(x)+1} \Bigm| \bT, N_{-i b' r' \cap b r}(x), N_{-i b r \setminus b' r'}(x) \right] \right. \\
&\left. \qquad- \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right| \\
&\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {\left(N_{-i b' r' \cap b r}(x) + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2}.
\end{align*}
%
Therefore, by the same approach as the proof of Lemma~\ref{lem:mondrian_app_moment_denominator}, taking $t = 3 C \log n$,
%
\begin{align*}
& \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r'}(x)+1)} \right.\right. \\
&\left.\left.
\qquad - \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(N_{-i b r}(x)+1) (N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1)} \right] \right| \\
&\quad\lesssim \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {\left(N_{-i b' r' \cap b r}(x) + n |T_{b' r'}(x) \setminus T_{b r}(x)|+1\right)^2} \right] + e^{-t/C} \\
&\quad\lesssim \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n |T_{b r}(x)|+1} \frac{1 + \frac{n t}{\lambda} |T_{b' r'}(x) \setminus T_{b r}(x)|} {(n |T_{b' r'}(x)| + 1)^2} \right] + e^{-t/C} \\
&\quad\lesssim \E \left[ \frac{1}{n} \frac{1} {(n |T_{b' r'}(x)| + 1)^2} + \frac{1}{n} \frac{t / \lambda} {n |T_{b' r'}(x)| + 1} \right] + e^{-t/C} \\
&\quad\lesssim \frac{\lambda^{2d} \log n}{n^3} + \frac{\log n}{n \lambda} \frac{\lambda^d}{n} \lesssim \frac{\lambda^d}{n^2} \left( \frac{\lambda^{d} \log n}{n} + \frac{\log n}{\lambda} \right).
\end{align*}
%
Now apply the same argument to the other term in the expectation, to see that
%
\begin{align*}
&\left| \E \left[ \frac{1} {N_{-i b r \cap b' r'}(x)+N_{-i b r \setminus b' r'}(x)+1} \Bigm| \bT, N_{-i b r \cap b' r'}(x), N_{-i b' r' \setminus b r}(x) \right] \right. \\
&\left. \qquad- \frac{1} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right| \\
&\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} {\left(N_{-i b r \cap b' r'}(x) + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2}.
\end{align*}
%
This holds with probability at least $1 - e^{-t/C}$, so taking $t = 3 C \log n$ as before,
%
\begin{align*}
&\frac{n^2}{\lambda^d} \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|}{N_{-i b r}(x)+1} \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \right. \\
&\left. \quad- \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right.\right. \\
&\qquad\qquad\left.\left. \times \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \right| \\
&\lesssim \frac{n^2}{\lambda^d} \, \E \left[ \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \setminus T_{b' r'}(x)|} {\left(N_{-i b r \cap b' r'}(x) + n |T_{b r}(x) \setminus T_{b' r'}(x)|+1\right)^2} \right. \\
&\qquad\qquad\left. \times \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\
&\lesssim \frac{\lambda^d \log n}{n} + \frac{\log n}{\lambda}.
\end{align*}
%
Thus far we have proven that
%
\begin{align*}
&\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] = \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \\
&\quad\times \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \setminus T_{b' r'}(x)|+1} \right. \\
&\left. \qquad\qquad \times \frac{1} {N_{-i b' r' \cap b r}(x)+n f(x) |T_{b' r'}(x) \setminus T_{b r}(x)|+1} \right] \\
&\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right).
\end{align*}
%
We remove the $N_{-i b r \cap b' r'}(x)$ terms.
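That is, we replace each shared count $N_{-i b r \cap b' r'}(x)$ by its conditional mean, which concentrates around $n f(x) |T_{b r}(x) \cap T_{b' r'}(x)|$, so that each denominator collapses to $n f(x) |T_{b r}(x)| + 1$ by the additivity $|T_{b r}(x) \cap T_{b' r'}(x)| + |T_{b r}(x) \setminus T_{b' r'}(x)| = |T_{b r}(x)|$, and symmetrically for $T_{b' r'}(x)$.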
With probability at least $1 - e^{-t/C}$, conditional on $\bT$,
%
\begin{align*}
N_{-i b r \cap b' r'}(x) &\leq \Bin\left( n, \, |T_{b r}(x) \cap T_{b' r'}(x)| \left( f(x) + \frac{t}{\lambda} \right) \right), \\
N_{-i b r \cap b' r'}(x) &\geq \Bin\left( n \left( 1 - \frac{t^{d+1}}{\lambda^d} - \frac{1}{n} \right), \, |T_{b r}(x) \cap T_{b' r'}(x)| \left( f(x) - \frac{t}{\lambda} \right) \right).
\end{align*}
%
Therefore, by Lemma~\ref{lem:mondrian_app_binomial_expectation} applied conditionally on $\bT$, with probability at least $1 - e^{-t/C}$,
%
\begin{align*}
& \left| \E \! \left[ \frac{1} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} \! \Bigm| \! \bT \right] \right. \\
&\left. \qquad- \frac{1} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right| \\
&\quad\lesssim \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} \left( \frac{1}{n |T_{b r}(x)| + 1} + \frac{1}{n |T_{b' r'}(x)| + 1} \right).
\end{align*}
%
Now by Lemma~\ref{lem:mondrian_app_moment_cell}, with $t = 3 C \log n$,
%
\begin{align*}
&\frac{n^2}{\lambda^d} \left| \E \! \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {N_{-i b r \cap b' r'}(x) + n f(x) |T_{b r}(x) \!\setminus\! T_{b' r'}(x)|+1} \frac{1} {N_{-i b' r' \cap b r}(x) + n f(x) |T_{b' r'}(x) \!\setminus\! T_{b r}(x)|+1} \right] \right. \\
&\left. \qquad- \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right] \right| \\
&\quad\lesssim \frac{n^2}{\lambda^d} \E \left[ |T_{b r}(x) \cap T_{b' r'}(x)| \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {(n |T_{b r}(x)| + 1)(n |T_{b' r'}(x)| + 1)} \left( \frac{1}{n |T_{b r}(x)| + 1} + \frac{1}{n |T_{b' r'}(x)| + 1} \right) \right] \\
&\qquad+ \frac{n^2}{\lambda^d} e^{-t/C} \\
&\quad\lesssim \frac{n^2}{\lambda^d} \frac{1}{n^3} \E \left[ \frac{1 + \frac{n t}{\lambda} |T_{b r}(x) \cap T_{b' r'}(x)|} {|T_{b r}(x)| |T_{b' r'}(x)|} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\
&\quad\lesssim \frac{1}{n \lambda^d} \E \left[ \frac{1}{|T_{b r}(x)| |T_{b' r'}(x)|} \right] + \frac{t}{\lambda^{d+1}} \E \left[ \frac{1}{|T_{b r}(x)|} \right] + \frac{n^2}{\lambda^d} e^{-t/C} \\
&\quad\lesssim \frac{\lambda^d}{n} + \frac{\log n}{\lambda}.
\end{align*}
%
This allows us to deduce that
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {(n f(x) |T_{b r}(x)|+1)(n f(x) |T_{b' r'}(x)|+1)} \right] \\
&\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right).
\end{align*}
%
Now that we have reduced the limiting variance to an expression only involving the sizes of Mondrian cells, we can exploit their exact distribution to compute this expectation.
Recall from \citet[Proposition~1]{mourtada2020minimax} that we can write
%
\begin{align*}
|T_{b r}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r \lambda} \wedge x_j + \frac{E_{2j}}{a_r \lambda} \wedge (1 - x_j) \right), \\
|T_{b' r'}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{3j}}{a_{r'} \lambda} \wedge x_j + \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) \right), \\
|T_{b r }(x)\cap T_{b' r'}(x)| &= \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r \lambda} \wedge \frac{E_{3j}}{a_{r'} \lambda} \wedge x_j + \frac{E_{2j}}{a_r \lambda} \wedge \frac{E_{4j}}{a_{r'} \lambda} \wedge (1 - x_j) \right)
\end{align*}
%
where $E_{1j}$, $E_{2j}$, $E_{3j}$, and $E_{4j}$ are independent and $\Exp(1)$. Define their non-truncated versions
%
\begin{align*}
|\tilde T_{b r}(x)| &= a_r^{-d} \lambda^{-d} \prod_{j=1}^{d} \left( E_{1j} + E_{2j} \right), \\
|\tilde T_{b' r'}(x)| &= a_{r'}^{-d} \lambda^{-d} \prod_{j=1}^{d} \left( E_{3j} + E_{4j} \right), \\
|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)| &= \lambda^{-d} \prod_{j=1}^{d} \left( \frac{E_{1j}}{a_r} \wedge \frac{E_{3j}}{a_{r'}} + \frac{E_{2j}}{a_r} \wedge \frac{E_{4j}}{a_{r'}} \right),
\end{align*}
%
and note that
%
\begin{align*}
&\P \left( \big( \tilde T_{b r}(x), \tilde T_{b' r'}(x), \tilde T_{b r}(x) \cap \tilde T_{b' r'}(x) \big) \neq \big( T_{b r}(x), T_{b' r'}(x), T_{b r}(x) \cap T_{b' r'}(x) \big) \right) \\
&\,\leq \sum_{j=1}^{d} \big( \P(E_{1j} \geq a_r \lambda x_j) + \P(E_{3j} \geq a_{r'} \lambda x_j) + \P(E_{2j} \geq a_r \lambda (1 - x_j)) + \P(E_{4j} \geq a_{r'} \lambda (1 - x_j)) \big) \\
&\,\leq e^{-C \lambda}
\end{align*}
%
for some $C > 0$ and sufficiently large $\lambda$. So by Cauchy--Schwarz and Lemma~\ref{lem:mondrian_app_moment_cell},
%
\begin{align*}
& \frac{n^2}{\lambda^d} \left| \E \left[ \frac{|T_{b r}(x) \cap T_{b' r'}(x)|} {n f(x) |T_{b r}(x)|+1} \frac{1} {n f(x) |T_{b' r'}(x)|+1} \right] - \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {n f(x) |\tilde T_{b r}(x)|+1} \frac{1} {n f(x) |\tilde T_{b' r'}(x)|+1} \right] \right| \\
&\quad\lesssim \frac{n^2}{\lambda^d} e^{-C \lambda} \lesssim e^{-C \lambda / 2}
\end{align*}
%
as $\log \lambda \gtrsim \log n$. Therefore
%
\begin{align*}
\frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] &= \sigma^2(x) f(x) \frac{n^2}{\lambda^d} \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} \right] \\
&\quad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right).
\end{align*}
%
We remove the superfluous $+1$ terms in the denominators. Firstly, by independence of the trees,
%
\begin{align*}
& \frac{n^2}{\lambda^d} \left| \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|+1)} \right] - \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)} \right] \right| \\
&\quad\lesssim \frac{n^2}{\lambda^d} \E \left[ \frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|} {n |\tilde T_{b r}(x)|} \frac{1} {n^2 |\tilde T_{b' r'}(x)|^2} \right] \lesssim \frac{1}{n \lambda^d} \E \left[ \frac{1}{|T_{b r}(x)|} \right] \E \left[ \frac{1}{|T_{b' r'}(x)|} \right] \lesssim \frac{\lambda^d}{n}.
\end{align*}
%
Secondly, we have in exactly the same manner that
%
\begin{align*}
\frac{n^2}{\lambda^d}
\left|
\E \left[
\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|}
{(n f(x) |\tilde T_{b r}(x)|+1)(n f(x) |\tilde T_{b' r'}(x)|)}
\right]
- \E \left[
\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|}
{n^2 f(x)^2 |\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|}
\right]
\right|
&\lesssim
\frac{\lambda^d}{n}.
\end{align*}
%
Therefore
%
\begin{align*}
\frac{n^2}{\lambda^d}
\E \left[
\frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2}
{N_{b r}(x) N_{b' r'}(x)}
\right]
&= \frac{\sigma^2(x)}{f(x)}
\frac{1}{\lambda^d}
\E \left[
\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|}
{|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|}
\right]
+ O \left(
\frac{(\log n)^{d+1}}{\lambda}
+ \frac{\lambda^d \log n}{n}
\right).
\end{align*}
%
It remains to compute this integral.
By independence over $1 \leq j \leq d$,
%
\begin{align*}
&\E \left[
\frac{|\tilde T_{b r}(x) \cap \tilde T_{b' r'}(x)|}
{|\tilde T_{b r}(x)| |\tilde T_{b' r'}(x)|}
\right] \\
&\quad=
a_r^d a_{r'}^d \lambda^d
\prod_{j=1}^d
\E \left[
\frac{
(E_{1j} / a_r) \wedge (E_{3j} / a_{r'})
+ (E_{2j} / a_r) \wedge (E_{4j} / a_{r'})
}
{ \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right)}
\right] \\
&\quad=
2^d a_r^d a_{r'}^d \lambda^d
\prod_{j=1}^d
\E \left[
\frac{ (E_{1j} / a_r) \wedge (E_{3j} / a_{r'})}
{ \left( E_{1j} + E_{2j} \right) \left( E_{3j} + E_{4j} \right) }
\right] \\
&\quad=
2^d a_r^d a_{r'}^d \lambda^d
\prod_{j=1}^d
\int_{0}^{\infty}
\int_{0}^{\infty}
\int_{0}^{\infty}
\int_{0}^{\infty}
\frac{ (t_1 / a_r) \wedge (t_3 / a_{r'}) }
{ \left( t_1 + t_2 \right) \left( t_3 + t_4 \right) }
e^{-t_1 - t_2 - t_3 - t_4}
\diff t_1 \diff t_2 \diff t_3 \diff t_4 \\
&\quad=
2^d a_r^d a_{r'}^d \lambda^d
\prod_{j=1}^d
\int_{0}^{\infty}
\int_{0}^{\infty}
((t_1 / a_r) \wedge (t_3 / a_{r'}))
e^{-t_1 - t_3} \\
&\qquad\times
\left(
\int_{0}^{\infty}
\frac{e^{-t_2}}{t_1 + t_2}
\diff t_2
\right)
\left(
\int_{0}^{\infty}
\frac{e^{-t_4}}{t_3 + t_4}
\diff t_4
\right)
\diff t_1 \diff t_3 \\
&\quad=
2^d a_r^d a_{r'}^d \lambda^d
\prod_{j=1}^d
\int_{0}^{\infty}
\int_{0}^{\infty}
((t / a_r) \wedge (s / a_{r'}))
\Gamma(0, t) \Gamma(0, s)
\diff t \diff s,
\end{align*}
%
as $\int_0^\infty \frac{e^{-t}}{a + t} \diff t = e^a \Gamma(0, a)$
with $\Gamma(0, a) = \int_a^\infty \frac{e^{-t}}{t} \diff t$.
Now
%
\begin{align*}
&2 \int_{0}^{\infty}
\int_{0}^{\infty}
((t / a_r) \wedge (s / a_{r'}))
\Gamma(0, t) \Gamma(0, s)
\diff t \diff s \\
&\quad=
\int_0^\infty
\Gamma(0, t)
\left(
\frac{1}{a_{r'}}
\int_0^{a_{r'} t / a_r}
2 s \Gamma(0, s)
\diff{s}
+ \frac{t}{a_r}
\int_{a_{r'} t / a_r}^\infty
2 \Gamma(0, s)
\diff{s}
\right)
\diff{t} \\
&\quad=
\int_0^\infty
\Gamma(0, t)
\left(
\frac{t}{a_r} e^{- \frac{a_{r'}}{a_r}t}
- \frac{1}{a_{r'}} e^{- \frac{a_{r'}}{a_r}t}
+ \frac{1}{a_{r'}}
- \frac{a_{r'}}{a_r^2} t^2
\Gamma\left(0, \frac{a_{r'}}{a_r} t\right)
\right)
\diff{t} \\
&\quad=
\frac{1}{a_r}
\int_0^\infty
t e^{- \frac{a_{r'}}{a_r} t}
\Gamma(0, t)
\diff{t}
- \frac{1}{a_{r'}}
\int_0^\infty
e^{- \frac{a_{r'}}{a_r} t}
\Gamma(0, t)
\diff{t} \\
&\qquad+
\frac{1}{a_{r'}}
\int_0^\infty
\Gamma(0, t)
\diff{t}
- \frac{a_{r'}}{a_r^2}
\int_0^\infty
t^2
\Gamma\left(0, \frac{a_{r'}}{a_r} t\right)
\Gamma(0, t)
\diff{t},
\end{align*}
%
since $\int_0^a 2 t \Gamma(0, t) \diff t
= a^2 \Gamma(0, a) - a e^{-a} - e^{-a} + 1$
and $\int_a^\infty \Gamma(0, t) \diff t = e^{-a} - a \Gamma(0, a)$.
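For completeness, we note that both of these antiderivative identities
follow by integration by parts from
$\frac{\diff}{\diff t} \Gamma(0, t) = -e^{-t} / t$;
for instance,
%
\begin{align*}
\int_a^\infty \Gamma(0, t) \diff t
&= \big[ t \, \Gamma(0, t) \big]_a^\infty
+ \int_a^\infty e^{-t} \diff t
= e^{-a} - a \Gamma(0, a),
\end{align*}
%
using $0 \leq t \, \Gamma(0, t) \leq e^{-t} \to 0$ as $t \to \infty$.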
Next, we use % $ \int_{0}^{\infty} \Gamma(0, t) \diff t = 1$, $\int_{0}^{\infty} e^{-at} \Gamma(0, t) \diff t = \frac{\log(1+a)}{a}$, $\int_{0}^{\infty} t e^{-at} \Gamma(0, t) \diff t = \frac{\log(1+a)}{a^2} - \frac{1}{a(a+1)}$, and $\int_{0}^{\infty} t^2 \Gamma(0, t) \Gamma(0, at) \diff t = - \frac{2a^2 + a + 2}{3a^2 (a+1)} + \frac{2(a^3 + 1) \log(a+1)}{3a^3} - \frac{2 \log a}{3}$ to see % \begin{align*} &2 \int_{0}^{\infty} \int_{0}^{\infty} ((t / a_r) \wedge (s / a_{r'})) \Gamma(0, t) \Gamma(0, s) \diff t \diff s \\ &\quad= \frac{a_r \log(1+a_{r'} / a_r)}{a_{r'}^2} - \frac{a_r / a_{r'}}{a_r + a_{r'}} - \frac{a_r \log(1 + a_{r'} / a_r)}{a_{r'}^2} + \frac{1}{a_{r'}} \\ &\qquad+ \frac{2 a_{r'}^2 + a_r a_{r'} + 2 a_r^2} {3 a_r a_{r'} (a_r + a_{r'})} - \frac{2(a_{r'}^3 + a_r^3) \log(a_{r'} / a_r+1)}{3 a_r^2 a_{r'}^2} + \frac{2 a_{r'} \log (a_{r'} / a_r)}{3 a_r^2} \\ &\quad= \frac{2}{3 a_r} + \frac{2}{3 a_{r'}} - \frac{2(a_r^3 + a_{r'}^3 ) \log(a_{r'} / a_{r}+1)} {3 a_r^2 a_{r'}^2} + \frac{2 a_{r'} \log (a_{r'} / a_{r})}{3 a_r^2} \\ &\quad= \frac{2}{3 a_r} + \frac{2}{3 a_{r'}} - \frac{2 a_{r'} \log(a_{r} / a_{r'} + 1)}{3 a_r^2} - \frac{2 a_r \log(a_{r'} / a_{r} + 1)}{3 a_{r'}^2} \\ &\quad= \frac{2}{3 a_r} \left( 1 - \frac{a_{r'}}{a_r} \log\left(\frac{a_{r}}{a_{r'}} + 1\right) \right) + \frac{2}{3 a_{r'}} \left( 1 - \frac{a_r }{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right). \end{align*} % Finally, we conclude by giving the limiting variance. % \begin{align*} &\sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \frac{n^2}{\lambda^d} \E \left[ \frac{\I_{i b r}(x) \I_{i b' r'}(x) \varepsilon_i^2} {N_{b r}(x) N_{b' r'}(x)} \right] \\ &\quad= \frac{\sigma^2(x)}{f(x)} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \frac{2 a_{r'}}{3} \left( 1 - \frac{a_{r'}}{a_r} \log\left(\frac{a_r}{a_{r'}} + 1\right) \right) + \frac{2 a_r}{3} \left( 1 - \frac{a_r}{a_{r'}} \log\left(\frac{a_{r'}}{a_r} + 1\right) \right) \right)^d \\ &\qquad+ O \left( \frac{(\log n)^{d+1}}{\lambda} + \frac{\lambda^d \log n}{n} \right). \end{align*} % So the limit exists, and with $\ell_{r r'} = \frac{2 a_r}{3} \left( 1 - \frac{a_{r}}{a_{r'}} \log\left(\frac{a_{r'}}{a_{r}} + 1\right) \right)$, the limiting variance is % \begin{align*} \Sigma_\rd(x) &= \frac{\sigma^2(x)}{f(x)} \sum_{r=0}^{J} \sum_{r'=0}^{J} \omega_r \omega_{r'} \left( \ell_{r r'} + \ell_{r' r} \right)^d. \end{align*} % \end{proof} The new bias characterization with debiasing is an algebraic consequence of the original bias characterization and the construction of the debiased Mondrian random forest estimator. \begin{proof}[Theorem~\ref{thm:mondrian_bias_debiased}] By the definition of the debiased estimator and Theorem~\ref{thm:mondrian_bias}, since $J$ and $a_r$ are fixed, % \begin{align*} \E \big[ \hat \mu_\rd(x) \mid \bX, \bT \big] &= \sum_{l=0}^J \omega_l \E \big[ \hat \mu_l(x) \Bigm| \bX, \bT \big] \\ &= \sum_{l=0}^J \omega_l \left( \mu(x) + \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor} \frac{B_r(x)}{a_l^{2r} \lambda^{2r}} \right) + O_\P \left( \frac{1}{\lambda^\beta} + \frac{1}{\lambda \sqrt B} + \frac{\log n}{\lambda} \sqrt{\frac{\lambda^d}{n}} \right). \end{align*} % It remains to evaluate the first term. 
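Before doing so, it may help to record the simplest instance of the
debiasing weights as a concrete illustration
(this special case is included only for orientation
and is not used in the argument).
For $J = 1$, with the convention $A_{r s} = a_{s-1}^{2 - 2r}$
recalled below, the condition $A \omega = e_0$ reads
%
\begin{align*}
\begin{pmatrix} 1 & 1 \\ a_0^{-2} & a_1^{-2} \end{pmatrix}
\begin{pmatrix} \omega_0 \\ \omega_1 \end{pmatrix}
= \begin{pmatrix} 1 \\ 0 \end{pmatrix},
\qquad\text{so}\qquad
\omega_0 = \frac{a_0^2}{a_0^2 - a_1^2},
\quad
\omega_1 = \frac{-a_1^2}{a_0^2 - a_1^2},
\end{align*}
%
and the two leading bias contributions,
proportional to $a_0^{-2} \lambda^{-2}$ and $a_1^{-2} \lambda^{-2}$
respectively, cancel exactly in the weighted combination.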
Recalling that $A_{r s} = a_{s-1}^{2 - 2r}$ and $A \omega = e_0$,
we have
%
\begin{align*}
&\sum_{l=0}^J \omega_l
\left(
\mu(x)
+ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor}
\frac{B_r(x)}{a_l^{2r} \lambda^{2r}}
\right) \\
&\quad=
\mu(x) \sum_{l=0}^J \omega_l
+ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor}
\frac{B_r(x)}{\lambda^{2r}}
\sum_{l=0}^J
\frac{\omega_l}{a_l^{2r}} \\
&\quad=
\mu(x) (A \omega)_1
+ \sum_{r=1}^{\lfloor \flbeta / 2 \rfloor \wedge J}
\frac{B_r(x)}{\lambda^{2r}}
(A \omega)_{r+1}
+ \sum_{r = (\lfloor \flbeta / 2 \rfloor \wedge J) + 1}
^{\lfloor \flbeta / 2 \rfloor}
\frac{B_r(x)}{\lambda^{2r}}
\sum_{l=0}^J
\frac{\omega_l}{a_l^{2r}} \\
&\quad=
\mu(x)
+ \I\{\lfloor \flbeta / 2 \rfloor \geq J + 1\}
\frac{B_{J+1}(x)}{\lambda^{2J + 2}}
\sum_{l=0}^J
\frac{\omega_l}{a_l^{2J + 2}}
+ O \left( \frac{1}{\lambda^{2J + 4}} \right) \\
&\quad=
\mu(x)
+ \I\{2J + 2 < \beta\}
\frac{\bar\omega B_{J+1}(x)}{\lambda^{2J + 2}}
+ O \left( \frac{1}{\lambda^{2J + 4}} \right).
\end{align*}
%
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_variance_estimation_debiased}]

\proofparagraph{consistency of $\hat\sigma^2(x)$}

Recall that
%
\begin{align}
\label{eq:mondrian_app_sigma2_hat_proof}
\hat\sigma^2(x)
&= \frac{1}{B} \sum_{b=1}^{B}
\frac{\sum_{i=1}^n Y_i^2 \, \I\{X_i \in T_b(x)\}}
{\sum_{i=1}^n \I\{X_i \in T_b(x)\}}
- \hat \mu(x)^2.
\end{align}
%
The first term in \eqref{eq:mondrian_app_sigma2_hat_proof}
is simply a Mondrian forest estimator of
$\E[Y_i^2 \mid X_i = x] = \sigma^2(x) + \mu(x)^2$,
which is bounded and Lipschitz,
and the conditional fourth moment $\E[Y_i^4 \mid X_i]$
is bounded almost surely.
So its conditional bias is controlled by
Theorem~\ref{thm:mondrian_bias}
and is at most
$O_\P \left( \frac{1}{\lambda}
+ \frac{\log n}{\lambda} \sqrt{\lambda^d / n} \right)$.
Its variance is of order at most $\frac{\lambda^d}{n}$
by Theorem~\ref{thm:mondrian_clt_debiased}.
Consistency of the second term in
\eqref{eq:mondrian_app_sigma2_hat_proof}
follows directly from
Theorems~\ref{thm:mondrian_bias}
and \ref{thm:mondrian_clt_debiased}
with the same bias and variance bounds.
Therefore
%
\begin{align*}
\hat\sigma^2(x)
&= \sigma^2(x)
+ O_\P \left(
\frac{1}{\lambda}
+ \sqrt{\frac{\lambda^d}{n}}
\right).
\end{align*}

\proofparagraph{consistency of the sum}
%
Note that
%
\begin{align*}
&\frac{n}{\lambda^d}
\sum_{i=1}^n
\left(
\sum_{r=0}^J
\omega_r
\frac{1}{B}
\sum_{b=1}^B
\frac{\I\{X_i \in T_{r b}(x)\}}
{\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}}
\right)^2 \\
&\quad=
\frac{n}{\lambda^d}
\frac{1}{B^2}
\sum_{i=1}^n
\sum_{r=0}^J
\sum_{r'=0}^J
\omega_r
\omega_{r'}
\sum_{b=1}^B
\sum_{b'=1}^B
\frac{\I_{i b r}(x) \I_{i b' r'}(x)}
{N_{b r}(x) N_{b' r'}(x)}.
\end{align*}
%
This is exactly the same as the quantity in
\eqref{eq:mondrian_app_clt_condition_sum},
if we were to take $\varepsilon_i$ to be $\pm 1$
with equal probability.
Thus we immediately have convergence in probability
by the proof of Theorem~\ref{thm:mondrian_clt_debiased}:
%
\begin{align*}
\frac{n}{\lambda^d}
\sum_{i=1}^n
\left(
\sum_{r=0}^J
\omega_r
\frac{1}{B}
\sum_{b=1}^B
\frac{\I\{X_i \in T_{r b}(x)\}}
{\sum_{i=1}^n \I\{X_i \in T_{r b}(x)\}}
\right)^2
&= \frac{n^2}{\lambda^d}
\sum_{r=0}^J
\sum_{r'=0}^J
\omega_r
\omega_{r'}
\E \left[
\frac{\I_{i b r}(x) \I_{i b' r'}(x)}
{N_{b r}(x) N_{b' r'}(x)}
\right] \\
&\quad+ O_\P \left(
\frac{1}{\sqrt B}
+ \sqrt{\frac{\lambda^d \log n}{n}}
\right).
\end{align*}

\proofparagraph{conclusion}

By the proof of Theorem~\ref{thm:mondrian_clt_debiased}
with $\varepsilon_i$ being $\pm 1$ with equal probability,
and by the previous parts,
%
\begin{align*}
\hat\Sigma_\rd(x)
= \Sigma_\rd(x)
+ O_\P \left(
\frac{(\log n)^{d+1}}{\lambda}
+ \frac{1}{\sqrt B}
+ \sqrt{\frac{\lambda^d \log n}{n}}
\right).
\end{align*}
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_confidence_debiased}]
%
By Theorem~\ref{thm:mondrian_bias_debiased}
and Theorem~\ref{thm:mondrian_variance_estimation_debiased},
%
\begin{align*}
\sqrt{\frac{n}{\lambda^d}}
\frac{\hat \mu_\rd(x) - \mu(x)}{\hat \Sigma_\rd(x)^{1/2}}
&= \sqrt{\frac{n}{\lambda^d}}
\frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]}
{\hat \Sigma_\rd(x)^{1/2}}
+ \sqrt{\frac{n}{\lambda^d}}
\frac{\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right] - \mu(x)}
{\hat \Sigma_\rd(x)^{1/2}} \\
&= \sqrt{\frac{n}{\lambda^d}}
\frac{\hat \mu_\rd(x) - \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]}
{\hat \Sigma_\rd(x)^{1/2}}
+ \sqrt{\frac{n}{\lambda^d}} \,
O_\P \left(
\frac{1}{\lambda^\beta}
+ \frac{1}{\lambda \sqrt B}
+ \frac{\log n}{\lambda}
\sqrt{\frac{\lambda^d}{n}}
\right).
\end{align*}
%
The first term converges weakly to $\cN(0,1)$
by Slutsky's theorem and
Theorems~\ref{thm:mondrian_clt_debiased}
and \ref{thm:mondrian_variance_estimation_debiased},
while the second is $o_\P(1)$ by assumption.
Validity of the confidence interval follows.
%
\end{proof}

\begin{proof}[Theorem~\ref{thm:mondrian_minimax}]
Theorem~\ref{thm:mondrian_bias_debiased}
and the proof of Theorem~\ref{thm:mondrian_clt_debiased}
with $J = \lfloor \flbeta / 2 \rfloor$ give
%
\begin{align*}
\E \left[
\big(
\hat \mu_\rd(x) - \mu(x)
\big)^2
\right]
&= \E \left[
\big(
\hat \mu_\rd(x)
- \E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]
\big)^2
\right]
+ \E \left[
\big(
\E \left[ \hat \mu_\rd(x) \mid \bX, \bT \right]
- \mu(x)
\big)^2
\right] \\
&\lesssim
\frac{\lambda^d}{n}
+ \frac{1}{\lambda^{2\beta}}
+ \frac{1}{\lambda^2 B}.
\end{align*}
%
We use here an $L^2$ version of Theorem~\ref{thm:mondrian_bias_debiased},
which is immediate from the proof of Theorem~\ref{thm:mondrian_bias}
since that proof bounds the relevant terms in $L^2$
before applying Chebyshev's inequality.
Now since $\lambda \asymp n^{\frac{1}{d + 2 \beta}}$
and $B \gtrsim n^{\frac{2 \beta - 2}{d + 2 \beta}}$,
each of the three terms above is of order at most
$n^{-\frac{2\beta}{d + 2\beta}}$:
indeed
$\frac{\lambda^d}{n} \asymp \lambda^{-2\beta}
\asymp n^{-\frac{2\beta}{d + 2\beta}}$,
while
$\frac{1}{\lambda^2 B}
\lesssim n^{-\frac{2}{d + 2\beta}}
n^{-\frac{2\beta - 2}{d + 2\beta}}
= n^{-\frac{2\beta}{d + 2\beta}}$.
Hence
%
\begin{align*}
\E \left[
\big(
\hat \mu_\rd(x) - \mu(x)
\big)^2
\right]
&\lesssim
n^{-\frac{2\beta}{d + 2 \beta}}.
\end{align*}
\end{proof}

\section{Further properties of the Mondrian process}

In this section, we state and prove a collection of lemmas
concerning various properties of the Mondrian process.
While they are not used directly in our analysis of
Mondrian random forest estimators,
we believe that these results,
along with the techniques used in their proofs,
may be of independent interest.

Our analysis of Mondrian random forest estimators in the main text
is for the most part conducted pointwise,
in the sense that we first fix $x \in [0,1]^d$
and then analyze $\hat\mu(x)$.
This means that we interact with the Mondrian process
only through $T(x)$;
that is, the cell in $T$ which contains the point $x$.
As such, we rely only on local properties of $T$,
and may consider just a single Mondrian cell.
The lemmas in this section take a more global approach
to analyzing the Mondrian process,
and we make statements about the entire process $T$,
rather than individual cells $T(x)$.
Such results may be useful for a future investigation
of the uniform properties of Mondrian forest estimators,
as well as being interesting in their own right.

We begin with a tail bound for the number of cells
appearing in a Mondrian tree,
offering a multiplicative exponential inequality
which complements the exact expectation result given in
\citet[Proposition~2]{mourtada2020minimax}.
The resulting bound in probability is the same
up to logarithmic terms,
and the sharp tail decay is useful in combination
with union bounds in our upcoming results.

\begin{lemma}[Tail bound for the number of cells in a Mondrian tree]
\label{lem:mondrian_app_cells_tail}
Let $D \subseteq \R^d$ be a rectangle and $T \sim \cM(D, \lambda)$.
Writing $\# T$ for the number of cells in $T$,
%
\begin{align*}
\P\left(
\# T
> 3 (1 + \lambda |D|_1)^d
(t + 1 + d \log(1 + \lambda |D|_1))
\right)
&\leq e^{-t}.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_tail}]
We refer to this method as the ``subcell trick''
and attribute it to \citet{mourtada2017universal}.
For $\varepsilon > 0$, partition $D$ into at most
$(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$
with side lengths at most
$(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$.
Denote the restriction of a tree $T$
to a subcell $D'$ by $T \cap D'$.
Since a split in $T$ induces a split
in at least one $T \cap D'$,
by a union bound
%
\begin{align*}
\P\left(\# T > t \right)
&\leq \P\left(\sum_{D' \in \cD_\varepsilon}
\# (T \cap D') > t \right)
\leq \sum_{D' \in \cD_\varepsilon}
\P\left( \# (T \cap D') > \frac{t}{\# \cD_\varepsilon} \right).
\end{align*}
%
Now $\# (T \cap D')$ is dominated by a Yule process
with parameter $|D'|_1$ stopped at time $\lambda$
\citep[proof of Lemma~2]{mourtada2017universal},
so using the fact that if $X \sim \Yule(a)$
then $X_t$ is geometrically distributed
with success probability $e^{-a t}$,
and hence $\P(X_t > n) \leq (1-e^{-at})^{n-1}$,
%
\begin{align*}
\P\left(\# T > t \right)
&\leq \# \cD_\varepsilon \,
(1 - e^{-\lambda |D|_1 \varepsilon})^{t / \# \cD_\varepsilon - 1}
\leq (1 + 1/\varepsilon)^d
(1 - e^{-\lambda |D|_1 \varepsilon})^{t (1 + 1/\varepsilon)^{-d} - 1}.
\end{align*}
%
Set $\varepsilon = \frac{1}{\lambda |D|_1}$,
note $1-1/e \leq e^{-1/3}$
and replace $t$ by
$3 (1 + \lambda |D|_1)^d (t + 1 + d \log(1 + \lambda |D|_1))$:
%
\begin{align*}
&\P\left(\# T > t \right)
\leq (1 + \lambda |D|_1)^d
(1 - 1/e)^{t (1 + \lambda |D|_1)^{-d} - 1}
\leq 2 (1 + \lambda |D|_1)^d
e^{-t (1 + \lambda |D|_1)^{-d} / 3}, \\
&\P\left(\# T
> 3 (1 + \lambda |D|_1)^d
(t + 1 + d \log(1 + \lambda |D|_1))
\right)
\leq e^{-t}.
\end{align*}
%
\end{proof}

Next we provide a rigorous justification for the observation
that the cells in a Mondrian process
should have the same shape distribution,
though of course they are not independent.
To state and prove this result,
we need a way to identify a particular cell
by endowing the cells in a Mondrian tree with a natural order.

\begin{definition}[Canonical order of cells in a Mondrian tree]
Let $T \sim \cM(D, \lambda)$.
Each cell in a fixed realization of $T$
can be described by a word from the alphabet $\{l, r\}$,
where $l$ indicates the cell to the left of a split
and $r$ indicates the cell to the right.
For example, if there are no splits
we have one cell described by the empty word.
After one split there are two cells, denoted $l$ and $r$.
Now suppose that the cell $r$ splits again,
giving two splits and three cells,
denoted $l$, $r l$, and $r r$.
Define the canonical ordering of the cells of $T$
by applying the lexicographic order to their words,
with $l < r$.
Note that it does not matter which coordinate each split occurs in:
in two dimensions, $l$ might refer to the ``left'' or ``bottom''
and $r$ to the ``right'' or ``top'' cell.
\end{definition}

\begin{lemma}[Cells in a Mondrian tree have identically distributed shapes]
\label{lem:mondrian_app_cells_identically_distributed}
Let $T \sim \cM(D, \lambda)$
with ordered cells $D'_1, \ldots, D'_{\# T}$.
For $\varepsilon_1, \ldots, \varepsilon_d \geq 0$
and $1 \leq i \leq k$,
%
\begin{align*}
\P\left(
|D'_{i1}| \leq \varepsilon_1,
\ldots,
|D'_{id}| \leq \varepsilon_d,
\# T = k
\right)
&= \P\left(
|D'_{11}| \leq \varepsilon_1,
\ldots,
|D'_{1d}| \leq \varepsilon_d,
\# T = k
\right).
\end{align*}
%
Marginalizing over $\# T$ with $E_j$ i.i.d.\ $\Exp(1)$,
\citet[Proposition~1]{mourtada2020minimax} gives
%
\begin{align*}
\P\left(
|D'_{i1}| > \varepsilon_1,
\ldots,
|D'_{id}| > \varepsilon_d
\right)
&= \prod_{j=1}^d
\P\left(
\frac{E_j}{\lambda} \wedge |D_j| > \varepsilon_j
\right)
= \prod_{j=1}^d
\I\{|D_j| > \varepsilon_j\}
e^{-\lambda \varepsilon_j}.
\end{align*}
\end{lemma}

We observe a version of the famous
Poisson process inspection or waiting time paradox
in the sizes of Mondrian cells.
The above Lemma~\ref{lem:mondrian_app_cells_identically_distributed}
shows that for a large enough lifetime $\lambda$,
the volume of any cell $D'$ has the same distribution
as the volume of a corner cell,
and is asymptotically
$\E[|D'|] \asymp \E \left[ \prod_{j=1}^{d} (E_j / \lambda) \right]
= 1/\lambda^d$.
This is consistent with
\citet[Proposition~2]{mourtada2020minimax},
who give $\E[\# T] \asymp \lambda^d$.
However, if instead of selecting a cell directly
we select a fixed interior point $x$
and query the cell $T(x)$ which contains it,
we find that
$\E[|T(x)|] \asymp
\E \left[ \prod_{j=1}^{d} ((E_{1j} + E_{2j}) / \lambda) \right]
= 2^d/\lambda^d$,
where $E_{1j}, E_{2j}$ are i.i.d.\ $\Exp(1)$,
by \citet[Proposition~1]{mourtada2020minimax}.
Since $T(x)$ contains $x$ by construction,
a size-biasing phenomenon occurs
and we see that $T(x)$ is on average larger
than a typical Mondrian cell.

\begin{proof}[Lemma~\ref{lem:mondrian_app_cells_identically_distributed}]
Let $w$ be the word associated with the cell $D'_i \in T$.
Note that $i=1$ if and only if $r \notin w$,
as then $D'_i$ is the left child of every split.
So suppose $r \in w$.
Let $\tilde w$ be the word obtained by replacing
all occurrences of $r$ in $w$ with an $l$.
Each such replacement corresponds to a split in $T$.
Let $\tilde T$ be the same process as $T$
but with the following modification:
for each split where a replacement was made,
change the uniform random variable $S$
(from the definition of $T$, see Section~\ref{sec:mondrian_process})
to $1-S$.
Since $S$ is independent of everything else
in the construction of $T$,
we observe that $\tilde T \sim \cM(D, \lambda)$ also.
Further, there is almost surely exactly one cell in $\tilde T$
which has the same shape as $D'_i$,
as the uniform distribution has no atoms.
Denote this cell by $\tilde D$
and note that the replacements imply
that its word in $\tilde T$ is $\tilde w$.
Thus $\tilde D = \tilde D_1$ in $\tilde T$
and so
$(|D'_{i1}|, \ldots, |D'_{i d}|, \# T)
= (|\tilde D_{11}|, \ldots, |\tilde D_{1d}|, \# \tilde T)$.
Equality of the distributions follows.
\end{proof}

As our next result we provide a tail bound
for the size of the largest Mondrian cell.
The cells within a Mondrian tree are of course not independent, and in fact there should intuitively be some negative correlation between their sizes, due to the fact that they must all fit within the original cell $D$. \begin{lemma}[Tail bound on largest Mondrian cell] \label{lem:mondrian_app_largest_cell_tail} Let $T \sim \cM(D, \lambda)$. For any $\varepsilon > 0$, % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq 5d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon}. \end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:mondrian_app_largest_cell_tail}] Let $D_i$ be the ordered cells of $T$ and take $k \geq 1$. By union bounds and Lemma~\ref{lem:mondrian_app_cells_identically_distributed}, % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq \sum_{l=1}^k \P\left( \max_{1 \leq i \leq l} \max_{1 \leq j \leq d} |D_{i j}| > \varepsilon, \# T = l \right) + \P\left( \# T > k \right) \\ &\leq \sum_{l=1}^k \sum_{i=1}^l \sum_{j=1}^d \P\big( |D_{i j}| > \varepsilon, \# T = l \big) + \P\left( \# T > k \right) \\ &\leq \sum_{l=1}^k l d \, \P\big( |D_{1j}| > \varepsilon, \# T = l \big) + \P\left( \# T > k \right) \\ &\leq k d \, \P\big(|D_{1 j}| > \varepsilon \big) + \P\left( \# T > k \right). \end{align*} % For the first term we use the exact distribution of $D_1$ from Lemma~\ref{lem:mondrian_app_cells_identically_distributed} and for the second term we apply Lemma~\ref{lem:mondrian_app_cells_tail}. % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq k d \, \P\big(|D_{1 j}| > \varepsilon \big) + \P\left( \# T > k \right) \\ &\leq k d \, e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-k (1 + \lambda |D|_1)^{-d} / 3}. \end{align*} % Finally, set $k = \big\lceil 3 \lambda \varepsilon (1 + \lambda |D|_1)^d \big\rceil$ and note the bound is trivial unless $\varepsilon \leq |D|_1$. % \begin{align*} \P\left( \max_{D' \in T} \max_{1 \leq j \leq d} |D'_j| > \varepsilon \right) &\leq \big( 3 \lambda \varepsilon (1 + \lambda |D|_1)^d + 1 \big) d \, e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-\lambda \varepsilon} \\ &\leq 3d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon} + 2 (1 + \lambda |D|_1)^d e^{-\lambda \varepsilon} \\ &\leq 5d (1 + \lambda |D|_1)^{d+1} e^{-\lambda \varepsilon}. \end{align*} % \end{proof} For the remainder of this section, we turn our attention to the partitions generated by Mondrian random forests. In particular, we study the refinement generated by overlaying $B$ independent Mondrian processes with possibly different lifetime parameters, and intersecting their resulting individual partitions. \begin{definition}[Partition refinement]% % Let $T_1, \ldots, T_B$ be partitions of a set. Their common refinement is % \begin{align*} \bigwedge_{b=1}^B T_b = \left\{ \bigcap_{b=1}^B D_b: D_b \in T_b \right\} \bigsetminus \left\{ \emptyset \right\}. \end{align*} % \end{definition} We begin our analysis of Mondrian forest refinements with a pair of simple inequalities for bounding the total number of refined cells in Lemma~\ref{lem:mondrian_app_refinement_inequalities}. This result does not depend on the probabilistic structure of the Mondrian process, and holds for any rectangular partitions. \begin{lemma}[Inequalities for refinements of rectangular partitions] \label{lem:mondrian_app_refinement_inequalities} Let $T_1, \ldots, T_B$ be rectangular partitions of a $d$-dimensional rectangle $D$. 
Then
%
\begin{align}
\label{eq:mondrian_app_refinement_1}
\# \bigwedge_{b=1}^B T_b
&\leq \prod_{b=1}^B \# T_b,
\end{align}
%
and for all $B \leq d$ there exist $T_b$
such that \eqref{eq:mondrian_app_refinement_1}
holds with equality.
If $\# T_{b j}$ denotes the number of splits
made by $T_b$ in dimension $j$, then
%
\begin{align}
\label{eq:mondrian_app_refinement_2}
\# \bigwedge_{b=1}^B T_b
&\leq \prod_{j=1}^d
\left( 1 + \sum_{b=1}^B \# T_{b j} \right),
\end{align}
%
and for all $B \geq d$ there exist $T_b$
such that \eqref{eq:mondrian_app_refinement_2}
holds with equality.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_inequalities}]
The first inequality \eqref{eq:mondrian_app_refinement_1}
follows because every cell in $\bigwedge_b T_b$
is the intersection of cells $D_b \in T_b$
for $1 \leq b \leq B$,
and there are at most $\prod_{b=1}^{B} \# T_b$
ways to choose these.
This bound is achievable when $B \leq d$
by setting $T_b$ to be a tree with splits
only in dimension $b$,
so that every such intersection of cells
gives a cell in the refinement.
For the second inequality \eqref{eq:mondrian_app_refinement_2},
we construct a new forest of trees.
In particular, for each $1 \leq j \leq d$
define $A_j$ to be the set of locations in $D_j$
where a tree $T_b$ makes a split in dimension $j$
for some $b$.
Define $T'_j$ to be a tree which has splits
only in dimension $j$
and at the locations prescribed by $A_j$.
Clearly, since every split in $T'_j$
comes from a split in some $T_b$ in dimension $j$,
we have $\# T'_j \leq 1 + \sum_b \# T_{b j}$.
Applying the first inequality to this new forest yields
$\# \bigwedge_j T'_j
\leq \prod_j \# T'_j
\leq \prod_j \big( 1 + \sum_b \# T_{b j} \big)$.
Finally, note that $\bigwedge_j T'_j$
is a refinement of $\bigwedge_b T_b$
and the result follows.
This bound is achievable when $B \geq d$
by letting $T_b$ have splits only in dimension $b$
when $b \leq d$
and taking $T_b$ to be the trivial partition otherwise.
%
\end{proof}

The inequalities in Lemma~\ref{lem:mondrian_app_refinement_inequalities}
provide rather crude bounds for the number of cells
in a Mondrian forest refinement
as they do not take into account the random structure.
Indeed, it should be clear that the ``worst case'' scenarios,
involving trees which contain splits only in a single direction,
should be extremely unlikely under the Mondrian law.
In Lemma~\ref{lem:mondrian_app_refinement}
we confirm this intuition
and provide an exact value for the expected number of cells
in a Mondrian refinement by direct calculation.
This result strictly generalizes the single tree version
provided in \citet[Proposition~2]{mourtada2020minimax}.

\begin{lemma}[Expected number of cells in a Mondrian forest refinement]
\label{lem:mondrian_app_refinement}
Let $D$ be a $d$-dimensional rectangle
and take $\lambda_b > 0$ for $1 \leq b \leq B$.
Let $T_b \sim \cM(D, \lambda_b)$ be independent.
Then the expected number of cells in their refinement is exactly
%
\begin{align*}
\E\left[\# \bigwedge_{b=1}^B T_b \right]
&= \prod_{j=1}^d
\left( 1 + |D_j| \sum_{b=1}^B \lambda_b \right).
\end{align*}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement}]
By \citet[Proposition~2]{mourtada2020minimax}
we have the result for a single tree:
%
\begin{align}
\label{eq:mondrian_app_single_tree}
\E\left[\# T_b \right]
&= \prod_{j=1}^d
\left( 1 + |D_j| \lambda_b \right).
\end{align}
%
We proceed by induction on $B$.
By the tower law,
%
\begin{align*}
\E\left[\# \bigwedge_{b=1}^B T_b \right]
&= \E\left[
\sum_{D' \in T_B}
\# \bigwedge_{b=1}^{B-1}
(T_b \cap D')
\right]
= \E\left[
\sum_{D' \in T_B}
\E\left[
\# \bigwedge_{b=1}^{B-1}
(T_b \cap D')
\biggm| T_B
\right]
\right].
\end{align*}
%
Now by the restriction property of Mondrian processes
\citep[Fact~2]{mourtada2020minimax},
observe that $T_b \cap D' \sim \cM(D', \lambda_b)$
conditional on $T_B$.
Then by the induction hypothesis,
%
\begin{align*}
\E\left[
\# \bigwedge_{b=1}^{B-1}
(T_b \cap D')
\biggm| T_B
\right]
&= \prod_{j=1}^d
\left( 1 + |D'_j| \sum_{b=1}^{B-1} \lambda_b \right)
= \E\big[ \# T_{D'} \mid T_B \big]
\end{align*}
%
where $T_{D'} \sim \cM\big(D', \sum_{b=1}^{B-1} \lambda_b\big)$
conditional on $T_B$,
by the result for a single tree
\eqref{eq:mondrian_app_single_tree}.
The restriction property further shows that
the $T_{D'}$ can be realized jointly in such a way that
$\sum_{D' \in T_B} \# T_{D'}$
is equal in distribution to $\# T$,
where $T \sim \cM(D, \sum_{b=1}^B \lambda_b)$,
so by \eqref{eq:mondrian_app_single_tree},
%
\begin{align*}
\E\left[\# \bigwedge_{b=1}^B T_b \right]
&= \E\left[
\sum_{D' \in T_B}
\E\big[ \# T_{D'} \mid T_B \big]
\right]
= \E\big[\# T \big]
= \prod_{j=1}^d
\left( 1 + |D_j| \sum_{b=1}^B \lambda_b \right).
\end{align*}
%
\end{proof}

While the exact expectation calculation
in Lemma~\ref{lem:mondrian_app_refinement} is neat,
sharper control on the tail behavior of the number of cells
in a Mondrian refinement is desired.
Lemma~\ref{lem:mondrian_app_refinement_tail} provides this,
again making use of the subcell trick
to convert a crude bound based on
Lemma~\ref{lem:mondrian_app_refinement_inequalities}
into a useful tail inequality.
We assume for simplicity that all of the lifetimes are identical.

\begin{lemma}[Tail bound on the number of cells in a Mondrian forest refinement]
\label{lem:mondrian_app_refinement_tail}
Let $T_b \sim \cM(D, \lambda)$ be i.i.d.\ for $1 \leq b \leq B$.
Then
%
\begin{align*}
\P\left(
\# \bigwedge_{b=1}^B T_b
> 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d
\right)
&\leq 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:mondrian_app_refinement_tail}]
We begin with a coarse estimate
and refine it with the subcell trick.
By Lemma~\ref{lem:mondrian_app_refinement_inequalities}
\eqref{eq:mondrian_app_refinement_2},
for any $t > 0$,
recalling that $\# T_{b j}$ is the number of splits
made by $T_b$ in dimension $j$,
%
\begin{align}
\nonumber
\P\left( \# \bigwedge_{b=1}^B T_b > t \right)
&\leq \P\left(
\prod_{j=1}^d
\left( 1 + \sum_{b=1}^B \# T_{b j} \right)
> t \right)
\leq \sum_{j=1}^d
\P\left(
1 + \sum_{b=1}^B \# T_{b j}
> t^{1/d}
\right) \\
\label{eq:mondrian_app_refinement_tail_coarse}
&\leq d\, \P\left(
\sum_{b=1}^B \# T_b
> t^{1/d}
\right)
\leq d B\, \P\left(
\# T_b
> \frac{t^{1/d}}{B}
\right).
\end{align}
%
By the subcell trick,
partition $D$ into at most
$(1 + 1/\varepsilon)^d$ cells $D' \in \cD_\varepsilon$
with side lengths at most
$(|D_1| \varepsilon, \ldots, |D_d| \varepsilon)$.
As every cell in $\bigwedge_b T_b$
corresponds to at least one cell in
$\bigwedge_b (T_b \cap D')$,
%
\begin{align*}
\P\left( \# \bigwedge_{b=1}^B T_b > t \right)
&\leq \P\left(
\sum_{D' \in \cD_\varepsilon}
\# \bigwedge_{b=1}^B (T_b \cap D')
> t \right)
\leq \sum_{D' \in \cD_\varepsilon}
\P\left(
\# \bigwedge_{b=1}^B (T_b \cap D')
> \frac{t}{\# \cD_\varepsilon}
\right).
\end{align*}
%
Applying the coarse estimate
\eqref{eq:mondrian_app_refinement_tail_coarse}
to $\# \bigwedge_b (T_b \cap D')$ gives
%
\begin{align*}
\P\left( \# \bigwedge_{b=1}^B T_b > t \right)
&\leq d B
\sum_{D' \in \cD_\varepsilon}
\P\left(
\# (T_b \cap D')
> \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}}
\right).
\end{align*}
%
Now apply Lemma~\ref{lem:mondrian_app_cells_tail}
and set $\varepsilon = \frac{1}{\lambda |D|_1}$ to obtain
%
\begin{align*}
\P\left( \# \bigwedge_{b=1}^B T_b > t \right)
&\leq d B
\sum_{D' \in \cD_\varepsilon}
\P\left(
\# (T_b \cap D')
> \frac{t^{1/d}}{B \# \cD_\varepsilon^{1/d}}
\right) \\
&\leq d B
\sum_{D' \in \cD_\varepsilon}
2 (1 + \lambda |D'|_1)^d
e^{- t^{1/d} \# \cD_\varepsilon^{-1/d} B^{-1}
(1 + \lambda |D'|_1)^{-d} / 3} \\
&\leq 2 d B
(1 + 1 / \varepsilon)^d
(1 + \lambda \varepsilon |D|_1)^d
e^{- t^{1/d} (1 + 1/\varepsilon)^{-1} B^{-1}
(1 + \lambda \varepsilon |D|_1)^{-d} / 3} \\
&\leq 2^{d+1} d B
(1 + \lambda |D|_1)^d
e^{- t^{1/d} (1 + \lambda |D|_1)^{-1} B^{-1} 2^{-d} / 3}.
\end{align*}
%
Finally, replacing $t$ by
$3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d$
we have
%
\begin{align*}
\P\left(
\# \bigwedge_{b=1}^B T_b
> 3^d 2^{d^2} B^d (1+\lambda|D|_1)^d t^d
\right)
&\leq 2^{d+1} d B (1 + \lambda |D|_1)^d e^{-t}.
\end{align*}
%
\end{proof}

\chapter{Supplement to Dyadic Kernel Density Estimators}
\label{app:kernel}

This section contains complementary detailed expositions
of some of our main results,
along with additional technical lemmas
which may be of independent interest.
We also provide full proofs
for all of our theoretical contributions.

\section{Supplementary main results}

In this first section we provide more detailed versions
of some of the results presented in the main text,
alongside some intermediate lemmas
which were skipped for conciseness.
We begin with some extra notation
used throughout this appendix.
For real vectors, $\|\cdot\|_p$
is the standard $\ell^p$-norm
defined for $p \in [1, \infty]$.
For real square matrices, $\|\cdot\|_p$
is the operator norm induced by
the corresponding vector norm.
In particular, $\|\cdot\|_1$
is the maximum absolute column sum,
$\|\cdot\|_\infty$
is the maximum absolute row sum,
and $\|\cdot\|_2$
is the maximum singular value.
For real symmetric matrices, $\|\cdot\|_2$
coincides with the maximum absolute eigenvalue.
We use $\|\cdot\|_{\max}$
to denote the largest absolute entry
of a real matrix.
For real-valued functions, $\|\cdot\|_\infty$
denotes the (essential) supremum norm.
For a bounded set $\cX \subseteq \R$ and $a \geq 0$
we use $[\cX \pm a]$ to denote the compact interval
$[\inf \cX - a, \ \sup \cX + a]$.
For measurable subsets of $\R^d$
we use $\Leb$ to denote the Lebesgue measure,
and for finite sets we use $|\cdot|$ for the cardinality.
Write $\sum_i$ for $\sum_{i=1}^n$ when clear from context.
Similarly, use $\sum_{i < j}$ for
$\sum_{i=1}^{n-1} \sum_{j=i+1}^{n}$
and $\sum_{i < j < r}$ for
$\sum_{i=1}^{n-2} \sum_{j=i+1}^{n-1} \sum_{r=j+1}^{n}$.

\subsection{Strong approximation}
\label{sec:kernel_app_strong_approx}

We first state a strong approximation for the linear part $L_n$
of the Hoeffding-type decomposition,
based on the KMT corollary given in
Lemma~\ref{lem:kernel_app_kmt_corollary}.

\begin{lemma}[Strong approximation of $L_n$]
\label{lem:kernel_app_strong_approx_Ln}
Suppose Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth} hold.
For each $n \geq 2$ there exists on some probability space
a copy of $\big(\bA_n, \bV_n, L_n\big)$,
denoted $\big(\bA_n', \bV_n', L_n'\big)$,
and a centered Gaussian process $Z_n^{L\prime}$ satisfying
%
\begin{align*}
\P\left(
\sup_{w \in \cW}
\big| \sqrt{n} L_n'(w) - Z_n^{L\prime}(w) \big|
> \Du \frac{t + C_1 \log n}{\sqrt{n}}
\right)
&\leq C_2 e^{-C_3 t},
\end{align*}
%
for some positive constants $C_1$, $C_2$, $C_3$,
and for all $t > 0$.
By integration of tail probabilities,
%
\begin{align*}
\E\left[
\sup_{w \in \cW}
\big| \sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\big|
\right]
&\lesssim \frac{\Du \log n}{\sqrt{n}}.
\end{align*}
%
Further, $Z_n^{L\prime}$ has the same covariance structure
as $\sqrt{n} L_n'$
in the sense that for all $w, w' \in \cW$,
%
\begin{align*}
\E\left[
Z_n^{L\prime}(w) Z_n^{L\prime}(w')
\right]
&= n \E\left[
L_n'(w) L_n'(w')
\right].
\end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/2]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big| \right] &\lesssim \Du \delta_n \sqrt{\log 1/\delta_n}, \end{align*} % and has continuous trajectories. The process $Z_n^{L\prime}$ is a function only of $\bA_n'$ and some random noise which is independent of $(\bA_n', \bV_n')$. \end{lemma} \begin{lemma}[Conditional strong approximation of $E_n$] \label{lem:kernel_app_conditional_strong_approx_En} Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. For $n \geq 2$ and $t_n > 0$ with $\left|\log t_n\right| \lesssim \log n$, there exists on some probability space a copy of $\big(\bA_n, \bV_n, E_n\big)$, denoted $\big(\bA_n', \bV_n', E_n'\big)$, and a process $\tilde Z^{E\prime}_n$ which is Gaussian conditional on $\bA_n'$ and mean-zero conditional on $\bA_n'$, satisfying % \begin{align*} \P\left( \sup_{w \in \cW} \big| \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) \big| > t_n \Bigm\vert \bA_n' \right) &\leq C_1 t_n^{-2} n^{-1/2} h^{-3/4} (\log n)^{3/4}, \end{align*} $\bA_n'$-almost surely for some constant $C_1 > 0$. Setting $t_n = n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ for any sequence $R_n \to \infty$ and taking an expectation gives % \begin{align*} \sup_{w \in \cW} \big| \sqrt{n^2h} E_n'(w) - \tilde Z_n^{E\prime}(w) \big| &\lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n. \end{align*} % Further, $\tilde Z_n^{E\prime}$ has the same conditional covariance as $\sqrt{n^2h} E_n'$ in that for all $w, w' \in \cW$, % \begin{align*} \E\left[ \tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') \bigm\vert \bA_n' \right] &= n^2h \E\left[ E_n'(w) E_n'(w') \bigm\vert \bA_n' \right]. \end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big| \right] &\lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}, \end{align*} % and has continuous trajectories. \end{lemma} \begin{lemma}[Unconditional strong approximation of $E_n$] \label{lem:kernel_app_unconditional_strong_approx_En} Suppose Assumptions \ref{ass:kernel_data} and \ref{ass:kernel_bandwidth} hold. Let $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$ be defined as in Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. For each $n \geq 2$ there exists (on some probability space) a copy of $\big(\bA_n', \bV_n', \tilde Z_n^{E\prime}\big)$, denoted $\big(\bA_n'', \bV_n'', \tilde Z_n^{E\dprime}\big)$, and a centered Gaussian process $Z^{E\dprime}_n$ satisfying % \begin{align*} \E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] &\lesssim n^{-1/6} (\log n)^{2/3}. \end{align*} % Further, $Z_n^{E\dprime}$ has the same (unconditional) covariance structure as $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\left[ Z_n^{E\dprime}(w) Z_n^{E\dprime}(w') \right] &= \E\left[ \tilde Z_n^{E\dprime}(w) \tilde Z_n^{E\dprime}(w') \right] = n^2h \, \E\left[ E_n(w) E_n(w') \right]. \end{align*} % It also satisfies the following trajectory regularity property for any $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w') \big| \right] &\lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}. 
\end{align*}
%
Finally, $Z_n^{E\dprime}$ is independent of $\bA_n''$
and has continuous trajectories.
\end{lemma}

We combine these strong approximations
to deduce a coupling for $\hat f_W$
in Theorem~\ref{thm:kernel_app_strong_approx_fW},
taking care with independence
to ensure the approximating processes
are jointly Gaussian.

\begin{theorem}[Strong approximation of $\hat f_W$]
\label{thm:kernel_app_strong_approx_fW}
Suppose that Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth} hold.
For each $n \geq 2$ and any sequence $R_n \to \infty$
there exists on some probability space
a centered Gaussian process $Z_n^{f\prime}$
and a copy of $\hat f_W$, denoted $\hat f_W'$, satisfying
%
\begin{align*}
&\sup_{w \in \cW}
\Big|
\hat f_W'(w)
- \E[\hat f_W'(w)]
- Z_n^{f\prime}(w)
\Big| \\
&\quad\lesssim_\P
n^{-1} \log n
+ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n
+ n^{-7/6} h^{-1/2} (\log n)^{2/3}.
\end{align*}
%
Further, $Z_n^{f\prime}$ has the same covariance structure
as $\hat f_W'$
in the sense that for all $w, w' \in \cW$,
%
\begin{align*}
\E\big[Z_n^{f\prime}(w) Z_n^{f\prime}(w')\big]
&= \Cov\Big[ \hat f_W'(w), \hat f_W'(w') \Big]
= \Sigma_n(w,w').
\end{align*}
%
It has continuous trajectories
satisfying the following regularity property
for any $\delta_n \in (0, 1/2]$:
%
\begin{align*}
\E\left[
\sup_{|w-w'| \leq \delta_n}
\Big| Z_n^{f\prime}(w) - Z_n^{f\prime}(w') \Big|
\right]
&\lesssim
\frac{\Du}{\sqrt n}
\delta_n \sqrt{\log \frac{1}{\delta_n}}
+ \frac{1}{\sqrt{n^2h}}
\frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}.
\end{align*}
%
\end{theorem}

The main result Theorem~\ref{thm:kernel_strong_approx_Tn}
now follows easily using
Theorem~\ref{thm:kernel_app_strong_approx_fW},
the bias bound from Theorem~\ref{thm:kernel_bias},
and properties of $\Sigma_n$
established in Lemma~\ref{lem:kernel_variance_bounds}.

\subsection{Covariance estimation}
\label{sec:kernel_app_covariance_estimation}

In this section we carefully construct
a consistent estimator for the covariance function $\Sigma_n$.
Firstly, we characterize $\Sigma_n$
in Lemma~\ref{lem:kernel_app_covariance_structure}.
In Lemma~\ref{lem:kernel_app_covariance_estimation}
we define the estimator and demonstrate that
it converges in probability in a suitable sense.
In Lemma~\ref{lem:kernel_app_alternative_covariance_estimator}
we give an alternative representation
which is more amenable to computation.

\begin{lemma}[Covariance structure]
\label{lem:kernel_app_covariance_structure}
Suppose Assumptions~\ref{ass:kernel_data}
and~\ref{ass:kernel_bandwidth} hold.
Then $\Sigma_n$, as defined in Section~\ref{sec:kernel_degeneracy},
admits the following representations,
where $1 \leq i < j < r \leq n$.
%
\begin{align*}
\Sigma_n(w,w')
&= \frac{2}{n(n-1)}
\,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i j},w') \big]
+ \frac{4(n-2)}{n(n-1)}
\,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i r},w') \big] \\
&= \frac{2}{n(n-1)}
\,\Cov\!\big[ k_h(W_{i j},w), k_h(W_{i j},w') \big] \\
&\quad+ \frac{4(n-2)}{n(n-1)}
\,\Cov\!\big[
\E[k_h(W_{i j},w) \mid A_i],
\E[k_h(W_{i j},w') \mid A_i]
\big].
\end{align*}
%
\end{lemma}

\begin{lemma}[Covariance estimation]
\label{lem:kernel_app_covariance_estimation}
Grant Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth},
and suppose $n h \gtrsim \log n$
and $f_W(w) > 0$ on $\cW$.
Define
%
\begin{align*}
S_{i j r}(w,w')
&= \frac{1}{6}
\Big(
k_h(W_{i j},w) k_h(W_{i r},w')
+ k_h(W_{i j},w) k_h(W_{jr},w')
+ k_h(W_{i r},w) k_h(W_{i j},w') \\
&\quad+ k_h(W_{i r},w) k_h(W_{jr},w')
+ k_h(W_{jr},w) k_h(W_{i j},w')
+ k_h(W_{jr},w) k_h(W_{i r},w')
\Big), \\
\hat \Sigma_n(w,w')
&= \frac{4}{n^2(n-1)^2}
\sum_{i < j}
k_h(W_{i j},w) k_h(W_{i j},w')
+ \frac{24}{n^2(n-1)^2}
\sum_{i < j < r}
S_{i j r}(w,w') \\
&\quad- \frac{4n-6}{n(n-1)}
\hat f_W(w) \hat f_W(w').
\end{align*}
%
Then
%
\begin{align*}
\sup_{w,w' \in \cW}
\left|
\frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')}
{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}}
\right|
&\lesssim_\P
\frac{\sqrt{\log n}}{n}.
\end{align*}
\end{lemma}

\begin{lemma}[Alternative covariance estimator representation]
\label{lem:kernel_app_alternative_covariance_estimator}
With $S_i(w) = \sum_{j \neq i} k_h(W_{i j},w)$,
the estimator from Lemma~\ref{lem:kernel_app_covariance_estimation}
satisfies
%
\begin{align*}
\hat \Sigma_n(w,w')
&= \frac{4}{n^2(n-1)^2}
\left(
\sum_{i=1}^{n} S_i(w) S_i(w')
- \sum_{i < j}
k_h(W_{i j},w) k_h(W_{i j},w')
\right)
- \frac{4n-6}{n(n-1)}
\hat f_W(w) \hat f_W(w'),
\end{align*}
%
which can be computed in $O(n^2)$ operations
for each pair $(w, w')$.
\end{lemma}

The estimator $\hat \Sigma_n$
need not be positive semi-definite
as a covariance function,
so we project it onto a class of well-behaved
covariance functions by considering the optimization problem
%
\begin{align}
\label{eq:kernel_app_sdp}
\min_{\Sigma}
\sup_{w, w' \in \cW}
\big|
\Sigma(w,w') - \hat \Sigma_n(w,w')
\big|,
\end{align}
%
where the minimum is taken over positive semi-definite
functions $\Sigma$ on $\cW \times \cW$
satisfying a Lipschitz-type constraint.

\begin{lemma}[Positive semi-definite covariance estimation]
\label{lem:kernel_app_sdp}
Grant Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth},
and suppose $n h \gtrsim \log n$
and $f_W(w) > 0$ on $\cW$.
Then the optimization problem \eqref{eq:kernel_app_sdp}
has an approximately optimal solution $\hat\Sigma_n^+$
which is uniformly entrywise-consistent for $\Sigma_n$
in the sense that
%
\begin{align*}
\sup_{w,w' \in \cW}
\left|
\frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')}
{\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}}
\right|
&\lesssim_\P
\frac{\sqrt{\log n}}{n}.
\end{align*}
\end{lemma}

The optimization problem \eqref{eq:kernel_app_sdp}
is stated for functions rather than matrices
so is infinite-dimensional.
However, when restricting to finite-size matrices,
Lemma~\ref{lem:kernel_app_sdp} still holds
and does not depend on the size of the matrices.
Furthermore, the problem then becomes
a semi-definite program
and so can be solved to arbitrary precision
in polynomial time in the size of the matrices
\citep{laurent2005semidefinite}.
The Lipschitz-type constraint
in the optimization problem \eqref{eq:kernel_app_sdp}
ensures that $\hat \Sigma_n^+$ is sufficiently smooth
and is a technicality required by some of the later proofs.
In practice this constraint is readily verified.

\begin{lemma}[Positive semi-definite variance estimator bounds]
\label{lem:kernel_app_variance_estimator_bounds}
Suppose that Assumptions~\ref{ass:kernel_data}
and~\ref{ass:kernel_bandwidth} hold,
and that $n h \gtrsim \log n$
and $f_W(w) > 0$ on $\cW$.
Then $\hat \Sigma_n^+(w,w) \geq 0$ almost surely
for all $w \in \cW$ and
%
\begin{align*}
\frac{\Dl^2}{n} + \frac{1}{n^2h}
&\lesssim_\P
\inf_{w \in \cW} \hat \Sigma_n^+(w,w)
\leq \sup_{w \in \cW} \hat \Sigma_n^+(w,w)
\lesssim_\P
\frac{\Du^2}{n} + \frac{1}{n^2h}.
\end{align*}
\end{lemma}

\subsection{Feasible uniform confidence bands}

We use the strong approximation derived in
Section~\ref{sec:kernel_app_strong_approx}
and the positive semi-definite covariance estimator
introduced in Section~\ref{sec:kernel_app_covariance_estimation}
to construct feasible uniform confidence bands.
We drop the prime notation for copies of processes
in the interest of clarity.

\begin{lemma}[Proximity of the standardized and studentized $t$-statistics]
\label{lem:kernel_app_studentized_t_statistic}
Let Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth} hold,
and suppose that $n h \gtrsim \log n$
and $f_W(w) > 0$ on $\cW$.
Define for $w \in \cW$ the Studentized $t$-statistic process
%
\begin{align*}
\hat T_n(w)
= \frac{\hat f_W(w) - f_W(w)}
{\sqrt{\hat\Sigma_n^+(w,w)}}.
\end{align*}
%
Then
%
\begin{align*}
\sup_{w \in \cW}
\left|
\hat T_n(w) - T_n(w)
\right|
&\lesssim_\P
\sqrt{\frac{\log n}{n}}
\left(
\sqrt{\log n}
+ \frac{\sqrt n h^{p \wedge \beta}}
{\Dl + 1/\sqrt{n h}}
\right)
\frac{1}{\Dl + 1/\sqrt{n h}}.
\end{align*}
\end{lemma}

\begin{lemma}[Feasible Gaussian approximation of the infeasible Gaussian process]
\label{lem:kernel_app_distributional_approx_feasible_gaussian}
Let Assumptions \ref{ass:kernel_data}
and \ref{ass:kernel_bandwidth} hold,
and suppose that $n h \gtrsim \log n$
and $f_W(w) > 0$ on $\cW$.
Define a process $\hat Z_n^T(w)$ which,
conditional on the data $\bW_n$,
is mean-zero and Gaussian,
with conditional covariance structure
%
\begin{align*}
\E\left[
\hat Z_n^T(w) \hat Z_n^T(w')
\bigm| \bW_n
\right]
&= \frac{\hat \Sigma_n^+(w,w')}
{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}.
\end{align*}
%
Then the following conditional Kolmogorov--Smirnov result holds:
%
\begin{align*}
\sup_{t \in \R}
\left|
\P\left(
\sup_{w \in \cW}
\left| Z_n^T(w) \right|
\leq t
\right)
- \P\left(
\sup_{w \in \cW}
\left| \hat Z_n^T(w) \right|
\leq t
\biggm\vert \bW_n
\right)
\right|
&\lesssim_\P
\frac{n^{-1/6}(\log n)^{5/6}}
{\Dl^{1/3} + (n h)^{-1/6}}.
\end{align*}
\end{lemma}

\begin{lemma}[Feasible Gaussian approximation of the studentized $t$-statistic]
\label{lem:kernel_app_feasible_gaussian_approx}
Let Assumptions \ref{ass:kernel_data},
\ref{ass:kernel_bandwidth} and \ref{ass:kernel_rates} hold,
and suppose that $f_W(w) > 0$ on $\cW$.
Then
%
\begin{align*}
\sup_{t \in \R}
\left|
\P\left(
\sup_{w \in \cW}
\left| \hat T_n(w) \right|
\leq t
\right)
- \P\left(
\sup_{w \in \cW}
\left| \hat Z_n^T(w) \right|
\leq t
\Bigm\vert \bW_n
\right)
\right|
&\ll_\P 1.
\end{align*}
\end{lemma}

These intermediate lemmas can be used to establish
the valid and feasible uniform confidence bands
presented in Theorem~\ref{thm:kernel_ucb} in the main text.
See Section~\ref{sec:kernel_app_proofs} for details.

\subsection{Counterfactual dyadic density estimation}

In this section we give a detailed analysis
of the counterfactual estimator
of Section~\ref{sec:kernel_counterfactual}.
We begin with an assumption
describing the counterfactual setup.

\begin{assumption}[Counterfactual data generation]
\label{ass:kernel_app_counterfactual}
For each $r \in \{0,1\}$,
let $\bW_n^r$, $\bA_n^r$, and $\bV_n^r$
be as in Assumption~\ref{ass:kernel_data}.
Let $X_i^r$ be finitely-supported variables,
setting $\bX_n^r = (X_1^r, \ldots, X_n^r)$.
Suppose that $(A_i^r, X_i^r)$
are independent over $1 \leq i \leq n$
and that $\bX_n^r$ is independent of $\bV_n^r$.
Assume that $W_{i j}^r \mid X_i^r, X_j^r$
has a Lebesgue density
$f_{W \mid XX}^r(\,\cdot \mid x_1, x_2)
\in \cH^\beta_{C_\rH}(\cW)$
and that $X_i^r$ has positive probability mass function
$p_X^r(x)$ on a common support $\cX$.
Suppose that $(\bA_n^0, \bV_n^0, \bX_n^0)$
and $(\bA_n^1, \bV_n^1, \bX_n^1)$
are independent.
\end{assumption}

The counterfactual density of $W_{i j}$ in population $1$,
had $X_i$ and $X_j$ instead followed
the covariate distribution of population $0$, is
%
\begin{align*}
f_W^{1 \triangleright 0}(w)
&= \E\left[
f_{W \mid XX}^1\big(w \mid X_1^0, X_2^0\big)
\right]
= \sum_{x_1 \in \cX}
\sum_{x_2 \in \cX}
f_{W \mid XX}^{1}(w \mid x_1, x_2)
\psi(x_1) \psi(x_2)
p_X^{1}(x_1) p_X^{1}(x_2),
\end{align*}
%
with $\psi(x) = p_X^0(x)/p_X^1(x)$ for $x \in \cX$.
Define the counterfactual dyadic kernel density estimator
%
\begin{align*}
\hat f_W^{1 \triangleright 0}(w)
&= \frac{2}{n(n-1)}
\sum_{i=1}^{n-1}
\sum_{j=i+1}^n
\hat \psi(X_i^1)
\hat \psi(X_j^1)
k_h(W_{i j}^1, w),
\end{align*}
%
where $\hat\psi(x)
= \I\{\hat p_X^{1}(x) > 0\}\hat p_X^{0}(x) / \hat p_X^{1}(x)$
and $\hat p_X^{r}(x)
= \frac{1}{n}\sum_{i = 1}^n \I\{X_i^r = x\}$.
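To illustrate the reweighting in the simplest possible setting
(a two-point covariate support, used here purely as an example),
suppose $\cX = \{0, 1\}$ with $p_X^1(1) = 1/2$ and $p_X^0(1) = 1/4$.
Then
%
\begin{align*}
\psi(1) = \frac{p_X^0(1)}{p_X^1(1)} = \frac{1}{2},
\qquad
\psi(0) = \frac{p_X^0(0)}{p_X^1(0)} = \frac{3/4}{1/2} = \frac{3}{2},
\end{align*}
%
so a pair $(i,j)$ with $X_i^1 = X_j^1 = 1$
receives weight $\psi(1)^2 = 1/4$
while a pair with $X_i^1 = X_j^1 = 0$
receives weight $\psi(0)^2 = 9/4$,
down-weighting pairs whose covariates are over-represented
in population $1$ relative to population $0$.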
Since $p_X^r(x) > 0$, % \begin{align*} \hat\psi(x) - \psi(x) &= \frac{\hat p_X^{0}(x) - p_X^0(x)}{p_X^1(x)} - \frac{p_X^0(x)}{p_X^1(x)} \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \\ &\quad+ \frac{\hat p_X^{1}(x) - p_X^1(x)}{p_X^1(x)} \frac{\hat p_X^{1}(x) p_X^0(x) - \hat p_X^{0}(x)p_X^1(x)} {\hat p_X^{1}(x) p_X^1(x)} \\ &= \frac{1}{n} \sum_{r=1}^n \kappa(X_r^0, X_r^1, x) + O_\P\left(\frac{1}{n}\right) \end{align*} % is an asymptotic linear representation where % \begin{align*} \kappa(X_i^0, X_i^1, x) &= \frac{\I\{X_i^0 = x\} - p_X^0(x)}{p_X^1(x)} - \frac{p_X^0(x)}{p_X^1(x)} \frac{\I\{X_i^1 = x\} - p_X^1(x)}{p_X^1(x)} \end{align*} % satisfies $\E[\kappa(X_i^0, X_i^1, x)] = 0$. We now establish uniform consistency and feasible strong approximation results for the counterfactual density estimator. \begin{lemma}[Bias of $\hat f_W^{1 \triangleright 0}$] \label{lem:kernel_app_counterfactual_bias} Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. Then % \begin{align*} \sup_{w \in \cW} \big| \E\big[\hat f_W^{1 \triangleright 0}(w)\big] - f_W^{1 \triangleright 0}(w) \big| \lesssim h^{p \wedge \beta} + \frac{1}{n}. \end{align*} \end{lemma} \begin{lemma}[Hoeffding-type decomposition for $\hat f_W^{1 \triangleright 0}$] \label{lem:kernel_app_counterfactual_hoeffding} Suppose that Assumptions~\ref{ass:kernel_data}, \ref{ass:kernel_bandwidth}, and \ref{ass:kernel_app_counterfactual} hold. With $k_{i j} = k_h(W_{i j}^1, w)$, $\kappa_{r i} = \kappa(X_r^0, X_r^1, X_i^1)$ and $\psi_i = \psi(X_i^1)$, define the projections % \begin{align*} u &= \E\left[ k_{i j} \psi_i \psi_j \right], \\ u_i &= \frac{2}{3} \psi_i \E\left[ k_{i j} \psi_j \mid A_i^1 \right] + \frac{2}{3} \E\left[ k_{jr} \psi_j \kappa_{i r} \mid X_i^0, X_i^1 \right] - \frac{2}{3} u, \\ u_{i j} &= \frac{1}{3} \psi_i \psi_j \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_i \E\left[ k_{i r} \psi_r \mid A_i^1 \right] + \frac{1}{3} \psi_i \E\left[ k_{i r} \kappa_{jr} \mid A_i^1, X_j^0, X_j^1 \right] \\ &\quad+ \frac{1}{3} \kappa_{j i} \E\left[ k_{i r} \psi_r \mid A_i^1 \right] + \frac{1}{3} \psi_j \E\left[ k_{jr} \psi_r \mid A_j^1 \right] + \frac{1}{3} \psi_j \E\left[ k_{jr} \kappa_{i r} \mid X_i^0, X_i^1, A_j^1 \right] \\ &\quad+ \frac{1}{3} \kappa_{i j} \E\left[ k_{jr} \psi_r \mid A_j^1 \right] - u_i - u_j + u, \\ u_{i j r} &= \frac{1}{3} \psi_i \psi_j \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_i \kappa_{r j} \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] + \frac{1}{3} \psi_j \kappa_{r i} \E\left[ k_{i j} \mid A_i^1, A_j^1 \right] \\ &\quad+ \frac{1}{3} \psi_i \psi_r \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] + \frac{1}{3} \psi_i \kappa_{jr} \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] + \frac{1}{3} \psi_r \kappa_{j i} \E\left[ k_{i r} \mid A_i^1, A_r^1 \right] \\ &\quad+ \frac{1}{3} \psi_j \psi_r \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] + \frac{1}{3} \psi_j \kappa_{i r} \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] + \frac{1}{3} \psi_r \kappa_{i j} \E\left[ k_{jr} \mid A_j^1, A_r^1 \right] \\ &\quad- u_{i j} - u_{i r} - u_{jr} + u_i + u_j + u_r - u, \\ v_{i j r} &= \frac{1}{3} k_{i j} \big(\psi_i \psi_j +\psi_i \kappa_{r j} +\psi_j \kappa_{r i} \big) + \frac{1}{3} k_{i r} \big(\psi_i \psi_r +\psi_i \kappa_{jr} +\psi_r \kappa_{j i} \big) \\ &\quad+ \frac{1}{3} k_{jr} \big(\psi_j \psi_r +\psi_j \kappa_{i r} +\psi_r \kappa_{i j} \big). 
\end{align*}
%
With $l_i^{1 \triangleright 0}(w) = u_i$
and $e_{i j r}^{1 \triangleright 0}(w) = v_{i j r} - u_{i j r}$,
set
%
\begin{align*}
L_n^{1 \triangleright 0}(w)
&= \frac{3}{n} \sum_{i=1}^n
l_i^{1 \triangleright 0}(w)
&\text{and} &
&E_n^{1 \triangleright 0}(w)
&= \frac{6}{n(n-1)(n-2)}
\sum_{i=1}^{n-2}
\sum_{j=i+1}^{n-1}
\sum_{r=j+1}^{n}
e_{i j r}^{1 \triangleright 0}(w).
\end{align*}
%
Then the following Hoeffding-type decomposition holds,
where the $O_\P(1/n)$ term is uniform in $w \in \cW$:
%
\begin{align*}
\hat f_W^{1 \triangleright 0}(w)
= \E\big[\hat f_W^{1 \triangleright 0}(w)\big]
+ L_n^{1 \triangleright 0}(w)
+ E_n^{1 \triangleright 0}(w)
+ O_\P\left( \frac{1}{n} \right).
\end{align*}
%
Further, the stochastic processes
$L_n^{1 \triangleright 0}$ and $E_n^{1 \triangleright 0}$
are mean-zero and orthogonal in $L^2(\P)$.
Define the upper and lower degeneracy constants as
%
\begin{align*}
\Du^{1 \triangleright 0}
&= \limsup_{n \to \infty}
\sup_{w \in \cW}
\Var\big[ l_i^{1 \triangleright 0}(w) \big]^{1/2}
&\text{and}& &
\Dl^{1 \triangleright 0}
&= \liminf_{n \to \infty}
\inf_{w \in \cW}
\Var\big[ l_i^{1 \triangleright 0}(w) \big]^{1/2}.
\end{align*}
\end{lemma}

\begin{lemma}[Uniform consistency of $\hat f_W^{1 \triangleright 0}$]
\label{lem:kernel_app_counterfactual_uniform_consistency}
Suppose that Assumptions~\ref{ass:kernel_data},
\ref{ass:kernel_bandwidth},
and \ref{ass:kernel_app_counterfactual} hold.
Then
%
\begin{align*}
\E\left[
\sup_{w \in \cW}
\big|\hat f_W^{1 \triangleright 0}(w)
- f_W^{1 \triangleright 0}(w)\big|
\right]
&\lesssim
h^{p \wedge \beta}
+ \frac{\Du^{1 \triangleright 0}}{\sqrt n}
+ \sqrt{\frac{\log n}{n^2h}}.
\end{align*}
\end{lemma}

\begin{lemma}[Strong approximation of $\hat f_W^{1 \triangleright 0}$]
\label{lem:kernel_app_counterfactual_sa}
Suppose that Assumptions~\ref{ass:kernel_data},
\ref{ass:kernel_bandwidth},
and \ref{ass:kernel_app_counterfactual} hold.
On an appropriately enlarged probability space
and for any sequence $R_n \to \infty$,
there exists a mean-zero Gaussian process
$Z_n^{f, 1 \triangleright 0}$
with the same covariance structure as
$\hat f_W^{1 \triangleright 0}$
satisfying
%
\begin{align*}
&\sup_{w \in \cW}
\left|
\hat f_W^{1 \triangleright 0}(w)
- \E\big[\hat f_W^{1 \triangleright 0}(w)\big]
- Z_n^{f, 1 \triangleright 0}(w)
\right| \\
&\quad\lesssim_\P
n^{-1} \log n
+ n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n
+ n^{-7/6} h^{-1/2} (\log n)^{2/3}.
\end{align*}
\end{lemma}

\begin{lemma}[Counterfactual covariance structure]
\label{lem:kernel_app_counterfactual_covariance_structure}
Writing $k_{i j}'$ for $k_h(W_{i j}^1, w')$ etc.,
the counterfactual covariance function is
%
\begin{align*}
&\Sigma_n^{1 \triangleright 0}(w,w')
= \Cov\left[
\hat f_W^{1 \triangleright 0}(w),
\hat f_W^{1 \triangleright 0}(w')
\right] \\
&\quad=
\frac{4}{n}
\E\left[
\Big(
\psi_i \E\big[ k_{i j} \psi_j \mid A_i^1 \big]
+ \E\left[ k_{r j} \psi_r \kappa_{i j}
\mid X_i^0, X_i^1 \right]
\Big)
\right. \\
&\left.
\qquad\qquad\quad
\times
\Big(
\psi_i \E\big[ k_{i j}' \psi_j \mid A_i^1 \big]
+ \E\left[ k_{r j}' \psi_r \kappa_{i j}
\mid X_i^0, X_i^1 \right]
\Big)
\right] \\
&\qquad+
\frac{2}{n^2}
\E\left[ k_{i j} k_{i j}' \psi_i^2 \psi_j^2 \right]
- \frac{4}{n}
\E\left[ k_{i j} \psi_i \psi_j \right]
\E\left[ k_{i j}' \psi_i \psi_j \right]
+ O\left(
\frac{1}{n^{3/2}}
+ \frac{1}{\sqrt{n^4h}}
\right).
\end{align*}
\end{lemma}

\begin{lemma}[Gaussian approximation of the standardized counterfactual $t$-statistic]
\label{lem:kernel_app_counterfactual_infeasible_t_statistic}
Let Assumptions \ref{ass:kernel_data},
\ref{ass:kernel_bandwidth},
and \ref{ass:kernel_app_counterfactual} hold,
and suppose $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$.
Define
%
\begin{align*}
T_n^{1 \triangleright 0}(w)
&= \frac{\hat f_W^{1 \triangleright 0}(w)
- f_W^{1 \triangleright 0}(w)}
{\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}
\quad\text{and}\quad
Z_n^{T, 1 \triangleright 0}(w)
= \frac{Z_n^{f, 1 \triangleright 0}(w)}
{\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}}.
\end{align*}
%
Then with $R_n \to \infty$
as in Lemma~\ref{lem:kernel_app_counterfactual_sa},
%
\begin{align*}
&\sup_{w \in \cW}
\left|
T_n^{1 \triangleright 0}(w)
- Z_n^{T, 1 \triangleright 0}(w)
\right| \\
&\quad\lesssim_\P
\frac{
n^{-1/2} \log n
+ n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n
+ n^{-2/3} h^{-1/2} (\log n)^{2/3}
+ n^{1/2} h^{p \wedge \beta}}
{\Dl^{1 \triangleright 0} + 1/\sqrt{n h}}.
\end{align*}
\end{lemma}

\begin{theorem}[Infeasible counterfactual uniform confidence bands]
\label{thm:kernel_app_counterfactual_infeasible_ucb}
Let Assumptions \ref{ass:kernel_data},
\ref{ass:kernel_bandwidth},
\ref{ass:kernel_rates},
and \ref{ass:kernel_app_counterfactual} hold
and suppose that $f_W^{1 \triangleright 0}(w) > 0$ on $\cW$.
Let $\alpha \in (0,1)$ be a confidence level
and define $q^{1 \triangleright 0}_{1-\alpha}$
as the quantile satisfying
%
\begin{align*}
\P\left(
\sup_{w \in \cW}
\left| Z_n^{T,1 \triangleright 0}(w) \right|
\leq q^{1 \triangleright 0}_{1-\alpha}
\right)
&= 1 - \alpha.
\end{align*}
%
Then
%
\begin{align*}
\P\left(
f_W^{1 \triangleright 0}(w)
\in
\left[
\hat f_W^{1 \triangleright 0}(w)
\pm q^{1 \triangleright 0}_{1-\alpha}
\sqrt{\Sigma_n^{1 \triangleright 0}(w,w)}
\, \right]
\, \textup{for all } w \in \cW
\right)
\to 1 - \alpha.
\end{align*}
\end{theorem}
%
We propose an estimator
for the counterfactual covariance function
$\Sigma_n^{1 \triangleright 0}$.
First let
%
\begin{align*}
\hat\kappa(X_i^0, X_i^1, x)
&= \frac{\I\{X_i^0 = x\} - \hat p_X^0(x)}{\hat p_X^1(x)}
- \frac{\hat p_X^0(x)}{\hat p_X^1(x)}
\frac{\I\{X_i^1 = x\} - \hat p_X^1(x)}{\hat p_X^1(x)},
\end{align*}
%
and define the leave-out conditional expectation estimators
%
\begin{align*}
S_i^{1 \triangleright 0}(w)
&= \hat\E\left[
k_h(W_{i j}^1,w) \psi(X_j^1)
\mid A_i^1
\right] \\
&= \frac{1}{n-1}
\left(
\sum_{j=1}^{i-1}
k_h(W_{j i}^1,w)
\hat\psi(X_j^1)
+ \sum_{j=i+1}^n
k_h(W_{i j}^1,w)
\hat\psi(X_j^1)
\right), \\
\tilde S_i^{1 \triangleright 0}(w)
&= \hat\E\left[
k_h(W_{r j}^1,w) \psi(X_r^1)
\kappa(X_i^0, X_i^1, X_j^1)
\mid X_i^0, X_i^1
\right] \\
&= \frac{1}{n-1}
\sum_{j=1}^n
\I\{j \neq i\}
\hat\kappa(X_i^0, X_i^1, X_j^1)
S_j^{1 \triangleright 0}(w).
\end{align*}
%
Then set
%
\begin{align*}
\hat\Sigma_n^{1 \triangleright 0}(w,w')
&= \frac{4}{n^2}
\sum_{i=1}^n
\left(
\hat\psi(X_i^1) S_i^{1 \triangleright 0}(w)
+ \tilde S_i^{1 \triangleright 0}(w)
\right)
\left(
\hat\psi(X_i^1) S_i^{1 \triangleright 0}(w')
+ \tilde S_i^{1 \triangleright 0}(w')
\right) \\
&\quad- \frac{4}{n^3(n-1)}
\sum_{i < j}
k_h(W_{i j}^1,w) \, k_h(W_{i j}^1,w') \,
\hat\psi(X_i^1)^2 \hat\psi(X_j^1)^2
- \frac{4}{n}
\hat f_W^{1 \triangleright 0}(w)
\hat f_W^{1 \triangleright 0}(w').
\end{align*}
%
Arguing as in the preceding subsections,
$\hat\Sigma_n^{1 \triangleright 0}$
is uniformly consistent for $\Sigma_n^{1 \triangleright 0}$,
and feasible counterfactual uniform confidence bands
follow from Theorem~\ref{thm:kernel_app_counterfactual_infeasible_ucb}
by replacing $\Sigma_n^{1 \triangleright 0}$
with $\hat\Sigma_n^{1 \triangleright 0}$
and $Z_n^{T, 1 \triangleright 0}$
with a conditionally Gaussian analogue.

\section{Preliminary lemmas}

In this section we collect some technical lemmas
which are used throughout the proofs
and which may be of independent interest.

\subsection{Maximal inequalities}

We first give maximal inequalities for empirical processes
based on independent but not necessarily identically distributed
(i.n.i.d.) data.

\begin{lemma}[An entropy maximal inequality for i.n.i.d.\ empirical processes]
\label{lem:kernel_app_maximal_entropy}
For $n \geq 1$ let $X_1, \ldots, X_n$
be i.n.i.d.\ random variables
taking values in a measurable space $(S, \cS)$,
with $\P_i$ the law of $X_i$
and $\bar\P = \frac{1}{n} \sum_{i=1}^{n} \P_i$.
Let $\cF$ be a class of measurable functions on $S$
with a measurable envelope $F$
satisfying $\sup_{f \in \cF} |f| \leq F$,
and define the empirical process
$G_n(f) = \frac{1}{\sqrt n} \sum_{i=1}^{n}
\big( f(X_i) - \E[f(X_i)] \big)$
for $f \in \cF$.
Let $\sigma > 0$ satisfy
$\sup_{f \in \cF} \|f\|_{\bar\P,2}
\leq \sigma
\leq \|F\|_{\bar\P,2}$
and $M = \max_{1 \leq i \leq n} F(X_i)$.
\subsection{Maximal inequalities}

Next we provide two maximal inequalities for empirical processes constructed from independent but not necessarily identically distributed (i.n.i.d.) data.

\begin{lemma}[A maximal inequality for i.n.i.d.\ empirical processes]
\label{lem:kernel_app_maximal_entropy}
Let $X_1, \ldots, X_n$ be i.n.i.d.\ random variables taking values in a measurable space $(S, \cS)$, where $X_i$ has law $\P_i$, and write $\bar\P = \frac{1}{n} \sum_{i=1}^n \P_i$ for the average law. Let $\cF$ be a pointwise measurable class of functions from $S$ to $\R$ with a finite measurable envelope $F$, and define the empirical process
%
\begin{align*}
G_n(f) &= \frac{1}{\sqrt n} \sum_{i=1}^n \Big( f(X_i) - \E[f(X_i)] \Big)
\end{align*}
%
for $f \in \cF$. Let $\sigma > 0$ satisfy $\sup_{f \in \cF} \|f\|_{\bar\P,2} \leq \sigma \leq \|F\|_{\bar\P,2}$ and $M = \max_{1 \leq i \leq n} F(X_i)$. Then with $\delta = \sigma / \|F\|_{\bar\P,2} \in (0,1]$,
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &\lesssim \|F\|_{\bar\P,2} \, J\big(\delta, \cF, F \big) + \frac{\|M\|_{\P,2} \, J(\delta, \cF, F)^2}{\delta^2 \sqrt{n}},
\end{align*}
%
where $\lesssim$ is up to a universal constant, and $J(\delta, \cF, F)$ is the covering integral
%
\begin{align*}
J\big(\delta, \cF, F\big) &= \int_0^\delta \sqrt{1 + \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} \diff{\varepsilon},
\end{align*}
%
with the supremum taken over finite discrete probability measures $\Q$ on $(S, \cS)$.
\end{lemma}

\begin{lemma}[A VC class maximal inequality for i.n.i.d.\ empirical processes]
\label{lem:kernel_app_maximal_vc_inid}
Assume the same setup as in Lemma~\ref{lem:kernel_app_maximal_entropy}, and suppose that $\cF$ forms a VC-type class in that
%
\begin{align*}
\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) &\leq (C_1/\varepsilon)^{C_2}
\end{align*}
%
for all $\varepsilon \in (0,1]$, for some constants $C_1 \geq e$ (where $e$ is the base of the natural logarithm) and $C_2 \geq 1$. Then for $\delta \in (0,1]$ we have the covering integral bound
%
$J\big(\delta, \cF, F\big) \leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}$,
%
and so by Lemma~\ref{lem:kernel_app_maximal_entropy}, up to a universal constant,
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &\lesssim \sigma \sqrt{C_2 \log (C_1/\delta)} + \frac{\|M\|_{\P,2} C_2 \log(C_1/\delta)}{\sqrt{n}} \\
&\lesssim \sigma \sqrt{C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} + \frac{\|M\|_{\P,2} C_2 \log \big(C_1 \|F\|_{\bar\P,2}/\sigma\big)} {\sqrt{n}}.
\end{align*}
%
\end{lemma}

\subsection{Strong approximation results}

Next we provide two strong approximation results. The first is a corollary of the KMT approximation \citep{komlos1975approximation} which applies to bounded-variation functions of i.i.d.\ variables. The second is an extension of the Yurinskii coupling \citep{belloni2019conditional} which applies to Lipschitz functions of i.n.i.d.\ variables.

\begin{lemma}[A KMT approximation corollary]
\label{lem:kernel_app_kmt_corollary}
For $n \geq 1$ let $X_1, \ldots, X_n$ be i.i.d.\ real-valued random variables and $g_n: \R \times \R \to \R$ be a function satisfying the total variation bound $\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV < \infty$. Then on some probability space there exist independent copies of $X_1, \ldots, X_n$, denoted $X_1', \ldots, X_n'$, and a mean-zero Gaussian process $Z_n(x)$ such that if we define the empirical process
%
\begin{align*}
G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n \Big(g_n(X_i',x) - \E\big[g_n(X_i',x)\big]\Big),
\end{align*}
%
then for some universal positive constants $C_1$, $C_2$, and $C_3$,
%
\begin{align*}
\P\left( \sup_{x \in \R} \big|G_n(x) - Z_n(x)\big| > \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV \, \frac{t + C_1 \log n}{\sqrt n} \right) \leq C_2 e^{-C_3 t}.
\end{align*}
%
Further, $Z_n$ has the same covariance structure as $G_n$ in the sense that for all $x,\, x' \in \R$,
%
\begin{align*}
\E\big[Z_n(x) Z_n(x')\big] = \E\big[G_n(x) G_n(x')\big].
\end{align*}
%
By independently sampling from the law of $Z_n$ conditional on $X_1', \ldots, X_n'$, we can assume that $Z_n$ is a function only of $X_1', \ldots, X_n'$ and some independent random noise.
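Note that in particular, taking $g_n(\xi, x) = \I\{\xi \leq x\}$, which has $\sup_{x \in \R} \|g_n(\cdot, x)\|_\TV = 1$, recovers the classical KMT strong approximation for the empirical process
%
\begin{align*}
G_n(x) = \sqrt{n} \big( F_n(x) - F_X(x) \big),
\end{align*}
%
where $F_n$ is the empirical distribution function of $X_1', \ldots, X_n'$ and $F_X$ their common distribution function.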
\end{lemma} \begin{lemma}[Yurinskii coupling for Lipschitz i.n.i.d.\ empirical processes] \label{lem:kernel_app_yurinskii_corollary} For $n \geq 2$ let $X_1, \dots, X_n$ be independent but not necessarily identically distributed (i.n.i.d.) random variables taking values in a measurable space $(S, \cS)$ and let $\cX_n \subseteq \R$ be a compact interval with $\left|\log \Leb(\cX_n)\right| \leq C_1 \log n$ where $C_1 > 0$ is a constant. Let $g_n$ be measurable on $S \times \cX_n$ satisfying $\sup_{\xi \in S} \sup_{x \in \cX_n} |g_n(\xi, x)| \leq M_n$ and $\sup_{x \in \cX_n} \max_{1 \leq i \leq n} \Var[g_n(X_i, x)] \leq \sigma_n^2$, with $\left|\log M_n\right| \leq C_1 \log n$ and $\left|\log \sigma_n^2\right| \leq C_1 \log n$. Suppose that $g_n$ satisfies the following uniform Lipschitz condition: % \begin{align*} \sup_{\xi \in S} \sup_{x,x' \in \cX_n} \left| \frac{g_n(\xi, x) - g_n(\xi, x')} {x-x'} \right| \leq l_{n,\infty}, \end{align*} % and also the following $L^2$ Lipschitz condition: % \begin{align*} \sup_{x,x' \in \cX_n} \E\left[ \frac{1}{n} \sum_{i=1}^n \left| \frac{g_n(X_i, x) - g_n(X_i, x')} {x-x'} \right|^2 \right]^{1/2} \leq l_{n,2}, \end{align*} % where $0 < l_{n,2} \leq l_{n,\infty}$, $\left|\log l_{n,2}\right| \leq C_1 \log n$, and $\left|\log l_{n,\infty}\right| \leq C_1 \log n$. Then for any $t_n > 0$ with $\left|\log t_n\right| \leq C_1 \log n$, there is a probability space carrying independent copies of $X_1, \ldots, X_n$ denoted $X_1', \ldots, X_n'$ and a mean-zero Gaussian process $Z_n(x)$ such that if we define the empirical process % $G_n(x) = \frac{1}{\sqrt n} \sum_{i=1}^n \big( g_n(X'_i,x) - \E[g_n(X'_i,x)] \big)$, % then % \begin{align*} &\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\ &\quad\leq \frac{ C_2 \sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n} \sqrt{M_n + \sigma_n\sqrt{\log n}} }{n^{1/4} t_n^2} \sqrt{ l_{n,2} \sqrt{\log n} + \frac{l_{n,\infty}}{\sqrt n} \log n} \end{align*} % where $C_2 > 0$ is a constant depending only on $C_1$. Further, $Z_n$ has the same covariance structure as $G_n$ in the sense that for all $x, x' \in \cX_n$, % \begin{align*} \E\big[Z_n(x) Z_n(x')\big] = \E\big[G_n(x) G_n(x')\big]. \end{align*} \end{lemma} \subsection{The Vorob'ev--Berkes--Philipp theorem} We present a generalization of the Vorob'ev--Berkes--Philipp theorem \citep{dudley1999uniform} which allows one to ``glue'' multiple random variables or stochastic processes onto the same probability space, while preserving some pairwise distributions. We begin with some definitions. \begin{definition}[Tree] A \emph{tree} is a finite undirected graph which is connected and contains no cycles or self-loops. \end{definition} \begin{definition}[Polish Borel probability space] A \emph{Polish Borel probability space} is a triple $(\cX, \cF, \P)$, where $\cX$ is a Polish space (a topological space metrizable by a complete separable metric), $\cF$ is the Borel $\sigma$-algebra induced on $\cX$ by its topology, and $\P$ is a probability measure on $(\cX, \cF)$. Important examples of Polish spaces include $\R^d$ and the Skorokhod space $\cD[0,1]^d$ for $d \geq 1$. In particular, one can consider vectors of real-valued random variables or stochastic processes indexed by compact subsets of $\R^d$ which have almost surely continuous trajectories. \end{definition} \begin{definition}[Projection of a law] Let $(\cX_1, \cF_1)$ and $(\cX_2, \cF_2)$ be measurable spaces, and let $\P_{12}$ be a law on the product space $(\cX_1 \times \cX_2, \cF_1 \otimes \cF_2)$. 
The \emph{projection} of $\P_{12}$ onto $\cX_1$ is the law $\P_1$ defined on $(\cX_1, \cF_1)$ by $\P_1 = \P_{12} \circ \pi_1^{-1}$ where $\pi_1(x_1, x_2) = x_1$ is the first-coordinate projection.
\end{definition}

\begin{lemma}[Vorob'ev--Berkes--Philipp theorem, tree form]
\label{lem:kernel_app_vbp}
Let $\cT$ be a tree with vertex set $\cV = \{1, \ldots, n\}$ and edge set $\cE$. Suppose that attached to each vertex $i$ is a Polish Borel probability space $(\cX_i, \cF_i, \P_i)$. Suppose that attached to each edge $(i,j) \in \cE$ (where $i < j$) is a law $\P_{i j}$ on the product space $(\cX_i \times \cX_j, \cF_i \otimes \cF_j)$ whose projections onto $\cX_i$ and $\cX_j$ are $\P_i$ and $\P_j$ respectively. Then there exists a law $\P^{(n)}$ on $\big( \prod_{i=1}^n \cX_i, \bigotimes_{i=1}^n \cF_i \big)$ whose projection onto $\cX_i$ is $\P_i$ for each vertex $i \in \cV$, and whose projection onto $\cX_i \times \cX_j$ is $\P_{i j}$ for each edge $(i,j) \in \cE$.
\end{lemma}

\begin{remark}[Applications of the Vorob'ev--Berkes--Philipp theorem]
We use Lemma~\ref{lem:kernel_app_vbp} in the following way.
\begin{enumerate}[label=(\roman*)]
\item Let $X_1, \ldots, X_n$ be stochastic processes with trajectories in $\cD[0,1]$, and suppose that for each $1 \leq i \leq n-1$ there exist copies $X_i'$ and $X_{i+1}'$ of $X_i$ and $X_{i+1}$, defined on a common probability space, satisfying $\P\big(\|X_{i+1}' - X_i'\| > t\big) \leq r_i$ for each $1 \leq i \leq n-1$, where $t > 0$ and $\|\cdot\|$ is a norm on $\cD[0,1]$. Then there exist copies of $X_1, \ldots, X_n$ denoted $X_1'', \ldots, X_n''$ satisfying $\P\big(\|X_{i+1}'' - X_i''\| > t\big) \leq r_i$ for each $1 \leq i \leq n-1$. That is, all of the inequalities can be satisfied simultaneously on the same probability space.
\end{enumerate}
\end{remark}

\section{Proofs}
\label{sec:kernel_app_proofs}

We present full proofs of all the results stated in Chapter~\ref{ch:kernel} and Appendix~\ref{app:kernel}.

\subsection{Preliminary lemmas}

In this section we list some results in probability and U-statistic theory which are used in the proofs of our main results. Other auxiliary lemmas will be introduced when they are needed.

\begin{lemma}[Bernstein's inequality for independent random variables]
\label{lem:kernel_app_bernstein}
Let $X_1, \ldots, X_n$ be independent real-valued random variables with $\E[X_i] = 0$, $|X_i| \leq M$, and $\E[X_i^2] \leq \sigma^2$, where $M$ and $\sigma$ are non-random. Then for all $t>0$,
%
\begin{align*}
\P \left( \left| \frac{1}{n} \sum_{i=1}^n X_i \right| \geq t \right) \leq 2 \exp \left( - \frac{t^2 n} {2 \sigma^2 + \frac{2}{3} M t} \right).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_bernstein}]
See for example Lemma~2.2.9 in~\citet{van1996weak}.
\end{proof}

\begin{lemma}[The matrix Bernstein inequality]
\label{lem:kernel_app_matrix_bernstein}
For $1 \leq i \leq n$ let $X_i$ be independent symmetric $d \times d$ real random matrices with expected values $\mu_i = \E[X_i]$. Suppose that $\|X_i - \mu_i\|_2 \leq M$ almost surely for all $1 \leq i \leq n$ where $M$ is non-random, and define $\sigma^2 = \big\| \sum_i \E[(X_i - \mu_i)^2] \big\|_2$. Then there exists a universal constant $C > 0$ such that for any $t > 0$ and $q \geq 1$,
%
\begin{align*}
\P\left( \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2 \geq 2 \sigma \sqrt{t} + \frac{4}{3} M t \right) &\leq 2 d e^{-t}, \\
\E\left[ \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2^q \right]^{1/q} &\leq C \sigma \sqrt{q + \log 2d} + C M (q + \log 2d).
\end{align*}
%
Another simplified version of this is as follows: suppose that $\|X_i\|_2 \leq M$ almost surely, so that $\|X_i - \mu_i\|_2 \leq 2M$. Then since $\sigma^2 \leq n M^2$, we have
%
\begin{align*}
\P\left( \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2 \geq 4M \big(t + \sqrt{n t}\big) \right) &\leq 2 d e^{-t}, \\
\E\left[ \left\| \sum_{i=1}^n \left( X_i - \mu_i \right) \right\|_2^q \right]^{1/q} &\leq C M \big(q + \log 2d + \sqrt{n(q + \log 2d)}\big).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_matrix_bernstein}]
See Lemma~3.2 in \citet{minsker2019moment}.
\end{proof}
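As an illustration of how Lemma~\ref{lem:kernel_app_bernstein} is typically applied, fix $u > 0$ and set $t = \sqrt{2 \sigma^2 u / n} + \frac{2 M u}{3 n}$. A direct substitution then verifies the high-probability bound
%
\begin{align*}
\P \left( \left| \frac{1}{n} \sum_{i=1}^n X_i \right| \geq \sqrt{\frac{2 \sigma^2 u}{n}} + \frac{2 M u}{3 n} \right) \leq 2 e^{-u}.
\end{align*}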
\begin{lemma}[A maximal inequality for Gaussian vectors]
\label{lem:kernel_app_gaussian_vector_maximal}
Take $n \geq 2$. Let $X_i \sim \cN(0, \sigma_i^2)$ for $1 \leq i \leq n$ with $\sigma_i^2 \leq \sigma^2$. Then
%
\begin{align}
\label{eq:kernel_app_gaussian_vector_maximal}
\E\left[ \max_{1 \leq i \leq n} X_i \right] &\leq \sigma \sqrt{2 \log n}, \\
\label{eq:kernel_app_gaussian_vector_maximal_abs}
\E\left[ \max_{1 \leq i \leq n} |X_i| \right] &\leq 2 \sigma \sqrt{\log n}.
\end{align}
%
If $\Sigma_1$ and $\Sigma_2$ are constant positive semi-definite $n \times n$ matrices and $N \sim \cN(0,I_n)$, then
%
\begin{align}
\label{eq:kernel_app_gaussian_difference_psd}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq 2 \sqrt{\log n} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2^{1/2}.
\end{align}
%
If further $\Sigma_1$ is positive definite, then
%
\begin{align}
\label{eq:kernel_app_gaussian_difference_pd}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq \sqrt{\log n} \, \lambda_{\min}(\Sigma_1)^{-1/2} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2.
\end{align}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_vector_maximal}]
For $t > 0$, Jensen's inequality on the concave logarithm function gives
%
\begin{align*}
\E\left[ \max_{1 \leq i \leq n} X_i \right] &= \frac{1}{t} \E\left[ \log \exp \max_{1 \leq i \leq n} t X_i \right] \leq \frac{1}{t} \log \E\left[ \exp \max_{1 \leq i \leq n} t X_i \right] \leq \frac{1}{t} \log \sum_{i=1}^n \E\left[ \exp t X_i \right] \\
&= \frac{1}{t} \log \sum_{i=1}^n \exp \left( \frac{t^2 \sigma_i^2}{2} \right) \leq \frac{1}{t} \log n + \frac{t \sigma^2}{2},
\end{align*}
%
by the Gaussian moment generating function. Minimizing with $t = \sqrt{2 \log n} / \sigma$ yields \eqref{eq:kernel_app_gaussian_vector_maximal}. For \eqref{eq:kernel_app_gaussian_vector_maximal_abs}, we use the symmetry of the Gaussian distribution:
%
\begin{align*}
\E\left[ \max_{1 \leq i \leq n} |X_i| \right] &= \E\left[ \max_{1 \leq i \leq n} \{X_i, -X_i\} \right] \leq \sigma \sqrt{2 \log 2n} \leq 2 \sigma \sqrt{\log n}.
\end{align*}
%
For \eqref{eq:kernel_app_gaussian_difference_psd} and \eqref{eq:kernel_app_gaussian_difference_pd}, note that $\Sigma_1^{1/2} N - \Sigma_2^{1/2} N$ is Gaussian with covariance matrix $\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. The variances of its components are the diagonal elements of this matrix, namely
%
\begin{align*}
\sigma_i^2 &= \Var\big[ \big(\Sigma_1^{1/2} N - \Sigma_2^{1/2} N\big)_i \big] = \Big(\big( \Sigma_1^{1/2} - \Sigma_2^{1/2} \big)^2\Big)_{ii}.
\end{align*}
%
Note that if $e_i$ is the $i$th standard unit basis vector, then for any real symmetric matrix $A$, we have $e_i^\T A^2 e_i = (A^2)_{ii}$, so in particular $(A^2)_{ii} \leq \|A\|_2^2$. Therefore
%
\begin{align*}
\sigma_i^2 &\leq \big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2^2 =\vcentcolon \sigma^2.
\end{align*}
%
Applying \eqref{eq:kernel_app_gaussian_vector_maximal_abs} then gives
%
\begin{align*}
\E\Big[ \big\| \Sigma_1^{1/2} N - \Sigma_2^{1/2} N \big\|_\infty \Big] &\leq 2 \sqrt{\log n} \, \big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2.
\end{align*}
%
By Theorem~X.1.1 in \citet{bhatia1997matrix}, we can deduce
%
\begin{align*}
\big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2 &\leq \big\| \Sigma_1 - \Sigma_2 \big\|_2^{1/2},
\end{align*}
%
giving \eqref{eq:kernel_app_gaussian_difference_psd}. If $\Sigma_1$ is positive definite, Theorem~X.3.8 in \citet{bhatia1997matrix} gives \eqref{eq:kernel_app_gaussian_difference_pd}:
%
\begin{align*}
\big\| \Sigma_1^{1/2} - \Sigma_2^{1/2} \big\|_2 &\leq \frac{1}{2} \lambda_{\min}(\Sigma_1)^{-1/2} \, \big\| \Sigma_1 - \Sigma_2 \big\|_2.
\end{align*}
%
\end{proof}
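The logarithmic factor in \eqref{eq:kernel_app_gaussian_vector_maximal} cannot be improved in general: for i.i.d.\ variables $X_i \sim \cN(0, \sigma^2)$, a standard extreme value calculation shows that
%
\begin{align*}
\E\left[ \max_{1 \leq i \leq n} X_i \right] &= (1 + o(1)) \, \sigma \sqrt{2 \log n}
\end{align*}
%
as $n \to \infty$, so the bound is sharp to first order.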
\begin{lemma}[Maximal inequalities for Gaussian processes]
\label{lem:kernel_app_gaussian_process_maximal}
Let $Z$ be a separable mean-zero Gaussian process indexed by $x \in \cX$. Recall that $Z$ is separable for example if $\cX$ is Polish and $Z$ has continuous trajectories. Define its covariance structure on $\cX \times \cX$ by $\Sigma(x, x') = \E[Z(x) Z(x')]$, and the corresponding semimetric on $\cX$ by
%
\begin{align*}
\rho(x,x') &= \E\big[\big(Z(x) - Z(x')\big)^2\big]^{1/2} = \big(\Sigma(x,x) - 2 \Sigma(x,x') + \Sigma(x',x')\big)^{1/2}.
\end{align*}
%
Let $N(\varepsilon, \cX, \rho)$ denote the $\varepsilon$-covering number of $\cX$ with respect to the semimetric $\rho$. Define $\sigma = \sup_x \Sigma(x,x)^{1/2}$. Then there exists a universal constant $C > 0$ such that for any $\delta > 0$,
%
\begin{align*}
\E\left[ \sup_{x \in \cX} |Z(x)| \right] &\leq C \sigma + C \int_0^{2\sigma} \sqrt{\log N(\varepsilon, \cX, \rho)} \diff{\varepsilon}, \\
\E\left[ \sup_{\rho(x,x') \leq \delta} |Z(x) - Z(x')| \right] &\leq C \int_0^{\delta} \sqrt{\log N(\varepsilon, \cX, \rho)} \diff{\varepsilon}.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_gaussian_process_maximal}]
See Corollary~2.2.8 in \citet{van1996weak}, noting that for any $x,x' \in \cX$, we have $\E[|Z(x)|] \lesssim \sigma$ and $\rho(x,x') \leq 2\sigma$, implying that $\log N(\varepsilon, \cX, \rho) = 0$ for all $\varepsilon > 2 \sigma$.
\end{proof}

\begin{lemma}[Anti-concentration for Gaussian process absolute suprema]
\label{lem:kernel_app_anticoncentration}
Let $Z$ be a separable mean-zero Gaussian process indexed by a semimetric space $\cX$ with $\E[Z(x)^2] = 1$ for all $x \in \cX$. Then for any $\varepsilon > 0$,
%
\begin{align*}
\sup_{t \in \R} \P\left( \left| \sup_{x \in \cX} \big| Z(x) \big| - t \right| \leq \varepsilon \right) &\leq 4 \varepsilon \left( 1 + \E\left[ \sup_{x \in \cX} \big| Z(x) \big| \right] \right).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_anticoncentration}]
See Corollary~2.1 in \citet{chernozhukov2014anti}.
\end{proof}

\begin{lemma}[No slowest rate of convergence in probability]
\label{lem:kernel_app_slow_convergence}
Let $X_n$ be a sequence of real-valued random variables with $X_n = o_\P(1)$. Then there exists a deterministic sequence $\varepsilon_n \to 0$ such that $\P\big(|X_n| > \varepsilon_n\big) \leq \varepsilon_n$ for all $n \geq 1$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_slow_convergence}]
Define the following deterministic sequence for $k \geq 1$.
%
\begin{align*}
\tau_k &= \sup \big\{ n \geq 1: \P\big(|X_n| > 1/k\big) > 1/k \big\} \vee (\tau_{k-1} +1)
\end{align*}
%
with $\tau_0 = 0$. Since $X_n = o_\P(1)$, each $\tau_k$ is finite and so we can define $\varepsilon_n = 1$ for $n \leq \tau_1$ and $\varepsilon_n = \frac{1}{k}$ where $\tau_k < n \leq \tau_{k+1}$ for $k \geq 1$. Then, noting that $\varepsilon_n \to 0$, we have $\P\big(|X_n| > \varepsilon_n\big) \leq 1 = \varepsilon_n$ trivially for $n \leq \tau_1$, while for $\tau_k < n \leq \tau_{k+1}$ with $k \geq 1$ we have $\P\big(|X_n| > \varepsilon_n\big) = \P\big(|X_n| > 1/k\big) \leq 1/k = \varepsilon_n$.
\end{proof}
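To illustrate Lemma~\ref{lem:kernel_app_slow_convergence}, suppose for example that $\P\big(|X_n| > t\big) \leq 2 e^{-n t^2}$ for all $t > 0$. Then for $n \geq 4$ one may take $\varepsilon_n = \sqrt{\log n / n}$ directly, since
%
\begin{align*}
\P\big( |X_n| > \varepsilon_n \big) \leq 2 e^{-\log n} = \frac{2}{n} \leq \sqrt{\frac{\log n}{n}} = \varepsilon_n.
\end{align*}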
\begin{lemma}[General second-order Hoeffding-type decomposition]
\label{lem:kernel_app_general_hoeffding}
Let $\cU$ be a vector space. Let $u_{i j} \in \cU$ be defined for $1 \leq i, j \leq n$ and $i \neq j$. Suppose that $u_{i j} = u_{j i}$ for all $i,j$. Then for any $u_i \in \cU$ (for $1 \leq i \leq n$) and any $u \in \cU$, the following decomposition holds:
%
\begin{align*}
\sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u\big) &= 2(n-1) \sum_{i=1}^n \big(u_i - u\big) + \sum_{i=1}^n \sum_{\substack{j=1 \\ j \neq i}}^n \big(u_{i j} - u_i - u_j + u\big).
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_general_hoeffding}]
We compute the left hand side minus the right hand side, beginning by observing that all of the $u_{i j}$ and $u$ terms clearly cancel.
%
\begin{align*}
&\sum_{i=1}^n \sum_{j \neq i}^n \big(u_{i j} - u\big) - 2(n-1) \sum_{i=1}^n \big(u_i - u\big) - \sum_{i=1}^n \sum_{j \neq i}^n \big(u_{i j} - u_i - u_j + u\big) \\
&\qquad= - 2(n-1) \sum_{i=1}^n u_i - \sum_{i=1}^n \sum_{j \neq i}^n \big(- u_i - u_j\big) = - 2(n-1) \sum_{i=1}^n u_i + \sum_{i=1}^n \sum_{j \neq i}^n u_i + \sum_{j=1}^n \sum_{i \neq j}^n u_j \\
&\qquad= - 2(n-1) \sum_{i=1}^n u_i + (n-1) \sum_{i=1}^n u_i + (n-1) \sum_{j=1}^n u_j = 0.
\end{align*}
\end{proof}

\begin{lemma}[A U-statistic concentration inequality]
\label{lem:kernel_app_ustat_concentration}
Let $(S,\cS)$ be a measurable space and $X_1, \ldots, X_n$ be i.i.d.\ $S$-valued random variables. Let $H: S^m \to \R$ be a function of $m$ variables satisfying the symmetry property $H(x_1, \ldots, x_m) = H(x_{\tau (1)}, \ldots, x_{\tau (m)})$ for any $m$-permutation $\tau$. Suppose also that $\E[H(X_1, \ldots, X_m)] = 0$. Let $M = \|H\|_\infty$ and $\sigma^2 = \E\big[\E[H(X_1, \ldots, X_m) \mid X_1]^2\big]$. Define the U-statistic
%
\begin{align*}
U_n &= \frac{m!(n-m)!}{n!} \sum_{1 \leq i_1 < \cdots < i_m \leq n} H(X_{i_1}, \ldots, X_{i_m}).
\end{align*}
%
Then for any $t > 0$, with $C_1(m)$, $C_2(m)$ positive constants depending only on $m$,
%
\begin{align*}
\P\left( |U_n| > t \right) &\leq 4 \exp \left( - \frac{n t^2}{C_1(m) \sigma^2 + C_2(m) M t} \right).
\end{align*}
%
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_ustat_concentration}]
See Theorem~2 in \citet{arcones1995bernstein}.
\end{proof}

\begin{lemma}[A second-order U-process maximal inequality]
\label{lem:kernel_app_uprocess_maximal}
Let $X_1, \ldots, X_n$ be i.i.d.\ random variables taking values in a measurable space $(S, \cS)$ with distribution $\P$. Let $\cF$ be a class of measurable functions from $S \times S$ to $\R$ which is also pointwise measurable. Define the degenerate second-order U-process
%
\begin{align*}
U_n(f) &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^n f(X_i, X_j),
\end{align*}
%
where degeneracy means that $\E[f(X_1, x_2)] = 0$ for all $x_2 \in S$ and $f \in \cF$. Suppose that $\cF$ admits a measurable envelope $F$ and forms a VC-type class, in that
%
\begin{align*}
\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) &\leq (C_1/\varepsilon)^{C_2}
\end{align*}
%
for all $\varepsilon \in (0,1]$, for some constants $C_1 \geq e$ and $C_2 \geq 1$, with the supremum taken over finite discrete probability measures $\Q$. Let $\sigma > 0$ be any deterministic value satisfying $\sup_{f \in \cF} \|f\|_{\P,2} \leq \sigma \leq \|F\|_{\P,2}$, and define the random variable $M = \max_{i,j} |F(X_i, X_j)|$. Then there exists a universal constant $C_3 > 0$ satisfying
%
\begin{align*}
n \E\left[ \sup_{f \in \cF} \big| U_n(f) \big| \right] &\leq C_3 \sigma \Big( C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) \Big) + \frac{C_3 \|M\|_{\P,2}}{\sqrt{n}} \Big( C_2 \log\big(C_1 \|F\|_{\P,2} / \sigma \big) \Big)^2.
\end{align*}
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_uprocess_maximal}]
Apply Corollary~5.3 from \citet{chen2020jackknife} with the order of the U-statistic fixed at $r=2$, and with $k=2$.
\end{proof}

\begin{lemma}[A U-statistic matrix concentration inequality]
\label{lem:kernel_app_ustat_matrix_concentration}
Let $X_1, \ldots, X_n$ be i.i.d.\ random variables taking values in a measurable space $(S, \cS)$.
Suppose $H: S^2 \to \R^{d \times d}$ is a measurable matrix-valued function of two variables satisfying the following: % \begin{enumerate}[label=(\roman*)] \item $H(X_1, X_2)$ is an almost surely symmetric matrix. \item $\|H(X_1, X_2)\|_2 \leq M$ almost surely. \item $H$ is a symmetric function in its arguments in that $H(X_1, X_2) = H(X_2, X_1)$. \item $H$ is degenerate in the sense that $\E[H(X_1, x_2)] = 0$ for all $x_2 \in S$. \end{enumerate} % Let $U_n = \sum_i \sum_{j \neq i} H(X_i, X_j)$ be a U-statistic, and define the variance-type constant % \begin{align*} \sigma^2 &= \E\left[ \left\| \E\left[ H(X_i, X_j)^2 \mid X_j \right] \right\|_2 \right]. \end{align*} % Then for a universal constant $C > 0$ and for all $t > 0$, % \begin{align*} \P\left( \|U_n\|_2 \geq C \sigma n (t + \log d) + C M \sqrt{n} (t + \log d)^{3/2} \right) &\leq C e^{-t}. \end{align*} % By Jensen's inequality, $\sigma^2 \leq \E[ \| H(X_i, X_j)^2 \|_2 ] = \E[ \| H(X_i, X_j) \|_2^2 ] \leq M^2$, giving the simpler % \begin{align*} \P\left( \|U_n\|_2 \geq 2 C M n (t + \log d)^{3/2} \right) &\leq C e^{-t}. \end{align*} % From this last inequality we deduce a moment bound by integration of tail probabilities: % \begin{align*} \E\left[ \|U_n\|_2 \right] &\lesssim M n (\log d)^{3/2}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}] We apply results from \citet{minsker2019moment}. \proofparagraph{decoupling} Let $\bar U_n = \sum_{i=1}^n \sum_{j=1}^n H(X_i^{(1)}, X_j^{(2)})$ be a decoupled matrix U-statistic, where $X^{(1)}$ and $X^{(2)}$ are i.i.d.\ copies of the sequence $X_1, \ldots, X_n$. By Lemma~5.2 in \citet{minsker2019moment}, since we are only stating this result for degenerate U-statistics of order 2, there exists a universal constant $D_2$ such that for any $t > 0$, we have % \begin{align*} \P\left( \|U_n\|_2 \geq t \right) &\leq D_2 \P\left( \|\bar U_n\|_2 \geq t / D_2 \right). \end{align*} \proofparagraph{concentration of the decoupled U-statistic} By Equation~11 in \citet{minsker2019moment}, we have the following concentration inequality for decoupled degenerate U-statistics. For some universal constant $C_1$ and for any $t > 0$, % \begin{align*} \P\left( \|\bar U_n\|_2 \geq C_1 \sigma n (t + \log d) + C_1 M \sqrt{n} (t + \log d)^{3/2} \right) &\leq e^{-t}. \end{align*} \proofparagraph{concentration of the original U-statistic} Hence we have % \begin{align*} &\P\left( \|U_n\|_2 \geq C_1 D_2 \sigma n (t + \log d) + C_1 D_2 M \sqrt{n} (t + \log d)^{3/2} \right) \\ &\quad\leq D_2 \P\left( \|\bar U_n\|_2 \geq C_1 \sigma n (t + \log d) + C_1 M \sqrt{n} (t + \log d)^{3/2} \right) \leq D_2 e^{-t}. \end{align*} % The main result follows by setting $C = C_1 + C_1 D_2$. \proofparagraph{moment bound} We now obtain a moment bound for the simplified version. We already have that % \begin{align*} \P\left( \|U_n\|_2 \geq 2 C M n (t + \log d)^{3/2} \right) &\leq C e^{-t}. \end{align*} % This implies that for any $t \geq \log d$, we have % \begin{align*} \P\left( \|U_n\|_2 \geq 8 C M n t^{3/2} \right) &\leq C e^{-t}. \end{align*} % Defining $s = 8 C M n t^{3/2}$ so $t = \left( \frac{s}{8C M n} \right)^{2/3}$ shows that for any $s \geq 8C M n(\log d)^{3/2}$, % \begin{align*} \P\left( \|U_n\|_2 \geq s \right) &\leq C e^{-\left( \frac{s}{8C M n} \right)^{2/3}}. 
\end{align*}
%
Hence the moment bound is obtained:
%
\begin{align*}
\E\left[ \|U_n\|_2 \right] &= \int_0^\infty \P\left( \|U_n\|_2 \geq s \right) \diff{s} \\
&= \int_0^{8C M n(\log d)^{3/2}} \P\left( \|U_n\|_2 \geq s \right) \diff{s} + \int_{8C M n(\log d)^{3/2}}^\infty \P\left( \|U_n\|_2 \geq s \right) \diff{s} \\
&\leq 8C M n(\log d)^{3/2} + \int_0^\infty C e^{-\left( \frac{s}{8C M n} \right)^{2/3}} \diff{s} \\
&= 8C M n(\log d)^{3/2} + 8C^2 M n \int_0^\infty e^{-u^{2/3}} \diff{u} \lesssim Mn(\log d)^{3/2}.
\end{align*}
\end{proof}

\subsection{Technical lemmas}

Before presenting the proof of Lemma~\ref{lem:kernel_app_maximal_entropy}, we give some auxiliary lemmas; namely a symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), a Rademacher contraction principle (Lemma~\ref{lem:kernel_app_contraction}), and a Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}). Recall that the Rademacher distribution places probability mass of $1/2$ on each of the points $-1$ and $1$.

\begin{lemma}[A symmetrization inequality for i.n.i.d.\ variables]
\label{lem:kernel_app_symmetrization}
Let $(S, \cS)$ be a measurable space and $\cF$ a class of Borel-measurable functions from $S$ to $\R$ which is pointwise measurable (i.e.\ it contains a countable dense subset under pointwise convergence). Let $X_1, \ldots, X_n$ be independent but not necessarily identically distributed $S$-valued random variables. Let $a_1, \ldots, a_n$ be arbitrary points in $S$ and $\phi$ a non-negative non-decreasing convex function from $\R$ to $\R$. Define $\varepsilon_1, \ldots, \varepsilon_n$ as independent Rademacher random variables, independent of $X_1, \ldots, X_n$. Then
%
\begin{align*}
\E \left[ \phi \left( \sup_{f \in \cF} \left| \sum_{i=1}^n \Big( f(X_i) - \E[f(X_i)] \Big) \right| \right) \right] &\leq \E \left[ \phi \left( 2 \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i \Big( f(X_i) - a_i \Big) \right| \right) \right].
\end{align*}
%
Note that in particular this holds with $a_i = 0$ and also holds with $\phi(t) = t \vee 0$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_symmetrization}]
See Lemma~2.3.6 in \citet{van1996weak}.
%
\end{proof}

\begin{lemma}[A Rademacher contraction principle]
\label{lem:kernel_app_contraction}
Let $\varepsilon_1, \ldots, \varepsilon_n$ be independent Rademacher random variables and $\cT$ be a bounded subset of $\R^n$. Define $M = \sup_{t \in \cT} \max_{1 \leq i \leq n} |t_i|$. Then, noting that the supremum is measurable because $\cT$ is a subset of a separable metric space and is therefore itself separable,
%
\begin{align*}
\E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] &\leq 4M \, \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right].
\end{align*}
%
This gives the following corollary. Let $X_1, \ldots, X_n$ be mutually independent and also independent of $\varepsilon_1, \ldots, \varepsilon_n$. Let $\cF$ be a pointwise measurable class of functions from a measurable space $(S, \cS)$ to $\R$, with measurable envelope $F$. Define $M = \max_i F(X_i)$. Then we obtain
%
\begin{align*}
\E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] &\leq 4 \E \left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right].
\end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_contraction}] Apply Theorem~4.12 from \citet{ledoux1991probability} with $F$ the identity function and % \begin{align*} \psi_i(s) = \psi(s) &= \min \left( \frac{s^2}{2M}, \frac{M}{2} \right). \end{align*} % This is a weak contraction (i.e.\ 1-Lipschitz) because it is continuous, differentiable on $(-M,M)$ with derivative bounded by $|\psi'(s)| \leq |s|/M \leq 1$, and constant outside $(-M,M)$. Note that since $|t_i| \leq M$ by definition, we have $\psi_i(t_i) = t_i^2 / (2M)$. Hence by Theorem~4.12 from \citet{ledoux1991probability}, % \begin{align*} \E \left[ F \left( \frac{1}{2} \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i \psi_i(t_i) \right| \right) \right] &\leq \E \left[ F \left( \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right) \right], \\ \E \left[ \frac{1}{2} \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i \frac{t_i^2}{2M} \right| \right] &\leq \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right], \\ \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] &\leq 4M \, \E \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right]. \end{align*} % For the corollary, set $\cT = \left\{\big(f(X_1), \ldots, f(X_n)\big) : f \in \cF\right\}$. For a fixed realization $X_1, \ldots, X_n$, % \begin{align*} \E_\varepsilon \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] &= \E_\varepsilon \left[ \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i^2 \right| \right] \\ &\leq 4 \E_\varepsilon \left[ M \sup_{t \in \cT} \left| \sum_{i=1}^n \varepsilon_i t_i \right| \right] = 4 \E_\varepsilon \left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right]. \end{align*} % Taking an expectation over $X_1, \ldots, X_n$ and applying Fubini's theorem yields the result. \end{proof} \begin{lemma}[A Hoffmann--J{\o}rgensen inequality] \label{lem:kernel_app_hoffmann} Let $(S, \cS)$ be a measurable space and $X_1, \ldots, X_n$ be $S$-valued random variables. Suppose that $\cF$ is a pointwise measurable class of functions from $S$ to $\R$ with finite envelope $F$. Let $\varepsilon_1, \ldots, \varepsilon_n$ be independent Rademacher variables independent of $X_1, \ldots, X_n$. For $q \in (1, \infty)$, % \begin{align*} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^q \right]^{1/q} \right), \end{align*} % where $C_q$ is a positive constant depending only on $q$. \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_hoffmann}] We use Talagrand's formulation of a Hoffmann--J{\o}rgensen inequality. Consider the independent $\ell^\infty(\cF)$-valued random functionals $u_i$ defined by $u_i(f) = \varepsilon_i f(X_i)$, where $\ell^\infty(\cF)$ is the Banach space of bounded functions from $\cF$ to $\R$, equipped with the norm $\|u\|_\cF = \sup_{f \in \cF} |u(f)|$. 
Then Remark~3.4 in \citet{kwapien1991hypercontraction} gives % \begin{align*} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n u_i(f) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n u_i(f) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \left| u_i(f) \right|^q \right]^{1/q} \right) \\ \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| ^q \right] ^{1/q} &\leq C_q \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^q \right]^{1/q} \right). \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_maximal_entropy}] We follow the proof of Theorem~5.2 from \citet{chernozhukov2014gaussian}, using our i.n.i.d.\ versions of the symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), Rademacher contraction principle (Lemma~\ref{lem:kernel_app_contraction}), and Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}). Without loss of generality, we may assume that $J(1, \cF, F) < \infty$ as otherwise there is nothing to prove, and that $F > 0$ everywhere on $S$. Let $\P_n = n^{-1} \sum_i \delta_{X_i}$ be the empirical distribution of $X_i$, and define the empirical variance bound $\sigma_n^2 = \sup_\cF n^{-1} \sum_i f(X_i)^2$. By the i.n.i.d.\ symmetrization inequality (Lemma~\ref{lem:kernel_app_symmetrization}), % \begin{align*} \E \left[ \sup_{f \in \cF} \big| G_n(f) \big| \right] &= \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \Big( f(X_i) - \E[f(X_i)] \Big) \right| \right] \leq \frac{2}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right], \end{align*} % where $\varepsilon_1, \ldots, \varepsilon_n$ are independent Rademacher random variables, independent of $X_1, \ldots, X_n$. Then the standard entropy integral inequality from the proof of Theorem~5.2 in the supplemental materials for \citet{chernozhukov2014gaussian} gives for a universal constant $C_1 > 0$, % \begin{align*} \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \Bigm\vert X_1, \ldots, X_n \right] &\leq C_1 \|F\|_{\P_n,2} \, J(\sigma_n / \|F\|_{\P_n,2}, \cF, F). \end{align*} % Taking marginal expectations and applying Jensen's inequality along with a convexity result for the covering integral, as in Lemma~A.2 in \citet{chernozhukov2014gaussian}, gives % \begin{align*} Z &\vcentcolon= \frac{1}{\sqrt n} \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] \leq C_1 \|F\|_{\bar\P,2} \, J(\E[\sigma_n^2]^{1/2} / \|F\|_{\bar\P,2}, \cF, F). 
\end{align*}
%
Now use symmetrization (Lemma~\ref{lem:kernel_app_symmetrization}), the contraction principle (Lemma~\ref{lem:kernel_app_contraction}), the Cauchy--Schwarz inequality, and the Hoffmann--J{\o}rgensen inequality (Lemma~\ref{lem:kernel_app_hoffmann}) to deduce that
%
\begin{align*}
\E[\sigma_n^2] &= \E\left[ \sup_{f \in \cF} \frac{1}{n} \sum_{i=1}^n f(X_i)^2 \right] \leq \sup_{f \in \cF} \E_{\bar\P} \left[ f(X_i)^2 \right] + \frac{1}{n} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n f(X_i)^2 - \E \left[ f(X_i)^2 \right] \right| \right] \\
&\leq \sigma^2 + \frac{2}{n} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i)^2 \right| \right] \leq \sigma^2 + \frac{8}{n} \E\left[ M \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] \\
&\leq \sigma^2 + \frac{8}{n} \E\left[ M^2 \right]^{1/2} \E\left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right|^2 \right]^{1/2} \\
&\leq \sigma^2 + \frac{8}{n} \|M\|_{\P,2} \, C_2 \left( \E \left[ \sup_{f \in \cF} \left| \sum_{i=1}^n \varepsilon_i f(X_i) \right| \right] + \E \left[ \max_{1 \leq i \leq n} \sup_{f \in \cF} \big| f(X_i) \big|^2 \right]^{1/2} \right) \\
&\leq \sigma^2 + \frac{8C_2}{n} \|M\|_{\P,2} \, \left( \sqrt{n} Z + \|M\|_{\P,2} \right) \lesssim \sigma^2 + \frac{\|M\|_{\P,2} Z}{\sqrt n} + \frac{\|M\|_{\P,2}^2}{n},
\end{align*}
%
where $\lesssim$ indicates a bound up to a universal constant. Hence taking a square root we see that, following the notation from the proof of Theorem~5.2 in the supplemental materials to \citet{chernozhukov2014gaussian},
%
\begin{align*}
\sqrt{\E[\sigma_n^2]} &\lesssim \sigma + \|M\|_{\P,2}^{1/2} Z^{1/2} n^{-1/4} + \|M\|_{\P,2} n^{-1/2} \lesssim \|F\|_{\bar\P,2} \left( \Delta \vee \sqrt{DZ} \right),
\end{align*}
%
where $\Delta^2 = \|F\|_{\bar\P,2}^{-2} \big(\sigma^2 \vee (\|M\|_{\P,2}^2 / n) \big) \geq \delta^2$ and $D = \|M\|_{\P,2} n^{-1/2} \|F\|_{\bar\P,2}^{-2}$. Thus returning to our bound on $Z$, we now have
%
\begin{align*}
Z &\lesssim \|F\|_{\bar\P,2} \, J(\Delta \vee \sqrt{DZ}, \cF, F).
\end{align*}
%
The final steps proceed as in the proof of Theorem~5.2 from \citet{chernozhukov2014gaussian}, considering cases separately for $\Delta \geq \sqrt{DZ}$ and $\Delta < \sqrt{DZ}$, and applying convexity properties of the entropy integral $J$.
\end{proof}

\begin{proof}[Lemma~\ref{lem:kernel_app_maximal_vc_inid}]
We assume the VC-type condition
%
$\sup_\Q N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2}) \leq (C_1/\varepsilon)^{C_2}$
%
for all $\varepsilon \in (0,1]$, with constants $C_1 \geq e$ and $C_2 \geq 1$. Hence for $\delta \in (0,1]$, the entropy integral can be bounded as
%
\begin{align*}
J\big(\delta, \cF, F\big) &= \int_0^\delta \sqrt{1 + \sup_\Q \log N(\cF, \rho_\Q, \varepsilon \|F\|_{\Q,2})} \diff{\varepsilon} \leq \int_0^\delta \sqrt{1 + C_2 \log (C_1/\varepsilon)} \diff{\varepsilon} \\
&\leq \int_0^\delta \left( 1 + \sqrt{C_2 \log (C_1/\varepsilon)} \right) \diff{\varepsilon} = \delta + \sqrt{C_2} \int_0^\delta \sqrt{\log (C_1/\varepsilon)} \diff{\varepsilon} \\
&\leq \delta + \sqrt{\frac{C_2}{\log (C_1/\delta)}} \int_0^\delta \log (C_1/\varepsilon) \diff{\varepsilon} = \delta + \sqrt{\frac{C_2}{\log (C_1/\delta)}} \big( \delta + \delta \log (C_1/\delta) \big) \\
&\leq 3 \delta \sqrt{C_2 \log (C_1/\delta)}.
\end{align*}
%
The remaining bounds now follow by Lemma~\ref{lem:kernel_app_maximal_entropy}.
\end{proof}
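For instance, suppose $\cG = \{g(\cdot, x) : x \in \cX\}$ for a compact interval $\cX \subseteq \R$ with $\|g(\cdot, x) - g(\cdot, x')\|_\infty \leq L |x - x'|$ for all $x, x' \in \cX$, and take the constant envelope $F = \sup_{x \in \cX} \|g(\cdot, x)\|_\infty$. Since any $\|\cdot\|_\infty$-cover is a $\rho_\Q$-cover for every $\Q$,
%
\begin{align*}
\sup_\Q N\big(\cG, \rho_\Q, \varepsilon \|F\|_{\Q,2}\big) \leq N\big(\cX, |\cdot|, \varepsilon F / L\big) \lesssim 1 \vee \frac{L \Leb(\cX)}{\varepsilon F},
\end{align*}
%
so $\cG$ is a VC-type class; classes of exactly this form appear in the proof of Lemma~\ref{lem:kernel_app_yurinskii_corollary}.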
Before proving Lemma~\ref{lem:kernel_app_kmt_corollary}, we give a bounded-variation characterization (Lemma~\ref{lem:kernel_app_bv_characterization}).

\begin{lemma}[A characterization of bounded-variation functions]
\label{lem:kernel_app_bv_characterization}
Let $\cV_1$ be the class of real-valued functions on $[0,1]$ which are 0 at 1 and have total variation bounded by 1. Also define the class of half-interval indicator functions $\cI = \{\I[0,t]: t \in [0,1]\}$. For any topological vector space $\cX$, define the symmetric convex hull of a subset $\cY \subseteq \cX$ as
%
\begin{align*}
\symconv \cY &= \left\{ \sum_{i=1}^n \lambda_i y_i : \sum_{i=1}^n \lambda_i = 1, \ \lambda_i \geq 0, \ y_i \in \cY \cup -\cY, \ n \in \N \right\}.
\end{align*}
%
Denote its closure by $\overline\symconv \ \cY$. Under the pointwise convergence topology, $\cV_1 \subseteq \overline\symconv \ \cI$.
\end{lemma}

\begin{proof}[Lemma~\ref{lem:kernel_app_bv_characterization}]
Firstly, let $\cD \subseteq \cV_1$ be the class of real-valued functions on $[0,1]$ which are 0 at 1, have total variation exactly 1, and are weakly monotone decreasing. Therefore, for $g \in \cD$, we have $\|g\|_\TV = g(0) = 1$. Let $S = \{s_1, s_2, \dots\} \subseteq [0,1]$ be the countable set of discontinuity points of $g$. We want to find a sequence of convex combinations of elements of $\cI$ which converges pointwise to $g$. To do this, first define the sequence of meshes
%
\begin{align*}
A_n = \{s_k : 1 \leq k \leq n\} \cup \{k/n : 0 \leq k \leq n\},
\end{align*}
%
which satisfies $\bigcup_n A_n = S \cup ([0,1] \cap \Q)$. Endow $A_n$ with the ordering induced by the canonical order on $\R$, giving $A_n = \{a_1, a_2, \ldots\}$, and define the sequence of functions
%
\begin{align*}
g_n(x) = \sum_{k = 1}^{|A_n|-1} \I[0,a_k](x) \big( g(a_k) - g(a_{k+1}) \big),
\end{align*}
%
where clearly $\I[0, a_k] \in \cI$, $g(a_k) - g(a_{k+1}) \geq 0$, and $\sum_{k = 1}^{|A_n|-1} \big( g(a_k) - g(a_{k+1}) \big) = g(0) - g(1) = 1$. Therefore $g_n$ is a convex combination of elements of $\cI$. Further, note that for $a_k \in A_n$,
%
\begin{align*}
g_n(a_k) = \sum_{j = k}^{|A_n|-1} \big( g(a_j) - g(a_{j+1}) \big) = g(a_k) - g(a_{|A_n|}) = g(a_k) - g(1) = g(a_k).
\end{align*}
%
Hence if $x \in S$, then eventually $x \in A_n$ so $g_n(x) \to g(x)$. Alternatively, if $x \not\in S$, then $g$ is continuous at $x$. But $g_n \to g$ on the dense set $\bigcup_n A_n$, so also $g_n(x) \to g(x)$. Hence $g_n \to g$ pointwise on $[0,1]$. Now take $f \in \cV_1$. By the Jordan decomposition for total variation functions \citep{royden1988real}, we can write $f = f^+ - f^-$, with $f^+$ and $f^-$ weakly decreasing, $f^+(1) = f^-(1) = 0$, and $\|f^+\|_\TV + \|f^-\|_\TV = \|f\|_\TV$. Supposing that both $\|f^+\|_\TV$ and $\|f^-\|_\TV$ are strictly positive, let $g_n^+$ approximate the unit-variation function $f^+/\|f^+\|_\TV$ and $g_n^-$ approximate $f^-/\|f^-\|_\TV$ as above. Then since trivially
%
\begin{align*}
f = \|f^+\|_\TV f^+ / \|f^+\|_\TV - \|f^-\|_\TV f^- / \|f^-\|_\TV + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV\big) \cdot 0,
\end{align*}
%
we have that the convex combination
%
\begin{align*}
g_n^+ \|f^+\|_\TV - g_n^- \|f^-\|_\TV + \big(1 - \|f^+\|_\TV - \|f^-\|_\TV\big) \cdot 0
\end{align*}
%
converges pointwise to $f$. This also holds if either of the total variations $\|f^\pm\|_\TV$ is zero, since then the corresponding sequence $g_n^\pm$ need not be defined. Now note that each of $g_n^+$, $\,-g_n^-$, and $0$ is in $\symconv \cI$, so $f \in \overline\symconv \ \cI$ under pointwise convergence.
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_kmt_corollary}]
We follow the Gaussian approximation method given in Section~2 of \citet{gine2004kernel}. The KMT approximation theorem \citep{komlos1975approximation} asserts the existence of a probability space carrying $n$ i.i.d.\ uniform random variables $\xi_1, \ldots, \xi_n \sim \Unif[0,1]$ and a standard Brownian motion $B_n(s): s \in [0,1]$ such that if
%
\begin{align*}
\alpha_n(s) &\vcentcolon= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( \I\{\xi_i \leq s\} - s \big), &\beta_n(s) &\vcentcolon= B_n(s) - s B_n(1),
\end{align*}
%
then for some universal positive constants $C_1$, $C_2$, $C_3$, and for all $t > 0$,
%
\begin{align*}
\P\left( \sup_{s \in [0,1]} \big| \alpha_n(s) - \beta_n(s) \big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \leq C_2 e^{-C_3 t}.
\end{align*}
%
We can view $\alpha_n$ and $\beta_n$ as random functionals defined on the class of half-interval indicator functions $\cI = \big\{\I[0,s]: s \in [0,1]\big\}$ in the following way.
%
\begin{align*}
\alpha_n(\I[0,s]) &= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( \I[0,s](\xi_i) - \E[\I[0,s](\xi_i)] \big), \\
\beta_n(\I[0,s]) &= \int_0^1 \I[0,s](u) \diff{B_n(u)} - B_n(1) \int_0^1 \I[0,s](u) \diff{u},
\end{align*}
%
where the integrals are defined as It{\^o} and Riemann--Stieltjes integrals in the usual way for stochastic integration against semimartingales \citep[Chapter~5]{legall2016brownian}. Now we extend their definitions to the class $\cV_1$ of functions on $[0,1]$ which are 0 at 1 and have total variation bounded by 1. This is achieved by noting that by Lemma~\ref{lem:kernel_app_bv_characterization}, we have $\cV_1 \subseteq \overline\symconv \ \cI$ where $\overline{\symconv} \ \cI$ is the smallest symmetric convex class containing $\cI$ which is closed under pointwise convergence. Thus by the dominated convergence theorem, every function in $\cV_1$ is approximated in $L^2$ by finite convex combinations of functions in $\pm\cI$, and the extension to $g \in \cV_1$ follows by linearity and $L^2$ convergence of (stochastic) integrals:
%
\begin{align*}
\alpha_n(g) &= \frac{1}{\sqrt{n}} \sum_{i=1}^n \big( g(\xi_i) - \E[g(\xi_i)] \big), &\beta_n(g) &= \int_0^1 g(s) \diff{B_n(s)} - B_n(1) \int_0^1 g(s) \diff{s}.
\end{align*}
%
Now we show that the norm induced on $(\alpha_n - \beta_n)$ by the function class $\cV_1$ is a.s.\ identical to the supremum norm. Writing the sums as integrals and using integration by parts for finite-variation Lebesgue--Stieltjes and It\^o integrals, and recalling that $g(1) = \alpha_n(0) = B_n(0) = 0$,
%
\begin{align*}
\sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| &= \sup_{g \in \cV_1} \left| \int_0^1 g(s) \diff{\alpha_n(s)} - \int_0^1 g(s) \diff{B_n(s)} + B_n(1) \int_0^1 g(s) \diff{s} \right| \\
&= \sup_{g \in \cV_1} \left| \int_0^1 \alpha_n(s) \diff{g(s)} - \int_0^1 B_n(s) \diff{g(s)} + B_n(1) \int_0^1 s \diff{g(s)} \right| \\
&= \sup_{g \in \cV_1} \left| \int_0^1 \big(\alpha_n(s) - \beta_n(s)\big) \diff{g(s)} \right| = \sup_{s \in [0,1]} \big| \alpha_n(s) - \beta_n(s) \big|,
\end{align*}
%
where in the last line the upper bound is because $\|g\|_\TV \leq 1$, and the lower bound is by taking $g_\varepsilon = \pm \I[0,s_\varepsilon]$ where $|\alpha_n(s_\varepsilon) - \beta_n(s_\varepsilon)| \geq \sup_s |\alpha_n(s) - \beta_n(s)| - \varepsilon$.
Hence we obtain % \begin{align} \label{eq:kernel_app_kmt_concentration} \P\left( \sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \leq C_2 e^{-C_3 t}. \end{align} % Now define $V_n = \sup_{x \in \R} \|g_n(\cdot, x)\|_\TV$, noting that if $V_n = 0$ then the result is trivially true by setting $Z_n = 0$. Let $F_X$ be the common c.d.f.\ of $X_i$, and define the quantile function $F_X^{-1}(s) = \inf \{u: F_X(u) \geq s\}$ for $s \in [0,1]$, writing $\inf \emptyset = \infty$ and $\inf \R = -\infty$. Consider the function class % \begin{align*} \cG_n = \big\{ V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) : x \in \R \big\}, \end{align*} % noting that $g_n(\cdot,x)$ is finite-variation so $g_n(\pm \infty, x)$ can be interpreted as the relevant limit. By monotonicity of $F_X$ and the definition of $V_n$, the members of $\cG_n$ have total variation of at most $1$ and are 0 at 1, implying that $\cG_n \subseteq \cV_1$. Noting that $\alpha_n$ and $\beta_n$ are random linear operators which a.s.\ annihilate constant functions, define % \begin{align*} Z_n(x) &= \beta_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) = V_n \beta_n \Big( V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) \Big), \end{align*} % which is a mean-zero continuous Gaussian process. Its covariance structure is % \begin{align*} &\E[Z_n(x) Z_n(x')] \\ &= \E\bigg[ \left( \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \right) \\ &\quad\times \left( \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} - B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \right) \bigg] \\ &= \E\left[ \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} \right] \\ &\quad- \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \ \E\left[ B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{B_n(s)} \right] \\ &\quad- \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ \E\left[ B_n(1) \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{B_n(s)} \right] \\ &\quad+ \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \ \E\left[ B_n(1)^2 \right] \\ &= \int_0^1 g_n\big(F_X^{-1}(s),x\big) g_n\big(F_X^{-1}(s),x'\big) \diff{s} - \int_0^1 g_n\big(F_X^{-1}(s),x\big) \diff{s} \int_0^1 g_n\big(F_X^{-1}(s),x'\big) \diff{s} \\ &= \E\Big[ g_n\big(F_X^{-1}(\xi_i), x\big) g_n\big(F_X^{-1}(\xi_i), x'\big) \Big] - \E\Big[ g_n\big(F_X^{-1}(\xi_i), x\big) \Big] \E\Big[ g_n\big(F_X^{-1}(\xi_i), x'\big) \Big] \\ &= \E\Big[ g_n\big(X_i, x\big) g_n\big(X_i, x'\big) \Big] - \E\Big[ g_n\big(X_i, x\big) \Big] \E\Big[ g_n\big(X_i, x'\big) \Big] = \E\big[ G_n(x) G_n(x') \big] \end{align*} % as desired, by the It\^o isometry for stochastic integrals, writing $B_n(1) = \int_0^1 \diff{B_n(s)}$; and noting that $F_X^{-1}(\xi_i)$ has the same distribution as $X_i$. Finally, note that % \begin{align*} G_n(x) &= \alpha_n \Big(g_n\big(F_X^{-1}(\cdot), x\big)\Big) = V_n \alpha_n \Big( V_n^{-1} g_n\big(F_X^{-1}(\cdot), x\big) - V_n^{-1} g_n\big(F_X^{-1}(1), x\big) \Big), \end{align*} % and so by \eqref{eq:kernel_app_kmt_concentration} % \begin{align*} \P\left( \sup_{x \in \R} \Big|G_n(x) - Z_n(x)\Big| > V_n \frac{t + C_1 \log n}{\sqrt n} \right) &\leq \P\left( \sup_{g \in \cV_1} \big|\alpha_n(g) - \beta_n(g)\big| > \frac{t + C_1\log n}{\sqrt{n}} \right) \\ &\leq C_2 e^{-C_3 t}. 
\end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_yurinskii_corollary}] Take $0 < \delta_n \leq \Leb(\cX_n)$ and let $\cX_n^\delta = \big\{ x_1, \dots, x_{|\cX_n^\delta|}\big\}$ be a $\delta_n$-covering of $\cX_n$ with cardinality $|\cX_n^\delta| \leq \Leb(\cX_n)/\delta_n$. Suppose that $\left|\log \delta_n\right| \lesssim C_1 \log n$ up to a universal constant. We first use the Yurinskii coupling to construct a Gaussian process $Z_n$ which is close to $G_n$ on this finite cover. Then we bound the fluctuations in $G_n$ and in $Z_n$ using entropy methods. \proofparagraph{Yurinskii coupling} Define the i.n.i.d.\ and mean-zero variables % \begin{align*} h_i(x) &= \frac{1}{\sqrt n} \Big( g_n(X_i', x) - \E[g_n(X_i', x)] \Big), \end{align*} % where $X_1', \ldots, X_n'$ are independent copies of $X_1, \ldots, X_n$ on some new probability space, so that we have $G_n(x) = \sum_{i=1}^n h_i(x)$ in distribution. Also define the length-$|\cX_n^\delta|$ random vector % \begin{align*} h_i^\delta &= \big( h_i(x): x \in \cX_n^\delta \big). \end{align*} % By an extension of Yurinskii's coupling to general norms \citep[supplemental materials, Lemma~38]{belloni2019conditional}, there exists on the new probability space a Gaussian length-$|\cX_n^\delta|$ vector $Z_n^\delta$ which is mean-zero and with the same covariance structure as $ \sum_{i=1}^n h_i^\delta $ satisfying % \begin{align*} \P\left( \bigg\| \sum_{i=1}^n h_i^\delta - Z_n^\delta \bigg\|_\infty > 3 t_n \right) \leq \min_{s > 0} \left( 2 \P\big( \|N\|_\infty > s) + \frac{\beta s^2}{t_n^3} \right), \end{align*} % where % \begin{align*} \beta = \sum_{i=1}^n \Big( \E\big[\|h_i^\delta\|_2^2 \, \|h_i^\delta\|_\infty \big] + \E\big[\|z_i\|_2^2 \, \|z_i\|_\infty \big] \Big), \end{align*} % with $z_i \sim \cN(0, \Var[h_i^\delta])$ independent and $N \sim \cN(0, I_{|\cX_n^\delta|})$. By the bounds on $g_n$, % \begin{align*} \E\big[\|h_i^\delta\|_2^2 \, \|h_i^\delta\|_\infty \, \big] \leq \frac{M_n}{\sqrt n} \E\big[\|h_i^\delta\|_2^2 \, \big] = \frac{M_n}{\sqrt n} \sum_{x \in \cX_n^\delta} \E\big[h_i(x)^2 \, \big] \leq \frac{M_n}{\sqrt n} \frac{|\cX_n^\delta| \sigma_n^2}{n} \leq \frac{M_n \sigma_n^2 \Leb(\cX_n)}{n^{3/2}\delta_n}. \end{align*} % By the fourth moment bound for Gaussian variables, % \begin{align*} \E\big[ \|z_i\|_2^4 \, \big] &\leq |\cX_n^\delta| \, \E\big[ \|z_i\|_4^4 \big] \leq |\cX_n^\delta|^2 \, \max_j \E\big[ (z_i^{(j)})^4 \big] \leq 3 |\cX_n^\delta|^2 \, \max_j \E\big[ (z_i^{(j)})^2 \big]^2 \\ &= 3 |\cX_n^\delta|^2 \, \max_{x \in \cX_n^\delta} \E\big[ h_i(x)^2 \big]^2 \leq \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2} . \end{align*} % Also by Jensen's inequality and for $|\cX_n^\delta| \geq 2$, assuming $C_1 > 1$ without loss of generality, % \begin{align*} \E\big[ \|z_i\|_\infty^2 \big] &\leq \frac{4 \sigma_n^2}{n} \log \E\big[ e^{\|z_i\|_\infty^2 / (4\sigma_n^2/n)} \big] \leq \frac{4 \sigma_n^2}{n} \log \E\left[ \sum_{j=1}^{|\cX_n^\delta|} e^{(z_i^{(j)})^2 / (4\sigma_n^2/n)} \right] \leq \frac{4\sigma_n^2}{n} \log \big(2|\cX_n^\delta|\big) \\ &\leq \frac{4\sigma_n^2}{n} \left( \log 2 + \log \Leb(\cX_n) - \log \delta_n \right) \leq \frac{12 C_1 \sigma_n^2 \log n}{n}, \end{align*} % where we used the moment generating function of a $\chi_1^2$ random variable. 
Therefore we can apply the Cauchy--Schwarz inequality to obtain
%
\begin{align*}
\E\big[\|z_i\|_2^2 \, \|z_i\|_\infty \big] &\leq \sqrt{ \E\big[\|z_i\|_2^4 \big]} \sqrt{ \E\big[ \|z_i\|_\infty^2 \big]} \leq \sqrt{ \frac{3\sigma_n^4 \Leb(\cX_n)^2}{n^2\delta_n^2}} \sqrt{ \frac{12 C_1 \sigma_n^2 \log n}{n} } \\
&\leq \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}}{n^{3/2} \delta_n}.
\end{align*}
%
Now summing over the $n$ samples gives
%
\begin{align*}
\beta \leq \frac{M_n \sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} + \frac{6\sigma_n^3 \Leb(\cX_n) \sqrt{C_1 \log n}} {\sqrt n \delta_n} = \frac{\sigma_n^2 \Leb(\cX_n)}{\sqrt n \delta_n} \Big(M_n + 6\sigma_n \sqrt{C_1 \log n}\Big).
\end{align*}
%
By a union bound and Gaussian tail probabilities, we have that $\P\big( \|N\|_\infty > s \big) \leq 2|\cX_n^\delta| e^{-s^2/2}$. Thus we get the following Yurinskii coupling inequality for all $s > 0$:
%
\begin{align*}
\P\left( \bigg\| \sum_{i=1}^n h_i^\delta - Z_n^\delta \bigg\|_\infty > t_n \right) &\leq \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big).
\end{align*}
%
Note that $Z_n^\delta$ now extends by the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to a mean-zero Gaussian process $Z_n$ on the compact interval $\cX_n$ with covariance structure
%
\begin{align*}
\E\big[ Z_n(x) Z_n(x') \big] = \E\big[ G_n(x) G_n(x') \big],
\end{align*}
%
satisfying for any $s > 0$
%
\begin{align*}
&\P\left( \sup_{x \in \cX_n^\delta} \big| G_n(x) - Z_n(x) \big| > t_n \right) \leq \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big).
\end{align*}

\proofparagraph{regularity of $G_n$}

Next we bound the fluctuations in the empirical process $G_n$. Consider the following classes of functions on $S$ and their associated (constant) envelope functions. By continuity of $g_n$, each class is pointwise measurable (to see this, restrict the index sets to rationals).
%
\begin{align*}
\cG_n &= \big\{ g_n(\cdot, x): x \in \cX_n \big\}, &\Env(\cG_n) &= M_n, \\
\cG_n^\delta &= \big\{ g_n(\cdot, x) - g_n(\cdot, x'): x, x' \in \cX_n, |x-x'| \leq \delta_n \big\}, &\Env(\cG_n^\delta) &= l_{n,\infty} \delta_n.
\end{align*}
%
We first show these are VC-type. By the uniform Lipschitz assumption,
%
\begin{align*}
\big\| g_n(\cdot, x) - g_n(\cdot, x') \big\|_\infty &\leq l_{n,\infty} |x-x'|
\end{align*}
%
for all $x,x' \in \cX_n$. Therefore, with $\Q$ ranging over the finitely-supported distributions on $(S, \cS)$, noting that any $\|\cdot\|_\infty$-cover is a $\rho_\Q$-cover,
%
\begin{align*}
\sup_\Q N\big(\cG_n, \rho_\Q, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) &\leq N\big(\cG_n, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \!\Leb(\cX_n)\big) \leq N\big(\cX_n, |\cdot|, \varepsilon \!\Leb(\cX_n)\big) \leq 1/\varepsilon.
\end{align*}
%
Replacing $\varepsilon$ by $\varepsilon M_n/(l_{n,\infty} \Leb(\cX_n))$ gives
%
\begin{align*}
\sup_\Q N\big(\cG_n, \rho_\Q, \varepsilon M_n \big) &\leq \frac{l_{n,\infty} \Leb(\cX_n)}{\varepsilon M_n},
\end{align*}
%
and so $\cG_n$ is a VC-type class. To see that $\cG_n^\delta$ is also a VC-type class, we construct a cover in the following way. Let $\cF_n$ be an $\varepsilon$-cover for $(\cG_n, \|\cdot\|_\infty)$. By the triangle inequality, $\cF_n - \cF_n$ is a $2\varepsilon$-cover for $(\cG_n - \cG_n, \|\cdot\|_\infty)$ of cardinality at most $|\cF_n|^2$, where the subtractions are set subtractions.
Since $\cG_n^\delta \subseteq \cG_n - \cG_n$, we see that $\cF_n - \cF_n$ is a $2\varepsilon$-external cover for $\cG_n^\delta$. Thus
%
\begin{align*}
\sup_\Q N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) &\leq N\big(\cG_n^\delta, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \Leb(\cX_n)\big) \\
&\leq N\big(\cG_n, \|\cdot\|_\infty, \varepsilon l_{n,\infty} \Leb(\cX_n)\big)^2 \leq 1/\varepsilon^2.
\end{align*}
%
Replacing $\varepsilon$ by $\varepsilon \delta_n/\Leb(\cX_n)$ gives
%
\begin{align*}
\sup_\Q N\big(\cG_n^\delta, \rho_\Q, \varepsilon l_{n,\infty} \delta_n \big) &\leq \frac{\Leb(\cX_n)^2}{\varepsilon^2 \delta_n^2} \leq (C_{1,n}/\varepsilon)^{2}
\end{align*}
%
with $C_{1,n} = \Leb(\cX_n) / \delta_n$, demonstrating that $\cG_n^\delta$ forms a VC-type class. We now apply the maximal inequality for i.n.i.d.\ data given in Lemma~\ref{lem:kernel_app_maximal_vc_inid}. To do this, note that $\sup_{\cG_n^\delta} \|g\|_{\bar\P,2} \leq l_{n,2} \delta_n$ by the $L^2$ Lipschitz condition, and recall $\Env(\cG_n^\delta) = l_{n,\infty} \delta_n$. Therefore Lemma~\ref{lem:kernel_app_maximal_vc_inid} with $\|F\|_{\bar\P,2} = l_{n,\infty} \delta_n$, $\|M\|_{\P,2} = l_{n,\infty} \delta_n$, and $\sigma = l_{n,2} \delta_n$ gives, up to universal constants,
%
\begin{align*}
&\E\left[ \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| \right] \\
&\quad\lesssim \sigma \sqrt{2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} + \frac{\|M\|_{\P,2} 2 \log \big(C_{1,n} \|F\|_{\bar\P,2}/\sigma\big)} {\sqrt{n}} \\
&\quad\lesssim l_{n,2} \delta_n \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{\sqrt n} C_1 \log n,
\end{align*}
%
and hence by Markov's inequality,
%
\begin{align*}
&\P\left( \sup_{|x-x'| \leq \delta_n} \big| G_n(x) - G_n(x') \big| > t_n \right) \\
&= \P\left( \sup_{|x-x'| \leq \delta_n} \frac{1}{\sqrt{n}} \left| \sum_{i=1}^n \Big( g_n(X_i, x) - \E[g_n(X_i, x)] - g_n(X_i, x') + \E[g_n(X_i, x')] \Big) \right| > t_n \right) \\
&= \P\left( \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| > t_n \right) \leq \frac{1}{t_n} \E\left[ \sup_{g \in \cG_n^\delta} \left| \frac{1}{\sqrt{n}} \sum_{i=1}^n \Big( g(X_i) - \E[g(X_i)] \Big) \right| \right] \\
&\lesssim \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n.
\end{align*}

\proofparagraph{regularity of $Z_n$}

Next we bound the fluctuations in the Gaussian process $Z_n$. Let $\rho$ be the following semimetric:
%
\begin{align*}
\rho(x, x')^2 &= \E\big[\big( Z_n(x) - Z_n(x') \big)^2\big] = \E\big[\big( G_n(x) - G_n(x') \big)^2\big] \\
&= \sum_{i=1}^n \E\big[\big( h_i(x) - h_i(x') \big)^2\big] \leq \frac{1}{n} \sum_{i=1}^n \E\big[\big( g_n(X_i, x) - g_n(X_i, x') \big)^2\big] \leq l_{n,2}^2 \, |x - x'|^2.
\end{align*}
%
Hence $\rho(x, x') \leq l_{n,2} \, |x - x'|$.
By the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we obtain that % \begin{align*} &\E\bigg[ \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| \bigg] \lesssim \E\bigg[ \sup_{\rho(x,x') \leq l_{n,2} \delta_n} \big| Z_n(x) - Z_n(x') \big| \bigg] \\ &\quad\leq \int_0^{l_{n,2} \delta_n} \sqrt{\log N(\varepsilon, \cX_n, \rho)} \diff{\varepsilon} \leq \int_0^{l_{n,2} \delta_n} \sqrt{\log N(\varepsilon / l_{n,2}, \cX_n, |\cdot|)} \diff{\varepsilon} \\ &\quad\leq \int_0^{l_{n,2} \delta_n} \sqrt{\log \left( 1 + \frac{\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} \diff{\varepsilon} \leq \int_0^{l_{n,2} \delta_n} \sqrt{\log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right)} \diff{\varepsilon} \\ &\quad\leq \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} \int_0^{l_{n,2} \delta_n} \log \left( \frac{2\Leb(\cX_n) l_{n,2}}{\varepsilon} \right) \diff{\varepsilon} \\ &\quad= \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} \left( l_{n,2} \delta_n \log \left( 2 \Leb(\cX_n) l_{n,2} \right) + l_{n,2} \delta_n + l_{n,2} \delta_n \log \left( \frac{1}{l_{n,2} \delta_n} \right) \right) \\ &\quad= \log \left(\frac{2\Leb(\cX_n)}{\delta_n} \right)^{-1/2} l_{n,2} \delta_n \left( 1 + \log \left( \frac{2\Leb(\cX_n)}{\delta_n} \right) \right) \lesssim l_{n,2} \delta_n \sqrt{\log \left( \frac{\Leb(\cX_n)}{\delta_n} \right)} \\ &\quad\lesssim l_{n,2} \delta_n \sqrt{C_1 \log n}, \end{align*} % where we used that $\delta_n \leq \Leb(\cX_n)$. So by Markov's inequality, % \begin{align*} \P\left( \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| > t_n \right) &\lesssim t_n^{-1} l_{n,2} \delta_n \sqrt{C_1 \log n}. \end{align*} \proofparagraph{conclusion} By the results of the previous parts, we have up to universal constants that % \begin{align*} &\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\ &\quad\leq \P\left( \sup_{x \in \cX_n^\delta} \big| G_n(x) - Z_n(x) \big| > t_n / 3 \right) + \P\left( \sup_{|x-x'| \leq \delta_n} \big| G_n(x) - G_n(x') \big| > t_n / 3 \right) \\ &\qquad+ \P\left( \sup_{|x - x'| \leq \delta_n} \big| Z_n(x) - Z_n(x') \big| > t_n / 3 \right) \\ &\quad\lesssim \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ &\qquad+ \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n. 
\end{align*} % Choosing an approximately optimal mesh size of % \begin{align*} \delta_n &= \sqrt{ \frac{\sigma_n^2 \Leb(\cX_n) \log n}{\sqrt n t_n^3} \Big(M_n + \sigma_n \sqrt{\log n}\Big) } \Bigg/ \sqrt{ t_n^{-1} l_{n,2} \sqrt{\log n} \left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt{n}} \right) } \end{align*} % gives $|\log \delta_n| \lesssim C_1 \log n$ for a universal constant $C_1$, so with $s$ a large enough multiple of $\sqrt{\log n}$, % \begin{align*} &\P\left( \sup_{x \in \cX_n} \big| G_n(x) - Z_n(x) \big| > t_n \right) \\ &\quad\lesssim \frac{4 \Leb(\cX_n)}{\delta_n} e^{-s^2/2} + \frac{\sigma_n^2 \Leb(\cX_n) s^2}{\sqrt n \delta_n t_n^3} \Big(M_n + 6 \sigma_n \sqrt{C_1 \log n}\Big) \\ &\qquad+ \frac{l_{n,2} \delta_n}{t_n} \sqrt{C_1 \log n} + \frac{l_{n,\infty} \delta_n}{t_n \sqrt n} C_1 \log n \\ &\quad\lesssim \delta_n \frac{l_{n,2} \sqrt {\log n}}{t_n} \left( 1 + \frac{l_{n,\infty} \sqrt{\log n}}{l_{n,2} \sqrt n} \right) \\ &\quad\lesssim \frac{\sigma_n \sqrt{\Leb(\cX_n)} \sqrt{\log n} \sqrt{M_n + \sigma_n \sqrt{\log n}}} {n^{1/4} t_n^2} \sqrt{l_{n,2} \sqrt {\log n} + \frac{l_{n,\infty}}{\sqrt n} \log n}. \end{align*} % \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_vbp}] The proof is by induction on the number of vertices in the tree. Let $\cT$ have $n$ vertices, and suppose that vertex $n$ is a leaf connected to vertex $n-1$ by an edge, relabeling the vertices if necessary. By the induction hypothesis we assume that there is a probability measure $\P^{(n-1)}$ on $\prod_{i=1}^{n-1} \cX_i$ whose projections onto $\cX_i$ are $\P_i$ and whose projections onto $\cX_i \times \cX_j$ are $\P_{i j}$, for $i,j \leq n-1$. Now apply the original Vorob'ev--Berkes--Philipp theorem, which can be found as Theorem~1.1.10 in \citet{dudley1999uniform}, to the spaces $\prod_{i=1}^{n-2} \cX_i$,\, $\cX_{n-1}$, and $\cX_n$; and to the laws $\P^{(n-1)}$ and $\P_{n-1, n}$. This gives a law $\P^{(n)}$ which agrees with $\P_i$ at every vertex by definition, and agrees with $\P_{i j}$ for all $i,j \leq n-1$. It also agrees with $\P_{n-1,n}$, and this is the only edge touching vertex $n$. Hence $\P^{(n)}$ satisfies the desired properties. \end{proof} \subsection{Main results} \label{sec:kernel_app_main} We give supplementary details for our main results on consistency, minimax optimality, strong approximation, covariance estimation, feasible inference, and counterfactual estimation. We begin with a basic fact about Lipschitz functions. \begin{lemma}[Lipschitz kernels are bounded] \label{lem:kernel_app_lipschitz_kernels_bounded} Let $\cX \subseteq \R$ be a connected set. Let $f: \cX \to \R$ satisfy the Lipschitz condition $|f(x) - f(x')| \leq C |x-x'|$ for some $C > 0$ and all $x, x' \in \cX$. Suppose also that $f$ is a kernel in the sense that $\int_\cX f(x) \diff{x} = 1$. Then we have % \begin{align*} \sup_{x \in \cX} |f(x)| &\leq C \Leb(\cX) + \frac{1}{\Leb(\cX)}. \end{align*} % Now let $g: \cX \to [0,\infty)$ satisfy $|g(x) - g(x')| \leq C |x-x'|$ for some $C > 0$ and all $x, x' \in \cX$. Suppose $g$ is a sub-kernel with $\int_\cX g(x) \diff{x} \leq 1$. Then for any $M \in \big(0, \Leb(\cX)\big]$, we have % \begin{align*} \sup_{x \in \cX} g(x) &\leq C M + \frac{1}{M}. \end{align*} \end{lemma} Applying Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} to the density and kernel functions defined in Assumptions~\ref{ass:kernel_data} and~\ref{ass:kernel_bandwidth} yields the following.
Firstly, since $k_h(\cdot, w)$ is $C_\rL / h^2$-Lipschitz on $[w \pm h] \cap \cW$ and integrates to one, we have by the first inequality in Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} that % \begin{align*} |k_h(s,w)| &\leq \frac{2 C_\rL + 1}{h} + \frac{1}{\Leb(\cW)}. \end{align*} % Since each of $f_{W \mid AA}(\cdot \mid a,a')$, $f_{W \mid A}(\cdot \mid a)$, and $f_W$ is non-negative, $C_\rH$-Lipschitz on $\cW$, and integrates to at most one over $\cW$, taking $M = \frac{1}{\sqrt{C_\rH}} \wedge \Leb(\cW)$ in the second inequality in Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded} gives % \begin{align*} f_{W \mid AA}(w \mid a,a') &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ f_{W \mid A}(w \mid a) &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}, \\ f_W(w) &\leq 2 \sqrt{C_\rH} + \frac{1}{\Leb(\cW)}. \end{align*} \begin{proof}[Lemma~\ref{lem:kernel_app_lipschitz_kernels_bounded}] We begin with the first inequality. Note that if $\Leb(\cX) = \infty$ there is nothing to prove. Suppose for contradiction that $|f(x)| > C \Leb(\cX) + \frac{1}{\Leb(\cX)}$ for some $x \in \cX$. If $f(x) \geq 0$ then by the Lipschitz property, for any $y \in \cX$, % \begin{align*} f(y) \geq f(x) - C|y-x| > C \Leb(\cX) + \frac{1}{\Leb(\cX)} - C\Leb(\cX) = \frac{1}{\Leb(\cX)}. \end{align*} % Similarly, if $f(x) \leq 0$ then % \begin{align*} f(y) \leq f(x) + C|y-x| < - C \Leb(\cX) - \frac{1}{\Leb(\cX)} + C\Leb(\cX) = -\frac{1}{\Leb(\cX)}. \end{align*} % But then either $\int_\cX f(y) \diff{y} > \int_\cX 1/\Leb(\cX) \diff{y} = 1$ or $\int_\cX f(y) \diff{y} < \int_\cX -1/\Leb(\cX) \diff{y} = -1$, in either case contradicting $\int_\cX f(y) \diff{y} = 1$. For the second inequality, recall that $g$ is non-negative on $\cX$, and take $M \in \big(0, \Leb(\cX)\big]$. Suppose for contradiction that $g(x) > C M + \frac{1}{M}$ for some $x \in \cX$. Then by the Lipschitz property, $g(y) \geq 1/M$ for all $y$ such that $|y - x| \leq M$. Since $\cX$ is connected, we have $\Leb(\cX \cap [x \pm M]) \geq M$ and so we deduce that $\int_\cX g(y) \diff{y} > M/M = 1$, which is a contradiction. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_bias}] Begin by defining % \begin{align*} P_p(s,w) &= \sum_{r = 0}^p \frac{f_W^{(r)}(w)}{r!} {(s-w)^r} \end{align*} % for $s, w \in \cW$ as the degree-$p$ Taylor polynomial of $f_W$, centered at $w$ and evaluated at $s$. Note that for $p \leq \flbeta-1$, by Taylor's theorem with Lagrange remainder, % \begin{align*} f_W(s) - P_p(s,w) &= \frac{f_W^{(p+1)}(w')}{(p+1)!} (s-w)^{p+1} \end{align*} % for some $w'$ between $w$ and $s$. Also note that for any $p$, % \begin{align*} \int_{\cW} k_h(s,w) \big( P_p(s,w) - P_{p-1}(s,w) \big) \diff{s} &= \int_{\cW} k_h(s,w) \frac{f_W^{(p)}(w)}{p!} (s-w)^p \diff{s} = h^p b_p(w). \end{align*} % Further, by the order of the kernel, % \begin{align*} \E\big[\hat f_W(w)\big] - f_W(w) &= \int_{\cW} k_h(s,w) f_W(s) \diff{s} - f_W(w) = \int_{\cW} k_h(s,w) \big(f_W(s) - f_W(w)\big) \diff{s} \\ &= \int_{\cW} k_h(s,w) \big(f_W(s) - P_{p-1}(s,w)\big) \diff{s}. \end{align*} \proofparagraph{low-order kernel} Suppose that $p \leq \flbeta - 1$.
Then % \begin{align*} &\sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) - h^p b_p(w) \big| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big(f_W(s) - P_{p-1}(s,w)\big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{p}(s,w) + P_{p}(s,w) - P_{p-1}(s,w) \big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{p}(s,w) \big) \diff{s} \right| = \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \frac{f_W^{(p+1)}(w')}{(p+1)!} (s-w)^{p+1} \diff{s} \right| \\ &\quad\leq \sup_{w \in \cW} \left| \int_{[w \pm h]} \frac{C_\rk}{h} \frac{C_\rH}{(p+1)!} h^{p+1} \diff{s} \right| \leq \frac{2C_\rk C_\rH}{(p+1)!} h^{p+1}. \end{align*} \proofparagraph{order of kernel matches smoothness} Suppose that $p = \flbeta$. Then % \begin{align*} &\sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) - h^p b_p(w) \big| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big(f_W(s) - P_{\flbeta - 1}(s,w)\big) \diff{s} - h^p b_p(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) + P_{\flbeta}(s,w) - P_{\flbeta - 1}(s,w) \big) \diff{s} - h^{\flbeta} b_{\flbeta}(w) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) \big) \diff{s} \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) \frac{f_W^{(\flbeta)}(w') - f_W^{(\flbeta)}(w)}{\flbeta!} (s-w)^{\flbeta} \diff{s} \right| \\ &\quad\leq \sup_{w \in \cW} \left| \int_{[w \pm h]} \frac{C_\rk}{h} \frac{C_\rH h^{\beta - \flbeta}}{\flbeta !} h^{\flbeta} \diff{s} \right| \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} \proofparagraph{high-order kernel} Suppose that $p \geq \flbeta+1$. Then as in the previous part % \begin{align*} \sup_{w \in \cW} \big| \E[\hat f_W(w)] - f_W(w) \big| &= \sup_{w \in \cW} \left| \int_{[w \pm h] \cap \cW} \!\!\!\! k_h(s,w) \big( f_W(s) - P_{\flbeta}(s,w) \big) \diff{s} \right| \leq \frac{2 C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_hoeffding}] \proofparagraph{Hoeffding-type decomposition} We have % \begin{align*} \hat f_W(w) - E_n(w) - \E[\hat f_W(w)] &= \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( \E[k_h(W_{i j},w) \mid A_i, A_j] - \E[k_h(W_{i j},w)] \Big) \\ &= \frac{1}{n(n-1)} \sum_{i=1}^{n} \sum_{j \neq i} \Big( \E[k_h(W_{i j},w) \mid A_i, A_j] - \E[k_h(W_{i j},w)] \Big), \end{align*} % and apply Lemma~\ref{lem:kernel_app_general_hoeffding} with % \begin{align*} u_{i j} &= \E\big[k_h(W_{i j},w) \mid A_i, A_j\big], &u_i &= \E\big[k_h(W_{i j},w) \mid A_i\big], \\ u &= \E\big[k_h(W_{i j},w)\big], \end{align*} % to see % \begin{align*} \hat f_W(w) - E_n(w) - \E[\hat f_W(w)] &= \frac{2}{n} \sum_{i=1}^n \big(u_i - u\big) + \frac{1}{n(n-1)} \sum_{i=1}^n \sum_{j \neq i} \big( u_{i j} - u_i - u_j + u \big) \\ &= \frac{2}{n} \sum_{i=1}^n l_i(w) + \frac{2}{n(n-1)} \sum_{i=1}^n \sum_{j = i+1}^n q_{i j}(w) = L_n(w) + Q_n(w). \end{align*} \proofparagraph{expectation and covariance of $L_n$, $Q_n$, and $E_n$} $L_n$, $Q_n$, and $E_n$ are clearly mean-zero.
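Indeed, by the tower property, % \begin{align*} \E[l_i(w)] &= \E[u_i] - u = 0, &\E[q_{i j}(w)] &= \E[u_{i j}] - \E[u_i] - \E[u_j] + u = 0, \end{align*} % while $\E[e_{i j}(w)] = \E\big[ \E[e_{i j}(w) \mid A_i, A_j] \big] = 0$.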
For orthogonality, note that their summands have the following properties, for any $1 \leq i < j \leq n$ and $1 \leq r < s \leq n$, and for any $w, w' \in \cW$: % \begin{align*} \E\big[ l_i(w) q_{rs}(w') \big] &= \E\big[ l_i(w) \E\big[ q_{rs}(w') \mid A_i \big] \big] = 0, \\ \E\big[ l_i(w) e_{rs}(w') \big] &= \begin{cases} \E\big[ l_i(w) \big] \E\big[ e_{rs}(w') \big], \text{ if } i \notin \{r,s\}, \\ \E\big[ l_i(w) \E\big[ e_{rs}(w') \mid A_r, A_s \big] \big], \text{ if } i \in \{r,s\}, \end{cases} \\ &= 0, \\ \E\big[ q_{i j}(w) e_{rs}(w') \big] &= \begin{cases} \E\big[ q_{i j}(w) \big] \E\big[ e_{rs}(w') \big], \text{ if } \{i,j\} \cap \{r,s\} = \emptyset, \\ \E\big[ \E\big[ q_{i j}(w) \mid A_i \big] \E\big[ e_{rs}(w') \mid A_i \big] \big], \text{ if } \{i,j\} \cap \{r,s\} = \{i\}, \\ \E\big[ \E\big[ q_{i j}(w) \mid A_j \big] \E\big[ e_{rs}(w') \mid A_j \big] \big], \text{ if } \{i,j\} \cap \{r,s\} = \{j\}, \\ \E\big[ q_{i j}(w) \E\big[ e_{rs}(w') \mid A_r, A_s \big] \big], \text{ if } \{i,j\} = \{r,s\}, \end{cases} \\ &= 0, \end{align*} % by independence of $\bA_n$ and $\bV_n$ and as $\E[q_{rs}(w) \mid A_i] = 0$ and $\E[e_{i j}(w) \mid A_i, A_j] = 0$. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_trichotomy}] \proofparagraph{total degeneracy} Suppose $\Dl = 0$, so $\Var[f_{W \mid A}(w \mid A_i)] = 0$ for all $w \in \cW$. Therefore, since $\E[f_{W \mid A}(w \mid A_i)] = f_W(w)$, for each $w \in \cW$ we have $f_{W \mid A}(w \mid A_i) = f_W(w)$ almost surely. By taking a union over $\cW \cap \Q$ and by continuity of $f_{W \mid A}$ and $f_W$, this implies that $f_{W \mid A}(w \mid A_i) = f_W(w)$ for all $w \in \cW$ almost surely. Thus % \begin{align*} \E\left[ k_h(W_{i j},w) \mid A_i \right] &= \int_{\cW} k_h(s,w) f_{W \mid A}(s \mid A_i) \diff{s} = \int_{\cW} k_h(s,w) f_W(s) \diff{s} = \E\left[ k_h(W_{i j},w) \right] \end{align*} % for all $w \in \cW$ almost surely. Hence $l_i(w) = 0$ and so $L_n(w) = 0$ for all $w \in \cW$ almost surely. \proofparagraph{no degeneracy} Suppose $\Dl > 0$. As $f_{W|A}(\cdot \mid a)$ is $C_\rH$-Lipschitz for all $a \in \cA$ and since $|k_h| \leq C_\rk/h$, % \begin{align*} &\sup_{w \in \cW} \left| \E[k_h(W_{i j},w) \mid A_i] - f_{W \mid A}(w \mid A_i) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW} k_h(s,w) f_{W \mid A}(s \mid A_i) \diff{s} - f_{W \mid A}(w \mid A_i) \right| \\ &\quad= \sup_{w \in \cW} \left| \int_{\cW \cap [w \pm h]} k_h(s,w) \left( f_{W \mid A}(s \mid A_i) - f_{W \mid A}(w \mid A_i) \right) \diff{s} \right| \\ &\quad\leq 2h \frac{C_\rk}{h} C_\rH h \leq 2 C_\rk C_\rH h \end{align*} % almost surely. Therefore, since $f_{W \mid A}(w \mid a) \leq C_\rd$, we have % \begin{align*} \sup_{w \in \cW} \left| \Var\big[ \E[k_h(W_{i j},w) \mid A_i] \big] - \Var\left[ f_{W \mid A}(w \mid A_i) \right] \right| &\leq 16 C_\rk C_\rH C_\rd h \end{align*} % whenever $h$ is small enough that $2 C_\rk C_\rH h \leq C_\rd$. Thus % \begin{align*} \inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\geq \inf_{w \in \cW}\Var[f_{W \mid A}(w \mid A_i)] - 16 C_\rk C_\rH C_\rd h. \end{align*} % Therefore, if $\Dl > 0$, then eventually $\inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \Dl/2$. Finally, % \begin{align*} \inf_{w \in \cW}\Var[L_n(w)] &= \frac{4}{n} \inf_{w \in \cW} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] \geq \frac{2 \Dl}{n}. \end{align*} \proofparagraph{partial degeneracy} Since $f_{W \mid A}(w \mid A_i)$ is bounded by $C_\rd$ and $C_\rH$-Lipschitz in $w$, we have that $\Var[f_{W \mid A}(w \mid A_i)]$ is continuous on $\cW$.
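This continuity can be quantified: writing $X = f_{W \mid A}(w \mid A_i)$ and $X' = f_{W \mid A}(w' \mid A_i)$ for $w, w' \in \cW$, we have $\|X - X'\|_\infty \leq C_\rH |w-w'|$ and $\|X\|_\infty \vee \|X'\|_\infty \leq C_\rd$, so % \begin{align*} \big| \Var[X] - \Var[X'] \big| &\leq \big| \E[X^2] - \E[X'^2] \big| + \big| (\E X)^2 - (\E X')^2 \big| \leq 4 C_\rd C_\rH |w-w'|. \end{align*} %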
Thus if $\Dl = 0$, there is at least one point $w \in \cW$ for which $\Var[f_{W \mid A}(w \mid A_i)] = 0$ by compactness. Let $w$ be any such degenerate point. Then by the previous part, % \begin{align*} \Var[L_n(w)] = \frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\leq 64 C_\rk C_\rH C_\rd \frac{h}{n}. \end{align*} % If conversely $w$ is not a degenerate point then $\Var[f_{W \mid A}(w \mid A_i)] > 0$ so eventually % \begin{align*} \Var[L_n(w)] = \frac{4}{n} \Var\big[\E[k_h(W_{i j},w) \mid A_i]\big] &\geq \frac{2}{n} \Var[f_{W \mid A}(w \mid A_i)]. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_uniform_concentration}] We establish VC-type properties of function classes and apply empirical process theory. \proofparagraph{establishing VC-type classes} Consider the following function classes: % \begin{align*} \cF_1 &= \Big\{ W_{i j} \mapsto k_h(W_{i j},w) : w \in \cW \Big\}, \\ \cF_2 &= \Big\{ (A_i, A_j) \mapsto \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] : w \in \cW \Big\}, \\ \cF_3 &= \Big\{ A_i \mapsto \E\big[ k_h(W_{i j},w) \mid A_i \big] : w \in \cW \Big\}. \end{align*} % For $\cF_1$, take $0 < \varepsilon \leq \Leb(\cW)$ and $\cW_\varepsilon$ an $\varepsilon$-cover of $\cW$ of cardinality at most $\Leb(\cW)/\varepsilon$. As % \begin{align*} \sup_{s, w, w' \in \cW} \left| \frac{k_h(s,w) - k_h(s,w')} {w-w'} \right| &\leq \frac{C_\rL}{h^2} \end{align*} % almost surely, we see that % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\rL}{h^2} \varepsilon \right) &\leq N\left(\cF_1, \|\cdot\|_\infty, \frac{C_\rL}{h^2} \varepsilon \right) \leq \frac{\Leb(\cW)}{\varepsilon}, \end{align*} % where $\Q$ ranges over Borel probability measures on $\cW$. Since $\frac{C_\rk}{h}$ is an envelope for $\cF_1$, % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\rk}{h} \varepsilon \right) &\leq \frac{C_\rL}{C_\rk} \frac{\Leb(\cW)}{h \varepsilon}. \end{align*} % Thus for all $\varepsilon \in (0,1]$, % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \frac{C_\rk}{h} \varepsilon \right) &\leq \frac{C_\rL}{C_\rk} \frac{\Leb(\cW) \vee 1}{h \varepsilon} \leq (C_1/(h\varepsilon))^{C_2}, \end{align*} % where $C_1 = \frac{C_\rL}{C_\rk} (\Leb(\cW) \vee 1)$ and $C_2 = 1$. Next, $\cF_2$ forms a smoothly parameterized class of functions since for $w,w' \in \cW$ we have by the uniform Lipschitz properties of $f_{W \mid AA}(\cdot \mid A_i, A_j)$ and $k_h(s, \cdot)$, with $|w-w'| \leq h$, % \begin{align*} &\left| \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] - \E\big[ k_h(W_{i j},w') \mid A_i, A_j \big] \right| \\ &\quad= \left| \int_{[w \pm h] \cap \cW} k_h(s,w) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} - \int_{[w' \pm h] \cap \cW} k_h(s,w') f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\quad= \left| \int_{[w \pm 2h] \cap \cW} \big( k_h(s,w) - k_h(s,w') \big) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\quad= \left| \int_{[w \pm 2h] \cap \cW} \big( k_h(s,w) - k_h(s,w') \big) \big( f_{W \mid AA}(s \mid A_i, A_j) - f_{W \mid AA}(w \mid A_i, A_j) \big) \diff{s} \right| \\ &\quad\leq 4h \frac{C_\rL}{h^2} |w-w'| 2 C_\rH h \leq 8 C_\rL C_\rH |w-w'| \leq C_3 |w-w'|, \end{align*} % where $C_3 = 8 C_\rL C_\rH$. The same holds for $|w-w'| > h$ as the Lipschitz property is local. By taking $\E[\, \cdot \mid A_i]$, it can be seen by the contraction property of conditional expectation that the same holds for the singly-conditioned terms: % \begin{align*} \left| \E\big[ k_h(W_{i j},w) \mid A_i \big] - \E\big[ k_h(W_{i j},w') \mid A_i \big] \right| &\leq C_3 |w-w'|.
\end{align*} % Therefore $\cF_3$ is also smoothly parameterized in exactly the same manner. Let % \begin{align*} C_4 &= \sup_{w \in \cW} \esssup_{A_i, A_j} \big| \E\big[ k_h(W_{i j},w) \mid A_i, A_j \big] \big| \\ &= \sup_{w \in \cW} \esssup_{A_i, A_j} \left| \int_{[w \pm h] \cap \cW} k_h(s,w) f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right| \\ &\leq 2h \frac{C_\rk}{h} C_\rd \leq 2 C_\rk C_\rd. \end{align*} % For $\varepsilon \in (0,1]$, take an $(\varepsilon C_4/C_3)$-cover of $\cW$ of cardinality at most $C_3 \Leb(\cW) / (\varepsilon C_4)$. By the above parameterization properties, this cover induces an $\varepsilon C_4$-cover for both $\cF_2$ and $\cF_3$: % \begin{align*} \sup_\Q N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) &\leq N\big(\cF_2, \|\cdot\|_\infty, \varepsilon C_4 \big) \leq C_3 \Leb(\cW) / (\varepsilon C_4), \\ \sup_\Q N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) &\leq N\big(\cF_3, \|\cdot\|_\infty, \varepsilon C_4 \big) \leq C_3 \Leb(\cW) / (\varepsilon C_4). \end{align*} % Hence $\cF_1$, $\cF_2$, and $\cF_3$ form VC-type classes with envelopes $F_1 = C_\rk / h$ and $F_2 = F_3 = C_4$: % \begin{align*} \sup_\Q N\left(\cF_1, \rho_\Q, \varepsilon C_\rk / h \right) &\leq (C_1/(h\varepsilon))^{C_2}, &\sup_\Q N\big(\cF_2, \rho_\Q, \varepsilon C_4 \big) &\leq (C_1/\varepsilon)^{C_2}, \\ \sup_\Q N\big(\cF_3, \rho_\Q, \varepsilon C_4 \big) &\leq (C_1/\varepsilon)^{C_2}, \end{align*} % for some constants $C_1 \geq e$ and $C_2 \geq 1$, where we augment the constants if necessary. \proofparagraph{controlling $L_n$} Observe that $\sqrt{n}L_n$ is the empirical process of the i.i.d.\ variables $A_i$ indexed by $\cF_3$. We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} with $\sigma = C_4$: % \begin{align*} \E \left[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) \big| \right] &\lesssim C_4 \sqrt{C_2 \log C_1} + \frac{C_4 C_2 \log C_1} {\sqrt{n}} \lesssim 1. \end{align*} % By Lemma~\ref{lem:kernel_trichotomy}, the left hand side is zero whenever $\Du = 0$, so we can also write % \begin{align*} \E \left[ \sup_{w \in \cW} \big| \sqrt{n} L_n(w) \big| \right] &\lesssim \Du. \end{align*} \proofparagraph{controlling $Q_n$} Observe that $n Q_n$ is the completely degenerate second-order U-process of the i.i.d.\ variables $A_i$ indexed by $\cF_2$. This function class is again uniformly bounded and VC-type, so applying the U-process maximal inequality from Lemma~\ref{lem:kernel_app_uprocess_maximal} yields, with $\sigma = C_4$, % \begin{align*} \E \left[ \sup_{w \in \cW} \big| n Q_n(w) \big| \right] &\lesssim C_4 C_2 \log C_1 + \frac{C_4 (C_2 \log C_1)^2} {\sqrt{n}} \lesssim 1. \end{align*} \proofparagraph{controlling $E_n$} Conditional on $\bA_n$, note that $n E_n$ is the empirical process of the conditionally i.n.i.d.\ variables $W_{i j}$ indexed by $\cF_1$.
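Here the normalization matches a sample of size $N = \frac{1}{2} n(n-1)$: writing $e_{i j}(w) = k_h(W_{i j},w) - \E[k_h(W_{i j},w) \mid A_i, A_j]$, we have % \begin{align*} n E_n(w) &= \frac{2}{n-1} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w) = \sqrt{\frac{2n}{n-1}} \cdot \frac{1}{\sqrt{N}} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} e_{i j}(w), \end{align*} % and $\sqrt{2n/(n-1)} \leq 2$, so $n E_n$ is, up to a bounded factor, a standardized empirical process over the $N$ pairs.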
We apply Lemma~\ref{lem:kernel_app_maximal_vc_inid} conditionally with % \begin{align*} \sigma^2 &= \sup_{w \in \cW} \E\Big[ \big( k_h(W_{i j},w) - \E[k_h(W_{i j},w) \mid A_i, A_j] \big)^2 \mid A_i, A_j \Big] \leq \sup_{w \in \cW} \E\Big[ k_h(W_{i j},w)^2 \mid A_i, A_j \Big] \\ &\leq \sup_{w \in \cW} \int_{[w \pm h] \cap \cW} k_h(s,w)^2 f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \leq 2h \frac{C_\rk^2}{h^2} \lesssim 1/h \end{align*} % and noting that we have a sample size of $\frac{1}{2}n(n-1)$, giving % \begin{align*} \E \left[ \sup_{w \in \cW} \big| n E_n(w) \big| \right] &\lesssim \sigma \sqrt{C_2 \log \big((C_1/h) F_1 / \sigma \big)} + \frac{F_1 C_2 \log \big((C_1/h) F_1 / \sigma\big)} {n} \\ &\lesssim \frac{1}{\sqrt h} \sqrt{C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} + \frac{(C_\rk/h) C_2 \log \big((C_1/h) (C_\rk/h) \sqrt h \big)} {n} \\ &\lesssim \sqrt{\frac{\log 1/h}{h}} + \frac{\log \big(1/h\big)} {n h} \lesssim \sqrt{\frac{\log n}{h}}, \end{align*} % where the last line follows by the bandwidth assumption of $\frac{\log n}{n^2h} \to 0$. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_uniform_consistency}] This follows from Theorem~\ref{thm:kernel_bias} and Lemma~\ref{lem:kernel_uniform_concentration}. \end{proof} Before proving Theorem~\ref{thm:kernel_minimax} we first give a lower bound result for parametric point estimation in Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}. \begin{lemma}[A Neyman--Pearson result for Bernoulli random variables] \label{lem:kernel_app_neyman_pearson_bernoulli} Recall that the Bernoulli distribution $\Ber(\theta)$ places mass $\theta$ at $1$ and mass $1-\theta$ at $0$. Define $\P_\theta^n$ as the law of $(A_1, A_2, \ldots, A_n, V)$, where $A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, and $V$ is an $\R^d$-valued random variable for some $d \geq 1$ which is independent of the $A$ variables and with a fixed distribution that does not depend on $\theta$. Let $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$. Then for any estimator $\tilde \theta_n$ which is a function of $(A_1, A_2, \ldots, A_n, V)$ only, % \begin{align*} \P_{\theta_0}^n \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}] Let $f: \{0,1\}^n \times \R^d \to \{0,1\}$ be any measurable function. Considering this function as a statistical test, the Neyman--Pearson lemma and Pinsker's inequality \citep{gine2021mathematical} give % \begin{align*} \P_{\theta_0}^n \big( f=1 \big) +\P_{\theta_{1,n}}^n \big( f=0 \big) &\geq 1- \TV\left( \P_{\theta_0}^n, \P_{\theta_{1,n}}^n \right) \geq 1- \sqrt{ \frac{1}{2} \KL \left( \P_{\theta_0}^n \bigm\| \P_{\theta_{1,n}}^n \right)} \\ &= 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right) + \frac{1}{2} \KL \left( V \bigm\| V \right)} \\ &= 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right)}, \end{align*} % where $\TV$ is the total variation distance and $\KL$ is the Kullback--Leibler divergence. In the penultimate line we used the tensorization of Kullback--Leibler divergence \citep{gine2021mathematical}, noting that the law of $V$ is fixed and hence does not contribute. We now evaluate this Kullback--Leibler divergence at the specified parameter values.
% \begin{align*} \P_{\theta_0}^n \big( f=1 \big) +\P_{\theta_{1,n}}^n \big( f=0 \big) &\geq 1- \sqrt{ \frac{n}{2} \KL \left( \Ber(\theta_0) \bigm\| \Ber(\theta_{1,n}) \right)} \\ &= 1- \sqrt{\frac{n}{2}} \sqrt{ \theta_0 \log \frac{\theta_0}{\theta_{1,n}} + (1 - \theta_0) \log \frac{1 - \theta_0}{1 - \theta_{1,n}}} \\ &= 1- \sqrt{\frac{n}{2}} \sqrt{ \frac{1}{2} \log \frac{1/2}{1/2 + 1/\sqrt{8n}} + \frac{1}{2} \log \frac{1/2}{1/2 - 1/\sqrt{8n}}} \\ &= 1- \frac{\sqrt n}{2} \sqrt{\log \frac{1}{1 - 1/(2n)}} \geq 1- \frac{\sqrt n}{2} \sqrt{\frac{1}{n}} = \frac{1}{2}, \end{align*} % where in the final line we used that $\log \frac{1}{1-x} \leq 2x$ for $x \in [0,1/2]$. Now define a test $f$ by $f = 1$ if $\tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}}$ and $f=0$ otherwise, to see % \begin{align*} \P_{\theta_0}^n \left( \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} % By the triangle inequality, recalling that $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, we have % \begin{align*} \left\{ \tilde \theta_n > \frac{1}{2} + \frac{1}{\sqrt{32n}} \right\} &\subseteq \left\{ \left| \tilde \theta_n - \theta_0 \right| \geq \frac{1}{\sqrt{32n}} \right\} \\ \left\{ \tilde \theta_n \leq \frac{1}{2} + \frac{1}{\sqrt{32n}} \right\} &\subseteq \left\{ \left| \tilde \theta_n - \theta_{1,n} \right| \geq \frac{1}{\sqrt{32n}} \right\}. \end{align*} % Thus by the monotonicity of measures, % \begin{align*} \P_{\theta_0}^n \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}}^n \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}. \end{align*} \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_minimax}] \proofparagraph{lower bound for $\cP$} By translation and scaling of the data, we may assume without loss of generality that $\cW = [-1,1]$. We may also assume that $C_\rH \leq 1/2$, since reducing $C_\rH$ can only shrink the class of distributions. Define the dyadic distribution $\P_\theta$ with parameter $\theta \in [1/2, 1]$ as follows: $A_1, \ldots, A_n$ are i.i.d.\ $\Ber(\theta)$, while $V_{i j}$ for $1 \leq i < j \leq n$ are i.i.d.\ and independent of $\bA_n$. The distribution of $V_{i j}$ is given by its density function $f_V(v) = \frac{1}{2} + C_\rH v$ on $[-1,1]$. Finally, generate $W_{i j} = W(A_i, A_j, V_{i j}) \vcentcolon= (2 A_i A_j - 1) V_{i j}$. Note that the function $W$ does not depend on $\theta$. The conditional and marginal densities of $W_{i j}$ are for $w \in [-1,1]$ % \begin{align*} f_{W \mid AA}(w \mid A_i, A_j) &= \begin{cases} \frac{1}{2} + C_\rH w & \text{if } A_i = A_j = 1, \\ \frac{1}{2} - C_\rH w & \text{if } A_i = 0 \text{ or } A_j = 0, \\ \end{cases} \\ f_{W \mid A}(w \mid A_i) &= \begin{cases} \frac{1}{2} + (2 \theta - 1) C_\rH w & \text{if } A_i = 1, \\ \frac{1}{2} - C_\rH w & \text{if } A_i = 0 , \\ \end{cases} \\ f_W(w)&= \frac{1}{2} + (2\theta^2 - 1) C_\rH w. \end{align*} % Clearly, $f_W \in \cH^\beta_{C_\rH}(\cW)$ and $f_{W \mid AA}(\cdot \mid a, a') \in \cH^1_{C_\rH}(\cW)$. Also $\sup_{w \in \cW} \|f_{W \mid A}(w \mid \cdot\,)\|_\TV \leq 1$. Therefore $\P_\theta$ satisfies Assumption~\ref{ass:kernel_data} and so $\big\{\P_\theta : \theta \in [1/2, 1] \big\} \subseteq \cP$. Note that $f_W(1) = \frac{1}{2} + (2\theta^2 - 1) C_\rH$, so $\theta^2 = \frac{1}{2 C_\rH}(f_W(1) - 1/2 + C_\rH)$.
Thus if $\tilde f_W$ is some density estimator depending only on the data $\bW_n$, we define the parameter estimator % \begin{align*} \tilde \theta_n^2 &\vcentcolon= \frac{1}{2 C_\rH}\left( \tilde f_W(1) - \frac{1}{2} + C_\rH \right) \vee 0. \end{align*} % This gives the inequality % \begin{align*} \big| \tilde \theta_n^2 - \theta^2 \big| &= \left| \frac{1}{2 C_\rH}\left( \tilde f_W(1) - \frac{1}{2} + C_\rH \right) \vee 0 - \frac{1}{2 C_\rH}\left( f_W(1) - \frac{1}{2} + C_\rH \right) \right| \\ &\leq \frac{1}{2 C_\rH} \sup_{w \in \cW} \left| \tilde f_W(w) - f_W(w) \right|. \end{align*} % Therefore, since also $\tilde \theta_n \geq 0$ and $\theta \geq \frac{1}{2}$, % \begin{align*} \big| \tilde \theta_n - \theta \big| &= \frac{\big|\tilde \theta_n^2 - \theta^2\big|} {\tilde \theta_n + \theta} \leq \frac{1}{C_\rH} \sup_{w \in \cW} \left| \tilde f_W(w) - f_W(w) \right|. \end{align*} % Now we apply the point estimation lower bound from Lemma~\ref{lem:kernel_app_neyman_pearson_bernoulli}, setting $\theta_0 = \frac{1}{2}$ and $\theta_{1,n} = \frac{1}{2} + \frac{1}{\sqrt{8n}}$, noting that the estimator $\tilde \theta_n$ is a function of $\bW_n$ only, and thus a function of $\bA_n$ and $\bV_n$ only, so it satisfies the conditions of that lemma. Writing $f^{(0)}_W$ and $f^{(1)}_W$ for the marginal density $f_W$ under $\P_{\theta_0}$ and $\P_{\theta_{1,n}}$ respectively, we obtain % \begin{align*} &\P_{\theta_0} \left( \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(0)}_W(w) \big| \geq \frac{1}{C\sqrt{n}} \right) + \P_{\theta_{1,n}} \left( \sup_{w \in \cW} \big| \tilde f_W(w) - f^{(1)}_W(w) \big| \geq \frac{1}{C\sqrt{n}} \right) \\ &\quad\geq \P_{\theta_0} \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{C C_\rH \sqrt{n}} \right) + \P_{\theta_{1,n}} \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{C C_\rH \sqrt{n}} \right) \\ &\quad\geq \P_{\theta_0} \left( \big| \tilde \theta_n - \theta_0 \big| \geq \frac{1}{\sqrt{32n}} \right) + \P_{\theta_{1,n}} \left( \big| \tilde \theta_n - \theta_{1,n} \big| \geq \frac{1}{\sqrt{32n}} \right) \geq \frac{1}{2}, \end{align*} % where we set $C \geq \frac{\sqrt{32}}{C_\rH}$. Therefore we deduce that % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP} \P\left( \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \geq \frac{1}{C \sqrt n} \right) \geq \frac{1}{4} \end{align*} % and so % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] \geq \frac{1}{4 C \sqrt{n}}. \end{align*} \proofparagraph{lower bound for $\cP_\rd$} For the subclass of totally degenerate distributions, we rely on the main theorem from \citet{khasminskii1978lower}. Let $\cP_0$ be the subclass of $\cP_\rd$ consisting of the distributions which satisfy $A_1 = \cdots = A_n = 0$ and $W_{i j} \vcentcolon= A_i + A_j + V_{i j} = V_{i j}$, so that $W_{i j}$ are i.i.d.\ with common density $f_W = f_V$. Define the class % \begin{align*} \cF &= \left\{ f \text{ density function on } \R, \ f \in \cH^\beta_{C_\rH}(\cW) \right\}. \end{align*} % Write $\E_f$ for the expectation under $W_{i j}$ having density $f$. Then by \citet{khasminskii1978lower}, % \begin{align*} \liminf_{n \to \infty} \inf_{\tilde f_W} \sup_{f \in \cF} \E_f\left[ \left( \frac{n^2}{\log n} \right)^{\frac{\beta}{2\beta + 1}} \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] > 0, \end{align*} % where $\tilde f_W$ is any density estimator depending only on the $\frac{1}{2}n(n-1)$ i.i.d.\ data samples $\bW_n$. Now every density function in $\cH^\beta_{C_\rH}(\cW)$ corresponds to a distribution in $\cP_0$ and therefore to a distribution in $\cP_\rd$.
Thus for large enough $n$ and some positive constant $C$, % \begin{align*} \inf_{\tilde f_W} \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big| \tilde f_W(w) - f_W(w) \big| \right] \geq \frac{1}{C} \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta + 1}}. \end{align*} \proofparagraph{upper bounds} The upper bounds follow by using a dyadic kernel density estimator $\hat f_W$ with a boundary bias-corrected Lipschitz kernel of order $p \geq \beta$ and a bandwidth of $h$. Theorem~\ref{thm:kernel_bias} gives % \begin{align*} \sup_{\P \in \cP} \sup_{w \in \cW} \big| \E_\P\big[\hat f_W(w)\big] - f_W(w) \big| \leq \frac{4C_\rk C_\rH}{\flbeta !} h^\beta. \end{align*} % Then, treating the degenerate and non-degenerate cases separately and noting that all inequalities hold uniformly over $\cP$ and $\cP_\rd$, the proof of Lemma~\ref{lem:kernel_uniform_concentration} shows that % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| \right] &\lesssim \frac{1}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - \E_\P[\hat f_W(w)]\big| \right] &\lesssim \sqrt{\frac{\log n}{n^2h}}. \end{align*} % Thus combining these yields that % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim h^\beta + \frac{1}{\sqrt n} + \sqrt{\frac{\log n}{n^2h}}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim h^\beta + \sqrt{\frac{\log n}{n^2h}}. \end{align*} % Set $h = \left( \frac{\log n}{n^2} \right)^{\frac{1}{2\beta+1}}$ and note that $\beta \geq 1$ implies that $\left(\frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} \ll \frac{1}{\sqrt n}$. So for some constant $C > 0$, % \begin{align*} \sup_{\P \in \cP} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\lesssim \frac{1}{\sqrt n} + \left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta+1}} \leq \frac{C}{\sqrt n}, \\ \sup_{\P \in \cP_\rd} \E_\P\left[ \sup_{w \in \cW} \big|\hat f_W(w) - f_W(w)\big| \right] &\leq C\left( \frac{\log n}{n^2} \right)^{\frac{\beta}{2\beta + 1}}. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_covariance_structure}] We write $k_{i j}$ for $k_h(W_{i j},w)$ and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. % \begin{align*} \Sigma_n(w,w') &= \E\Big[ \big( \hat f_W(w) - \E[\hat f_W(w)] \big) \big( \hat f_W(w') - \E[\hat f_W(w')] \big) \Big] \\ &= \E\left[ \left( \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \big( k_{i j} - \E[k_{i j}] \big) \right) \left( \frac{2}{n(n-1)} \sum_{r=1}^{n-1} \sum_{s=r+1}^{n} \big( k_{r s}' - \E[k_{r s}'] \big) \right) \right] \\ &= \frac{4}{n^2(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \sum_{r=1}^{n-1} \sum_{s=r+1}^{n} \Cov\big[ k_{i j}, k_{r s}' \big]. \end{align*} % If $\{i,j\} \cap \{r,s\} = \emptyset$ then $k_{i j}$ and $k_{r s}'$ are independent and the corresponding covariance vanishes. There are $\frac{1}{2} n(n-1)$ terms with $\{i,j\} = \{r,s\}$, each equal to $\Cov[k_{i j}, k_{i j}']$, and $n(n-1)(n-2)$ terms with $|\{i,j\} \cap \{r,s\}| = 1$, each equal to $\Cov[k_{i j}, k_{i r}']$ by exchangeability. Hence % \begin{align*} \Sigma_n(w,w') &= \frac{2}{n(n-1)} \Cov\big[ k_{i j}, k_{i j}' \big] + \frac{4(n-2)}{n(n-1)} \Cov\big[ k_{i j}, k_{i r}' \big]. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_strong_approx_Ln}] Recall from the proof of Lemma~\ref{lem:kernel_uniform_concentration} that $\sqrt{n} L_n$ is the empirical process of the i.i.d.\ variables $A_i$ indexed by the uniformly bounded VC-type class $\cF_3$. Applying the Gaussian strong approximation from Lemma~\ref{lem:kernel_app_kmt_corollary} to this process yields, on a new probability space, a copy $\big(\bA_n', \bV_n', L_n'\big)$ of $\big(\bA_n, \bV_n, L_n\big)$ and a mean-zero Gaussian process $Z_n^{L\prime}$ such that, writing $G_n^{L\prime} = \sqrt{n} L_n'$, for some universal positive constants $C_1$, $C_2$, and $C_3$ and for all $t > 0$, % \begin{align*} \P\left( \sup_{w \in \cW} \Big|\sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\Big| > \Du \frac{t + C_1 \log n}{\sqrt n} \right) \leq C_2 e^{-C_3 t}. \end{align*} % Integrating tail probabilities shows that % \begin{align*} \E\left[ \sup_{w \in \cW} \Big|\sqrt{n} L_n'(w) - Z_n^{L\prime}(w)\Big| \right] &\leq \Du \frac{C_1 \log n}{\sqrt n} + \int_0^\infty \frac{\Du}{\sqrt n} C_2 e^{-C_3 t} \diff{t} \lesssim \frac{\Du \log n}{\sqrt n}. \end{align*} % Further, $Z_n^{L\prime}$ has the same covariance structure as $G_n^{L\prime}$ in the sense that for all $w, w' \in \cW$, % \begin{align*} \E\big[Z_n^{L\prime}(w) Z_n^{L\prime}(w')\big] = \E\big[G_n^{L\prime}(w) G_n^{L\prime}(w')\big], \end{align*} % and clearly $L_n'$ is equal in distribution to $L_n$. To obtain the trajectory regularity property of $Z_n^{L\prime}$, note that it was shown in the proof of Lemma~\ref{lem:kernel_uniform_concentration} that for all $w,w' \in \cW$, % \begin{align*} \left| k_h^A(A_i,w) - k_h^A(A_i,w') \right| &\leq C |w-w'| \end{align*} % for some constant $C > 0$.
Therefore, since the $A_i$ are i.i.d., % \begin{align*} &\E\left[ \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big|^2 \right]^{1/2} = \sqrt{n} \E\left[ \big| L_n(w) - L_n(w') \big|^2 \right]^{1/2} \\ &\quad= \sqrt{n} \E\left[ \left| \frac{1}{n} \sum_{i=1}^n \Big( k_h^A(A_i,w) - k_h^A(A_i,w') - \E\big[k_h^A(A_i,w)\big] + \E\big[k_h^A(A_i,w')\big] \Big) \right|^2 \right]^{1/2} \\ &\quad= \E\left[ \Big| k_h^A(A_i,w) - k_h^A(A_i,w') - \E\big[k_h^A(A_i,w)\big] + \E\big[k_h^A(A_i,w')\big] \Big|^2 \right]^{1/2} \lesssim |w-w'|. \end{align*} % Therefore, by the regularity result for Gaussian processes in Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, with $\delta_n \in (0, 1/2]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^{L\prime}(w) - Z_n^{L\prime}(w') \big| \right] &\lesssim \int_0^{\delta_n} \sqrt{\log 1/\varepsilon} \diff{\varepsilon} \lesssim \delta_n \sqrt{\log 1/\delta_n} \lesssim \Du \delta_n \sqrt{\log 1/\delta_n}, \end{align*} % where the last inequality is because $Z_n^{L\prime} \equiv 0$ whenever $\Du = 0$. There is a modification of $Z_n^{L\prime}$ with continuous trajectories by Kolmogorov's continuity criterion \citep[Theorem~2.9]{legall2016brownian}. Note that $L_n'$ is $\bA_n'$-measurable and so by Lemma~\ref{lem:kernel_app_kmt_corollary} we can assume that $Z_n^{L\prime}$ depends only on $\bA_n'$ and some random noise which is independent of $(\bA_n', \bV_n')$. Finally, in order to have $\bA_n', \bV_n', L_n'$, and $Z_n^{L\prime}$ all defined on the same probability space, we note that $\bA_n$ and $\bV_n$ are random vectors while $L_n'$ and $Z_n^{L\prime}$ are stochastic processes with continuous sample paths indexed on the compact interval $\cW$. Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) allows us to ``glue'' them together in the desired way on another new probability space, giving $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$, retaining the single prime notation for clarity. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_strong_approx_Ln}] See Lemma~\ref{lem:kernel_app_strong_approx_Ln}. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}] We apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditional on $\bA_n$. While this lemma is not stated for conditional distributions in its current form, the Yurinskii coupling on which it depends can be readily extended by following the proof of \citet[Lemma~38]{belloni2019conditional}, using a conditional version of Strassen's theorem \citep[Theorem~B.2]{chen2020jackknife}. Care must similarly be taken in embedding the conditionally Gaussian vectors into a conditionally Gaussian process, using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). By the mutual independence of $A_i$ and $V_{i j}$, we have that the observations $W_{i j}$ are independent (but not necessarily identically distributed) conditionally on $\bA_n$. Note that $\sup_{s,w \in \cW} |k_h(s,w)| \lesssim M_n = h^{-1}$ and $\E[k_h(W_{i j},w)^2 \mid \bA_n] \lesssim \sigma_n^2 = h^{-1}$. The following uniform Lipschitz condition holds with $l_{n,\infty} = C_\rL h^{-2}$, by the Lipschitz property of the kernels: % \begin{align*} \sup_{s,w,w' \in \cW} \left| \frac{k_h(s, w) - k_h(s, w')} {w-w'} \right| \leq l_{n,\infty}.
\end{align*} % Also, the following $L^2$ Lipschitz condition holds uniformly with $l_{n,2} = 2 C_\rL \sqrt{C_\rd} h^{-3/2}$: % \begin{align*} &\E\big[ \big| k_h(W_{i j}, w) - k_h(W_{i j}, w') \big|^2 \mid \bA_n \big]^{1/2} \\ &\quad\leq \frac{C_\rL}{h^2} |w-w'| \left( \int_{([w \pm h] \cup [w' \pm h]) \cap \cW} f_{W \mid AA}(s \mid A_i, A_j) \diff{s} \right)^{1/2} \\ &\quad\leq \frac{C_\rL}{h^2} |w-w'| \sqrt{4h C_\rd} \leq l_{n,2} |w-w'|. \end{align*} % So we apply Lemma~\ref{lem:kernel_app_yurinskii_corollary} conditionally on $\bA_n$ to the $\frac{1}{2}n(n-1)$ observations, noting that % \begin{align*} \sqrt{n^2h} E_n(w) = \sqrt{\frac{2 n h}{n-1}} \sqrt{\frac{2}{n(n-1)}} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( k_h(W_{i j},w) - \E[k_h(W_{i j},w) \mid A_i, A_j] \Big), \end{align*} % to deduce that for $t_n > 0$ there exist (on an enlarged probability space) conditionally mean-zero and conditionally Gaussian processes $\tilde Z_n^{E\prime}(w)$ with the same conditional covariance structure as $\sqrt{n^2 h} E_n(w)$ and satisfying % \begin{align*} &\P\left( \sup_{w \in \cW} \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^{E\prime}(w) \big| > t_n \Bigm\vert \bA_n' \right) \\ &\quad= \P\left( \sup_{w \in \cW} \left| \sqrt{\frac{n(n-1)}{2}} E_n(w) - \sqrt{\frac{n-1}{2 n h}} \tilde Z_n^{E\prime}(w) \right| > \sqrt{\frac{n-1}{2 n h}} t_n \Bigm\vert \bA_n' \right) \\ &\quad\lesssim \frac{ \sigma_n \sqrt{\Leb(\cW)} \sqrt{\log n} \sqrt{M_n + \sigma_n\sqrt{\log n}} }{n^{1/2} t_n^2 / h} \sqrt{ l_{n,2} \sqrt{\log n} + \frac{l_{n,\infty}}{n} \log n} \\ &\quad\lesssim \frac{ h^{-1/2} \sqrt{\log n} \sqrt{h^{-1} + h^{-1/2} \sqrt{\log n}} }{n^{1/2} t_n^2 / h} \sqrt{ h^{-3/2} \sqrt{\log n} + \frac{h^{-2}}{n} \log n} \\ &\quad\lesssim \sqrt{\frac{\log n}{n}} \frac{ \sqrt{1 + \sqrt{h \log n}} }{t_n^2} \sqrt{ \sqrt{\frac{\log n}{h^3}} \left( 1 + \sqrt{\frac{\log n}{n^2 h}} \right) } \\ &\quad\lesssim \sqrt{\frac{\log n}{n}} \frac{ 1 }{t_n^2} \left( \frac{\log n}{h^3} \right)^{1/4} \lesssim t_n^{-2} n^{-1/2} h^{-3/4} (\log n)^{3/4}, \end{align*} % where we used $h \lesssim 1 / \log n$ and $\frac{\log n}{n^2 h} \lesssim 1$. To obtain the trajectory regularity property of $\tilde Z_n^{E\prime}$, note that for $w, w' \in \cW$, by conditional independence, % \begin{align*} &\E\left[ \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big|^2 \mid \bA_n' \right]^{1/2} = \sqrt{n^2h} \, \E\left[ \big| E_n(w) - E_n(w') \big|^2 \mid \bA_n \right]^{1/2} \\ &\quad\lesssim \sqrt{n^2h} \, \E\left[ \left| \frac{2}{n(n-1)} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \Big( k_h(W_{i j},w) - k_h(W_{i j},w') \Big) \right|^2 \Bigm\vert \bA_n \right]^{1/2} \\ &\quad\lesssim \sqrt{h} \, \E\left[ \big| k_h(W_{i j},w) - k_h(W_{i j},w') \big|^2 \bigm\vert \bA_n \right]^{1/2} \lesssim h^{-1} |w-w'|. \end{align*} % So by the regularity result for Gaussian processes in Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, with $\delta_n \in (0, 1/(2h)]$: % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| \tilde Z_n^{E\prime}(w) - \tilde Z_n^{E\prime}(w') \big| \mid \bA_n' \right] &\lesssim \int_0^{\delta_n/h} \sqrt{\log (\varepsilon^{-1} h^{-1})} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h\delta_n}}, \end{align*} % and there exists a modification with continuous trajectories.
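In particular, for any divergent sequence $R_n$, taking $t_n = n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ in the above yields a coupling probability of order % \begin{align*} t_n^{-2} n^{-1/2} h^{-3/4} (\log n)^{3/4} &= R_n^{-2} \to 0, \end{align*} % giving the bound in probability $\sup_{w \in \cW} \big| \sqrt{n^2h} E_n(w) - \tilde Z_n^{E\prime}(w) \big| \lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$ used below.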
Finally, in order to have $\bA_n', \bV_n', E_n'$, and $\tilde Z_n^{E\prime}$ all defined on the same probability space, we note that $\bA_n$ and $\bV_n$ are random vectors while $E_n'$ and $\tilde Z_n^{E\prime}$ are stochastic processes with continuous sample paths indexed on the compact interval $\cW$. Hence the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) allows us to ``glue together'' $\big(\bA_n, \bV_n, E_n\big)$ and $\big(E_n', \tilde Z_n^{E\prime}\big)$ in the desired way on another new probability space, giving $\big(\bA_n', \bV_n', E_n', \tilde Z_n^{E\prime}\big)$, retaining the single prime notation for clarity. The trajectories of the conditionally Gaussian processes $\tilde Z_n^{E\prime}$ depend on the choice of $t_n$, necessitating the use of a divergent sequence $R_n$ to establish bounds in probability. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_conditional_strong_approx_En}] See Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}] \proofparagraph{defining $Z_n^{E\dprime}$} Pick $\delta_n \to 0$ with $\log 1/\delta_n \lesssim \log n$. Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality at most $\Leb(\cW)/\delta_n$ which is also a $\delta_n$-packing. Let $\tilde Z_{n,\delta}^{E\prime}$ be the restriction of $\tilde Z_n^{E\prime}$ to $\cW_\delta$. Let $\tilde \Sigma_n^E(w, w') = \E\big[\tilde Z_n^{E\prime}(w) \tilde Z_n^{E\prime}(w') \mid \bA_n' \big]$ be the conditional covariance function of $\tilde Z_n^{E\prime}$, and define $\Sigma_n^E(w,w') = \E\big[\tilde \Sigma_n^E(w,w')\big]$. Let $\tilde \Sigma^E_{n,\delta}$ and $\Sigma^E_{n,\delta}$ be the restriction matrices of $\tilde \Sigma^E_n$ and $\Sigma^E_n$ to $\cW_\delta \times \cW_\delta$, noting that, as (conditional) covariance matrices, these are (almost surely) positive semi-definite. Let $N \sim \cN(0, I_{|\cW_\delta|})$ be independent of $\bA_n'$, and define using the matrix square root $\tilde Z_{n,\delta}^{E\dprime} = \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} N$, which has the same distribution as $\tilde Z_{n,\delta}^{E\prime}$, conditional on $\bA_n'$. Extend it using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to the compact interval $\cW$, giving a conditionally Gaussian process $\tilde Z_n^{E\dprime}$ which has the same distribution as $\tilde Z_{n}^{E\prime}$, conditional on $\bA_n'$. Define $Z_{n,\delta}^{E\dprime} = \big(\Sigma^E_{n,\delta}\big)^{1/2} N$, noting that this is independent of $\bA_n'$, and extend it using the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}) to a Gaussian process $Z_n^{E\dprime}$ on the compact interval $\cW$, which is independent of $\bA_n'$ and has covariance structure given by $\Sigma_n^E$. \proofparagraph{closeness of $Z_n^{E\dprime}$ and $\tilde Z_n^{E\dprime}$ on the mesh} Note that conditionally on $\bA_n'$, $\tilde Z_{n,\delta}^{E\dprime} - Z_{n,\delta}^{E\dprime}$ is a length-$|\cW_\delta|$ Gaussian random vector with covariance matrix $\big( \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} - \big(\Sigma^E_{n,\delta}\big)^{1/2} \big)^2$.
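This follows since both restricted processes are constructed from the same Gaussian vector $N$: conditionally on $\bA_n'$, % \begin{align*} \tilde Z_{n,\delta}^{E\dprime} - Z_{n,\delta}^{E\dprime} &= \Big( \big(\tilde \Sigma^E_{n,\delta}\big)^{1/2} - \big(\Sigma^E_{n,\delta}\big)^{1/2} \Big) N, \end{align*} % and the matrix in parentheses is symmetric, being a difference of symmetric positive semi-definite matrix square roots.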
So by the Gaussian maximal inequality in Lemma~\ref{lem:kernel_app_gaussian_vector_maximal} applied conditionally on $\bA_n'$, % \begin{align*} \E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \Bigm| \bA_n' \right] &\lesssim \sqrt{\log n} \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2^{1/2}, \end{align*} % since $\log |\cW_\delta| \lesssim \log n$. Next, we apply some U-statistic theory to $\tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta}$, with the aim of applying the matrix concentration result for second-order U-statistics presented in Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}. Firstly, we note that since the conditional covariance structures of $\tilde Z_n^{E\prime}$ and $\sqrt{n^2h} E_n$ are equal in distribution, we have, writing $E_n(\cW_\delta)$ for the vector $\big(E_n(w) : w \in \cW_\delta\big)$ and similarly for $k_h(W_{i j}, \cW_\delta)$, % \begin{align*} \tilde\Sigma^E_{n,\delta} &= n^2h \E[E_n(\cW_\delta) E_n(\cW_\delta)^\T \mid \bA_n] \\ &= n^2h \frac{4}{n^2(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \E\left[ \Big( k_h(W_{i j}, \cW_\delta) - \E\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \Big) \right. \\ &\qquad\left. \times\Big( k_h(W_{i j}, \cW_\delta) - \E\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \Big)^\T \bigm\vert \bA_n \right] \\ &= \frac{4h}{(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} u(A_i, A_j), \end{align*} % where we define the random $|\cW_\delta| \times |\cW_\delta|$ matrices % \begin{align*} u(A_i, A_j) &= \E\!\left[ k_h(W_{i j}, \cW_\delta) k_h(W_{i j}, \cW_\delta)^\T \mid \bA_n \right] - \E\!\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right] \E\!\left[ k_h(W_{i j}, \cW_\delta) \mid \bA_n \right]^\T. \end{align*} % Let $u(A_i) = \E[u(A_i, A_j) \mid A_i]$ and $u = \E[u(A_i, A_j)]$. The decomposition $\tilde \Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} = \tilde L +\tilde Q$ holds by Lemma~\ref{lem:kernel_app_general_hoeffding}, where % \begin{align*} \tilde L &= \frac{4h}{n-1} \sum_{i=1}^n \big( u(A_i) - u \big), &\tilde Q &= \frac{4h}{(n-1)^2} \sum_{i=1}^{n-1} \sum_{j=i+1}^{n} \big( u(A_i, A_j) - u(A_i) - u(A_j) + u \big). \end{align*} % Next, we seek an almost sure upper bound on $\|u(A_i, A_j)\|_2$. Since this is a symmetric matrix, we have by H{\"o}lder's inequality % \begin{align*} \|u(A_i, A_j)\|_2 &\leq \|u(A_i, A_j)\|_1^{1/2} \|u(A_i, A_j)\|_\infty^{1/2} = \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} |u(A_i, A_j)_{kl}|. \end{align*} % The terms on the right hand side can be bounded as follows, writing $w, w'$ for the $k$th and $l$th points in $\cW_\delta$ respectively: % \begin{align*} |u(A_i, A_j)_{kl}| &= \big| \E\left[ k_h(W_{i j}, w) k_h(W_{i j}, w') \mid \bA_n \right] - \E\left[ k_h(W_{i j}, w) \mid \bA_n \right] \E\left[ k_h(W_{i j}, w') \mid \bA_n \right] \big| \\ &\lesssim \E\left[ | k_h(W_{i j}, w) k_h(W_{i j}, w') | \mid \bA_n \right] + \E\left[ | k_h(W_{i j}, w) | \mid \bA_n \right] \E\left[ | k_h(W_{i j}, w') | \mid \bA_n \right] \\ &\lesssim h^{-1} \I\big\{ |w-w'| \leq 2h \big\} + 1 \lesssim h^{-1} \I\big\{ |k-l| \leq 2h/\delta_n \big\} + 1, \end{align*} % where we used that $|w-w'| \geq |k-l| \delta_n$ because $\cW_\delta$ is a $\delta_n$-packing. 
Hence % \begin{align*} \|u(A_i, A_j)\|_2 &\leq \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} |u(A_i, A_j)_{kl}| \lesssim \max_{1 \leq k \leq |\cW_\delta|} \sum_{l=1}^{|\cW_\delta|} \Big( h^{-1} \I\big\{ |k-l| \leq 2h/\delta_n \big\} + 1 \Big) \\ &\lesssim 1/\delta_n + 1/h + |\cW_\delta| \lesssim 1/\delta_n + 1/h. \end{align*} % Clearly, the same bound holds for $\|u(A_i)\|_2$ and $\|u\|_2$, by Jensen's inequality. Therefore, applying the matrix Bernstein inequality (Lemma~\ref{lem:kernel_app_matrix_bernstein}) to the zero-mean matrix $\tilde L$ gives % \begin{align*} \E\left[ \left\| \tilde L \right\|_2 \right] &\lesssim \frac{h}{n} \left(\frac{1}{\delta_n} + \frac{1}{h} \right) \left( \log |\cW_\delta| + \sqrt{n \log |\cW_\delta|} \right) \lesssim \left(\frac{h}{\delta_n} + 1 \right) \sqrt{\frac{\log n}{n}}. \end{align*} % The matrix U-statistic concentration inequality (Lemma~\ref{lem:kernel_app_ustat_matrix_concentration}) with $\tilde Q$ gives % \begin{align*} \E\left[ \big\| \tilde Q \big\|_2 \right] &\lesssim \frac{h}{n^2} n \left(\frac{1}{\delta_n} + \frac{1}{h} \right) \left( \log |\cW_\delta| \right)^{3/2} \lesssim \left(\frac{h}{\delta_n} + 1 \right) \frac{(\log n)^{3/2}}{n}. \end{align*} % Hence taking a marginal expectation and applying Jensen's inequality, % \begin{align*} &\E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\ &\quad\lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2^{1/2} \right] \lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde\Sigma^E_{n,\delta} - \Sigma^E_{n,\delta} \right\|_2 \right]^{1/2} \\ &\quad\lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde L + \tilde Q \right\|_2 \right]^{1/2} \lesssim \sqrt{\log n} \ \E\left[ \left\| \tilde L \right\|_2 + \left\| \tilde Q \right\|_2 \right]^{1/2} \\ &\quad\lesssim \sqrt{\log n} \left( \left(\frac{h}{\delta_n} + 1 \right) \sqrt{\frac{\log n}{n}} + \left(\frac{h}{\delta_n} + 1 \right) \frac{(\log n)^{3/2}}{n} \right)^{1/2} \\ &\quad\lesssim \sqrt{\frac{h}{\delta_n} + 1} \frac{(\log n)^{3/4}}{n^{1/4}}. \end{align*} \proofparagraph{regularity of $Z_n^{E\dprime}$ and $\tilde Z_n^{E\dprime}$} Define the semimetrics % \begin{align*} \rho(w, w')^2 &= \E\left[ \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big|^2 \right], &\tilde\rho(w, w')^2 &= \E\left[ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 \mid \bA_n' \right]. \end{align*} % We bound $\tilde \rho$ as follows, since $\tilde Z_n^{E\dprime}$ and $\sqrt{n^2h} E_n$ have the same conditional covariance structure: % \begin{align*} \tilde\rho(w, w') &= \E\left[ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big|^2 \mid \bA_n' \right]^{1/2} \\ &= \sqrt{n^2 h} \, \E\left[ \big|E_n(w) - E_n(w')\big|^2 \mid \bA_n' \right]^{1/2} \lesssim h^{-1} |w-w'|, \end{align*} % uniformly in $\bA_n'$, where the final bound was shown in the proof of Lemma~\ref{lem:kernel_app_conditional_strong_approx_En}. Note that also % \begin{align*} \rho(w, w') &= \sqrt{\E[\tilde \rho(w,w')^2]} \lesssim h^{-1} |w-w'|.
\end{align*} % Thus Lemma~\ref{lem:kernel_app_gaussian_process_maximal} applies directly to $Z_n^{E\dprime}$ and conditionally to $\tilde Z_n^{E\dprime}$, with $\delta_n \in (0, 1/(2h)]$, demonstrating that % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| \bigm\vert \bA_n' \right] &\lesssim \int_0^{\delta_n / h} \sqrt{\log (1 / (\varepsilon h))} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h \delta_n}}, \\ \E\left[ \sup_{|w-w'| \leq \delta_n} |Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')| \right] &\lesssim \int_0^{\delta_n / h} \sqrt{\log (1 / (\varepsilon h))} \diff{\varepsilon} \lesssim \frac{\delta_n}{h} \sqrt{\log \frac{1}{h \delta_n}}. \end{align*} % Continuity of trajectories follows from this. \proofparagraph{conclusion} We use the previous parts to deduce that % \begin{align*} &\E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\ &\quad\lesssim \E\left[ \max_{w \in \cW_\delta} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] \\ &\qquad+ \E\left[ \sup_{|w-w'| \leq \delta_n} \left\{ \big|\tilde Z_n^{E\dprime}(w) - \tilde Z_n^{E\dprime}(w')\big| + \big|Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w')\big| \right\} \right] \\ &\quad\lesssim \sqrt{\frac{h}{\delta_n} + 1} \frac{(\log n)^{3/4}}{n^{1/4}} + \frac{\delta_n \sqrt{\log n}}{h}. \end{align*} % Setting $\delta_n = h \left( \frac{\log n}{n} \right)^{1/6}$ gives % \begin{align*} \E\left[ \sup_{w \in \cW} \big|\tilde Z_n^{E\dprime}(w) - Z_n^{E\dprime}(w)\big| \right] &\lesssim n^{-1/6} (\log n)^{2/3}. \end{align*} % Independence of $Z_n^{E\dprime}$ and $\bA_n''$ follows by applying the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}), conditionally on $\bA_n'$, to the variables $\big(\bA_n', \tilde Z_n^{E\prime}\big)$ and $\big(\tilde Z_n^{E\dprime}, Z_n^{E\dprime}\big)$. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_unconditional_strong_approx_En}] See Lemma~\ref{lem:kernel_app_unconditional_strong_approx_En}. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_app_strong_approx_fW}] We add together the strong approximations for the $L_n$ and $E_n$ terms, and then add an independent Gaussian process to account for the variance of $Q_n$. \proofparagraph{gluing together the strong approximations} Let $\big(\bA_n', \bV_n', L_n', Z_n^{L\prime}\big)$ be the strong approximation for $L_n$ derived in Lemma~\ref{lem:kernel_app_strong_approx_Ln}. Let $\big(\bA_n'', \bV_n'', E_n'', \tilde Z_n^{E\dprime}\big)$ and $\big(\bA_n''', \bV_n''', \tilde Z_n^{E\tprime}, Z_n^{E\tprime}\big)$ be the conditional and unconditional strong approximations for $E_n$ given in Lemmas~\ref{lem:kernel_app_conditional_strong_approx_En} and \ref{lem:kernel_app_unconditional_strong_approx_En} respectively. The first step is to define copies of these variables and processes on the same probability space. This is achieved by applying the Vorob'ev--Berkes--Philipp theorem (Lemma~\ref{lem:kernel_app_vbp}). Dropping the prime notation for clarity, we construct $\big(\bA_n, \bV_n, L_n, Z_n^L, E_n, \tilde Z_n^E, Z_n^E\big)$ with the following properties: % \begin{enumerate}[label=(\roman*)] \item $\sup_{w \in \cW} \big| \sqrt{n} L_n(w) - Z_n^L(w)\big| \lesssim_\P n^{-1/2} \log n$, \item $\sup_{w \in \cW} \big|\sqrt{n^2h} E_n(w) - \tilde Z^E_n(w) \big| \lesssim_\P n^{-1/4} h^{-3/8} (\log n)^{3/8} R_n$, \item $\sup_{w \in \cW} \big| \tilde Z^E_n(w) - Z^E_n(w) \big| \lesssim_\P n^{-1/6} (\log n)^{2/3}$, \item $Z_n^L$ is independent of $Z_n^E$.
\end{enumerate} % Note that the independence of $Z_n^L$ and $Z_n^E$ follows since $Z_n^L$ depends only on $\bA_n$ and some independent random noise, while $Z_n^E$ is independent of $\bA_n$. Therefore $(Z_n^L, Z_n^E)$ are jointly Gaussian. To get the strong approximation result for $\hat f_W$, define the Gaussian process % \begin{align*} Z_n^f(w) &= \frac{1}{\sqrt n} Z_n^L(w) + \frac{1}{n} Z_n^Q(w) + \frac{1}{\sqrt{n^2h}} Z_n^E(w), \end{align*} % where $Z_n^Q(w)$ is a mean-zero Gaussian process independent of everything else with covariance % \begin{align*} \E\big[ Z_n^Q(w) Z_n^Q(w') \big] &= n^2 \E\big[ Q_n(w) Q_n(w') \big]. \end{align*} % As shown in the proof of Lemma~\ref{lem:kernel_uniform_concentration}, the process $Q_n(w)$ is uniformly Lipschitz and uniformly bounded in $w$. Thus by Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we have $\E\big[\sup_{w \in \cW} |Z_n^Q(w)|\big] \lesssim 1$. Therefore the uniform approximation error is given by % \begin{align*} & \sup_{w \in \cW} \big| \hat f_W(w) - \E[\hat f_W(w)] - Z_n^f(w) \big| \\ &\quad= \sup_{w \in \cW} \left| \frac{1}{\sqrt n} Z_n^L(w) + \frac{1}{n} Z_n^Q(w) + \frac{1}{\sqrt{n^2h}} Z_n^E(w) - \Big( L_n(w) + Q_n(w) + E_n(w) \Big) \right| \\ &\quad\leq \sup_{w \in \cW} \bigg( \frac{1}{\sqrt n} \left| Z_n^L(w) - \sqrt{n} L_n(w) \right| + \frac{1}{\sqrt{n^2h}} \left| \tilde Z_n^E(w) - \sqrt{n^2h} E_n(w) \right| \\ &\qquad+ \frac{1}{\sqrt{n^2h}} \left| Z_n^E(w) - \tilde Z_n^E(w) \right| + \big| Q_n(w) \big| + \frac{1}{n} \big| Z_n^Q(w) \big| \bigg) \\ &\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3}. \end{align*} \proofparagraph{covariance structure} Since $L_n$, $Q_n$, and $E_n$ are mutually orthogonal in $L^2$ (as shown in Lemma~\ref{lem:kernel_hoeffding}), we have the following covariance structure: % \begin{align*} \E\big[Z_n^f(w) Z_n^f(w')\big] &= \frac{1}{n} \E\big[ Z_n^L(w) Z_n^L(w') \big] + \frac{1}{n^2} \E\big[ Z_n^Q(w) Z_n^Q(w') \big] + \frac{1}{n^2h} \E\big[ Z_n^E(w) Z_n^E(w') \big] \\ &= \E\big[ L_n(w) L_n(w') \big] + \E\big[ Q_n(w) Q_n(w') \big] + \E\big[ E_n(w) E_n(w') \big] \\ &= \E\big[ \big(\hat f_W(w) - \E[\hat f_W(w)]\big) \big(\hat f_W(w') - \E[\hat f_W(w')]\big) \big]. \end{align*} \proofparagraph{trajectory regularity} The trajectory regularity of the process $Z_n^f$ follows directly by adding the regularities of the processes $\frac{1}{\sqrt n} Z_n^L$, $\frac{1}{n} Z_n^Q$, and $\frac{1}{\sqrt{n^2h}} Z_n^E$. Similarly, $Z_n^f$ has continuous trajectories. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_strong_approx_Tn}] Define $Z_n^T(w) = \frac{Z_n^f(w)}{\sqrt{\Sigma_n(w,w)}}$ so that % \begin{align*} \left| T_n(w) - Z_n^T(w) \right| &= \frac{\big| \hat f_W(w) - f_W(w) - Z_n^f(w) \big|} {\sqrt{\Sigma_n(w,w)}}. \end{align*} % By Theorems~\ref{thm:kernel_app_strong_approx_fW} and \ref{thm:kernel_bias}, the numerator can be bounded above by % \begin{align*} &\sup_{w \in \cW} \left| \hat f_W(w) - f_W(w) - Z_n^f(w) \right| \\ &\quad\leq \sup_{w \in \cW} \left| \hat f_W(w) - \E\big[\hat f_W(w)\big] - Z_n^f(w) \right| + \sup_{w \in \cW} \left| \E\big[\hat f_W(w)\big] - f_W(w) \right| \\ &\quad\lesssim_\P n^{-1} \log n + n^{-5/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-7/6} h^{-1/2} (\log n)^{2/3} + h^{p \wedge \beta}.
\end{align*} % By Lemma~\ref{lem:kernel_variance_bounds} with $\inf_\cW f_W(w) > 0$, the denominator is bounded below by % \begin{align*} \inf_{w \in \cW} \sqrt{\Sigma_n(w,w)} &\gtrsim \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}}, \end{align*} % and the result follows. \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_infeasible_ucb}] Note that the covariance structure of $Z_n^T$ is given by % \begin{align*} \Cov\big[ Z_n^T(w), Z_n^T(w') \big] &= \frac{\Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. \end{align*} % We apply an anti-concentration result to establish that all quantiles of $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ exist. To do this, we must first establish regularity properties of $Z_n^T$. \proofparagraph{$L^2$ regularity of $Z_n^T$} Writing $k_{i j}'$ for $k_h(W_{i j},w')$ etc., note that by Lemma~\ref{lem:kernel_app_covariance_structure}, % \begin{align*} &\big| \Sigma_n(w,w') - \Sigma_n(w, w'') \big| \\ &\quad= \left| \frac{2}{n(n-1)} \Cov\big[ k_{i j}, k_{i j}' \big] + \frac{4(n-2)}{n(n-1)} \Cov\big[ k_{i j}, k_{i r}' \big] \right. \\ &\left. \quad\qquad- \frac{2}{n(n-1)} \Cov\big[ k_{i j}, k_{i j}'' \big] - \frac{4(n-2)}{n(n-1)} \Cov\big[ k_{i j}, k_{i r}'' \big] \right| \\ &\quad\leq \frac{2}{n(n-1)} \Big| \Cov\big[ k_{i j}, k_{i j}' - k_{i j}'' \big] \Big| + \frac{4(n-2)}{n(n-1)} \Big| \Cov\big[ k_{i j}, k_{i r}' - k_{i r}'' \big] \Big| \\ &\quad\leq \frac{2}{n(n-1)} \|k_{i j}\|_\infty \|k_{i j}' - k_{i j}''\|_\infty + \frac{4(n-2)}{n(n-1)} \|k_{i j}\|_\infty \|k_{i r}' - k_{i r}''\|_\infty \\ &\quad\leq \frac{4}{n h^3} C_\rk C_\rL |w'-w''| \lesssim n^{-1}h^{-3} |w'-w''| \end{align*} % uniformly in $w, w', w'' \in \cW$. Therefore, by Lemma~\ref{lem:kernel_variance_bounds}, with $\delta_n \leq n^{-2} h^2$, we have % \begin{align*} \inf_{|w-w'| \leq \delta_n} \Sigma_n(w,w') &\gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - n^{-1} h^{-3} \delta_n \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{1}{n^3h} \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}, \\ \sup_{|w-w'| \leq \delta_n} \Sigma_n(w,w') &\lesssim \frac{\Du^2}{n} + \frac{1}{n^2h} + n^{-1} h^{-3} \delta_n \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h} + \frac{1}{n^3h} \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}. \end{align*} % The $L^2$ regularity of $Z_n^T$ is % \begin{align*} \E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right] &= 2 - 2 \frac{\Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}}. \end{align*} % Applying the elementary result that for $a,b,c > 0$, % \begin{align*} 1 - \frac{a}{\sqrt{b c}} &= \frac{b(c-a) + a(b-a)} {\sqrt{b c}\big(\sqrt{b c} + a\big)}, \end{align*} % with $a = \Sigma_n(w,w')$, $b = \Sigma_n(w,w)$, and $c = \Sigma_n(w',w')$, and noting $|c-a| \lesssim n^{-1} h^{-3} |w-w'|$ and $|b-a| \lesssim n^{-1} h^{-3} |w-w'|$ and $\frac{\Dl^2}{n} + \frac{1}{n^2h} \lesssim a,b,c \lesssim \frac{\Du^2}{n} + \frac{1}{n^2h}$, yields % \begin{align*} \E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right] &\lesssim \frac{(\Du^2/n + 1/(n^2h))n^{-1}h^{-3}|w-w'|} {(\Dl^2/n + 1/(n^2h))^2} \\ &\lesssim \frac{n^{-2} h^{-4}|w-w'|} {n^{-4}h^{-2}} \lesssim n^2 h^{-2} |w-w'|. \end{align*} % Thus the semimetric induced by $Z_n^T$ on $\cW$ is % \begin{align*} \rho(w,w') &\vcentcolon= \E\left[ \big( Z_n^T(w) - Z_n^T(w') \big)^2 \right]^{1/2} \lesssim n h^{-1} \sqrt{|w-w'|}.
\end{align*} \proofparagraph{trajectory regularity of $Z_n^T$} By the bound on $\rho$ from the previous part, we deduce the covering number bound % \begin{align*} N(\varepsilon, \cW, \rho) &\lesssim N\big( \varepsilon, \cW, n h^{-1} \sqrt{|\cdot|} \big) \lesssim N\big( n^{-1} h \varepsilon, \cW, \sqrt{|\cdot|} \big) \\ &\lesssim N\big( n^{-2} h^2 \varepsilon^2, \cW, |\cdot| \big) \lesssim n^2 h^{-2} \varepsilon^{-2}. \end{align*} % Now apply the Gaussian process regularity result from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}. % \begin{align*} \E\left[ \sup_{\rho(w,w') \leq \delta} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim \int_0^{\delta} \sqrt{\log N(\varepsilon, \cW, \rho)} \diff{\varepsilon} \lesssim \int_0^{\delta} \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} \diff{\varepsilon} \\ &\lesssim \int_0^{\delta} \left( \sqrt{\log n} + \sqrt{\log 1/\varepsilon} \right) \diff{\varepsilon} \lesssim \delta \left( \sqrt{\log n} + \sqrt{\log 1/\delta} \right), \end{align*} % and so % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim \E\left[ \sup_{\rho(w,w') \leq n h^{-1} \delta_n^{1/2}} \big| Z_n^T(w) - Z_n^T(w') \big| \right] \lesssim n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{existence of the quantile} Apply the Gaussian anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration}, noting that $Z_n^T$ is separable, mean-zero, and has unit variance: % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) &\leq 8 \varepsilon_n \left( 1 + \E\left[ \sup_{w \in \cW} \big| Z_n^T(w) \big| \right] \right). \end{align*} % To bound the supremum on the right hand side, apply the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal} with $\sigma \leq 1$ and $N(\varepsilon, \cW, \rho) \lesssim n^2 h^{-2} \varepsilon^{-2}$: % \begin{align*} \E\left[ \sup_{w \in \cW} \big|Z_n^T(w)\big| \right] &\lesssim 1 + \int_0^{2} \sqrt{\log (n^2 h^{-2} \varepsilon^{-2})} \diff{\varepsilon} \lesssim \sqrt{\log n}. \end{align*} % Therefore % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq \varepsilon \right) &\lesssim \varepsilon \sqrt{\log n}. \end{align*} % Letting $\varepsilon \to 0$ shows that the distribution function of $\sup_{w \in \cW} \big|Z_n^T(w)\big|$ is continuous, and therefore all of its quantiles exist. \proofparagraph{validity of the infeasible uniform confidence band} Under Assumption~\ref{ass:kernel_rates} and with a sufficiently slowly diverging sequence $R_n$, the strong approximation rate established in Theorem~\ref{thm:kernel_strong_approx_Tn} is % \begin{align*} &\sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ &\quad\lesssim_\P \frac{ n^{-1/2} \log n + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-2/3} h^{-1/2} (\log n)^{2/3} + n^{1/2} h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \ll \frac{1}{\sqrt{\log n}}. \end{align*} % So by Lemma~\ref{lem:kernel_app_slow_convergence}, take $\varepsilon_n$ such that % \begin{align*} \P \left( \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| > \varepsilon_n \right) &\leq \varepsilon_n \sqrt{\log n} \end{align*} % and $\varepsilon_n \sqrt{\log n} \to 0$. 
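To trace how the denominator enters this rate, consider for instance the first term of the numerator from Theorem~\ref{thm:kernel_app_strong_approx_fW}: by Lemma~\ref{lem:kernel_variance_bounds},
%
\begin{align*} \frac{n^{-1} \log n}{\inf_{w \in \cW} \sqrt{\Sigma_n(w,w)}} &\lesssim \frac{n^{-1} \log n}{\Dl / \sqrt{n} + 1 / \sqrt{n^2 h}} = \frac{n^{-1/2} \log n}{\Dl + 1 / \sqrt{n h}}, \end{align*}
%
and the remaining terms of the displayed rate are rescaled in exactly the same way.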
So by the previously established anti-concentration result,
%
\begin{align*} &\P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) \\ &\quad= \P\left( \sup_{w \in \cW} \left| T_n(w) \right| \leq q_{1-\alpha} \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} + \varepsilon_n \right) + \P \left( \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| > \varepsilon_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - q_{1-\alpha} \right| \leq \varepsilon_n \right) + \varepsilon_n \sqrt{\log n} \\ &\quad\leq 1 - \alpha + 2 \varepsilon_n \sqrt{\log n}. \end{align*}
%
The lower bound follows analogously:
%
\begin{align*} &\P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} - \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq q_{1-\alpha} \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - q_{1-\alpha} \right| \leq \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} \\ &\quad\geq 1 - \alpha - 2 \varepsilon_n \sqrt{\log n}. \end{align*}
%
Finally, since $\varepsilon_n \sqrt{\log n} \to 0$, we see
%
\begin{align*} \left| \P\left( \left| \hat f_W(w) - f_W(w) \right| \leq q_{1-\alpha} \sqrt{\Sigma_n(w,w)} \textup{ for all } w \in \cW \right) - (1 - \alpha) \right| &\to 0. \end{align*}
\end{proof}
Before proving Lemma~\ref{lem:kernel_app_covariance_estimation}, we provide the following useful concentration inequality. This is essentially a corollary of the U-statistic concentration inequality given in Theorem~3.3 in \citet{gine2000exponential}.
\begin{lemma}[A concentration inequality]
\label{lem:kernel_app_dyadic_concentration}
Let $X_{i j}$ be mutually independent for $1 \leq i < j \leq n$ taking values in a measurable space $\cX$. Let $h_1$, $h_2$ be measurable functions from $\cX$ to $\R$ satisfying the following for all $i$ and $j$.
%
\begin{align*} \E\big[h_1(X_{i j})\big] &= 0, &\E\big[h_2(X_{i j})\big] &=0, \\ \E\big[h_1(X_{i j})^2\big] &\leq \sigma^2, &\E\big[h_2(X_{i j})^2\big] &\leq \sigma^2, \\ \big|h_1(X_{i j})\big| &\leq M, &\big|h_2(X_{i j})\big| &\leq M. \end{align*}
%
Consider the sum
%
\begin{align*} S_n &= \sum_{1 \leq i < j < r \leq n} h_1(X_{i j}) h_2(X_{i r}). \end{align*}
%
Then $S_n$ satisfies the concentration inequality
%
\begin{align*} \P\big( |S_n| \geq t \big) &\leq C \exp\left( -\frac{1}{C} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right) \end{align*}
%
for some universal constant $C > 0$ and for all $t>0$.
\end{lemma}
\begin{proof}[Lemma~\ref{lem:kernel_app_dyadic_concentration}]
We proceed in three main steps. Firstly, we write $S_n$ as a second-order U-statistic where we use double indices instead of single indices. Then we use a decoupling result to introduce extra independence. Finally, a concentration result is applied to the decoupled U-statistic.
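As a concrete check of the reindexing performed in the first step, take $n = 3$: the sum $S_3$ consists of the single term $h_1(X_{1 2}) h_2(X_{1 3})$, and in the double-index representation below only the choice $(i,j) = (1,2)$ and $(q,r) = (1,3)$ satisfies both $j < r$ and $i = q$, recovering exactly this term.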
\proofparagraph{writing $S_n$ as a second-order U-statistic}
Note that we can write $S_n$ as the second-order U-statistic
%
\begin{align*} S_n &= \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} h_{i j q r} (X_{i j}, X_{qr}), \end{align*}
%
where
%
\begin{align*} h_{i j q r} (a,b) &= h_1(a) h_2(b) \, \I\{j < r\} \, \I\{i = q\}. \end{align*}
\proofparagraph{decoupling}
Since the summands $h_{i j q r}(X_{i j}, X_{qr})$ are indexed by pairs of the mutually independent variables $X_{i j}$, a decoupling inequality for second-order U-statistics applies: there exists a universal constant $C_1 > 0$ satisfying
%
$\P\big( |S_n| \geq t \big) \leq C_1 \P\big( C_1 |\tilde S_n| \geq t \big)$,
%
where
%
$\tilde S_n = \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} h_{i j q r} (X_{i j}, X'_{qr})$,
%
with $(X'_{i j})$ an independent copy of $(X_{i j})$.
\proofparagraph{U-statistic concentration}
The U-statistic kernel $h_{i j q r}(X_{i j}, X'_{qr})$ is totally degenerate in that
%
$ \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X_{i j}] = \E[h_{i j q r}(X_{i j}, X'_{qr}) \mid X'_{qr}] = 0$.
%
Define and bound the following quantities:
%
\pagebreak
%
\begin{align*} A &= \max_{i j q r} \|h_{i j q r}(X_{i j}, X'_{qr})\|_\infty \leq M^2, \\ B &= \max \left\{ \left\| \sum_{1 \leq i < j \leq n} \E\Big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \mid X_{i j} \Big] \right\|_\infty, \left\| \sum_{1 \leq q < r \leq n} \E\Big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \mid X'_{qr} \Big] \right\|_\infty \right\}^{1/2} \\ &= \max \left\{ \left\| \sum_{1 \leq i < j \leq n} h_1(X_{i j})^2 \E\big[ h_2(X_{qr}')^2 \big] \I\{j < r\} \I\{i = q\} \right\|_\infty, \left\| \sum_{1 \leq q < r \leq n} \E\big[ h_1(X_{i j})^2 \big] h_2(X_{qr}')^2 \I\{j < r\} \I\{i = q\} \right\|_\infty \right\}^{1/2} \\ &\leq \big( n M^2 \sigma^2 \big)^{1/2} \leq n M \sigma, \\ C^2 &= \sum_{1 \leq i < j \leq n} \sum_{1 \leq q < r \leq n} \E\big[ h_{i j q r}(X_{i j}, X'_{qr})^2 \big] \leq n^3 \sigma^4, \\ D &\leq C \leq \sqrt{n^3 \sigma^4}, \end{align*}
%
where $D$ is the operator-norm quantity from Theorem~3.3 in \citet{gine2000exponential}, bounded by $C$ via the Cauchy--Schwarz inequality. Hence, applying that theorem to the decoupled and totally degenerate U-statistic $\tilde S_n$, for some universal constant $C_2 > 0$ and for all $t > 0$,
%
\begin{align*} \P\left( |\tilde S_n| \geq t \right) &\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{C^2}, \frac{t}{D}, \frac{t^{2/3}}{B^{2/3}}, \frac{t^{1/2}}{A^{1/2}} \right\} \right) \\ &\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right). \end{align*}
\proofparagraph{conclusion}
By the previous parts and absorbing constants into a new constant $C > 0$, we therefore have
%
\begin{align*} \P\left( |S_n| \geq t \right) &\leq C_1 \P\left( C_1 |\tilde S_n| \geq t \right) \\ &\leq C_1 C_2 \exp\left( -\frac{1}{C_2} \min \left\{ \frac{t^2}{n^3 \sigma^4 C_1^2}, \frac{t}{C_1 \sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma C_1)^{2/3}}, \frac{t^{1/2}}{M C_1^{1/2}} \right\} \right) \\ &\leq C \exp\left( -\frac{1}{C} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right). \end{align*}
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_covariance_estimation}]
Throughout this proof we will write $k_{i j}$ for $k_h(W_{i j},w)$ and $k_{i j}'$ for $k_h(W_{i j},w')$, in the interest of brevity. Similarly, we write $S_{i j r}$ to denote $S_{i j r}(w,w')$. The estimand and estimator are reproduced below for clarity.
%
\begin{align*} \Sigma_n(w,w') &= \frac{2}{n(n-1)} \E[k_{i j} k_{i j}'] + \frac{4(n-2)}{n(n-1)} \E[k_{i j} k_{i r}'] - \frac{4n-6}{n(n-1)} \E[k_{i j}] \E[k_{i j}'] \\ \hat \Sigma_n(w,w') &= \frac{2}{n(n-1)} \frac{2}{n(n-1)} \sum_{i < j} k_{i j} k_{i j}' + \frac{4(n-2)}{n(n-1)} \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r} - \frac{4n-6}{n(n-1)} \hat f_W(w) \hat f_W(w') \end{align*}
%
We bound the estimation error of each of the three terms separately.
\proofparagraph{the $\hat f_W(w) \hat f_W(w')$ term}
Consider the class $\cF$ of functions $a \mapsto \sqrt{n} \, l_n(a; w) / \sqrt{\Sigma_n(w,w)}$ indexed by $w \in \cW$, where $l_n(A_i; w)$ is the $i$th summand of the H\'ajek projection $L_n(w) = \sum_{i=1}^n l_n(A_i; w)$, so that $\sum_{i=1}^n f(A_i) = \sqrt{n} \, L_n(w) / \sqrt{\Sigma_n(w,w)}$ for the corresponding $f \in \cF$. By Lemma~\ref{lem:kernel_variance_bounds} with $\inf_\cW f_W(w) > 0$ and since $n h \gtrsim \log n$, the class $\cF$ has a constant envelope function given by $F(a) \lesssim \sqrt{n h}$. Clearly, $M = \sup_a F(a) \lesssim \sqrt{n h}$. Also by definition of $\Sigma_n$ and orthogonality of $L_n$, $Q_n$, and $E_n$, we have $\sup_{f \in \cF} \E[f(A_i)^2] \leq \sigma^2 = 1$. To verify a VC-type condition on $\cF$ we need to establish the regularity of the process.
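Both of these claims can be spelled out under the representation of $\cF$ above. The envelope bound holds because $\sup_a |l_n(a; w)| \lesssim n^{-1}$ whenever the relevant conditional densities are uniformly bounded, while $\Sigma_n(w,w)^{-1/2} \lesssim \sqrt{n^2 h}$ by Lemma~\ref{lem:kernel_variance_bounds}, so $|f(a)| \lesssim \sqrt{n} \cdot n^{-1} \cdot \sqrt{n^2 h} = \sqrt{n h}$. For the variance bound, orthogonality gives $\Var[L_n(w)] \leq \Var[\hat f_W(w)] = \Sigma_n(w,w)$, and hence
%
\begin{align*} \E\big[ f(A_i)^2 \big] &= \frac{n \Var[l_n(A_i; w)]}{\Sigma_n(w,w)} = \frac{\Var[L_n(w)]}{\Sigma_n(w,w)} \leq 1. \end{align*}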
By Lipschitz properties of $L_n$ and $\Sigma_n$ derived in the proofs of Lemma~\ref{lem:kernel_uniform_concentration} and Theorem~\ref{thm:kernel_infeasible_ucb} respectively, we have
%
\begin{align*} \left| \frac{L_n(w)} {\sqrt{\Sigma_n(w,w)}} - \frac{L_n(w')} {\sqrt{\Sigma_n(w',w')}} \right| &\lesssim \frac{\big|L_n(w) - L_n(w')\big|} {\sqrt{\Sigma_n(w,w)}} + \left| L_n(w') \right| \left| \frac{1} {\sqrt{\Sigma_n(w,w)}} - \frac{1} {\sqrt{\Sigma_n(w',w')}} \right| \\ &\lesssim \sqrt{n^2h} |w-w'| + \left| \frac{\Sigma_n(w,w) - \Sigma_n(w',w')} {\Sigma_n(w,w)\sqrt{\Sigma_n(w',w')}} \right| \\ &\lesssim \sqrt{n^2h} |w-w'| + (n^2h)^{3/2} \left| \Sigma_n(w,w) - \Sigma_n(w',w') \right| \\ &\lesssim \sqrt{n^2h} |w-w'| + (n^2h)^{3/2} n^{-1} h^{-3} |w-w'| \lesssim n^4 |w-w'|, \end{align*}
%
uniformly over $w,w' \in \cW$. By compactness of $\cW$ we have the covering number bound
%
$N(\cF, \|\cdot\|_\infty, \varepsilon) \lesssim N(\cW, |\cdot|, n^{-4} \varepsilon) \lesssim n^4 \varepsilon^{-1}$.
%
Thus by Lemma~\ref{lem:kernel_app_maximal_vc_inid},
%
\begin{align*} \E \left[ \sup_{w \in \cW} \left| \frac{L_n(w)} {\sqrt{\Sigma_n(w,w)}} \right| \right] &\lesssim \sqrt{\log n} + \frac{\sqrt{n h} \log n}{\sqrt{n}} \lesssim \sqrt{\log n}. \end{align*}
%
Since $\hat f_W - \E[\hat f_W] = L_n + Q_n + E_n$, combining this with the corresponding uniform bounds on $Q_n$ and $E_n$ from Lemma~\ref{lem:kernel_uniform_concentration} gives $\sup_{w \in \cW} \big| \hat f_W(w) - \E[\hat f_W(w)] \big| / \sqrt{\Sigma_n(w,w)} \lesssim_\P \sqrt{\log n}$, and therefore
%
\begin{align*} \sup_{w,w' \in \cW} \left| \frac{ \hat f_W(w) \hat f_W(w') - \E\big[k_{i j}\big] \E\big[k_{i j}'\big]} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \sqrt{\log n}. \end{align*}
\proofparagraph{decomposition of the $S_{i j r}$ term}
We first decompose the $S_{i j r}$ term into two parts, and obtain a pointwise concentration result for each. This is extended to a uniform concentration result by considering the regularity of the covariance estimator process. Note that $\E[S_{i j r}] = \E[k_{i j} k_{i r}']$, and hence
%
\begin{align*} &\frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \big( S_{i j r} - \E[S_{i j r}] \big) \\ &\quad= \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r}^{(1)} + \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r}^{(2)}, \end{align*}
%
where $S_{i j r}^{(1)} = S_{i j r} - \E[S_{i j r} \mid \bA_n]$ and $S_{i j r}^{(2)} = \E[S_{i j r} \mid \bA_n] - \E[S_{i j r}]$.
\proofparagraph{pointwise concentration of the $S_{i j r}^{(1)}$ term}
Write $\tilde k_{i j} = k_{i j} - \E[k_{i j} \mid \bA_n]$ and $\tilde k_{i r}' = k_{i r}' - \E[k_{i r}' \mid \bA_n]$. Since $k_{i j}$ and $k_{i r}'$ are conditionally independent given $\bA_n$, the $S_{i j r}^{(1)}$ term decomposes by symmetry into finitely many terms of the forms
%
\begin{align} &\frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \tilde k_{i j} \tilde k_{i r}', \label{eq:kernel_app_Sijr1_decomp1} \\ &\frac{2}{(n-1)(n-2)} \sum_{i < j} \tilde k_{i j} \cdot \frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n], \label{eq:kernel_app_Sijr1_decomp2} \\ &\frac{2}{(n-1)(n-2)} \sum_{i < r} \tilde k_{i r}' \cdot \frac{3}{n} \sum_{j=i+1}^{r-1} \E[k_{i j} \mid \bA_n]. \label{eq:kernel_app_Sijr1_decomp3} \end{align}
%
For the term in \eqref{eq:kernel_app_Sijr1_decomp1}, note that conditional on $\bA_n$ the summands are products of the mean-zero, mutually independent variables $\tilde k_{i j}$ and $\tilde k_{i r}'$, with conditional variances bounded by $\sigma^2 \lesssim h^{-1}$ and absolute values bounded by $M \lesssim h^{-1}$. Applying Lemma~\ref{lem:kernel_app_dyadic_concentration} conditionally on $\bA_n$ gives, for some fixed universal constant $C_1 > 0$:
%
\begin{align*} &\P\left( \left| \sum_{i < j < r} \tilde k_{i j} \tilde k_{i r}' \right| > t \biggm\vert \bA_n \right) \\ &\quad\leq C_1 \exp\left( -\frac{1}{C_1} \min \left\{ \frac{t^2}{n^3 \sigma^4}, \frac{t}{\sqrt{n^3 \sigma^4}}, \frac{t^{2/3}}{(n M \sigma)^{2/3}}, \frac{t^{1/2}}{M} \right\} \right) \\ &\quad\leq C_1 \exp\left( -\frac{1}{C_1} \min \left\{ \frac{t^2 h^2}{n^3}, \frac{t h}{\sqrt{n^3}}, \frac{t^{2/3} h}{n^{2/3}}, t^{1/2} h \right\} \right), \end{align*}
%
and therefore with $t \geq 1$ and since $n h \gtrsim \log n$, introducing and adjusting a new constant $C_2$ where necessary,
%
\begin{align*} &\P\left( \left| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \tilde k_{i j} \tilde k_{i r}' \right| > t \frac{\log n}{\sqrt{n^3 h^2}} \Bigm\vert \bA_n \right) \\ &\quad\leq \P\left( \left| \sum_{i < j < r} \tilde k_{i j} \tilde k_{i r}' \right| > t n^{3/2} h^{-1} \log n / 24 \Bigm\vert \bA_n \right) \\ &\quad\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ (t \log n)^2, t \log n, (t \log n)^{2/3} (n h)^{1/3}, (t n h \log n)^{1/2} n^{1/4} \right\} \right) \\ &\quad\leq C_2 \exp\left( -\frac{1}{C_2} \min \left\{ t \log n, t \log n, t^{2/3} \log n, t^{1/2} n^{1/4} \log n \right\} \right) \\ &\quad= C_2 \exp\left( -\frac{t^{2/3} \log n}{C_2} \right) = C_2 n^{-t^{2/3} / C_2}. \end{align*}
%
Now for the term in \eqref{eq:kernel_app_Sijr1_decomp2}, note that $\frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n]$ is $\bA_n$-measurable and bounded uniformly in $i,j$.
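Indeed, writing $f_{W \mid \bA_n}$ for the conditional density of $W_{i r}$ given $\bA_n$, which is uniformly bounded under the assumptions maintained throughout these proofs,
%
\begin{align*} \Bigg| \frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n] \Bigg| &\leq 3 \max_{j < r \leq n} \left| \int_\R k_h(s, w') f_{W \mid \bA_n}(s) \diff{s} \right| \leq 3 \sup_{s} f_{W \mid \bA_n}(s) \int_\R \big| k_h(s, w') \big| \diff{s} \lesssim 1, \end{align*}
%
since the kernel integrates to a bounded quantity by construction.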
Also, using the previously established conditional variance and almost sure bounds on $k_{i j}$, Bernstein's inequality (Lemma~\ref{lem:kernel_app_bernstein}) applied conditionally gives for some constant $C_3 > 0$
%
\begin{align*} &\P\left( \Bigg| \frac{2}{(n-1)(n-2)} \sum_{i=1}^{n-2} \sum_{j=i+1}^{n-1} \Big( k_{i j} - \E[k_{i j} \mid \bA_n] \Big) \cdot \frac{3}{n} \sum_{r=j+1}^n \E[k_{i r}' \mid \bA_n] \Bigg| > t \sqrt{\frac{\log n}{n^2h}} \Bigm\vert \bA_n \right) \\ &\qquad\leq 2 \exp \left( - \frac{t^2 n^2 \log n / (n^2h)} {C_3/(2h) + C_3 t \sqrt{\log n / (n^2h)} / (2h)} \right) \\ &\qquad= 2 \exp \left( - \frac{t^2 \log n} {C_3/2 + C_3 t \sqrt{\log n / (n^2h)} / 2} \right) \leq 2 \exp \left( - \frac{t^2 \log n}{C_3} \right) = 2 n^{-t^2 / C_3}. \end{align*}
%
The term in \eqref{eq:kernel_app_Sijr1_decomp3} is controlled in exactly the same way. Putting these together, noting the symmetry in $i,j,r$ and taking a marginal expectation, we obtain the unconditional pointwise concentration inequality
%
\begin{align*} \P\left( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r}^{(1)} \Bigg| > t \frac{\log n}{\sqrt{n^3h^2}} + t \sqrt{\frac{\log n}{n^2h}} \right) &\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)}. \end{align*}
%
Multiplying by $\big(\Sigma_n(w,w) + \Sigma_n(w',w')\big)^{-1/2} \lesssim \sqrt{n^2h}$ gives (adjusting constants if necessary)
%
\begin{align*} &\P\left( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \frac{S_{i j r}^{(1)}} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \frac{\log n}{\sqrt{n h}} + t \sqrt{\log n} \right) \\ &\quad\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)}. \end{align*}
\proofparagraph{pointwise concentration of the $S_{i j r}^{(2)}$ term}
We apply the U-statistic concentration inequality from Lemma~\ref{lem:kernel_app_ustat_concentration}. Note that the terms $\E[S_{i j r} \mid \bA_n]$ are permutation-symmetric functions of the random variables $A_i, A_j$, and $A_r$ only, making $S_{i j r}^{(2)}$ the summands of a (non-degenerate) mean-zero third-order U-statistic. While we could apply a third-order Hoeffding decomposition here to achieve degeneracy, it is unnecessary as Lemma~\ref{lem:kernel_app_ustat_concentration} is general enough to deal with the non-degenerate case directly. The quantity of interest here is
%
\begin{align*} \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r}^{(2)}. \end{align*}
%
Since $\big| S_{i j r}^{(2)} \big| \lesssim 1$ and the variance of its H\'ajek projection satisfies $\Var\big[ \E[ S_{i j r}^{(2)} \mid A_i ] \big] \lesssim n \big( \Sigma_n(w,w) + \Sigma_n(w',w') \big)$, Lemma~\ref{lem:kernel_app_ustat_concentration} gives
%
\begin{align*} &\P\left( \left| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} S_{i j r}^{(2)} \right| > t \sqrt{\log n} \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')} \right) \\ &\quad\leq 4 \exp \left( - \frac{n t^2 (\Sigma_n(w,w) + \Sigma_n(w',w')) \log n} {C_4 (n\Sigma_n(w,w) + n\Sigma_n(w',w')) + C_4 t \sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}\sqrt{\log n}} \right) \\ &\quad\leq 4 \exp \left( - \frac{t^2 \log n} {C_4 + C_4 t (\Sigma_n(w,w) + \Sigma_n(w',w'))^{-1/2} \sqrt{\log n} / n} \right) \\ &\quad\leq 4 \exp \left( - \frac{t^2 \log n} {C_4 + C_4 t \sqrt{h}} \right) \leq 4 n^{-t^2 / C_4} \end{align*}
%
for some universal constant $C_4 > 0$ (which may change from line to line), since the order of this U-statistic is fixed at three.
\proofparagraph{concentration of the $S_{i j r}$ term on a mesh}
Pick $\delta_n \to 0$ with $\log 1/\delta_n \lesssim \log n$. Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality $O(1/\delta_n)$. Then $\cW_\delta \times \cW_\delta$ is a $2\delta_n$-covering of $\cW \times \cW$ with cardinality $O(1/\delta_n^2)$, under the Manhattan metric $d\big((w_1, w_1'), (w_2, w_2')\big) = |w_1 - w_2| + |w_1' - w_2'|$. By the previous parts, we have that for fixed $w$ and $w'$:
%
\begin{align*} &\P\Bigg( \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \frac{S_{i j r} - \E[S_{i j r}]} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \frac{\log n}{\sqrt{n h}} + 2t \sqrt{\log n} \Bigg) \\ &\quad\leq C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)} + 4 n^{-t^2 / C_4}.
\end{align*}
%
Taking a union bound over $\cW_\delta \times \cW_\delta$, noting that $n h \gtrsim \log n$ and adjusting constants gives
%
\begin{align*} &\P\Bigg( \sup_{w, w' \in \cW_\delta} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \frac{S_{i j r} - \E[S_{i j r}]} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > t \sqrt{\log n} \Bigg) \\ &\quad\lesssim \delta_n^{-2} \Big( C_2 n^{-t^{2/3} / C_2} + 4 n^{-t^2 / (4C_3)} + 4 n^{-t^2 / C_4} \Big) \lesssim \delta_n^{-2} n^{-t^{2/3} / C_5}, \end{align*}
%
for some constant $C_5 > 0$.
\proofparagraph{regularity of the $S_{i j r}$ term}
Next we bound the fluctuations in $S_{i j r}(w,w')$. Writing $k_{i j}(w)$ for $k_h(W_{i j},w)$, note that
%
\begin{align*} \big| k_{i j}(w_1) k_{i r}(w_1') - k_{i j}(w_2) k_{i r}(w_2') \big| &\lesssim \frac{1}{h} \big| k_{i j}(w_1) - k_{i j}(w_2) \big| + \frac{1}{h} \big| k_{i r}(w_1') - k_{i r}(w_2') \big| \\ &\lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big), \end{align*}
%
by the Lipschitz property of the kernel, and similarly for the other summands in $S_{i j r}$. Therefore,
%
\begin{align*} \sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| S_{i j r}(w_1, w_1') - S_{i j r}(w_2, w_2') \big| &\lesssim \delta_n h^{-3}. \end{align*}
%
Also as noted in the proof of Theorem~\ref{thm:kernel_infeasible_ucb},
%
\begin{align*} \sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| \Sigma_n(w_1,w_1') - \Sigma_n(w_2, w_2') \big| &\lesssim \delta_n n^{-1}h^{-3}. \end{align*}
%
Therefore, since $\Sigma_n(w,w)^{-1/2} \lesssim \sqrt{n^2h}$ and $|S_{i j r}| \lesssim h^{-2}$, using $\frac{a}{\sqrt b} - \frac{c}{\sqrt d} = \frac{a-c}{\sqrt b} + c \frac{d-b}{\sqrt{b d} \, (\sqrt{b} + \sqrt{d})}$,
%
\begin{align*} &\sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \left| \frac{S_{i j r}(w_1, w_1')} {\sqrt{\Sigma_n(w_1,w_1) + \Sigma_n(w_1',w_1')}} - \frac{S_{i j r}(w_2, w_2')} {\sqrt{\Sigma_n(w_2,w_2) + \Sigma_n(w_2',w_2')}} \right| \\ &\quad\lesssim \delta_n h^{-3} \sqrt{n^2h} + h^{-2} \delta_n n^{-1} h^{-3} (n^2h)^{3/2} \lesssim \delta_n n h^{-5/2} + \delta_n n^{2} h^{-7/2} \lesssim \delta_n n^{6}, \end{align*}
%
where in the last line we use that $1/h \lesssim n$.
\proofparagraph{uniform concentration of the $S_{i j r}$ term}
By setting $\delta_n = n^{-6} \sqrt{\log n}$, the fluctuations can be at most $\sqrt{\log n}$, so we have for $t \geq 1$
%
\begin{align*} &\P\Bigg( \sup_{w, w' \in \cW} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \frac{S_{i j r} - \E[S_{i j r}]} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| > 2t \sqrt{\log n} \Bigg) \\ &\quad\lesssim \delta_n^{-2} n^{-t^{2/3} / C_5} \lesssim n^{12-t^{2/3} / C_5}. \end{align*}
%
This converges to zero for any sufficiently large $t$, so
%
\begin{align*} \sup_{w, w' \in \cW} \Bigg| \frac{6}{n(n-1)(n-2)} \sum_{i < j < r} \frac{S_{i j r} - \E[S_{i j r}]} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \Bigg| &\lesssim_\P \sqrt{\log n}. \end{align*}
\proofparagraph{decomposition of the $k_{i j} k_{i j}'$ term}
As with the $S_{i j r}$ term, decompose
%
\begin{align*} \frac{2}{n(n-1)} \sum_{i < j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) &= \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(1)} + \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(2)}, \end{align*}
%
where $S_{i j}^{(1)} = k_{i j} k_{i j}' - \E[k_{i j} k_{i j}' \mid \bA_n]$ and $S_{i j}^{(2)} = \E[k_{i j} k_{i j}' \mid \bA_n] - \E[k_{i j} k_{i j}']$.
\proofparagraph{pointwise concentration of the $S_{i j}^{(1)}$ term}
Conditional on $\bA_n$, the variables $S_{i j}^{(1)}$ are mutually independent and mean-zero, with $\big|S_{i j}^{(1)}\big| \lesssim h^{-2}$ and $\Var\big[S_{i j}^{(1)} \mid \bA_n\big] \lesssim h^{-3}$. Hence Bernstein's inequality (Lemma~\ref{lem:kernel_app_bernstein}) applied conditionally gives
%
\begin{align*} &\P\left( \left| \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(1)} \right| > t \sqrt{\frac{\log n}{n^2h^3}} \Bigm\vert \bA_n \right) \\ &\quad\leq 2 \exp\left( - \frac{t^2 n^2 \log n / (n^2h^3)} {C_6 h^{-3} / 2 + C_6 t h^{-2} \sqrt{\log n / (n^2h^3)} / 2} \right) \\ &\quad\leq 2 \exp\left( - \frac{t^2 \log n} {C_6 / 2 + C_6 t \sqrt{\log n / (n^2h)} / 2} \right) \leq 2 \exp\left( - \frac{t^2 \log n}{C_6} \right) = 2 n^{-t^2 / C_6}, \end{align*}
%
where $C_6$ is a universal positive constant.
\proofparagraph{pointwise concentration of the $S_{i j}^{(2)}$ term}
We apply the U-statistic concentration inequality from Lemma~\ref{lem:kernel_app_ustat_concentration}. Note that $S_{i j}^{(2)}$ are permutation-symmetric functions of the random variables $A_i$ and $A_j$ only, making them the summands of a (non-degenerate) mean-zero second-order U-statistic.
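The restriction to $A_i$ and $A_j$ holds because, in the dyadic structure underlying these results, $W_{i j}$ is generated from $(A_i, A_j)$ and idiosyncratic noise that is independent across pairs, so that $\E[k_{i j} k_{i j}' \mid \bA_n] = \E[k_{i j} k_{i j}' \mid A_i, A_j]$ is a symmetric function of $(A_i, A_j)$ alone.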
Note that $\big|S_{i j}^{(2)}\big| \lesssim h^{-1}$ and so trivially $\E\big[\E[S_{i j}^{(2)} \mid A_i ]^2 \big] \lesssim h^{-2}$. Thus by Lemma~\ref{lem:kernel_app_ustat_concentration}, since the order of this U-statistic is fixed at two, for some universal positive constant $C_7$ we have
%
\begin{align*} \P\left( \Bigg| \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(2)} \Bigg| > t \sqrt{\frac{\log n}{n h^2}} \right) &\leq 2 \exp\left( - \frac{t^2 n \log n / (n h^2)} {C_7 h^{-2} / 2 + C_7 t h^{-1} \sqrt{\log n / (n h^2)} / 2} \right) \\ &\leq 2 \exp\left( - \frac{t^2 \log n} {C_7 / 2 + C_7 t \sqrt{\log n / n} / 2} \right) \\ &\leq 2 \exp\left( - \frac{t^2 \log n}{C_7} \right) = 2 n^{-t^2 / C_7}. \end{align*}
\proofparagraph{concentration of the $k_{i j}k_{i j}'$ term on a mesh}
As before, use a union bound on the mesh $\cW_\delta \times \cW_\delta$.
%
\begin{align*} &\P\left( \sup_{w,w' \in \cW_\delta} \left| \frac{2}{n(n-1)} \sum_{i < j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + t \sqrt{\frac{\log n}{n h^2}} \right) \\ &\ \leq \P\!\left( \!\sup_{w,w' \in \cW_\delta} \Bigg| \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(1)} \Bigg| > t \sqrt{\frac{\log n}{n^2h^3}} \right) \! + \P\!\left( \!\sup_{w,w' \in \cW_\delta} \Bigg| \frac{2}{n(n-1)} \sum_{i < j} S_{i j}^{(2)} \Bigg| > t \sqrt{\frac{\log n}{n h^2}} \right) \\ &\ \lesssim \delta_n^{-2} n^{-t^2 / C_6} + \delta_n^{-2} n^{-t^2 / C_7}. \end{align*}
\proofparagraph{regularity of the $k_{i j}k_{i j}'$ term}
As for the $S_{i j r}$ term,
%
$\big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| \lesssim \frac{1}{h^3} \Big( |w_1 - w_2| + |w_1' - w_2'| \Big)$.
\proofparagraph{uniform concentration of the $k_{i j}k_{i j}'$ term}
Setting $\delta_n = h^3\sqrt{\log n / (n h^2)}$, the fluctuations are at most $\sqrt{\log n / (n h^2)}$, so for $t \geq 1$
%
\begin{align*} &\P\left( \sup_{w,w' \in \cW} \left| \frac{2}{n(n-1)} \sum_{i < j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + 2t \sqrt{\frac{\log n}{n h^2}} \right) \\ &\quad\leq \P\left( \sup_{w,w' \in \cW_\delta} \left| \frac{2}{n(n-1)} \sum_{i < j} \big( k_{i j} k_{i j}' - \E[k_{i j} k_{i j}'] \big) \right| > t \sqrt{\frac{\log n}{n^2h^3}} + t \sqrt{\frac{\log n}{n h^2}} \right) \\ &\qquad+ \P\left( \sup_{|w_1-w_2| \leq \delta_n} \sup_{|w_1'-w_2'| \leq \delta_n} \big| k_{i j}(w_1) k_{i j}(w_1') - k_{i j}(w_2) k_{i j}(w_2') \big| > t \sqrt{\frac{\log n}{n h^2}} \right) \\ &\quad\lesssim \delta_n^{-2} n^{-t^2 / C_6} + \delta_n^{-2} n^{-t^2 / C_7} \lesssim n^{1-t^2 / C_6} h^{-4} + n^{1-t^2 / C_7} h^{-4} \lesssim n^{5-t^2 / C_8}, \end{align*}
%
where $C_8 > 0$ is a constant and in the last line we use $1/h \lesssim n$. This converges to zero for any sufficiently large $t$, so by Lemma~\ref{lem:kernel_variance_bounds} we have
%
\begin{align*} \sup_{w,w' \in \cW} \left| \frac{2}{n(n-1)} \sum_{i < j} \frac{k_{i j} k_{i j}' - \E[k_{i j} k_{i j}']} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \sqrt{n^2 h} \left( \sqrt{\frac{\log n}{n^2h^3}} + \sqrt{\frac{\log n}{n h^2}} \right) \lesssim \frac{\sqrt{\log n}}{h} + \sqrt{\frac{n \log n}{h}}. \end{align*}
\proofparagraph{conclusion}
Combining the previous parts with the coefficients appearing in the definitions of $\Sigma_n$ and $\hat \Sigma_n$, which are of orders $n^{-2}$, $n^{-1}$, and $n^{-1}$ respectively, and using $n h \gtrsim \log n$, we conclude that
%
\begin{align*} \sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\lesssim_\P \frac{\sqrt{\log n}}{n}. \end{align*}
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_sdp}]
For a positive semi-definite function $M$ on $\cW \times \cW$, define the objective function
%
\begin{align*} \objective(M) &= \sup_{w,w' \in \cW} \left| \frac{M(w,w') - \hat \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right|, \end{align*}
%
and let $\objective^*$ denote its infimum over the feasible set of positive semi-definite functions. Since $\Sigma_n$ is feasible, $\objective^* \leq \objective(\Sigma_n)$, and by Lemma~\ref{lem:kernel_app_covariance_estimation}, $\objective(\Sigma_n) \lesssim_\P \frac{\sqrt{\log n}}{n}$. The infimum need not be attained, but for any $\varepsilon > 0$ there exists a feasible function $M_\varepsilon$ with $\objective(M_\varepsilon) \leq \objective^* + \varepsilon$, and we call such a solution $\varepsilon$-optimal. Let $\hat \Sigma_n^+$ be an $n^{-1}$-optimal solution. Then
%
\begin{align*} \objective(\hat \Sigma_n^+) &\leq \objective^* + n^{-1} \leq \objective(\Sigma_n) + n^{-1}. \end{align*}
%
Thus by the triangle inequality,
%
\begin{align*} \sup_{w,w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| &\leq \objective(\hat \Sigma_n^+) + \objective(\Sigma_n) \leq 2 \, \objective(\Sigma_n) + n^{-1} \lesssim_\P \frac{\sqrt{\log n}}{n}. \end{align*}
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_variance_estimator_bounds}]
Since $\hat \Sigma_n^+$ is positive semi-definite, we must have $\hat \Sigma_n^+(w,w) \geq 0$.
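This is immediate from the defining property of positive semi-definite functions: for any points $w_1, \ldots, w_m \in \cW$ and reals $c_1, \ldots, c_m$ we have $\sum_{s=1}^m \sum_{t=1}^m c_s c_t \hat \Sigma_n^+(w_s, w_t) \geq 0$, and taking $m = 1$, $c_1 = 1$, and $w_1 = w$ gives $\hat \Sigma_n^+(w,w) \geq 0$.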
Now Lemma~\ref{lem:kernel_app_sdp} implies that for all $\varepsilon \in (0,1)$ there exists a $C_\varepsilon$ such that % \begin{align*} &\P\left( \Sigma_n(w,w) - C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)} \leq \hat \Sigma_n^+(w,w) \right. \\ &\left. \qquad\leq \Sigma_n(w,w) + C_\varepsilon \frac{\sqrt{\log n}}{n} \sqrt{\Sigma_n(w,w)}, \quad \forall w \in \cW \right) \geq 1-\varepsilon. \end{align*} % Consider the function $g_a(t) = t - a \sqrt{t}$ and note that it is increasing on $\{t \geq a^2/4\}$. Applying this with $t = \Sigma_n(w,w)$ and $a = \frac{\sqrt{\log n}}{n}$, noting that by Lemma~\ref{lem:kernel_variance_bounds} we have $t = \Sigma_n(w,w) \gtrsim \frac{1}{n^2h} \gg \frac{\log n}{4n^2} = a^2/4$, shows that for $n$ large enough, % \begin{align*} \inf_{w \in \cW} \Sigma_n(w,w) - \frac{\sqrt{\log n}}{n} \sqrt{\inf_{w \in \cW} \Sigma_n(w,w)} \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \sup_{w \in \cW} \Sigma_n(w,w) + \frac{\sqrt{\log n}}{n} \sqrt{\sup_{w \in \cW} \Sigma_n(w,w)}. \end{align*} % Applying the bounds from Lemma~\ref{lem:kernel_variance_bounds} yields % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{\sqrt{\log n}}{n} \left( \frac{\Dl}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w), \\ \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \frac{\Du^2}{n} + \frac{1}{n^2h} + \frac{\sqrt{\log n}}{n} \left( \frac{\Du}{\sqrt n} + \frac{1}{\sqrt{n^2h}} \right) \end{align*} % and so % \begin{align*} \frac{\Dl^2}{n} + \frac{1}{n^2h} \lesssim_\P \inf_{w \in \cW}\hat \Sigma_n^+(w,w) \leq \sup_{w \in \cW}\hat \Sigma_n^+(w,w) \lesssim_\P \frac{\Du^2}{n} + \frac{1}{n^2h}. \end{align*} \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_sdp}] See Lemma~\ref{lem:kernel_app_covariance_estimation} and Lemma~\ref{lem:kernel_app_sdp}. \end{proof} \begin{proof}[Lemma~\ref{lem:kernel_app_studentized_t_statistic}] % We have % \begin{align*} &\sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| = \sup_{w \in \cW} \bigg\{ \left| \hat f_W(w) - f_W(w) \right| \cdot \bigg| \frac{1} {\hat\Sigma_n^+(w,w)^{1/2}} - \frac{1}{\Sigma_n(w,w)^{1/2}} \bigg| \bigg\} \\ &\quad\leq \sup_{w \in \cW} \left| \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} {\sqrt{\Sigma_n(w,w)}} + \frac{\E\big[\hat f_W(w)\big] - f_W(w)} {\sqrt{\Sigma_n(w,w)}} \right| \cdot \sup_{w \in \cW} \left| \frac{\hat\Sigma_n^+(w,w) - \Sigma_n(w,w)} {\sqrt{\Sigma_n(w,w) \hat\Sigma_n^+(w,w)}} \right|. \end{align*} % Now from the proof of Lemma~\ref{lem:kernel_app_covariance_estimation} we have that $\sup_{w \in \cW} \left| \frac{\hat f_W(w) - \E\big[\hat f_W(w)\big]} {\sqrt{\Sigma_n(w,w)}} \right| \lesssim_\P \sqrt{\log n}$, while Theorem~\ref{thm:kernel_bias} gives $\sup_{w \in \cW} \big| \E\big[\hat f_W(w)\big] - f_W(w) \big| \lesssim h^{p \wedge \beta}$. By Lemma~\ref{lem:kernel_variance_bounds}, note that $\sup_{w \in \cW} \Sigma_n(w,w)^{-1/2} \lesssim \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$, and $\sup_{w \in \cW} \hat \Sigma_n^+(w,w)^{-1/2} \lesssim_\P \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}}$ by Lemma~\ref{lem:kernel_app_variance_estimator_bounds}. 
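These two normalizations are interchangeable through the elementary identity
%
\begin{align*} \frac{\Dl}{\sqrt{n}} + \frac{1}{\sqrt{n^2 h}} &= \frac{1}{\sqrt{n}} \left( \Dl + \frac{1}{\sqrt{n h}} \right), \end{align*}
%
which is used below to pass between the two forms of the denominator.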
Thus, applying Lemma~\ref{lem:kernel_app_sdp} to control the covariance estimation error,
%
\begin{align*} \sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| &\lesssim_\P \left( \sqrt{\log n} + \frac{h^{p \wedge \beta}}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \right) \frac{\sqrt{\log n}}{n} \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \\ &\lesssim_\P \sqrt{\frac{\log n}{n}} \left( \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \right) \frac{1}{\Dl + 1/\sqrt{n h}}. \end{align*}
\end{proof}
\begin{proof}[%
Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}]
Firstly, the process $\hat Z_n^T$ exists: $\hat \Sigma_n^+(w,w')$, and therefore also $\frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}$, is a positive semi-definite function, so the Kolmogorov consistency theorem \citep{gine2021mathematical} guarantees the existence of a process with this covariance structure. To obtain the desired Kolmogorov--Smirnov result we discretize and use the Gaussian--Gaussian comparison result found in Lemma~3.1 in \citet{chernozhukov2013gaussian}.
\proofparagraph{bounding the covariance discrepancy}
Define the maximum discrepancy in the (conditional) covariances of $\hat Z_n^T$ and $Z_n^T$ by
%
\begin{align*} \Delta &\vcentcolon= \sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} - \frac{\Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} \right|. \end{align*}
%
This variable can be bounded in probability in the following manner. First note that by the Cauchy--Schwarz inequality for covariances, $|\Sigma_n(w,w')| \leq \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}$. Hence
%
\begin{align*} \Delta &\leq \sup_{w, w' \in \cW} \left\{ \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right| + \left| \frac{\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')} - \sqrt{\Sigma_n(w,w) \Sigma_n(w',w')}} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right| \right\} \\ &\leq \sup_{w, w' \in \cW} \left\{ \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} {\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| \right\} \\ &\quad+ \sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w)\hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right|. \end{align*}
%
For the first term, note that $\inf_{w \in \cW} \hat \Sigma_n^+(w,w) \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}$ by Lemma~\ref{lem:kernel_app_variance_estimator_bounds} and also $\sup_{w \in \cW} \left|\frac{\hat \Sigma_n(w,w)}{\Sigma_n(w,w)} - 1\right| \lesssim_\P \sqrt{h \log n}$ by the proof of Lemma~\ref{lem:kernel_app_sdp}. Thus by Lemma~\ref{lem:kernel_app_sdp},
%
\begin{align*} &\sup_{w, w' \in \cW} \left\{ \sqrt{\frac{\Sigma_n(w,w) + \Sigma_n(w',w')} {\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \left| \frac{\hat \Sigma_n^+(w,w') - \Sigma_n(w,w')} {\sqrt{\Sigma_n(w,w) + \Sigma_n(w',w')}} \right| \right\} \\ &\quad\lesssim_\P \frac{\sqrt{\log n}}{n} \frac{1}{\Dl/\sqrt{n} + 1/\sqrt{n^2h}} \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}.
\end{align*} % For the second term, we have by the same bounds % \begin{align*} &\sup_{w, w' \in \cW} \left| \frac{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') - \Sigma_n(w,w) \Sigma_n(w',w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right| \\ &\quad\leq \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big| \hat \Sigma_n^+(w',w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right\} \\ &\qquad+ \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big| \Sigma_n(w,w)} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w') \Sigma_n(w,w) \Sigma_n(w',w')}} \right\} \\ &\quad\leq \sup_{w, w' \in \cW} \left\{ \frac{\big| \hat \Sigma_n^+(w,w) - \Sigma_n(w,w)\big|} {\sqrt{\Sigma_n(w,w)}} \frac{\sqrt{\hat \Sigma_n^+(w',w')}} {\sqrt{\hat \Sigma_n^+(w,w) \Sigma_n(w',w')}} \right\} \\ &\qquad+ \!\sup_{w, w' \in \cW}\! \left\{ \frac{\big| \hat \Sigma_n^+(w',w') - \Sigma_n(w',w')\big|} {\sqrt{\Sigma_n(w',w')}} \frac{\sqrt{\Sigma_n(w,w)}} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}} \right\} \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}. \end{align*} % Therefore $\Delta \lesssim_\P \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}}$. \proofparagraph{Gaussian comparison on a mesh} Let $\cW_\delta$ be a $\delta_n$-covering of $\cW$ with cardinality $O(1/\delta_n)$, where $1/\delta_n$ is at most polynomial in $n$. The scaled (conditionally) Gaussian processes $Z_n^T$ and $\hat Z_n^T$ both have pointwise (conditional) variances of 1. Therefore, by Lemma~3.1 in \citet{chernozhukov2013gaussian}, % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW_\delta} Z_n^T(w) \leq t \right) - \P\left( \sup_{w \in \cW_\delta} \hat Z_n^T(w) \leq t \Bigm\vert \bW_n \right) \right| &\lesssim \Delta^{1/3} \Big( 1 \vee \log \frac{1}{\Delta \delta_n} \Big)^{2/3} \end{align*} % uniformly in the data. By the previous part and since $x (\log 1/x)^2$ is increasing on $\big(0, e^{-2}\big)$, % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW_\delta} Z_n^T(w) \leq t \right) - \P\left( \sup_{w \in \cW_\delta} \hat Z_n^T(w) \leq t \Bigm\vert \bW_n \right) \right| \\ &\quad\lesssim_\P \left( \sqrt{\frac{\log n}{n}} \frac{1}{\Dl + 1/\sqrt{n h}} \right)^{1/3} (\log n)^{2/3} \lesssim_\P \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*} \proofparagraph{trajectory regularity of $Z_n^T$} In the proof of Theorem~\ref{thm:kernel_infeasible_ucb} we established that $Z_n^T$ satisfies the regularity property % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \big| Z_n^T(w) - Z_n^T(w') \big| \right] &\lesssim n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{conditional $L^2$ regularity of $\hat Z_n^T$} By Lemma~\ref{lem:kernel_app_sdp}, with $n h \gtrsim \log n$, we have uniformly in $w,w'$, % \begin{align*} \big| \hat \Sigma_n^+(w,w') - \hat \Sigma_n^+(w,w) \big| &\lesssim n^{-1} h^{-3} |w-w'|. \end{align*} % Taking $\delta_n \leq n^{-2} h^2$, Lemma~\ref{lem:kernel_app_variance_estimator_bounds} gives % \begin{align*} \inf_{|w-w'| \leq \delta_n} \hat \Sigma_n^+(w,w') \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - n^{-1} h^{-3} \delta_n \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h} - \frac{1}{n^3h} \gtrsim \frac{\Dl^2}{n} + \frac{1}{n^2h}. 
\end{align*} % The conditional $L^2$ regularity of $\hat Z_n^T$ is % \begin{align*} \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right] &= 2 - 2 \frac{\hat \Sigma_n^+(w,w')} {\sqrt{\hat \Sigma_n^+(w,w) \hat \Sigma_n^+(w',w')}}. \end{align*} % Applying the same elementary result as for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb} yields % \begin{align*} \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right] &\lesssim_\P n^2 h^{-2} |w-w'|. \end{align*} % Thus the conditional semimetric induced by $\hat Z_n^T$ on $\cW$ is % \begin{align*} \hat\rho(w,w') &\vcentcolon= \E\left[ \big( \hat Z_n^T(w) - \hat Z_n^T(w') \big)^2 \bigm\vert \bW_n \right]^{1/2} \lesssim_\P n h^{-1} \sqrt{|w-w'|}. \end{align*} \proofparagraph{conditional trajectory regularity of $\hat Z_n^T$} As for $Z_n^T$ in the proof of Theorem~\ref{thm:kernel_infeasible_ucb}, we apply Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, now conditionally, to obtain % \begin{align*} \E\left[ \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \Bigm\vert \bW_n \right] &\lesssim_\P n h^{-1} \sqrt{\delta_n \log n}, \end{align*} % whenever $1/\delta_n$ is at most polynomial in $n$. \proofparagraph{uniform Gaussian comparison} Now we use the trajectory regularity properties to extend the Gaussian--Gaussian comparison result from a finite mesh to all of $\cW$. Write the previously established approximation rate as % \begin{align*} r_n &= \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*} % Take $\varepsilon_n > 0$ and observe that uniformly in $t \in \R$, % \begin{align*} &\P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW_\delta} \big| \hat Z_n^T(w) \big| \leq t + \varepsilon_n \Bigm\vert \bW_n \right) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW_\delta} \big| Z_n^T(w) \big| \leq t + \varepsilon_n \right) + O_\P(r_n) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t + 2\varepsilon_n \right) + O_\P(r_n) + \P\left( \sup_{|w-w'| \leq \delta_n} \left| Z_n^T(w) - Z_n^T(w') \right| \geq \varepsilon_n \right) \\ &\qquad+ \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t + 2\varepsilon_n \right) + O_\P(r_n) + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ &\quad\leq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) \\ &\qquad+ O_\P(r_n) + O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). 
\end{align*}
%
The converse inequality is obtained analogously as follows:
%
\begin{align*} &\P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) \\ &\quad\geq \P\left( \sup_{w \in \cW_\delta} \big| \hat Z_n^T(w) \big| \leq t - \varepsilon_n \Bigm\vert \bW_n \right) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\geq \P\left( \sup_{w \in \cW_\delta} \big| Z_n^T(w) \big| \leq t - \varepsilon_n \right) - O_\P(r_n) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t - 2\varepsilon_n \right) - O_\P(r_n) - \P\left( \sup_{|w-w'| \leq \delta_n} \left| Z_n^T(w) - Z_n^T(w') \right| \geq \varepsilon_n \right) \\ &\qquad- \P\left( \sup_{|w-w'| \leq \delta_n} \left| \hat Z_n^T(w) - \hat Z_n^T(w') \right| \geq \varepsilon_n \Bigm\vert \bW_n \right) \\ &\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t - 2\varepsilon_n \right) - O_\P(r_n) - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}) \\ &\quad\geq \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) \\ &\qquad- O_\P(r_n) - O_\P(\varepsilon_n^{-1} n h^{-1} \sqrt{\delta_n \log n}). \end{align*}
%
Combining these uniform upper and lower bounds gives
%
\begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| \\ &\qquad\lesssim_\P \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq 2\varepsilon_n \right) + r_n + \varepsilon_n^{-1} n h^{-1} \delta_n^{1/2} \sqrt{\log n}. \end{align*}
%
For the remaining term, apply anti-concentration for $Z_n^T$ from the proof of Theorem~\ref{thm:kernel_infeasible_ucb}:
%
\begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| - t \right| \leq \varepsilon \right) &\lesssim \varepsilon \sqrt{\log n}. \end{align*}
%
Therefore
%
\begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| \\ &\qquad\lesssim_\P \varepsilon_n \sqrt{\log n} + r_n + \varepsilon_n^{-1} n h^{-1} \delta_n^{1/2} \sqrt{\log n}. \end{align*}
%
Taking $\varepsilon_n = r_n / \sqrt{\log n}$ and then $\delta_n = n^{-2} h^2 r_n^2 \varepsilon_n^2 / \log n$ yields
%
\begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \leq t \Bigm\vert \bW_n \right) - \P\left( \sup_{w \in \cW} \big| Z_n^T(w) \big| \leq t \right) \right| &\lesssim_\P r_n = \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}}. \end{align*}
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}]
\proofparagraph{Kolmogorov--Smirnov approximation}
Let $Z_n^T$ and $\hat Z_n^T$ be defined as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. Write
%
\begin{align*} r_n &= \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}} \end{align*}
%
for the rate of approximation from Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}.
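To gauge this rate in the two regimes of interest: if $\Dl$ is bounded away from zero then $r_n \asymp n^{-1/6} (\log n)^{5/6}$, while in the totally degenerate case $\Dl = 0$,
%
\begin{align*} r_n &= n^{-1/6} (\log n)^{5/6} (n h)^{1/6} = h^{1/6} (\log n)^{5/6}, \end{align*}
%
which vanishes whenever $h (\log n)^{5} \to 0$.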
For any $\varepsilon_n > 0$ and uniformly in $t \in \R$: % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t - \varepsilon_n \right) + \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) + \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) \\ &\qquad+ \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) + O_\P(r_n) \\ &\quad\leq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) + \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) + \varepsilon_n \sqrt{\log n} + O_\P(r_n), \end{align*} % where in the last line we used the anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration} applied to $Z_n^T$, as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}. The corresponding lower bound is as follows: % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| Z_n^T(w) \right| \leq t + \varepsilon_n \right) - \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) \\ &\qquad- \P\left( \left| \sup_{w \in \cW} \big| Z_n^T(w) \big| -t \right| \leq \varepsilon_n \right) - O_\P(r_n) \\ &\quad\geq \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \geq \varepsilon_n \right) - \varepsilon_n \sqrt{\log n} - O_\P(r_n). \end{align*} \proofparagraph{$t$-statistic approximation} To control the remaining term, note that by Theorem~\ref{thm:kernel_strong_approx_Tn} and Lemma~\ref{lem:kernel_app_studentized_t_statistic}, % \begin{align*} &\sup_{w \in \cW} \left| \hat T_n(w) - Z_n^T(w) \right| \\ &\quad\leq \sup_{w \in \cW} \left| \hat T_n(w) - T_n(w) \right| + \sup_{w \in \cW} \left| T_n(w) - Z_n^T(w) \right| \\ &\quad\lesssim_\P \sqrt{\frac{\log n}{n}} \left( \sqrt{\log n} + \frac{\sqrt n h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \right) \frac{1}{\Dl + 1/\sqrt{n h}} \\ &\qquad+ \frac{ n^{-1/2} \log n + n^{-3/4} h^{-7/8} (\log n)^{3/8} R_n + n^{-2/3} h^{-1/2} (\log n)^{2/3} + n^{1/2} h^{p \wedge \beta}} {\Dl + 1/\sqrt{n h}} \end{align*} % and denote this last quantity by $r_n'$. Then for any $\varepsilon_n \gg r_n'$, we have % \begin{align*} \sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right) \right| &\lesssim_\P \varepsilon_n \sqrt{\log n} + r_n + o(1). \end{align*} \proofparagraph{rate analysis} This rate is $o_\P(1)$ with an appropriate choice of $\varepsilon_n$ whenever $r_n \to 0$ and $r_n' \sqrt{\log n} \to 0$, by Lemma~\ref{lem:kernel_app_slow_convergence}, along with a slowly diverging sequence $R_n$. Explicitly, we require the following. 
% \begin{align*} \frac{n^{-1/2} (\log n)^{3/2}}{\Dl + 1/\sqrt{n h}} &\to 0, &\frac{h^{p \wedge \beta} \log n}{\Dl^2 + (n h)^{-1}} &\to 0, \\ \frac{n^{-1/2} (\log n)^{3/2}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} {\Dl + 1/\sqrt{n h}} &\to 0, \\ \frac{n^{-2/3} h^{-1/2} (\log n)^{7/6}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} {\Dl + 1/\sqrt{n h}} &\to 0, \\ \frac{n^{-1/6}(\log n)^{5/6}} {\Dl^{1/3} + (n h)^{-1/6}} &\to 0. \end{align*} % Using the fact that $h \lesssim n^{-\varepsilon}$ for some $\varepsilon > 0$ and removing trivial statements leaves us with % \begin{align*} \frac{n^{-3/4} h^{-7/8} (\log n)^{7/8}} {\Dl + 1/\sqrt{n h}} &\to 0, &\frac{n^{1/2} h^{p \wedge \beta} (\log n)^{1/2}} {\Dl + 1/\sqrt{n h}} &\to 0. \end{align*} % We analyze these based on the degeneracy and verify that they hold under Assumption~\ref{ass:kernel_rates}. % \begin{enumerate}[label=(\roman*)] \item No degeneracy: if $\Dl > 0$ then we need % \begin{align*} n^{-3/4} h^{-7/8} (\log n)^{7/8} &\to 0, &n^{1/2} h^{p \wedge \beta} (\log n)^{1/2} &\to 0. \end{align*} % These reduce to $n^{-6/7} \log n \ll h \ll (n \log n)^{-\frac{1}{2(p \wedge \beta)}}$. \item Partial or total degeneracy: if $\Dl = 0$ then we need % \begin{align*} n^{-1/4} h^{-3/8} (\log n)^{7/8} &\to 0, &n h^{(p \wedge \beta) + 1/2} (\log n)^{1/2} &\to 0. \end{align*} % These reduce to $n^{-2/3} (\log n)^{7/3} \ll h \ll (n^2 \log n)^{-\frac{1}{2(p \wedge \beta) + 1}}$. % \end{enumerate} \end{proof} \begin{proof}[Theorem~\ref{thm:kernel_ucb}] \proofparagraph{existence of the conditional quantile} We argue as in the proof of Lemma~\ref{lem:kernel_app_distributional_approx_feasible_gaussian}, now also conditioning on the data. In particular, using the anti-concentration result from Lemma~\ref{lem:kernel_app_anticoncentration}, the regularity property of $\hat Z_n^T$, and the Gaussian process maximal inequality from Lemma~\ref{lem:kernel_app_gaussian_process_maximal}, we see that for any $\varepsilon > 0$, % \begin{align*} \sup_{t \in \R} \P\left( \left| \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| - t \right| \leq 2\varepsilon \Bigm\vert \bW_n \right) &\leq 8 \varepsilon \left( 1 + \E\left[ \sup_{w \in \cW} \big| \hat Z_n^T(w) \big| \Bigm\vert \bW_n \right] \right) \lesssim \varepsilon \sqrt{\log n}. \end{align*} % Thus letting $\varepsilon \to 0$ shows that the conditional distribution function of $\sup_{w \in \cW} \big|\hat Z_n^T(w)\big|$ is continuous, and therefore all of its conditional quantiles exist. \proofparagraph{validity of the confidence band} Define the following (conditional) distribution functions. % \begin{align*} F_Z(t \mid \bW_n) &= \P\left( \sup_{w \in \cW} \left| \hat Z_n^T(w) \right| \leq t \Bigm\vert \bW_n \right), &F_T(t) &= \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq t \right), \end{align*} % along with their well-defined right-quantile functions, % \begin{align*} F_Z^{-1}(p \mid \bW_n) &= \sup \big\{ t \in \R \, : \, F_Z(t \mid \bW_n) = p \big\}, &F_T^{-1}(p) &= \sup \big\{ t \in \R \, : \, F_T(t) = p \big\}. \end{align*} % Note that $t \leq F_Z^{-1}(p \mid \bW_n)$ if and only if $F_Z(t \mid \bW_n) \leq p$. Take $\alpha \in (0,1)$ and define the quantile $\hat q_{1-\alpha} = F_Z^{-1}(1-\alpha \mid \bW_n)$, so that $F_Z(\hat q_{1-\alpha} \mid \bW_n) = 1-\alpha$. By Lemma~\ref{lem:kernel_app_feasible_gaussian_approx}, % \begin{align*} \sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| &= o_\P(1). 
\end{align*}
%
Thus by Lemma~\ref{lem:kernel_app_slow_convergence}, this can be replaced by
%
\begin{align*} \P\left( \sup_{t \in \R} \big| F_Z(t \mid \bW_n) - F_T(t) \big| > \varepsilon_n \right) &\leq \varepsilon_n \end{align*}
%
for some $\varepsilon_n \to 0$. Therefore
%
\begin{align*} \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq \hat q_{1-\alpha} \right) &= \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq F_Z^{-1}(1-\alpha \mid \bW_n) \right) \\ &= \P\left( F_Z\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \Bigm\vert \bW_n \right) \leq 1 - \alpha \right) \\ &\leq \P\left( F_T\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \right) \leq 1 - \alpha + \varepsilon_n \right) + \varepsilon_n \leq 1 - \alpha + 3\varepsilon_n, \end{align*}
%
where we used the fact that for any real-valued random variable $X$ with distribution function $F$, we have $\big|\P\big(F(X) \leq t\big) - t\big| \leq \Delta$, where $\Delta$ is the size of the largest jump discontinuity in $F$. By uniform integrability, $\sup_{t \in \R} \big| F_Z(t) - F_T(t) \big| = o(\varepsilon_n)$. Since $F_Z$ has no jumps, we must have $\Delta \leq \varepsilon_n$ for $F_T$. Finally, a lower bound is constructed in an analogous manner, giving
%
\begin{align*} \P\left( \sup_{w \in \cW} \left| \hat T_n(w) \right| \leq \hat q_{1-\alpha} \right) &\geq 1 - \alpha - 3\varepsilon_n. \end{align*}
%
\end{proof}
\begin{proof}[Lemma~\ref{lem:kernel_app_counterfactual_bias}]
Writing $k_{i j} = k_h(W_{i j}^1, w)$, $\psi_i = \psi(X_i^1)$, $\hat\psi_i = \hat\psi(X_i^1)$, and $\kappa_{i j} = \kappa(X_i^0, X_i^1, X_j^1)$,
%
\begin{align*} \E\big[\hat f_W^{1 \triangleright 0}(w)\big] &= \E\left[ \frac{2}{n(n-1)} \sum_{i < j} k_{i j} \hat\psi_i \hat\psi_j \right], \end{align*}
%
and the stated bias bound follows by expanding $\hat\psi_i \hat\psi_j$ around $\psi_i \psi_j$, rewriting the leading term using $\kappa_{i j}$, and applying the argument from the proof of Theorem~\ref{thm:kernel_bias}.
\end{proof}
\subsection{Preliminary lemmas}
We now turn to the proofs of the Yurinskii coupling results, beginning with some auxiliary lemmas. First, we state a conditional version of Strassen's theorem.
\begin{lemma}[A conditional Strassen theorem]%
\label{lem:yurinskii_app_strassen}
Let $X$ be an $\R^d$-valued random variable, let $Z$ be a random variable taking values in a Polish space on the same probability space, and let $\cH'$ be the $\sigma$-algebra generated by $Z$. Let $F(\cdot \mid \cH')$ be a conditional distribution on $\cB(\R^d)$. Then, for any $\rho > 0$, $\eta > 0$ and $p \in [1, \infty]$, with $\E^*$ the outer expectation, if
%
\begin{align*} \E^* \left[ \sup_{A \in \cB(\R^d)} \Big\{ \P \big( X \in A \mid \cH' \big) - F \big( A_p^\eta \mid \cH' \big) \Big\} \right] \leq \rho, \end{align*}
%
where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$ and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$, then there exists an $\R^d$-valued random variable $Y$ with $Y \mid \cH' \sim F(\cdot \mid \cH')$ and $\P \left( \|X-Y\|_p > \eta \right) \leq \rho$.
%
\end{lemma}
\begin{proof}[Lemma~\ref{lem:yurinskii_app_strassen}]
By Theorem~B.2 in \citet{chen2020jackknife}, noting that the $\sigma$-algebra generated by $Z$ is countably generated and using the metric induced by the $\ell^p$-norm.
\end{proof}
Next, we present in Lemma~\ref{lem:yurinskii_app_smooth_approximation} an analytic result concerning the smooth approximation of Borel set indicator functions, similar to that given in \citet[Lemma~39]{belloni2019conditional}.
\begin{lemma}[Smooth approximation of Borel indicator functions]%
\label{lem:yurinskii_app_smooth_approximation}
Let $A \subseteq \R^d$ be a Borel set and $Z \sim \cN(0, I_d)$. For $\sigma, \eta > 0$ and $p \in [1, \infty]$, define
%
\begin{align*} g_{A\eta}(x) &= \left( 1 - \frac{\|x-A^\eta\|_p}{\eta} \right) \vee 0 & &\text{and} &f_{A\eta\sigma}(x) &= \E\big[g_{A\eta}(x + \sigma Z) \big].
\end{align*}
%
Then $f_{A\eta\sigma}$ is infinitely differentiable and with $\varepsilon = \P(\|Z\|_p > \eta / \sigma)$, for all $k \geq 0$, any multi-index $\kappa = (\kappa_1,\dots, \kappa_d)\in\N^d$, and all $x,y \in \R^d$, we have $|\partial^\kappa f_{A\eta\sigma}(x)| \leq \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}$ and
%
\begin{align*} &\Bigg| f_{A\eta\sigma}(x+y) - \sum_{|\kappa| = 0}^k \frac{1}{\kappa!} \partial^\kappa f_{A\eta\sigma}(x) y^\kappa \Bigg| \leq \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}, \\ &(1 - \varepsilon) \I\big\{x \in A\big\} \leq f_{A\eta\sigma}(x) \leq \varepsilon + (1 - \varepsilon) \I\big\{x \in A^{3\eta}\big\}. \end{align*}
%
\end{lemma}
\begin{proof}[Lemma~\ref{lem:yurinskii_app_smooth_approximation}]
Drop subscripts on $g_{A\eta}$ and $f_{A \eta \sigma}$. By Taylor's theorem with Lagrange remainder, for some $t \in [0,1]$,
%
\begin{align*} \Bigg| f(x + y) - \sum_{|\kappa|=0}^{k} \frac{1}{\kappa!} \partial^{\kappa} f(x) y^\kappa \Bigg| \leq \Bigg| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \Bigg|. \end{align*}
%
Now with $\phi(x) = \frac{1}{\sqrt{2 \pi}} e^{-x^2/2}$,
%
\begin{align*} f(x) &= \E\big[g(x + \sigma Z) \big] = \int_{\R^d} g(x + \sigma u) \prod_{j=1}^{d} \phi(u_j) \diff u = \frac{1}{\sigma^d} \int_{\R^d} g(u) \prod_{j=1}^{d} \phi \left( \frac{u_j-x_j}{\sigma} \right) \diff u \end{align*}
%
and since the integrand is bounded, we exchange differentiation and integration to compute
%
\begin{align} \nonumber \partial^\kappa f(x) &= \frac{1}{\sigma^{d+|\kappa|}} \int_{\R^d} g(u) \prod_{j=1}^{d} \partial^{\kappa_j} \phi \left( \frac{u_j-x_j}{\sigma} \right) \diff u = \left( \frac{-1}{\sigma} \right)^{|\kappa|} \int_{\R^d} g(x + \sigma u) \prod_{j=1}^{d} \partial^{\kappa_j} \phi(u_j) \diff u \\ \label{eq:yurinskii_app_smoothing_derivative} &= \left( \frac{-1}{\sigma} \right)^{|\kappa|} \E \Bigg[ g(x + \sigma Z) \prod_{j=1}^{d} \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg], \end{align}
%
where $Z \sim \cN(0, I_d)$. Recalling that $|g(x)| \leq 1$ and applying the Cauchy--Schwarz inequality,
%
\begin{align*} \left| \partial^\kappa f(x) \right| &\leq \frac{1}{\sigma^{|\kappa|}} \prod_{j=1}^{d} \E \left[ \left( \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \right)^2 \right]^{1/2} \leq \frac{1}{\sigma^{|\kappa|}} \prod_{j=1}^{d} \sqrt{\kappa_j!} = \frac{\sqrt{\kappa!}}{\sigma^{|\kappa|}}, \end{align*}
%
as the expected square of the Hermite polynomial of degree $\kappa_j$ against the standard Gaussian measure is $\kappa_j!$. By the reverse triangle inequality, $|g(x + t y) - g(x)| \leq t \|y\|_p / \eta$, so by \eqref{eq:yurinskii_app_smoothing_derivative},
%
\begin{align*} &\left| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \right| \\ &\quad= \left| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \frac{1}{\sigma^{|\kappa|}} \E \Bigg[ \big( g(x + t y + \sigma Z) - g(x + \sigma Z) \big) \prod_{j=1}^{d} \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg] \right| \\ &\quad\leq \frac{t \|y\|_p}{\sigma^k \eta} \, \E \left[ \Bigg| \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \prod_{j=1}^{d} \frac{\partial^{\kappa_j}\phi(Z_j)}{\phi(Z_j)} \Bigg| \right].
\end{align*} % Therefore, by the Cauchy--Schwarz inequality, % \begin{align*} &\Bigg( \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \big( \partial^{\kappa} f(x + t y) - \partial^{\kappa} f(x) \big) \Bigg)^2 \leq \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} \, \E \left[ \Bigg( \sum_{|\kappa|=k} \frac{y^\kappa}{\kappa!} \prod_{j=1}^{d} \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} \Bigg)^2 \right] \\ &\quad= \frac{t^2 \|y\|_p^2}{\sigma^{2k} \eta^2} \sum_{|\kappa|=k} \sum_{|\kappa'|=k} \frac{y^{\kappa + \kappa'}}{\kappa! \kappa'!} \prod_{j=1}^{d} \, \E \left[ \frac{\partial^{\kappa_j} \phi(Z_j)}{\phi(Z_j)} \frac{\partial^{\kappa'_j} \phi(Z_j)}{\phi(Z_j)} \right]. \end{align*} % Orthogonality of Hermite polynomials gives zero if $\kappa_j \neq \kappa'_j$. By the multinomial theorem, % \begin{align*} \left| f(x + y) - \sum_{|\kappa|=0}^{k} \frac{1}{\kappa!} \partial^{\kappa} f(x) y^\kappa \right| &\leq \frac{\|y\|_p}{\sigma^k \eta} \Bigg( \sum_{|\kappa|=k} \frac{y^{2 \kappa}}{\kappa!} \Bigg)^{1/2} \leq \frac{\|y\|_p}{\sigma^k \eta \sqrt{k!}} \Bigg( \sum_{|\kappa|=k} \frac{k!}{\kappa!} y^{2 \kappa} \Bigg)^{1/2} \\ &\leq \frac{\|y\|_p \|y\|_2^k}{\sigma^k \eta \sqrt{k!}}. \end{align*} % For the final result, since $f(x) = \E \left[ g(x + \sigma Z) \right]$ and $\I\big\{x \in A^\eta\big\}\leq g(x)\leq \I\big\{x \in A^{2\eta}\big\}$, % \begin{align*} f(x) &\leq \P \left( x + \sigma Z \in A^{2 \eta} \right) \\ &\leq \P \left( \|Z\|_p > \frac{\eta}{\sigma} \right) + \I \left\{ x \in A^{3 \eta} \right\} \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) = \varepsilon + (1 - \varepsilon) \I \left\{ x \in A^{3 \eta} \right\}, \\ f(x) &\geq \P \left( x + \sigma Z \in A^{\eta} \right) \geq \I \left\{ x \in A \right\} \P \left( \|Z\|_p \leq \frac{\eta}{\sigma} \right) = (1 - \varepsilon) \I \left\{ x \in A \right\}. \end{align*} % \end{proof} We provide a useful Gaussian inequality in Lemma~\ref{lem:yurinskii_app_gaussian_useful} which helps bound the $\beta_{\infty,k}$ moment terms appearing in several places throughout the analysis. \begin{lemma}[A useful Gaussian inequality]% \label{lem:yurinskii_app_gaussian_useful} Let $X \sim \cN(0, \Sigma)$ where $\sigma_j^2 = \Sigma_{j j} \leq \sigma^2$ for all $1 \leq j \leq d$. Then % \begin{align*} \E\left[ \|X\|_2^2 \|X\|_\infty \right] &\leq 4 \sigma \sqrt{\log 2d} \,\sum_{j=1}^d \sigma_j^2 &&\text{and} &\E\left[ \|X\|_2^3 \|X\|_\infty \right] &\leq 8 \sigma \sqrt{\log 2d} \,\bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^{3/2}. \end{align*} % \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_useful}] By Cauchy--Schwarz, with $k \in \{2,3\}$, we have $\E\left[\|X\|_2^{k} \|X\|_\infty \right] \leq \E\big[\|X\|_2^{2k} \big]^{1/2} \E\big[\|X\|_\infty^2 \big]^{1/2}$. For the first term, by H{\"o}lder's inequality and the even moments of the normal distribution, % \begin{align*} \E\big[\|X\|_2^4 \big] &= \E\Bigg[ \bigg( \sum_{j=1}^d X_j^2 \bigg)^2 \Bigg] = \sum_{j=1}^d \sum_{k=1}^d \E\big[ X_j^2 X_k^2 \big] \leq \bigg( \sum_{j=1}^d \E\big[X_j^4 \big]^{\frac{1}{2}} \bigg)^2 = 3 \bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^2, \\ \E\big[\|X\|_2^6 \big] &= \sum_{j=1}^d \sum_{k=1}^d \sum_{l=1}^d \E\big[ X_j^2 X_k^2 X_l^2 \big] \leq \bigg( \sum_{j=1}^d \E\big[X_j^6 \big]^{\frac{1}{3}} \bigg)^3 = 15 \bigg( \sum_{j=1}^d \sigma_j^2 \bigg)^3. 
\end{align*} % For the second term, by Jensen's inequality and the $\chi^2$ moment generating function, % \begin{align*} \E\big[\|X\|_\infty^2 \big] &= \E\left[ \max_{1 \leq j \leq d} X_j^2 \right] \leq 4 \sigma^2 \log \sum_{j=1}^d \E\Big[ e^{X_j^2 / (4\sigma^2)} \Big] \leq 4 \sigma^2 \log \sum_{j=1}^d \sqrt{2} \leq 4 \sigma^2 \log 2 d. \end{align*} % \end{proof} We provide an $\ell^p$-norm tail probability bound for Gaussian variables in Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, motivating the definition of the term $\phi_p(d)$. \begin{lemma}[Gaussian \texorpdfstring{$\ell^p$}{lp}-norm bound]% \label{lem:yurinskii_app_gaussian_pnorm} Let $X \sim \cN(0, \Sigma)$ where $\Sigma \in \R^{d \times d}$ is a positive semi-definite matrix. Then we have that $\E\left[ \|X\|_p \right] \leq \phi_p(d) \max_{1 \leq j \leq d} \sqrt{\Sigma_{j j}}$ with $\phi_p(d) = \sqrt{pd^{2/p} }$ for $p \in [1,\infty)$ and $\phi_\infty(d) = \sqrt{2\log 2d}$. \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}] For $p \in [1, \infty)$, as each $X_j$ is Gaussian, we have $\big(\E\big[|X_j|^p\big]\big)^{1/p} \leq \sqrt{p\, \E[X_j^2]} = \sqrt{p \Sigma_{j j}}$. So % \begin{align*} \E\big[\|X\|_p\big] &\leq \Bigg(\sum_{j=1}^d \E \big[ |X_j|^p \big] \Bigg)^{1/p} \leq \Bigg(\sum_{j=1}^d p^{p/2} \Sigma_{j j}^{p/2} \Bigg)^{1/p} \leq \sqrt{p d^{2/p}} \max_{1\leq j\leq d} \sqrt{\Sigma_{j j}} \end{align*} % by Jensen's inequality. For $p=\infty$, with $\sigma^2 = \max_j \Sigma_{j j}$, for $t>0$, % \begin{align*} \E\big[\|X\|_\infty \big] &\leq t \log \sum_{j=1}^d \E\Big[ e^{|X_j| / t} \Big] \leq t \log \sum_{j=1}^d \E\Big[ 2 e^{X_j / t} \Big] \leq t \log \Big(2 d e^{\sigma^2/(2t^2)}\Big) \leq t \log 2 d + \frac{\sigma^2}{2t}, \end{align*} % again by Jensen's inequality. Setting $t = \frac{\sigma}{\sqrt{2 \log 2d}}$ gives $\E\big[\|X\|_\infty \big] \leq \sigma \sqrt{2 \log 2d}$. % \end{proof} We give a Gaussian--Gaussian $\ell^p$-norm approximation as Lemma~\ref{lem:yurinskii_app_feasible_gaussian}, useful for ensuring approximations remain valid upon substituting an estimator for the true variance matrix. \begin{lemma}[Gaussian--Gaussian approximation in \texorpdfstring{$\ell^p$}{lp}-norm]% \label{lem:yurinskii_app_feasible_gaussian} Let $\Sigma_1, \Sigma_2 \in \R^{d \times d}$ be positive semi-definite and take $Z \sim \cN(0, I_d)$. For $p \in [1, \infty]$ we have % \begin{align*} \P\left( \left\| \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z \right\|_p > t \right) &\leq 2 d \exp \left( \frac{-t^2} {2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} \right). \end{align*} \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_feasible_gaussian}] Let $\Sigma \in \R^{d \times d}$ be positive semi-definite and write $\sigma^2_j = \Sigma_{j j} $. For $p \in [1, \infty)$ by a union bound and Gaussian tail probabilities, % \begin{align*} \P\left(\big\| \Sigma^{1/2} Z \big\|_p > t \right) &= \P\Bigg( \sum_{j=1}^d \left| \left( \Sigma^{1/2} Z \right)_j \right|^p > t^p \Bigg) \leq \sum_{j=1}^d \P\Bigg( \left| \left( \Sigma^{1/2} Z \right)_j \right|^p > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} \Bigg) \\ &= \sum_{j=1}^d \P\Bigg( \left| \sigma_j Z_j \right|^p > \frac{t^p \sigma_j^p}{\|\sigma\|_p^p} \Bigg) = \sum_{j=1}^d \P\left( \left| Z_j \right| > \frac{t}{\|\sigma\|_p} \right) \leq 2 d \, \exp\left( \frac{-t^2}{2 \|\sigma\|_p^2} \right). 
\end{align*} % The same result holds for $p = \infty$ since % \begin{align*} \P\left(\big\| \Sigma^{1/2} Z \big\|_\infty > t \right) &= \P\left( \max_{1 \leq j \leq d} \left| \left( \Sigma^{1/2} Z \right)_j \right| > t \right) \leq \sum_{j=1}^d \P\left( \left| \left( \Sigma^{1/2} Z \right)_j \right| > t \right) \\ &= \sum_{j=1}^d \P\left( \left| \sigma_j Z_j \right| > t \right) \leq 2 \sum_{j=1}^d \exp\left( \frac{-t^2}{2 \sigma_j^2} \right) \leq 2 d \exp\left( \frac{-t^2}{2 \|\sigma\|_\infty^2} \right). \end{align*} % Now we apply this to the matrix $\Sigma = \big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2$. For $p \in [1, \infty)$, % \begin{align*} \|\sigma\|_p^p &= \sum_{j=1}^d (\Sigma_{j j})^{p/2} = \sum_{j=1}^d \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} \leq d \max_{1 \leq j \leq d} \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{p/2} \\ &\leq d \, \Big\|\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big\|_2^{p/2} = d \, \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^p. \end{align*} % Similarly, for $p = \infty$ we have % \begin{align*} \|\sigma\|_\infty &= \max_{1 \leq j \leq d} (\Sigma_{j j})^{1/2} = \max_{1 \leq j \leq d} \Big(\big(\Sigma_1^{1/2} - \Sigma_2^{1/2}\big)^2\Big)_{j j}^{1/2} \leq \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2. \end{align*} % Thus for all $p \in [1, \infty]$ we have $\|\sigma\|_p \leq d^{1/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2$, with $d^{1/\infty} = 1$. Hence % \begin{align*} \P\left( \left\| \left(\Sigma_1^{1/2} - \Sigma_2^{1/2}\right) Z \right\|_p > t \right) &\leq 2 d \exp \left( \frac{-t^2}{2 \|\sigma\|_p^2} \right) \leq 2 d \exp \left( \frac{-t^2} {2 d^{2/p} \big\|\Sigma_1^{1/2} - \Sigma_2^{1/2}\big\|_2^2} \right). \end{align*} % \end{proof} We give a variance bound and an exponential inequality for $\alpha$-mixing variables. \begin{lemma}[Variance bounds for \texorpdfstring{$\alpha$}{alpha}-mixing random variables] \label{lem:yurinskii_app_variance_mixing} Let $X_1, \ldots, X_n$ be real-valued $\alpha$-mixing random variables with mixing coefficients $\alpha(j)$. Then % \begin{enumerate}[label=(\roman*)] \item \label{it:yurinskii_app_variance_mixing_bounded} If for constants $M_i$ we have $|X_i| \leq M_i$ a.s.\ then % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] &\leq 4 \sum_{j=1}^\infty \alpha(j) \sum_{i=1}^n M_i^2. \end{align*} \item \label{it:yurinskii_app_variance_mixing_exponential} If $\alpha(j) \leq e^{-2j / C_\alpha}$ then for any $r>2$ there is a constant $C_r$ depending only on $r$ with % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] &\leq C_r C_\alpha \sum_{i=1}^n \E\big[|X_i|^r\big]^{2/r}. \end{align*} \end{enumerate} % \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_variance_mixing}] Define $\alpha^{-1}(t) = \inf\{j \in \N : \alpha(j) \leq t\}$ and $Q_i(t) = \inf\{s \in \R : \P(|X_i| > s) \leq t\}$. By Corollary~1.1 in \citet{rio2017asymptotic} and H{\"o}lder's inequality for $r > 2$, % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] &\leq 4 \sum_{i=1}^n \int_0^1 \alpha^{-1}(t) Q_i(t)^2 \diff{t} \\ &\leq 4 \sum_{i=1}^n \left( \int_0^1 \alpha^{-1}(t)^{\frac{r}{r-2}} \diff{t} \right)^{\frac{r-2}{r}} \left( \int_0^1 |Q_i(t)|^r \diff{t} \right)^{\frac{2}{r}}. \end{align*} % Now note that if $U \sim \Unif[0,1]$ then $Q_i(U)$ has the same distribution as $|X_i|$. Therefore % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] &\leq 4 \left( \int_0^1 \alpha^{-1}(t)^{\frac r{r-2}} \diff{t} \right)^{\frac{r-2}r} \sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r}.
\end{align*} % If $\alpha(j) \leq e^{-2j/C_\alpha}$ then $\alpha^{-1}(t) \leq \frac{-C_\alpha \log t}{2}$ so, for some constant $C_r$ depending only on $r$, % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] \leq 2 C_\alpha \left( \int_0^1 (-\log t)^{\frac r{r-2}} \diff{t} \right)^{\frac{r-2} r} \sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r} \leq C_r C_\alpha \sum_{i=1}^n \E[|X_i|^r]^{\frac 2 r}. \end{align*} % Alternatively, if for constants $M_i$ we have $|X_i| \leq M_i$ a.s.\ then % \begin{align*} \Var\left[ \sum_{i=1}^n X_i \right] &\leq 4 \int_0^1 \alpha^{-1}(t) \diff{t} \sum_{i=1}^n M_i^2 \leq 4 \sum_{j=1}^\infty \alpha(j) \sum_{i=1}^n M_i^2. \end{align*} % \end{proof} \begin{lemma}[Exponential concentration inequalities for \texorpdfstring{$\alpha$}{alpha}-mixing random variables] \label{lem:yurinskii_app_exponential_mixing} Let $X_1, \ldots, X_n$ be zero-mean real-valued variables with $\alpha$-mixing coefficients $\alpha(j) \leq e^{-2 j / C_\alpha}$. \begin{enumerate}[label=(\roman*)] \item \label{it:yurinskii_app_exponential_mixing_bounded} Suppose $|X_i| \leq M$ a.s.\ for $1 \leq i \leq n$. Then for all $t > 0$ there is a constant $C_1$ with % \begin{align*} \P\left( \left| \sum_{i=1}^n X_i \right| > C_1 M \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right) &\leq C_1 e^{-t}. \end{align*} % \item \label{it:yurinskii_app_exponential_mixing_bernstein} If further $\sum_{j=1}^n |\Cov[X_i, X_j]| \leq \sigma^2$, then for all $t > 0$ there is a constant $C_2$ with % \begin{align*} \P\left( \left| \sum_{i=1}^n X_i \right| \geq C_2 \big( (\sigma \sqrt n + M) \sqrt t + M (\log n)^2 t \big) \right) &\leq C_2 e^{-t}. \end{align*} \end{enumerate} \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_exponential_mixing}] \begin{enumerate}[label=(\roman*)] \item By Theorem~1 in \citet{merlevede2009bernstein}, % \begin{align*} \P\left( \left| \sum_{i=1}^n X_i \right| > t \right) &\leq \exp\left( -\frac{C_1 t^2}{n M^2 + Mt (\log n)(\log\log n)} \right). \end{align*} % Replace $t$ by $M \sqrt{n t} + M (\log n)(\log \log n) t$. \item By Theorem~2 in \citet{merlevede2009bernstein}, % \begin{align*} \P\left( \left| \sum_{i=1}^n X_i \right| > t \right) &\leq \exp\left( -\frac{C_2 t^2}{n\sigma^2 + M^2 + Mt (\log n)^2} \right). \end{align*} % Replace $t$ by $\sigma \sqrt n \sqrt t + M \sqrt t + M (\log n)^2 t$. \end{enumerate} % \end{proof} \subsection{Main results} To establish Theorem~\ref{thm:yurinskii_sa_dependent}, we first give the analogous result for martingales as Lemma~\ref{lem:yurinskii_app_sa_martingale}. Our approach is similar to that used in modern versions of Yurinskii's coupling for independent data, as in Theorem~1 in \citet{lecam1988} and Theorem~10 in Chapter~10 of \citet{pollard2002user}. The proof of Lemma~\ref{lem:yurinskii_app_sa_martingale} relies on constructing a ``modified'' martingale, which is close to the original martingale, but which has an $\cH_0$-measurable terminal quadratic variation. \begin{lemma}[Strong approximation for vector-valued martingales]% \label{lem:yurinskii_app_sa_martingale} Let $X_1, \ldots, X_n$ be $\R^d$-valued square-integrable random vectors adapted to a countably generated filtration $\cH_0, \ldots, \cH_n$. Suppose that $\E[X_i \mid \cH_{i-1}] = 0$ for all $1 \leq i \leq n$ and define $S = \sum_{i=1}^n X_i$. Let $V_i = \Var[X_i \mid \cH_{i-1}]$ and $\Omega = \sum_{i=1}^n V_i - \Sigma$ where $\Sigma$ is a positive semi-definite $\cH_0$-measurable $d \times d$ random matrix. 
For each $\eta > 0$ and $p \in [1,\infty]$ there is $T \mid \cH_0 \sim \cN(0, \Sigma)$ with % \begin{align*} \P\big(\|S-T\|_p > 5\eta\big) &\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big) + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\ &\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\}, \end{align*} % where the second infimum is over all positive semi-definite $d \times d$ non-random matrices, and % \begin{align*} \beta_{p,k} &= \sum_{i=1}^n \E\left[\| X_i \|^k_2 \| X_i \|_p + \|V_i^{1/2} Z_i \|^k_2 \|V_i^{1/2} Z_i \|_p \right], \qquad\gamma(M) = \P\big(\Omega \npreceq M\big), \\ \delta_p(M,\eta) &= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right), \qquad\pi_3 = \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right] \big| \Big], \\ \varepsilon_p(M, \eta) &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right), \end{align*} % for $k \in \{2,3\}$, with $Z, Z_1,\dots ,Z_n$ i.i.d.\ standard Gaussian on $\R^d$ independent of $\cH_n$. \end{lemma} \begin{proof}[Lemma~\ref{lem:yurinskii_app_sa_martingale}] \proofparagraph{constructing a modified martingale} Take $M \succeq 0$ a fixed positive semi-definite $d \times d$ matrix. We start by constructing a new martingale based on $S$ whose quadratic variation is $\Sigma + M$. Take $m \geq 1$ and define % \begin{align*} H_k &= \Sigma + M - \sum_{i=1}^{k} V_i, \qquad\qquad\qquad\qquad\tau = \sup \big\{ k\in\{0,1,\dots,n\} : H_k \succeq 0 \big\}, \\ \tilde X_i &= X_i\I\{i \leq \tau\} + \frac{1}{\sqrt{m}} H_\tau^{1/2} Z_i\I\{n+1 \leq i \leq n+m\}, \qquad\qquad\tilde S = \sum_{i=1}^{n+m} \tilde X_i, \end{align*} % where $Z_{n+1}, \ldots, Z_{n+m}$ is an i.i.d.\ sequence of standard Gaussian vectors in $\R^d$ independent of $\cH_n$, noting that $H_0 = \Sigma + M \succeq 0$ a.s. Define the filtration $\tilde \cH_0, \ldots, \tilde \cH_{n+m}$, where $\tilde \cH_i = \cH_i$ for $0 \leq i \leq n$ and is the $\sigma$-algebra generated by $\cH_n$ and $Z_{n+1}, \dots, Z_{i}$ for $n+1 \leq i\leq n+m$. Observe that $\tau$ is a stopping time with respect to $\tilde\cH_i$ because $H_{i+1} - H_i = -V_{i+1} \preceq 0$ almost surely, so $\{\tau \leq i\} = \{H_{i+1} \nsucceq 0\}$ for $0\leq i < n$, and this event is $\tilde\cH_i$-measurable since $V_1, \ldots, V_{i+1}$ are $\cH_i$-measurable. Write $\tilde V_i = \Var\big[\tilde X_i \mid \tilde\cH_{i-1}\big]$, so that $\tilde V_i = V_i \I\{i \leq \tau\}$ for $1 \leq i \leq n$ and $\tilde V_i = \frac{1}{m} H_\tau$ for $n+1 \leq i \leq n+m$. Since $H_n = M - \Omega$, we have $\{\Omega \preceq M\} = \{H_n \succeq 0\} = \{\tau = n\}$, and on this event $S - \tilde S = -\frac{1}{\sqrt{m}} H_n^{1/2} \sum_{i=n+1}^{n+m} Z_i$, where $\frac{1}{\sqrt{m}} \sum_{i=n+1}^{n+m} Z_i$ is standard Gaussian on $\R^d$ and independent of $\cH_n$. Hence $\P\big( \| S - \tilde S \|_p > \eta \big) \leq \P\big( \| H_n^{1/2} Z \|_p > \eta,\, \Omega \preceq M) + \P\big( \Omega \npreceq M \big)$, so % \begin{align} \label{eq:yurinskii_app_approx_modified_original} \P\big( \| S - \tilde S \|_p > \eta\big) &\leq 2 \P\big(\Omega \npreceq M \big) + \P\big( \| (M-\Omega)^{1/2}Z \|_p > \eta,\, \Omega \preceq M \big) = 2 \gamma(M) + \varepsilon_p(M, \eta). \end{align} \proofparagraph{strong approximation of the modified martingale} Let $\tilde Z_1, \ldots, \tilde Z_{n+m}$ be i.i.d.\ $\cN(0, I_d)$ and independent of $\tilde \cH_{n+m}$. Define $\check X_i = \tilde V_i^{1/2} \tilde Z_i$ and $\check S = \sum_{i=1}^{n+m} \check X_i$. Fix a Borel set $A \subseteq \R^d$ and $\sigma, \eta > 0$ and let $f = f_{A\eta\sigma}$ be the function defined in Lemma~\ref{lem:yurinskii_app_smooth_approximation}. By the Lindeberg method, write the telescoping sum % \begin{align*} \E\Big[f\big(\tilde S\big) - f\big(\check S\big) \mid \cH_0 \Big] &= \sum_{i=1}^{n+m} \E\Big[ f\big(Y_i + \tilde X_i\big) - f\big(Y_i + \check X_i\big) \mid \cH_0 \Big] \end{align*} % where $Y_i = \sum_{j=1}^{i-1} \tilde X_j + \sum_{j=i+1}^{n+m} \check X_j$.
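To spell out the telescoping identity, note that consecutive summands match: by the definition of $Y_i$, for $2 \leq i \leq n+m$, % \begin{align*} Y_{i-1} + \tilde X_{i-1} &= \sum_{j=1}^{i-1} \tilde X_j + \sum_{j=i}^{n+m} \check X_j = Y_i + \check X_i, \end{align*} % while $Y_{n+m} + \tilde X_{n+m} = \tilde S$ and $Y_1 + \check X_1 = \check S$, so the differences $f(Y_i + \tilde X_i) - f(Y_i + \check X_i)$ do indeed sum to $f(\tilde S) - f(\check S)$.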
By Lemma~\ref{lem:yurinskii_app_smooth_approximation} we have for $k \geq 0$ % \begin{align*} &\Bigg| \E\big[ f(Y_i + \tilde X_i) - f(Y_i + \check X_i) \mid \cH_0 \big] - \sum_{|\kappa| = 0}^k \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \Bigg| \\ &\quad\leq \frac{1}{\sigma^k \eta \sqrt{k!}} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^k + \|\check X_i\|_p \|\check X_i\|_2^k \bigm| \cH_0 \right]. \end{align*} % With $k \in \{2, 3\}$, we bound each summand. With $|\kappa| = 0$ we have $\tilde X_i^\kappa = \check X_i^\kappa$, so consider $|\kappa| = 1$. Noting that $\sum_{i=1}^{n+m} \tilde V_i = \Sigma + M$, define % \begin{align*} \tilde Y_i &= \sum_{j=1}^{i-1} \tilde X_j + \Bigg(\sum_{j=i+1}^{n+m} \tilde V_j\Bigg)^{1/2} \tilde Z_i = \sum_{j=1}^{i-1} \tilde X_j + \Bigg(\Sigma + M - \sum_{j=1}^{i} \tilde V_j\Bigg)^{1/2} \tilde Z_i \end{align*} % and let $\check \cH_i$ be the $\sigma$-algebra generated by $\tilde \cH_{i-1}$ and $\tilde Z_i$. Note that $\tilde Y_i$ is $\check \cH_i$-measurable and that $Y_i$ and $\tilde Y_i$ have the same distribution conditional on $\tilde \cH_{n+m}$. So % \begin{align*} &\sum_{|\kappa| = 1} \frac{1}{\kappa!} \E\left[ \partial^\kappa f(Y_i) \big( \tilde X_i^\kappa - \check X_i^\kappa \big) \bigm| \cH_0 \right] = \E \left[ \nabla f(Y_i)^\T \big( \tilde X_i - \tilde V_i^{1/2} \tilde Z_i \big) \bigm| \cH_0 \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \tilde X_i \bigm| \cH_0 \right] - \E \left[ \nabla f(Y_i)^\T \tilde V_i^{1/2} \tilde Z_i \bigm| \cH_0 \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \E \left[ \tilde X_i \mid \check \cH_i \right] \bigm| \cH_0 \right] - \E \left[ \nabla f(Y_i)^\T \tilde V_i^{1/2} \bigm| \cH_0 \right] \E \left[ \tilde Z_i \right] \\ &\quad= \E \left[ \nabla f(\tilde Y_i)^\T \E \left[ \tilde X_i \mid \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] - 0 = 0. \end{align*} % Next, if $|\kappa| = 2$ then % \begin{align*} &\sum_{|\kappa| = 2} \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \\ &\quad= \frac{1}{2} \E \left[ \tilde X_i^\T \nabla^2 f(Y_i) \tilde X_i - \tilde Z_i^\T \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} \tilde Z_i \bigm| \cH_0 \right] \\ &\quad= \frac{1}{2} \E \left[ \E \left[ \Tr \nabla^2 f(\tilde Y_i) \tilde X_i \tilde X_i^\T \bigm| \check \cH_i \right] \bigm| \cH_0 \right] - \frac{1}{2} \E \left[ \Tr \Big( \tilde V_i^{1/2} \nabla^2 f(Y_i) \tilde V_i^{1/2} \, \E \big[ \tilde Z_i \tilde Z_i^\T \big] \Big) \bigm| \cH_0 \right] \\ &\quad= \frac{1}{2} \E \left[ \Tr \nabla^2 f(Y_i) \E \left[ \tilde X_i \tilde X_i^\T \bigm| \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] - \frac{1}{2} \E \left[ \Tr \nabla^2 f(Y_i) \tilde V_i \bigm| \cH_0 \right] = 0.
\end{align*} % Finally, if $|\kappa| = 3$, then since $\check X_i \sim \cN(0, \tilde V_i)$ conditional on $\tilde \cH_{n+m}$, we have by symmetry of the Gaussian distribution and Lemma~\ref{lem:yurinskii_app_smooth_approximation}, % \begin{align*} & \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \left( \tilde X_i^\kappa - \check X_i^\kappa \right) \bigm| \cH_0 \right] \right| \\ &\quad= \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!} \left( \E \left[ \partial^\kappa f(\tilde Y_i) \E \left[ \tilde X_i^\kappa \mid \check \cH_i \right] \bigm| \cH_0 \right] - \E \left[ \partial^\kappa f(Y_i) \, \E \left[ \check X_i^\kappa \bigm| \tilde \cH_{n+m} \right] \bigm| \cH_0 \right] \right) \right| \\ &\quad= \left| \sum_{|\kappa| = 3} \frac{1}{\kappa!} \E \left[ \partial^\kappa f(Y_i) \, \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \bigm| \cH_0 \right] \right| \leq \frac{1}{\sigma^3} \sum_{|\kappa| = 3} \E \left[ \left| \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \right| \bigm| \cH_0 \right]. \end{align*} % Combining these and summing over $i$ with $k=2$ shows % \begin{align*} \E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right] &\leq \frac{1}{\sigma^2 \eta \sqrt{2}} \sum_{i=1}^{n+m} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^2 + \|\check X_i\|_p \|\check X_i\|_2^2 \bigm| \cH_0 \right]. \end{align*} % On the other hand, taking $k = 3$ gives % \begin{align*} \E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right] &\leq \frac{1}{\sigma^3 \eta \sqrt{6}} \sum_{i=1}^{n+m} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^3 + \|\check X_i\|_p \|\check X_i\|_2^3 \bigm| \cH_0 \right] \\ &\quad+ \frac{1}{\sigma^3} \sum_{i=1}^{n+m} \sum_{|\kappa| = 3} \E \left[ \left| \E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] \right| \bigm| \cH_0 \right]. \end{align*} % For $1 \leq i \leq n$ we have $\|\tilde X_i\| \leq \|X_i\|$ and $\|\check X_i\| \leq \|V_i^{1/2} \tilde Z_i\|$. For $n+1 \leq i \leq n+m$ we have $\tilde X_i = H_\tau^{1/2} Z_i / \sqrt m$ and $\check X_i = H_\tau^{1/2} \tilde Z_i / \sqrt m$ which are equal in distribution given $\cH_0$. So with % \begin{align*} \tilde \beta_{p,k} &= \sum_{i=1}^{n} \E \left[ \|X_i\|_p \|X_i\|_2^k + \|V_i^{1/2} Z_i\|_p \|V_i^{1/2} Z_i\|_2^k \bigm| \cH_0 \right], \end{align*} % we have, since $k \in \{2,3\}$, % \begin{align*} &\sum_{i=1}^{n+m} \E \left[ \|\tilde X_i\|_p \|\tilde X_i\|_2^k + \|\check X_i\|_p \|\check X_i\|_2^k \bigm| \cH_0 \right] \leq \tilde\beta_{p,k} + \frac{2}{\sqrt m} \E \left[ \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k \bigm| \cH_0 \right]. \end{align*} % Since $H_i$ is weakly decreasing under the semi-definite partial order, we have $H_\tau \preceq H_0 = \Sigma + M$ implying that $|(H_\tau)_{j j}| \leq \|\Sigma + M\|_{\max}$ and $\E\big[|(H_\tau^{1/2} Z)_j|^3 \mid \cH_0 \big] \leq \sqrt{8/\pi}\, \|\Sigma + M\|_{\max}^{3/2}$. Hence as $p \geq 1$ and $k \in \{2,3\}$, % \begin{align*} \E\left[ \|H_\tau^{1/2}Z\|_p \|H_\tau^{1/2}Z\|_2^k \bigm| \cH_0 \right] &\leq \E\left[\|H_\tau^{1/2} Z\|_1^{k+1} \bigm| \cH_0 \right] \leq d^{k+1} \max_{1\leq j\leq d} \E\left[|(H_\tau^{1/2} Z)_j|^{k+1} \bigm| \cH_0 \right] \\ &\leq 3 d^4 \, \|\Sigma + M\|_{\max}^{(k+1)/2} \leq 6 d^4 \, \|\Sigma \|_{\max}^{(k+1)/2} + 6 d^4 \, \|M\|_{\max}^{(k+1)/2}.
\end{align*} % Assuming some $X_i$ is not identically zero so the result is non-trivial, and supposing that $\Sigma$ is bounded a.s.\ (replacing $\Sigma$ by $\Sigma \cdot \I\{\|\Sigma\|_{\max} \leq C\}$ for an appropriately large $C$ if necessary), take $m$ large enough that % \begin{align} \label{eq:yurinskii_app_bound_extra_terms} \frac{2}{\sqrt m} \E \left[ \|H_\tau^{1/2} Z\|_p \|H_\tau^{1/2} Z\|_2^k \bigm| \cH_0 \right] \leq \frac{1}{4} \beta_{p,k}. \end{align} % Further, if $|\kappa| = 3$ then $\big|\E \big[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \big]\big| \leq \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right]\big|$ for $1 \leq i \leq n$ while by symmetry of the Gaussian distribution $\E \left[ \tilde X_i^\kappa \mid \tilde \cH_{i-1} \right] = 0$ for $n+1 \leq i \leq n+m$. Hence with % \begin{align*} \tilde \pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| \E \left[ X_i^\kappa \mid \cH_{i-1} \right] \big| \mid \cH_0 \Big], \end{align*} % we have % \begin{align*} \E\left[ f\big(\tilde S\big) - f\big(\check S\big) \bigm| \cH_0 \right] &\leq \min \left\{ \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} + \frac{\beta_{p,2}}{4 \sigma^2 \eta}, \frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} + \frac{\beta_{p,3}}{4 \sigma^3 \eta} + \frac{\tilde \pi_3}{\sigma^3} \right\}. \end{align*} % Along with Lemma~\ref{lem:yurinskii_app_smooth_approximation}, and with $\sigma = \eta / t$ and $\varepsilon = \P(\|Z\|_p > t)$, we conclude that % \begin{align*} &\P(\tilde S \in A \mid \cH_0) = \E\big[\I\{\tilde S \in A\} - f(\tilde S) \mid \cH_0 \big] + \E\big[f(\tilde S) - f\big(\check S\big) \mid \cH_0 \big] + \E \big[f\big(\check S\big) \mid \cH_0 \big] \\ &\,\leq \varepsilon\P(\tilde S \in A \mid \cH_0) + \min \! \left\{ \frac{3 \tilde \beta_{p,2}}{4 \sigma^2 \eta} + \frac{\beta_{p,2}}{4 \sigma^2 \eta}, \frac{3 \tilde \beta_{p,3}}{4 \sigma^3 \eta} + \frac{\beta_{p,3}}{4 \sigma^3 \eta} + \frac{\tilde \pi_3}{\sigma^3} \right\} + \varepsilon + (1 - \varepsilon) \P\big(\check S \in A_p^{3\eta} \mid \cH_0 \big) \\ &\,\leq \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big) + 2 \P(\|Z\|_p > t) + \min\!\left\{ \frac{3 \tilde \beta_{p,2} t^2}{4 \eta^3} + \frac{\beta_{p,2} t^2}{4 \eta^3}, \frac{3 \tilde \beta_{p,3} t^3}{4 \eta^4} + \frac{\beta_{p,3} t^3}{4 \eta^4} + \frac{\tilde \pi_3 t^3}{\eta^3} \right\}. \end{align*} % Taking a supremum and an outer expectation yields, with $\beta_{p,k} = \E\big[\tilde \beta_{p,k}\big]$ and $\pi_3 = \E[\tilde \pi_3]$, % \begin{align*} &\E^* \left[ \sup_{A \in \cB(\R^d)} \left\{ \P(\tilde S \in A \mid \cH_0) - \P\big( \check S \in A_p^{3\eta} \mid \cH_0 \big) \right\} \right] \\ &\quad\leq 2 \P(\|Z\|_p > t) + \min \left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\}. \end{align*} % Finally, since $\check S = \sum_{i=1}^{n+m} \tilde V_i^{1/2} \tilde Z_i \sim \cN(0,\Sigma + M)$ conditional on $\cH_0$, the conditional Strassen theorem in Lemma~\ref{lem:yurinskii_app_strassen} ensures the existence of $\tilde S$ and $\tilde T \mid \cH_0 \sim \cN(0, \Sigma + M)$ such that % \begin{align} \label{eq:yurinskii_app_approx_modified_martingale} \P\left(\|\tilde S-\tilde T\|_p>3\eta\right) &\leq \inf_{t>0} \left\{ 2 \P(\|Z\|_p > t) + \min \left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\}, \end{align} % since the infimum is attained by continuity of $\|Z\|_p$.
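As an aside, the infimum over $t$ in \eqref{eq:yurinskii_app_approx_modified_martingale} becomes explicit once $\P(\|Z\|_p > t)$ is replaced by the Markov bound $\phi_p(d)/t$ from Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}: minimizing, for instance, $t \mapsto 2 \phi_p(d)/t + \beta_{p,2} t^2 / \eta^3$ over $t > 0$ by calculus gives % \begin{align*} t = \left( \frac{\phi_p(d) \eta^3}{\beta_{p,2}} \right)^{1/3} \qquad\text{and}\qquad \frac{2 \phi_p(d)}{t} + \frac{\beta_{p,2} t^2}{\eta^3} = 3 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}. \end{align*} % This is how the explicit rates in the proof of Proposition~\ref{pro:yurinskii_sa_simplified} below arise, where a slightly different choice of $t$ is used to simplify the constants.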
\proofparagraph{conclusion} We show how to write $\tilde T = (\Sigma + M)^{1/2} W$ where $W \sim \cN(0,I_d)$ and use this representation to construct $T \mid \cH_0 \sim \cN(0, \Sigma)$. By the spectral theorem, let $\Sigma + M = U \Lambda U^\T$ where $U$ is a $d \times d$ orthogonal random matrix and $\Lambda$ is a diagonal $d \times d$ random matrix with diagonal entries satisfying $\lambda_1 \geq \cdots \geq \lambda_r > 0$ and $\lambda_{r+1} = \cdots = \lambda_d = 0$ where $r = \rank (\Sigma + M)$. Let $\Lambda^+$ be the Moore--Penrose pseudo-inverse of $\Lambda$ (obtained by inverting its non-zero elements) and define $W = U (\Lambda^+)^{1/2} U^\T \tilde T + U \tilde W$, where the first $r$ elements of $\tilde W$ are zero and the last $d-r$ elements are i.i.d.\ $\cN(0,1)$ independent from $\tilde T$. Then, it is easy to check that $W \sim \cN(0, I_d)$ and that $\tilde T = (\Sigma + M)^{1/2} W$. Now define $T = \Sigma^{1/2} W$ so % \begin{equation}% \label{eq:yurinskii_app_approx_target} \P\big(\|T - \tilde T\|_p > \eta\big) = \P\big(\big\|\big((\Sigma + M)^{1/2} - \Sigma^{1/2} \big) W \big\|_p>\eta \big) = \delta_p(M, \eta). \end{equation} % Finally, \eqref{eq:yurinskii_app_approx_modified_original}, \eqref{eq:yurinskii_app_approx_modified_martingale}, and \eqref{eq:yurinskii_app_approx_target}, together with the triangle inequality and a union bound, conclude the proof: taking an infimum over $M \succeq 0$, and possibly reducing the constant $1/4$ in \eqref{eq:yurinskii_app_bound_extra_terms} to account for this infimum being potentially unattained, we obtain % \begin{align*} \P\big(\|S-T\|_p > 5\eta\big) &\leq \P\big(\|\tilde S - \tilde T \|_p > 3\eta \big) +\P\big(\|S - \tilde S \|_p > \eta\big) +\P\big(\|T - \tilde T \|_p > \eta\big) \\ &\leq \inf_{t>0} \left\{ 2 \P\big( \|Z\|_p > t \big) + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\ &\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\}. \end{align*} % \end{proof} Lemma~\ref{lem:yurinskii_app_sa_martingale} and the martingale approximation immediately yield Theorem~\ref{thm:yurinskii_sa_dependent}. \begin{proof}[Theorem~\ref{thm:yurinskii_sa_dependent}] Apply Lemma~\ref{lem:yurinskii_app_sa_martingale} to the martingale $\sum_{i=1}^{n} \tilde X_i$, noting that $S - \sum_{i=1}^{n} \tilde X_i = U$. \end{proof} Bounding the quantities in Theorem~\ref{thm:yurinskii_sa_dependent} gives a user-friendly version as Proposition~\ref{pro:yurinskii_sa_simplified}. \begin{proof}[Proposition~\ref{pro:yurinskii_sa_simplified}] Set $M = \nu^2 I_d$ and bound the terms appearing in the main inequality of Theorem~\ref{thm:yurinskii_sa_dependent}. \proofparagraph{bounding $\P( \|Z\|_p > t )$} By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, we have $\P( \|Z\|_p > t ) \leq \E[\|Z\|_p] / t \leq \phi_p(d) / t$. \proofparagraph{bounding $\gamma(M)$} With $M = \nu^2 I_d$, by Markov's inequality, $\gamma(M) = \P\big(\Omega \npreceq M\big) \leq \P\big(\|\Omega\|_2 > \nu^2 \big) \leq \nu^{-2} \E[\|\Omega\|_2]$. \proofparagraph{bounding $\delta_p(M,\eta)$} By Markov's inequality and Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, using $\max_j |M_{j j}| \leq \|M\|_2$ for $M \succeq 0$, % \begin{align*} \delta_{p}(M,\eta) &= \P\left( \big\|\big((\Sigma +M)^{1/2}- \Sigma^{1/2}\big) Z\big\|_p \geq \eta \right) \leq \frac{\phi_p(d)} {\eta} \E \left[ \big\|(\Sigma +M)^{1/2}- \Sigma^{1/2}\big\|_2 \right].
\end{align*} % Since $M = \nu^2 I_d$ commutes with $\Sigma$, the matrices $(\Sigma + M)^{1/2}$ and $\Sigma^{1/2}$ are simultaneously diagonalizable, so % \begin{align*} \|(\Sigma +M)^{1/2}- \Sigma^{1/2}\|_2 &= \max_{1 \leq j \leq d} \left| \sqrt{\lambda_j(\Sigma) + \nu^2} - \sqrt{\lambda_j(\Sigma)} \right| \leq \nu \end{align*} % and hence $\delta_{p}(M,\eta) \leq \phi_p(d)\nu / \eta$. \proofparagraph{bounding $\varepsilon_p(M, \eta)$} Note that $(M -\Omega)^{1/2}Z$ is a centered Gaussian conditional on $\cH_n$, on the event $\{\Omega \preceq M\}$. We thus have by Markov's inequality, Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, and Jensen's inequality that % \begin{align*} \varepsilon_p(M, \eta) &= \P\left(\big\| (M - \Omega)^{1/2} Z \big\|_p\geq \eta, \ \Omega \preceq M\right) \leq \frac{1}{\eta} \E\left[ \I\{\Omega \preceq M\} \E\left[ \big\| (M - \Omega)^{1/2} Z \big\|_p \mid \cH_n \right] \right] \\ &\leq \frac{\phi_p(d)}{\eta} \E\left[ \I\{\Omega \preceq M\} \max_{1 \leq j \leq d} \sqrt{(M - \Omega)_{j j}} \right] \leq \frac{\phi_p(d)}{\eta} \E\left[ \sqrt{\|M - \Omega\|_2} \right] \\ &\leq \frac{\phi_p(d)}{\eta} \E\left[ \sqrt{\|\Omega\|_2} + \nu \right] \leq \frac{\phi_p(d)}{\eta} \left(\sqrt{\E[\|\Omega\|_2]} + \nu \right). \end{align*} % Thus by Theorem~\ref{thm:yurinskii_sa_dependent} and the previous parts, % \begin{align*} \P\big(\|S-T\|_p > 6\eta\big) &\leq \inf_{t>0} \left\{ 2 \P\big(\|Z\|_p>t\big) + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\ &\quad+ \inf_{M \succeq 0} \big\{ 2\gamma(M) + \delta_p(M,\eta) + \varepsilon_p(M, \eta)\big\} +\P\big(\|U\|_p>\eta\big) \\ &\leq \inf_{t>0} \left\{ \frac{2 \phi_p(d)}{t} + \min\left\{ \frac{\beta_{p,2} t^2}{\eta^3}, \frac{\beta_{p,3} t^3}{\eta^4} + \frac{\pi_3 t^3}{\eta^3} \right\} \right\} \\ &\quad+ \inf_{\nu > 0} \left\{ \frac{2\E \left[ \|\Omega\|_2 \right]}{\nu^2} + \frac{2 \phi_p(d) \nu}{\eta} \right\} + \frac{\phi_p(d) \sqrt{\E \left[ \|\Omega\|_2 \right]}}{\eta} +\P\big(\|U\|_p>\eta\big). \end{align*} % Set $t = 2^{1/3} \phi_p(d)^{1/3} \beta_{p,2}^{-1/3} \eta$ and $\nu = \E[\|\Omega\|_2]^{1/3} \phi_p(d)^{-1/3} \eta^{1/3}$, then replace $\eta$ with $\eta / 6$ to see % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} +\P\left(\|U\|_p>\frac{\eta}{6}\right). \end{align*} % Whenever $\pi_3 = 0$ we can set $t = 2^{1/4} \phi_p(d)^{1/4} \beta_{p,3}^{-1/4} \eta$, and with $\nu$ as above we obtain % \begin{align*} \P\big(\|S-T\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,3} \phi_p(d)^3}{\eta^4} \right)^{1/4} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3} +\P\left(\|U\|_p>\frac{\eta}{6}\right). \end{align*} % \end{proof} After establishing Proposition~\ref{pro:yurinskii_sa_simplified}, Corollaries~\ref{cor:yurinskii_sa_mixingale}, \ref{cor:yurinskii_sa_martingale}, and \ref{cor:yurinskii_sa_indep} follow easily. \begin{proof}[Corollary~\ref{cor:yurinskii_sa_mixingale}] By Proposition~\ref{pro:yurinskii_sa_simplified} with $\P ( \|U\|_p > \frac{\eta}{6} ) \leq \frac{6}{\eta} \sum_{i=1}^{n} c_i (\zeta_{i} + \zeta_{n-i+1})$. \end{proof} \begin{proof}[Corollary~\ref{cor:yurinskii_sa_martingale}] By Proposition~\ref{pro:yurinskii_sa_simplified} with $U=0$ a.s. \end{proof} \begin{proof}[Corollary~\ref{cor:yurinskii_sa_indep}] By Corollary~\ref{cor:yurinskii_sa_martingale} with $\Omega=0$ a.s.
\end{proof} We conclude this section with a discussion expanding on the comments made in Remark~\ref{rem:yurinskii_coupling_bounds_probability} on deriving bounds in probability from Yurinskii's coupling. Consider for illustration the independent data second-order result given in Corollary~\ref{cor:yurinskii_sa_indep}: for each $\eta > 0$, there exists $T_n \mid \cH_0 \sim \cN(0, \Sigma)$ satisfying % \begin{align*} \P\big(\|S_n-T_n\|_p > \eta\big) &\leq 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3}, \end{align*} % where here we make explicit the dependence on the sample size $n$ for clarity. The naive approach to converting this into a probability bound for $\|S_n-T_n\|_p$ is to select $\eta$ to ensure the right-hand side is of order $1$, arguing that the probability can then be made arbitrarily small by taking, in this case, $\eta$ to be a large enough multiple of $\beta_{p,2}^{1/3} \phi_p(d)^{2/3}$. However, the somewhat subtle mistake is in neglecting the fact that the realization of the coupling variable $T_n$ will in general depend on $\eta$, rendering the resulting bound invalid. As an explicit example of this phenomenon, take $\eta > 1$ and suppose $\|S_n - T_n(\eta)\| = \eta$ with probability $1 - 1/\eta$ and $\|S_n - T_n(\eta)\| = n$ with probability $1/\eta$. Then $\P\big(\|S_n - T_n(\eta)\| > \eta\big) = 1/\eta$ but it is not true for any $\eta$ that $\|S_n - T_n(\eta)\| \lesssim_\P 1$. We propose in Remark~\ref{rem:yurinskii_coupling_bounds_probability} the following fix. Instead of selecting $\eta$ to ensure the right-hand side is of order $1$, we instead choose it so the bound converges (slowly) to zero. This is easily achieved by taking the naive and incorrect bound and multiplying by some divergent sequence $R_n$. The resulting inequality reads, in the case of Corollary~\ref{cor:yurinskii_sa_indep} with $\eta = \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n$, % \begin{align*} \P\Big(\|S_n-T_n\|_p > \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n \Big) &\leq \frac{24}{R_n} \to 0. \end{align*} % We thus recover, for the price of a rate which is slower by an arbitrarily small amount, a valid upper bound in probability, as we can immediately conclude that % \begin{align*} \|S_n-T_n\|_p \lesssim_\P \beta_{p,2}^{1/3} \phi_p(d)^{2/3} R_n. \end{align*} \subsection{Strong approximation for martingale empirical processes} We begin by presenting some calculations omitted from the main text relating to the motivating example of kernel density estimation with i.i.d.\ data. First, the bias is bounded as % \begin{align*} \big| \E \big[ \hat g(x) \big] - g(x) \big| &= \left| \int_{\frac{-x}{h}}^{\frac{1-x}{h}} K(\xi) \diff \xi - 1 \right| \leq 2 \int_{\frac{a}{h}}^\infty \frac{1}{\sqrt{2 \pi}} e^{-\frac{\xi^2}{2}} \diff \xi \leq \frac{h}{a} \sqrt{\frac{2}{\pi}} e^{-\frac{a^2}{2 h^2}}. \end{align*} % Next, we do the calculations necessary to apply Corollary~\ref{cor:yurinskii_sa_indep}. Define $k_{i j} = \frac{1}{n h} K \left( \frac{X_i - x_j}{h} \right)$ and $k_i = (k_{i j} : 1 \leq j \leq N)$. Then $\|k_i\|_\infty \leq \frac{1}{n h \sqrt{2 \pi}}$ a.s.\ and $\E[\|k_i\|_2^2] \leq \frac{N}{n^2 h} \int_{-\infty}^\infty K(\xi)^2 \diff \xi \leq \frac{N}{2 n^2 h \sqrt{\pi}}$. 
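For concreteness, the kernel constants used in these bounds follow from elementary Gaussian integrals: with the Gaussian kernel $K(\xi) = \frac{1}{\sqrt{2\pi}} e^{-\xi^2/2}$ of this example, % \begin{align*} \sup_\xi K(\xi) = \frac{1}{\sqrt{2\pi}} \qquad\text{and}\qquad \int_{-\infty}^\infty K(\xi)^2 \diff \xi = \frac{1}{2\pi} \int_{-\infty}^\infty e^{-\xi^2} \diff \xi = \frac{1}{2\sqrt{\pi}}, \end{align*} % giving $\|k_i\|_\infty \leq \frac{1}{n h \sqrt{2\pi}}$ and the displayed bound on $\E[\|k_i\|_2^2]$.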
Let $V = \Var[k_i] \in \R^{N \times N}$, so assuming that $1/h \geq \log 2 N$, by Lemma~\ref{lem:yurinskii_app_gaussian_useful} we bound % \begin{align*} \beta_{\infty,2} &= n \E\left[\| k_i \|^2_2 \| k_i \|_\infty \right] + n \E \left[ \|V^{1/2} Z \|^2_2 \|V^{1/2} Z \|_\infty \right] \leq \frac{N}{\sqrt{8} n^2 h^2 \pi} + \frac{4 N \sqrt{\log 2 N}}{\sqrt{8} n^2 h^{3/2} \pi^{3/4}} \leq \frac{N}{n^2 h^2}. \end{align*} % Finally, we verify the stochastic continuity bounds. By the Lipschitz property of $K$, it is easy to show that for $x,x' \in \cX$ we have $\left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) - \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right| \lesssim \frac{|x-x'|}{h^2}$ almost surely, and also that $\E \Big[ \left|\frac{1}{h} K \left( \frac{X_i - x}{h} \right) - \frac{1}{h} K \left( \frac{X_i - x'}{h} \right)\right|^2 \Big] \lesssim \frac{|x-x'|^2}{h^3}$. By chaining with the Bernstein--Orlicz norm and polynomial covering numbers, % \begin{align*} \sup_{|x-x'| \leq \delta} \big\|S(x) - S(x')\big\|_\infty \lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}} \end{align*} % whenever $\log(N/h) \lesssim \log n$ and $n h \gtrsim \log n$. By a Gaussian process maximal inequality \citep[Corollary~2.2.8]{van1996weak} the same bound holds for $T(x)$ with % \begin{align*} \sup_{|x-x'| \leq \delta} \big\|T(x) - T(x')\big\|_\infty \lesssim_\P \delta \sqrt{\frac{\log n}{n h^3}}. \end{align*} \begin{proof}[Lemma~\ref{lem:yurinskii_kde_eigenvalue}] For $x, x' \in [a, 1-a]$, the scaled covariance function of this nonparametric estimator is % \begin{align*} n h\, \Cov\big[\hat g(x), \hat g(x')\big] &= \frac{1}{h} \E \left[ K \left( \frac{X_i - x}{h} \right) K \left( \frac{X_i - x'}{h} \right) \right] \\ &\quad- \frac{1}{h} \E \left[ K \left( \frac{X_i - x}{h} \right) \right] \E \left[ K \left( \frac{X_i - x'}{h} \right) \right] \\ &= \frac{1}{2 \pi} \int_{\frac{-x}{h}}^{\frac{1-x}{h}} \exp \left( - \frac{t^2}{2} \right) \exp \left( - \frac{1}{2} \left( t + \frac{x - x'}{h} \right)^2 \right) \diff t - h I(x) I(x') \end{align*} % where $I(x) = \frac{1}{\sqrt{2 \pi}} \int_{-x/h}^{(1-x)/h} e^{-t^2/2} \diff t$. Completing the square and a substitution gives % \begin{align*} n h\, \Cov\big[\hat g(x), \hat g(x')\big] &= \frac{1}{2 \pi} \exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} \exp \left(-t^2\right) \diff t - h I(x) I(x'). \end{align*} % Now we show that since $x, x'$ are not too close to the boundary of $[0,1]$, the limits in the above integral can be replaced by $\pm \infty$. Note that $\frac{-x-x'}{2h} \leq \frac{-a}{h}$ and $\frac{2-x-x'}{2h} \geq \frac{a}{h}$ so % \begin{align*} \int_{-\infty}^{\infty} \exp \left(-t^2\right) \diff t - \int_{\frac{-x-x'}{2h}}^{\frac{2-x-x'}{2h}} \exp \left(-t^2\right) \diff t \leq 2 \int_{a/h}^\infty \exp \left(-t^2\right) \diff t \leq \frac{h}{a} \exp \left(- \frac{a^2}{h^2}\right). \end{align*} % Therefore, since $\int_{-\infty}^{\infty} e^{-t^2} \diff t = \sqrt \pi$, % \begin{align*} \left| n h\, \Cov\big[\hat g(x), \hat g(x')\big] - \frac{1}{2 \sqrt \pi} \exp \left( - \frac{1}{4} \left( \frac{x-x'}{h} \right)^2 \right) + h I(x) I(x') \right| \leq \frac{h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right). \end{align*} % Define the $N \times N$ matrix $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} \exp \left( - \frac{1}{4} \left( \frac{x_i-x_j}{h} \right)^2 \right)$.
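Before proceeding, recall the Rayleigh quotient characterization of the smallest eigenvalue, % \begin{align*} \lambda_{\min}(\tilde\Sigma) &= \min_{b \in \R^N,\, b \neq 0} \frac{b^\T \tilde\Sigma b}{b^\T b} = \frac{1}{2 \sqrt \pi} \min_{b \in \R^N,\, b \neq 0} \frac{\sum_{i=1}^N \sum_{j=1}^N b_i b_j \, e^{-\lambda (i-j)^2}}{\sum_{i=1}^N b_i^2}, \end{align*} % writing $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} e^{-\lambda(i-j)^2}$ as identified below; this is what links $\lambda_{\min}(\tilde\Sigma)$ to the Toeplitz quadratic forms analyzed next.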
By \citet[Proposition~2.4, Proposition~2.5, and Equation~2.10]{baxter1994norm}, with $\cB_k = \big\{b \in \R^\Z : \sum_{i \in \Z} \I\{b_i \neq 0\} \leq k \big\}$, % \begin{align*} \inf_{k \in \N} \inf_{b \in \R^k} \frac{\sum_{i=1}^k \sum_{j=1}^k b_i b_j \, e^{-\lambda(i-j)^2}} {\sum_{i=1}^k b_i^2} = \sqrt{\frac{\pi}{\lambda}} \sum_{i=-\infty}^{\infty} \exp \left( - \frac{(\pi e + 2 \pi i)^2}{4 \lambda} \right). \end{align*} % We bound this sum by comparison with integrals, noting that the summand $e^{-(\pi e + 2 \pi x)^2 / 4 \lambda}$ is increasing in $x$ on $(-\infty, -e/2]$ and decreasing on $[-e/2, \infty)$, where $-e/2 \approx -1.359$: the sum over $\Z \cap (-\infty, -3]$ is bounded by the integral over $(-\infty, -2]$, the terms at $i \in \{-2, -1\}$ are kept as they are, and the sum over $\Z \cap [0, \infty)$ is bounded by the integral over $[-1, \infty)$. % \begin{align*} \sum_{i \in \Z} e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} &\leq \int_{-\infty}^{-2} e^{ - (\pi e + 2 \pi x)^2/4 \lambda} \diff x + e^{- (\pi e - 4 \pi)^2/4 \lambda} \\ &\quad+ e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} + \int_{-1}^{\infty} e^{ -(\pi e + 2 \pi x)^2 / 4 \lambda} \diff x. \end{align*} % Now use the substitution $t = \frac{\pi e + 2 \pi x}{2 \sqrt \lambda}$ and suppose $\lambda < 1$, yielding % \begin{align*} \sum_{i \in \Z} e^{-(\pi e + 2 \pi i)^2 / 4 \lambda} &\leq \frac{\sqrt \lambda}{\pi} \int_{-\infty}^{\frac{\pi e - 4 \pi}{2 \sqrt \lambda}} e^{-t^2} \diff t + e^{- (\pi e - 4 \pi)^2/4 \lambda} + e^{ - (\pi e - 2 \pi)^2 / 4 \lambda} + \frac{\sqrt \lambda}{\pi} \int_{\frac{\pi e - 2 \pi}{2 \sqrt \lambda}}^{\infty} e^{-t^2} \diff t \\ &\leq \left( 1 + \frac{1}{\pi} \frac{\lambda}{4 \pi - \pi e} \right) e^{-(\pi e - 4 \pi)^2 / 4 \lambda} + \left( 1 + \frac{1}{\pi} \frac{\lambda}{\pi e - 2 \pi} \right) e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \\ &\leq \frac{13}{12} e^{-(\pi e - 4 \pi)^2 / 4 \lambda} + \frac{8}{7} e^{- (\pi e - 2 \pi)^2 / 4 \lambda} \leq \frac{9}{4} \exp \left( - \frac{5}{4 \lambda} \right). \end{align*} % Therefore % \begin{align*} \inf_{k \in \N} \inf_{b \in \cB_k} \frac{\sum_{i \in \Z} \sum_{j \in \Z} b_i b_j \, e^{-\lambda(i-j)^2}} {\sum_{i \in \Z} b_i^2} < \frac{4}{\sqrt \lambda} \exp \left( - \frac{5}{4 \lambda} \right) < 4 e^{-1/\lambda}. \end{align*} % From this and since $\tilde\Sigma_{i j} = \frac{1}{2 \sqrt \pi} e^{-\lambda(i-j)^2}$ with $\lambda = \frac{1}{4(N-1)^2 h^2} \leq \frac{\delta^2}{h^2}$, for each $h$ and some $\delta \leq h$, we have $\lambda_{\min}(\tilde\Sigma) \leq 2 e^{-h^2/\delta^2}$. Recall that % \begin{align*} \left| \Sigma_{i j} - \tilde\Sigma_{i j} + h I(x_i) I(x_j) \right| \leq \frac{h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right). \end{align*} % For any positive semi-definite $N \times N$ matrices $A$ and $B$ and vector $v$ we have $\lambda_{\min}(A - v v^\T) \leq \lambda_{\min}(A)$ and $\lambda_{\min}(B) \leq \lambda_{\min}(A) + \|B-A\|_2 \leq \lambda_{\min}(A) + N \|B-A\|_{\max}$. Hence with $I_i = I(x_i)$, % \begin{align*} \lambda_{\min}(\Sigma) &\leq \lambda_{\min}(\tilde\Sigma - h I I^\T) + \frac{N h}{2 \pi a} \exp \left(- \frac{a^2}{h^2}\right) \leq 2 e^{-h^2/\delta^2} + \frac{h}{\pi a \delta} e^{-a^2 / h^2}. \end{align*} \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_emp_proc}] Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$. Using a union bound, we can write % \begin{align*} &\P\left(\sup_{f \in \cF} \big| S(f) - T(f) \big| \geq 2t + \eta \right) \leq \P\left(\sup_{f \in \cF_\delta} \big| S(f) - T(f) \big| \geq \eta \right) \\ &\qquad\qquad+ \P\left(\sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \geq t \right) + \P\left(\sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \geq t \right).
\end{align*} \proofparagraph{bounding the difference on $\cF_\delta$} We apply Corollary~\ref{cor:yurinskii_sa_martingale} with $p = \infty$ to the martingale difference sequence $\cF_\delta(X_i) = \big(f(X_i) : f \in \cF_\delta\big)$ which takes values in $\R^{|\cF_\delta|}$. Square integrability can be assumed, since otherwise $\beta_\delta = \infty$ and there is nothing to prove. Note $\sum_{i=1}^n \cF_\delta(X_i) = S(\cF_\delta)$ and $\phi_\infty(|\cF_\delta|) = \sqrt{2 \log 2 |\cF_\delta|}$. Therefore there exists a conditionally Gaussian vector $T(\cF_\delta)$ with the same covariance structure as $S(\cF_\delta)$ conditional on $\cH_0$ satisfying % \begin{align*} \P\left( \sup_{f \in \cF_\delta} \big| S(f) - T(f) \big| \geq \eta \right) &\leq \frac{24\beta_\delta^{\frac{1}{3}} (2\log 2 |\cF_\delta|)^{\frac{1}{3}}}{\eta} + 17\left(\frac{\sqrt{2 \log 2 |\cF_\delta|} \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{\frac{2}{3}}. \end{align*} \proofparagraph{bounding the fluctuations in $S(f)$} Since $\big\| S(f) - S(f') \big\|_\psi \leq L d(f,f')$, by Theorem~2.2.4 in \citet{van1996weak} % \begin{align*} \left\| \sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \right\|_\psi &\leq C_\psi L \left( \int_0^\delta \psi^{-1}(N_\varepsilon) \diff{\varepsilon} + \delta \psi^{-1}(N_\delta^2) \right) = C_\psi L J_\psi(\delta). \end{align*} % Then, by Markov's inequality and the definition of the Orlicz norm, % \begin{align*} \P\left( \sup_{d(f,f') \leq \delta} \big| S(f) - S(f') \big| \geq t \right) &\leq \psi\left(\frac{t}{C_\psi L J_\psi(\delta)} \right)^{-1}. \end{align*} \proofparagraph{bounding the fluctuations in $T(f)$} By the Vorob'ev--Berkes--Philipp theorem \citep{dudley1999uniform}, $T(\cF_\delta)$ extends to a conditionally Gaussian process $T(f)$. Firstly, since $\bigvvvert T(f) - T(f') \bigvvvert_2 \leq L d(f,f')$ conditionally on $\cH_0$, and $T(f)$ is a conditional Gaussian process, we have $\big\| T(f) - T(f') \big\|_{\psi_2} \leq 2 L d(f,f')$ conditional on $\cH_0$ by \citet[Chapter~2.2, Complement~1]{van1996weak}, where $\psi_2(x) = \exp(x^2) - 1$. Thus, by Theorem~2.2.4 in \citet{van1996weak} applied conditionally on $\cH_0$, % \begin{align*} \left\| \sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \right\|_{\psi_2} &\leq C_1 L \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon} = C_1 L J_2(\delta) \end{align*} % for some universal constant $C_1 > 0$, where we used $\psi_2^{-1}(x) = \sqrt{\log(1+x)}$ and monotonicity of covering numbers. Then by Markov's inequality and the definition of the Orlicz norm, % \begin{align*} \P\left( \sup_{d(f,f') \leq \delta} \big| T(f) - T(f') \big| \geq t \right) &\leq \left( \exp\left( \frac{t^2}{C_1^2 L^2 J_2(\delta)^2} \right) - 1 \right)^{-1} \!\wedge 1 \leq 2 \exp\left( \frac{-t^2}{C_1^2 L^2 J_2(\delta)^2} \right). \end{align*} % \proofparagraph{conclusion} The result follows by scaling $t$ and $\eta$ and enlarging constants if necessary. % \end{proof} \subsection{Applications to nonparametric regression} \begin{proof}[Proposition~\ref{pro:yurinskii_series}] Proceed according to the decomposition in Section~\ref{sec:yurinskii_series}. By stationarity and Lemma~SA-2.1 in \citet{cattaneo2020large}, we have $\sup_w \|p(w)\|_1 \lesssim 1$ and also $\|H\|_1 \lesssim n/k$ and $\|H^{-1}\|_1 \lesssim k/n$. \proofparagraph{bounding $\beta_{\infty,2}$ and $\beta_{\infty,3}$} Set $X_i = p(W_i) \varepsilon_i$ so $S = \sum_{i=1}^n X_i$, and set $\sigma^2_i = \sigma^2(W_i)$ and $V_i = \Var[X_i \mid \cH_{i-1}] = \sigma_i^2 p(W_i) p(W_i)^\T$.
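Note that this formula for $V_i$ implicitly uses that $W_i$ is $\cH_{i-1}$-measurable in this setup, so that % \begin{align*} \E[X_i \mid \cH_{i-1}] = p(W_i) \, \E[\varepsilon_i \mid \cH_{i-1}] = 0 \qquad\text{and}\qquad \Var[X_i \mid \cH_{i-1}] = p(W_i) p(W_i)^\T \, \E[\varepsilon_i^2 \mid \cH_{i-1}], \end{align*} % and in particular $S$ is a martingale with respect to $\cH_0, \ldots, \cH_n$, as required by Corollary~\ref{cor:yurinskii_sa_martingale}.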
Recall from Corollary~\ref{cor:yurinskii_sa_martingale} that for $r \in \{2,3\}$, % \begin{align*} \beta_{\infty,r} = \sum_{i=1}^n \E\left[\| X_i \|^r_2 \| X_i \|_\infty + \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] \end{align*} % with $Z_i \sim \cN(0, I_k)$ i.i.d.\ and independent of $V_i$. For the first term, we use $\sup_w \|p(w)\|_2 \lesssim 1$ and the bounded moments of $\varepsilon_i$: % \begin{align*} \E\left[ \| X_i \|^r_2 \| X_i \|_\infty \right] &\leq \E\left[ |\varepsilon_i|^{r+1} \| p(W_i) \|^{r+1}_2 \right] \lesssim 1. \end{align*} % For the second term, apply Lemma~\ref{lem:yurinskii_app_gaussian_useful} conditionally on $\cH_n$ with $\sup_w \|p(w)\|_2 \lesssim 1$ to see % \begin{align*} &\E\left[ \|V_i^{1/2} Z_i \|^r_2 \|V_i^{1/2} Z_i \|_\infty \right] \lesssim \sqrt{\log 2k} \ \E\left[ \max_{1 \leq j \leq k} (V_i)_{j j}^{1/2} \bigg( \sum_{j=1}^k (V_i)_{j j} \bigg)^{r/2} \right] \\ &\quad\lesssim \sqrt{\log 2k} \ \E\left[ \sigma_i^{r+1} \max_{1 \leq j \leq k} |p(W_i)_j| \bigg( \sum_{j=1}^k p(W_i)_{j}^2 \bigg)^{r/2} \right] \lesssim \sqrt{\log 2k} \ \E\left[ \sigma_i^{r+1} \right] \lesssim \sqrt{\log 2k}. \end{align*} % Putting these together yields % $\beta_{\infty,2} \lesssim n \sqrt{\log 2k}$ and $\beta_{\infty,3} \lesssim n \sqrt{\log 2k}$. \proofparagraph{bounding $\Omega$} Set $\Omega = \sum_{i=1}^n \big(V_i - \E[V_i] \big)$ so % \begin{align*} \Omega &= \sum_{i=1}^n \big(\sigma_i^2 p(W_i)p(W_i)^\T - \E\left[ \sigma_i^2 p(W_i)p(W_i)^\T \right]\big). \end{align*} % Observe that $\Omega_{j l}$ is the sum of a zero-mean strictly stationary $\alpha$-mixing sequence and so $\E[\Omega_{j l}^2] \lesssim n$ by Lemma~\ref{lem:yurinskii_app_variance_mixing}% \ref{it:yurinskii_app_variance_mixing_bounded}. Since the basis functions satisfy Assumption~3 in \citet{cattaneo2020large}, $\Omega$ has a bounded number of non-zero entries in each row, so by Jensen's inequality % \begin{align*} \E\left[ \|\Omega\|_2 \right] &\leq \E\left[ \|\Omega\|_\rF \right] \leq \left( \sum_{j=1}^k \sum_{l=1}^k \E\left[ \Omega_{j l}^2 \right] \right)^{1/2} \lesssim \sqrt{n k}. \end{align*} % \proofparagraph{strong approximation} By Corollary~\ref{cor:yurinskii_sa_martingale} and the previous parts, with any sequence $R_n \to \infty$, % \begin{align*} \|S - T \|_\infty &\lesssim_\P \beta_{\infty,2}^{1/3} (\log 2k)^{1/3} R_n + \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n \\ &\lesssim_\P n^{1/3} \sqrt{\log 2k} R_n + (n k)^{1/4} \sqrt{\log 2k} R_n. \end{align*} % If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then the third-order version of Corollary~\ref{cor:yurinskii_sa_martingale} applies since % \begin{align*} \pi_3 &= \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| \E [ X_i^\kappa \mid \cH_{i-1} ] \big| \Big] = \sum_{i=1}^{n} \sum_{|\kappa| = 3} \E \Big[ \big| p(W_i)^\kappa \, \E [ \varepsilon_i^3 \mid \cH_{i-1} ] \big| \Big] = 0, \end{align*} % giving % \begin{align*} \|S - T \|_\infty &\lesssim_\P \beta_{\infty,3}^{1/4} (\log 2k)^{3/8} R_n + \sqrt{\log 2k} \sqrt{\E[\|\Omega\|_2]} R_n \lesssim_\P (n k)^{1/4} \sqrt{\log 2k} R_n. \end{align*} % By H{\"o}lder's inequality and with $\|H^{-1}\|_1 \lesssim k/n$ we have % \begin{align*} \sup_{w \in \cW} \left| p(w)^\T H^{-1} S - p(w)^\T H^{-1} T \right| &\leq \sup_{w \in \cW} \|p(w)\|_1 \|H^{-1}\|_1 \| S - T \|_\infty \lesssim n^{-1} k \| S - T \|_\infty. \end{align*} \proofparagraph{convergence of $\hat H$} We have $\hat H - H = \sum_{i=1}^n \big(p(W_i)p(W_i)^\T - \E\left[ p(W_i)p(W_i)^\T \right]\big)$.
Observe that $(\hat H - H)_{j l}$ is the sum of a zero-mean strictly stationary $\alpha$-mixing sequence and so $\E[(\hat H - H)_{j l}^2] \lesssim n$ by Lemma~\ref{lem:yurinskii_app_variance_mixing}% \ref{it:yurinskii_app_variance_mixing_bounded}. Since the basis functions satisfy Assumption~3 in \citet{cattaneo2020large}, $\hat H-H$ has a bounded number of non-zero entries in each row and so by Jensen's inequality % \begin{align*} \E\left[ \|\hat H-H\|_1 \right] &= \E\left[ \max_{1 \leq i \leq k} \sum_{j=1}^k \big|(\hat H-H)_{i j}\big| \right] \leq \E\left[ \sum_{1 \leq i \leq k} \Bigg( \sum_{j=1}^k |(\hat H-H)_{i j}| \Bigg)^2 \right]^{\frac{1}{2}} \lesssim \sqrt{n k}. \end{align*} \proofparagraph{bounding the matrix term} Note $\|\hat H^{-1}\|_1 \leq \|H^{-1}\|_1 + \|\hat H^{-1}\|_1 \|\hat H-H\|_1 \|H^{-1}\|_1$ so by the previous part, we deduce % \begin{align*} \|\hat H^{-1}\|_1 \leq \frac{\|H^{-1}\|_1} {1 - \|\hat H-H\|_1 \|H^{-1}\|_1} \lesssim_\P \frac{k/n} {1 - \sqrt{n k}\, k/n} \lesssim_\P \frac{k}{n} \end{align*} % as $k^3 / n \to 0$. Note that by the martingale structure, since $p(W_i)$ is bounded and supported on a region with volume at most of the order $1/k$, and as $W_i$ has a Lebesgue density, % \begin{align*} \Var[T_j] &= \Var[S_j] = \Var\left[ \sum_{i=1}^n \varepsilon_i p(W_i)_j \right] = \sum_{i=1}^n \E\left[ \sigma_i^2 p(W_i)_j^2 \right] \lesssim \frac{n}{k}. \end{align*} % So by the Gaussian maximal inequality in Lemma~\ref{lem:yurinskii_app_gaussian_pnorm}, $\|T\|_\infty \lesssim_\P \sqrt{\frac{n \log 2k}{k}}$. Since $k^3/n \to 0$, % \begin{align*} \sup_{w \in \cW} \left| p(w)^\T (\hat H^{-1} - H^{-1}) S \right| &\leq \sup_{w \in \cW} \|p(w)^\T\|_1 \|\hat H^{-1}\|_1 \|\hat H - H\|_1 \|H^{-1}\|_1 \|S - T\|_\infty \\ &\quad+ \sup_{w \in \cW} \|p(w)^\T\|_1 \|\hat H^{-1}\|_1 \|\hat H - H\|_1 \|H^{-1}\|_1 \|T\|_\infty \\ &\lesssim_\P \frac{k^2}{n^2} \sqrt{n k} \!\left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) \!+ \frac{k^2}{n^2} \sqrt{n k} \sqrt{\frac{n \log 2k}{k}} \\ &\lesssim_\P \frac{k^2}{n} \sqrt{\log 2k}. \end{align*} % \proofparagraph{conclusion of the main result} By the previous parts, with $G(w) = p(w)^\T H^{-1} T$, % \begin{align*} &\sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - p(w)^\T H^{-1} T \right| \\ &\quad= \sup_{w \in \cW} \left| p(w)^\T H^{-1} (S - T) + p(w)^\T (\hat H^{-1} - H^{-1}) S + \Bias(w) \right| \\ &\quad\lesssim_\P \frac{k}{n} \|S - T\|_\infty + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P \frac{k}{n} \left( n^{1/3} \sqrt{\log 2k} + (n k)^{1/4} \sqrt{\log 2k} \right) R_n + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P n^{-2/3} k \sqrt{\log 2k} R_n + n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\quad\lesssim_\P n^{-2/3} k \sqrt{\log 2k} R_n + \sup_{w \in \cW} |\Bias(w)| \end{align*} % since $k^3/n \to 0$. If further $\E \left[ \varepsilon_i^3 \mid \cH_{i-1} \right] = 0$ then % \begin{align*} \sup_{w \in \cW} \left| \hat\mu(w) - \mu(w) - p(w)^\T H^{-1} T \right| &\lesssim_\P \frac{k}{n} \|S - T\|_\infty + \frac{k^2}{n} \sqrt{\log 2k} + \sup_{w \in \cW} |\Bias(w)| \\ &\lesssim_\P n^{-3/4} k^{5/4} \sqrt{\log 2k} R_n + \sup_{w \in \cW} |\Bias(w)|. \end{align*} % Finally, we verify the variance bounds for the Gaussian process. 
With $\sigma^2(w)$ bounded above, % \begin{align*} \Var[G(w)] &= p(w)^\T H^{-1} \Var\left[ \sum_{i=1}^n p(W_i) \varepsilon_i \right] H^{-1} p(w) \\ &= p(w)^\T H^{-1} \E\left[\sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) \right] H^{-1} p(w) \\ &\lesssim \|p(w)\|_2^2 \|H^{-1}\|_2^2 \|H\|_2 \lesssim k/n. \end{align*} % Similarly, since $\sigma^2(w)$ is bounded away from zero, % \begin{align*} \Var[G(w)] &\gtrsim \|p(w)\|_2^2 \|H^{-1}\|_2^2 \|H^{-1}\|_2^{-1} \gtrsim k/n. \end{align*} \proofparagraph{bounding the bias} We delegate the task of carefully deriving bounds on the bias to \citet{cattaneo2020large}, who provide a high-level assumption on the approximation error in Assumption~4 and then use it to derive bias bounds in Section~3 of the form $\sup_{w \in \cW} |\Bias(w)| \lesssim_\P k^{-\gamma}$. This assumption is then verified for B-splines, wavelets, and piecewise polynomials in their supplemental appendix. \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_series_feasible}] \proofparagraph{infeasible supremum approximation} Provided that the bias is negligible, for all $s > 0$ we have % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| \\ &\quad\leq \sup_{t \in \R} \P\left( t \leq \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t + s \right) + \P\left( \sup_{w \in \cW} \left| \frac{\hat\mu(w)-\mu(w)-G(w)}{\sqrt{\rho(w,w)}} \right| > s \right). \end{align*} % By the Gaussian anti-concentration result given as Corollary~2.1 in \citet{chernozhukov2014anti} applied to a discretization of $\cW$, the first term is at most $s \sqrt{\log n}$ up to a constant factor, and the second term converges to zero whenever $\frac{1}{s} \left( \frac{k^3 (\log k)^3}{n} \right)^{1/6} \to 0$. Thus a suitable value of $s$ exists whenever $\frac{k^3(\log n)^6}{n} \to 0$. \proofparagraph{feasible supremum approximation} By \citet[Lemma~3.1]{chernozhukov2013gaussian} and discretization, with $\rho(w,w') = \E[\hat\rho(w,w')]$, % \begin{align*} &\sup_{t \in \R} \left| \P\left( \sup_{w \in \cW} \left| \frac{\hat G(w)}{\sqrt{\hat\rho(w,w)}} \right| \leq t \biggm| \bW, \bY \right) - \P\left( \sup_{w \in \cW} \left| \frac{G(w)}{\sqrt{\rho(w,w)}} \right| \leq t \right) \right| \\ &\quad\lesssim_\P \sup_{w,w' \in \cW} \left| \frac{\hat\rho(w,w')} {\sqrt{\hat\rho(w,w)\hat\rho(w',w')}} - \frac{\rho(w,w')} {\sqrt{\rho(w,w)\rho(w',w')}} \right|^{1/3} (\log n)^{2/3} \\ &\quad\lesssim_\P \left(\frac n k \right)^{1/3} \sup_{w,w' \in \cW} |\hat\rho(w,w') - \rho(w,w')|^{1/3} (\log n)^{2/3} \\ &\quad\lesssim_\P \left( \frac{n (\log n)^2}{k} \right)^{1/3} \sup_{w,w' \in \cW} \left| p(w)^\T \hat H^{-1} \left( \hat{V}[S] - \Var[S] \right) \hat H^{-1} p(w') \right|^{1/3} \\ &\quad\lesssim_\P \left( \frac{k (\log n)^2}{n} \right)^{1/3} \left\| \hat{V}[S] - \Var[S] \right\|_2^{1/3}, \end{align*} % and vanishes in probability whenever $\frac{k (\log n)^2}{n} \big\| \hat{V}[S] - \Var[S] \big\|_2 \to_\P 0$.
For the plug-in estimator, % \begin{align*} &\left\| \hat{V}[S] - \Var[S] \right\|_2 = \left\| \sum_{i=1}^n p(W_i) p(W_i)^\T \hat\sigma^2(W_i) - n \E\left[ p(W_i) p(W_i)^\T \sigma^2(W_i) \right] \right\|_2 \\ &\quad\lesssim_\P \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \, \big\| \hat H \big\|_2 \\ &\qquad+ \left\| \sum_{i=1}^n p(W_i) p(W_i)^\T \sigma^2(W_i) - n \E\left[ p(W_i) p(W_i)^\T \sigma^2(W_i) \right] \right\|_2 \\ &\quad\lesssim_\P \frac{n}{k} \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| + \sqrt{n k}, \end{align*} % where the second term is bounded by the same argument used to bound $\|\hat H - H\|_1$. Thus, the feasible approximation is valid whenever $(\log n)^2 \sup_{w \in \cW} |\hat{\sigma}^2(w)-\sigma^2(w)| \to_\P 0$ and $\frac{k^3 (\log n)^4}{n} \to 0$. The validity of the uniform confidence band follows immediately. % \end{proof} \begin{proof}[Proposition~\ref{pro:yurinskii_local_poly}] We apply Proposition~\ref{pro:yurinskii_emp_proc} with the metric $d(f_w, f_{w'}) = \|w-w'\|_2$ and the function class % \begin{align*} \cF &= \left\{ (W_i, \varepsilon_i) \mapsto e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \varepsilon_i :\ w \in \cW \right\}, \end{align*} % with $\psi$ chosen as a suitable Bernstein--Orlicz function. \proofparagraph{bounding $H(w)^{-1}$} Recall that $H(w) = \sum_{i=1}^n \E[K_h(W_i-w) p_h(W_i-w)p_h(W_i-w)^\T]$ and let $a(w) \in \R^k$ with $\|a(w)\|_2 = 1$. Since the density of $W_i$ is bounded away from zero on $\cW$, % \begin{align*} a(w)^\T H(w) a(w) &= n \E\left[ \big( a(w)^\T p_h(W_i-w) \big)^2 K_h(W_i-w) \right] \\ &\gtrsim n \int_\cW \big( a(w)^\T p_h(u-w) \big)^2 K_h(u-w) \diff{u} \gtrsim n \int_{\frac{\cW-w}{h}} \big( a(w)^\T p(u) \big)^2 K(u) \diff{u}. \end{align*} % This is continuous in $a(w)$ on the compact set $\{\|a(w)\|_2 = 1\}$, and since $p(u)$ forms a polynomial basis, the polynomial $a(w)^\T p(u)$ vanishes only on a set of Lebesgue measure zero. Since $K(u)$ is compactly supported and $h \to 0$, the above integral is eventually strictly positive for all $w \in \cW$, and hence is bounded below uniformly in $w \in \cW$ by a positive constant. Therefore $\sup_{w \in \cW} \|H(w)^{-1}\|_2 \lesssim 1/n$. \proofparagraph{bounding $\beta_\delta$} Let $\cF_\delta$ be a $\delta$-cover of $(\cF, d)$ with cardinality $|\cF_\delta| \asymp \delta^{-m}$ and let $\cF_\delta(W_i, \varepsilon_i) = \big(f(W_i, \varepsilon_i) : f\in \cF_\delta\big)$. Define the truncated errors $\tilde\varepsilon_i = \varepsilon_i\I\{-a \log n \leq \varepsilon_i \leq b \log n\}$ and note that $\E\big[e^{|\varepsilon_i|/C_\varepsilon}\big] < \infty$ implies that $\P(\exists i: \tilde\varepsilon_i \neq \varepsilon_i) \lesssim n^{1-(a \wedge b)/C_\varepsilon}$. Hence, by choosing $a$ and $b$ large enough, with high probability, we can replace all $\varepsilon_i$ by $\tilde\varepsilon_i$. Further, it is always possible to increase either $a$ or $b$ along with some randomization to ensure that $\E[\tilde\varepsilon_i] = 0$.
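To verify the claimed bound on the truncation probability, one can apply a union bound and Markov's inequality to $e^{|\varepsilon_i|/C_\varepsilon}$: since $\tilde\varepsilon_i \neq \varepsilon_i$ implies $|\varepsilon_i| > (a \wedge b) \log n$, % \begin{align*} \P(\exists i : \tilde\varepsilon_i \neq \varepsilon_i) \leq \sum_{i=1}^n \P\big( |\varepsilon_i| > (a \wedge b) \log n \big) \leq n \max_{1 \leq i \leq n} \E\big[ e^{|\varepsilon_i| / C_\varepsilon} \big] \, n^{-(a \wedge b)/C_\varepsilon} \lesssim n^{1 - (a \wedge b)/C_\varepsilon}, \end{align*} % which is $O(n^{-D})$ for any fixed $D > 0$ once $a$ and $b$ are chosen large enough.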
Since $K$ is bounded and compactly supported, $W_i$ has a bounded density and $|\tilde\varepsilon_i| \lesssim \log n$, % \begin{align*} \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2 &= \E\left[ \left| e_1^\T H(w)^{-1} K_h(W_i-w) p_h(W_i-w) \tilde\varepsilon_i \right|^2 \right]^{1/2} \\ &\leq \E\left[ \|H(w)^{-1}\|_2^2 K_h(W_i-w)^2 \|p_h(W_i-w)\|_2^2 \sigma^2(W_i) \right]^{1/2} \\ &\lesssim n^{-1} \E\left[ K_h(W_i-w)^2 \right]^{1/2} \lesssim n^{-1} h^{-m / 2}, \\ \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty &\leq \bigvvvert \|H(w)^{-1}\|_2 K_h(W_i-w) \|p_h(W_i-w)\|_2 |\tilde\varepsilon_i| \bigvvvert_\infty \\ &\lesssim n^{-1} \bigvvvert K_h(W_i-w) \bigvvvert_\infty \log n \lesssim n^{-1} h^{-m} \log n. \end{align*} % Therefore % \begin{align*} \E\left[ \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty \right] &\leq \!\sum_{f\in\cF_\delta} \!\bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_2^2 \max_{f\in\cF_\delta} \bigvvvert f(W_i, \tilde\varepsilon_i) \bigvvvert_\infty \!\lesssim n^{-3} \delta^{-m} h^{-2m} \log n. \end{align*} % Let $V_i(\cF_\delta) = \E\big[\cF_\delta(W_i, \tilde\varepsilon_i) \cF_\delta(W_i, \tilde\varepsilon_i)^\T \mid \cH_{i-1}\big]$ and $Z_i \sim \cN(0, I_{|\cF_\delta|})$ be i.i.d.\ and independent of $\cH_n$. Note that $V_i(f,f) = \E[f(W_i, \tilde\varepsilon_i)^2 \mid \cH_{i-1}] \lesssim n^{-2} h^{-2m}$ and $\E[V_i(f,f)] = \E[f(W_i, \tilde\varepsilon_i)^2] \lesssim n^{-2} h^{-m}$. Thus by Lemma~\ref{lem:yurinskii_app_gaussian_useful}, % \begin{align*} \E\left[ \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \right] &= \E\left[ \E\left[ \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \mid \cH_n \right] \right] \\ &\leq 4 \sqrt{\log 2|\cF_\delta|} \,\E\Bigg[ \max_{f \in \cF_\delta} \sqrt{V_i(f,f)} \sum_{f \in \cF_\delta} V_i(f,f) \Bigg] \\ &\lesssim n^{-3} h^{-2m} \delta^{-m} \sqrt{\log(1/\delta)}. \end{align*} % Thus since $\log(1/\delta) \asymp \log(1/h) \asymp\log n$, % \begin{align*} \beta_\delta &= \sum_{i=1}^n \E\left[ \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_2^2 \|\cF_\delta(W_i, \tilde\varepsilon_i)\|_\infty + \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|^2_2 \big\| V_i(\cF_\delta)^{1/2} Z_i \big\|_\infty \right] \lesssim \frac{\log n} {n^2 h^{2m} \delta^m}. \end{align*} \proofparagraph{bounding $\Omega_\delta$} Let $C_K>0$ be the radius of an $\ell^2$-ball containing the support of $K$ and note that % \begin{align*} \left| V_i(f,f') \right| &= \Big| \E\Big[ e_1^\T H(w)^{-1} p_h(W_i-w) e_1^\T H(w')^{-1} p_h(W_i-w') \\ &\qquad\times K_h(W_i-w) K_h(W_i-w') \tilde\varepsilon_i^2 \Bigm| \cH_{i-1} \Big] \Big| \\ &\lesssim n^{-2} \E\big[ K_h(W_i-w) K_h(W_i-w') \bigm| \cH_{i-1} \big] \\ &\lesssim n^{-2} h^{-m} \E\big[ K_h(W_i-w) \bigm| \cH_{i-1} \big] \I\{\|w-w'\|_2 \leq 2 C_K h\}. \end{align*} % Since $W_i$ are $\alpha$-mixing with $\alpha(j) < e^{-2j / C_\alpha}$, Lemma~\ref{lem:yurinskii_app_variance_mixing}% \ref{it:yurinskii_app_variance_mixing_exponential} with $r=3$ gives % \begin{align*} &\Var\left[ \sum_{i=1}^n V_i(f,f') \right] \\ &\quad\lesssim \sum_{i=1}^n \E\left[ |V_i(f,f')|^3 \right] ^{2/3} \lesssim n^{-3} h^{-2m} \E\left[ K_h(W_i-w)^3 \right] ^{2/3} \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ &\quad\lesssim n^{-3} h^{-2m} (h^{-2m})^{2/3} \I\{\|w-w'\|_2 \leq 2 C_K h\} \\ &\quad\lesssim n^{-3} h^{-10m/3} \I\{\|w-w'\|_2 \leq 2 C_K h\}.
\end{align*} % Therefore, by Jensen's inequality, % \begin{align*} \E\big[ \|\Omega_\delta\|_2 \big] &\leq \E\big[ \|\Omega_\delta\|_\rF \big] \leq \E\Bigg[ \sum_{f,f' \in \cF_\delta} (\Omega_\delta)_{f,f'}^2 \Bigg]^{1/2} \leq \Bigg( \sum_{f,f' \in \cF_\delta} \Var\left[ \sum_{i=1}^n V_i(f,f') \right] \Bigg)^{1/2} \\ &\lesssim n^{-3/2} h^{-5m/3} \Bigg( \sum_{f,f' \in \cF_\delta} \I\{\|w-w'\|_2 \leq 2 C_K h\} \Bigg)^{1/2} \\ &\lesssim n^{-3/2} h^{-5m/3} \big(h^{m} \delta^{-2m} \big)^{1/2} \lesssim n^{-3/2} h^{-7m/6} \delta^{-m}. \end{align*} % Note that we could have used $\|\cdot\|_1$ rather than $\|\cdot\|_\rF$, but this term is negligible either way. \proofparagraph{regularity of the stochastic processes} For each $f, f' \in \cF$, define the mean-zero and $\alpha$-mixing random variables % \begin{align*} u_i(f,f') &= e_1^\T \big( H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') \big) \tilde\varepsilon_i. \end{align*} % Observe that by the Lipschitz properties of the kernel and the monomials, for all $1 \leq j \leq k$, % \begin{align*} &\left| K_h(W_i-w) - K_h(W_i-w') \right| \\ &\quad\lesssim h^{-m-1} \|w-w'\|_2 \big( \I\{\|W_i-w\|_2 \leq C_K h\} + \I\{\|W_i-w'\|_2 \leq C_K h\} \big), \\ &\left| p_h(W_i-w)_j - p_h(W_i-w')_j \right| \lesssim h^{-1} \|w-w'\|_2. \end{align*} % We deduce that for any $1 \leq j,l \leq k$, % \begin{align*} \big| H(w)_{j l} - H(w')_{j l} \big| &= \big| n \E\big[ K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \\ &\qquad- K_h(W_i-w') p_h(W_i-w')_j p_h(W_i-w')_l \big] \big| \\ &\leq n\E\left[ \left| K_h(W_i-w) - K_h(W_i-w') \right| \left| p_h(W_i-w)_j p_h(W_i-w)_l \right| \right] \\ &\quad+ n\E\left[ \left| p_h(W_i-w)_j - p_h(W_i-w')_j \right| \left| K_h(W_i-w') p_h(W_i-w)_l \right| \right] \\ &\quad+ n\E\left[ \left| p_h(W_i-w)_l - p_h(W_i-w')_l \right| \left| K_h(W_i-w') p_h(W_i-w')_j \right| \right] \\ &\lesssim n h^{-1}\|w-w'\|_2. \end{align*} % Therefore, as the dimension of the matrix $H(w)$ is fixed, % \begin{align*} \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 &\leq \big\| H(w)^{-1}\big\|_2 \big\| H(w')^{-1}\big\|_2 \big\| H(w) - H(w') \big\|_2 \lesssim \frac{\|w-w'\|_2}{n h}. \end{align*} % Hence % \begin{align*} \big| u_i(f,f') \big| &\leq \big\| \big( H(w)^{-1} K_h(W_i-w) p_h(W_i-w) - H(w')^{-1} K_h(W_i-w') p_h(W_i-w') \big) \tilde\varepsilon_i \big\|_2 \\ &\leq \big\| H(w)^{-1} - H(w')^{-1} \big\|_2 \big\| K_h(W_i-w) p_h(W_i-w) \tilde\varepsilon_i \big\|_2 \\ &\quad+ \big| K_h(W_i-w) - K_h(W_i-w') \big| \big\| H(w')^{-1} p_h(W_i-w) \tilde\varepsilon_i \big\|_2 \\ &\quad+ \big\| p_h(W_i-w) - p_h(W_i-w') \big\|_2 \big\| H(w')^{-1} K_h(W_i-w') \tilde\varepsilon_i \big\|_2 \\ &\lesssim \frac{\|w-w'\|_2}{n h} \big| K_h(W_i-w) \tilde\varepsilon_i \big| + \frac{1}{n} \big| K_h(W_i-w) - K_h(W_i-w') \big| \,|\tilde\varepsilon_i| \\ &\lesssim \frac{\|w-w'\|_2 \log n}{n h^{m+1}}, \end{align*} % and from the penultimate line, we also deduce that % \begin{align*} \Var[u_i(f,f')] &\lesssim \frac{\|w-w'\|_2^2}{n^2h^2} \E\left[ K_h(W_i-w)^2 \sigma^2(W_i) \right] \\ &\quad+ \frac{1}{n^2} \E\left[ \big( K_h(W_i-w) - K_h(W_i-w') \big)^2 \sigma^2(W_i) \right] \lesssim \frac{\|w-w'\|_2^2}{n^2h^{m+2}}.
\end{align*} % Further, $\E[u_i(f,f') u_j(f,f')] = 0$ for $i \neq j$ so by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bernstein}, for a constant $C_1>0$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_i(f,f') \Big| \geq \frac{C_1 \|w-w'\|_2}{\sqrt n h^{m/2+1}} \left( \sqrt{t} + \sqrt{\frac{(\log n)^2}{n h^m}} \sqrt t + \sqrt{\frac{(\log n)^6}{n h^m}} t \right) \right) &\leq C_1 e^{-t}. \end{align*} % Therefore, adjusting the constant if necessary and since $n h^{m} \gtrsim (\log n)^7$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_i(f,f') \Big| \geq \frac{C_1 \|w-w'\|_2}{\sqrt{n} h^{m/2+1}} \left( \sqrt{t} + \frac{t}{\sqrt{\log n}} \right) \right) &\leq C_1 e^{-t}. \end{align*} % \Citet[Lemma~2]{van2013bernstein} with $\psi(x) = \exp\Big(\big(\sqrt{1+2 x / \sqrt{\log n}}-1 \big)^2 \log n \Big)-1$ now shows that % \begin{align*} \Bigvvvert \sum_{i=1}^n u_i(f,f') \Bigvvvert_\psi &\lesssim \frac{\|w-w'\|_2}{\sqrt{n} h^{m/2+1}} \end{align*} % so we take $L = \frac{1}{\sqrt{n} h^{m/2+1}}$. Noting $\psi^{-1}(t) = \sqrt{\log(1+t)} + \frac{\log(1+t)}{2\sqrt{\log n}}$ and $N_\delta \lesssim \delta^{-m}$, % \begin{align*} J_\psi(\delta) &= \int_0^\delta \psi^{-1}\big( N_\varepsilon \big) \diff{\varepsilon} + \delta \psi^{-1} \big( N_\delta \big) \lesssim \frac{\delta \log(1/\delta)}{\sqrt{\log n}} + \delta \sqrt{\log(1/\delta)} \lesssim \delta \sqrt{\log n}, \\ J_2(\delta) &= \int_0^\delta \sqrt{\log N_\varepsilon} \diff{\varepsilon} \lesssim \delta \sqrt{\log(1/\delta)} \lesssim \delta \sqrt{\log n}. \end{align*} \proofparagraph{strong approximation} Recalling that $\tilde\varepsilon_i = \varepsilon_i$ for all $i$ with high probability, by Proposition~\ref{pro:yurinskii_emp_proc}, for all $t, \eta > 0$ there exists a zero-mean Gaussian process $T(w)$ satisfying % \begin{align*} \E\left[ \left(\sum_{i=1}^n f_w(W_i, \varepsilon_i)\right) \left(\sum_{i=1}^n f_{w'}(W_i, \varepsilon_i)\right) \right] &= \E\big[ T(w) T(w') \big] \end{align*} % for all $w, w' \in \cW$ and % \begin{align*} &\P\left( \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| \geq C_\psi(t + \eta) \right) \\ &\quad\leq C_\psi \inf_{\delta > 0} \inf_{\cF_\delta} \Bigg\{ \frac{\beta_\delta^{1/3} (\log 2 |\cF_\delta|)^{1/3}}{\eta } + \left(\frac{\sqrt{\log 2 |\cF_\delta|} \sqrt{\E\left[\|\Omega_\delta\|_2\right]}}{\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t}{L J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{L^2 J_2(\delta)^2}\right) \Bigg\} \\ &\quad\leq C_\psi \Bigg\{ \frac{ \left(\frac{\log n} {n^2 h^{2m} \delta^{m}} \right)^{1/3} (\log n)^{1/3}}{\eta } + \left(\frac{\sqrt{\log n} \sqrt{n^{-3/2} h^{-7m/6} \delta^{-m}} }{\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t}{\frac{1}{\sqrt{n} h^{m/2+1}} J_\psi(\delta)}\right)^{-1} + \exp\left(\frac{-t^2}{ \left( \frac{1}{\sqrt{n} h^{m/2+1}} \right)^2 J_2(\delta)^2}\right) \Bigg\} \\ &\quad\leq C_\psi \Bigg\{ \frac{ (\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3} \eta} + \left(\frac{ n^{-3/4} h^{-7m/12} \delta^{-m/2} \sqrt{\log n}} {\eta }\right)^{2/3} \\ &\qquad+ \psi\left(\frac{t\sqrt{n} h^{m/2+1}} {\delta \sqrt{\log n}}\right)^{-1} + \exp\left(\frac{-t^2n h^{m+2}} {\delta^2 \log n}\right) \Bigg\}. 
\end{align*} % Noting $\psi(x) \geq e^{x^2/4}$ for $x \leq 4 \sqrt{\log n}$, any $R_n \to \infty$ gives the probability bound % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| &\lesssim_\P \frac{(\log n)^{2/3}}{n^{2/3} h^{2m/3} \delta^{m/3}} R_n + \frac{\sqrt{\log n}}{n^{3/4} h^{7m/12} \delta^{m/2}} R_n + \frac{\delta \sqrt{\log n}} {\sqrt{n} h^{m/2+1}}. \end{align*} % Optimizing over $\delta$ gives $\delta \asymp \left(\frac{\log n}{n h^{m-6}}\right)^{\frac{1}{2m+6}} = h \left( \frac{\log n}{n h^{3m}} \right)^{\frac{1}{2m+6}}$ and so % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n f_{w}(W_i, \varepsilon_i) - T(w) \right| &\lesssim_\P \left( \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} \right)^{\frac{1}{2m+6}} R_n. \end{align*} \proofparagraph{convergence of $\hat H(w)$} For $1 \leq j,l \leq k$ define the zero-mean random variables % \begin{align*} u_{i j l}(w) &= K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l - \E\big[K_h(W_i-w) p_h(W_i-w)_j p_h(W_i-w)_l \big] \end{align*} % and note that $|u_{i j l}(w)| \lesssim h^{-m}$. By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_2 > 0$ and all $t > 0$, % \begin{align*} \P\left( \left| \sum_{i=1}^n u_{i j l}(w) \right| > C_2 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right) &\leq C_2 e^{-t}. \end{align*} % Further, note that by Lipschitz properties, % \begin{align*} \left| \sum_{i=1}^n u_{i j l}(w) - \sum_{i=1}^n u_{i j l}(w') \right| &\lesssim h^{-m-1} \|w-w'\|_2 \end{align*} % so there is a $\delta$-cover of $(\cW, \|\cdot\|_2)$ with size at most $n^a \delta^{-a}$ for some $a > 0$. Adjusting $C_2$, % \begin{align*} \P\left( \sup_{w \in \cW} \left| \sum_{i=1}^n u_{i j l}(w) \right| > C_2 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) + C_2 h^{-m-1} \delta \right) &\leq C_2 n^a \delta^{-a} e^{-t} \end{align*} % and hence % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n u_{i j l}(w) \right| &\lesssim_\P h^{-m} \sqrt{n \log n} + h^{-m} (\log n)^3 \lesssim_\P \sqrt{\frac{n \log n}{h^{2m}}}. \end{align*} % Therefore % \begin{align*} \sup_{w\in\cW} \|\hat H(w)-H(w)\|_2 &\lesssim_\P \sqrt{\frac{n \log n}{h^{2m}}}. \end{align*} \proofparagraph{bounding the matrix term} Firstly, note that since $\sqrt{\frac{\log n}{n h^{2m}}} \to 0$, we have that uniformly in $w \in \cW$ % \begin{align*} \|\hat H(w)^{-1}\|_2 \leq \frac{\|H(w)^{-1}\|_2} {1 - \|\hat H(w)-H(w)\|_2 \|H(w)^{-1}\|_2} &\lesssim_\P \frac{1/n} {1 - \sqrt{\frac{n \log n}{h^{2m}}} \frac{1}{n}} \lesssim_\P \frac{1}{n}. \end{align*} % Therefore % \begin{align*} &\sup_{w \in \cW} \big| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \big| \leq \sup_{w \in \cW} \big\|\hat H(w)^{-1} - H(w)^{-1}\big\|_2 \|S(w)\|_2 \\ &\quad\leq \sup_{w \in \cW} \big\|\hat H(w)^{-1}\big\|_2 \big\|H(w)^{-1}\big\|_2 \big\|\hat H(w) - H(w)\big\|_2 \|S(w)\|_2 \lesssim_\P \sqrt{\frac{\log n}{n^3 h^{2m}}} \sup_{w \in \cW} \|S(w)\|_2. \end{align*} % Now for $1 \leq j \leq k$ write $u_{i j}(w) = K_h(W_i-w) p_h(W_i-w)_j \tilde \varepsilon_i$ so that $S(w)_j = \sum_{i=1}^n u_{i j}(w)$ with high probability. Note that $u_{i j}(w)$ are zero-mean with $\Cov[u_{i j}(w), u_{i' j}(w)] = 0$ for $ i \neq i'$. Also $|u_{i j}(w)| \lesssim h^{-m} \log n$ and $\Var[u_{i j}(w)] \lesssim h^{-m}$. 
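Indeed, since $\E[\tilde\varepsilon_i^2 \mid W_i] \leq \sigma^2(W_i) \lesssim 1$ and the entries of $p_h(W_i-w)$ are bounded on the support of $K_h(W_i-w)$, the variance bound follows from the bounded density of $W_i$: % \begin{align*} \Var[u_{i j}(w)] \leq \E\big[ K_h(W_i-w)^2 p_h(W_i-w)_j^2 \tilde\varepsilon_i^2 \big] \lesssim \E\big[ K_h(W_i-w)^2 \big] \lesssim h^{-m}. \end{align*}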
By Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bernstein} for a constant $C_3>0$, % \begin{align*} \P\left( \Big| \sum_{i=1}^n u_{i j}(w) \Big| \geq C_3 \big( (h^{-m/2} \sqrt n + h^{-m} \log n) \sqrt t + h^{-m} (\log n)^3 t \big) \right) &\leq C_3 e^{-t}, \\ \P\left( \Big| \sum_{i=1}^n u_{i j}(w) \Big| > C_3 \left( \sqrt{\frac{tn}{h^{m}}} + \frac{t(\log n)^3}{h^{m}} \right) \right) &\leq C_3 e^{-t}, \end{align*} % where we used $n h^{m} \gtrsim (\log n)^2$ and adjusted the constant if necessary. As before, $u_{i j}(w)$ is Lipschitz in $w$ with a constant which is at most polynomial in $n$, so for some $a>0$ % \begin{align*} \P\left( \sup_{w \in \cW} \Big| \sum_{i=1}^n u_{i j}(w) \Big| > C_3 \left( \sqrt{\frac{tn}{h^{m}}} + \frac{t(\log n)^3}{h^{m}} \right) \right) &\leq C_3 n^a e^{-t}, \\ \sup_{w \in \cW} \|S(w)\|_2 \lesssim_\P \sqrt{\frac{n \log n}{h^{m}}} + \frac{(\log n)^4}{h^{m}} &\lesssim_\P \sqrt{\frac{n \log n}{h^{m}}} \end{align*} % as $n h^m \gtrsim (\log n)^7$. Finally, % \begin{align*} \sup_{w \in \cW} \big| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \big| &\lesssim_\P \sqrt{\frac{\log n}{n^3 h^{2m}}} \sqrt{\frac{n \log n}{h^{m}}} \lesssim_\P \frac{\log n}{\sqrt{n^2 h^{3m}}}. \end{align*} \proofparagraph{bounding the bias} Since $\mu \in \cC^\gamma$, we have, by the multivariate version of Taylor's theorem, % \begin{align*} \mu(W_i) &= \sum_{|\kappa|=0}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) (W_i-w)^\kappa + \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') (W_i-w)^\kappa \end{align*} % for some $w'$ on the line segment connecting $w$ and $W_i$. Now since $p_h(W_i-w)_1 = 1$, % \begin{align*} &e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(w) \\ &\quad= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) p_h(W_i-w)^\T e_1 \mu(w) = e_1^\T e_1 \mu(w) = \mu(w). \end{align*} % Therefore % \begin{align*} \Bias(w) &= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \mu(W_i) - \mu(w) \\ &= e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \\ &\quad\times \Bigg( \sum_{|\kappa|=0}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) (W_i-w)^\kappa + \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') (W_i-w)^\kappa - \mu(w) \Bigg) \\ &= \sum_{|\kappa|=1}^{\gamma-1} \frac{1}{\kappa!} \partial^{\kappa} \mu(w) e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \\ &\quad+ \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \\ &= \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa, \end{align*} % where we used that $p_h(W_i-w)$ is a vector containing monomials in $W_i-w$ of order up to $\gamma$, so $e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa = 0$ whenever $1 \leq |\kappa| \leq \gamma$. Finally, % \begin{align*} \sup_{w\in\cW} |\Bias(w)| &= \sup_{w\in\cW} \Bigg| \sum_{|\kappa|=\gamma} \frac{1}{\kappa!} \partial^{\kappa} \mu(w') e_1^\T \hat H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) (W_i-w)^\kappa \Bigg| \\ &\lesssim_\P \sup_{w\in\cW} \max_{|\kappa| = \gamma} \left| \partial^{\kappa} \mu(w') \right| \|\hat H(w)^{-1}\|_2 \Bigg\| \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \Bigg\|_2 h^\gamma \\ &\lesssim_\P \frac{h^\gamma}{n} \sup_{w\in\cW} \Bigg\| \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \Bigg\|_2. 
\end{align*} % Write $\tilde u_{i j}(w) = K_h(W_i-w)p_h(W_i-w)_j$ and note $|\tilde u_{i j}(w)| \lesssim h^{-m}$ and $\E[\tilde u_{i j}(w)] \lesssim 1$, so % \begin{align*} \P\left( \left| \sum_{i=1}^n \tilde u_{i j}(w) - \E\left[ \sum_{i=1}^n \tilde u_{i j}(w) \right] \right| > C_4 h^{-m} \big( \sqrt{n t} + (\log n)(\log \log n) t \big) \right) &\leq C_4 e^{-t} \end{align*} % by Lemma~\ref{lem:yurinskii_app_exponential_mixing}% \ref{it:yurinskii_app_exponential_mixing_bounded} for a constant $C_4 > 0$. By Lipschitz properties, this implies % \begin{align*} \sup_{w \in \cW} \left| \sum_{i=1}^n \tilde u_{i j}(w) \right| &\lesssim_\P n \left( 1 + \sqrt{\frac{\log n}{n h^{2m}}} \right) \lesssim_\P n. \end{align*} % Therefore $\sup_{w\in\cW} |\Bias(w)| \lesssim_\P n h^\gamma / n \lesssim_\P h^\gamma$. \proofparagraph{conclusion} By the previous parts, % \begin{align*} \sup_{w \in \cW} \left|\hat \mu(w) - \mu(w) - T(w) \right| &\leq \sup_{w \in \cW} \left|e_1^\T H(w)^{-1} S(w) - T(w) \right| \\ &\quad+ \sup_{w \in \cW} \left| e_1^\T \big(\hat H(w)^{-1} - H(w)^{-1}\big) S(w) \right| + \sup_{w \in \cW} |\Bias(w)| \\ &\lesssim_\P \left( \frac{(\log n)^{m+4}}{n^{m+4}h^{m(m+6)}} \right)^{\frac{1}{2m+6}} R_n + \frac{\log n}{\sqrt{n^2 h^{3m}}} + h^\gamma \\ &\lesssim_\P \frac{R_n}{\sqrt{n h^m}} \left( \frac{(\log n)^{m+4}}{n h^{3m}} \right)^{\frac{1}{2m+6}} + h^\gamma, \end{align*} % where the last inequality follows because $n h^{3m} \to \infty$ and $\frac{1}{2m+6} \leq \frac{1}{2}$. Finally, we verify the upper and lower bounds on the variance of the Gaussian process. Since the spectrum of $H(w)^{-1}$ is bounded above and below by constant multiples of $1/n$, % \begin{align*} \Var[T(w)] &= \Var\left[ e_1^\T H(w)^{-1} \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i \right] \\ &= e_1^\T H(w)^{-1} \Var\left[ \sum_{i=1}^n K_h(W_i-w) p_h(W_i-w) \varepsilon_i \right] H(w)^{-1} e_1 \\ &\lesssim \|H(w)^{-1}\|_2^2 \max_{1 \leq j \leq k} \sum_{i=1}^n \Var\big[ K_h(W_i-w) p_h(W_i-w)_j \varepsilon_i \big] \\ &\lesssim \frac{1}{n^2} n \frac{1}{h^m} \lesssim \frac{1}{n h^m}. \end{align*} % Similarly, $\Var[T(w)] \gtrsim \frac{1}{n h^m}$ by the same argument used to bound the eigenvalues of $H(w)^{-1}$. % \end{proof} \section{High-dimensional central limit theorems for martingales}% \label{sec:yurinskii_app_high_dim_clt} We present an application of our main results to high-dimensional central limit theorems for martingales. Our main contribution here is the generality of our results, which are broadly applicable to martingale data and impose minimal extra assumptions. In exchange for this scope and breadth, we do not necessarily achieve state-of-the-art distributional approximation errors in certain special cases, such as with independent data or when restricting the class of sets over which the central limit theorem must hold. Extensions of our high-dimensional central limit theorem results to mixingales and other approximate martingales, along with third-order refinements and Gaussian mixture target distributions, are possible through methods akin to those used to establish our main results in Section~\ref{sec:yurinskii_main_results}, but we omit these for succinctness. Our approach to deriving a high-dimensional martingale central limit theorem proceeds as follows.
Firstly, the upcoming Proposition~\ref{pro:yurinskii_app_clt} uses our main result on martingale coupling (Corollary~\ref{cor:yurinskii_sa_martingale}) to reduce the problem to that of providing anti-concentration results for high-dimensional Gaussian vectors. We then demonstrate the utility of this reduction by employing a few such anti-concentration methods from the existing literature. Proposition~\ref{pro:yurinskii_app_bootstrap} gives a feasible implementation via the Gaussian multiplier bootstrap, enabling valid resampling-based inference using the resulting conditional Gaussian distribution. Finally, in Section~\ref{sec:yurinskii_app_lp} we provide an example application: distributional approximation for $\ell^p$-norms of high-dimensional martingale vectors in Kolmogorov--Smirnov distance, relying on some recent results concerning Gaussian perimetric inequalities \citep{nazarov2003maximal,kozbur2021dimension, giessing2023anti,chernozhukov2017detailed}. We begin this section with some notation. Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale} and suppose $\Sigma$ is non-random. Let $\cA$ be a class of measurable subsets of $\R^d$ and take $T \sim \cN(0, \Sigma)$. For $\eta>0$ and $p \in [1, \infty]$ define the Gaussian perimetric quantity % \begin{align*} \Delta_p(\cA, \eta) &= \sup_{A\in \cA} \big\{\P(T\in A_p^\eta\setminus A) \vee \P(T\in A \setminus A_p^{-\eta})\big\}, \end{align*} % where $A_p^\eta = \{x \in \R^d : \|x - A\|_p \leq \eta\}$, $A_p^{-\eta} = \R^d \setminus (\R^d \setminus A)_p^\eta$, and $\|x - A\|_p = \inf_{x' \in A} \|x - x'\|_p$. Using this perimetric term allows us to convert coupling results to central limit theorems as follows. Denote by $\Gamma_p(\eta)$ the rate of strong approximation attained in Corollary~\ref{cor:yurinskii_sa_martingale}: % \begin{align*} \Gamma_p(\eta) &= 24 \left( \frac{\beta_{p,2} \phi_p(d)^2}{\eta^3} \right)^{1/3} + 17 \left( \frac{\E \left[ \|\Omega\|_2 \right] \phi_p(d)^2}{\eta^2} \right)^{1/3}. \end{align*} \begin{proposition}[High-dimensional central limit theorem for martingales]% \label{pro:yurinskii_app_clt} Take the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, and $\Sigma$ non-random. For a class $\cA$ of measurable sets in $\R^d$, % \begin{equation}% \label{eq:yurinskii_app_high_dim_clt} \sup_{A\in \cA} \big|\P(S\in A) -\P(T\in A)\big| \leq \inf_{p \in [1, \infty]} \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}. \end{equation} \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_clt}] This follows from Strassen's theorem (Lemma~\ref{lem:yurinskii_app_strassen}), but we provide a proof for completeness. Fix $p \in [1, \infty]$ and $\eta > 0$, and let $T$ be the coupling attaining $\P(\|S - T\|_p > \eta) \leq \Gamma_p(\eta)$ in Corollary~\ref{cor:yurinskii_sa_martingale}. For any $A \in \cA$, % \begin{align*} \P(S \in A) &\leq \P(T \in A) + \P(T \in A_p^\eta \setminus A) + \P(\|S - T\|_p > \eta) \end{align*} % and applying this to $\R^d \setminus A$ gives % \begin{align*} \P(S\in A) &= 1 - \P(S\in \R^d \setminus A) \\ &\geq 1 - \P(T \in \R^d \setminus A) - \P(T \in (\R^d \setminus A)_p^\eta \setminus (\R^d \setminus A)) - \P(\|S - T\|_p > \eta) \\ &= \P(T \in A) - \P(T \in A \setminus A_p^{-\eta}) - \P(\|S - T\|_p > \eta). \end{align*} % Since this holds for all $p \in [1, \infty]$ and $\eta > 0$, % \begin{align*} \sup_{A\in \cA} \big|\P(S\in A) -\P(T\in A)\big| &\leq \sup_{A \in \cA} \big\{\P(T \in A_p^\eta\setminus A) \vee \P(T \in A \setminus A_p^{-\eta})\big\} + \P(\|S - T\|_p > \eta) \\ &\leq \inf_{p \in [1, \infty]} \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cA, \eta) \big\}.
\end{align*} % \end{proof} The term $\Delta_p(\cA, \eta)$ in \eqref{eq:yurinskii_app_high_dim_clt} is a Gaussian anti-concentration quantity, so it depends on the law of $S$ only through the covariance matrix $\Sigma$. A few results are available in the literature for bounding this term. For instance, with $\cA = \cC = \{A \subseteq \R^d : A \text{ convex}\}$, \citet{nazarov2003maximal} showed % \begin{equation}% \label{eq:yurinskii_app_convex_anticonc} \Delta_2(\cC, \eta) \asymp \eta\sqrt{\|\Sigma^{-1}\|_{\rF}}, \end{equation} % whenever $\Sigma$ is invertible. Proposition~\ref{pro:yurinskii_app_clt} with $p=2$ and \eqref{eq:yurinskii_app_convex_anticonc} yield for convex sets % \begin{align*} \sup_{A\in \cC} \big|\P(S\in A) -\P(T\in A)\big| &\lesssim \inf_{\eta > 0} \left\{ \left(\frac{\beta_{2,2} d}{\eta^3}\right)^{1/3} + \left(\frac{\E[\|\Omega \|_2] d}{\eta^2}\right)^{1/3} + \eta \sqrt{\|\Sigma^{-1}\|_\rF} \right\}. \end{align*} Alternatively, one can take $\cA = \cR$, the class of axis-aligned rectangles in $\R^d$. By Nazarov's Gaussian perimetric inequality \citep{nazarov2003maximal,chernozhukov2017central}, % \begin{align}% \label{eq:yurinskii_app_rect_anticonc} \Delta_\infty(\cR, \eta) \leq \frac{\eta (\sqrt{2\log d} + 2)}{\sigma_{\min}} \end{align} % whenever $\min_j \, \Sigma_{j j} \geq \sigma_{\min}^2$ for some $\sigma_{\min}>0$. Proposition~\ref{pro:yurinskii_app_clt} with $p = \infty$ and \eqref{eq:yurinskii_app_rect_anticonc} yields % \begin{align*}% &\sup_{A\in \cR} \big|\P(S\in A) -\P(T\in A)\big| \lesssim \inf_{\eta > 0} \left\{ \left(\frac{\beta_{\infty,2} \log 2d}{\eta^3}\right)^{1/3} + \left(\frac{\E[\|\Omega \|_2] \log 2d}{\eta^2}\right)^{1/3} + \frac{\eta \sqrt{\log 2d}}{\sigma_{\min}} \right\}. \end{align*} % In situations where $\liminf_n \min_j \, \Sigma_{j j} = 0$, it may be possible in certain cases to regularize the minimum variance away from zero and then apply a Gaussian--Gaussian rectangular approximation result such as Lemma~2.1 from \citet{chernozhukov2023nearly}. \begin{remark}[Comparisons with the literature] The literature on high-dimensional central limit theorems has developed rapidly in recent years \citep[see][and references therein]{% zhai2018high,% koike2021notes,% buzun2022strong,% lopes2022central,% chernozhukov2023nearly% }, particularly for the special case of sums of independent random vectors on the rectangular sets $\cR$. % Our corresponding results are weaker in their dependence on the dimension than, for example, \citet[Theorem~2.1]{chernozhukov2023nearly}. This is an inherent feature of our approach, which first considers the class of all Borel sets and only afterwards specializes to the smaller class $\cR$, where sharper results in the literature directly target the Kolmogorov--Smirnov distance via Stein's method and Slepian interpolation. \end{remark} Next, we present a version of Proposition~\ref{pro:yurinskii_app_clt} in which the covariance matrix $\Sigma$ is replaced by an estimator $\hat \Sigma$. This ensures that the associated conditionally Gaussian vector is feasible and can be resampled, allowing Monte Carlo quantile estimation via a Gaussian multiplier bootstrap. \begin{proposition}[Bootstrap central limit theorem for martingales]% \label{pro:yurinskii_app_bootstrap} Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, with $\Sigma$ non-random, and let $\hat \Sigma$ be an $\bX$-measurable random $d \times d$ positive semi-definite matrix, where $\bX = (X_1, \ldots, X_n)$.
For a class $\cA$ of measurable subsets of $\R^d$, % \begin{align*} &\sup_{A\in \cA} \left| \P\big(S \in A\big) - \P\big(\hat \Sigma^{1/2} Z \in A \bigm| \bX \big) \right| \\ &\quad\leq \inf_{p \in [1,\infty]} \inf_{\eta>0} \left\{ \Gamma_p(\eta) + 2 \Delta_p(\cA, \eta) + 2d \exp\left(\frac{-\eta^2} {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} \right) \right\}, \end{align*} % where $Z \sim \cN(0,I_d)$ is independent of $\bX$. \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_bootstrap}] Since $T = \Sigma^{1/2} Z$ is independent of $\bX$, % \begin{align*} &\left| \P\big(S \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right| \\ &\quad\leq \left| \P\big(S \in A\big) - \P\big(T \in A\big) \right| +\left| \P\big(\Sigma^{1/2} Z \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right|. \end{align*} % The first term is bounded by Proposition~\ref{pro:yurinskii_app_clt}; the second by Lemma~\ref{lem:yurinskii_app_feasible_gaussian} conditional on $\bX$. This gives % \begin{align*} &\left| \P\big(S \in A\big) - \P\left(\hat \Sigma^{1/2} Z \in A \bigm| \bX\right) \right| \\ &\quad\leq \Gamma_p(\eta) + \Delta_p(\cA, \eta) + \Delta_{p'}(\cA, \eta') + 2 d \exp \left( \frac{-\eta'^2} {2 d^{2/p'} \big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2} \right) \end{align*} % for all $A \in \cA$ and any $p, p' \in [1, \infty]$ and $\eta, \eta' > 0$. Taking a supremum over $A \in \cA$ and infima over $p = p'$ and $\eta = \eta'$ yields the result; the restriction to $p = p'$ and $\eta = \eta'$ is not necessary in general, but it suffices for the stated bound. % \end{proof} A natural choice for $\hat\Sigma$ in certain situations is the sample covariance matrix $\sum_{i=1}^n X_i X_i^\T$, or a correlation-corrected variant thereof. In general, whenever $\hat \Sigma$ does not depend on unknown quantities, one can sample from the law of $\hat T = \hat\Sigma^{1/2} Z$ conditional on $\bX$ to approximate the distribution of $S$. Proposition~\ref{pro:yurinskii_app_bootstrap} verifies that this Gaussian multiplier bootstrap approach is valid whenever $\hat\Sigma$ and $\Sigma$ are sufficiently close. To this end, Theorem~X.1.1 in \citet{bhatia1997matrix} gives $\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 \leq \big\|\hat\Sigma - \Sigma\big\|_2^{1/2}$ and Problem~X.5.5 therein gives $\big\|\hat\Sigma^{1/2} - \Sigma^{1/2}\big\|_2 \leq \big\|\Sigma^{-1/2}\big\|_2 \big\|\hat\Sigma - \Sigma\big\|_2$ when $\Sigma$ is invertible. The latter often gives a tighter bound when the minimum eigenvalue of $\Sigma$ can be bounded away from zero, and consistency of $\hat \Sigma$ can be established using a range of matrix concentration inequalities. In Section~\ref{sec:yurinskii_app_lp} we apply Proposition~\ref{pro:yurinskii_app_clt} to the special case of approximating the distribution of the $\ell^p$-norm of a high-dimensional martingale. Proposition~\ref{pro:yurinskii_app_bootstrap} is then used to ensure that feasible distributional approximations are also available. \subsection{Application: distributional approximation of martingale \texorpdfstring{$\ell^p$}{lp}-norms} \label{sec:yurinskii_app_lp} In empirical applications, including nonparametric significance tests \citep{lopes2020bootstrapping} and nearest neighbor search procedures \citep{biau2015high}, an estimator or test statistic can be expressed under the null hypothesis as the $\ell^p$-norm of a zero-mean martingale for some $p \in [1, \infty]$.
In the notation of Corollary~\ref{cor:yurinskii_sa_martingale}, it is of interest to bound Kolmogorov--Smirnov quantities of the form $\sup_{t \geq 0} \big| \P( \|S\|_p \leq t) - \P( \|T\|_p \leq t) \big|$. Let $\cB_p$ be the class of closed $\ell^p$-balls in $\R^d$ centered at the origin and set $\Delta_p(\eta) \vcentcolon= \Delta_p(\cB_p, \eta) = \sup_{t \geq 0} \P( t < \|T\|_p \leq t + \eta )$. \begin{proposition}[Distributional approximation of martingale $\ell^p$-norms] \label{pro:yurinskii_app_application_lp} Assume the setup of Corollary~\ref{cor:yurinskii_sa_martingale}, with $\Sigma$ non-random. Then for $T \sim \cN(0, \Sigma)$, % \begin{equation}% \label{eq:yurinskii_app_application_lp} \sup_{t \geq 0} \big| \P( \|S\|_p \leq t ) - \P\left( \|T\|_p \leq t \right) \big| \leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. \end{equation} % \end{proposition} \begin{proof}[Proposition~\ref{pro:yurinskii_app_application_lp}] Applying Proposition~\ref{pro:yurinskii_app_clt} with $\cA=\cB_p$ gives % \begin{align*} \sup_{t \geq 0} \big| \P( \|S\|_p \leq t ) - \P\left( \|T\|_p \leq t \right) \big| &= \sup_{A\in \cB_p} \big|\P(S\in A) -\P(T\in A)\big| \\ &\leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\cB_p, \eta) \big\} \leq \inf_{\eta>0} \big\{\Gamma_p(\eta) + \Delta_p(\eta) \big\}. \end{align*} % \end{proof} The right-hand side of \eqref{eq:yurinskii_app_application_lp} can be controlled in various ways. % In the case of $p=\infty$, note that $\ell^\infty$-balls are rectangles, so $\cB_\infty\subseteq \cR$ and \eqref{eq:yurinskii_app_rect_anticonc} applies, giving $\Delta_\infty(\eta) \leq \eta (\sqrt{2\log d} + 2) / \sigma_{\min}$ whenever $\min_j \Sigma_{j j} \geq \sigma_{\min}^2$. Alternatively, \citet[Theorem~1]{giessing2023anti} provides $\Delta_\infty(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_\infty] + \eta^2}$. By H{\"o}lder duality of $\ell^p$-norms, we can write $\|T\|_p = \sup_{\|u\|_q \leq 1} u^\T T$ where $1/p + 1/q = 1$. Applying the Gaussian process anti-concentration result of \citet[Theorem~2]{giessing2023anti} yields the more general $\Delta_p(\eta) \lesssim \eta / \sqrt{\Var[\|T\|_p] + \eta^2}$. Thus, the problem can be reduced to that of bounding $\Var\left[\|T\|_p\right]$, with techniques for doing so discussed in \citet[Section~4]{giessing2023anti}. Alongside the $\ell^p$-norms, other functionals can be analyzed in this manner, including the maximum and other order statistics \citep{kozbur2021dimension,giessing2023anti}. To conduct inference in this setting, we must feasibly approximate the quantiles of $\|T\|_p$. To that end, take a significance level $\tau\in(0,1)$ and set % $\hat q_p(\tau) = \inf \big\{t \in \R: \P(\|\hat T\|_p \leq t \mid \bX) \geq \tau \big\}$ where $\hat T \mid \bX \sim \cN(0, \hat\Sigma)$, % with $\hat\Sigma$ any $\bX$-measurable positive semi-definite estimator of $\Sigma$. Note that for the canonical estimator $\hat\Sigma = \sum_{i=1}^n X_i X_i^\T$ we can write $\hat T = \sum_{i=1}^n X_i Z_i$ with $Z_1,\dots,Z_n$ i.i.d.\ standard Gaussian independent of $\bX$, yielding the Gaussian multiplier bootstrap.
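In practice, $\hat q_p(\tau)$ can be approximated by Monte Carlo simulation. As a sketch, let $\hat T^{(1)}, \ldots, \hat T^{(B)}$ denote independent draws of $\hat T$ given $\bX$, where $B$ is a number of bootstrap replications (notation introduced here for illustration only); then % \begin{align*} \hat q_p(\tau) &\approx \inf \Bigg\{ t \in \R : \frac{1}{B} \sum_{b=1}^{B} \I\big\{ \|\hat T^{(b)}\|_p \leq t \big\} \geq \tau \Bigg\}, \end{align*} % which is the $\lceil \tau B \rceil$-th order statistic of $\|\hat T^{(1)}\|_p, \ldots, \|\hat T^{(B)}\|_p$.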
Now assuming the law of $\|\hat T\|_p \mid \bX$ has no atoms, we can apply Proposition~\ref{pro:yurinskii_app_bootstrap} to see % \begin{align*} &\sup_{\tau\in(0,1)} \big|\P\left(\|S\|_p \leq \hat q_p(\tau)\right) - \tau \big| \leq \E\left[ \sup_{t \geq 0} \big| \P(\|S\|_p \leq t) - \P(\|\hat T\|_p \leq t \mid \bX) \big| \right] \\ &\qquad\leq \inf_{\eta>0} \left\{ \Gamma_p(\eta) + 2 \Delta_p(\eta) + 2d\, \E\left[ \exp\left(\frac{-\eta^2} {2d^{2/p}\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2}\right) \right] \right\}, \end{align*} % and hence the bootstrap is valid whenever $\big\|\hat \Sigma^{1/2} - \Sigma^{1/2}\big\|_2^2$ is sufficiently small. See the preceding discussion regarding methods for bounding this object. \begin{remark}[One-dimensional distributional approximations] In our application to distributional approximation of $\ell^p$-norms, the object of interest $\|S\|_p$ is a one-dimensional functional of the high-dimensional martingale; contrast this with the more general Proposition~\ref{pro:yurinskii_app_clt}, which directly considers the $d$-dimensional random vector $S$. As such, our coupling-based approach may be improved in certain settings by applying a more carefully tailored smoothing argument. For example, \citet{belloni2018high} employ a ``log sum exponential'' bound \citep[see also][]{chernozhukov2013gaussian} for the maximum statistic $\max_{1 \leq j \leq d} S_j$ along with a coupling due to \citet{chernozhukov2014gaussian} to attain an improved dependence on the dimension. Naturally, their approach does not permit the formulation of high-dimensional central limit theorems over arbitrary classes of Borel sets as in our Proposition~\ref{pro:yurinskii_app_clt}. \end{remark} \clearpage \addcontentsline{toc}{chapter}{Bibliography} \bibliographystyle{phd_dissertation} \bibliography{refs} \end{document}

tex-fmt-0.5.2/tests/target/phd_dissertation_refs.bib

@article{aldous1981representations, author = {Aldous, David J}, journal = {Journal of Multivariate Analysis}, number = {4}, pages = {581--598}, title = {Representations for partially exchangeable arrays of random variables}, volume = {11}, year = {1981}, } @inproceedings{anastasiou2019normal, title = {Normal approximation for stochastic gradient descent via non-asymptotic rates of martingale {CLT}}, author = {Anastasiou, Andreas and Balasubramanian, Krishnakumar and Erdogdu, Murat A}, booktitle = {Conference on Learning Theory}, pages = {115--137}, year = {2019}, organization = {Proceedings of Machine Learning Research} } @article{arcones1993limit, title = {Limit theorems for {U}-processes}, author = {Arcones, Miguel A and Gin{\'e}, Evarist}, journal = {Annals of Probability}, pages = {1494--1542}, year = {1993}, } @article{arcones1995bernstein, author = {Arcones, Miguel A}, journal = {Statistics \& Probability Letters}, number = {3}, pages = {239--247}, title = {A {Bernstein}-type inequality for {U}-statistics and {U}-processes}, volume = {22}, year = {1995}, } @inproceedings{arnould2023interpolation, title = {Is interpolation benign for random forest regression?}, author = {Arnould, Ludovic and Boyer, Claire and Scornet, Erwan}, booktitle = {International Conference on Artificial Intelligence and Statistics}, pages = {5493--5548}, year = {2023}, organization = {Proceedings of Machine Learning Research}, } @article{atchade2014martingale, title = {A martingale decomposition for quadratic forms of {Markov} chains (with applications)}, author =
{Atchad{\'e}, Yves F and Cattaneo, Matias D}, journal = {Stochastic Processes and their Applications}, volume = {124}, number = {1}, pages = {646--677}, year = {2014}, } @article{baxter1994norm, title = {Norm estimates for inverses of {Toeplitz} distance matrices}, author = {Baxter, Brad J. C.}, journal = {Journal of Approximation Theory}, volume = {79}, number = {2}, pages = {222--242}, year = {1994}, } @article{belloni2015some, title = {Some new asymptotic theory for least squares series: Pointwise and uniform results}, author = {Belloni, Alexandre and Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Journal of Econometrics}, volume = {186}, number = {2}, pages = {345--366}, year = {2015}, } @article{belloni2018high, title = {A high dimensional central limit theorem for martingales, with applications to context tree models}, author = {Belloni, Alexandre and Oliveira, Roberto I}, journal = {Preprint}, note = {\arxiv{1809.02741}}, year = {2018} } @article{belloni2019conditional, author = {Belloni, Alexandre and Chernozhukov, Victor and Chetverikov, Denis and Fern{\'a}ndez-Val, Iv{\'a}n}, journal = {Journal of Econometrics}, number = {1}, pages = {4--29}, title = {Conditional quantile processes based on series or many regressors}, volume = {213}, year = {2019}, } @article{berthet2006revisiting, title = {Revisiting two strong approximation results of {Dudley} and {Philipp}}, author = {Berthet, Philippe and Mason, David M}, journal = {Lecture Notes--Monograph Series}, pages = {155--172}, volume = {51}, year = {2006}, note = {High Dimensional Probability}, } @book{bhatia1997matrix, author = {Bhatia, Rajendra}, publisher = {Springer}, address = {New York, NY}, series = {Graduate Texts in Mathematics}, title = {Matrix Analysis}, volume = {169}, year = {1997}, } @article{biau2012analysis, title = {Analysis of a random forests model}, author = {Biau, G{\'e}rard}, journal = {Journal of Machine Learning Research}, volume = {13}, pages = {1063--1095}, year = {2012}, } @incollection{biau2015high, title = {High-Dimensional $p$-Norms}, author = {Biau, G{\'e}rard and Mason, David M}, booktitle = {Mathematical Statistics and Limit Theorems}, editor = {Marc Hallin and David M Mason and Dietmar Pfeifer and Josef G. Steinebach}, pages = {21--40}, year = {2015}, publisher = {Springer} } @article{birge2001alternative, author = {Birg{\'e}, Lucien}, journal = {Lecture Notes--Monograph Series}, pages = {113--133}, title = {An alternative point of view on {Lepski}'s method}, volume = {36}, year = {2001}, note = {State of the Art in Probability and Statistics} } @book{boucheron2013concentration, title = {Concentration Inequalities: A Nonasymptotic Theory of Independence}, author = {Boucheron, St{\'e}phane and Lugosi, G{\'a}bor and Massart, Pascal}, year = {2013}, publisher = {Oxford University Press}, } @article{bradley2005basic, title = {Basic Properties of Strong Mixing Conditions. {A} Survey and Some Open Questions}, author = {Bradley, Richard C}, journal = {Probability Surveys}, volume = {2}, pages = {107--144}, year = {2005} } @article{breiman2001random, title = {Random forests}, author = {Breiman, Leo}, journal = {Machine Learning}, volume = {45}, pages = {5--32}, year = {2001}, } @misc{bureau2017daily, author = {{Bureau of Meteorology, Australian Government}}, title = {Daily Weather Observations}, year = {2017}, note = {\href{http://www.bom.gov.au/climate/data/} {\texttt{http://www.bom.gov.au/climate/data/}}.
Accessed October 2023}, } @inproceedings{buzun2022strong, title = {Strong {Gaussian} Approximation for the Sum of Random Vectors}, author = {Buzun, Nazar and Shvetsov, Nikolay and Dylov, Dmitry V}, booktitle = {Conference on Learning Theory}, volume = {178}, pages = {1693--1715}, year = {2022}, organization = {Proceedings of Machine Learning Research} } @article{calonico2018effect, author = {Calonico, Sebastian and Matias D. Cattaneo and Max H. Farrell}, journal = {Journal of the American Statistical Association}, number = {522}, pages = {767--779}, title = {On the Effect of Bias Estimation on Coverage Accuracy in Nonparametric Inference}, volume = {113}, year = {2018}, } @article{calonico2022coverage, author = {Calonico, Sebastian and Matias D. Cattaneo and Max H. Farrell}, journal = {Bernoulli}, volume = {28}, number = {4}, pages = {2998--3022}, title = {Coverage Error Optimal Confidence Intervals for Local Polynomial Regression}, year = {2022}, } @inproceedings{caruana2004ensemble, title = {Ensemble selection from libraries of models}, author = {Caruana, Rich and Niculescu-Mizil, Alexandru and Crew, Geoff and Ksikes, Alex}, booktitle = {Proceedings of the Twenty-First International Conference on Machine Learning}, pages = {18}, year = {2004} } @article{cattaneo2020large, author = {Matias D. Cattaneo and Max H. Farrell and Yingjie Feng}, title = {{Large sample properties of partitioning-based series estimators}}, volume = {48}, journal = {Annals of Statistics}, number = {3}, pages = {1718--1741}, keywords = {Nonparametric regression, robust bias correction, series methods, sieve methods, strong approximation, tuning parameter selection, uniform inference}, year = {2020}, } @article{cattaneo2022yurinskii, author = {Cattaneo, Matias Damian and Masini, Ricardo Pereira and Underwood, William George}, title = {{Yurinskii's} Coupling for Martingales}, year = {2022}, journal = {Preprint}, note = {\arxiv{2210.00362}} } @article{cattaneo2023inference, author = {Cattaneo, Matias Damian and Klusowski, Jason M and Underwood, William George}, title = {Inference with {Mondrian} Random Forests}, journal = {Preprint}, year = {2023}, note = {\arxiv{2310.09702}} } @article{cattaneo2024uniform, author = {Cattaneo, Matias Damian and Feng, Yingjie and Underwood, William George}, title = {Uniform Inference for Kernel Density Estimators with Dyadic Data}, year = {2024}, journal = {Journal of the American Statistical Association}, volume = {forthcoming}, } @article{chatterjee2006generalization, title = {A generalization of the {Lindeberg} principle}, author = {Chatterjee, Sourav}, journal = {Annals of Probability}, volume = {34}, number = {6}, pages = {2061--2076}, year = {2006} } @article{chen2020jackknife, title = {Jackknife multiplier bootstrap: finite sample approximations to the {U}-process supremum with applications}, author = {Chen, Xiaohui and Kato, Kengo}, journal = {Probability Theory and Related Fields}, volume = {176}, number = {3}, pages = {1097--1163}, year = {2020}, } @article{chernozhukov2013gaussian, title = {Gaussian approximations and multiplier bootstrap for maxima of sums of high-dimensional random vectors}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {41}, number = {6}, pages = {2786--2819}, year = {2013}, } @article{chernozhukov2013inference, title = {Inference on counterfactual distributions}, author = {Chernozhukov, Victor and Fern{\'a}ndez-Val, Iv{\'a}n and Melly, Blaise}, journal = {Econometrica}, volume = {81}, 
number = {6}, pages = {2205--2268}, year = {2013}, } @article{chernozhukov2014anti, title = {Anti-concentration and honest, adaptive confidence bands}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {42}, number = {5}, pages = {1787--1818}, year = {2014}, } @article{chernozhukov2014gaussian, title = {Gaussian approximation of suprema of empirical processes}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Annals of Statistics}, volume = {42}, number = {4}, pages = {1564--1597}, year = {2014}, } @article{chernozhukov2016empirical, title = {Empirical and multiplier bootstraps for suprema of empirical processes of increasing complexity, and related {Gaussian} couplings}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Stochastic Processes and their Applications}, volume = {126}, number = {12}, pages = {3632--3651}, year = {2016}, } @article{chernozhukov2017central, author = {Victor Chernozhukov and Denis Chetverikov and Kengo Kato}, title = {{Central limit theorems and bootstrap in high dimensions}}, volume = {45}, journal = {Annals of Probability}, number = {4}, pages = {2309--2352}, year = {2017}, } @article{chernozhukov2017detailed, title = {Detailed proof of {Nazarov}'s inequality}, author = {Chernozhukov, Victor and Chetverikov, Denis and Kato, Kengo}, journal = {Preprint}, note = {\arxiv{1711.10696}}, year = {2017} } @article{chernozhukov2023nearly, title = {Nearly optimal central limit theorem and bootstrap approximations in high dimensions}, author = {Chernozhukov, Victor and Chetverikov, Denis and Koike, Yuta}, journal = {Annals of Applied Probability}, volume = {33}, number = {3}, pages = {2374--2425}, year = {2023} } @article{chi2022asymptotic, title = {Asymptotic Properties of High-Dimensional Random Forests}, author = {Chi, Chien-Ming and Vossler, Patrick and Fan, Yingying and Lv, Jinchi}, volume = {50}, journal = {Annals of Statistics}, number = {6}, pages = {3415--3438}, year = {2022} } @article{chiang2020empirical, title = {Empirical likelihood and uniform convergence rates for dyadic kernel density estimation}, author = {Harold D. Chiang and Bing Yang Tan}, journal = {Journal of Business and Economic Statistics}, volume = {41}, number = {3}, pages = {906--914}, year = {2023}, } @article{chiang2022inference, author = {Harold D. 
Chiang and Kengo Kato and Yuya Sasaki}, journal = {Journal of the American Statistical Association}, title = {Inference for High-Dimensional Exchangeable Arrays}, volume = {118}, number = {543}, pages = {1595--1605}, year = {2023}, } @article{cuny2014martingale, title = {On martingale approximations and the quenched weak invariance principle}, author = {Cuny, Christophe and Merlev{\`e}de, Florence}, journal = {Annals of Probability}, volume = {42}, number = {2}, pages = {760--793}, year = {2014}, } @article{davezies2021exchangeable, author = {Laurent Davezies and Xavier D'Haultf{\oe}uille and Yannick Guyonvarch}, journal = {Annals of Statistics}, number = {2}, pages = {845--862}, title = {Empirical process results for exchangeable arrays}, volume = {49}, year = {2021}, } @article{dedecker2007weak, title = {On the weak invariance principle for non-adapted sequences under projective criteria}, author = {Dedecker, J{\'e}r{\^o}me and Merlev{\`e}de, Florence and Voln{\`y}, Dalibor}, journal = {Journal of Theoretical Probability}, volume = {20}, pages = {971--1004}, year = {2007}, } @article{dehling1983limit, title = {Limit theorems for sums of weakly dependent {Banach} space valued random variables}, author = {Dehling, Herold}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {63}, number = {3}, pages = {393--432}, year = {1983}, } @article{delapena1995decoupling, author = {de la Pe{\~n}a, Victor H and Montgomery-Smith, Stephen J}, journal = {Annals of Probability}, number = {2}, pages = {806--816}, title = {Decoupling inequalities for the tail probabilities of multivariate {U}-statistics}, volume = {23}, year = {1995}, } @article{dinardo1996distribution, title = {Labor Market Institutions and the Distribution of Wages, 1973--1992: A Semiparametric Approach}, author = {John DiNardo and Nicole M Fortin and Thomas Lemieux}, journal = {Econometrica}, volume = {64}, number = {5}, pages = {1001--1004}, year = {1996} } @article{dudley1983invariance, title = {Invariance principles for sums of {Banach} space valued random elements and empirical processes}, author = {Dudley, RM and Philipp, Walter}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {62}, number = {4}, pages = {509--552}, year = {1983}, } @book{dudley1999uniform, author = {Dudley, R. M.}, publisher = {Cambridge University Press}, series = {Cambridge Studies in Advanced Mathematics}, title = {Uniform Central Limit Theorems}, year = {1999}, } @article{duroux2018impact, title = {Impact of subsampling and tree depth on random forests}, author = {Duroux, Roxane and Scornet, Erwan}, journal = {ESAIM: Probability and Statistics}, volume = {22}, pages = {96--128}, year = {2018}, } @article{efron1981jackknife, title = {The jackknife estimate of variance}, author = {Efron, Bradley and Stein, Charles}, journal = {Annals of Statistics}, pages = {586--596}, year = {1981}, } @book{eggermont2009maximum, title = {Maximum Penalized Likelihood Estimation: Volume II: Regression}, author = {Eggermont, Paul P B and LaRiccia, Vincent N}, series = {Springer Series in Statistics}, year = {2009}, publisher = {Springer}, address = {New York, NY}, } @book{fan1996local, author = {Fan, J. and I.
Gijbels}, title = {Local Polynomial Modelling and Its Applications}, series = {Monographs on Statistics and Applied Probability}, volume = {66}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, year = {1996} } @book{fan2020statistical, title = {Statistical Foundations of Data Science}, series = {Data Science Series}, author = {Fan, Jianqing and Li, Runze and Zhang, Cun-Hui and Zou, Hui}, year = {2020}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, } @article{friedberg2020local, title = {Local linear forests}, author = {Friedberg, Rina and Tibshirani, Julie and Athey, Susan and Wager, Stefan}, journal = {Journal of Computational and Graphical Statistics}, volume = {30}, number = {2}, pages = {503--517}, year = {2020}, } @article{gao2021minimax, author = {Gao, Chao and Ma, Zongming}, journal = {Statistical Science}, number = {1}, pages = {16--33}, title = {Minimax rates in network analysis: Graphon estimation, community detection and hypothesis testing}, volume = {36}, year = {2021}, } @article{gao2022towards, title = {Towards convergence rate analysis of random forests for classification}, author = {Gao, Wei and Xu, Fan and Zhou, Zhi-Hua}, journal = {Artificial Intelligence}, volume = {313}, pages = {103788}, year = {2022}, } @book{geer2000empirical, title = {Empirical Processes in {M}-Estimation}, author = {Sara A van de Geer}, volume = {6}, year = {2000}, publisher = {Cambridge University Press}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, } @article{giessing2023anti, title = {Anti-concentration of Suprema of {Gaussian} Processes and {Gaussian} Order Statistics}, author = {Giessing, Alexander}, journal = {Preprint}, note = {\arxiv{2310.12119}}, year = {2023} } @incollection{gine2000exponential, author = {Gin{\'e}, Evarist and Lata{\l}a, Rafa{\l} and Zinn, Joel}, booktitle = {High Dimensional Probability II}, pages = {13--38}, publisher = {Birkh{\"a}user}, address = {Boston, MA}, title = {Exponential and moment inequalities for {U}-statistics}, year = {2000}, editor = {Evarist Gin{\'e} and David M Mason and Jon A Wellner}, } @article{gine2004kernel, author = {Gin{\'e}, Evarist and Koltchinskii, Vladimir and Sakhanenko, Lyudmila}, journal = {Probability Theory and Related Fields}, number = {2}, pages = {167--198}, title = {Kernel density estimators: convergence in distribution for weighted sup-norms}, volume = {130}, year = {2004}, } @article{gine2010confidence, author = {Gin{\'e}, Evarist and Nickl, Richard}, journal = {Annals of Statistics}, number = {2}, pages = {1122--1170}, title = {Confidence bands in density estimation}, volume = {38}, year = {2010}, } @book{gine2021mathematical, author = {Gin{\'e}, Evarist and Nickl, Richard}, publisher = {Cambridge University Press}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, title = {Mathematical Foundations of Infinite-Dimensional Statistical Models}, year = {2021}, } @incollection{graham2020network, author = {Graham, Bryan S}, booktitle = {Handbook of Econometrics}, pages = {111--218}, publisher = {Elsevier}, title = {Network data}, volume = {7}, year = {2020}, editor = {Steven N Durlauf and Lars Peter Hansen and James J. 
Heckman and Rosa L Matzkin}, } @techreport{graham2021minimax, author = {Graham, Bryan S and Niu, Fengshi and Powell, James L}, institution = {National Bureau of Economic Research}, title = {Minimax Risk and Uniform Convergence Rates for Nonparametric Dyadic Regression}, year = {2021}, } @article{graham2024kernel, title = {Kernel density estimation for undirected dyadic data}, author = {Graham, Bryan S and Niu, Fengshi and Powell, James L}, journal = {Journal of Econometrics}, volume = {240}, number = {2}, year = {2024}, } @book{hall1980martingale, title = {Martingale Limit Theory and its Application}, author = {Hall, Peter and Heyde, Christopher C}, year = {1980}, publisher = {Academic Press}, address = {New York, NY}, } @article{hall1992effect, author = {Hall, Peter}, journal = {Annals of Statistics}, volume = {20}, number = {2}, pages = {675--694}, title = {Effect of bias estimation on coverage accuracy of bootstrap confidence intervals for a probability density}, year = {1992}, } @article{hall2001bootstrapping, author = {Hall, Peter and Kang, Kee-Hoon}, journal = {Annals of Statistics}, number = {5}, pages = {1443--1468}, title = {Bootstrapping nonparametric density estimators with empirically chosen bandwidths}, volume = {29}, year = {2001}, } @incollection{head2014gravity, title = {Gravity equations: Workhorse, toolkit, and cookbook}, author = {Head, Keith and Mayer, Thierry}, booktitle = {Handbook of International Economics}, volume = {4}, pages = {131--195}, year = {2014}, publisher = {Elsevier}, editor = {Gita Gopinath and Elhanan Helpman and Kenneth Rogoff}, } @article{hoover1979relations, author = {Hoover, Douglas N}, journal = {Preprint, Institute for Advanced Study, Princeton, NJ}, title = {Relations on probability spaces and arrays of random variables}, year = {1979}, } @article{huang2003local, title = {Local asymptotics for polynomial spline regression}, author = {Huang, Jianhua Z}, journal = {Annals of Statistics}, volume = {31}, number = {5}, pages = {1600--1635}, year = {2003}, } @book{kenny2020dyadic, title = {Dyadic Data Analysis}, author = {Kenny, David A and Kashy, Deborah A and Cook, William L}, year = {2020}, series = {Methodology in the Social Sciences Series}, publisher = {Guilford Press} } @article{khasminskii1978lower, author = {Khasminskii, Rafail Z}, journal = {Theory of Probability and its Applications}, number = {4}, pages = {794--798}, title = {A lower bound on the risks of nonparametric estimates of densities in the uniform metric}, volume = {23}, year = {1978}, } @inproceedings{klusowski2021sharp, title = {Sharp analysis of a simple model for random forests}, author = {Klusowski, Jason M}, booktitle = {International Conference on Artificial Intelligence and Statistics}, pages = {757--765}, year = {2021}, organization = {Proceedings of Machine Learning Research} } @article{klusowski2024large, title = {Large scale prediction with decision trees}, author = {Klusowski, Jason M and Tian, Peter M}, journal = {Journal of the American Statistical Association}, pages = {525--537}, volume = {119}, number = {545}, year = {2024}, } @article{koike2021notes, title = {Notes on the dimension dependence in high-dimensional central limit theorems for hyperrectangles}, author = {Koike, Yuta}, journal = {Japanese Journal of Statistics and Data Science}, volume = {4}, pages = {257--297}, year = {2021}, } @book{kolaczyk2009statistical, author = {Kolaczyk, Eric D}, year = {2009}, title = {Statistical Analysis of Network Data: Methods and Models}, series = {Springer Series in
Statistics}, publisher = {Springer}, address = {New York, NY}, } @article{komlos1975approximation, author = {Koml{\'o}s, J{\'a}nos and Major, P{\'e}ter and Tusn{\'a}dy, G{\'a}bor}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, number = {1-2}, pages = {111--131}, title = {An approximation of partial sums of independent {RV}s, and the sample {DF}. {I}}, volume = {32}, year = {1975}, } @article{kozbur2021dimension, title = {Dimension-Free Anticoncentration Bounds for {Gaussian} Order Statistics with Discussion of Applications to Multiple Testing}, author = {Kozbur, Damian}, journal = {Preprint}, note = {\arxiv{2107.10766}}, year = {2021} } @article{kwapien1991hypercontraction, author = {Kwapien, Stanislaw and Szulga, Jerzy}, journal = {Annals of Probability}, number = {1}, pages = {369--379}, title = {Hypercontraction methods in moment inequalities for series of independent random variables in normed spaces}, volume = {19}, year = {1991}, } @article{lakshminarayanan2014mondrian, title = {Mondrian forests: Efficient online random forests}, author = {Lakshminarayanan, Balaji and Roy, Daniel M and Teh, Yee Whye}, journal = {Advances in Neural Information Processing Systems}, volume = {27}, year = {2014} } @inproceedings{lakshminarayanan2016mondrian, title = {Mondrian forests for large-scale regression when uncertainty matters}, author = {Lakshminarayanan, Balaji and Roy, Daniel M and Teh, Yee Whye}, booktitle = {Artificial Intelligence and Statistics}, pages = {1478--1487}, year = {2016}, organization = {Proceedings of Machine Learning Research} } @incollection{laurent2005semidefinite, author = {Monique Laurent and Franz Rendl}, booktitle = {Discrete Optimization}, pages = {393--514}, publisher = {Elsevier}, series = {Handbooks in Operations Research and Management Science}, title = {Semidefinite Programming and Integer Programming}, volume = {12}, year = {2005}, editor = {K Aardal and G L Nemhauser and R Weismantel}, } @techreport{lecam1988, author = {Le Cam, L}, title = {On the {Prokhorov} distance between the empirical process and the associated {Gaussian} bridge}, institution = {University of California, Berkeley}, year = {1988} } @book{ledoux1991probability, author = {Ledoux, Michel and Talagrand, Michel}, publisher = {Springer}, series = {Classics in Mathematics}, address = {Berlin, Heidelberg}, title = {Probability in Banach Spaces}, year = {1991}, } @book{legall2016brownian, author = {Le Gall, Jean-Fran{\c{c}}ois}, publisher = {Springer}, address = {Berlin, Heidelberg}, title = {Brownian Motion, Martingales, and Stochastic Calculus}, series = {Graduate Texts in Mathematics}, volume = {274}, year = {2016}, } @article{lepskii1992asymptotically, author = {Lepskii, O V}, journal = {Theory of Probability \& its Applications}, number = {4}, pages = {682--697}, title = {Asymptotically minimax adaptive estimation. {I}: Upper bounds. 
Optimally adaptive estimates}, volume = {36}, year = {1992}, } @article{li2020uniform, title = {Uniform nonparametric inference for time series}, journal = {Journal of Econometrics}, volume = {219}, number = {1}, pages = {38--51}, year = {2020}, author = {Jia Li and Zhipeng Liao} } @article{lopes2020bootstrapping, title = {Bootstrapping max statistics in high dimensions: Near-parametric rates under weak variance decay and application to functional and multinomial data}, author = {Lopes, Miles E and Lin, Zhenhua and M{\"u}ller, Hans-Georg}, journal = {Annals of Statistics}, volume = {48}, number = {2}, pages = {1214--1229}, year = {2020}, } @article{lopes2022central, title = {Central limit theorem and bootstrap approximation in high dimensions: Near $1/n$ rates via implicit smoothing}, author = {Lopes, Miles E}, journal = {Annals of Statistics}, volume = {50}, number = {5}, pages = {2492--2513}, year = {2022}, } @article{luke2007network, title = {Network analysis in public health: history, methods, and applications}, author = {Luke, Douglas A and Harris, Jenine K}, journal = {Annual Review of Public Health}, volume = {28}, pages = {69--93}, year = {2007}, } @inproceedings{ma2020isolation, title = {Isolation {Mondrian} forest for batch and online anomaly detection}, author = {Ma, Haoran and Ghojogh, Benyamin and Samad, Maria N and Zheng, Dongyu and Crowley, Mark}, booktitle = {2020 IEEE International Conference on Systems, Man, and Cybernetics}, pages = {3051--3058}, year = {2020}, organization = {Institute of Electrical and Electronics Engineers}, } @article{magda2018martingale, title = {Martingale approximations for random fields}, author = {Peligrad, Magda and Zhang, Na}, journal = {Electronic Communications in Probability}, volume = {23}, number = {28}, pages = {1--9}, year = {2018} } @article{matsushita2021jackknife, author = {Matsushita, Yukitoshi and Otsu, Taisuke}, journal = {Biometrika}, number = {3}, pages = {661--674}, title = {Jackknife empirical likelihood: small bandwidth, sparse network and high-dimensional asymptotics}, volume = {108}, year = {2021}, } @article{mcleish1975invariance, title = {Invariance principles for dependent variables}, author = {McLeish, Don L}, journal = {Zeitschrift f{\"u}r Wahrscheinlichkeitstheorie und verwandte Gebiete}, volume = {32}, number = {3}, pages = {165--178}, year = {1975}, } @incollection{merlevede2009bernstein, title = {Bernstein inequality and moderate deviations under strong mixing conditions}, author = {Merlev{\`e}de, Florence and Peligrad, Magda and Rio, Emmanuel}, booktitle = {High Dimensional Probability V, the Luminy volume}, pages = {273--292}, year = {2009}, publisher = {Institute of Mathematical Statistics}, editor = {Christian Houdr{\'e} and Vladimir Koltchinskii and David M Mason and Magda Peligrad}, } @article{minsker2019moment, author = {Minsker, Stanislav and Wei, Xiaohan}, journal = {Electronic Journal of Probability}, number = {133}, pages = {1--32}, title = {Moment inequalities for matrix-valued {U}-statistics of order 2}, volume = {24}, year = {2019}, } @manual{mosek, author = {{MOSEK ApS}}, title = {The {MOSEK} {Optimizer} {API} for {C} manual.
Version 9.3}, year = {2021}, } @article{mourtada2017universal, title = {Universal consistency and minimax rates for online {Mondrian} forests}, author = {Mourtada, Jaouad and Ga{\"\i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Advances in Neural Information Processing Systems}, volume = {30}, year = {2017} } @article{mourtada2020minimax, title = {Minimax optimal rates for {Mondrian} trees and forests}, author = {Mourtada, Jaouad and Ga{\"i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Annals of Statistics}, volume = {48}, number = {4}, pages = {2253--2276}, year = {2020}, } @article{mourtada2021amf, title = {{AMF}: Aggregated {Mondrian} forests for online learning}, author = {Mourtada, Jaouad and Ga{\"\i}ffas, St{\'e}phane and Scornet, Erwan}, journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, volume = {83}, number = {3}, pages = {505--533}, year = {2021}, } @incollection{nazarov2003maximal, title = {On the Maximal Perimeter of a Convex Set in $\mathbb{R}^n$ with Respect to a {Gaussian} Measure}, author = {Nazarov, Fedor}, booktitle = {Geometric Aspects of Functional Analysis}, pages = {169--187}, year = {2003}, publisher = {Springer}, editor = {Vitali D Milman and Gideon Schechtman}, } @article{oreilly2022stochastic, title = {Stochastic geometry to generalize the {Mondrian} process}, author = {O'Reilly, Eliza and Tran, Ngoc Mai}, journal = {SIAM Journal on Mathematics of Data Science}, volume = {4}, number = {2}, pages = {531--552}, year = {2022}, } @incollection{peligrad2010conditional, title = {Conditional central limit theorem via martingale approximation}, author = {Peligrad, M}, booktitle = {Dependence in Probability, Analysis and Number Theory, volume in memory of Walter Philipp}, pages = {295--311}, year = {2010}, publisher = {Kendrick Press}, editor = {Istvan Berkes and Richard C Bradley and Herold Dehling and Magda Peligrad and Robert Tichy}, } @book{pollard2002user, author = {Pollard, David}, publisher = {Cambridge University Press}, title = {A User's Guide to Measure Theoretic Probability}, series = {Cambridge Series in Statistical and Probabilistic Mathematics}, year = {2002}, } @article{rakhlin2015sequential, title = {Sequential complexities and uniform martingale laws of large numbers}, author = {Rakhlin, Alexander and Sridharan, Karthik and Tewari, Ambuj}, journal = {Probability Theory and Related Fields}, volume = {161}, number = {1}, pages = {111--153}, year = {2015}, } @article{ray2021bernstein, title = {On the {Bernstein}--von {Mises} theorem for the {Dirichlet} process}, author = {Ray, Kolyan and van der Vaart, Aad}, journal = {Electronic Journal of Statistics}, volume = {15}, number = {1}, pages = {2224--2246}, year = {2021}, } @book{rio2017asymptotic, title = {Asymptotic Theory of Weakly Dependent Random Processes}, series = {Probability Theory and Stochastic Modelling}, author = {Rio, Emmanuel}, volume = {80}, year = {2017}, publisher = {Springer}, address = {Berlin, Heidelberg}, } @inproceedings{roy2008mondrian, title = {The {Mondrian} Process.}, author = {Roy, Daniel M and Teh, Yee Whye}, booktitle = {Neural Information Processing Systems}, volume = {21}, year = {2008} } @book{royden1988real, author = {Royden, Halsey Lawrence and Fitzpatrick, Patrick}, publisher = {Macmillan}, address = {New York, NY}, title = {Real Analysis}, year = {1988}, } @article{schucany1977improvement, title = {Improvement of kernel type density estimators}, author = {Schucany, William R and Sommers, John P}, journal = {Journal of the 
American Statistical Association}, volume = {72}, number = {358}, pages = {420--423}, year = {1977}, } @article{scillitoe2021uncertainty, title = {Uncertainty quantification for data-driven turbulence modelling with {Mondrian} forests}, author = {Scillitoe, Ashley and Seshadri, Pranay and Girolami, Mark}, journal = {Journal of Computational Physics}, volume = {430}, pages = {110116}, year = {2021}, } @article{scornet2015consistency, author = {Erwan Scornet and G{\'e}rard Biau and Jean-Philippe Vert}, journal = {Annals of Statistics}, keywords = {Additive model, consistency, Dimension reduction, random forests, Randomization, Sparsity}, number = {4}, pages = {1716--1741}, title = {Consistency of random forests}, volume = {43}, year = {2015}, } @article{settati2009gaussian, title = {Gaussian approximation of the empirical process under random entropy conditions}, author = {Settati, Adel}, journal = {Stochastic Processes and their Applications}, volume = {119}, number = {5}, pages = {1541--1560}, year = {2009}, } @article{sheehy1992uniform, title = {Uniform {Donsker} classes of functions}, author = {Sheehy, Anne and Wellner, Jon A}, journal = {Annals of Probability}, volume = {20}, number = {4}, pages = {1983--2030}, year = {1992}, } @book{simonoff1996smoothing, title = {Smoothing Methods in Statistics}, author = {Simonoff, Jeffrey S}, series = {Springer Series in Statistics}, year = {1996}, publisher = {Springer Science}, address = {New York, NY}, } @article{stone1982optimal, title = {Optimal global rates of convergence for nonparametric regression}, author = {Stone, Charles J}, journal = {Annals of Statistics}, pages = {1040--1053}, year = {1982}, } @book{van1996weak, title = {Weak Convergence and Empirical Processes}, author = {van der Vaart, Aad Willem and Wellner, Jon August}, year = {1996}, series = {Springer Series in Statistics}, publisher = {Springer}, address = {New York, NY}, } @article{van2013bernstein, title = {The {Bernstein}--{Orlicz} norm and deviation inequalities}, author = {van de Geer, Sara and Lederer, Johannes}, journal = {Probability Theory and Related Fields}, volume = {157}, number = {1}, pages = {225--250}, year = {2013}, } @inproceedings{vicuna2021reducing, title = {Reducing numerical precision preserves classification accuracy in {Mondrian} Forests}, author = {Vicuna, Marc and Khannouz, Martin and Kiar, Gregory and Chatelain, Yohan and Glatard, Tristan}, booktitle = {2021 IEEE International Conference on Big Data}, pages = {2785--2790}, year = {2021}, organization = {Institute of Electrical and Electronics Engineers}, } @book{wand1994kernel, author = {Wand, Matt P and Jones, M Chris}, publisher = {Chapman \& Hall/CRC}, address = {New York, NY}, title = {Kernel Smoothing}, year = {1994}, series = {Monographs on Statistics and Applied Probability}, volume = {60}, } @article{wu2004martingale, title = {Martingale approximations for sums of stationary processes}, author = {Wu, Wei Biao and Woodroofe, Michael}, journal = {Annals of Probability}, volume = {32}, number = {2}, pages = {1674--1690}, year = {2004} } @article{yurinskii1978error, author = {Yurinskii, Vadim Vladimirovich}, journal = {Theory of Probability \& its Applications}, number = {2}, pages = {236--247}, title = {On the error of the {Gaussian} approximation for convolutions}, volume = {22}, year = {1978}, } @article{zaitsev1987estimates, title = {Estimates of the {L}{\'e}vy--{Prokhorov} distance in the multivariate central limit theorem for random variables with finite exponential moments}, author = 
{Zaitsev, A Yu}, journal = {Theory of Probability \& Its Applications}, volume = {31}, number = {2}, pages = {203--220}, year = {1987}, } @article{zaitsev1987gaussian, title = {On the {Gaussian} approximation of convolutions under multidimensional analogues of {S.\ N.\ Bernstein's} inequality conditions}, author = {Zaitsev, A Yu}, journal = {Probability Theory and Related Fields}, volume = {74}, number = {4}, pages = {535--566}, year = {1987}, } @article{zhai2018high, title = {A high-dimensional {CLT} in $\mathcal{W}_2$ distance with near optimal convergence rate}, author = {Zhai, Alex}, journal = {Probability Theory and Related Fields}, volume = {170}, number = {3}, pages = {821--845}, year = {2018}, } @article{zhao2008martingale, title = {On martingale approximations}, author = {Zhao, Ou and Woodroofe, Michael}, journal = {Annals of Applied Probability}, volume = {18}, number = {5}, pages = {1831--1847}, year = {2008} } @article{zhou2019deep, title = {Deep forest}, author = {Zhou, Zhi-Hua and Feng, Ji}, journal = {National Science Review}, volume = {6}, number = {1}, pages = {74--86}, year = {2019}, } tex-fmt-0.5.2/tests/target/puthesis.cls000066400000000000000000000070701473573253500201230ustar00rootroot00000000000000\NeedsTeXFormat{LaTeX2e} \ProvidesClass{puthesis} \RequirePackage{setspace} \RequirePackage{xcolor} \def\current@color{ Black} \newcounter{subyear} \setcounter{subyear}{\number\year} \def\submitted#1{\gdef\@submitted{#1}} \def\@submittedyear{\ifnum\month>10 \stepcounter{subyear}\thesubyear \else\thesubyear\fi} \def\@submittedmonth{\ifnum\month>10 January\else\ifnum\month>8 November \else\ifnum\month>6 September\else May\fi\fi\fi} \def\adviser#1{\gdef\@adviser{#1}} \long\def\@abstract{\@latex@error{No \noexpand\abstract given}\@ehc} \newcommand*{\frontmatter}{ %\pagenumbering{roman} } \newcommand*{\mainmatter}{ %\pagenumbering{arabic} } \newcommand*{\makelot}{} \newcommand*{\makelof}{} \newcommand*{\makelos}{} \newcommand*{\begincmd}{ \doublespacing \frontmatter\maketitlepage\makecopyrightpage\makeabstract \makeacknowledgments\makededication\tableofcontents\clearpage \makelot\clearpage\makelof\clearpage\makelos \clearpage\mainmatter} \def\@submitted{\@submittedmonth~\@submittedyear} \def\@dept{Operations Research and Financial Engineering} \def\@deptpref{Department of} \def\departmentprefix#1{\gdef\@deptpref{#1}} \def\department#1{\gdef\@dept{#1}} \long\def\acknowledgments#1{\gdef\@acknowledgments{#1}} \def\dedication#1{\gdef\@dedication{#1}} \newcommand{\maketitlepage}{{ \thispagestyle{empty} \sc \vspace*{0in} \begin{center} \LARGE \@title \end{center} \vspace{.6in} \begin{center} \@author \end{center} \vspace{.6in} \begin{center} A Dissertation \\ Presented to the Faculty \\ of Princeton University \\ in Candidacy for the Degree \\ of Doctor of Philosophy \end{center} \vspace{.3in} \begin{center} Recommended for Acceptance \\ by the \@deptpref \\ \@dept \\ Adviser: \@adviser \end{center} \vspace{.3in} \begin{center} \@submitted \end{center} \clearpage }} \newcommand*{\makecopyrightpage}{ \thispagestyle{empty} \vspace*{0in} \begin{center} \copyright\ Copyright by \@author, \number\year. \\ All rights reserved. 
\end{center} \clearpage} \newcommand*{\makeabstract}{ \newpage \addcontentsline{toc}{section}{Abstract} \begin{center} \Large \textbf{Abstract} \end{center} \@abstract \clearpage } \def\makeacknowledgments{ \ifx\@acknowledgments\undefined \else \addcontentsline{toc}{section}{Acknowledgments} \begin{center} \Large \textbf{Acknowledgments} \end{center} \@acknowledgments \clearpage \fi } \def\makededication{ \ifx\@dedication\undefined \else \vspace*{1.5in} \begin{flushright} \@dedication \end{flushright} \clearpage \fi } \DeclareOption{myorder}{ \renewcommand*{\begincmd}{\doublespacing}} \DeclareOption{lot}{\renewcommand*{\makelot}{ \addcontentsline{toc}{section}{List of Tables}\listoftables}} \DeclareOption{lof}{\renewcommand*{\makelof}{ \addcontentsline{toc}{section}{List of Figures and Tables}\listoffigures}} \DeclareOption{los}{ \renewcommand*{\makelos}{ \RequirePackage{losymbol} \section*{List of Symbols\@mkboth {LIST OF SYMBOLS}{LIST OF SYMBOLS}} \@starttoc{los} \addcontentsline{toc}{section}{List of Symbols} } } \DeclareOption*{\PassOptionsToClass{\CurrentOption}{report}} \ProcessOptions \LoadClass{report} \setlength{\oddsidemargin}{0.2in} \setlength{\evensidemargin}{0.2in} \setlength{\topmargin}{0in} \setlength{\headheight}{0in} \setlength{\headsep}{0in} \setlength{\textheight}{8.9in} \setlength{\textwidth}{6.1in} \setlength{\footskip}{0.5in} \long\def\abstract#1{\gdef\@abstract{#1}} \AtBeginDocument{\begincmd} \endinput tex-fmt-0.5.2/tests/target/quiver.sty000066400000000000000000000031321473573253500176230ustar00rootroot00000000000000% *** quiver *** % A package for drawing commutative diagrams exported from https://q.uiver.app. % % This package is currently a wrapper around the `tikz-cd` package, % importing necessary TikZ % libraries, and defining a new TikZ style for curves of a fixed height. % % Version: 1.4.2 % Authors: % - varkor (https://github.com/varkor) % - AndréC (https://tex.stackexchange.com/users/138900/andr%C3%A9c) \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{quiver}[2021/01/11 quiver] % `tikz-cd` is necessary to draw commutative diagrams. \RequirePackage{tikz-cd} % `amssymb` is necessary for `\lrcorner` and `\ulcorner`. \RequirePackage{amssymb} % `calc` is necessary to draw curved arrows. \usetikzlibrary{calc} % `pathmorphing` is necessary to draw squiggly arrows. \usetikzlibrary{decorations.pathmorphing} % A TikZ style for curved arrows of a fixed height, due to AndréC. \tikzset{curve/.style={settings={#1},to path={(\tikztostart) .. controls ($(\tikztostart)!\pv{pos}!(\tikztotarget)!\pv{height}!270:(\tikztotarget)$) % tex-fmt: skip and ($(\tikztostart)!1-\pv{pos}!(\tikztotarget)!\pv{height}!270:(\tikztotarget)$) % tex-fmt: skip .. (\tikztotarget)\tikztonodes}}, settings/.code={\tikzset{quiver/.cd,#1} \def\pv##1{\pgfkeysvalueof{/tikz/quiver/##1}}}, quiver/.cd,pos/.initial=0.35,height/.initial=0} % TikZ arrowhead/tail styles. \tikzset{tail reversed/.code={\pgfsetarrowsstart{tikzcd to}}} \tikzset{2tail/.code={\pgfsetarrowsstart{Implies[reversed]}}} \tikzset{2tail reversed/.code={\pgfsetarrowsstart{Implies}}} % TikZ arrow styles. 
\tikzset{no body/.style={/tikz/dash pattern=on 0 off 1mm}} \endinput tex-fmt-0.5.2/tests/target/readme.tex000066400000000000000000000002671473573253500175340ustar00rootroot00000000000000\documentclass{article} \begin{document} \begin{itemize} \item Lists with items over multiple lines \end{itemize} \begin{equation} E = m c^2 \end{equation} \end{document} tex-fmt-0.5.2/tests/target/sections.tex000066400000000000000000000007171473573253500201260ustar00rootroot00000000000000\documentclass{book} \begin{document} \section{Section test} Sectioning commands should be moved to their own lines. \subsection{Result} Even if there is more than one. \subsection{Result 2} Also \section*{A} unnumbered sectioning commands \subsection*{B} should be split onto their own lines, even if there \subsubsection*{C} is more than one. All of this \part{D} should also hold \part*{E} for parts \chapter{F} and chapters \chapter*{G}. \end{document} tex-fmt-0.5.2/tests/target/short_document.tex000066400000000000000000000022011473573253500213220ustar00rootroot00000000000000\documentclass{article} \usepackage{amsmath} \usepackage{amsthm} \newtheorem{theorem}{Theorem} \title{Testing \texttt{tex-fmt}} \author{William G.\ Underwood} \begin{document} \maketitle \begin{align} E = m c^2 \\ 1 + 2 + (3 + 4) + (5 + 6 + 7 + 8) + (9 + 10 + 11 + 12 + 13 + 14) \end{align} \begin{itemize} \item Item one % trailing comment with ]) brackets \item Item two on multiple lines \item Item three \begin{itemize} \item Subitem one of item two % this line has trailing spaces \item Subitem two of item two \end{itemize} \item Item four % trailing comment % with [( brackets \item \end{itemize} \begin{theorem}[Pythagoras]% \label{thm:pythagoras} For a right triangle with hypotenuse $c$ and other sides $a$ and $b$, we have % \begin{align*} a^2 + b^2 = c^2 \end{align*} % % some comments \end{theorem} This line contains \emph{emphasized} text. \emph{This line contains only emphasized text, and is broken over two lines}. \emph{This line contains only emphasized text, and is broken over three lines}. \end{document} % This file ends with trailing newlines tex-fmt-0.5.2/tests/target/tikz_network.sty000066400000000000000000001404611473573253500210510ustar00rootroot00000000000000% ============================================================================ % File : tikz-network.sty -- Library for plotting networks in TikZ % Author : Juergen Hackl % Creation : 2017-02-28 % Time-stamp: % Version : 1.0 (2018-07-30) % % Copyright (c) 2018 Juergen Hackl % % This program is free software: you can redistribute it and/or modify % it under the terms of the GNU General Public License as published by % the Free Software Foundation, either version 3 of the License, or % (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program. If not, see . 
% ============================================================================ \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{tikz-network}[2018/07/30 tikz-network v1.0] %============================================================================= % Used packages %============================================================================= \RequirePackage{etex} \RequirePackage{xifthen} \RequirePackage{xkeyval}[2005/11/25] \RequirePackage{tikz} \RequirePackage{datatool} \RequirePackage{graphicx} \usetikzlibrary{arrows} \usetikzlibrary{positioning} \usetikzlibrary{3d} \usetikzlibrary{fit} \usetikzlibrary{calc} \usetikzlibrary{backgrounds} \usetikzlibrary{arrows.meta} \usetikzlibrary{shapes.geometric} %============================================================================= %============================================================================= % Predefined variables %============================================================================= %<---------------------------------------------------------------------------> % Vertex %<---------------------------------------------------------------------------> \definecolor{vertexfill}{HTML}{abd7e6} \newcommand*{\DefaultUnit}{cm} \newcommand*{\DistanceScale}{1} \newcommand*{\VertexShape}{circle} \newcommand*{\VertexInnerSep}{2pt} \newcommand*{\VertexOuterSep}{0pt} \newcommand*{\VertexMinSize}{0.6\DefaultUnit} \newcommand*{\VertexLineWidth}{1pt} \newcommand*{\VertexLineColor}{black} \newcommand*{\VertexLineOpacity}{1} \newcommand*{\VertexTextColor}{black} \newcommand*{\VertexFillColor}{vertexfill} \newcommand*{\VertexFillOpacity}{1} \newcommand*{\VertexTextFont}{\scriptsize}%\tiny} \newcommand*{\VertexTextRotation}{0} \newcommand*{\VertexTextOpacity}{1} %<---------------------------------------------------------------------------> % Edge %<---------------------------------------------------------------------------> \newcommand*{\EdgeArrow}{-latex} \newcommand*{\EdgeLineWidth}{1.5pt} \newcommand*{\EdgeColor}{black!75} \newcommand*{\EdgeOpacity}{1} \newcommand*{\EdgeTextFillColor}{white} \newcommand*{\EdgeTextFillOpacity}{1} \newcommand*{\EdgeInnerSep}{0pt} \newcommand*{\EdgeOuterSep}{1pt} \newcommand*{\EdgeTextRotation}{0} \newcommand*{\EdgeTextOpacity}{1} \newcommand*{\EdgeTextFont}{\scriptsize} %<---------------------------------------------------------------------------> % Plane %<---------------------------------------------------------------------------> \newcommand*{\PlaneLineWidth}{1.5pt} \newcommand*{\PlaneLineColor}{black} \newcommand*{\PlaneLineOpacity}{1} \newcommand*{\PlaneGridLineWidth}{.5pt} \newcommand*{\PlaneGridColor}{black} \newcommand*{\PlaneGridOpacity}{.5} \newcommand*{\PlaneFillColor}{vertexfill} \newcommand*{\PlaneFillOpacity}{.3} \newcommand*{\PlaneWidth}{5\DefaultUnit} \newcommand*{\PlaneHeight}{5\DefaultUnit} %<---------------------------------------------------------------------------> % Text %<---------------------------------------------------------------------------> \newcommand*{\TextInnerSep}{2pt} \newcommand*{\TextOuterSep}{0pt} \newcommand*{\TextFont}{\normalsize} \newcommand*{\TextColor}{black} \newcommand*{\TextRotation}{0} \newcommand*{\TextOpacity}{1} %<---------------------------------------------------------------------------> % Network %<---------------------------------------------------------------------------> \newcommand*{\NetworkLayerDistance}{-2} \newcommand*{\xAngle}{-12} \newcommand*{\xLength}{1} \newcommand*{\yAngle}{37} \newcommand*{\yLength}{1} \newcommand*{\zAngle}{90} 
\newcommand*{\zLength}{1} \tikzset{edge canvas/.style={}} \tikzset{multilayer 2d/.style={y={(0:1cm)},x={(90:1cm)},z={(90:0cm)},every node/.append style={transform shape},}} \def\Origin{\draw [->] (0,0,0) -- (2,0,0) node [at end, right] {$y$}; \draw [->] (0,0,0) -- (0,2,0) node [at end, right] {$x$}; \draw [->] (0,0,0) -- (0,0,2) node [at end, left] {$z$};} %============================================================================= % Predefined Styles %============================================================================= %<---------------------------------------------------------------------------> % Init Default Vertex Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DVS} {Shape}{} \define@cmdkey [NW] {DVS} {MinSize}{} \define@cmdkey [NW] {DVS} {LineWidth}{} \define@cmdkey [NW] {DVS} {LineColor}{} \define@cmdkey [NW] {DVS} {LineOpacity}{} \define@cmdkey [NW] {DVS} {FillColor}{} \define@cmdkey [NW] {DVS} {FillOpacity}{} \define@cmdkey [NW] {DVS} {TextColor}{} \define@cmdkey [NW] {DVS} {TextFont}{} \define@cmdkey [NW] {DVS} {TextRotation}{} \define@cmdkey [NW] {DVS} {TextOpacity}{} \define@cmdkey [NW] {DVS} {InnerSep}{} \define@cmdkey [NW] {DVS} {OuterSep}{} \presetkeys [NW] {DVS} { Shape = \VertexShape, MinSize = \VertexMinSize, LineWidth = \VertexLineWidth, LineColor = \VertexLineColor, FillColor = \VertexFillColor, LineOpacity = \VertexLineOpacity, FillOpacity = \VertexFillOpacity, InnerSep = \VertexInnerSep, OuterSep = \VertexOuterSep, TextColor = \VertexTextColor, TextRotation = \VertexTextRotation, TextOpacity = \VertexTextOpacity, TextFont = \VertexTextFont }{} %<---------------------------------------------------------------------------> % Init Default Edge Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DES} {Arrow}{} \define@cmdkey [NW] {DES} {LineWidth}{} \define@cmdkey [NW] {DES} {Color}{} \define@cmdkey [NW] {DES} {Opacity}{} \define@cmdkey [NW] {DES} {TextFillColor}{} \define@cmdkey [NW] {DES} {TextFillOpacity}{} \define@cmdkey [NW] {DES} {TextFont}{} \define@cmdkey [NW] {DES} {TextRotation}{} \define@cmdkey [NW] {DES} {TextOpacity}{} \define@cmdkey [NW] {DES} {InnerSep}{} \define@cmdkey [NW] {DES} {OuterSep}{} \presetkeys [NW] {DES} { Arrow = \EdgeArrow, LineWidth = \EdgeLineWidth, Color = \EdgeColor, Opacity = \EdgeOpacity, TextFillColor = \EdgeTextFillColor, TextFillOpacity = \EdgeTextFillOpacity, InnerSep = \EdgeInnerSep, OuterSep = \EdgeOuterSep, TextRotation = \EdgeTextRotation, TextOpacity = \EdgeTextOpacity, TextFont = \EdgeTextFont }{} %<---------------------------------------------------------------------------> % Init Default Plane Style %<---------------------------------------------------------------------------> \define@cmdkey [NW] {DPS} {LineWidth}{} \define@cmdkey [NW] {DPS} {LineColor}{} \define@cmdkey [NW] {DPS} {LineOpacity}{} \define@cmdkey [NW] {DPS} {GridLineWidth}{} \define@cmdkey [NW] {DPS} {GridColor}{} \define@cmdkey [NW] {DPS} {GridOpacity}{} \define@cmdkey [NW] {DPS} {FillColor}{} \define@cmdkey [NW] {DPS} {FillOpacity}{} \presetkeys [NW] {DPS} { LineWidth = \PlaneLineWidth, LineColor = \PlaneLineColor, LineOpacity = \PlaneLineOpacity, GridLineWidth = \PlaneGridLineWidth, GridColor = \PlaneGridColor, GridOpacity = \PlaneGridOpacity, FillColor = \PlaneFillColor, FillOpacity = \PlaneFillOpacity }{} %<---------------------------------------------------------------------------> % Init Default Text Style 
%<---------------------------------------------------------------------------> \define@cmdkey [NW] {DTS} {InnerSep}{} \define@cmdkey [NW] {DTS} {OuterSep}{} \define@cmdkey [NW] {DTS} {TextFont}{} \define@cmdkey [NW] {DTS} {TextColor}{} \define@cmdkey [NW] {DTS} {TextRotation}{} \define@cmdkey [NW] {DTS} {TextOpacity}{} \presetkeys [NW] {DTS} { InnerSep = \TextInnerSep, OuterSep = \TextOuterSep, TextFont = \TextFont, TextColor = \TextColor, TextRotation = \TextRotation, TextOpacity = \TextOpacity }{} %<---------------------------------------------------------------------------> % Init Default Coordinates 3D %<---------------------------------------------------------------------------> \define@cmdkey [NW] {COS} {xAngle}{} \define@cmdkey [NW] {COS} {xLength}{} \define@cmdkey [NW] {COS} {yAngle}{} \define@cmdkey [NW] {COS} {yLength}{} \define@cmdkey [NW] {COS} {zAngle}{} \define@cmdkey [NW] {COS} {zLength}{} \presetkeys [NW] {COS} { xAngle = \xAngle, xLength = \xLength, yAngle = \yAngle, yLength = \yLength, zAngle = \zAngle, zLength = \zLength }{} %<---------------------------------------------------------------------------> % Default Style %<---------------------------------------------------------------------------> \newcommand*{\SetVertexStyle}[1][]{\NW@SetVertexStyleDefault[#1]}% \def\NW@SetVertexStyleDefault[#1]{% \setkeys[NW]{DVS}{#1}% \tikzset{VertexStyle/.style = { draw, shape = \cmdNW@DVS@Shape, color = \cmdNW@DVS@LineColor, fill = \cmdNW@DVS@FillColor, inner sep = \cmdNW@DVS@InnerSep, outer sep = \cmdNW@DVS@OuterSep, minimum size = \cmdNW@DVS@MinSize, line width = \cmdNW@DVS@LineWidth, font = \cmdNW@DVS@TextFont, fill opacity = \cmdNW@DVS@FillOpacity, draw opacity = \cmdNW@DVS@LineOpacity }} \tikzset{LabelStyle/.style={ \cmdNW@DVS@TextColor, font = \cmdNW@DVS@TextFont, rotate = \cmdNW@DVS@TextRotation, opacity = \cmdNW@DVS@TextOpacity, }} }% \newcommand*{\SetEdgeStyle}[1][]{\NW@SetEdgeStyleDefault[#1]}% \def\NW@SetEdgeStyleDefault[#1]{% \setkeys[NW]{DES}{#1}% \tikzset{EdgeStyle/.style = {\cmdNW@DES@Arrow, line width = \cmdNW@DES@LineWidth, color = \cmdNW@DES@Color, opacity = \cmdNW@DES@Opacity }} \tikzset{EdgeLabelStyle/.style={circle, fill = \cmdNW@DES@TextFillColor, fill opacity = \cmdNW@DES@TextFillOpacity, inner sep = \cmdNW@DES@InnerSep, outer sep = \cmdNW@DES@OuterSep, rotate = \cmdNW@DES@TextRotation, text opacity = \cmdNW@DES@TextOpacity, font = \cmdNW@DES@TextFont }} }% \newcommand*{\SetPlaneStyle}[1][]{\NW@SetPlaneStyleDefault[#1]}% \def\NW@SetPlaneStyleDefault[#1]{% \setkeys[NW]{DPS}{#1}% \tikzset{PlaneBorderStyle/.style = {draw, line width = \cmdNW@DPS@LineWidth, color = \cmdNW@DPS@LineColor, draw opacity = \cmdNW@DPS@LineOpacity }} \tikzset{PlaneFillStyle/.style = { fill = \cmdNW@DPS@FillColor, fill opacity = \cmdNW@DPS@FillOpacity }} \tikzset{PlaneGridStyle/.style = {draw, line width = \cmdNW@DPS@GridLineWidth, color = \cmdNW@DPS@GridColor, opacity = \cmdNW@DPS@GridOpacity }} }% \newcommand*{\SetTextStyle}[1][]{\NW@SetTextStyleDefault[#1]}% \def\NW@SetTextStyleDefault[#1]{% \setkeys[NW]{DTS}{#1}% \tikzset{TextStyle/.style = { inner sep = \cmdNW@DTS@InnerSep, outer sep = \cmdNW@DTS@OuterSep, color = \cmdNW@DTS@TextColor, rotate = \cmdNW@DTS@TextRotation, text opacity = \cmdNW@DTS@TextOpacity, font = \cmdNW@DTS@TextFont }} }% \tikzset{ multilayer/.code={% \ifthenelse{\equal{#1}{3d}}{ \tikzset{edge canvas/.style={canvas is yx plane at z=0}} \tikzset{multilayer 3d} }{ \tikzset{edge canvas/.style={}} \tikzset{multilayer 2d} } }, } 
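%<--------------------------------------------------------------------------->
% Usage sketch (comment only, not executed). The `multilayer' key defined
% above switches the canvas between the 2d and 3d coordinate styles; the
% vertex ids, coordinates and layers below are purely illustrative, using the
% \Vertex and \Edge commands defined later in this file.
%
% \begin{tikzpicture}[multilayer=3d]
%   \Vertex[x=0,y=0,layer=1]{A}
%   \Vertex[x=2,y=1,layer=2]{B}
%   \Edge(A)(B)
% \end{tikzpicture}
%<--------------------------------------------------------------------------->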
\newcommand*{\SetCoordinates}[1][]{\NW@SetCoordinates[#1]}% \def\NW@SetCoordinates[#1]{% \setkeys[NW]{COS}{#1}% \tikzset{multilayer 3d/.style={ y={(\cmdNW@COS@xAngle:\cmdNW@COS@xLength \DefaultUnit)}, x={(\cmdNW@COS@yAngle:\cmdNW@COS@yLength \DefaultUnit)}, z={(\cmdNW@COS@zAngle:\cmdNW@COS@zLength \DefaultUnit)}, every node/.append style={transform shape}, }} %\tikzset{edge canvas/.style={canvas is yx plane at z=0}} }% %<---------------------------------------------------------------------------> % Apply default settings %<---------------------------------------------------------------------------> \SetCoordinates \SetVertexStyle \SetEdgeStyle \SetPlaneStyle \SetTextStyle %<---------------------------------------------------------------------------> % Redefine settings %<---------------------------------------------------------------------------> \newcommand*{\SetLayerDistance}[1]{\renewcommand{\NetworkLayerDistance}{#1}} \newcommand*{\SetDefaultUnit}[1]{\renewcommand{\DefaultUnit}{#1}} \newcommand*{\SetDistanceScale}[1]{\renewcommand{\DistanceScale}{#1}} \newcommand*{\SetPlaneWidth}[1]{\renewcommand{\PlaneWidth}{#1}} \newcommand*{\SetPlaneHeight}[1]{\renewcommand{\PlaneHeight}{#1}} \newcommand*{\EdgesInBG}{\presetkeys [NW] {edge} {NotInBG = false}{}} \newcommand*{\EdgesNotInBG}{\presetkeys [NW] {edge} {NotInBG = true}{}} %============================================================================= % Vertex and Edge creation %============================================================================= %<---------------------------------------------------------------------------> % Init Vertex %<---------------------------------------------------------------------------> \define@cmdkey [NW] {vertex} {x}{} \define@cmdkey [NW] {vertex} {y}{} \define@cmdkey [NW] {vertex} {label}{} \define@cmdkey [NW] {vertex} {size}{} \define@cmdkey [NW] {vertex} {color}{} \define@cmdkey [NW] {vertex} {opacity}{} \define@cmdkey [NW] {vertex} {style}{} \define@cmdkey [NW] {vertex} {layer}{} \define@cmdkey [NW] {vertex} {shape}{} \define@cmdkey [NW] {vertex} {fontsize}{} \define@cmdkey [NW] {vertex} {fontcolor}{} \define@cmdkey [NW] {vertex} {fontscale}{} \define@boolkey [NW] {vertex} {RGB}[true]{} \define@boolkey [NW] {vertex} {IdAsLabel}[true]{} \define@boolkey [NW] {vertex} {NoLabel}[true]{} \define@boolkey [NW] {vertex} {Math}[true]{} \define@boolkey [NW] {vertex} {Network}[true]{} \define@boolkey [NW] {vertex} {Pseudo}[true]{} \define@cmdkey [NW] {vertex} {distance}{} \define@cmdkey [NW] {vertex} {position}{} \presetkeys [NW] {vertex} {Network = false,}{} %<---------------------------------------------------------------------------> % Vertex %<---------------------------------------------------------------------------> \newcommand*{\Vertex}[1][]{\@vertex[#1]}% \def\@vertex[#1]#2{% \setkeys[NW]{vertex}{#1}% % Check if Vertex is used in a network, if so no default settings are % necessary, otherwise default settings are applied. 
\ifNW@vertex@Network \cmdNW@vertex@opacity \else \setkeys[NW]{vertex}{ x = {0}, y = {0}, label = {}, size = {}, color = {}, opacity = {}, layer = {}, shape = {}, style = {}, fontsize = {}, fontcolor = {}, fontscale = {}, NoLabel = false, IdAsLabel = false, Math = false, RGB = false, Pseudo = false, distance = {0}, position = {center}, } \setkeys[NW]{vertex}{#1}% \fi \@@vertex{#2}% } \def\@@vertex#1{% \def\vstyle{VertexStyle} \begin{scope} % [ % scale=1,yshift=0,every node/.append % style={yslant=0.5,xslant=-1},yslant=0.5,xslant=-1 % ] % If option NoLabel is true, no labels are printed in the network \ifNW@vertex@NoLabel \def\vertex@L{}% \def\vertex@Name{}% \else % if IdAsLabel is true, the label of the vertex is equal to the vertex id \ifNW@vertex@IdAsLabel \def\vertex@Name{#1} \def\vertex@L{\vertex@Name} % Otherwise the label is equal to the label if it is non empty \else \ifthenelse{\not\equal{\cmdNW@vertex@label}{}}{ \def\vertex@L{\cmdNW@vertex@label} \def\vertex@Name{#1} }{ \def\vertex@Name{#1} \def\vertex@L{} } \fi \fi % Check if Math is true, if so the label will be in math mode \ifNW@vertex@Math \def\vertex@Label{$\vertex@L$}% \else \def\vertex@Label{\vertex@L}% \fi % Check if the size of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@size}{}}{ \tikzset{LocalVertexSize/.style={minimum size = \cmdNW@vertex@size \DefaultUnit}} }{ \tikzset{LocalVertexSize/.style={}} } % Check if the font size of the vertex label is redefined, if so % the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@fontsize}{}}{ \tikzset{LocalVertexFontSize/.style={font = \cmdNW@vertex@fontsize}} }{ \tikzset{LocalVertexFontSize/.style={}} } % Check if the font scale of the vertex label is redefined, if so % the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@fontscale}{}}{ \tikzset{LocalVertexFontScale/.style={scale = \cmdNW@vertex@fontscale}} }{ \tikzset{LocalVertexFontScale/.style={}} } % Check if the opacity of the vertex is redefined, if so the new % style is used \ifthenelse{\not\equal{\cmdNW@vertex@opacity}{}}{ \tikzset{LocalVertexOpacity/.style={fill opacity = \cmdNW@vertex@opacity}} }{ \tikzset{LocalVertexOpacity/.style={}} } % Check if the shape of the vertex is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@vertex@shape}{}}{ \tikzset{LocalVertexShape/.style={shape = \cmdNW@vertex@shape}} }{ \tikzset{LocalVertexShape/.style={}} } % Check if the color of the vertex is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the vertex entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. 
blue!50!green) \ifNW@vertex@RGB \ifthenelse{\not\equal{\cmdNW@vertex@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@vertex@color} \tikzset{LocalVertexFill/.style={fill = LocalColor}} }{ \tikzset{LocalVertexFill/.style={}} } \ifthenelse{\not\equal{\cmdNW@vertex@fontcolor}{}}{ \pgfutil@definecolor{LocalFontColor}{RGB}{\cmdNW@vertex@fontcolor} \tikzset{LocalVertexFontColor/.style={color = LocalFontColor}} }{ \tikzset{LocalVertexFontColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@vertex@color}{}}{ \tikzset{LocalVertexFill/.style={fill = \cmdNW@vertex@color}} }{ \tikzset{LocalVertexFill/.style={}} } \ifthenelse{\not\equal{\cmdNW@vertex@fontcolor}{}}{ \tikzset{LocalVertexFontColor/.style={color = \cmdNW@vertex@fontcolor}} }{ \tikzset{LocalVertexFontColor/.style={}} } \fi % Define empty style for the vertex \ifNW@vertex@Pseudo \tikzset{LocalPseudo/.style={opacity = 0}} \else \tikzset{LocalPseudo/.style={}} \fi % Define local style for the label \tikzset{LocalLabel/.style={label = {[LabelStyle, LocalVertexFontColor, LocalVertexFontSize, LocalVertexFontScale, label distance=\cmdNW@vertex@distance]\cmdNW@vertex@position:\vertex@Label}}} \ifthenelse{\equal{\cmdNW@vertex@layer}{}}{ \protected@edef\@tempa{% \noexpand\node[\vstyle,LocalVertexSize,LocalVertexOpacity, LocalVertexFill,LocalVertexShape,LocalLabel, \cmdNW@vertex@style,LocalPseudo](#1)% at (\cmdNW@vertex@x*\DistanceScale\DefaultUnit, \cmdNW@vertex@y*\DistanceScale\DefaultUnit){}}% \@tempa; }{ \begin{scope}[canvas is yx plane at z=(\cmdNW@vertex@layer-1)*\NetworkLayerDistance] \protected@edef\@tempa{% \noexpand\node[\vstyle,LocalVertexSize,LocalVertexOpacity, LocalVertexFill,LocalVertexShape,LocalLabel, \cmdNW@vertex@style,LocalPseudo](#1)% at (\cmdNW@vertex@x*\DistanceScale\DefaultUnit, \cmdNW@vertex@y*\DistanceScale\DefaultUnit){}}% \@tempa; \end{scope} } \end{scope} } %<---------------------------------------------------------------------------> % Init Edge %<---------------------------------------------------------------------------> \define@cmdkey [NW] {edge} {label}{} \define@cmdkey [NW] {edge} {lw}{} \define@cmdkey [NW] {edge} {color}{} \define@cmdkey [NW] {edge} {opacity}{} \define@cmdkey [NW] {edge} {style}{} \define@cmdkey [NW] {edge} {fontcolor}{} \define@cmdkey [NW] {edge} {fontsize}{} \define@cmdkey [NW] {edge} {fontscale}{} \define@boolkey [NW] {edge} {RGB}[true]{} \define@boolkey [NW] {edge} {Math}[true]{} \define@boolkey [NW] {edge} {Direct}[true]{} \define@boolkey [NW] {edge} {Network}[true]{} \define@cmdkey [NW] {edge} {bend}{} \define@cmdkey [NW] {edge} {position}{} \define@cmdkey [NW] {edge} {distance}{} \define@cmdkey [NW] {edge} {loopsize}{} \define@cmdkey [NW] {edge} {loopposition}{} \define@cmdkey [NW] {edge} {loopshape}{} \define@boolkey [NW] {edge} {NotInBG}[true]{} \define@cmdkey [NW] {edge} {path}{} \presetkeys [NW] {edge} {Network = false,}{} % NotInBG = false,}{} %<---------------------------------------------------------------------------> % Edge %<---------------------------------------------------------------------------> \newcommand*{\Edge}[1][]{\@edge[#1]}% \def\@edge[#1](#2)(#3){% \setkeys[NW]{edge}{#1}% % Check if Vertex is used in a network, if so no default settings are % necessary, otherwise default settings are applied. 
\ifNW@edge@Network \else \setkeys[NW]{edge}{ label = {}, lw = {}, path = {}, color = {}, opacity = {}, style = {}, fontcolor = {}, fontsize = {}, fontscale = {}, RGB = false, Math = false, Direct = false, NotInBG = false, bend = {0}, loopsize = {1\DefaultUnit}, position = {}, loopposition= {0}, loopshape = {90}, distance = {.5} } \setkeys[NW]{edge}{#1}% \fi \def\estyle{EdgeStyle} % \ifNW@edge@NotInBG \tikzset{EdgeInBG/.style={}} \else \tikzset{EdgeInBG/.style={on background layer}} \fi \begin{scope}[edge canvas,EdgeInBG] % [ % scale=1,yshift=0,every node/.append % style={yslant=0.5,xslant=-1},yslant=0.5,xslant=-1 % ] % Check if Direct is true, if so use default arrow style \ifNW@edge@Direct \tikzset{LocalArrow/.style={}} \else \tikzset{LocalArrow/.style={-}} \fi % Check if the line width of the edge is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@lw}{}}{ \tikzset{LocalEdgeLW/.style={line width = \cmdNW@edge@lw}} }{ \tikzset{LocalEdgeLW/.style={}} } % Check if the opacity of the edge is redefined, if so the new % style is used \ifthenelse{\not\equal{\cmdNW@edge@opacity}{}}{ \tikzset{LocalEdgeOpacity/.style={opacity = \cmdNW@edge@opacity}} \tikzset{LocalTextOpacity/.style={text opacity = \cmdNW@edge@opacity}} }{ \tikzset{LocalEdgeOpacity/.style={}} \tikzset{LocalTextOpacity/.style={}} } % Check if the font size of the edge is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@fontsize}{}}{ \tikzset{LocalEdgeFontSize/.style={font = \cmdNW@edge@fontsize}} }{ \tikzset{LocalEdgeFontSize/.style={}} } % Check if the font scale of the edge is redefined, if so the new style is % used \ifthenelse{\not\equal{\cmdNW@edge@fontscale}{}}{ \tikzset{LocalEdgeFontScale/.style={scale = \cmdNW@edge@fontscale}} }{ \tikzset{LocalEdgeFontScale/.style={}} } % Check if the color of the edge is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the edge entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g.
blue!50!green) \ifNW@edge@RGB \ifthenelse{\not\equal{\cmdNW@edge@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@edge@color} \tikzset{LocalEdgeColor/.style={color = LocalColor}} }{ \tikzset{LocalEdgeColor/.style={}} } \ifthenelse{\not\equal{\cmdNW@edge@fontcolor}{}}{ \pgfutil@definecolor{LocalFontColor}{RGB}{\cmdNW@edge@fontcolor} \tikzset{LocalEdgeFontColor/.style={text = LocalFontColor}} }{ \tikzset{LocalEdgeFontColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@edge@color}{}}{ \tikzset{LocalEdgeColor/.style={color = \cmdNW@edge@color}} }{ \tikzset{LocalEdgeColor/.style={}} } \ifthenelse{\not\equal{\cmdNW@edge@fontcolor}{}}{ \tikzset{LocalEdgeFontColor/.style={text = \cmdNW@edge@fontcolor}} }{ \tikzset{LocalEdgeFontColor/.style={}} } \fi % Check if Math is true, if so the label will be in math mode \ifNW@edge@Math \def\edge@L{$\cmdNW@edge@label$}% \else \def\edge@L{\cmdNW@edge@label}% \fi % Check if a label is assigned, if so create a label variable \ifthenelse{\not\equal{\cmdNW@edge@label}{}}{ \def\edge@Label{node[EdgeLabelStyle,LocalTextOpacity,LocalEdgeFontColor, LocalEdgeFontSize,LocalEdgeFontScale,pos=\cmdNW@edge@distance, \cmdNW@edge@position]{\edge@L}} }{ \def\edge@Label{} } % Check if it is a self loop or a normal edge % Normal edge \ifthenelse{\not\equal{#2}{#3}}{ \ifthenelse{\not\equal{\cmdNW@edge@path}{}}{ \def\edge@pts{}% \@for\tmp:=\cmdNW@edge@path\do{ \edef\edge@pts{\edge@pts (\tmp) --} } \protected@edef\@tempa{% \noexpand\draw[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) -- \edge@pts (#3)} \@tempa; }{ \protected@edef\@tempa{% \noexpand\path[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) edge [bend left = \cmdNW@edge@bend] \edge@Label (#3)}% \@tempa; } }{% Self loop \protected@edef\@tempa{% \noexpand\path[\estyle,LocalEdgeLW,LocalEdgeOpacity,LocalEdgeColor, LocalArrow,\cmdNW@edge@style] (#2) edge [in=-\cmdNW@edge@loopshape/2+\cmdNW@edge@loopposition, out=\cmdNW@edge@loopshape/2+\cmdNW@edge@loopposition,loop, distance=\cmdNW@edge@loopsize,] \edge@Label (#3)}% \@tempa; } \end{scope} } %============================================================================= % Vertices and Edges creation %============================================================================= %<---------------------------------------------------------------------------> % Init Vertices %<---------------------------------------------------------------------------> \define@cmdkey [NW] {vertices} {layer}{} \define@cmdkey [NW] {vertices} {size}{} \define@cmdkey [NW] {vertices} {color}{} \define@cmdkey [NW] {vertices} {opacity}{} \define@cmdkey [NW] {vertices} {style}{} \define@cmdkey [NW] {vertices} {shape}{} \define@boolkey [NW] {vertices} {RGB}[true]{} \define@boolkey [NW] {vertices} {IdAsLabel}[true]{} \define@boolkey [NW] {vertices} {NoLabel}[true]{} \define@boolkey [NW] {vertices} {Math}[true]{} \define@boolkey [NW] {vertices} {Pseudo}[true]{} \presetkeys [NW] {vertices} { layer = {}, opacity = {}, size = {}, color = {}, style = {}, shape = {}, RGB = false, IdAsLabel = false, NoLabel = false, Math = false, Pseudo = false, }{} \newcommand*{\setkeysexpanded}[2]{% \expandafter\setkeysexpandedaux\expandafter{#2}{#1}} \newcommand*{\setkeysexpandedaux}[2]{% \setkeys[NW]{#2}{#1}} % \newcommand*{\setkeysexpandedx}[2]{% % \expandafter\setkeysexpandedauxx\expandafter{#2}{#1}} % \newcommand*{\setkeysexpandedauxx}[2]{% % \setkeys[NW]{#2}{#1}} 
%<---------------------------------------------------------------------------> % Vertices %<---------------------------------------------------------------------------> \newcommand*{\Vertices}[1][]{\@vertices[#1]}% \def\@vertices[#1]#2{% \setkeys[NW]{vertices}{#1}% \@@vertices{#2}% } \def\@@vertices#1{% % Check if data base already exist \DTLifdbexists{#1}{}{ % create dummy data base to store name \DTLnewdb{#1} % delete existing vertices data base \DTLifdbexists{vertices}{ \DTLgdeletedb{vertices} }{} % Load data file for vertices \DTLloaddb[noheader=false]{vertices}{#1} } % Define variables to store option values \def\vertex@Options{}% \def\vertex@id{}% \def\vertex@rgbValues{}% % Go through each row and create vertices \DTLforeach*{vertices}{}{% % reset storage variable to default values \edef\vertex@Options{x=0,y=0,label={},size={},color={},fontcolor={}, fontsize={},fontscale={}, opacity={},layer={},style={},NoLabel=false,IdAsLabel=false, Math=false,RGB=false,Pseudo=false,distance={0},position={center},shape={},}% \edef\vertex@rgbValues{}% % Go through each row element \DTLforeachkeyinrow{\thisValue}{ \DTLifeq{\dtlkey}{id}{ % Assign vertex id to storage variable \edef\vertex@id{\thisValue}% }{ \DTLifeq{\dtlkey}{R}{ \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,} }{ \DTLifeq{\dtlkey}{G}{ \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,} }{ \DTLifeq{\dtlkey}{B}{ \edef\vertex@rgbValues{\vertex@rgbValues \thisValue,} }{ % Assign option to storage variable \edef\vertex@Options{\vertex@Options \dtlkey=\thisValue,} }}}} } % Add general settings for the Vertex % NoLabel \ifNW@vertices@NoLabel \edef\vertex@Options{\vertex@Options NoLabel=true,} \fi % Pseudo \ifNW@vertices@Pseudo \edef\vertex@Options{\vertex@Options Pseudo=true,} \fi % IdAsLabel \ifNW@vertices@IdAsLabel \edef\vertex@Options{\vertex@Options IdAsLabel=true,} \fi % Math \ifNW@vertices@Math \edef\vertex@Options{\vertex@Options Math=true,} \fi % RGB \ifNW@vertices@RGB \edef\vertex@Options{\vertex@Options RGB=true,color={\vertex@rgbValues},} \fi % opacity \ifthenelse{\not\equal{\cmdNW@vertices@opacity}{}} { \edef\vertex@Options{\vertex@Options opacity=\cmdNW@vertices@opacity,} }{} % size \ifthenelse{\not\equal{\cmdNW@vertices@size}{}} { \edef\vertex@Options{\vertex@Options size=\cmdNW@vertices@size,} }{} % shape \ifthenelse{\not\equal{\cmdNW@vertices@shape}{}} { \edef\vertex@Options{\vertex@Options shape=\cmdNW@vertices@shape,} }{} % color \ifthenelse{\not\equal{\cmdNW@vertices@color}{}} { \edef\vertex@Options{\vertex@Options color=\cmdNW@vertices@color,} }{} \ifthenelse{\not\equal{\cmdNW@vertices@style}{}}{ \edef\vertex@Options{\vertex@Options style={\cmdNW@vertices@style},} }{} % Apply settings for the Vertex \setkeysexpanded{vertex}{\vertex@Options}% \ifthenelse{\not\equal{\cmdNW@vertices@layer}{}}{ \ifthenelse{\equal{\cmdNW@vertices@layer}{\cmdNW@vertex@layer}}{ \Vertex[Network]{\vertex@id} }{} }{ \Vertex[Network]{\vertex@id} } % Create Vertex } % Delete data base % \DTLgdeletedb{#1} } %<---------------------------------------------------------------------------> % Init Edges %<---------------------------------------------------------------------------> \def\myvariable{\KeySettingCommand{false}} \define@cmdkey [NW] {edges} {layer}{} \define@cmdkey [NW] {edges} {vertices}{} \define@cmdkey [NW] {edges} {style}{} \define@cmdkey [NW] {edges} {lw}{} \define@cmdkey [NW] {edges} {color}{} \define@cmdkey [NW] {edges} {opacity}{} \define@boolkey [NW] {edges} {RGB}[true]{} \define@boolkey [NW] {edges} {Math}[true]{} \define@boolkey 
[NW] {edges} {Direct}[true]{} \define@boolkey [NW] {edges} {NoLabel}[true]{} \define@boolkey [NW] {edges} {NotInBG}[true]{} \presetkeys [NW] {edges} { layer = {}, vertices = {}, style = {}, lw = {}, color = {}, opacity = {}, RGB = false, Math = false, Direct = false, NoLabel = false, NotInBG = false, }{} \newcommand{\shortcut}[1]{% \@tempswafalse \@for\next:=#1\do {\if@tempswa+\else\@tempswatrue\fi\textbf{\next}}% } \newcounter{LayerCounter} \newcommand\myfunc[1]{\setcounter{LayerCounter}{0}\@for\tmp:=#1\do{ \stepcounter{LayerCounter} \arabic{LayerCounter}-a-\textbf{\tmp}} } %<---------------------------------------------------------------------------> % Edges %<---------------------------------------------------------------------------> \newcommand*{\Edges}[1][]{\@edges[#1]}% \def\@edges[#1]#2{% \setkeys[NW]{edges}{#1}% \@@edges{#2}% } \def\@@edges#1{% \begin{scope} % Check if data base already exist \DTLifdbexists{#1}{}{ % create dummy data base to store name \DTLnewdb{#1} % delete existing vertices data base \DTLifdbexists{edges}{ \DTLgdeletedb{edges} }{} % Load data file for vertices \DTLloaddb[noheader=false]{edges}{#1} } % % Load data file for vertices % \DTLloaddb[noheader=false]{#1}{#1} % Define variables to store option values \def\edge@Options{}% \def\edge@u{}% \def\edge@v{}% \def\edge@u@layer{}% \def\edge@v@layer{}% \def\edge@rgbValues{}% \def\u@layer{}% \def\v@layer{}% % % Assign where the edges are drawn from to \ifthenelse{\not\equal{\cmdNW@edges@layer}{}}{ % set layer count back to 0 \setcounter{LayerCounter}{0} \@for\tmp:=\cmdNW@edges@layer\do{ \stepcounter{LayerCounter} \ifthenelse{\value{LayerCounter}=1}{ \edef\u@layer{\tmp}% }{ \edef\v@layer{\tmp}% } } }{} % Go through each row and create edges \DTLforeach*{edges}{}{% % reset storage variable to default values \edef\edge@Options{label = {}, lw = {}, color = {}, opacity = {}, style = {}, RGB = false, Math = false, Direct = false, NotInBG = false, bend = {0}, loopsize = {1\DefaultUnit}, position = {}, loopposition = {0}, loopshape = {90}, distance = {.5}, path = {}, fontcolor = {}, fontsize = {}, fontscale ={},} \edef\edge@rgbValues{}% % Go through each row element \DTLforeachkeyinrow{\thisValue}{ \DTLifeq{\dtlkey}{u}{ % Assign edge id to storage variable \edef\edge@u{\thisValue}% }{ \DTLifeq{\dtlkey}{v}{ \edef\edge@v{\thisValue}% }{ \DTLifeq{\dtlkey}{R}{ \edef\edge@rgbValues{\edge@rgbValues \thisValue,} }{ \DTLifeq{\dtlkey}{G}{ \edef\edge@rgbValues{\edge@rgbValues \thisValue,} }{ \DTLifeq{\dtlkey}{B}{ \edef\edge@rgbValues{\edge@rgbValues \thisValue,} }{ % Assign option to storage variable \edef\edge@Options{\edge@Options \dtlkey=\thisValue,} }}}}} } % Add general settings for the Edges % NoLabel \ifNW@edges@NoLabel \edef\edge@Options{\edge@Options label={},} \fi % Direct \ifNW@edges@Direct \edef\edge@Options{\edge@Options Direct=true,} \fi % Math \ifNW@edges@Math \edef\edge@Options{\edge@Options Math=true,} \fi % RGB \ifNW@edges@RGB \edef\edge@Options{\edge@Options RGB=true,color={\edge@rgbValues},} \fi \ifthenelse{\not\equal{\cmdNW@edges@style}{}}{ \edef\edge@Options{\edge@Options style={\cmdNW@edges@style},} }{} % lw \ifthenelse{\not\equal{\cmdNW@edges@lw}{}} { \edef\edge@Options{\edge@Options lw=\cmdNW@edges@lw,} }{} % color \ifthenelse{\not\equal{\cmdNW@edges@color}{}} { \edef\edge@Options{\edge@Options color=\cmdNW@edges@color,} }{} % opacity \ifthenelse{\not\equal{\cmdNW@edges@opacity}{}} { \edef\edge@Options{\edge@Options opacity=\cmdNW@edges@opacity,} }{} % NoLabel \ifNW@edges@NotInBG 
\edef\edge@Options{\edge@Options NotInBG=true,} \fi % Apply settings for the Edge \setkeysexpanded{edge}{\edge@Options}% % Create Edge \ifthenelse{\equal{\cmdNW@edges@layer}{}}{ \Edge[Network](\edge@u)(\edge@v) }{ \ifthenelse{\not\equal{\cmdNW@edges@vertices}{}}{ \DTLifdbexists{vertices}{ \DTLgdeletedb{vertices} }{} % Load data file for vertices \DTLloaddb[noheader=false]{vertices}{\cmdNW@edges@vertices} }{} % find assigned layer to the used vertices \DTLforeach*{vertices}{\id=id,\layer=layer}{% \ifthenelse{\equal{\id}{\edge@u}}{ \edef\edge@u@layer{\layer}% \dtlbreak }{} } \DTLforeach*{vertices}{\id=id,\layer=layer}{% \ifthenelse{\equal{\id}{\edge@v}}{ \edef\edge@v@layer{\layer}% \dtlbreak }{} } % if the edge is an intra layer edge \ifthenelse{\equal{\u@layer}{\v@layer}}{ \ifthenelse{\equal{\u@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\v@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} }{ \ifthenelse{\equal{\u@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\v@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} \ifthenelse{\equal{\v@layer}{\edge@u@layer}}{ \ifthenelse{\equal{\u@layer}{\edge@v@layer}}{ \Edge[Network](\edge@u)(\edge@v) }{} }{} } } } \end{scope} % Delete data base % \DTLgdeletedb{#1} } %<---------------------------------------------------------------------------> % Init Layer %<---------------------------------------------------------------------------> \define@cmdkey [NW] {layer} {layer}{} \define@cmdkey [NW] {layer} {z}{} \define@cmdkey [NW] {layer} {opacity}{} \presetkeys [NW] {layer} { layer = {1}, opacity = {}, z = {}, }{} %<---------------------------------------------------------------------------> % Layer %<---------------------------------------------------------------------------> %\def\@layer{canvas is yx plane at z=-3,} \def\@layer[#1]#2{ \setkeys[NW]{layer}{#1} \ifthenelse{\not\equal{\cmdNW@layer@z}{}}{ \tikzset{LocalLayerZ/.style={canvas is yx plane at z=\cmdNW@layer@z}} }{ \tikzset{LocalLayerZ/.style={canvas is yx plane at z=(\cmdNW@layer@layer-1)*\NetworkLayerDistance}} } \ifthenelse{\not\equal{\cmdNW@layer@opacity}{}}{ \tikzset{LocalLayerOpacity/.style={fill opacity = \cmdNW@layer@opacity}} }{ \tikzset{LocalLayerOpacity/.style={}} } \begin{scope}[LocalLayerZ,LocalLayerOpacity] } %\newcommand*{\Layer}[1][]{\@layer[#1]}% \newenvironment{Layer}[1][]{\@layer[#1]1}{ \end{scope} } %\def\@layer[#1]#2{} % \newcommand*{\Edges}[1][]{\@edges[#1]}% % \def\@edges[#1]#2{% % \setkeys[NW]{edges}{#1}% % \@@edges{#2}% % } % \def\@@edges#1{% %<---------------------------------------------------------------------------> % Init Plane %<---------------------------------------------------------------------------> \define@cmdkey [NW] {plane} {x}{} \define@cmdkey [NW] {plane} {y}{} \define@cmdkey [NW] {plane} {width}{} \define@cmdkey [NW] {plane} {height}{} \define@cmdkey [NW] {plane} {color}{} \define@cmdkey [NW] {plane} {opacity}{} \define@cmdkey [NW] {plane} {style}{} \define@cmdkey [NW] {plane} {layer}{} \define@cmdkey [NW] {plane} {grid}{} \define@cmdkey [NW] {plane} {image}{} \define@boolkey [NW] {plane} {RGB}[true]{} \define@boolkey [NW] {plane} {InBG}[true]{} \define@boolkey [NW] {plane} {NoFill}[true]{} \define@boolkey [NW] {plane} {NoBorder}[true]{} \define@boolkey [NW] {plane} {ImageAndFill}[true]{} \presetkeys [NW] {plane} { x = {0}, y = {0}, width = {\PlaneWidth}, height = {\PlaneHeight}, color = {}, opacity = {}, style = {}, layer = {1}, grid = {}, image = {}, RGB = false, InBG = false, NoFill = false, NoBorder= false, ImageAndFill= false, }{} 
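%<--------------------------------------------------------------------------->
% Usage sketch (comment only). \Vertices and \Edges, defined above, read
% comma-separated data files through datatool; the file names and values here
% are hypothetical, while the column names (id, x, y, u, v) are the keys
% parsed by the macros above.
%
% vertices.csv:      edges.csv:
%   id,x,y             u,v
%   A,0,0              A,B
%   B,1,1              B,C
%   C,2,0              A,C
%
% \begin{tikzpicture}
%   \Vertices{vertices.csv}
%   \Edges[lw=1.5pt]{edges.csv}
% \end{tikzpicture}
%<--------------------------------------------------------------------------->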
%<---------------------------------------------------------------------------> % Plane %<---------------------------------------------------------------------------> \newcommand*{\Plane}[1][]{\@plane[#1]}% \def\@plane[#1]{% \setkeys[NW]{plane}{#1}% \ifNW@plane@ImageAndFill \setkeys[NW]{plane}{#1}% \else \ifthenelse{\not\equal{\cmdNW@plane@image}{}}{ \setkeys[NW]{plane}{#1,NoFill} }{} \fi \@@plane% } \def\@@plane{% % Draw Plane on the Background layer \ifNW@plane@InBG \tikzset{InBGStyle/.style={on background layer}} \else \tikzset{InBGStyle/.style={}} \fi \begin{scope}[InBGStyle] % Check if the color of the plane is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the plane entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. blue!50!green) \ifNW@plane@RGB \ifthenelse{\not\equal{\cmdNW@plane@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@plane@color} \tikzset{LocalPlaneFill/.style={fill = LocalColor}} }{ \tikzset{LocalPlaneFill/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@plane@color}{}}{ \tikzset{LocalPlaneFill/.style={fill = \cmdNW@plane@color}} }{ \tikzset{LocalPlaneFill/.style={}} } \fi % Check if the opacity of the plane is redefined, if so the new % style is used \ifthenelse{\not\equal{\cmdNW@plane@opacity}{}}{ \tikzset{LocalPlaneOpacity/.style={fill opacity = \cmdNW@plane@opacity}} }{ \tikzset{LocalPlaneOpacity/.style={}} } \begin{scope}[canvas is yx plane at z=(\cmdNW@plane@layer-1)*\NetworkLayerDistance] % Draw the fill of the Plane \ifNW@plane@NoFill \else \protected@edef\@tempa{% \noexpand\fill[PlaneFillStyle,LocalPlaneFill,LocalPlaneOpacity]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) rectangle ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; \fi % Draw image on the Plane \ifthenelse{\not\equal{\cmdNW@plane@image}{}}{ %\protected@edef\@tempa{% %\noexpand \node[inner sep=0pt,LocalPlaneOpacity] at ($(\cmdNW@plane@width/2,\cmdNW@plane@height/2)+ (\cmdNW@plane@x,\cmdNW@plane@y)$) {\includegraphics[width=\cmdNW@plane@width\DefaultUnit, height=\cmdNW@plane@height\DefaultUnit]{\cmdNW@plane@image}}; %}% %\@tempa; }{} % Draw grid on the Plane \ifthenelse{\not\equal{\cmdNW@plane@grid}{}}{ \protected@edef\@tempa{% \noexpand\draw[PlaneGridStyle,step=\cmdNW@plane@grid*\DistanceScale]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) grid ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; }{} % Draw the border of the Plane \ifNW@plane@NoBorder \else \protected@edef\@tempa{% \noexpand\draw[PlaneBorderStyle,\cmdNW@plane@style]( \cmdNW@plane@x*\DistanceScale,\cmdNW@plane@y*\DistanceScale) rectangle ++ (\cmdNW@plane@width*\DistanceScale,\cmdNW@plane@height*\DistanceScale)}% \@tempa; \fi \end{scope} \end{scope} } %<---------------------------------------------------------------------------> % Init Text %<---------------------------------------------------------------------------> \define@cmdkey [NW] {text} {x}{} \define@cmdkey [NW] {text} {y}{} \define@cmdkey [NW] {text} {layer}{} \define@cmdkey [NW] {text} {color}{} \define@cmdkey [NW] {text} {opacity}{} \define@cmdkey [NW] {text} {rotation}{} \define@cmdkey [NW] {text} {fontsize}{} \define@cmdkey [NW] {text} {anchor}{} \define@cmdkey [NW] {text} {position}{} \define@cmdkey [NW] {text} {distance}{} \define@cmdkey [NW] {text} {style}{} \define@cmdkey [NW] {text} {width}{} \define@boolkey [NW] {text} {RGB}[true]{} \presetkeys 
[NW] {text} { x = {0}, y = {0}, layer = {}, color = {}, opacity = {}, fontsize = {}, anchor = {}, position = {}, rotation = {}, distance = {0\DefaultUnit}, style = {}, width = {}, RGB = false, }{} %<---------------------------------------------------------------------------> % Text %<---------------------------------------------------------------------------> \newcommand*{\Text}[1][]{\@text[#1]}% \def\@text[#1]#2{% \setkeys[NW]{text}{#1}% \@@text{#2}% } \def\@@text#1{% % Check if the color of the text is redefined, if so the new style is % used. If the option RGB is true, RGB values can be used to define the % color of the text entered in the form {R,G,B}. If RGB is not true the % default colors of tikz can be used (e.g. blue!50!green) \ifNW@text@RGB \ifthenelse{\not\equal{\cmdNW@text@color}{}}{ \pgfutil@definecolor{LocalColor}{RGB}{\cmdNW@text@color} \tikzset{LocalTextColor/.style={color = LocalColor}} }{ \tikzset{LocalTextColor/.style={}} } \else \ifthenelse{\not\equal{\cmdNW@text@color}{}}{ \tikzset{LocalTextColor/.style={color = \cmdNW@text@color}} }{ \tikzset{LocalTextColor/.style={}} } \fi % Check if the opacity of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@opacity}{}}{ \tikzset{LocalTextOpacity/.style={text opacity = \cmdNW@text@opacity}} }{ \tikzset{LocalTextOpacity/.style={}} } % Check if the rotation of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@rotation}{}}{ \tikzset{LocalTextRotation/.style={rotate = \cmdNW@text@rotation}} }{ \tikzset{LocalTextRotation/.style={}} } % Check if the font size of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@fontsize}{}}{ \tikzset{LocalTextFontSize/.style={font = \cmdNW@text@fontsize}} }{ \tikzset{LocalTextFontSize/.style={}} } % Check if the position of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@position}{}}{ \tikzset{LocalTextPosition/.style={\cmdNW@text@position = \cmdNW@text@distance}} }{ \tikzset{LocalTextPosition/.style={}} } % Check if the anchor of the text is redefined, if so the new style is used \ifthenelse{\not\equal{\cmdNW@text@anchor}{}}{ \tikzset{LocalTextAnchor/.style={anchor = \cmdNW@text@anchor}} }{ \tikzset{LocalTextAnchor/.style={}} } % Check if the text width of the text is redefined, if so the new % style is used \ifthenelse{\not\equal{\cmdNW@text@width}{}}{ \tikzset{LocalTextWidth/.style={text width = \cmdNW@text@width}} }{ \tikzset{LocalTextWidth/.style={}} } \ifthenelse{\equal{\cmdNW@text@layer}{}}{ \protected@edef\@tempa{% \noexpand\node[TextStyle, LocalTextColor, LocalTextOpacity, LocalTextFontSize, LocalTextRotation, LocalTextPosition, LocalTextAnchor, LocalTextWidth, \cmdNW@text@style] at (\cmdNW@text@x*\DistanceScale,\cmdNW@text@y*\DistanceScale){#1} }\@tempa;% }{ \begin{scope}[canvas is yx plane at z=(\cmdNW@text@layer-1)*\NetworkLayerDistance] \protected@edef\@tempa{% \noexpand\node[TextStyle, LocalTextColor, LocalTextOpacity, LocalTextFontSize, LocalTextRotation, LocalTextPosition, LocalTextAnchor, LocalTextWidth, \cmdNW@text@style] at (\cmdNW@text@x*\DistanceScale,\cmdNW@text@y*\DistanceScale){#1} }\@tempa;% \end{scope} } } \endinput %============================================================================= % eof % % Local Variables: % mode: latex % mode: flyspell % mode: auto-fill % fill-column: 80 % TeX-master: t % End: 
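%<--------------------------------------------------------------------------->
% Usage sketch for the remaining user-level commands (comment only; all
% coordinates, sizes and text are illustrative). \Plane draws a rectangle on
% a given layer, the Layer environment groups drawing commands on one layer,
% and \Text places an annotation.
%
% \begin{tikzpicture}[multilayer=3d]
%   \Plane[x=-1,y=-1,width=4,height=4,layer=1]
%   \begin{Layer}[layer=2]
%     \Text[x=0,y=3]{Second layer}
%   \end{Layer}
% \end{tikzpicture}
%<--------------------------------------------------------------------------->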
tex-fmt-0.5.2/tests/target/unicode.tex
\documentclass{article}

\begin{document}

This is a long line with a unicode arrow in the middle of it ↓ which should be
split correctly

Here an indent begins ( and should not be closed with this arrow and comment ↓%
until the next parenthesis )

This line contains some French accent characters éééééééééééééééééééééééééééééé
which include zero-width chars, so look narrower than they are.

\end{document}
tex-fmt-0.5.2/tests/target/verbatim.tex
\documentclass{article}

\usepackage{listings}

\begin{document}

\begin{verbatim}
code code code code code code
code code code code code code
code code code code code code
code code code code code code
code code code code code code
code code code code code code
\item
\item
\item
\begin{align}
E = mc^2
\end{align}
\end{verbatim}

\begin{lstlisting}[caption={A very long and complicated caption that does not fit into one line}]
Code
\end{lstlisting}

\end{document}
tex-fmt-0.5.2/tests/target/wgu-cv.cls
%! TeX root = WGUnderwood.tex

% class
\NeedsTeXFormat{LaTeX2e}
\ProvidesClass{wgu-cv}

% packages
\LoadClass[10pt]{article}
\RequirePackage[margin=1in,top=0.9in]{geometry}
\RequirePackage{hyperref}
%\RequirePackage{fontspec}
\RequirePackage{microtype}
\RequirePackage{fancyhdr}
\RequirePackage{enumitem}
\RequirePackage{ifthen}

% variables
\def\yourname#1{\def\@yourname{#1}}
\def\youraddress#1{\def\@youraddress{#1}}
\def\youremail#1{\def\@youremail{#1}}
\def\yourwebsite#1{\def\@yourwebsite{#1}}

% settings
%\setmainfont{Libre Baskerville}[Scale=0.9]
%\setmonofont{Source Code Pro}[Scale=0.97]
\geometry{a4paper}
\setlength\parindent{0pt}
\bibliographystyle{abbrvnat}
\pagestyle{fancy}
\renewcommand{\headrulewidth}{0pt}
\cfoot{\thepage}
\rfoot{\today}
\setlist{
  leftmargin=0.5cm,
  topsep=0cm,
  partopsep=0cm,
  parsep=-0.04cm, % item spacing
  before=\vspace{0.12cm},
  after=\vspace{0.08cm},
}

% arxiv
\newcommand{\arxiv}[1]{%
  \href{https://arxiv.org/abs/#1}{%
    \texttt{arXiv{:}{\allowbreak}#1}}%
}

% github
\newcommand{\github}[1]{%
  GitHub: \href{https://github.com/#1}{%
    \texttt{#1}}%
}

% title
\renewcommand{\maketitle}{%
  \vspace*{-1.2cm}%
  \begin{center}%
    \begin{huge}%
      \@yourname \\
    \end{huge}%
    \vspace{0.5cm}%
    \@youraddress \\
    \vspace{0.16cm}%
    \begin{minipage}{0.45\textwidth}%
      \centering%
      \href{mailto:\@youremail}{\nolinkurl{\@youremail}}%
    \end{minipage}%
    \begin{minipage}{0.45\textwidth}%
      \centering%
      \href{https://\@yourwebsite}{\nolinkurl{\@yourwebsite}}%
    \end{minipage}
  \end{center}%
}

% section
\renewcommand{\section}[1]{%
  \vspace{0.3cm}%
  \par\hbox{\large\textbf{#1}\strut}%
  \vspace{-0.25cm}%
  \rule{\textwidth}{0.8pt}%
  \vspace{-0.15cm}%
}

% subsection
\renewcommand{\subsection}[2]{%
  \vspace{0.30cm}%
  \textbf{#1}%
  \hfill{#2}%
  \vspace{0.03cm}%
}

% subsubsection
\renewcommand{\subsubsection}[1]{%
  \linebreak
  \textit{#1}%
  \vspace{0.05cm}%
}
tex-fmt-0.5.2/tests/target/wrap.tex
\documentclass{article}

\begin{document}

% no comment
This is a long line with a unicode arrow in the middle of it ↓ which should be
split correctly
This line is too long because it has more than eighty characters inside it.
Therefore it should be split.

% break before comment
This line is too long because it has more than eighty characters inside it.
Therefore it % should be split.
% break after spaced comment
This line is too long because it has more than % eighty characters
% inside it. Therefore it should be split.

% break after non-spaced comment
This line is too long because it has more than% eighty characters
% inside it. Therefore it should be split.

% unbreakable line
Thislineistoolongbecauseithasmorethan%eightycharactersinsideit.Buttherearenospacessoitcannotbesplit.

% line can be broken after 80 chars
Thislineistoolongbecauseithasmorethaneightycharactersinsideitandtherearenospacesuntillater
where there are some spaces so we can split this line here

% long line only after indenting
(
  1234567890 1234567890 1234567890 1234567890 1234567890 1234567890 1234567890
  123
)

% double break after comment
This line has a long comment. % This comment is very long so needs to
% be split over three lines which is another edge case which should
% be checked here with all these extra words

% double break after only comment
% This line is all a long comment. This comment is very long so needs
% to be split over three lines which is another edge case which
% should be checked here with all these extra words

% lines containing \
This line would usually be split at the special character part with a slash\
but it's best to break the line earlier.

% long lines with brackets
(This line is too long because it has more than eighty characters inside it.
  Therefore it should be split. It also needs splitting onto multiple lines,
  and the middle lines should be indented due to these brackets.)

% long lines with double brackets
((This line is too long because it has more than eighty characters inside it.
    Therefore it should be split. It also needs splitting onto multiple lines,
    and the middle lines should be doubly indented due to these brackets.))

\end{document}
tex-fmt-0.5.2/tex-fmt.toml
# tex-fmt.toml
check = false
print = false
wrap = true
wraplen = 80
tabsize = 2
tabchar = "space"
stdin = false
verbosity = "warn"
lists = []