#!/usr/bin/env bash
set -Eeuo pipefail

# Choose a default Docker image only when the caller did not set IMAGE at all;
# an IMAGE that is set (even to an empty string) is left untouched.
if [[ -z "${IMAGE+x}" ]]; then
  case "${ENABLE_OCR:-0}" in
    1) IMAGE="markitdown-ocr:latest" ;;
    *) IMAGE="markitdown:latest" ;;
  esac
fi

# Remaining settings keep the caller's value and fall back to these defaults
# when the variable is unset or empty.
: "${REPO_URL:=https://github.com/microsoft/markitdown.git}"
: "${MARKITDOWN_SRC:=./markitdown}"
: "${OCR_DOCKERFILE:=./Dockerfile.markitdown-ocr}"
: "${LLM_CLIENT:=openai}"
: "${LLM_MODEL:=gpt-4o}"

# Positional arguments: input directory first, output directory second.
INPUT_DIR="${1:-./doc}"
OUTPUT_DIR="${2:-./markdown}"

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (quoted here-doc, so nothing is expanded)
#######################################
usage() {
  cat <<'EOF'
Usage:
  ./convert_docs_to_md.sh [input_dir] [output_dir]

Defaults:
  input_dir   ./doc
  output_dir  ./markdown

Supported files:
  Word, Excel, PowerPoint, and PDF documents

Environment variables:
  IMAGE            Docker image name to use. Default: markitdown:latest, or markitdown-ocr:latest with ENABLE_OCR=1
  MARKITDOWN_SRC   Local MarkItDown source path for docker build. Default: ./markitdown
  OCR_DOCKERFILE   Dockerfile used to build IMAGE when ENABLE_OCR=1. Default: ./Dockerfile.markitdown-ocr
  REPO_URL         MarkItDown git repository. Default: https://github.com/microsoft/markitdown.git
  FORCE_BUILD=1    Rebuild IMAGE even if it already exists
  STRICT=1         Exit with non-zero status if any file fails. Default: 0
  ENABLE_OCR=1     Use the official markitdown-ocr plugin for embedded images in PDF/DOCX/XLSX/PPTX.
                   Requires OPENAI_API_KEY and a vision-capable model.
  LLM_CLIENT       LLM client passed to MarkItDown when ENABLE_OCR=1. Default: openai
  LLM_MODEL        LLM model passed to MarkItDown when ENABLE_OCR=1. Default: gpt-4o
  OPENAI_BASE_URL  Optional OpenAI-compatible endpoint passed into the container.

Examples:
  ./convert_docs_to_md.sh
  ./convert_docs_to_md.sh ./doc ./md
  FORCE_BUILD=1 ./convert_docs_to_md.sh
  ENABLE_OCR=1 OPENAI_API_KEY=... ./convert_docs_to_md.sh
EOF
}

# Write a timestamped progress message to stderr.
# Arguments: all arguments are joined into a single message.
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" 1>&2
}

# Report a fatal error on stderr and terminate the script with status 1.
# Arguments: all arguments are joined into the error message.
die() {
  local reason="$*"
  printf 'Error: %s\n' "$reason" 1>&2
  exit 1
}

# Help is handled before any other check, so it works without docker installed
# and without an input directory.
case "${1:-}" in
  -h|--help)
    usage
    exit 0
    ;;
esac

# Every conversion is executed through the docker CLI.
if ! command -v docker >/dev/null 2>&1; then
  die "docker is required but was not found in PATH"
fi

[[ -d "$INPUT_DIR" ]] || die "input directory does not exist: $INPUT_DIR"

# OCR mode needs an API key; refuse to start without one.
if [[ "${ENABLE_OCR:-0}" == "1" ]]; then
  [[ -n "${OPENAI_API_KEY:-}" ]] || die "ENABLE_OCR=1 requires OPENAI_API_KEY"
fi

#######################################
# Make sure the Docker image named by IMAGE exists, building it when needed.
# Globals (read):
#   IMAGE, FORCE_BUILD, MARKITDOWN_SRC, REPO_URL, ENABLE_OCR, OCR_DOCKERFILE
# Arguments: none
# Outputs:   progress messages via log (stderr); git/docker output as produced
# Returns:   0 when the image already exists or was built; calls die when git
#            is needed but missing, or when the OCR Dockerfile is missing
#######################################
ensure_image() {
  # Reuse an existing image unless the caller forces a rebuild.
  if [[ "${FORCE_BUILD:-0}" != "1" ]] && docker image inspect "$IMAGE" >/dev/null 2>&1; then
    log "Using existing Docker image: $IMAGE"
    return
  fi

  # Test with -e rather than -d: in submodules and linked worktrees ".git" is a
  # regular file, and cloning into such an existing checkout would fail.
  if [[ ! -e "$MARKITDOWN_SRC/.git" ]]; then
    command -v git >/dev/null 2>&1 || die "git is required to clone $REPO_URL"
    log "Cloning MarkItDown from $REPO_URL into $MARKITDOWN_SRC"
    # "--" keeps a URL or path that starts with "-" from being read as an option.
    git clone -- "$REPO_URL" "$MARKITDOWN_SRC"
  else
    log "Using existing MarkItDown source: $MARKITDOWN_SRC"
  fi

  # Both variants use MARKITDOWN_SRC as the build context; OCR mode only swaps
  # in a different Dockerfile.
  if [[ "${ENABLE_OCR:-0}" == "1" ]]; then
    [[ -f "$OCR_DOCKERFILE" ]] || die "OCR Dockerfile does not exist: $OCR_DOCKERFILE"
    log "Building Docker image with official markitdown-ocr plugin: $IMAGE"
    docker build -t "$IMAGE" -f "$OCR_DOCKERFILE" "$MARKITDOWN_SRC"
  else
    log "Building Docker image: $IMAGE"
    docker build -t "$IMAGE" "$MARKITDOWN_SRC"
  fi
}

# Tell whether a path ends in one of the Word, Excel, PowerPoint or PDF
# extensions handled by this script (compared case-insensitively).
# Arguments: $1 - file name or path
# Returns:   0 when the extension is supported, 1 otherwise
is_supported_file() {
  local name_lc
  name_lc="$(tr '[:upper:]' '[:lower:]' <<<"$1")"

  case "$name_lc" in
    *.doc | *.docx | *.docm | *.dot | *.dotx) ;;                             # Word
    *.xls | *.xlsx | *.xlsm | *.xlsb) ;;                                     # Excel
    *.ppt | *.pptx | *.pptm | *.pps | *.ppsx | *.pot | *.potx | *.potm) ;;   # PowerPoint
    *.pdf) ;;                                                                # PDF
    *) return 1 ;;
  esac
  return 0
}

# Tell whether a path carries one of the Office extensions that this script
# expects to be zip-based (compared case-insensitively).
# Arguments: $1 - file name or path
# Returns:   0 when the extension is in that list, 1 otherwise
is_ooxml_file() {
  local lowered status=1
  lowered="$(tr '[:upper:]' '[:lower:]' <<<"$1")"

  case "$lowered" in
    *.docx | *.docm | *.dotx) status=0 ;;
    *.xlsx | *.xlsm | *.xlsb) status=0 ;;
    *.pptx | *.pptm | *.ppsx | *.potx | *.potm) status=0 ;;
  esac
  return "$status"
}

# Check whether a file starts with one of the three 4-byte "PK" signatures
# accepted here (hex 504b0304, 504b0506, 504b0708).
# Arguments: $1 - path of the file to inspect
# Returns:   0 when the first four bytes match, 1 otherwise
is_zip_file() {
  local path="$1" magic

  # Render the first four bytes as one lowercase hex string, e.g. "504b0304".
  magic="$(head -c 4 "$path" | od -An -tx1 | tr -d ' \n')"

  if [[ "$magic" == "504b0304" || "$magic" == "504b0506" || "$magic" == "504b0708" ]]; then
    return 0
  fi
  return 1
}

# Append one tab-separated "file<TAB>reason" row to the failure log.
# Globals:   FAILURE_LOG (read) - path of the file that is appended to
# Arguments: $1 - path of the file that failed
#            $2 - human-readable reason
record_failure() {
  local failed_path="$1" why="$2"
  printf '%s\t%s\n' "$failed_path" "$why" >>"$FAILURE_LOG"
}

#######################################
# Convert a single input document to Markdown by running the image in Docker.
# Globals (read):
#   INPUT_ABS, OUTPUT_ABS  host directories mounted at /input (read-only) and
#                          /output inside the container
#   IMAGE, ENABLE_OCR, OPENAI_API_KEY, OPENAI_BASE_URL, LLM_CLIENT, LLM_MODEL
# Arguments:
#   $1 - path of the document relative to INPUT_ABS
# Outputs:
#   Writes the Markdown file under OUTPUT_ABS at the same relative path with
#   the extension replaced by .md; logs progress to stderr; records failures
#   through record_failure.
# Returns:
#   0 when the Markdown file is in place, 1 otherwise
#######################################
convert_one() {
  local rel="$1"
  local src="$INPUT_ABS/$rel"
  local out_rel="${rel%.*}.md"
  local dst="$OUTPUT_ABS/$out_rel"
  # The container writes to a PID-suffixed temporary name; the final name is
  # only created once the conversion has succeeded.
  local tmp="$dst.tmp.$$"
  local container_input="/input/$rel"
  local container_output="/output/$out_rel.tmp.$$"
  local docker_args=(--rm --user "$(id -u):$(id -g)")

  mkdir -p "$(dirname "$dst")"

  # Reject files whose extension promises a zip container but whose content
  # does not start with a zip signature.
  if is_ooxml_file "$src" && ! is_zip_file "$src"; then
    record_failure "$rel" "invalid Office Open XML file: extension requires a zip-based file, but content is not zip"
    log "Skipped: $rel (not a valid zip-based Office file; it may be a renamed .doc/.xls or a damaged file)"
    return 1
  fi

  if [[ "${ENABLE_OCR:-0}" == "1" ]]; then
    # Pass the key by name only: docker copies the value from this process's
    # environment, so the secret never appears on docker's command line.
    export OPENAI_API_KEY
    docker_args+=(-e OPENAI_API_KEY)

    if [[ -n "${OPENAI_BASE_URL:-}" ]]; then
      docker_args+=(-e "OPENAI_BASE_URL=$OPENAI_BASE_URL")
    fi
  fi

  docker_args+=(
    -v "$INPUT_ABS:/input:ro"
    -v "$OUTPUT_ABS:/output"
    "$IMAGE"
  )

  # Everything after the image name is passed to the container's command.
  if [[ "${ENABLE_OCR:-0}" == "1" ]]; then
    docker_args+=(--use-plugins --llm-client "$LLM_CLIENT" --llm-model "$LLM_MODEL")
  fi

  docker_args+=("$container_input" -o "$container_output")

  if ! docker run "${docker_args[@]}"; then
    rm -f -- "$tmp"
    record_failure "$rel" "markitdown conversion failed"
    log "Failed: $rel"
    return 1
  fi

  # This function is called from an "if" condition, where "set -e" is not in
  # effect, so a failing mv must be checked explicitly or the file would be
  # reported as converted without any output in place.
  if ! mv -- "$tmp" "$dst"; then
    rm -f -- "$tmp"
    record_failure "$rel" "conversion output could not be moved into place"
    log "Failed: $rel"
    return 1
  fi

  log "Converted: $rel -> $out_rel"
  return 0
}

# Main driver: make the image available, then convert every supported file
# found under the input directory.
ensure_image
mkdir -p "$OUTPUT_DIR"

# Resolve both directories to physical absolute paths; they are used as Docker
# bind-mount sources and to derive each file's relative path.
INPUT_ABS="$(cd "$INPUT_DIR" && pwd -P)"
OUTPUT_ABS="$(cd "$OUTPUT_DIR" && pwd -P)"
FAILURE_LOG="$OUTPUT_ABS/conversion_failures.tsv"

converted=0
failed=0
found=0

# Start each run with a fresh log that contains only the header row.
printf 'file\treason\n' > "$FAILURE_LOG"

# NUL-delimited find output keeps file names with spaces or newlines intact.
while IFS= read -r -d '' src; do
  # The inner quotes make the prefix a literal string; unquoted it would be
  # treated as a glob pattern and break on directories containing [, * or ?.
  rel="${src#"$INPUT_ABS"/}"
  if ! is_supported_file "$src"; then
    continue
  fi

  found=$((found + 1))
  if convert_one "$rel"; then
    converted=$((converted + 1))
  else
    failed=$((failed + 1))
  fi
done < <(find "$INPUT_ABS" -type f -print0)

# A log holding only the header is noise; remove it when nothing failed.
if [[ "$failed" -eq 0 ]]; then
  rm -f "$FAILURE_LOG"
fi

log "Done. Found: $found, converted: $converted, failed: $failed, output: $OUTPUT_DIR"
if [[ "$failed" -gt 0 ]]; then
  log "Failure details: $FAILURE_LOG"
fi

# Failures only affect the exit status when STRICT=1 is set.
if [[ "$failed" -gt 0 && "${STRICT:-0}" == "1" ]]; then
  exit 1
fi
