-
Notifications
You must be signed in to change notification settings - Fork 9
feat: improve API-ref extraction quality and coverage in the RAG #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5c233e0
b22cbee
755363b
758e5f8
a3c8532
6061c33
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,3 @@ | ||
| lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main | ||
| packaging | ||
| lxml | ||
| html2text |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,28 @@ def clean_url(unclean_url): | |
| class OpenStackDocsMetadataProcessor(MetadataProcessor): | ||
| """Metadata processor for OpenStack documentation.""" | ||
|
|
||
| API_REF_SERVICE_MAPPING = { | ||
|
lpiwowar marked this conversation as resolved.
|
||
| "cinder": "block-storage", | ||
| "nova": "compute", | ||
| "trove": "database", | ||
| "designate": "dns", | ||
| "keystone": "identity", | ||
| "glance": "image", | ||
| "watcher": "resource-optimization", | ||
| "masakari": "instance-ha", | ||
| "barbican": "key-manager", | ||
| "octavia": "load-balancer", | ||
| "zaqar": "messaging", | ||
| "neutron": "network", | ||
| "swift": "object-store", | ||
| "adjutant": "registration", | ||
| "heat": "orchestration", | ||
| "placement": "placement", | ||
| "blazar": "reservation", | ||
| "manila": "shared-file-system", | ||
| # Add more mappings as needed | ||
| } | ||
|
|
||
| def __init__(self, folder_path: str): | ||
| super(OpenStackDocsMetadataProcessor, self).__init__() | ||
| self.folder_path = Path(folder_path) | ||
|
|
@@ -46,22 +68,41 @@ def url_function(self, path: str) -> str: | |
| except ValueError: | ||
| relative_path = path_obj.name | ||
|
|
||
| relative_path = relative_path.as_posix() | ||
| relative_path_str = relative_path.as_posix() | ||
|
|
||
| # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/ | ||
| relative_path = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path) | ||
| # Extract project name from path (first component) | ||
| path_parts = relative_path_str.split("/") | ||
| project_name = path_parts[0] if path_parts else "" | ||
|
|
||
| # Check if this is API-Ref documentation | ||
| if "_api-ref/" in relative_path_str: | ||
| # This is API-Ref documentation - use different URL pattern | ||
| # Pattern: project/version_api-ref/... -> /api-ref/service/... | ||
|
|
||
| # Get the service name from mapping | ||
| service_name = self.API_REF_SERVICE_MAPPING.get(project_name, project_name) | ||
|
|
||
| # Remove _api-ref suffix: /cinder/2025.2_api-ref/ → /cinder/2025.2/api-ref/ | ||
| relative_path = re.sub(r"/(\d+\.\d+)_api-ref/", r"/\1/api-ref/", relative_path) | ||
| # Remove project name and version_api-ref prefix | ||
| # Example: heat/2025.2_api-ref/v1/index.txt -> v1/index.txt | ||
| api_ref_pattern = re.compile(r"^[^/]+/(?:\d+\.\d+|latest)_api-ref/") | ||
| remaining_path = api_ref_pattern.sub("", relative_path_str) | ||
|
|
||
| # Replace .txt with .html | ||
| remaining_path = remaining_path.replace(".txt", ".html") | ||
| # Build API-Ref URL | ||
| return f"{self.base_url}/api-ref/{service_name}/{remaining_path}" | ||
|
Comment on lines
+77
to
+93
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Don’t silently fall back to the repository name for API-Ref services.
🤖 Prompt for AI Agents |
||
|
|
||
| # Regular documentation - existing logic | ||
| # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/ | ||
| relative_path_str = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path_str) | ||
|
|
||
| # Handle "latest" version | ||
| relative_path = relative_path.replace("/latest_docs/", "/latest/") | ||
| relative_path = relative_path.replace("/latest_api-ref/", "/latest/api-ref/") | ||
| relative_path_str = relative_path_str.replace("/latest_docs/", "/latest/") | ||
|
|
||
| # Replace .txt with .html | ||
| relative_path = relative_path.replace(".txt", ".html") | ||
| relative_path_str = relative_path_str.replace(".txt", ".html") | ||
|
|
||
| return f"{self.base_url}/{relative_path}" | ||
| return f"{self.base_url}/{relative_path_str}" | ||
|
|
||
|
|
||
| class RedHatDocsMetadataProcessor(MetadataProcessor): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -205,58 +205,75 @@ deps = | |
| # Its regular doc build produces no usable output, but its API-Ref is needed by Neutron. | ||
| if [ "$project" != "neutron-lib" ]; then | ||
| tox -etext-docs | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs | ||
| fi | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs | ||
|
|
||
| # Build API-Ref if enabled | ||
| if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ]; then | ||
| if ! grep -q "text-api-ref" tox.ini; then | ||
| echo "$tox_text_api_ref_target" >> tox.ini | ||
| # Build API-Ref if enabled and the project has an api-ref directory | ||
| local api_ref_failed="false" | ||
| if [ "$OS_API_DOCS" = "true" ]; then | ||
| local api_dir="" | ||
| if [ -d "./api-ref/source" ]; then api_dir="api-ref"; | ||
| elif [ -d "./api-guide/source" ]; then api_dir="api-guide"; | ||
|
omkarjoshi0304 marked this conversation as resolved.
|
||
| else | ||
| echo "INFO: No api-ref or api-guide directory found for $project" | ||
| fi | ||
|
|
||
| local api_ref_failed="false" | ||
| echo "Building API-Ref documentation for $project..." | ||
| tox -etext-api-ref || api_ref_failed="true" | ||
|
|
||
| if [ "$api_ref_failed" != "true" ]; then | ||
| echo "Converting API-Ref HTML to plain text..." | ||
| rm -rf ./api-ref/build/text | ||
| mv ./api-ref/build/html ./api-ref/build/text | ||
|
|
||
| # Convert HTML to text | ||
| while read -r html_file; do | ||
| text_file="${html_file%.html}.txt" | ||
| [ -e "$html_file" ] && html2text "$html_file" utf8 > "$text_file" | ||
| done <<< "$(find ./api-ref/build/text -name "*.html")" | ||
|
|
||
| # Cleanup | ||
| find ./api-ref/build/text -type f ! -name "*.txt" -delete | ||
| find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete | ||
|
|
||
| # Remove unpublished metadata (JIRA OSPRH-19255 requirement #1) | ||
| find ./api-ref/build/text -name "genindex.txt" -delete | ||
| find ./api-ref/build/text -name "search.txt" -delete | ||
| find ./api-ref/build/text -path "*/_sources/*" -delete | ||
| find ./api-ref/build/text -type d -name "_sources" -delete | ||
|
|
||
| # index.txt and api_microversion_history.txt handling to prevent unreachable URLs | ||
| api_file_count=$(find ./api-ref/build/text -name "*.txt" \ | ||
| ! -name "index.txt" ! -name "genindex.txt" \ | ||
| ! -name "search.txt" ! -name "api_microversion_history.txt" \ | ||
| -type f | wc -l) | ||
|
|
||
| if [ "$api_file_count" -gt 0 ]; then | ||
| # Has real API files - remove navigation files | ||
| find ./api-ref/build/text -name "index.txt" -delete | ||
| find ./api-ref/build/text -name "api_microversion_history.txt" -delete | ||
| else | ||
| # Only has index.txt - skip to avoid unreachable URLs | ||
| echo "Skipping API-Ref for $project (no content files)" | ||
| rm -rf ./api-ref/build/text | ||
| if [ -n "$api_dir" ]; then | ||
| echo "Building API-Ref documentation for $project using $api_dir..." | ||
|
|
||
| if ! grep -q "text-api-ref" tox.ini; then | ||
| # Adjust the target if it's api-guide instead of api-ref | ||
| local adjusted_target | ||
| adjusted_target="${tox_text_api_ref_target//api-ref/$api_dir}" | ||
|
omkarjoshi0304 marked this conversation as resolved.
|
||
| echo "$adjusted_target" >> tox.ini | ||
| fi | ||
|
|
||
| if ! tox -etext-api-ref; then | ||
|
Comment on lines
+224
to
+231
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
# First, let's look at the script and understand the context
head -250 scripts/get_openstack_plaintext_docs.sh | tail -50
Repository: openstack-lightspeed/rag-content
Length of output: 2228
🏁 Script executed:
# Get more context around the problematic lines
sed -n '200,260p' scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 2774
🏁 Script executed:
# Look at where tox_text_api_ref_target is set
rg "tox_text_api_ref_target" scripts/get_openstack_plaintext_docs.sh -n
Repository: openstack-lightspeed/rag-content
Length of output: 198
🏁 Script executed:
# Look at api_dir usage
rg "api_dir" scripts/get_openstack_plaintext_docs.sh -n -B2 -A2
Repository: openstack-lightspeed/rag-content
Length of output: 3138
🏁 Script executed:
# Check for pandoc references
rg "pandoc" scripts/get_openstack_plaintext_docs.sh -n -B2 -A2
Repository: openstack-lightspeed/rag-content
Length of output: 431
🏁 Script executed:
# Get the complete definition of tox_text_api_ref_target starting at line 142
sed -n '140,180p' scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 1713
🏁 Script executed:
# Check if pandoc is validated anywhere in the script (startup checks)
rg "pandoc" scripts/get_openstack_plaintext_docs.sh -B5 -A5
Repository: openstack-lightspeed/rag-content
Length of output: 691
🏁 Script executed:
# Check for startup validation of tools
sed -n '1,100p' scripts/get_openstack_plaintext_docs.sh | grep -A20 "Check"
Repository: openstack-lightspeed/rag-content
Length of output: 750
🏁 Script executed:
# Look for any tool availability checks at the beginning
sed -n '1,150p' scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 5564
🏁 Script executed:
# Let's verify the actual behavior by tracing through what happens with api-guide
# Create a test to show what the substitution produces
cat > /tmp/test_subst.sh << 'EOF'
#!/bin/bash
api_dir="api-guide"
tox_text_api_ref_target="
[testenv:text-api-ref]
description =
Build API reference documentation in HTML format.
commands =
sphinx-build --keep-going -j auto -b html -d api-ref/build/doctrees api-ref/source api-ref/build/html
deps =
-c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/2025.2}
-r{toxinidir}/doc/requirements.txt
os-api-ref
"
echo "=== Original tox_text_api_ref_target ==="
echo "$tox_text_api_ref_target"
echo ""
echo "=== After global substitution with api_dir=api-guide ==="
adjusted_target="${tox_text_api_ref_target//api-ref/$api_dir}"
echo "$adjusted_target"
echo ""
echo "=== What tox environment name got created ==="
echo "$adjusted_target" | grep "testenv:"
echo ""
echo "=== Script tries to run ==="
echo "tox -etext-api-ref"
EOF
bash /tmp/test_subst.shRepository: openstack-lightspeed/rag-content Length of output: 1008 The
The proposed fix handling only path segments (e.g., Additionally, 🤖 Prompt for AI Agents |
||
| echo "WARNING: API-Ref build failed for $project" | ||
| exit 1 | ||
| fi | ||
|
|
||
| find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true | ||
| if [ "$api_ref_failed" != "true" ]; then | ||
| echo "Converting API-Ref HTML to plain text for $project..." | ||
| rm -rf "./$api_dir/build/text" | ||
| mkdir -p "./$api_dir/build/text" | ||
|
|
||
| converted_count=0 | ||
| while IFS= read -r -d '' html_file; do | ||
|
lpiwowar marked this conversation as resolved.
|
||
| rel_path="${html_file#./"$api_dir"/build/html/}" | ||
| text_file="./$api_dir/build/text/${rel_path%.html}.txt" | ||
| mkdir -p "$(dirname "$text_file")" | ||
|
|
||
| # Convert HTML to plain text using pandoc (consistent output) | ||
| pandoc -f html -t plain --wrap=preserve "$html_file" -o "$text_file" || { | ||
| echo "ERROR: Failed to convert $html_file" | ||
| return 1 | ||
| } | ||
|
Comment on lines
+247
to
+251
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
# First, let's check the file size and read the startup/early sections
wc -l scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 119
🏁 Script executed:
# Read the beginning of the script to find startup validation
head -100 scripts/get_openstack_plaintext_docs.sh | cat -n
Repository: openstack-lightspeed/rag-content
Length of output: 4796
🏁 Script executed:
# Search for dependency checks (tox, PYTHON, pandoc) in the script
rg -n "(command -v|which|type )" scripts/get_openstack_plaintext_docs.sh | head -20
Repository: openstack-lightspeed/rag-content
Length of output: 612
🏁 Script executed:
# Check for pandoc usage throughout the script
rg -n "pandoc" scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 259
🏁 Script executed:
# Search for any pandoc availability checks in the entire script
rg -n "pandoc" scripts/get_openstack_plaintext_docs.sh
Repository: openstack-lightspeed/rag-content
Length of output: 259
🏁 Script executed:
# Check the context around lines 247-251 to understand control flow
sed -n '200,280p' scripts/get_openstack_plaintext_docs.sh | cat -n
Repository: openstack-lightspeed/rag-content
Length of output: 4434
🏁 Script executed:
# Check if there are any early checks for OS_API_DOCS and related setup
sed -n '35,150p' scripts/get_openstack_plaintext_docs.sh | cat -n
Repository: openstack-lightspeed/rag-content
Length of output: 5321
🏁 Script executed:
# Check if there's any conditional logic around pandoc that we might have missed
sed -n '210,260p' scripts/get_openstack_plaintext_docs.sh | cat -n
Repository: openstack-lightspeed/rag-content
Length of output: 2703
🏁 Script executed:
# Verify how OS_API_DOCS is used and when this code path executes
rg -B5 -A5 "OS_API_DOCS" scripts/get_openstack_plaintext_docs.sh | head -40Repository: openstack-lightspeed/rag-content Length of output: 1359 Add pandoc to startup dependency checks. When 🤖 Prompt for AI Agents |
||
|
|
||
| converted_count=$((converted_count + 1)) | ||
| done < <(find "./$api_dir/build/html" -name "*.html" -type f -print0) | ||
|
|
||
| echo "Converted $converted_count HTML files to text for $project" | ||
|
|
||
| # Cleanup unwanted files (logos, metadata, empty directories) | ||
| # shellcheck disable=SC2038 | ||
| find "./$api_dir/build/text" -type f -exec grep -l "logo-full.svg" {} + | xargs rm -f 2>/dev/null || true | ||
|
Akrog marked this conversation as resolved.
|
||
| find "./$api_dir/build/text" \( -name "genindex.txt" -o -name "search.txt" \) -delete 2>/dev/null || true | ||
| rm -rf "./$api_dir/build/text/_sources" 2>/dev/null || true | ||
|
|
||
| # Check for content (size > 1k) | ||
| api_file_count=$(find "./$api_dir/build/text" -name "*.txt" -type f -size +1k 2>/dev/null | wc -l) | ||
|
|
||
| if [ "$api_file_count" -gt 0 ]; then | ||
| echo "API-Ref: Found $api_file_count content files for $project" | ||
|
omkarjoshi0304 marked this conversation as resolved.
|
||
| else | ||
| echo "Skipping API-Ref for $project (no content found)" | ||
| rm -rf "./$api_dir/build/text" | ||
| fi | ||
| find "./$api_dir/build/text" -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true | ||
| fi | ||
| fi | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-api-ref | ||
| fi | ||
|
|
||
| # These projects have all their docs under "latest" instead of "2025.2" | ||
|
|
@@ -271,12 +288,12 @@ deps = | |
| rm -rf "$project_output_dir" | ||
| mkdir -p "$project_output_dir" | ||
| # Only copy if text docs were built (skipped for neutron-lib) | ||
| [ -d "doc/build/text" ] && cp -r doc/build/text "$project_output_dir"/"$_output_version"_docs | ||
| [ -d "doc/build/text" ] && cp -r doc/build/text "${project_output_dir}/${_output_version}_docs" | ||
|
|
||
| # Copy API-Ref documentation only if OS_API_DOCS is enabled and build succeeded | ||
| if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ] && \ | ||
| [ "$api_ref_failed" != "true" ] && [ -d "api-ref/build/text" ]; then | ||
| cp -r api-ref/build/text "$project_output_dir"/"$_output_version"_api-ref | ||
| # Copy API-Ref documentation if it was built successfully | ||
| if [ "$OS_API_DOCS" = "true" ] && [ "$api_ref_failed" != "true" ] && \ | ||
| [ -n "$api_dir" ] && [ -d "$api_dir/build/text" ]; then | ||
| cp -r "${api_dir}/build/text" "${project_output_dir}/${_output_version}_api-ref" | ||
| echo "API-Ref documentation copied for $project" | ||
| fi | ||
|
|
||
|
|
@@ -308,7 +325,7 @@ for os_project in "${os_projects[@]}"; do | |
| if [ "${num_running_subproc}" -ge "${NUM_WORKERS}" ]; then | ||
| echo "Using ${num_running_subproc}/${NUM_WORKERS} workers. Waiting ..." | ||
| wait -n || log_and_die "Subprocess generating text documentation failed!" | ||
| echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." | ||
| echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." | ||
| fi | ||
| done | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
cat Containerfile | head -50
Repository: openstack-lightspeed/rag-content
Length of output: 1625
🏁 Script executed:
Repository: openstack-lightspeed/rag-content
Length of output: 538
🏁 Script executed:
Repository: openstack-lightspeed/rag-content
Length of output: 8242
🌐 Web query:
Pandoc 3.1.11.1 release checksum signature verification GitHub
💡 Result:
No official Pandoc 3.1.11.1 release exists on GitHub at https://github.com/jgm/pandoc/releases. The latest releases are 3.9.0.2 (2026-03-19), with earlier ones like 3.1.8 (2023-09-09) and 3.1.2 (2023-03-28). Pandoc 3.1.11.1 appears to be a Debian/Ubuntu package version (haskell-pandoc 3.1.11.1+ds-2 in Debian testing/unstable), not an upstream release, so it lacks official GitHub binaries, checksums, or PGP signatures from jgm/pandoc. Pandoc releases on GitHub provide SHA256 checksums next to each binary asset (e.g., for 3.9.0.2-linux-amd64.tar.gz: a69abfababda8a56969a254b09f9553a7be89ddec00d4e0fe9fd585d71a67508). No PGP signatures are provided; verification relies on GitHub's hosting and SHA256 checksums. macOS .pkg files are Apple-signed and notarized. An announcement discussion exists at jgm/pandoc#9307, but no assets/checksums there for 3.1.11.1. For Debian packages, use apt cryptographic verification or compute local checksums against published Debian values if available.
Citations:
🌐 Web query:
curl -L vs curl -fsSL HTTP error handling differences
💡 Result:
The key difference in HTTP error handling between curl -L and curl -fsSL lies in how they respond to HTTP errors (response codes >=400): - curl -L (follow redirects only): Does not treat HTTP errors as failures. It outputs the error page body to stdout, shows progress meter and any errors, and exits with code 0 (success). Scripts checking $? will not detect the error. - curl -fsSL (-f: fail on errors, -s: silent/no progress, -S: show-error, -L: follow redirects): Treats HTTP >=400 responses (after following redirects) as failures. Exits with code 22, suppresses the error body output (no data to stdout), hides progress but shows error message to stderr. Ideal for scripts to detect failures reliably. Both follow redirects (-L). Without -f, curl considers only connection issues as errors, not HTTP status codes. With -f, it checks the final response after redirects. Examples: curl -L https://httpstat.us/500 # Outputs error HTML, exit 0 curl -fsSL https://httpstat.us/500 # No output, "curl: (22) ... 500", exit 22 Note: -f may not catch 401/407 auth errors. Use --fail-with-body (curl >=7.76.0) to also output body on failure.
Citations:
Use an actual released version of Pandoc and add checksum verification.
Pandoc 3.1.11.1 does not exist as an official GitHub release—it's a Debian/Ubuntu package version only. The download URL will fail with a 404 error. Use a released version from the Pandoc releases page (e.g., 3.1.8 or later). Additionally, switch from `curl -L` to `curl -fsSL` to fail fast on HTTP errors, download the archive explicitly, verify the published SHA256 checksum before extraction, and then extract it.
🤖 Prompt for AI Agents