Client wants to translate the website from English to Spanish using a toggle button on the site. The client prefers DeepL to Google Translate, but DeepL is not playing nicely with our servers due to CORS restrictions and other factors.
See It in Action – https://www.diazdentalstudio.com
Solution
Create a series of Python scripts, run manually as needed, that use DeepL for translation.
0 – HTML Tags
This script scans a website's sitemap.xml and counts the different tags used, which serve as landmarks for creating the translation files.
from bs4 import BeautifulSoup
import requests
from collections import defaultdict
import json  # Used to write the summary to disk

# Count the tags of interest on a single URL
def check_tags_on_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tag_count = defaultdict(int)
    # List of tags to check
    tags_to_check = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'a', 'div', 'img', 'ul', 'ol', 'li']
    # Count each tag
    for tag in tags_to_check:
        tag_count[tag] = len(soup.find_all(tag))
    return tag_count

# Parse the sitemap and return the list of page URLs
def parse_sitemap(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'xml')
    urls = [element.text for element in soup.find_all('loc')]
    return urls

# URL of the sitemap
sitemap_url = 'https://www.diazdentalstudio.com/sitemap.xml'
urls = parse_sitemap(sitemap_url)

# Tally tags across all URLs
overall_tag_count = defaultdict(int)
for url in urls:
    print(f"Checking tags on {url}")
    tag_count = check_tags_on_url(url)
    for tag, count in tag_count.items():
        overall_tag_count[tag] += count

# Print the summary
print("\nOverall Tag Usage Summary:")
for tag, count in overall_tag_count.items():
    print(f"{tag}: {count}")

# Save the summary to a file
with open('tag_usage_summary.json', 'w') as f:
    json.dump(overall_tag_count, f, indent=4)
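To decide which tags are worth carrying through the rest of the pipeline, you can load the saved summary and sort it by frequency. A small follow-up snippet using the tag_usage_summary.json file written above:

import json

# Load the summary written by the script above
with open('tag_usage_summary.json') as f:
    summary = json.load(f)

# List tags from most to least used to decide which are worth translating
for tag, count in sorted(summary.items(), key=lambda item: item[1], reverse=True):
    print(f"{tag}: {count}")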
1 – Extract Content
Extracts content from every page listed in sitemap.xml and writes it to per-page JSON files.
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
import os

# Function to extract content based on tags
def extract_content(soup, tags):
    content = {}
    for tag in tags:
        elements = soup.find_all(tag)
        content[tag] = [element.get_text(strip=True) for element in elements]
    return content

# List of tags to extract
tags_to_extract = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'a', 'div', 'img', 'ul', 'ol', 'li']

# Sitemap URL
sitemap_url = 'https://www.diazdentalstudio.com/sitemap.xml'

# Get the sitemap
response = requests.get(sitemap_url)
sitemap_soup = BeautifulSoup(response.content, 'xml')

# Extract all URLs from the sitemap
urls = [loc.get_text() for loc in sitemap_soup.find_all('loc')]

# Create a directory to store JSON files
output_dir = 'website_content_json'
os.makedirs(output_dir, exist_ok=True)

# Loop through each URL and process the page
for url in urls:
    page_response = requests.get(url)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    # Extract content based on the tags
    page_content = extract_content(page_soup, tags_to_extract)
    # Generate a filename based on the page URL
    parsed_url = urlparse(url)
    filename = os.path.join(output_dir, f"{parsed_url.path.strip('/').replace('/', '_')}.json")
    # Save the content to a JSON file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(page_content, f, ensure_ascii=False, indent=4)
    print(f"Saved content for {url} to {filename}")

print("Content extraction completed.")
2 – Parse Content
Parses each page from sitemap.xml into the per-page JSON files that will be translated. Unlike the previous script, it captures image alt text and joins list items per list.
import requests
from bs4 import BeautifulSoup
import json
import os

# URL of the XML sitemap
sitemap_url = "https://www.diazdentalstudio.com/sitemap.xml"

# Directory to save JSON files
output_dir = "page_jsons"
os.makedirs(output_dir, exist_ok=True)

# Function to extract tags from a page
def extract_tags_from_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    tags = {
        "h1": [tag.get_text(strip=True) for tag in soup.find_all("h1")],
        "h2": [tag.get_text(strip=True) for tag in soup.find_all("h2")],
        "h3": [tag.get_text(strip=True) for tag in soup.find_all("h3")],
        "h4": [tag.get_text(strip=True) for tag in soup.find_all("h4")],
        "h5": [tag.get_text(strip=True) for tag in soup.find_all("h5")],
        "h6": [tag.get_text(strip=True) for tag in soup.find_all("h6")],
        "p": [tag.get_text(strip=True) for tag in soup.find_all("p")],
        "span": [tag.get_text(strip=True) for tag in soup.find_all("span")],
        "a": [tag.get_text(strip=True) for tag in soup.find_all("a")],
        "div": [tag.get_text(strip=True) for tag in soup.find_all("div")],
        "img": [tag.get("alt", "") for tag in soup.find_all("img")],
        "ul": [" ".join([li.get_text(strip=True) for li in ul.find_all("li")]) for ul in soup.find_all("ul")],
        "ol": [" ".join([li.get_text(strip=True) for li in ol.find_all("li")]) for ol in soup.find_all("ol")],
        "li": [tag.get_text(strip=True) for tag in soup.find_all("li")]
    }
    return tags

# Function to parse the XML sitemap and extract URLs
def get_urls_from_sitemap(sitemap_url):
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, "xml")
    urls = [loc.get_text() for loc in soup.find_all("loc")]
    return urls

# Get all URLs from the sitemap
urls = get_urls_from_sitemap(sitemap_url)

# Process each URL
for url in urls:
    try:
        print(f"Processing {url}...")
        tags = extract_tags_from_html(url)
        # Generate a filename from the URL
        filename = url.replace("https://www.diazdentalstudio.com/", "").replace("/", "_") + ".json"
        output_path = os.path.join(output_dir, filename)
        # Save the tags as a JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(tags, f, ensure_ascii=False, indent=4)
        print(f"Tags extracted and saved to {output_path}")
    except Exception as e:
        print(f"Error processing {url}: {e}")
3 – JSON Translation Files
Uses the parsed website JSON content files to create Spanish versions via the DeepL API. These files swap English for Spanish when the website button is toggled. Place the created files in the domain root – httpdocs.
import os
import json
import requests

# DeepL API endpoint and key (use your own key; never publish a live key)
DEEPL_API_URL = "https://api-free.deepl.com/v2/translate"
DEEPL_API_KEY = "YOUR-DEEPL-API-KEY:fx"

# Directory containing JSON files
json_dir = "page_jsons"

# Directory to save translated JSON files
translated_dir = "translated_jsons"
os.makedirs(translated_dir, exist_ok=True)

# Function to translate text using the DeepL API
def translate_text(text, target_lang="es"):
    data = {
        "auth_key": DEEPL_API_KEY,
        "text": text,
        "target_lang": target_lang
    }
    response = requests.post(DEEPL_API_URL, data=data)
    try:
        result = response.json()
        if "translations" in result:
            return result["translations"][0]["text"]
        else:
            print(f"Error in translation response: {result}")
            return text  # Return the original text if translation fails
    except requests.exceptions.JSONDecodeError as e:
        print(f"Failed to decode JSON response: {e}")
        print(f"Response content: {response.content}")
        return text  # Return the original text if JSON decoding fails

# Process each JSON file in the directory
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            content = json.load(f)
        # Translate each element in the JSON structure
        translated_content = {}
        for key, elements in content.items():
            translated_elements = []
            for element in elements:
                translated_elements.append(translate_text(element, target_lang="es"))
            translated_content[key] = translated_elements
        # Save the translated content to a new JSON file
        translated_filename = f"translated_{filename}"
        translated_file_path = os.path.join(translated_dir, translated_filename)
        with open(translated_file_path, "w", encoding="utf-8") as f:
            json.dump(translated_content, f, ensure_ascii=False, indent=4)
        print(f"Translated content saved to {translated_file_path}")
.htaccess Mods
Because .htaccess redirects visitors from /index.html to '/', the ?_translate parameter would otherwise be lost. We need to modify the file so the parameter survives the redirect and the translated content renders when Spanish is requested – home page only.
# ---- index.html Redirection ----
RewriteCond %{THE_REQUEST} /index\.html\?_translate=([a-z]+) [NC]
RewriteRule ^ /?_translate=%1 [R=301,L]
# Redirect /index.html to /
RewriteCond %{THE_REQUEST} /index\.html [NC]
RewriteRule ^ / [R=301,L]
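Once the rules are live, a quick check confirms the parameter survives the redirect. A sketch using requests; expect a 301 with Location: /?_translate=es:

import requests

# Request the pre-redirect URL without following it so we can inspect the hop
response = requests.get(
    "https://www.diazdentalstudio.com/index.html?_translate=es",
    allow_redirects=False,
)
print(response.status_code, response.headers.get("Location"))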
JavaScript File (translation-loader.js)
// Load the appropriate translation file based on the selected language
function loadTranslation(lang, page) {
    if (lang === 'en') {
        return;
    }
    const url = `/translated_${page}.html.json`;
    fetch(url)
        .then(response => {
            if (!response.ok) {
                throw new Error('Translation file not found');
            }
            return response.json();
        })
        .then(data => {
            // Apply translations (h1, h2, paragraphs, etc.); each array is
            // guarded in case a tag is missing from the JSON file
            const h1Element = document.querySelector('h1');
            if (h1Element && data.h1 && data.h1.length) {
                h1Element.textContent = data.h1[0];
            }
            const h2Elements = document.querySelectorAll('h2');
            (data.h2 || []).forEach((h2Text, index) => {
                if (h2Elements[index]) {
                    h2Elements[index].textContent = h2Text;
                }
            });
            const paragraphElements = document.querySelectorAll('p');
            (data.p || []).forEach((pText, index) => {
                if (paragraphElements[index]) {
                    paragraphElements[index].textContent = pText;
                }
            });
            const anchorElements = document.querySelectorAll('a');
            (data.a || []).forEach((aText, index) => {
                if (anchorElements[index]) {
                    anchorElements[index].textContent = aText;
                }
            });
            const spanElements = document.querySelectorAll('span');
            (data.span || []).forEach((spanText, index) => {
                if (spanElements[index]) {
                    spanElements[index].textContent = spanText;
                }
            });
        })
        .catch(error => {
            console.error('Error loading translation:', error);
        });
}

// Toggle between English and Spanish
function toggleLanguage() {
    const urlParams = new URLSearchParams(window.location.search);
    const currentLang = urlParams.get('_translate') || 'en';
    const newLang = currentLang === 'es' ? 'en' : 'es';
    // Create a new URL with the updated language parameter
    const newUrl = `${window.location.pathname}?_translate=${newLang}`;
    // Redirect to the new URL to reload the content in the selected language
    window.location.href = newUrl;
}

// Initialize translation when the page loads
function initTranslation() {
    const urlParams = new URLSearchParams(window.location.search);
    const lang = urlParams.get('_translate') || 'en'; // Default to English if no language is specified
    // Fall back to 'index' when the path is '/' so the home page still resolves a file
    const page = window.location.pathname.split('/').pop().replace('.html', '') || 'index';
    loadTranslation(lang, page); // Load the translation for the detected language
}

// Wire up the language toggle button once the DOM is fully loaded
document.addEventListener('DOMContentLoaded', () => {
    const toggleButton = document.getElementById('language-toggle-button');
    if (toggleButton) {
        toggleButton.addEventListener('click', toggleLanguage);
    }
    // Initialize the translation on page load
    initTranslation();
});
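Before this loader can fetch anything, the translated_*.json files from step 3 have to be copied into the domain root (httpdocs, per the note above). A minimal deployment sketch; the docroot path is an assumption about your hosting layout, so adjust it to your host:

import os
import shutil

translated_dir = "translated_jsons"
docroot = "/var/www/vhosts/diazdentalstudio.com/httpdocs"  # assumption: adjust to your host

# Copy every translated JSON file into the web root so the loader can fetch it
for name in os.listdir(translated_dir):
    if name.endswith(".json"):
        shutil.copy2(os.path.join(translated_dir, name), os.path.join(docroot, name))
        print(f"Deployed {name}")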
Where to Place the toggleLanguage Function
The toggleLanguage function goes inside your translation-loader.js file, which is already linked in your HTML with:
<script src="assets/js/translation-loader.js" defer></script>
HTML Button
Used on the site to toggle between English and Spanish.
<a href="javascript:void(0);" id="language-toggle-button" class="btn-gray">Español</a>
How It Works:
- When the page reloads, initTranslation() is called, and the translation is applied if the ?_translate=es parameter is detected.
- The toggleLanguage function is called when the user clicks the "Español" button.
- The function checks the current language and toggles between English (en) and Spanish (es).
- It updates the URL with the correct ?_translate parameter and reloads the page.
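Finally, you can confirm every page in the sitemap resolves its translation file once deployed. A sketch that mirrors the filename logic in translation-loader.js:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

sitemap_url = "https://www.diazdentalstudio.com/sitemap.xml"
soup = BeautifulSoup(requests.get(sitemap_url).content, "xml")

for loc in soup.find_all("loc"):
    # Mirror the loader: last path segment, '.html' stripped, 'index' for '/'
    path = urlparse(loc.get_text()).path
    page = path.rstrip("/").rsplit("/", 1)[-1].replace(".html", "") or "index"
    url = f"https://www.diazdentalstudio.com/translated_{page}.html.json"
    status = requests.head(url).status_code
    print(f"{url}: {status}")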