Client wants to translate the website from English to Spanish using a toggle button on the site. The client prefers DeepL to Google Translate, but DeepL is not playing nicely with our servers due to CORS restrictions and other factors.
See It in Action – https://www.diazdentalstudio.com
Solution
Create a series of Python scripts, run manually as needed, that use DeepL for translation.
0 – HTML Tags
This script scans a website's sitemap.xml and counts the different tags used, which serve as landmarks for creating the translation files.
from bs4 import BeautifulSoup
import requests
from collections import defaultdict
import json  # Used to write the summary to disk

# Count the tags of interest on a single URL
def check_tags_on_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    tag_count = defaultdict(int)
    # List of tags to check
    tags_to_check = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'a', 'div', 'img', 'ul', 'ol', 'li']
    # Count each tag
    for tag in tags_to_check:
        tag_count[tag] = len(soup.find_all(tag))
    return tag_count

# Parse the sitemap and return the list of page URLs
def parse_sitemap(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'xml')
    urls = [element.text for element in soup.find_all('loc')]
    return urls

# URL of the sitemap
sitemap_url = 'https://www.diazdentalstudio.com/sitemap.xml'
urls = parse_sitemap(sitemap_url)

# Tally tags across all URLs
overall_tag_count = defaultdict(int)
for url in urls:
    print(f"Checking tags on {url}")
    tag_count = check_tags_on_url(url)
    for tag, count in tag_count.items():
        overall_tag_count[tag] += count

# Print the summary
print("\nOverall Tag Usage Summary:")
for tag, count in overall_tag_count.items():
    print(f"{tag}: {count}")

# Save the summary to a file
with open('tag_usage_summary.json', 'w') as f:
    json.dump(overall_tag_count, f, indent=4)
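To decide which tags are worth carrying through the rest of the pipeline, you can load the saved summary and sort it by frequency. A small follow-up snippet using the tag_usage_summary.json file written above:

import json

# Load the summary written by the script above
with open('tag_usage_summary.json') as f:
    summary = json.load(f)

# List tags from most to least used to decide which are worth translating
for tag, count in sorted(summary.items(), key=lambda item: item[1], reverse=True):
    print(f"{tag}: {count}")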
1 – Extract Content
Extracts content from every page listed in sitemap.xml and writes it to per-page JSON files.
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
import os

# Function to extract content based on tags
def extract_content(soup, tags):
    content = {}
    for tag in tags:
        elements = soup.find_all(tag)
        content[tag] = [element.get_text(strip=True) for element in elements]
    return content

# List of tags to extract
tags_to_extract = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'a', 'div', 'img', 'ul', 'ol', 'li']

# Sitemap URL
sitemap_url = 'https://www.diazdentalstudio.com/sitemap.xml'

# Get the sitemap
response = requests.get(sitemap_url)
sitemap_soup = BeautifulSoup(response.content, 'xml')

# Extract all URLs from the sitemap
urls = [loc.get_text() for loc in sitemap_soup.find_all('loc')]

# Create a directory to store JSON files
output_dir = 'website_content_json'
os.makedirs(output_dir, exist_ok=True)

# Loop through each URL and process the page
for url in urls:
    page_response = requests.get(url)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    # Extract content based on the tags
    page_content = extract_content(page_soup, tags_to_extract)
    # Generate a filename based on the page URL
    parsed_url = urlparse(url)
    filename = os.path.join(output_dir, f"{parsed_url.path.strip('/').replace('/', '_')}.json")
    # Save the content to a JSON file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(page_content, f, ensure_ascii=False, indent=4)
    print(f"Saved content for {url} to {filename}")

print("Content extraction completed.")
2 – Parse Content
Parses each page from sitemap.xml into the per-page JSON files that will be translated. Unlike the previous script, it captures image alt text and joins list items per list.
import requests
from bs4 import BeautifulSoup
import json
import os

# URL of the XML sitemap
sitemap_url = "https://www.diazdentalstudio.com/sitemap.xml"

# Directory to save JSON files
output_dir = "page_jsons"
os.makedirs(output_dir, exist_ok=True)

# Function to extract tags from a page
def extract_tags_from_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    tags = {
        "h1": [tag.get_text(strip=True) for tag in soup.find_all("h1")],
        "h2": [tag.get_text(strip=True) for tag in soup.find_all("h2")],
        "h3": [tag.get_text(strip=True) for tag in soup.find_all("h3")],
        "h4": [tag.get_text(strip=True) for tag in soup.find_all("h4")],
        "h5": [tag.get_text(strip=True) for tag in soup.find_all("h5")],
        "h6": [tag.get_text(strip=True) for tag in soup.find_all("h6")],
        "p": [tag.get_text(strip=True) for tag in soup.find_all("p")],
        "span": [tag.get_text(strip=True) for tag in soup.find_all("span")],
        "a": [tag.get_text(strip=True) for tag in soup.find_all("a")],
        "div": [tag.get_text(strip=True) for tag in soup.find_all("div")],
        "img": [tag.get("alt", "") for tag in soup.find_all("img")],
        "ul": [" ".join([li.get_text(strip=True) for li in ul.find_all("li")]) for ul in soup.find_all("ul")],
        "ol": [" ".join([li.get_text(strip=True) for li in ol.find_all("li")]) for ol in soup.find_all("ol")],
        "li": [tag.get_text(strip=True) for tag in soup.find_all("li")]
    }
    return tags

# Function to parse the XML sitemap and extract URLs
def get_urls_from_sitemap(sitemap_url):
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, "xml")
    urls = [loc.get_text() for loc in soup.find_all("loc")]
    return urls

# Get all URLs from the sitemap
urls = get_urls_from_sitemap(sitemap_url)

# Process each URL
for url in urls:
    try:
        print(f"Processing {url}...")
        tags = extract_tags_from_html(url)
        # Generate a filename from the URL
        filename = url.replace("https://www.diazdentalstudio.com/", "").replace("/", "_") + ".json"
        output_path = os.path.join(output_dir, filename)
        # Save the tags as a JSON file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(tags, f, ensure_ascii=False, indent=4)
        print(f"Tags extracted and saved to {output_path}")
    except Exception as e:
        print(f"Error processing {url}: {e}")
3 – JSON Translation Files
Uses the parsed website JSON content files to create Spanish versions via the DeepL API. These files swap English for Spanish when the website button is toggled. Place the created files in the domain root – httpdocs.
import os
import json
import requests

# DeepL API endpoint and key (use your own key; never publish a live key)
DEEPL_API_URL = "https://api-free.deepl.com/v2/translate"
DEEPL_API_KEY = "YOUR-DEEPL-API-KEY:fx"

# Directory containing JSON files
json_dir = "page_jsons"

# Directory to save translated JSON files
translated_dir = "translated_jsons"
os.makedirs(translated_dir, exist_ok=True)

# Function to translate text using the DeepL API
def translate_text(text, target_lang="es"):
    data = {
        "auth_key": DEEPL_API_KEY,
        "text": text,
        "target_lang": target_lang
    }
    response = requests.post(DEEPL_API_URL, data=data)
    try:
        result = response.json()
        if "translations" in result:
            return result["translations"][0]["text"]
        else:
            print(f"Error in translation response: {result}")
            return text  # Return the original text if translation fails
    except requests.exceptions.JSONDecodeError as e:
        print(f"Failed to decode JSON response: {e}")
        print(f"Response content: {response.content}")
        return text  # Return the original text if JSON decoding fails

# Process each JSON file in the directory
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            content = json.load(f)
        # Translate each element in the JSON structure
        translated_content = {}
        for key, elements in content.items():
            translated_elements = []
            for element in elements:
                translated_elements.append(translate_text(element, target_lang="es"))
            translated_content[key] = translated_elements
        # Save the translated content to a new JSON file
        translated_filename = f"translated_{filename}"
        translated_file_path = os.path.join(translated_dir, translated_filename)
        with open(translated_file_path, "w", encoding="utf-8") as f:
            json.dump(translated_content, f, ensure_ascii=False, indent=4)
        print(f"Translated content saved to {translated_file_path}")
.htaccess Mods
Because .htaccess redirects visitors from /index.html to '/', the ?_translate parameter would otherwise be lost. We need to modify the file so the parameter survives the redirect and the translated content renders when Spanish is requested – home page only.
# ---- index.html Redirection ----
RewriteCond %{THE_REQUEST} /index\.html\?_translate=([a-z]+) [NC]
RewriteRule ^ /?_translate=%1 [R=301,L]
# Redirect /index.html to /
RewriteCond %{THE_REQUEST} /index\.html [NC]
RewriteRule ^ / [R=301,L]
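Once the rules are live, a quick check confirms the parameter survives the redirect. A sketch using requests; expect a 301 with Location: /?_translate=es:

import requests

# Request the pre-redirect URL without following it so we can inspect the hop
response = requests.get(
    "https://www.diazdentalstudio.com/index.html?_translate=es",
    allow_redirects=False,
)
print(response.status_code, response.headers.get("Location"))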
JavaScript File (translation-loader.js)
// Load the appropriate translation file based on the selected language
function loadTranslation(lang, page) {
    if (lang === 'en') {
        return;
    }
    const url = `/translated_${page}.html.json`;
    fetch(url)
        .then(response => {
            if (!response.ok) {
                throw new Error('Translation file not found');
            }
            return response.json();
        })
        .then(data => {
            // Apply translations (h1, h2, paragraphs, etc.); each array is
            // guarded in case a tag is missing from the JSON file
            const h1Element = document.querySelector('h1');
            if (h1Element && data.h1 && data.h1.length) {
                h1Element.textContent = data.h1[0];
            }
            const h2Elements = document.querySelectorAll('h2');
            (data.h2 || []).forEach((h2Text, index) => {
                if (h2Elements[index]) {
                    h2Elements[index].textContent = h2Text;
                }
            });
            const paragraphElements = document.querySelectorAll('p');
            (data.p || []).forEach((pText, index) => {
                if (paragraphElements[index]) {
                    paragraphElements[index].textContent = pText;
                }
            });
            const anchorElements = document.querySelectorAll('a');
            (data.a || []).forEach((aText, index) => {
                if (anchorElements[index]) {
                    anchorElements[index].textContent = aText;
                }
            });
            const spanElements = document.querySelectorAll('span');
            (data.span || []).forEach((spanText, index) => {
                if (spanElements[index]) {
                    spanElements[index].textContent = spanText;
                }
            });
        })
        .catch(error => {
            console.error('Error loading translation:', error);
        });
}

// Toggle between English and Spanish
function toggleLanguage() {
    const urlParams = new URLSearchParams(window.location.search);
    const currentLang = urlParams.get('_translate') || 'en';
    const newLang = currentLang === 'es' ? 'en' : 'es';
    // Create a new URL with the updated language parameter
    const newUrl = `${window.location.pathname}?_translate=${newLang}`;
    // Redirect to the new URL to reload the content in the selected language
    window.location.href = newUrl;
}

// Initialize translation when the page loads
function initTranslation() {
    const urlParams = new URLSearchParams(window.location.search);
    const lang = urlParams.get('_translate') || 'en'; // Default to English if no language is specified
    // Fall back to 'index' when the path is '/' so the home page still resolves a file
    const page = window.location.pathname.split('/').pop().replace('.html', '') || 'index';
    loadTranslation(lang, page); // Load the translation for the detected language
}

// Wire up the language toggle button once the DOM is fully loaded
document.addEventListener('DOMContentLoaded', () => {
    const toggleButton = document.getElementById('language-toggle-button');
    if (toggleButton) {
        toggleButton.addEventListener('click', toggleLanguage);
    }
    // Initialize the translation on page load
    initTranslation();
});
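Before this loader can fetch anything, the translated_*.json files from step 3 have to be copied into the domain root (httpdocs, per the note above). A minimal deployment sketch; the docroot path is an assumption about your hosting layout, so adjust it to your host:

import os
import shutil

translated_dir = "translated_jsons"
docroot = "/var/www/vhosts/diazdentalstudio.com/httpdocs"  # assumption: adjust to your host

# Copy every translated JSON file into the web root so the loader can fetch it
for name in os.listdir(translated_dir):
    if name.endswith(".json"):
        shutil.copy2(os.path.join(translated_dir, name), os.path.join(docroot, name))
        print(f"Deployed {name}")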
Where to Place the toggleLanguage Function
The toggleLanguage function goes inside your translation-loader.js file, which is already linked in your HTML with:
<script src="assets/js/translation-loader.js" defer></script>
HTML Button
Used on the site to toggle between English and Spanish.
<a href="javascript:void(0);" id="language-toggle-button" class="btn-gray">Español</a>
How It Works:
- When the page reloads, initTranslation() is called, and the translation is applied if the ?_translate=es parameter is detected.
- The toggleLanguage function is called when the user clicks the "Español" button.
- The function checks the current language and toggles between English (en) and Spanish (es).
- It updates the URL with the correct ?_translate parameter and reloads the page.
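Finally, you can confirm every page in the sitemap resolves its translation file once deployed. A sketch that mirrors the filename logic in translation-loader.js:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

sitemap_url = "https://www.diazdentalstudio.com/sitemap.xml"
soup = BeautifulSoup(requests.get(sitemap_url).content, "xml")

for loc in soup.find_all("loc"):
    # Mirror the loader: last path segment, '.html' stripped, 'index' for '/'
    path = urlparse(loc.get_text()).path
    page = path.rstrip("/").rsplit("/", 1)[-1].replace(".html", "") or "index"
    url = f"https://www.diazdentalstudio.com/translated_{page}.html.json"
    status = requests.head(url).status_code
    print(f"{url}: {status}")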