Automating translation of software using the Microsoft Translator and Python

January 25, 2013 at 07:40 AM | categories: Sysadmin, Tips, Unix, Linux | View Comments

The Microsoft translator provides an API that you can use for automated translation. It currently supports about 39 languages.

True to the nature of open source, I found that someone had already written a Python wrapper for the API. I extended the wrapper to use the requests and polib packages.

My extended script reads gettext Portable Object (PO) source files, translates the strings, and writes the translations back into the PO files, thereby automating the whole translation process.

Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4

"""Automatic translation using m$ translator
    :copyright: 2012 Andrew Colin Kissa
    :copyright: 2011 by Openlabs Technologies & Consulting (P) Limited
    :license: BSD, see LICENSE for more details.
"""

import os
import re
import json
import time
import urllib
import logging
import datetime

import requests

from optparse import OptionParser

from polib import pofile

__all__ = ['Translator', 'TranslateApiException']

class ArgumentException(Exception):
    """Error the service reports with an ``ArgumentException`` prefix.

    The ``'ArgumentException: '`` marker is removed from the text; the
    cleaned text is stored on ``message`` and used as the exception argument.
    """
    def __init__(self, message):
        cleaned = message.replace('ArgumentException: ', '')
        self.message = cleaned
        super(ArgumentException, self).__init__(cleaned)

class ArgumentOutOfRangeException(Exception):
    """Error the service reports with an ``ArgumentOutOfRangeException`` prefix.

    The ``'ArgumentOutOfRangeException: '`` marker is removed from the text;
    the cleaned text is stored on ``message`` and used as the exception
    argument.
    """
    def __init__(self, message):
        cleaned = message.replace('ArgumentOutOfRangeException: ', '')
        self.message = cleaned
        super(ArgumentOutOfRangeException, self).__init__(cleaned)

class TranslateApiException(Exception):
    """Error the service reports with a ``TranslateApiException`` prefix.

    The ``'TranslateApiException: '`` marker is removed from the text; the
    cleaned text is stored on ``message`` and becomes the first exception
    argument, followed by any extra positional arguments.
    """
    def __init__(self, message, *args):
        cleaned = message.replace('TranslateApiException: ', '')
        self.message = cleaned
        super(TranslateApiException, self).__init__(cleaned, *args)

class Translator(object):
    """Client for the AJAX endpoints of the Microsoft Translator V2 service.

    An OAuth access token is requested on first use and sent as a Bearer
    token with every call. All HTTP traffic goes through one ``requests``
    session that is created lazily.
    """
    lang_url = 'http://api.microsofttranslator.com/V2/Ajax.svc/GetLanguagesForTranslate'
    oauth_url = 'https://datamarket.accesscontrol.windows.net/v2/OAuth2-13'
    translate_url = "http://api.microsofttranslator.com/V2/Ajax.svc/Translate"
    translate_array_url = "http://api.microsofttranslator.com/V2/Ajax.svc/TranslateArray"

    def __init__(self, client_id, client_secret,
            scope="http://api.microsofttranslator.com", debug=False):
        """
        :param client_id: The client ID that you specified when you registered
                          your application with Azure DataMarket.
        :param client_secret: The client secret value that you obtained when
                              you registered your application with Azure
                              DataMarket.
        :param scope: Defaults to http://api.microsofttranslator.com
        :param debug: If true, the logging level will be set to debug
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.scope = scope
        self.grant_type = "client_credentials"
        self.access_token = None
        self.debug = debug
        self.logger = logging.getLogger("microsofttranslator")
        # Created on demand by create_session().
        self.session = None
        # Cache for the language list returned by languages().
        self.langs = []
        if self.debug:
            self.logger.setLevel(level=logging.DEBUG)

    def create_session(self):
        "create a requests session"
        self.session = requests.session()

    def get_access_token(self, force=None):
        """Request a new access token and remember it on the instance.

        .. note::
            The value of access token can be used for subsequent calls to the
            Microsoft Translator API. The access token expires after 10
            minutes, so callers should track the elapsed time since the token
            was issued and call this method again before it runs out.

        :param force: If true, a fresh requests session is created before
            the token is requested, even when a session already exists.
        :return: The access token to be used with subsequent requests
        """
        args = urllib.urlencode({
            'client_id': self.client_id,
            'client_secret': self.client_secret,
            'scope': self.scope,
            'grant_type': self.grant_type
        })

        if not self.session or force:
            self.create_session()

        response = json.loads(self.session.post(
            self.oauth_url,
            data=args
        ).content)
        self.access_token = response['access_token']
        return self.access_token

    def call(self, url, params):
        """Send a GET request to ``url`` with ``params`` urlencoded.

        :param url: The endpoint to call.
        :param params: A dict of query parameters.
        :return: The decoded JSON response.
        :raises ArgumentOutOfRangeException: the response is a string
            starting with ``ArgumentOutOfRangeException``.
        :raises TranslateApiException: the response is a string starting
            with ``TranslateApiException``.
        :raises ArgumentException: the response is a string starting with
            ``ArgumentException``; the stored access token is discarded
            first so that the next call requests a new one.
        """
        if not self.access_token:
            self.get_access_token()

        if not self.session:
            self.create_session()

        headers = {'Authorization': 'Bearer %s' % self.access_token}
        translation_url = '%s?%s' % (url, urllib.urlencode(params))
        response = self.session.get(translation_url, headers=headers)
        # "UTF-8-sig" strips a leading byte order mark if one is present.
        retval = json.loads(response.content.decode("UTF-8-sig"))

        # Errors arrive as a plain JSON string prefixed with the error name.
        if isinstance(retval, basestring):
            if retval.startswith("ArgumentOutOfRangeException"):
                raise ArgumentOutOfRangeException(retval)
            if retval.startswith("TranslateApiException"):
                raise TranslateApiException(retval)
            if retval.startswith("ArgumentException"):
                self.access_token = None
                raise ArgumentException(retval)

        return retval

    def languages(self):
        """Return the languages supported for translation.

        The list is fetched from the service once and cached on the instance.
        """
        if not self.session:
            self.create_session()

        if not self.langs:
            self.langs = self.call(self.lang_url, {})
        return self.langs

    def translate(self, text, to_lang, from_lang=None,
            content_type='text/plain'):
        """Translates a text string from one language to another.

        :param text: A string representing the text to translate.
        :param to_lang: A string representing the language code to
            translate the text into.
        :param from_lang: A string representing the language code of the
            translation text. If left None the response will include the
            result of language auto-detection. (Default: None)
        :param content_type: The format of the text being translated.
            The supported formats are "text/plain" and "text/html". Any HTML
            needs to be well-formed.
        :return: The decoded response of the Translate endpoint.
        """
        params = {
            'text': text.encode('utf8'),
            'to': to_lang,
            'contentType': content_type,
            'category': 'general',
            }
        if from_lang is not None:
            params['from'] = from_lang
        return self.call(self.translate_url, params)

    def translate_array(self, texts, to_lang, from_lang=None, **options):
        """Translates an array of text strings from one language to another.

        :param texts: A list containing texts for translation.
        :param to_lang: A string representing the language code to
            translate the text into.
        :param from_lang: A string representing the language code of the
            translation text. If left None the response will include the
            result of language auto-detection. (Default: None)
        :param options: A TranslateOptions element containing the values below.
            They are all optional and default to the most common settings.

                Category: A string containing the category (domain) of the
                    translation. Defaults to "general".
                ContentType: The format of the text being translated. The
                    supported formats are "text/plain" and "text/html". Any
                    HTML needs to be well-formed.
                Uri: A string containing the content location of this
                    translation.
                User: A string used to track the originator of the submission.
                State: User state to help correlate request and response. The
                    same contents will be returned in the response.
        :return: The decoded response of the TranslateArray endpoint.
        """
        request_options = {
            'Category': "general",
            'ContentType': "text/plain",
            'Uri': '',
            'User': 'default',
            'State': ''
            }
        # dict.update() returns None, so merge in place and keep the dict.
        request_options.update(options)
        params = {
            'texts': json.dumps(texts),
            'to': to_lang,
            'options': json.dumps(request_options),
            }
        if from_lang is not None:
            params['from'] = from_lang

        return self.call(self.translate_array_url, params)

def format_date():
    """Return the current local time as ``YYYY-MM-DD HH:MM+ZZZZ``.

    The UTC offset is taken from the running system's local time zone
    (including daylight saving time when it is in effect) rather than
    being fixed, so the stamp is correct wherever the script runs.
    """
    now = time.localtime()
    # time.timezone / time.altzone are seconds *west* of UTC, hence the sign flip.
    if now.tm_isdst > 0 and time.daylight:
        offset = -time.altzone
    else:
        offset = -time.timezone
    sign = '-' if offset < 0 else '+'
    hours, minutes = divmod(abs(offset) // 60, 60)
    # %H:%M is the portable spelling of the non-standard %R directive.
    stamp = time.strftime("%Y-%m-%d %H:%M", now)
    return "%s%s%02d%02d" % (stamp, sign, hours, minutes)

def first_pass(items, thestring):
    """Mask each entry of ``items`` in ``thestring`` with the ``|^^|`` marker.

    Entries are handled in order and each one replaces only its first
    remaining occurrence. Returns the masked string.
    """
    masked = thestring
    for placeholder in items:
        masked = masked.replace(placeholder, '|^^|', 1)
    return masked

# Matches the placeholder marker whether it came back untouched ("|^^|")
# or with whitespace inserted between its characters ("| ^ ^ |").
_PLACEHOLDER_RE = re.compile(r'\|\s*\^\s*\^\s*\|')

def second_pass(items, thestring):
    """Put the original variables back in place of the markers.

    Markers are filled strictly left to right, one item per marker, so the
    result is positionally correct even when the string mixes the compact
    and the spaced spelling of the marker.

    :param items: The variables that were masked, in their original order.
    :param thestring: The text containing the markers.
    :return: The text with markers replaced. Markers beyond the number of
        items are left untouched; surplus items are ignored.
    """
    pending = iter(items)

    def _restore(match):
        # Fall back to the marker itself once the items are used up.
        return next(pending, match.group(0))

    return _PLACEHOLDER_RE.sub(_restore, thestring)

def getpofs(matched, dirname, files):
    """Directory-walk callback that collects PO files.

    Every name in ``files`` ending in ``.po`` is joined onto ``dirname``
    and appended to the ``matched`` list, which is modified in place.
    """
    for name in files:
        if name.endswith('.po'):
            matched.append(os.path.join(dirname, name))

def get_lang(dirname):
    """Return the name of the directory two levels above the given path.

    For ``.../fr/LC_MESSAGES/app.po`` this yields ``fr``.
    """
    containing_dir = os.path.dirname(dirname)
    language_dir = os.path.dirname(containing_dir)
    return os.path.basename(language_dir)

def process(translator, raw_entry, language, sentry, regex):
    """Translate one string while protecting its format variables.

    Variables matched by ``regex`` are masked before the text is sent for
    translation and restored afterwards. The access token is renewed when
    the renewal deadline has passed.

    :param translator: Object providing ``translate(text, lang)`` and
        ``get_access_token(force)``.
    :param raw_entry: The source string to translate.
    :param language: The target language code.
    :param sentry: A ``datetime`` after which the token must be renewed.
    :param regex: Compiled pattern matching the variables to protect.
    :return: The translated string, or None when the string contains
        variables and the target language is one of the listed
        right-to-left languages (such entries are skipped).
    """
    languages_bidi = ["he", "ar", "fa", "yi"]
    found = regex.findall(raw_entry)
    if found:
        if language in languages_bidi:
            return None
        raw_entry = first_pass(found, raw_entry)

    # The caller's ``sentry`` is never updated by this function, so the
    # deadline set by the last renewal is remembered on the translator.
    # Without it, every call after the first deadline would request a
    # brand new token.
    remembered = getattr(translator, '_token_renew_after', None)
    deadline = sentry if remembered is None else max(remembered, sentry)
    if datetime.datetime.now() >= deadline:
        print("Renewing token")
        translator.get_access_token(True)
        translator._token_renew_after = (datetime.datetime.now() +
                datetime.timedelta(minutes=8))

    new_entry = translator.translate(raw_entry, language)
    if found:
        new_entry = second_pass(found, new_entry)
    return new_entry

def createps(filename, client_id, api_key, meta, default_lang):
    """Translate the untranslated entries of one PO file and save it.

    The target language is derived from the file's path with ``get_lang``.
    Nothing is done when that language is the source language or is not
    offered by the API; ``zh`` bypasses both checks and is sent to the
    API as ``zh-CHS``.

    :param filename: Path of the PO file to update in place.
    :param client_id: Client ID passed to ``Translator``.
    :param api_key: Client secret passed to ``Translator``.
    :param meta: Dict of metadata entries merged into the PO header when
        the file is saved.
    :param default_lang: Language code of the source strings.

    Translation errors for a single entry are printed and the entry is
    skipped. An ``ArgumentException`` aborts the file without saving it.
    The file is only written when at least one translation was stored.
    """
    do_save = False
    trans = Translator(client_id, api_key)
    print("Processing: %s" % filename)
    pobj = pofile(filename)
    lang = get_lang(filename)

    try:
        if lang != 'zh':
            if lang == default_lang:
                print("Language: %s is the source language, skipping" % lang)
                return
            # Looked up inside the try block so that an access error here
            # is reported the same way as one raised during translation.
            if lang not in trans.languages():
                print("Language: %s not supported by API" % lang)
                return

        # Matches %(name)s / %(name)d and {{name}} style variables.
        match_re = re.compile(r'((?:%\([^\W]{1,}\)(?:s|d))|(?:{{\w+}}))')
        sentry = datetime.datetime.now() + datetime.timedelta(minutes=8)
        if lang == 'zh':
            lang = 'zh-CHS'
        for entry in pobj.untranslated_entries():
            try:
                msgstr = process(trans, entry.msgid, lang, sentry, match_re)
                if entry.msgid_plural:
                    if msgstr:
                        entry.msgstr_plural['0'] = msgstr
                        do_save = True
                    msgstr_plural = process(trans, entry.msgid_plural,
                                            lang, sentry, match_re)
                    if msgstr_plural:
                        entry.msgstr_plural['1'] = msgstr_plural
                        do_save = True
                elif msgstr:
                    entry.msgstr = msgstr
                    do_save = True
            except (TranslateApiException, ArgumentOutOfRangeException) as ermsg:
                print('Error occured: %s' % str(ermsg))

        # Only rewrite and re-stamp the file when something actually changed.
        if do_save:
            pobj.metadata.update(meta)
            pobj.metadata['PO-Revision-Date'] = format_date()
            pobj.save(filename)
    except ArgumentException as errstr:
        print("Access Error: %s" % str(errstr))

if __name__ == '__main__':
    # Command line entry point: translate every PO file found below the
    # directory given as the single positional argument.

    # Credentials obtained from Azure DataMarket; fill these in before use.
    CLIENT_ID = 'xxxxx'
    API_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

    # Header entries written into every PO file that gets updated.
    metadata = {
        'Report-Msgid-Bugs-To': 'baruwa@lists.baruwa.org',
        'Last-Translator': 'Andrew Colin Kissa <andrew@topdog.za.net>',
        'Generated-By': 'auto-translate.py 0.0.1',
        'Language-Team': 'Baruwa Project',
    }

    usage = "usage: %prog directory"
    parser = OptionParser(usage)
    parser.add_option('-s', '--source', dest="source_lang", default="en")
    opts, arguments = parser.parse_args()

    if len(arguments) != 1:
        parser.error("Please specify the directory to process")

    directory = arguments[0]

    # isdir() also rejects a path that exists but is a regular file.
    if not os.path.isdir(directory):
        parser.error("Directory: %s does not exist" % directory)

    try:
        pofiles = []
        # os.walk passes only file names to the filter, never directory names.
        for dirpath, _subdirs, filenames in os.walk(directory):
            getpofs(pofiles, dirpath, filenames)
        for path in pofiles:
            createps(path, CLIENT_ID, API_KEY, metadata, opts.source_lang)
    except KeyboardInterrupt:
        print("\nCTRL-C pressed, exiting")

Usage

You need to obtain access credentials (a client ID and a client secret) from the Azure Marketplace; instructions can be found here.

Set the CLIENT_ID and API_KEY variables in the script to the values you obtain from Azure DataMarket.

Then point the script to the directory containing the PO files you want to translate and let it do its thing.

./auto-translate.py <directory>

Download

You can download the script from GitHub.


blog comments powered by Disqus