Converting a .dotx file to a .docx file with Python

It used to be that one could simply rename a Word Template .dot file to .doc. However, Microsoft has made this somewhat harder with the newer .dotx format.

One way to do this is to use Word Automation. However, another way is to do some XML manipulation, given that .dotx and .docx files are actually zip archives.

import xml.dom.minidom
import zipfile

def dotx2docx(src, dst):
    """Convert a Word template (.dotx) archive into a Word document (.docx).

    Every entry of the source zip archive is copied to the destination.
    The only entry that is modified is ``[Content_Types].xml``: the
    ``Override`` element for ``/word/document.xml`` has its ``ContentType``
    switched from the "template" main type to the "document" main type.

    Args:
        src: Source archive, passed straight to ``zipfile.ZipFile`` for
            reading.
        dst: Destination archive, passed straight to ``zipfile.ZipFile``
            for writing (mode ``'w'``).

    Returns:
        None. The result is written to ``dst``.
    """
    OLD_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml"
    NEW_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
    CONTENTTYPE_FILENAME = '[Content_Types].xml'
    MAIN_PART_NAME = '/word/document.xml'

    # Context managers guarantee both archives are closed (and the output's
    # central directory is written) even if copying fails part-way through.
    with zipfile.ZipFile(src, 'r') as arc_in, zipfile.ZipFile(dst, 'w') as arc_out:
        for zinfo in arc_in.infolist():
            contents = arc_in.read(zinfo.filename)

            if zinfo.filename == CONTENTTYPE_FILENAME:
                doc = xml.dom.minidom.parseString(contents)
                try:
                    for el in doc.getElementsByTagName('Override'):
                        if (el.getAttribute('ContentType') == OLD_CONTENT_TYPE
                                and el.getAttribute('PartName') == MAIN_PART_NAME):
                            el.setAttribute('ContentType', NEW_CONTENT_TYPE)
                    # Serialise as bytes so the XML declaration states the
                    # encoding that is actually written to the archive.
                    contents = doc.toxml(encoding='UTF-8')
                finally:
                    # Break minidom's internal reference cycles promptly.
                    doc.unlink()

            # Passing the ZipInfo (instead of just the name) keeps each
            # entry's compression method, timestamp and attributes.
            arc_out.writestr(zinfo, contents)

About this entry