1 files changed, 263 insertions, 138 deletions
diff --git a/web-export/update.py b/web-export/update.py
index dc15a59..1f43676 100755
--- a/web-export/update.py
+++ b/web-export/update.py
@@ -1,140 +1,265 @@
 #!/usr/bin/env python
 
-import os,re,string
-
-xmlto = "/usr/bin/xmlto"
-cvs = "http://cvs.freedesktop.org/"
-git = "http://cgit.freedesktop.org/"
-specindex = "specs.idx"
-
-try:
-	f = open(specindex, 'r')
-	lastname = ''
-	lastpath = ''
-	for line in f.readlines():
-		line = line.strip()
-		if not line or line.startswith('#'):
-			continue
-
-		(file, revision, version, path) = string.split(line)
-		use_git = False
-		if file.startswith("git:"):
-			use_git = True
-			git_data = file.split(":")
-			git_repo = git_data[1]
-			file = git_data[2]
-		name = os.path.splitext(os.path.split(file)[1])[0]
-		# Strip version from name
-		if re.search("\d\.\d+$", name):
-			name = re.sub("^(.*)-([^/]*)$", "\\1", name)
-
-		if use_git:
-			url = '%s%s/plain/%s?id=%s' % (git, git_repo, file, revision)
-		else:
-			url = '%s%s?rev=%s' % (cvs, file, revision)
-
-		if re.search("\.xml$", file):
-			os.system("mkdir %s 2> /dev/null" % (path))
-			if lastpath != path and lastname != name:
-				os.system("rm -f %s/%s-latest.html" % (path, name))
-				os.system("cd %s; ln -s %s-%s.html %s-latest.html" % (path,name,version,name))
-				os.system("rm -f %s/latest" % (path))
-				os.system("cd %s; ln -s %s latest" % (path,version))
-
-			# if ( lastpath == path and lastname == name and os.path.isfile("%s/%s-%s.xml" % (path, name, version))):
-			#	print "Updating", file, "Version", version, "rev", revision, "skipped."
-			#	continue
-
-			if os.system("wget -q '%s' -O wget.xml && (diff -q wget.xml %s/%s-%s.xml || mv wget.xml %s/%s-%s.xml)" % (url, path, name, version, path, name, version)):
-				print "Updating", file, "Version", version, "rev", revision, "FAILED."
-                        os.system("chmod g+w wget.xml");
-
-			print "Updating", file, "Version", version, "rev", revision, "ok"
-				
-		elif re.search("\.txt$", file):
-			os.system("mkdir %s 2> /dev/null" % (path))
-			if lastpath != path and lastname != name:
-				os.system("rm -f %s/%s-latest.txt" % (path, name))
-				os.system("cd %s; ln -s %s-%s.txt %s-latest.txt" % (path,name,version,name))
-
-			if ( lastpath == path and lastname == name and os.path.isfile("%s/%s-%s.txt" % (path, name, version))):
-				print "Updating", file, "Version", version, "rev", revision, "skipped."
-				continue
-
-			if os.system("wget -q '%s' -O wget.txt && (diff -q wget.txt %s/%s-%s.txt || mv wget.txt %s/%s-%s.txt)" % (url, path, name, version, path, name, version)):
-				print "Updating", file, "Version", version, "rev", revision, "FAILED."
-                        os.system("chmod g+w wget.txt");
-
-			print "Updating", file, "Version", version, "rev", revision, "ok"
-
-		elif re.search("\.dtd$", file):
-			os.system("mkdir %s 2> /dev/null" % (path))
-			if lastpath != path and lastname != name:
-				os.system("rm -f %s/%s-latest.dtd" % (path, name))
-				os.system("cd %s; ln -s %s-%s.dtd %s-latest.dtd" % (path,name,version,name))
-
-			if ( lastpath == path and lastname == name and os.path.isfile("%s/%s-%s.dtd" % (path, name, version))):
-				print "Updating", file, "Version", version, "rev", revision, "skipped."
-				continue
-
-			if os.system("wget -q '%s' -O wget.dtd && (diff -q wget.dtd %s/%s-%s.dtd || mv wget.dtd %s/%s-%s.dtd)" % (url, path, name, version, path, name, version)):
-				print "Updating", file, "Version", version, "rev", revision, "FAILED."
-                        os.system("chmod g+w wget.dtd");
-
-			print "Updating", file, "Version", version, "rev", revision, "ok"
-		else:
-			print "Skipping", file, ", unknown file."
-			continue
-
-		lastname = name
-		lastpath = path
-
-except IOError:
-	print "Can't open", specindex
-
-
-specs = os.listdir(".")
-
-for spec in specs:
-	if not os.path.isdir(spec):
-		continue
-	versions = os.listdir(spec)
-	for file in versions:
-		if re.search("\.xml$", file):
-			tmp = re.sub("(.*)(\.xml)$", "\\1", file)
-			name = re.sub("^(.*)-([^/]*)$", "\\1", tmp)
-			ver = re.sub("^(.*)-([^/]*)$", "\\2", tmp)
-			
-			print "Check", os.path.join(spec,ver), os.path.isdir(os.path.join(spec,ver))
-			print "Check", os.path.join(spec,name+"-"+ver+".html"), os.path.isfile(os.path.join(spec,name+"-"+ver+".html"))
-
-			if (	not os.path.isdir(os.path.join(spec,ver))
-			    	or not os.path.isfile(os.path.join(spec,name+"-"+ver+".html"))
-				or os.path.getmtime(os.path.join(spec,file)) > os.path.getmtime(os.path.join(spec,name+"-"+ver+".html"))):
-				os.system("rm -fR %s/%s" % (spec,ver))
-				os.system("rm -f %s/%s-%s.html" % (spec,name,ver))
-				os.system("mkdir %s/%s" % (spec,ver))
-				os.system("cd %s/%s; %s html ../%s" % (spec,ver,xmlto,file))
-				# os.system("mv index.html %s/%s-%s.html" % (spec,name,ver))
-				# os.system("sed -i %s/%s-%s.html -e 's/index.html/%s-%s.html/;'" % (spec,name,ver,name,ver))
-				os.system("cd %s;%s html-nochunks %s" % (spec,xmlto,file))
-		elif re.search("(?<!latest)\.html$", file) and not os.path.isfile(os.path.join(spec,re.sub("html","xml",file))):
-				tmp = re.sub("(.*)(\.html)$", "\\1", file)
-				name = re.sub("^(.*)-([^/]*)$", "\\1", tmp)
-				ver = re.sub("^(.*)-([^/]*)$", "\\2", tmp)
-				os.system("rm -fR %s/%s" % (spec,ver))
-				os.system("rm -f %s/%s-%s.html" % (spec,name,ver))
-	for file in versions:
-		if re.search("-latest\.dtd$", file):
-			# Do nothing
-			print "Skipping", file
-		elif re.search("\.dtd$", file):
-			tmp = re.sub("(.*)(\.dtd)$", "\\1", file)
-			name = re.sub("^(.*)-([^/]*)$", "\\1", tmp)
-			ver = re.sub("^(.*)-([^/]*)$", "\\2", tmp)
-			
-			print "Check", os.path.join(spec,ver), os.path.isdir(os.path.join(spec,ver))
-			print "Check", os.path.join(spec,name+"-"+ver+".html"), os.path.isfile(os.path.join(spec,name+"-"+ver+".html"))
-
-			os.system("mkdir %s/%s" % (spec,ver))
-			os.system("cp %s/%s-%s.dtd %s/%s/%s.dtd" % (spec,name,ver,spec,ver,name))
+# Dependencies to run this:
+#  - xmlto in $PATH
+
+# FIXME:
+#  - correctly handle all exceptions
+#  - copy dtd files where they should be
+#  - new structure for website:
+#    specs.fd.o/index.html -- general index
+#    specs.fd.o/desktop-entry/index.html -- index of all versions of desktop entry, with all formats
+#    specs.fd.o/desktop-entry/1.0/desktop-entry-spec.xml -- docbook version of the spec 1.0
+#    specs.fd.o/desktop-entry/1.0/index.html -- one-page html version of the spec 1.0
+#    specs.fd.o/desktop-entry/1.0/split/ -- multiple-page html version of the spec 1.0
+#    specs.fd.o/desktop-entry/latest/ -- link to directory containing latest version of the spec
+
+import os
+import sys
+
+import errno
+
+import StringIO
+import hashlib
+import shutil
+import subprocess
+import urllib
+import urllib2
+import urlparse
+
+DEVELOPMENT = False  # when true: skip the self-update check and read a local specs.idx instead of fetching it
+
+CVSWEB = 'http://cvs.freedesktop.org'  # base URL for objects whose vcs is 'cvs'
+GITWEB = 'http://cgit.freedesktop.org'  # base URL for objects whose vcs is 'git'
+HASH = 'md5'  # default algo of the get_hash_* helpers
+
+
+def safe_mkdir(dir):
+    """Create dir if needed; no-op for a false dir. Raises OSError if it cannot be a directory."""
+    if not dir:
+        return
+    try:
+        os.mkdir(dir)
+    except OSError, e:
+        if e.errno != errno.EEXIST or not os.path.isdir(dir):
+            raise  # bare raise keeps the original traceback (Python 2)
+
+
+def get_hash_from_fd(fd, algo = HASH, read_blocks = 1024):
+    """Return the raw digest of everything read from fd, in reads of at most read_blocks.
+    Raises Exception for any algo other than 'md5'."""
+    if algo not in [ 'md5' ]:
+        raise Exception('Internal error: hash algorithm \'%s\' not planned in code.' % algo)
+    hasher = hashlib.new(algo)
+    chunk = fd.read(read_blocks)
+    while chunk:
+        hasher.update(chunk)
+        chunk = fd.read(read_blocks)
+    return hasher.digest()
+
+
+def get_hash_from_url(url, algo = HASH):  # raw digest of whatever urllib2 returns for url
+    response = urllib2.urlopen(url, None)
+    result = get_hash_from_fd(response, algo)
+    response.close()
+    return result
+
+
+def get_hash_from_path(path, algo = HASH):  # raw digest of the file at path, opened in binary mode
+    stream = open(path, 'rb')
+    result = get_hash_from_fd(stream, algo, read_blocks = 32768)
+    stream.close()
+    return result
+
+
+def get_hash_from_data(data, algo = HASH):  # raw digest of an in-memory string, hashed through StringIO
+    buf = StringIO.StringIO(data)
+    result = get_hash_from_fd(buf, algo, read_blocks = 32768)
+    buf.close()
+    return result
+
+
+class VcsObject:
+    """One file, optionally pinned to a revision, served by cgit ('git') or the CVS web ('cvs')."""
+    def __init__(self, vcs, repo, file, revision = None):
+        self.vcs = vcs
+        self.repo = repo
+        self.file = file
+        self.revision = revision
+        self.data = None  # content cache, filled by fetch()
+
+    def get_url(self):
+        """Return the download URL of the file; raises Exception for an unknown vcs."""
+        query = {}
+        if self.vcs == 'git':
+            baseurl = GITWEB
+            path = '/'.join((self.repo, 'plain', self.file))
+            if self.revision:
+                query['id'] = self.revision
+        elif self.vcs == 'cvs':
+            baseurl = CVSWEB
+            path = self.file
+            if self.revision:
+                query['rev'] = self.revision
+        else:
+            raise Exception('Unknown VCS: %s' % self.vcs)
+        (scheme, netloc, basepath) = urlparse.urlsplit(baseurl)[0:3]
+        full_path = '/'.join((basepath, path))
+        return urlparse.urlunsplit((scheme, netloc, full_path, urllib.urlencode(query), ''))
+
+    def fetch(self):
+        """Download the content into self.data once; an empty file still counts as fetched."""
+        if self.data is None:
+            fd = urllib2.urlopen(self.get_url(), None)
+            try:
+                self.data = fd.read()
+            finally:
+                fd.close()
+
+    def get_hash(self):
+        """Return the raw digest of the content, fetching it first if needed."""
+        self.fetch()
+        return get_hash_from_data(self.data)
+
+
+class SpecObject():
+    """One version of a spec: its VCS source, its local copy, HTML renderings and 'latest' links."""
+
+    def __init__(self, vcs, spec_dir, version):
+        """Derive the local file name from vcs.file and version; raises Exception for an unsupported extension."""
+        self.vcs = vcs
+        self.spec_dir = spec_dir
+        self.version = version
+
+        basename = os.path.basename(self.vcs.file)
+        (self.basename_no_ext, self.ext) = os.path.splitext(basename)
+        self.filename = '%s-%s%s' % (self.basename_no_ext, self.version, self.ext)
+
+        if self.ext not in ['.xml', '.sgml', '.txt', '.dtd']:
+            raise Exception('Format \'%s\' not supported for %s' % (self.ext, self.vcs.get_url()))
+
+        self.downloaded = False       # True once download() has (re)written the local file
+        self.one_chunk = False        # True once htmlize() has built the single-page HTML
+        self.multiple_chunks = False  # True once htmlize() has built the chunked HTML
+
+    def download(self):
+        """Write the VCS content to spec_dir/filename unless an identical copy is already there."""
+        safe_mkdir(self.spec_dir)
+        path = os.path.join(self.spec_dir, self.filename)
+
+        # Same digest locally and in the VCS: keep the file and leave self.downloaded unset
+        if os.path.exists(path) and get_hash_from_path(path) == self.vcs.get_hash():
+            return
+
+        self.vcs.fetch()
+        fd = open(path, 'wb')
+        try:
+            fd.write(self.vcs.data)
+        finally:
+            fd.close()
+
+        self.downloaded = True
+
+    def htmlize(self, force = False):
+        """Convert a freshly downloaded (or forced) .xml spec to HTML with xmlto.
+
+        Other formats are left alone. Raises Exception when xmlto exits non-zero.
+        """
+        if not self.downloaded and not force:
+            return
+        if self.ext != '.xml':
+            return
+
+        path = os.path.join(self.spec_dir, self.filename)
+        path_no_ext = os.path.splitext(path)[0]
+
+        # One-chunk HTML: remove the previous output before regenerating it
+        html_path = '%s%s' % (path_no_ext, '.html')
+        if os.path.exists(html_path):
+            os.unlink(html_path)
+        if subprocess.call(['xmlto', '-o', self.spec_dir, 'html-nochunks', path]) != 0:
+            raise Exception('Cannot convert \'%s\' to HTML.' % path)
+        self.one_chunk = True
+
+        # Multiple chunks: rebuilt from scratch in a directory named after the version
+        html_dir = os.path.join(self.spec_dir, self.version)
+        if os.path.exists(html_dir):
+            shutil.rmtree(html_dir)
+        safe_mkdir(html_dir)
+        if subprocess.call(['xmlto', '-o', html_dir, 'html', path]) != 0:
+            raise Exception('Cannot convert \'%s\' to multiple-chunks HTML.' % path)
+        self.multiple_chunks = True
+
+    def latestize(self):
+        """Point the '-latest' file links and the 'latest' directory link at this version."""
+        # lexists(), not exists(): a dangling link left by an earlier run must be
+        # removed as well, otherwise os.symlink() fails with EEXIST.
+        path_latest = os.path.join(self.spec_dir, '%s-latest%s' % (self.basename_no_ext, self.ext))
+        if os.path.lexists(path_latest):
+            os.unlink(path_latest)
+        os.symlink(self.filename, path_latest)
+
+        if self.ext != '.xml':
+            return
+
+        # One-chunk HTML: same '-latest' naming as the source link above
+        html_path_latest = os.path.join(self.spec_dir, '%s-latest%s' % (self.basename_no_ext, '.html'))
+        if os.path.lexists(html_path_latest):
+            os.unlink(html_path_latest)
+        html_filename = '%s%s' % (os.path.splitext(self.filename)[0], '.html')
+        if os.path.exists(os.path.join(self.spec_dir, html_filename)):
+            os.symlink(html_filename, html_path_latest)
+
+        # Multiple chunks
+        html_dir_latest = os.path.join(self.spec_dir, 'latest')
+        if os.path.lexists(html_dir_latest):
+            os.unlink(html_dir_latest)
+        if os.path.exists(os.path.join(self.spec_dir, self.version)):
+            os.symlink(self.version, html_dir_latest)
+
+
+SCRIPT = VcsObject('git', 'xdg/xdg-specs', 'web-export/update.py')  # published copy of this script (no revision pinned)
+SPECS_INDEX = VcsObject('git', 'xdg/xdg-specs', 'web-export/specs.idx')  # published spec index (no revision pinned)
+
+
+def is_up_to_date():
+    """Return True when the digest of this file equals the digest of SCRIPT's content."""
+    local_digest = get_hash_from_path(__file__)
+    published_digest = SCRIPT.get_hash()
+    return local_digest == published_digest
+
+
+if not DEVELOPMENT:
+    if not is_up_to_date():
+        print >>sys.stderr, 'Script is not up-to-date, please download %s' % SCRIPT.get_url()
+        sys.exit(1)
+    SPECS_INDEX.fetch()
+    lines = SPECS_INDEX.data.split('\n')
+else:  # development: use the specs.idx found in the current directory
+    index_file = open('specs.idx')
+    try:
+        lines = index_file.readlines()
+    finally:
+        index_file.close()
+
+
+latests = set()  # (spec_dir, basename_no_ext) pairs whose 'latest' links were already created
+
+for line in lines:
+    line = line.strip()
+    if not line or line.startswith('#'):
+        continue
+    # Fields: <file | git:repo:file> <revision> <version> <target directory>
+    (data, revision, version, path) = line.split()
+    if data.startswith("git:"):
+        git_data = data.split(":", 2)  # at most 3 parts, so the file path may itself contain ':'
+        vcs = VcsObject('git', git_data[1], git_data[2], revision)
+    else:
+        vcs = VcsObject('cvs', None, data, revision)
+    spec = SpecObject(vcs, path, version)
+    spec.download()
+    spec.htmlize()
+    # Create latest links if it's the first time we see this spec
+    if (spec.spec_dir, spec.basename_no_ext) not in latests:
+        latests.add((spec.spec_dir, spec.basename_no_ext))
+        spec.latestize()