#!/usr/bin/env bash
#
# git-clone-subset - clones a subset of a git repository
#
#    Copyright (C) 2012 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program. If not see <http://www.gnu.org/licenses/gpl.html>
#
# Uses git clone and git filter-branch to remove from the clone all files but
# the ones requested, along with their associated commit history.

# Prints usage information and exits the script.
# Globals:   myname (read) - program name shown in the messages
# Arguments: $1 - if non-empty, only the short synopsis and a hint are shown,
#                 both on stderr, and the script exits with status 1;
#                 otherwise the full help page is printed on stdout and the
#                 script exits with status 0.
# NOTE: heredocs below use '<<-', so their lines MUST be indented with tabs.
usage() {
	local synopsis="Usage: $myname [options] <repository> <destination-dir> <pattern>"

	if [[ "$1" ]] ; then
		# Error path: this is a diagnostic, so all of it belongs on stderr
		cat >&2 <<- USAGE
		$synopsis
		Try '$myname --help' for more information.
		USAGE
		exit 1
	fi

	cat <<- USAGE
	$synopsis

	Clones a <repository> into a <destination-dir> and runs on the clone
	git filter-branch --prune-empty --tree-filter 'git rm ...' -- --all
	to prune from history all files except the ones matching <pattern>,
	effectively creating a clone with a subset of files (and history) of the
	original repository.

	Useful for creating a new repository out of a set of files from another
	repository, migrating (only) their associated history. Very similar to
	what git filter-branch --subdirectory-filter does, but for a file
	pattern instead of just a single directory.

	Options:
	  -h, --help
	     show this page.

	  <repository>
	    URL or local path to the git repository to be cloned.

	  <destination-dir>
	    Directory to create the clone. Same rules for git-clone applies: it
	    will be created if it does not exist and it must be empty otherwise.
	    But, unlike git-clone, this argument is not optional: git-clone uses
	    several rules to determine the "Humane" dir name of a cloned repo,
	    and $myname will not risk parsing its output, let alone
	    predict the chosen name.

	  <pattern>
	    Glob pattern to match the desired files/dirs. It will be ultimately
	    evaluated by a call to bash, NOT git or sh, using extended glob
	    '!(<pattern>)' rule. Quote it or escape it on command line, so it
	    does not get evaluated prematurely by your current shell. Only a
	    single pattern is allowed: if more are required, use extglob's "|"
	    syntax. Globs will be evaluated with bash's shopt dotglob set, so
	    beware. Patterns should not contain spaces or special chars like
	    " ' \$ ( ) { } \`, not even quoted or escaped, since that might
	    interfere with the !() syntax after pattern expansion.

	    Pattern Examples:
	       "*.png"
	       "*.png|*icon*"
	       "*.h|src/|lib"

	Limitations:

	- Renames are NOT followed. As a workaround, list the rename history with
	  'git log --follow --name-status --format='%H' -- file | grep "^[RAD]"'
	  and include all multiple names of a file in the pattern, as in
	  "currentname|oldname|initialname". As a side effect, if a different
	  file has taken place of an old name, it will be preserved too, and
	  there is no way around this using this tool.

	- There is no (easy) way to keep some files in a dir: using 'dir/foo*'
	  as pattern will not work. So keep the whole dir and remove files
	  afterwards, using git filter-branch and a (quite complex) combination
	  of cloning, remote add, rebases, etc.

	- Pattern matching is quite limited, and many of bash's escaping and
	  quoting does not work properly when pattern is expanded inside !().

	Copyright (C) 2013 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
	License: GPLv3 or later. See <http://www.gnu.org/licenses/gpl.html>
	USAGE
	exit 0
}

# Helper functions

# Name this script was invoked as, stripped of any leading directories
myname=${0##*/}

# Reports an argument error on stderr, prefixed with the script name, then
# hands over to usage in its error mode.
# Arguments: $1 - error message (defaults to "error")
argerr() {
	local message=${1:-error}
	printf '%s: %s\n' "${0##*/}" "$message" >&2
	usage 1
}

# Reports an unrecognized option.
# Arguments: $1 - the offending option
invalid() {
	argerr "invalid option: $1"
}

# Reports a missing operand.
# Arguments: $1 - option the operand belongs to (optional, may be empty)
#            $2 - name of the missing operand (optional)
missing() {
	local origin=$1
	local operand=$2
	argerr "missing ${operand:+$operand }operand${origin:+ from $origin}."
}

# Argument handling

# A help flag anywhere on the command line takes precedence over everything
for arg; do
	if [[ "$arg" == "-h" || "$arg" == "--help" ]]; then
		usage
	fi
done

# The three mandatory operands, in positional order
repo=$1 dir=$2 pattern=$3

# Each operand must be present and non-empty; report the first one that is not
if [[ -z "$repo" ]]; then
	missing "" "<repository>"
fi
if [[ -z "$dir" ]]; then
	missing "" "<destination-dir>"
fi
if [[ -z "$pattern" ]]; then
	missing "" "<pattern>"
fi

# Anything past the third operand is an error; name the first extra one
if (($# > 3)); then
	argerr "too many arguments: $4"
fi

# Removes every remote of the current repository (a clone is meant to be a
# different repository).
# Returns: 0 if all remotes were removed (or there were none), otherwise the
#          status of the first 'git remote rm' that failed.
remove_remotes() {
	local remote
	while read -r remote; do
		# 'return', not 'break': break would reset the status to 0 and the
		# failure would never reach the caller's && chain
		git remote rm "$remote" || return
	done < <(git remote)
}

# The heart of the script: rewrites all refs, deleting from every commit the
# files and dirs that do NOT match the pattern, and pruning the commits that
# become empty.
# Arguments: $1 - extended glob pattern of the files/dirs to keep
# Returns:   exit status of git filter-branch
keep_only() {
	# The filter is run through an explicit bash call so that dotglob and
	# extglob are enabled for the '!(<pattern>)' expansion. The pattern is
	# pasted as-is into that command line, hence the restrictions on the
	# characters it may contain.
	local filter="bash -O dotglob -O extglob -c "
	filter+="'git rm -rf --ignore-unmatch -- !($1)'"

	git filter-branch --prune-empty --tree-filter "$filter" -- --all
}

# Fixes a bug in filter-branch where empty root commits are not removed even
# with --prune-empty: loops over each root commit reachable from HEAD and
# drops the ones whose tree is empty.
# Returns: 0 on success, otherwise the status of the first rewrite that failed
drop_empty_roots() {
	local root
	while read -r root; do
		# A root commit whose tree still lists entries is not empty: keep it
		if [[ "$(git ls-tree "$root")" ]]; then
			continue
		fi

		# Now "remove" it by deleting its children's parent reference to it
		git filter-branch --force --parent-filter "sed 's/-p $root//'" \
			-- --all || return
	done < <(git rev-list --max-parents=0 HEAD)
}

# Deletes the backup refs that filter-branch leaves under refs/original/.
# The refs are listed by git itself instead of checking for a directory under
# .git, so they are found regardless of how they are stored on disk.
# Returns: 0 on success (or nothing to delete), otherwise the status of the
#          first 'git update-ref -d' that failed.
delete_backups() {
	local ref
	while read -r ref; do
		git update-ref -d "$ref" || return
	done < <(git for-each-ref --format="%(refname)" refs/original/)
}

# Main sequence: each step only runs if all the previous ones succeeded, and
# the status of the chain is the exit status of the script.

# Clone the repo and enter it
git clone --no-hardlinks "$repo" "$dir" &&
cd "$dir" &&
remove_remotes &&
keep_only "$pattern" &&
drop_empty_roots &&

# Delete backups and reflogs, not needed in a clone
delete_backups &&
git reflog expire --expire=now --all &&
git gc --prune=now
