public abstract class AbstractJob
extends org.apache.hadoop.conf.Configured
implements org.apache.hadoop.util.Tool
Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or more maps and reduces in order to accomplish some task.
Command line arguments available to all subclasses are:
In addition, note some key command line parameters that are parsed by Hadoop, which jobs may need to set:
Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.
Modifier and Type | Field and Description |
---|---|
protected Map<String,List<String>> |
argMap |
protected File |
inputFile |
protected org.apache.hadoop.fs.Path |
inputPath
input path, populated by
parseArguments(String[]) |
protected File |
outputFile |
protected org.apache.hadoop.fs.Path |
outputPath
output path, populated by
parseArguments(String[]) |
protected org.apache.hadoop.fs.Path |
tempPath
temp path, populated by
parseArguments(String[]) |
Modifier | Constructor and Description |
---|---|
protected |
AbstractJob() |
Modifier and Type | Method and Description |
---|---|
protected void |
addFlag(String name,
String shortName,
String description)
Add an option with no argument whose presence can be checked for using
containsKey method on the map returned by parseArguments(String[]) ; |
protected void |
addInputOption()
Add the default input directory option, '-i' which takes a directory
name as an argument.
|
protected org.apache.commons.cli2.Option |
addOption(org.apache.commons.cli2.Option option)
Add an arbitrary option to the set of options this job will parse when
parseArguments(String[]) is called. |
protected void |
addOption(String name,
String shortName,
String description)
Add an option to the set of options this job will parse when
parseArguments(String[]) is called. |
protected void |
addOption(String name,
String shortName,
String description,
boolean required)
Add an option to the set of options this job will parse when
parseArguments(String[]) is called. |
protected void |
addOption(String name,
String shortName,
String description,
String defaultValue)
Add an option to the set of options this job will parse when
parseArguments(String[]) is called. |
protected void |
addOutputOption()
Add the default output directory option, '-o' which takes a directory
name as an argument.
|
protected static org.apache.commons.cli2.Option |
buildOption(String name,
String shortName,
String description,
boolean hasArg,
boolean required,
String defaultValue)
Build an option with the given parameters.
|
protected static org.apache.commons.cli2.Option |
buildOption(String name,
String shortName,
String description,
boolean hasArg,
int min,
int max,
boolean required,
String defaultValue) |
protected Class<? extends org.apache.lucene.analysis.Analyzer> |
getAnalyzerClassFromOption() |
protected org.apache.commons.cli2.Option |
getCLIOption(String name) |
org.apache.hadoop.conf.Configuration |
getConf() |
int |
getDimensions(org.apache.hadoop.fs.Path matrix)
Get the cardinality of the input vectors
|
float |
getFloat(String optionName) |
float |
getFloat(String optionName,
float defaultVal) |
protected org.apache.commons.cli2.Group |
getGroup() |
protected File |
getInputFile() |
protected org.apache.hadoop.fs.Path |
getInputPath()
Returns the input path established by a call to
parseArguments(String[]) . |
int |
getInt(String optionName) |
int |
getInt(String optionName,
int defaultVal) |
static String |
getOption(Map<String,List<String>> args,
String optName) |
String |
getOption(String optionName) |
String |
getOption(String optionName,
String defaultVal)
Get the option, else the default
|
List<String> |
getOptions(String optionName)
Options can occur multiple times, so return the list
|
protected File |
getOutputFile() |
protected org.apache.hadoop.fs.Path |
getOutputPath()
Returns the output path established by a call to
parseArguments(String[]) . |
protected org.apache.hadoop.fs.Path |
getOutputPath(String path) |
protected org.apache.hadoop.fs.Path |
getTempPath() |
protected org.apache.hadoop.fs.Path |
getTempPath(String directory) |
boolean |
hasOption(String optionName) |
static String |
keyFor(String optionName)
Build the option key (--name) from the option name
|
protected static void |
maybePut(Map<String,List<String>> args,
org.apache.commons.cli2.CommandLine cmdLine,
org.apache.commons.cli2.Option... opt) |
Map<String,List<String>> |
parseArguments(String[] args)
Parse the arguments specified based on the options defined using the
various
addOption methods. |
Map<String,List<String>> |
parseArguments(String[] args,
boolean inputOptional,
boolean outputOptional) |
protected void |
parseDirectories(org.apache.commons.cli2.CommandLine cmdLine,
boolean inputOptional,
boolean outputOptional)
Obtain input and output directories from command-line options or hadoop
properties.
|
protected org.apache.hadoop.mapreduce.Job |
prepareJob(org.apache.hadoop.fs.Path inputPath,
org.apache.hadoop.fs.Path outputPath,
Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat,
Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
Class<? extends org.apache.hadoop.io.Writable> mapperKey,
Class<? extends org.apache.hadoop.io.Writable> mapperValue,
Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat) |
protected org.apache.hadoop.mapreduce.Job |
prepareJob(org.apache.hadoop.fs.Path inputPath,
org.apache.hadoop.fs.Path outputPath,
Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat,
Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
Class<? extends org.apache.hadoop.io.Writable> mapperKey,
Class<? extends org.apache.hadoop.io.Writable> mapperValue,
Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat,
String jobname) |
protected org.apache.hadoop.mapreduce.Job |
prepareJob(org.apache.hadoop.fs.Path inputPath,
org.apache.hadoop.fs.Path outputPath,
Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat,
Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
Class<? extends org.apache.hadoop.io.Writable> mapperKey,
Class<? extends org.apache.hadoop.io.Writable> mapperValue,
Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer,
Class<? extends org.apache.hadoop.io.Writable> reducerKey,
Class<? extends org.apache.hadoop.io.Writable> reducerValue,
Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat) |
protected org.apache.hadoop.mapreduce.Job |
prepareJob(org.apache.hadoop.fs.Path inputPath,
org.apache.hadoop.fs.Path outputPath,
Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper,
Class<? extends org.apache.hadoop.io.Writable> mapperKey,
Class<? extends org.apache.hadoop.io.Writable> mapperValue,
Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer,
Class<? extends org.apache.hadoop.io.Writable> reducerKey,
Class<? extends org.apache.hadoop.io.Writable> reducerValue) |
void |
setConf(org.apache.hadoop.conf.Configuration conf)
Overrides the base implementation to install the Oozie action configuration resource
into the provided Configuration object; note that ToolRunner calls setConf on the Tool
before it invokes run.
|
static void |
setS3SafeCombinedInputPath(org.apache.hadoop.mapreduce.Job job,
org.apache.hadoop.fs.Path referencePath,
org.apache.hadoop.fs.Path inputPathOne,
org.apache.hadoop.fs.Path inputPathTwo)
Necessary to make this job (having a combined input path) work on Amazon S3; hopefully this becomes
obsolete when MultipleInputs is available again.
|
protected static boolean |
shouldRunNextPhase(Map<String,List<String>> args,
AtomicInteger currentPhase) |
protected org.apache.hadoop.fs.Path inputPath
parseArguments(String[])
protected File inputFile
protected org.apache.hadoop.fs.Path outputPath
parseArguments(String[])
protected File outputFile
protected org.apache.hadoop.fs.Path tempPath
parseArguments(String[])
protected org.apache.hadoop.fs.Path getInputPath()
parseArguments(String[])
.
The source of the path may be an input option added using addInputOption()
or it may be the value of the mapred.input.dir
configuration
property.protected org.apache.hadoop.fs.Path getOutputPath()
parseArguments(String[])
.
The source of the path may be an output option added using addOutputOption()
or it may be the value of the mapred.output.dir
configuration
property.protected org.apache.hadoop.fs.Path getOutputPath(String path)
protected File getInputFile()
protected File getOutputFile()
protected org.apache.hadoop.fs.Path getTempPath()
protected org.apache.hadoop.fs.Path getTempPath(String directory)
public org.apache.hadoop.conf.Configuration getConf()
getConf
in interface org.apache.hadoop.conf.Configurable
getConf
in class org.apache.hadoop.conf.Configured
protected void addFlag(String name, String shortName, String description)
containsKey
method on the map returned by parseArguments(String[])
;protected void addOption(String name, String shortName, String description)
parseArguments(String[])
is called. This option has an argument
with null as its default value.protected void addOption(String name, String shortName, String description, boolean required)
parseArguments(String[])
is called.required
- if true the parseArguments(String[])
will
fail with an error and usage message if this option is not specified
on the command line.protected void addOption(String name, String shortName, String description, String defaultValue)
parseArguments(String[])
is called. If this option is not
specified on the command line the default value will be
used.defaultValue
- the default argument value if this argument is not
found on the command-line. null is allowed.protected org.apache.commons.cli2.Option addOption(org.apache.commons.cli2.Option option)
parseArguments(String[])
is called. If this option has no
argument, use containsKey
on the map returned by
parseArguments
to check for its presence. Otherwise, the
string value of the option will be placed in the map using a key
equal to this option's long name preceded by '--'.protected org.apache.commons.cli2.Group getGroup()
protected void addInputOption()
parseArguments(String[])
is
called, the inputPath will be set based upon the value for this option.
If this method is called, the input is required.protected void addOutputOption()
parseArguments(String[])
is
called, the outputPath will be set based upon the value for this option.
If this method is called, the output is required.protected static org.apache.commons.cli2.Option buildOption(String name, String shortName, String description, boolean hasArg, boolean required, String defaultValue)
name
- the long name of the option prefixed with '--' on the command-lineshortName
- the short name of the option, prefixed with '-' on the command-linedescription
- description of the option displayed in help methodhasArg
- true if the option has an argument.required
- true if the option is required.defaultValue
- default argument value, can be null.protected static org.apache.commons.cli2.Option buildOption(String name, String shortName, String description, boolean hasArg, int min, int max, boolean required, String defaultValue)
protected org.apache.commons.cli2.Option getCLIOption(String name)
name
- The name of the optionOption
with the name, else nullpublic Map<String,List<String>> parseArguments(String[] args) throws IOException
addOption
methods. If -h is specified or an
exception is encountered print help and return null. Has the
side effect of setting inputPath and outputPath
if addInputOption
or addOutputOption
or mapred.input.dir
or mapred.output.dir
are present in the Configuration.Map<String,String>
containing options and their argument values.
The presence of a flag can be tested using containsKey
, while
argument values can be retrieved using get(optionName)
. The
names used for keys are the option name parameter prefixed by '--'.IOException
Passes in false, false for the optional args.
public Map<String,List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional) throws IOException
args
- The args to parseinputOptional
- if true, then the input option, if set, need not be present. If false and input is an option
and there is no input, then throw an erroroutputOptional
- if true, then the output option, if set, need not be present. If false and output is an
option and there is no output, then throw an errorIOException
public static String keyFor(String optionName)
public String getOption(String optionName)
public String getOption(String optionName, String defaultVal)
optionName
- The name of the option to look up, without the --defaultVal
- The default value.public int getInt(String optionName)
public int getInt(String optionName, int defaultVal)
public float getFloat(String optionName)
public float getFloat(String optionName, float defaultVal)
public List<String> getOptions(String optionName)
optionName
- The unadorned (no "--" prefixing it) option namepublic boolean hasOption(String optionName)
public int getDimensions(org.apache.hadoop.fs.Path matrix) throws IOException
matrix
- IOException
protected void parseDirectories(org.apache.commons.cli2.CommandLine cmdLine, boolean inputOptional, boolean outputOptional)
addInputOption
or addOutputOption
has been called, this method will throw an OptionException
if
no source (command-line or property) for that value is present.
Otherwise, inputPath
or outputPath
will be
non-null only if specified as a hadoop property. Command-line options
take precedence over hadoop properties.IllegalArgumentException
- if either inputOption is present,
and neither --input
nor -Dmapred.input.dir
are
specified or outputOption is present and neither --output
nor -Dmapred.output.dir
are specified.protected static void maybePut(Map<String,List<String>> args, org.apache.commons.cli2.CommandLine cmdLine, org.apache.commons.cli2.Option... opt)
public static String getOption(Map<String,List<String>> args, String optName)
args
- The input argument mapoptName
- The adorned (including "--") option nameprotected static boolean shouldRunNextPhase(Map<String,List<String>> args, AtomicInteger currentPhase)
protected org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat) throws IOException
IOException
protected org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat, String jobname) throws IOException
IOException
protected org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer, Class<? extends org.apache.hadoop.io.Writable> reducerKey, Class<? extends org.apache.hadoop.io.Writable> reducerValue) throws IOException
IOException
protected org.apache.hadoop.mapreduce.Job prepareJob(org.apache.hadoop.fs.Path inputPath, org.apache.hadoop.fs.Path outputPath, Class<? extends org.apache.hadoop.mapreduce.InputFormat> inputFormat, Class<? extends org.apache.hadoop.mapreduce.Mapper> mapper, Class<? extends org.apache.hadoop.io.Writable> mapperKey, Class<? extends org.apache.hadoop.io.Writable> mapperValue, Class<? extends org.apache.hadoop.mapreduce.Reducer> reducer, Class<? extends org.apache.hadoop.io.Writable> reducerKey, Class<? extends org.apache.hadoop.io.Writable> reducerValue, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormat) throws IOException
IOException
public static void setS3SafeCombinedInputPath(org.apache.hadoop.mapreduce.Job job, org.apache.hadoop.fs.Path referencePath, org.apache.hadoop.fs.Path inputPathOne, org.apache.hadoop.fs.Path inputPathTwo) throws IOException
IOException
protected Class<? extends org.apache.lucene.analysis.Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException
ClassNotFoundException
public void setConf(org.apache.hadoop.conf.Configuration conf)
setConf
in interface org.apache.hadoop.conf.Configurable
setConf
in class org.apache.hadoop.conf.Configured
Copyright © 2008–2017 The Apache Software Foundation. All rights reserved.